├── .gitattributes
├── .gitignore
├── README.md
├── beam.lua
├── data
│   └── opensubssmall
│       └── input.txt
├── eval.lua
├── neuralconvo.lua
├── seq2seq.lua
├── tokenizer.lua
├── trainoptim.lua
└── util
    ├── ModelTracker.lua
    ├── Tester.lua
    └── WordSplitLMMinibatchLoader.lua

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 | .idea
5 | 
6 | # Folder config file
7 | Desktop.ini
8 | 
9 | # Recycle Bin used on file shares
10 | $RECYCLE.BIN/
11 | 
12 | # Windows Installer files
13 | *.cab
14 | *.msi
15 | *.msm
16 | *.msp
17 | 
18 | # Windows shortcuts
19 | *.lnk
20 | 
21 | # =========================
22 | # Operating System Files
23 | # =========================
24 | 
25 | # OSX
26 | # =========================
27 | 
28 | .DS_Store
29 | .AppleDouble
30 | .LSOverride
31 | 
32 | # Thumbnails
33 | ._*
34 | 
35 | # Files that might appear in the root of a volume
36 | .DocumentRevisions-V100
37 | .fseventsd
38 | .Spotlight-V100
39 | .TemporaryItems
40 | .Trashes
41 | .VolumeIcon.icns
42 | 
43 | # Directories potentially created on remote AFP share
44 | .AppleDB
45 | .AppleDesktop
46 | Network Trash Folder
47 | Temporary Items
48 | .apdisk
49 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Torch Neural Conversation Model
2 | 
3 | This is an implementation of seq2seq language models in Torch.
4 | 
5 | The main features are:
6 | 
7 | 1. Multilayer LSTMs
8 | 2. Batch Processing
9 | 3. Explicit Vocabulary Size
10 | 4. Adagrad (but easy to use any torch.optim plugins)
11 | 5. Train/Test split
12 | 6. Gradient clipping
13 | 7. Large dataset support (above the normal LuaJIT limits, but limited by your system RAM)
14 | 8. Beam Search for Decoding
15 | 
16 | 
17 | 
18 | This is really an extension of awesome work from the Element Research people (rnn), macournoyer's great project (neuralconvo), and some helpful code from karpathy's char-rnn.
19 | 
20 | This is technically in beta form, but I have confirmed that it is working.
21 | 
22 | ## Examples
23 | 
24 | I did a quick training run on the first 9 million examples of the OpenSubtitles dataset for three epochs
25 | (minibatch of size 1, adagrad learning rate .01, 25 words in, 25 words out).
26 | 
27 | These outputs are the top five beams (using th beam.lua):
28 | 
29 | **Ask: hi**
30 | 
31 | * Hi.
32 | * How are you?
33 | * What are you doing here?
34 | * What are you doing?
35 | * How are you doing?
36 | 
37 | 
38 | **Ask: where are you from ?**
39 | 
40 | * I dont know.
41 | * Im from ohio.
42 | * From the north.
43 | * I dont know...
44 | * I dont know...... but i dont know.
45 | 
46 | **Ask: how old are you?**
47 | * \<unknown\>.
48 | * Im \<unknown\>.
49 | * I dont know.
50 | * \<unknown\>?
51 | * \<unknown\>, \<unknown\>.
52 | 
53 | **Ask: goodbye**
54 | 
55 | * Goodbye
56 | * Goodbye.
57 | * What are you doing?
58 | * Goodbye...
59 | * What are you doing here?
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | ## Installation
68 | 
69 | ## Tests
70 | 
71 | To train, run: th trainoptim.lua --cuda
72 | 
73 | (You don't have to use CUDA, but training on a CPU would be painfully slow...)
74 | 
75 | When you want to test the model, run th beam.lua --cuda (or without the cuda flag if you trained it some other way).
76 | 
77 | ## Dataset
78 | 
79 | I put a small sample from the OpenSubtitles dataset up. Really, you can add any dataset in the form:
80 | 
81 | input | response
82 | 
83 | with the pipe ('|') dividing the two. You should preprocess your data a bit if you use it like this. (Lua isn't the greatest for writing this kind of preprocessing.)
84 | 
85 | Every new line is a new pair.
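For example, a couple of lines of input.txt might look like this (illustrative lines, not taken from the sample file):

    hi | hello . how are you ?
    where are you from ? | im from ohio .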
86 | 
87 | 
88 | 

--------------------------------------------------------------------------------
/beam.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Created by IntelliJ IDEA.
3 | -- User: user
4 | -- Date: 7/1/2016
5 | -- Time: 8:47 PM
6 | -- To change this template use File | Settings | File Templates.
7 | --
8 | 
9 | 
10 | require 'neuralconvo'
11 | require 'util.Tester'
12 | local tokenizer = require "tokenizer"
13 | local list = require "pl.List"
14 | require 'nn'
15 | local WordSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
16 | 
17 | 
18 | local options = {}
19 | 
20 | if loader == nil then
21 | cmd = torch.CmdLine()
22 | cmd:text('Options:')
23 | cmd:option('--cuda', false, 'use CUDA (required if the model was trained with CUDA)')
24 | cmd:option('--debug', false, 'show debug info')
25 | cmd:option('--dataset', "model.t7", 'model file to load (relative to data/)')
26 | cmd:option('--vocablocation', "data/opensubssmall/vocabwords.t7", 'location of the saved vocab mapping')
27 | cmd:text()
28 | options = cmd:parse(arg)
29 | 
30 | 
31 | -- Enable CUDA
32 | if options.cuda then
33 | require 'cutorch'
34 | require 'cunn'
35 | end
36 | 
37 | -- Data
38 | loader = WordSplitLMMinibatchLoader.createFromJustVocab(options.vocablocation)
39 | 
40 | end
41 | 
42 | if model == nil then
43 | print("-- Loading model")
44 | model = torch.load("data/"..options.dataset)
45 | end
46 | 
47 | 
48 | 
49 | function say(text)
50 | return getResponseBeam(text,loader,model,options.debug,5)
51 | end
52 | 
53 | 
54 | repeat
55 | io.write("Ask: ")
56 | io.flush()
57 | answer=io.read()
58 | 
59 | print(say(answer)) -- say returns the response string; printing nil would error
60 | 
61 | until answer=="end"

--------------------------------------------------------------------------------
/eval.lua:
--------------------------------------------------------------------------------
1 | require 'neuralconvo'
2 | require 'util.Tester'
3 | local tokenizer = require "tokenizer"
4 | local list = require "pl.List"
5 | require 'nn'
6 | local WordSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
7 | 
8 | 
9 | local options = {}
10 | 
11 | if loader == nil then
12 | cmd = torch.CmdLine()
13 | cmd:text('Options:')
14 | cmd:option('--cuda', false, 'use CUDA (required if the model was trained with CUDA)')
15 | cmd:option('--debug', false, 'show debug info')
16 | cmd:option('--dataset', "model.t7", 'model file to load (relative to data/)')
17 | cmd:option('--vocablocation', "data/opensubssmall/vocabwords.t7", 'location of the saved vocab mapping')
18 | cmd:text()
19 | options = cmd:parse(arg)
20 | 
21 | 
22 | -- Enable CUDA
23 | if options.cuda then
24 | require 'cutorch'
25 | require 'cunn'
26 | end
27 | 
28 | -- Data
29 | loader = WordSplitLMMinibatchLoader.createFromJustVocab(options.vocablocation)
30 | 
31 | end
32 | 
33 | if model == nil then
34 | print("-- Loading model")
35 | model = torch.load("data/"..options.dataset)
36 | end
37 | 
38 | 
39 | 
40 | function say(text)
41 | return getResponse(text,loader,model,options.debug)
42 | end
43 | 
44 | repeat
45 | io.write("Ask: ")
46 | io.flush()
47 | answer=io.read()
48 | 
49 | print(say(answer)) -- say returns the response string; printing nil would error
50 | 
51 | until answer=="end"

--------------------------------------------------------------------------------
/neuralconvo.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'nn'
3 | require 'rnn'
4 | 
5 | neuralconvo = {}
6 | 
7 | 
8 | torch.include('neuralconvo', 'seq2seq.lua')
9 | 
10 | return neuralconvo

--------------------------------------------------------------------------------
/seq2seq.lua:
--------------------------------------------------------------------------------
1 | -- Based on https://github.com/Element-Research/rnn/blob/master/examples/encoder-decoder-coupling.lua
2 | local Seq2Seq = torch.class("neuralconvo.Seq2Seq")
3 | 
4 | function Seq2Seq:__init(vocabSize, hiddenSize,clipping,nlayers)
5 | require 'optim'
6 | self.vocabSize = assert(vocabSize, "vocabSize required at arg #1")
7 | self.hiddenSize = assert(hiddenSize, "hiddenSize required at arg #2")
8 | 
9 | print("Vocab Size: "..vocabSize)
10 | self.numLayers = nlayers or 1
11 | print("Nlayers: "..
self.numLayers) 12 | self.useSeqLSTM = true -- faster implementation of LSTM + Sequencer 13 | 14 | self:buildModel() 15 | 16 | self.gradientclipping = clipping 17 | end 18 | 19 | function Seq2Seq:buildModel() 20 | -- Encoder 21 | self.encoder = nn.Sequential() 22 | self.encoder:add(nn.LookupTableMaskZero(self.vocabSize, self.hiddenSize)) 23 | self.encoder.lstmLayers = {} 24 | for i=1,self.numLayers do 25 | if self.useSeqLSTM then 26 | self.encoder.lstmLayers[i] = nn.SeqLSTM(self.hiddenSize, self.hiddenSize) 27 | self.encoder.lstmLayers[i]:maskZero() 28 | self.encoder:add(self.encoder.lstmLayers[i]) 29 | else 30 | self.encoder.lstmLayers[i] = nn.LSTM(self.hiddenSize, self.hiddenSize):maskZero(1) 31 | self.encoder:add(nn.Sequencer(self.encoder.lstmLayers[i])) 32 | end 33 | end 34 | self.encoder:add(nn.Select(1, -1)) 35 | 36 | -- Decoder 37 | self.decoder = nn.Sequential() 38 | self.decoder:add(nn.LookupTableMaskZero(self.vocabSize, self.hiddenSize)) 39 | self.decoder.lstmLayers = {} 40 | for i=1,self.numLayers do 41 | if self.useSeqLSTM then 42 | self.decoder.lstmLayers[i] = nn.SeqLSTM(self.hiddenSize, self.hiddenSize) 43 | self.decoder.lstmLayers[i]:maskZero() 44 | self.decoder:add(self.decoder.lstmLayers[i]) 45 | else 46 | self.decoder.lstmLayers[i] = nn.LSTM(self.hiddenSize, self.hiddenSize):maskZero(1) 47 | self.decoder:add(nn.Sequencer(self.decoder.lstmLayers[i])) 48 | end 49 | end 50 | self.decoder:add(nn.Sequencer(nn.MaskZero(nn.Linear(self.hiddenSize, self.vocabSize), 1))) 51 | self.decoder:add(nn.Sequencer(nn.MaskZero(nn.LogSoftMax(), 1))) 52 | 53 | self.criterion = nn.SequencerCriterion(nn.MaskZeroCriterion(nn.ClassNLLCriterion(),1)) 54 | 55 | 56 | 57 | 58 | 59 | self.encoder:zeroGradParameters() 60 | self.decoder:zeroGradParameters() 61 | 62 | 63 | 64 | 65 | 66 | 67 | self.c=nn.Container() 68 | self.c:add(self.encoder) 69 | self.c:add(self.decoder) 70 | self.x,self.dl_dx = self.c:getParameters() 71 | self.optimState={} 72 | 73 | end 74 | 75 | 76 | function Seq2Seq:cuda() 77 | self.encoder:cuda() 78 | self.decoder:cuda() 79 | 80 | if self.criterion then 81 | self.criterion:cuda() 82 | end 83 | 84 | 85 | 86 | self.c:cuda(); 87 | self.x,self.dl_dx = self.c:getParameters() 88 | 89 | end 90 | 91 | --[[ Forward coupling: Copy encoder cell and output to decoder LSTM ]]-- 92 | function Seq2Seq:forwardConnect(enc, dec, seqLen) 93 | for i=1,#enc.lstmLayers do 94 | if self.useSeqLSTM then 95 | dec.lstmLayers[i].userPrevOutput = enc.lstmLayers[i].output[seqLen] 96 | dec.lstmLayers[i].userPrevCell = enc.lstmLayers[i].cell[seqLen] 97 | else 98 | dec.lstmLayers[i].userPrevOutput = nn.rnn.recursiveCopy(dec.lstmLayers[i].userPrevOutput, enc.lstmLayers[i].outputs[seqLen]) 99 | dec.lstmLayers[i].userPrevCell = nn.rnn.recursiveCopy(dec.lstmLayers[i].userPrevCell, enc.lstmLayers[i].cells[seqLen]) 100 | end 101 | end 102 | end 103 | 104 | --[[ Backward coupling: Copy decoder gradients to encoder LSTM ]]-- 105 | function Seq2Seq:backwardConnect(enc, dec) 106 | for i=1,#enc.lstmLayers do 107 | if self.useSeqLSTM then 108 | enc.lstmLayers[i].userNextGradCell = dec.lstmLayers[i].userGradPrevCell 109 | enc.lstmLayers[i].gradPrevOutput = dec.lstmLayers[i].userGradPrevOutput 110 | else 111 | enc.lstmLayers[i].userNextGradCell = nn.rnn.recursiveCopy(enc.lstmLayers[i].userNextGradCell, dec.lstmLayers[i].userGradPrevCell) 112 | enc.lstmLayers[i].gradPrevOutput = nn.rnn.recursiveCopy(enc.lstmLayers[i].gradPrevOutput, dec.lstmLayers[i].userGradPrevOutput) 113 | end 114 | end 115 | end 116 | 117 | 118 | 119 | 120 | 
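--[[ A note on the coupling above (descriptive only): nn.SeqLSTM keeps its
per-timestep hidden states in `output` and cell states in `cell`, indexed by
time. forwardConnect seeds each decoder layer's userPrevOutput/userPrevCell
with the encoder's state at the last input timestep, so the decoder starts
where the encoder left off. backwardConnect pushes the decoder's gradients
with respect to that seed (userGradPrevOutput/userGradPrevCell) back into the
encoder (gradPrevOutput/userNextGradCell), which is what lets the two
separate nn.Sequentials train together as one seq2seq model. ]]--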
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | function Seq2Seq:train(input, target,targety, learn)
128 | --input, target and targety are just 1d vectors with word ids
129 | local encoderInput = input
130 | local decoderInput = target
131 | local decoderTarget = targety
132 | 
133 | if learn == nil then learn =true end
134 | 
135 | -- Forward pass
136 | local encoderOutput = self.encoder:forward(encoderInput)
137 | self:forwardConnect(self.encoder,self.decoder,encoderInput:size(1))
138 | local decoderOutput = self.decoder:forward(decoderInput)
139 | local Edecoder = self.criterion:forward(decoderOutput, decoderTarget)
140 | 
141 | if Edecoder ~= Edecoder then -- NaN ~= NaN, so exit early on a bad loss
142 | return Edecoder
143 | end
144 | 
145 | 
146 | -- Backward pass
147 | if learn then
148 | local gEdec = self.criterion:backward(decoderOutput, decoderTarget)
149 | self.decoder:backward(decoderInput, gEdec)
150 | self:backwardConnect(self.encoder,self.decoder)
151 | self.encoder:backward(encoderInput, encoderOutput:clone():zero()) -- no gradient flows into the encoder output except through the coupling
152 | 
153 | end
154 | 
155 | 
156 | 
157 | self.decoder:forget()
158 | self.encoder:forget()
159 | 
160 | return Edecoder/decoderTarget:size(1)
161 | end
162 | 
163 | function Seq2Seq:update()
164 | 
165 | self.dl_dx:clamp(-self.gradientclipping,self.gradientclipping);
166 | 
167 | if self.momentum then self.encoder:updateGradParameters(self.momentum) end -- momentum is optional
168 | if self.momentum then self.decoder:updateGradParameters(self.momentum) end
169 | self.decoder:updateParameters(self.learningRate)
170 | self.encoder:updateParameters(self.learningRate)
171 | 
172 | self.encoder:zeroGradParameters()
173 | self.decoder:zeroGradParameters()
174 | 
175 | 
176 | --self.decoder:forget()
177 | --self.encoder:forget()
178 | 
179 | end
180 | 
181 | 
182 | function Seq2Seq:trainOptim(minibatch,optimizer)
183 | --each example holds 1d vectors with word ids
184 | 
185 | 
186 | local myseq = self
187 | local Edecoder;
188 | local err;
189 | 
190 | optimizer=optimizer or "adagrad"
191 | 
192 | local feval = function(x_new)
193 | local totalerr = 0
194 | local totaln = 0
195 | 
196 | for _,example in ipairs(minibatch) do
197 | 
198 | -- Note: includes a fix for an rnn versioning problem:
199 | -- https://github.com/macournoyer/neuralconvo/issues/17
200 | 
201 | local encoderInput = example["input"]
202 | local target = example["target"]
203 | local decoderInput = target
204 | local decoderTarget = example["targetout"]
205 | 
206 | local encoderOutput = myseq.encoder:forward(encoderInput)
207 | myseq:forwardConnect(myseq.encoder,myseq.decoder,encoderInput:size(1))
208 | local decoderOutput = myseq.decoder:forward(decoderInput)
209 | -- print(decoderOutput )
210 | -- print(decoderTarget)
211 | --io.read()
212 | 
213 | local thiserr=myseq.criterion:forward(decoderOutput, decoderTarget)
214 | local nonzeroinputs = example["nonzeroTargets"] -- (currently unused)
215 | 
216 | 
217 | totalerr = totalerr+thiserr
218 | totaln=totaln+decoderTarget:size(1)
219 | 
220 | 
221 | 
222 | local gEdec = myseq.criterion:backward(decoderOutput, decoderTarget)
223 | myseq.decoder:backward(decoderInput, gEdec)
224 | 
225 | myseq:backwardConnect(myseq.encoder,myseq.decoder)
226 | 
227 | myseq.encoder:backward(encoderInput, encoderOutput:clone():zero()) -- zero gradient of the same type/size as the output (works on CPU and GPU)
228 | 
229 | 
230 | myseq.decoder:forget()
231 | myseq.encoder:forget()
232 | 
233 | end
234 | if totaln==0 then err=0
235 | else err=totalerr/totaln
236 | end
237 | myseq.dl_dx:clamp(-self.gradientclipping,self.gradientclipping);
238 | return err, myseq.dl_dx
239 | 
240 | end
241 | 
242 | 
243 | 
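-- feval above follows the standard torch/optim closure contract: given a
-- parameter vector it returns (loss, gradient). x_new can be ignored here
-- because self.x is the very tensor the optimizer updates in place, and
-- self.optimState carries the per-weight history (e.g. the adagrad
-- accumulator) between calls, so it must be the same table on every update.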
244 | if(optimizer=="adagrad") then _,err=optim.adagrad(feval,self.x ,self.optimState)
245 | elseif (optimizer=="rmsprop") then _,err=optim.rmsprop(feval,self.x ,self.optimState)
246 | end
247 | self.encoder:zeroGradParameters()
248 | self.decoder:zeroGradParameters()
249 | 
250 | 
251 | self.decoder:forget()
252 | self.encoder:forget()
253 | 
254 | return err[1]
255 | end
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | local MAX_OUTPUT_SIZE = 20
267 | 
268 | function Seq2Seq:eval(input)
269 | assert(self.goToken, "No goToken specified")
270 | assert(self.eosToken, "No eosToken specified")
271 | 
272 | 
273 | self.encoder:forward(input)
274 | 
275 | 
276 | local predictions = {}
277 | local probabilities = {}
278 | 
279 | -- Feed the decoder its own output back in, one token at a time, starting from the go token
280 | local output = {self.goToken}
281 | 
282 | for i = 1, MAX_OUTPUT_SIZE do
283 | --wondering if we really need to forward connect before each run because we are
284 | --kind of starting over each run here
285 | self:forwardConnect(self.encoder,self.decoder,input:size(1))
286 | --#output selects the last prediction of the chain
287 | local prediction = self.decoder:forward(torch.Tensor({output}):t())[#output]
288 | --print(prediction)
289 | -- prediction holds the log-probability for each word ID
290 | -- (the index into the vector is the word ID).
291 | -- topk over dimension 2 (the vocabulary), sorted descending
292 | local prob, wordIds = prediction:topk(5, 2, true, true)
293 | 
294 | -- First one is the most likely.
295 | local next_output = wordIds[1][1]
296 | --use second guess if unk token
297 | if next_output==self.unknownToken then next_output = wordIds[1][2] end
298 | --print(wordIds)
299 | --print(next_output)
300 | --io.read()
301 | table.insert(output, next_output)
302 | 
303 | -- Terminate on EOS token
304 | if next_output == self.eosToken then
305 | break
306 | end
307 | 
308 | table.insert(predictions, wordIds)
309 | table.insert(probabilities, prob)
310 | end
311 | 
312 | self.decoder:forget()
313 | self.encoder:forget()
314 | self.encoder:zeroGradParameters()
315 | self.decoder:zeroGradParameters()
316 | self.decoder:training()
317 | self.encoder:training()
318 | 
319 | return output,predictions, probabilities
320 | end
321 | 
322 | 
323 | function Seq2Seq:evalBeam(input,beamsize)
324 | --run encoder
325 | self.encoder:forward(input)
326 | local beams = {}
327 | -- Seed the search with a single beam containing just the go token
328 | local beam1 = {};
329 | beam1.currentOutput={self.goToken}
330 | beam1.length=1;
331 | beam1.finished=false
332 | beam1.prob=1;
333 | beam1.problist = {}
334 | table.insert(beam1.problist,1)
335 | table.insert(beams,beam1)
336 | 
337 | local n=1
338 | 
339 | while(n<25) do -- expand for at most 25 steps
340 | local newbeams = {}
341 | --print(beams)
342 | for _, beam in pairs(beams) do
343 | io.write('.')
344 | 
345 | if(beam.finished==false) then
346 | local nb=self:runOneBeam(beam,beamsize,input:size(1))
347 | 
348 | for _,nbi in pairs(nb) do
349 | newbeams[nbi.prob]=nbi;
350 | end
351 | else
352 | newbeams[beam.prob]=beam;
353 | end
354 | 
355 | 
356 | 
357 | end
358 | --print("Full Beams")
359 | --print(newbeams)
360 | beams= self:shrinkBeam(newbeams,beamsize)
361 | --print("Shrunken Beams")
362 | --print(beams)
363 | --io.read()
364 | n=n+1
365 | 
366 | 
367 | end
368 | 
369 | 
370 | 
371 | 
372 | 
373 | self.decoder:forget()
374 | self.encoder:forget()
375 | self.encoder:zeroGradParameters()
376 | self.decoder:zeroGradParameters()
377 | self.decoder:training()
378 | self.encoder:training()
379 | 
380 | return beams
381 | 
382 | end
383 | 
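-- Each beam above is a plain Lua table: currentOutput is the token-id
-- sequence so far (starting with the go token), prob is the product of the
-- per-step probabilities (so longer beams naturally score lower), problist
-- records each step's probability so a per-word mean can also be reported,
-- and finished flips to true once the eos token is emitted. For example,
-- after two expansion steps a beam might look like:
--   { currentOutput={goToken, 42, 7}, prob=0.18,
--     problist={1, 0.6, 0.3}, length=3, finished=false }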
384 | --returns the top beamsize child beams expanded from this beam
385 | --inputsize tells forwardConnect which encoder timestep to couple from
386 | function Seq2Seq:runOneBeam(beam,beamsize,inputsize)
387 | --currentOutput is just a table of word ids
388 | local output = beam.currentOutput
389 | self:forwardConnect(self.encoder,self.decoder,inputsize)
390 | local prediction = self.decoder:forward(torch.Tensor({output}):t())[#output]
391 | local probs, wordIds = prediction:topk(beamsize, 2, true, true)
392 | local beams = {}
393 | 
394 | for i =1, beamsize do
395 | local newbeam = {};
396 | newbeam.length = beam.length+1
397 | newbeam.finished=false
398 | local newoutputs = {}
399 | local next_output = wordIds[1][i]
400 | local next_output_prob = torch.exp(probs[1][i])
401 | newbeam.prob = beam.prob*next_output_prob
402 | --store a list of all of the probabilities
403 | newbeam.problist = {}
404 | for k,v in ipairs(beam.problist) do
405 | table.insert(newbeam.problist, v)
406 | end
407 | table.insert(newbeam.problist, next_output_prob)
408 | 
409 | 
410 | 
411 | for k,v in ipairs(output) do
412 | table.insert(newoutputs, v)
413 | end
414 | 
415 | table.insert(newoutputs, next_output)
416 | newbeam.currentOutput=newoutputs;
417 | 
418 | --make the score zero if there is an unknown
419 | if next_output==self.unknownToken then newbeam.prob=0 end
420 | if next_output==self.eosToken then newbeam.finished=true end
421 | 
422 | table.insert(beams,newbeam);
423 | 
424 | 
425 | end
426 | 
427 | return beams;
428 | 
429 | 
430 | end
431 | 
432 | function Seq2Seq:shrinkBeam(beams,beamsize)
433 | -- beams is keyed by probability; keep only the top beamsize entries
434 | local i = 1
435 | local shrunkenBeam = {}
436 | 
437 | 
438 | for score,beam in self:pairsByKeys(beams,function(a, b) return a > b end) do
439 | shrunkenBeam[score]=beam;
440 | 
441 | if(i==beamsize)then return shrunkenBeam end
442 | i=i+1
443 | end
444 | return shrunkenBeam
445 | 
446 | end
447 | 
448 | 
449 | function Seq2Seq:pairsByKeys(t,f)
450 | -- returns an iterator over t sorted by key using comparator f
451 | local a = {}
452 | for n in pairs(t) do table.insert(a, n) end
453 | table.sort(a, f)
454 | local i = 0 -- iterator variable
455 | local iter = function () -- iterator function
456 | i = i + 1
457 | if a[i] == nil then return nil
458 | else return a[i], t[a[i]]
459 | end
460 | end
461 | return iter
462 | end

--------------------------------------------------------------------------------
/tokenizer.lua:
--------------------------------------------------------------------------------
1 | local lexer = require "pl.lexer"
2 | local yield = coroutine.yield
3 | local M = {}
4 | -- (the yield helpers below are leftovers from an earlier pl.lexer-based tokenizer; M.tokenize does not use them)
5 | local function word(token)
6 | return yield("word", token)
7 | end
8 | 
9 | local function quote(token)
10 | return yield("quote", token)
11 | end
12 | 
13 | local function space(token)
14 | return yield("space", token)
15 | end
16 | 
17 | local function tag(token)
18 | return yield("tag", token)
19 | end
20 | 
21 | local function punct(token)
22 | return yield("punct", token)
23 | end
24 | 
25 | local function endpunct(token)
26 | return yield("endpunct", token)
27 | end
28 | 
29 | local function unknown(token)
30 | return yield("unknown", token)
31 | end
32 | 
33 | function M.tokenize(text)
34 | --make sure there are spaces around certain characters so that we predict them as individual units
35 | local newtext =text
36 | newtext=newtext:lower()
37 | newtext = newtext:gsub("'", "")
38 | newtext = newtext:gsub('-', " ")
39 | newtext = newtext:gsub(',',' , ')
40 | newtext = newtext:gsub('%.',' . ')
41 | newtext = newtext:gsub('%:',' : ')
42 | newtext = newtext:gsub('%;',' ; ')
43 | newtext = newtext:gsub('%?',' ? ')
44 | newtext = newtext:gsub('%!',' ! ')
45 | newtext = newtext:gsub('\n',' \n ')
46 | local matchstring = "([^%s]+)"
47 | local words = newtext:gmatch(matchstring)
48 | return words
49 | 
50 | 
51 | 
52 | 
53 | end
54 | 
55 | 
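-- For example:
--   for w in M.tokenize("Hi, there!") do io.write(w, " ") end
-- prints: hi , there !
-- (punctuation comes out as separate tokens, and apostrophes are dropped)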
56 | function M.join(words)
57 | local s = table.concat(words, " ")
58 | s = s:gsub("^%l", string.upper)
59 | s = s:gsub(" (') ", "%1")
60 | s = s:gsub(" ([,:;%-%.%?!])", "%1")
61 | 
62 | return s
63 | end
64 | 
65 | return M

--------------------------------------------------------------------------------
/trainoptim.lua:
--------------------------------------------------------------------------------
1 | require 'neuralconvo'
2 | require 'xlua'
3 | require 'util.ModelTracker'
4 | require 'nn'
5 | require 'util.Tester'
6 | 
7 | local WordSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
8 | 
9 | torch.setheaptracking(true)
10 | 
11 | cmd = torch.CmdLine()
12 | cmd:text('Options:')
13 | cmd:option('--dataDir', 'data/opensubssmall/', 'data directory containing input.txt')
14 | cmd:option('--vocabSize', 15000, 'Vocab Size')
15 | cmd:option('--cuda', false, 'use CUDA')
16 | cmd:option('--hiddenSize', 1000, 'number of hidden units in LSTM')
17 | cmd:option('--nlayers', 2, 'Number of Layers')
18 | cmd:option('--learningRate', 0.01, 'learning rate at t=0')
19 | --cmd:option('--momentum', 0.9, 'momentum')
20 | cmd:option('--minLR', 0.00001, 'minimum learning rate')
21 | cmd:option('--saturateEpoch', 20, 'epoch at which linear decayed LR will reach minLR')
22 | cmd:option('--maxEpoch', 10, 'maximum number of epochs to run')
23 | cmd:option('--batchSize', 1, 'minibatch size')
24 | cmd:option('--seqLength',50,'Max Sequence Length');
25 | cmd:option('-seq_length_in',25,'length of sequence input')
26 | cmd:option('-seq_length_out',25,'length of sequence output')
27 | 
28 | 
29 | --Mike Additions
30 | 
31 | cmd:option('--grad_clip',5,'clip gradients at this value ')
32 | cmd:option('--track',0,'Use ModelTracker')
33 | cmd:option('--supermodelid',30627892,'Modeltracking- Supermodel ID')
34 | cmd:option('--rmsprop', false, 'use RMSProp')
35 | cmd:text()
36 | options = cmd:parse(arg)
37 | 
38 | if options.dataset == 0 then
39 | options.dataset = nil
40 | end
41 | 
42 | 
43 | --for modeltracking online
44 | local crossid=-99
45 | if(options.track==1) then
46 | local desc = ""
47 | for k, v in pairs( options ) do desc = desc..k..": "..tostring(v).." " end
48 | 
49 | local sm=ModelTracker.createSubmodel({["name"]="Neuraltalk lr:"..options.learningRate.." ",["description"]=desc,["supermodelid"]=options.supermodelid})
50 | local cross= ModelTracker.createCross({["name"]="Main",["description"]="Main Cross",["submodelid"]=sm.submodelid})
51 | crossid=cross.crossid
52 | end
53 | 
54 | 
55 | 
56 | 
57 | 
58 | -- Data
59 | print("-- Loading dataset")
60 | --[[
61 | dataset = neuralconvo.DataSet(neuralconvo.OpensubsDialogs("data/opensubs"),
62 | {
63 | loadFirst = options.dataset,
64 | minWordFreq = options.minWordFreq
65 | })
66 | ]]--
67 | local loader = WordSplitLMMinibatchLoader.create(options.dataDir, options.batchSize, options.seqLength, {.945, .0001, .05} ,options.vocabSize)
68 | 
69 | 
70 | -- Model
71 | model = neuralconvo.Seq2Seq(loader.vocab_size, options.hiddenSize,options.grad_clip,options.nlayers)
72 | model.goToken = loader.goToken
73 | model.eosToken = loader.eosToken
74 | model.unknownToken = loader.unknownToken
75 | 
76 | -- Training parameters
77 | model.criterion = nn.SequencerCriterion(nn.ClassNLLCriterion()) -- note: this replaces the mask-zero criterion built in Seq2Seq:buildModel()
78 | model.learningRate = options.learningRate
79 | --model.momentum = options.momentum
80 | local decayFactor = (options.minLR - options.learningRate) / options.saturateEpoch
81 | local minMeanError = nil
82 | -- with the defaults: (0.00001 - 0.01) / 20 ≈ -0.0005 per epoch, so the LR reaches minLR at the saturate epoch
83 | model.optimState.learningRate=options.learningRate
84 | 
85 | print('Loading Model')
86 | -- Enable CUDA
87 | if options.cuda then
88 | require 'cutorch'
89 | require 'cunn'
90 | model:cuda()
91 | end
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | function runValidationSet()
100 | local n = loader.nval
101 | local splitIndex = 2
102 | local testerr= torch.Tensor(n):fill(0)
103 | for i = 1,n do
104 | 
105 | local inputx,outputx,outputy=loader:getBatch(splitIndex ,i,options.seq_length_in,options.seq_length_out)
106 | if(inputx:nDimension()~=0 and outputx:nDimension()~=0 and outputy:nDimension()~=0 and outputx:nonzero():nDimension()~=0) then
107 | if options.cuda then
108 | inputx = inputx:cuda()
109 | outputx = outputx:cuda()
110 | outputy = outputy:cuda()
111 | 
112 | end
113 | 
114 | -- forward-only pass (learn=false) to get the validation loss
115 | local err = model:train(inputx, outputx,outputy,false)
116 | 
117 | 
118 | testerr[i]=err
119 | end
120 | xlua.progress(i, n)
121 | end
122 | 
123 | print("Validation Error: "..testerr:mean())
124 | collectgarbage()
125 | return testerr:mean()
126 | 
127 | 
128 | end
129 | 
130 | local reportEvery = 1000
131 | local testEvery=20000
132 | local first=true
133 | 
134 | -- Run the experiment
135 | local totalcount=1
136 | for epoch = 1, options.maxEpoch do
137 | print("\n-- Epoch " .. epoch .. " / " .. options.maxEpoch)
138 | print("")
139 | 
140 | --shuffle training batches
141 | loader.train = WordSplitLMMinibatchLoader.shuffleTensorRows(loader.train)
142 | 
143 | 
144 | local errorssmall = torch.Tensor(reportEvery):fill(0)
145 | local timer = torch.Timer()
146 | 
147 | 
148 | local j= 1
149 | 
150 | for i = 1, loader.ntrain do
151 | if(i%200==0) then collectgarbage() end
152 | local inputx,outputx,outputy =loader:getBatch(1 ,i,options.seq_length_in,options.seq_length_out)
153 | 
154 | local encInSeq = inputx;
155 | local decInSeq= outputx;
156 | local decOutSeq=outputy;
157 | 
158 | --[[
159 | print('EncIn')
160 | print(encInSeq)
161 | print(tensor2sent(encInSeq,loader))
162 | print('DecIn')
163 | print(tensor2sent(decInSeq,loader))
164 | print('DecOut')
165 | print(tensor2sent(decOutSeq,loader))
166 | 
167 | io.read()
168 | ]]--
169 | 
170 | if(inputx:nDimension()~=0 and outputx:nDimension()~=0 and outputy:nDimension()~=0 and outputx:nonzero():nDimension()~=0 ) then
171 | --have to do this before cuda'ing
172 | local nonzerot = decInSeq:nonzero():size(1)
173 | if options.cuda then -- ship the input arrays to GPU
174 | -- have to convert to float because integers can't be cuda()'d
175 | encInSeq = encInSeq:float():cuda()
176 | decInSeq = decInSeq:float():cuda()
177 | decOutSeq = decOutSeq:float():cuda()
178 | end
179 | local minibatch={}
180 | table.insert(minibatch,{input=encInSeq,target=decInSeq,targetout=decOutSeq,nonzeroTargets=nonzerot})
181 | 
182 | 
183 | local err
184 | if options.rmsprop then
185 | err= model:trainOptim(minibatch, "rmsprop")
186 | if(first) then
187 | print("Using RMSProp")
188 | first=false
189 | end
190 | 
191 | else err= model:trainOptim(minibatch, "adagrad")
192 | end
193 | 
194 | errorssmall[j]=err
195 | j=j+1
196 | end
197 | if j == reportEvery then
198 | 
199 | print(string.format("Error = %.3f", (errorssmall:mean()) )..string.format(" Progress = %d", totalcount) )
200 | if(options.track==1) then
201 | pcall(ModelTracker.sendStatistic, {["category"]="Next",["name"]="Loss",["group"]="train",["n"]=totalcount,["crossid"]=crossid,["value"]=errorssmall:mean()}) -- pass fn and arg separately so pcall can catch errors
202 | end
203 | errorssmall = torch.Tensor(reportEvery):fill(0)
204 | j=1
205 | end
206 | 
207 | xlua.progress(i, loader.ntrain)
208 | 
209 | --test set
210 | if i % testEvery==0 then
211 | local meanerr -- declared here so the tracking call below can see it
212 | if loader.nval>0 then
213 | meanerr=runValidationSet()
214 | end
215 | if(options.track==1 and meanerr~=nil) then
216 | pcall(ModelTracker.sendStatistic, {["category"]="Next",["name"]="Loss",["group"]="test",["n"]=totalcount,["crossid"]=crossid,["value"]=meanerr})
217 | end
218 | print("Hi : ".. getResponse("Hi",loader,model))
219 | print("What is your name : ".. getResponse("What is your name",loader,model))
220 | print("How old are you : ".. getResponse("How old are you ",loader,model))
221 | print("What is the meaning of life : ".. getResponse("What is the meaning of life ",loader,model))
222 | 
223 | end
224 | 
225 | if i% 1000 ==0 then
226 | print("Hi : ".. getResponse("Hi",loader,model))
227 | print("What is your name : ".. getResponse("What is your name",loader,model))
228 | print("How old are you : ".. getResponse("How old are you ",loader,model))
229 | 
230 | end
231 | 
232 | 
233 | if(totalcount % 1000000==0) then
234 | print("\n(Saving model ...)")
235 | torch.save("data/model.t7", model)
236 | 
237 | end
238 | 
239 | if(totalcount%100000==0 and options.track==1)then
240 | 
241 | 
242 | local report=""
243 | report=report.."<br><br>Hi : "..getResponse("Hi",loader,model).."<br><br>"

244 | report=report.."<br><br>What is your name : "..getResponse("What is your name ",loader,model).."<br><br>"
245 | report=report.."<br><br>How old are you : "..getResponse("How old are you ",loader,model).."<br><br>"
246 | report=report.."<br><br>What is the meaning of life : "..getResponse("What is the meaning of life",loader,model).."<br><br>"
247 | report=report.."<br><br>Do you like swimming : "..getResponse("Do you like swimming ",loader,model).."<br><br>"
248 | report=report.."<br><br>It's been a long day : "..getResponse("It's been a long day ",loader,model).."<br><br>"
249 | report=report.."<br><br>goodbye : "..getResponse("goodbye ",loader,model).."<br><br>"
250 | 
251 | ModelTracker.sendReport({["reportname"]="Dialogue At Epoch: "..epoch.." Iteration: "..i,["parentid"]=crossid,["report"]=report})
252 | 
253 | 
254 | end
255 | 
256 | 
257 | 
258 | 
259 | 
260 | totalcount=totalcount+1
261 | 
262 | 
263 | end
264 | 
265 | timer:stop()
266 | 
267 | 
268 | print("\nEpoch stats:")
269 | print(string.format("Epoch time: %.2f seconds", timer:time().real))
270 | -- Save the model if it improved.
271 | --if minMeanError == nil or errors:mean() < minMeanError then
272 | -- print("\n(Saving model ...)")
273 | -- torch.save("data/model.t7", model)
274 | -- minMeanError = errors:mean()
275 | --end
276 | print("\n(Saving model ...)")
277 | torch.save("data/model.t7", model)
278 | 
279 | 
280 | model.learningRate = model.learningRate + decayFactor
281 | model.learningRate = math.max(options.minLR, model.learningRate)
282 | end
283 | 
284 | 
285 | -- Load testing script
286 | require "eval"

--------------------------------------------------------------------------------
/util/ModelTracker.lua:
--------------------------------------------------------------------------------
1 | ModelTracker = {}
2 | 
3 | --(from mtanana) this is my own custom model tracker; I'd be happy to open source it if someone wants it
4 | 
5 | --table is the lua table that will become json and funct is the function id
6 | function ModelTracker.sendJsonObj(table,funct)
7 | 
8 | local endpoint = "http://camber:8080/modeltracker/tracking.jsp"
9 | -- load required modules
10 | local http = require("socket.http") --luasocket
11 | local ltn12 = require("ltn12")
12 | local mime = require("mime")
13 | local io = require("io")
14 | local json = require("json") -- luajson
15 | local url = require("socket.url")
16 | 
17 | -- Create a Lua table to represent our entity to save
18 | --- This is from our doc REST example: http://docs.kinvey.com/rest-appdata.html
19 | --jamesBond = { ["firstName"] = "James", ["lastName"] = "Bond", ["email"] = "james.bond@mi6.gov.uk", ["age"] = 34 }
20 | 
21 | -- Save the table to the backend
22 | --- convert to json
23 | local jsstr = url.escape(json.encode(table))
24 | 
25 | --- build a http request
26 | local request = endpoint.."?function="..funct.."&jsonobj="..jsstr
27 | 
28 | local response_body = { }
29 | --- send the request
30 | ok, code, headers = http.request{url = request, method = "POST", sink = ltn12.sink.table(response_body)}
31 | 
32 | --- show that we got a valid response
33 | -- print(code) -- should be 201 for POST success
34 | saveditem = response_body[1]; -- kinvey appdata responses return arrays (which are tables in Lua)
35 | --print(saveditem)
36 | 
37 | --- convert from json to lua object
38 | objAsTable = json.decode(saveditem)
39 | return objAsTable
40 | end
41 | 
42 | 
43 | 
44 | --CODE FOR BUILDING A NEW SUPERMODEL:
45 | 
46 | --require 'util.ModelTracker'
47 | --id=ModelTracker.createSupermodel("New SM","a testing sm");
48 | 
49 | --{["name"]="NewCross",["description"]="newcross",["submodelid"]=-99}
50 | function ModelTracker.createCross(newcross)
51 | return ModelTracker.sendJsonObj(newcross,"1001")
52 | end
53 | --{["name"]="NewSM",["description"]="newsm",["supermodelid"]=-99}
54 | function ModelTracker.createSubmodel(newsubmodel)
55 | return ModelTracker.sendJsonObj(newsubmodel,"1002")
56 | end
57 | --{["modelname"]="NewSM",["description"]="newsm"}
58 | function ModelTracker.createSupermodel(name,description)
59 | local newsupermodel = {["modelname"]=name,["modeldescription"]=description}
60 | return ModelTracker.sendJsonObj(newsupermodel,"1003")
61 | end
62 | 
63 | --{["category"]="changetalk",["group"]="train",["n"]=200,["crossid"]=-99,["value"]=3.45}
64 | function ModelTracker.sendStatistic(statistic)
65 | return ModelTracker.sendJsonObj(statistic,"1000")
66 | end
67 | --{["reportname"]="",["parentid"]=crossid,["report"]="The text of the report"}
68 | function ModelTracker.sendReport(report)
69 | return ModelTracker.sendJsonObj(report,"1010")
70 | end

--------------------------------------------------------------------------------
/util/Tester.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Created by IntelliJ IDEA.
3 | -- User: user
4 | -- Date: 3/13/2016
5 | -- Time: 2:55 PM
6 | -- To change this template use File | Settings | File Templates.
7 | --
8 | 
9 | local tokenizer = require "tokenizer"
10 | local list = require "pl.List"
11 | 
12 | 
13 | -- Word IDs to sentence
14 | function pred2sent(wordIds,dataset)
15 | local words = {}
16 | 
17 | 
18 | for _, wordId in ipairs(wordIds) do
19 | local id = wordId
20 | if id ~= 0 and id~=dataset.goToken and id~=dataset.eosToken then
21 | local word = dataset.id2word[id]
22 | table.insert(words, word)
23 | end
24 | end
25 | --print(words)
26 | return tokenizer.join(words)
27 | end
28 | 
29 | function tensor2sent(wordIds, dataset)
30 | local words = {}
31 | 
32 | 
33 | for i=1,wordIds:size(1) do
34 | local id = wordIds[i][1]
35 | if id ~= 0 then
36 | local word = dataset.id2word[id]
37 | table.insert(words, word)
38 | end
39 | end
40 | 
41 | return tokenizer.join(words)
42 | end
43 | 
44 | function printmytable(t)
45 | for i,v in ipairs(t) do
46 | print(v)
47 | end
48 | 
49 | end
50 | 
51 | --word ids and probabilities are both tables of the length of the final output
52 | 
53 | function printProbabilityTable(wordIds, predictions,probabilities, num,dataset)
54 | print(string.rep("-", num * 22))
55 | -- printmytable(wordIds)
56 | -- printmytable(probabilities)
57 | --p is the final output word id
58 | 
59 | 
60 | for p, probs in ipairs(probabilities) do
61 | --print(p)
62 | local line = "| "
63 | wordId = wordIds[p]
64 | local probs = probabilities[p];
65 | local preds = predictions[p];
66 | 
67 | for i = 1, num do
68 | 
69 | local pr = torch.exp(probs[1][i])
70 | --print(wordId)
71 | local w = preds[1][i]
72 | local word = dataset.id2word[w]
73 | -- print(word)
74 | -- local t = probabilities[1][p]
75 | -- print("prob.."..t)
76 | -- print("wordid.."..wordId[1][i])
77 | line = line .. string.format("%-10s(%4d%%)", word, pr * 100) .. 
" | " 78 | end 79 | print(line) 80 | end 81 | 82 | print(string.rep("-", num * 22)) 83 | end 84 | 85 | function getResponse(text,dataset,model,debug) 86 | debug = debug or false 87 | local wordIds = {} 88 | 89 | for word in tokenizer.tokenize(text) do 90 | local id = dataset.word2id[word] or dataset.unknownToken 91 | table.insert(wordIds, id) 92 | end 93 | 94 | local input = torch.Tensor({wordIds}):t() 95 | 96 | --predictions is a table of tensors of word ids 97 | --probabilities are the matching probs (well...log activations) 98 | local output,predictions, probabilities = model:eval(input) 99 | --print("Predictions") 100 | --print(predictions) 101 | --print(probabilities) 102 | local phrase = pred2sent(output,dataset) 103 | 104 | if debug then 105 | printProbabilityTable(output, predictions,probabilities, 4,dataset) 106 | end 107 | phrase = phrase or '' 108 | return phrase 109 | 110 | end 111 | 112 | function getResponseBeam(text,dataset,model,debug,beamsize) 113 | debug = debug or false 114 | local wordIds = {} 115 | 116 | for word in tokenizer.tokenize(text) do 117 | local id = dataset.word2id[word] or dataset.unknownToken 118 | table.insert(wordIds, id) 119 | end 120 | 121 | local input = torch.Tensor({wordIds}):t() 122 | 123 | --predictions is a table of tensors of word ids 124 | --probabilities are the matching probs (well...log activations) 125 | local beams = model:evalBeam(input,beamsize) 126 | --print("Predictions") 127 | --print(predictions) 128 | --print(probabilities) 129 | local phrase = '\n' 130 | for score,beam in model:pairsByKeys(beams,function(a, b) return a > b end) do 131 | 132 | local scoretensor = torch.Tensor(beam.problist) 133 | local meanscore = torch.mean(scoretensor) 134 | local sent = pred2sent(beam.currentOutput,dataset) 135 | local sscore = string.format("%.4f",score) 136 | local sscore2 = string.format("%.4f",meanscore) 137 | phrase = phrase..sscore..', '..sscore2..': ' ..sent.. 
'\n'
138 | end
139 | 
140 | 
141 | 
142 | 
143 | if debug then
144 | --printProbabilityTable(output, predictions,probabilities, 4,dataset)
145 | end
146 | phrase = phrase or ''
147 | return phrase
148 | 
149 | end
150 | 
151 | 
152 | 

--------------------------------------------------------------------------------
/util/WordSplitLMMinibatchLoader.lua:
--------------------------------------------------------------------------------
1 | 
2 | --Modified by Mike Tanana from Andrej Karpathy and Wojciech Zaremba
3 | --Changed to support word models and Seq2Seq
4 | 
5 | --input comes in the form of a file where each line has a speaker and response
6 | --in the form: speakerone utterance | speaker two response
7 | 
8 | 
9 | 
10 | local WordSplitLMMinibatchLoader = {}
11 | WordSplitLMMinibatchLoader.__index = WordSplitLMMinibatchLoader
12 | 
13 | WordSplitLMMinibatchLoader.tokenizer = require "tokenizer"
14 | 
15 | 
16 | 
17 | function WordSplitLMMinibatchLoader.shuffle(t)
18 | local n = #t
19 | while n >= 2 do
20 | local k = math.random(n)
21 | t[n], t[k] = t[k], t[n]
22 | n = n - 1
23 | end
24 | return t
25 | end
26 | 
27 | function WordSplitLMMinibatchLoader.createFromJustVocab(vocabfile)
28 | local self = {}
29 | setmetatable(self, WordSplitLMMinibatchLoader)
30 | self:loadExistingVocabFile(vocabfile)
31 | 
32 | return self
33 | 
34 | end
35 | 
36 | function WordSplitLMMinibatchLoader:loadExistingVocabFile(vocabfilename)
37 | 
38 | --this is word to index
39 | self.vocab_mapping = torch.load(vocabfilename)
40 | self.id2word = {}
41 | self.word2id=self.vocab_mapping
42 | 
43 | --count vocab and make reverse mapping
44 | self.vocab_size = 0
45 | for word,idx in pairs(self.vocab_mapping) do
46 | self.vocab_size = self.vocab_size + 1
47 | self.id2word[idx] = word
48 | end
49 | print('Vocab Size: '..self.vocab_size)
50 | 
51 | 
52 | -- special-token strings; these must match the ones written by text_to_tensor below
53 | self.goToken = self.vocab_mapping['<go>']
54 | self.eosToken = self.vocab_mapping['<eos>']
55 | self.unknownToken = self.vocab_mapping['<unknown>']
56 | 
57 | 
58 | end
59 | 
60 | function WordSplitLMMinibatchLoader.create(data_dir, batch_size,seq_length, split_fractions,vocabsize)
61 | -- split_fractions is e.g. {0.9, 0.05, 0.05}
62 | 
63 | local self = {}
64 | setmetatable(self, WordSplitLMMinibatchLoader)
65 | 
66 | local input_file = path.join(data_dir, 'input.txt')
67 | local vocab_file = path.join(data_dir, 'vocabwords.t7')
68 | local tensor_file = path.join(data_dir, 'datawords.t7')
69 | 
70 | -- fetch file attributes to determine if we need to rerun preprocessing
71 | local run_prepro = false
72 | if not (path.exists(vocab_file) or path.exists(tensor_file)) then
73 | -- prepro files do not exist, generate them
74 | print('vocabwords.t7 and datawords.t7 do not exist. Running preprocessing...')
75 | run_prepro = true
76 | else
77 | -- check if the input file was modified since last time we
78 | -- ran the prepro. if so, we have to rerun the preprocessing
79 | local input_attr = lfs.attributes(input_file)
80 | local vocab_attr = lfs.attributes(vocab_file)
81 | local tensor_attr = lfs.attributes(tensor_file)
82 | if input_attr.modification > vocab_attr.modification or input_attr.modification > tensor_attr.modification then
83 | print('vocabwords.t7 or datawords.t7 detected as stale. Re-running preprocessing...')
84 | run_prepro = true
85 | end
86 | end
87 | if run_prepro then
88 | -- construct a tensor with all the data, and vocab file
89 | print('one-time setup: preprocessing input text file ' .. input_file .. '...')
90 | self:text_to_tensor(input_file, vocab_file, tensor_file,vocabsize,split_fractions)
91 | end
92 | 
93 | print('loading data files...')
94 | --in this file rows are dialogue pairs: first half is speaker1 second half is speaker 2
95 | --the data should always store at least one more than you are going to predict (otherwise the final step will be incorrect)
96 | local data = torch.load(tensor_file)
97 | self.train = data.train
98 | self.val =data.val
99 | self.test = data.test
100 | 
101 | self:loadExistingVocabFile(vocab_file)
102 | 
103 | --shuffle rows (shuffleTensorRows returns a new tensor, so assign the result)
104 | self.train = WordSplitLMMinibatchLoader.shuffleTensorRows(self.train)
105 | 
106 | 
107 | 
108 | -- divide data to train/val and allocate rest to test
109 | self.ntrain = math.floor(self.train:size(1)/batch_size )-1
110 | 
111 | if self.val ~=nil then
112 | self.nval = math.floor(self.val:size(1)/batch_size)-1
113 | else
114 | self.nval=0
115 | end
116 | if self.test ~=nil then
117 | self.ntest = math.floor(self.test:size(1)/batch_size)-1
118 | else
119 | self.ntest=0
120 | end
121 | 
122 | self.batch_size = batch_size
123 | print ('Val Size: ' .. self.nval)
124 | 
125 | self.split_sizes = {self.ntrain, self.nval, self.ntest}
126 | self.batch_ix = {0,0,0 }
127 | 
128 | 
129 | 
130 | print(string.format('data load done. Number of data batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest))
131 | collectgarbage()
132 | 
133 | --self:writeTxtFile(1,self.ntrain,self.vocab_mapping,"train.txt");
134 | --self:writeTxtFile(2,self.nval,self.vocab_mapping,"test.txt");
135 | 
136 | 
137 | 
138 | 
139 | 
140 | return self
141 | end
142 | 
143 | function WordSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index)
144 | batch_index = batch_index or 0
145 | self.batch_ix[split_index] = batch_index
146 | end
147 | 
148 | function WordSplitLMMinibatchLoader:writeTxtFile(split_index,n,vocab,filename)
149 | print("Saving data "..filename)
150 | --get the numerically indexed vocab table
151 | local ivocab = {}
152 | for c,i in pairs(vocab) do ivocab[i] = c end
153 | 
154 | 
155 | local file = io.open(filename, "a")
156 | 
157 | 
158 | for key= 1,n do
159 | local inx,outx,outy=self:getBatch(split_index,key,25,25)
160 | self:writeBatch(file,outx,ivocab)
161 | 
162 | end
163 | 
164 | file:close()
165 | 
166 | end
167 | function WordSplitLMMinibatchLoader:writeBatch(file,batch,ivocab)
168 | 
169 | for row =1 , batch:size(1) do
170 | for col=1 , batch:size(2) do
171 | local word = ivocab[batch[row][col]]
172 | if(word==nil) then break end --zero padding has no vocab entry
173 | if(word=="<eos>") then
174 | file:write("<eos>")
175 | else
176 | file:write(word.." ")
177 | end
178 | 
179 | end
180 | file:write("\n")
181 | end
182 | 
183 | end
184 | 
185 | 
186 | 
187 | 
188 | function WordSplitLMMinibatchLoader:getBatch(split_index,batchid,insize,outsize)
189 | local set = {}
190 | if split_index ==1 then set = self.train
191 | elseif split_index==2 then set = self.val
192 | elseif split_index==3 then set = self.test
193 | end
194 | 
195 | 
196 | 
197 | -- pull out the correct next batch
198 | local start = (batchid*self.batch_size)+1
199 | 
200 | 
201 | 
202 | local x = set:narrow(1,start,self.batch_size)
203 | local y = getydataforx(x)
204 | --print(x)
205 | --print(y)
206 | --io.read()
207 | 
208 | --split the two sequences apart
209 | local length = x:size(2)
210 | local size = length/2
211 | local in_start = math.max(1,size-(insize-1))
212 | local in_usesize = math.min(size,insize)
213 | local out_usesize = math.min(size,outsize)
214 | 
215 | local inputx = x:narrow(2,in_start,in_usesize)
216 | --print(x)
217 | --print(in_start.." "..in_usesize)
218 | --print(inputx)
219 | local inputy = y:narrow(2,in_start,in_usesize)
220 | inputx,inputy = self:trimPaddingFromLeft(inputx,inputy)
221 | local outputx = x:narrow(2,size+1,out_usesize)
222 | local outputy = y:narrow(2,size+1,out_usesize):clone()
223 | outputx,outputy = self:trimPaddingFromRight(outputx,outputy)
224 | --need to do this because the criterion can't handle zeros even though they get masked
225 | outputy[outputy:lt(1)]=self.eosToken
226 | return inputx:t(),outputx:t(),outputy:t()
227 | end
228 | --trims to the first non-padding element
229 | --assumes t1 and t2 are the same dimensions
230 | function WordSplitLMMinibatchLoader:trimPaddingFromLeft(t1,t2)
231 | local firstValid = 0
232 | for i = 1, t1:size(2) do
233 | for j = 1, t1:size(1) do
234 | local val = t1[j][i]
235 | local val2 = t2[j][i]
236 | if(val~=0 or val2~=0) then
237 | firstValid=i
238 | break
239 | end
240 | end
241 | if(firstValid > 0) then break end
242 | end
243 | if(firstValid==0) then return t1,t2 end
244 | local length = t1:size(2)+1-firstValid
245 | local newt1 = t1:narrow(2,firstValid,length)
246 | local newt2 = t2:narrow(2,firstValid,length)
247 | 
248 | return newt1,newt2
249 | 
250 | end
251 | --trims off trailing all-zero columns, keeping everything up to the last non-padding element
252 | function WordSplitLMMinibatchLoader:trimPaddingFromRight(t1,t2)
253 | local firstValid = 0
254 | for i = t1:size(2), 1,-1 do
255 | for j = 1, t1:size(1) do
256 | local val = t1[j][i]
257 | local val2 = t2[j][i]
258 | if(val~=0 or val2~=0) then
259 | firstValid=i
260 | break
261 | end
262 | end
263 | if(firstValid > 0) then break end
264 | end
265 | if(firstValid==0) then return t1,t2 end
266 | local length = firstValid
267 | local newt1 = t1:narrow(2,1,length)
268 | local newt2 = t2:narrow(2,1,length)
269 | 
270 | return newt1,newt2
271 | end
272 | 
273 | 
274 | 
275 | 
276 | 
277 | function getydataforx(xdata)
278 | local xt = xdata:t()
279 | local yt = xt:clone() --watch out: a transpose is just a view on the same storage, so clone before shifting
280 | yt:sub(1,-2):copy(xt:sub(2,-1)) --shift everything down one
281 | yt[-1] = xt[1] --make the last item the same as the first (i.e. make sure you don't set a seq length that actually uses this)
282 | local y = yt:t() --put back into cols are seq length and rows are samples
283 | return y
284 | end
285 | 
286 | 
287 | --[[
288 | --
289 | -- deprecated: use WordSplitLMMinibatchLoader.tokenizer.tokenize(t) --
290 | function WordSplitLMMinibatchLoader.preprocess(alltext)
291 | --make sure there are spaces around certain characters so that we predict them as individual units
292 | local newtext
293 | newtext = alltext:gsub(',',' , ')
294 | newtext = newtext:gsub('%.',' . ')
295 | newtext = newtext:gsub('%:',' : ')
296 | newtext = newtext:gsub('%;',' ; ')
297 | newtext = newtext:gsub('%?',' ? ')
298 | newtext = newtext:gsub('%!',' ! ')
299 | newtext = newtext:gsub('\n',' \n ')
300 | 
301 | 
302 | return newtext
303 | end]]--
304 | 
305 | ---Makes sure we split on spaces
306 | ----return nil if we are at the end
307 | function getNextBatchFromFile(torchfile)
308 | --first get main buffer size and create a string
309 | local chars = torchfile:readByte(100000);
310 | if(chars:size()==0) then return nil end
311 | local text = chars:string();
312 | --now keep going until we get a space (or it is the end)
313 | local nospace=true;
314 | local extrachars = "";
315 | while nospace do
316 | local char =torchfile:readByte()
317 | if char==nil or string.char(char)==" " then break end
318 | extrachars=extrachars..string.char(char)
319 | end
320 | text=text..extrachars
321 | --io.write(text)
322 | --return text
323 | return text
324 | 
325 | end
326 | 
327 | function WordSplitLMMinibatchLoader.shuffleTensorRows(t)
328 | --shuffle tensor
329 | local indexes = torch.randperm(t:size(1)):type('torch.LongTensor')
330 | t = t:index(1,indexes)
331 | return t
332 | 
333 | end
334 | ---Makes sure we split on new lines
335 | ----return nil if we are at the end
336 | function WordSplitLMMinibatchLoader.getNextBatchFromFileStandard(file)
337 | --first get main buffer size and create a string
338 | local block= file:read(1000000);
339 | if not block then return nil end
340 | local text = block;
341 | --now keep going until we hit a newline (or the end)
342 | local nospace=true;
343 | local extrachars = "";
344 | while nospace do
345 | local char =file:read(1)
346 | if char==nil or char=="\n" or char=="\r" then break end
347 | extrachars=extrachars..char
348 | end
349 | text=text..extrachars
350 | --io.write(text)
351 | --return text
352 | return text
353 | 
354 | end
355 | 
356 | --input comes in the form of a file where each line has a speaker and response
357 | --in the form: speakerone utterance | speaker two response
358 | 
359 | -- *** STATIC method ***
360 | function WordSplitLMMinibatchLoader:text_to_tensor(in_textfile, out_vocabfile, out_tensorfile,vocabsize,split_fractions)
361 | --local timer = torch.Timer()
362 | local matchstring = "([^%s]+)"
363 | print('loading text file...')
364 | local wordcount = {}
365 | local rawdata
366 | local tot_len = 0
367 | local filein = io.open(in_textfile, "r")
368 | --local filein = torch.DiskFile(in_textfile, "r")
369 | --filein:quiet();
370 | local unknownword = "<unknown>"
371 | local padding = "<pad>" --pads are now just zeros
372 | local go = "<go>"
373 | local eos = "<eos>"
374 | 
375 | -- create vocabulary if it doesn't exist yet
376 | print('creating vocabulary mapping...')
377 | -- record all characters to a set
378 | local unordered = {}
379 | local count=0
380 | local t=true
381 | local nlines =0
382 | 
383 | 
384 | while(t ~= nil) do
385 | t=WordSplitLMMinibatchLoader.getNextBatchFromFileStandard(filein)
386 | if t ==nil then break end
387 | -- t=WordSplitLMMinibatchLoader.preprocess(t)
388 | 
389 | local words = WordSplitLMMinibatchLoader.tokenizer.tokenize(t)
390 | 
391 | 
392 | for word in words do
393 | word = word:lower()
394 | if word ~= "|" then --speaker change character
395 | if wordcount[word]==nil then
396 | wordcount[word]=1
397 | else
398 | wordcount[word]=wordcount[word]+1
399 | end
400 | tot_len=tot_len+1
401 | else --if word== "|" easy way to count n dialog elements
402 | nlines=nlines+1
403 | end
404 | end
405 | io.write(tot_len.."\n")
406 | 
407 | end
408 | 
409 | 
410 | filein:close()
411 | 
412 | 
413 | 
414 | 
415 | 
416 | 
417 | --------------------------------------------------------
418 | --trim vocabulary---------------------------------------
419 | --------------------------------------------------------
420 | 
421 | --basically start at some very high frequency and then go down until we have added the right number of words
422 | --the ties will kind of be added 'randomly' (really based on which were put in first)
423 | local frequency = 400 --start here and go down
424 | 
425 | local vocab_mapping = {}
426 | local index=1
427 | --add special words
428 | vocab_mapping[unknownword]=index;
429 | --index=index+1
430 | --vocab_mapping[padding]=index;
431 | index=index+1
432 | vocab_mapping[go]=index;
433 | index=index+1
434 | vocab_mapping[eos]=index;
435 | index=index+1
436 | local count=0
437 | while frequency >0 do
438 | for key,value in pairs(wordcount) do
439 | if(value>=frequency) then --trim dictionary for rare words
440 | vocab_mapping[key]=index;
441 | index=index+1
442 | count=count+1
443 | wordcount[key]=nil --remove from table
444 | if(count>=vocabsize) then break end
445 | end
446 | end
447 | if(count>=vocabsize) then break end
448 | frequency=frequency-1
449 | end
450 | 
451 | 
452 | 
453 | print("Count: "..count)
454 | print("Length: "..tot_len)
455 | 
456 | --------------------------------------------------------
457 | --build dataset---------------------------------------
458 | --------------------------------------------------------
459 | 
460 | local length = 50 --size to save of each utterance
461 | 
462 | -- construct a tensor with all the data
463 | print('putting data into tensor...')
464 | 
465 | --fill with pads (zeros) first
466 | --rows are dialogue examples, length*2 wide (utterance then response)
467 | local examples = torch.IntTensor(nlines,length*2):fill(0)
468 | 
469 | 
470 | 
471 | filein = io.open(in_textfile, "r")
472 | 
473 | t=true
474 | local row=1
475 | while(t ~= nil) do
476 | t=WordSplitLMMinibatchLoader.getNextBatchFromFileStandard(filein)
477 | if t ==nil then break end
478 | --break into lines
479 | local lines = t:gmatch("[^\r\n]+")
480 | for line in lines do
481 | 
482 | --line=WordSplitLMMinibatchLoader.preprocess(line)
483 | --break into words
484 | local words = WordSplitLMMinibatchLoader.tokenizer.tokenize(line)
485 | 
486 | local speaker1,speaker2 = WordSplitLMMinibatchLoader.getSpeakersForLine(words)
487 | 
488 | 
489 | --fill speaker 1 from middle->start
490 | --speaker 1: go from the last word spoken backward, right-aligned against the middle of the row
491 | local count=0
492 | for i =speaker1.size,1,-1 do
493 | local word = speaker1[i]
494 | if(count<length) then
495 | word = word:lower()
496 | local idx = vocab_mapping[word]
497 | if idx == nil then idx = vocab_mapping[unknownword] end
498 | examples[row][length-count]=idx
499 | count=count+1
500 | end
501 | end
502 | 
503 | 
504 | --fill speaker 2 start->end
505 | for i =1,speaker2.size do
506 | local word = speaker2[i]
507 | if(i<=length) then
508 | word = word:lower()
509 | local idx = vocab_mapping[word]
510 | if idx == nil then idx = vocab_mapping[unknownword] end
511 | local loc = length+i
512 | examples[row][loc]=idx
513 | end
514 | end
515 | --debugging here:
516 | --print(examples[row])
517 | --io.stdin:read'*l'
518 | 
519 | row=row+1
520 | 
521 | end
522 | 
523 | 
524 | end
525 | 
526 | --splits
527 | 
528 | 
529 | local ntrain = math.floor(examples:size(1) * split_fractions[1])
530 | local nval = math.floor(examples:size(1) * split_fractions[2])
531 | local ntest = examples:size(1) - nval - ntrain -- the rest goes to test (to ensure this adds up exactly)
532 | 
533 | --shuffle tensor
534 | local indexes = torch.randperm(examples:size(1)):type('torch.LongTensor')
535 | examples = examples:index(1,indexes)
536 | 
537 | local data = {}
538 | data.train = examples:narrow(1,1,ntrain)
539 | 
540 | if nval>0 then
541 | data.val = examples:narrow(1,ntrain+1,nval)
542 | end
543 | if ntest>0 then
544 | data.test = examples:narrow(1,ntrain+nval+1,ntest)
545 | end
546 | 
547 | -- save output preprocessed files
548 | print('saving ' .. out_vocabfile)
549 | torch.save(out_vocabfile, vocab_mapping)
550 | print('saving ' .. out_tensorfile)
551 | torch.save(out_tensorfile, data)
552 | 
553 | 
554 | 
555 | 
556 | end
557 | --extract speakers from a single line
558 | function WordSplitLMMinibatchLoader.getSpeakersForLine(words)
559 | local speaker1 = {}
560 | speaker1.size=0
561 | local speaker2 = {}
562 | speaker2.size=0
563 | local isspeaker1=true
564 | for word in words do
565 | if word=="|" then
566 | isspeaker1=false
567 | speaker2[speaker2.size+1]="<go>" --decoder input begins with the go token
568 | speaker2.size=speaker2.size+1
569 | else
570 | if isspeaker1==true then
571 | speaker1[speaker1.size+1]=word
572 | speaker1.size=speaker1.size+1
573 | else
574 | speaker2[speaker2.size+1]=word
575 | speaker2.size=speaker2.size+1
576 | end
577 | end
578 | end
579 | 
580 | --add end of speaker tag
581 | speaker2[speaker2.size+1]="<eos>"
582 | speaker2.size=speaker2.size+1
583 | 
584 | 
585 | return speaker1,speaker2
586 | 
587 | end
588 | 
589 | 
590 | 
591 | 
592 | 
593 | 
594 | 
595 | return WordSplitLMMinibatchLoader
--------------------------------------------------------------------------------