├── .gitattributes
├── .gitignore
├── README.md
├── beam.lua
├── data
│   └── opensubssmall
│       └── input.txt
├── eval.lua
├── neuralconvo.lua
├── seq2seq.lua
├── tokenizer.lua
├── trainoptim.lua
└── util
    ├── ModelTracker.lua
    ├── Tester.lua
    └── WordSplitLMMinibatchLoader.lua

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 | .idea
5 | 
6 | # Folder config file
7 | Desktop.ini
8 | 
9 | # Recycle Bin used on file shares
10 | $RECYCLE.BIN/
11 | 
12 | # Windows Installer files
13 | *.cab
14 | *.msi
15 | *.msm
16 | *.msp
17 | 
18 | # Windows shortcuts
19 | *.lnk
20 | 
21 | # =========================
22 | # Operating System Files
23 | # =========================
24 | 
25 | # OSX
26 | # =========================
27 | 
28 | .DS_Store
29 | .AppleDouble
30 | .LSOverride
31 | 
32 | # Thumbnails
33 | ._*
34 | 
35 | # Files that might appear in the root of a volume
36 | .DocumentRevisions-V100
37 | .fseventsd
38 | .Spotlight-V100
39 | .TemporaryItems
40 | .Trashes
41 | .VolumeIcon.icns
42 | 
43 | # Directories potentially created on remote AFP share
44 | .AppleDB
45 | .AppleDesktop
46 | Network Trash Folder
47 | Temporary Items
48 | .apdisk
49 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Torch Neural Conversation Model
2 | 
3 | This is an implementation of seq2seq language models in Torch.
4 | 
5 | The main features are:
6 | 
7 | 1. Multilayer LSTMs
8 | 2. Batch Processing
9 | 3. Explicit Vocabulary Size
10 | 4. Adagrad (but easy to use any torch.optim plugins)
11 | 5. Train/Test split
12 | 6. Gradient clipping
13 | 7. Large dataset support (above the normal LuaJIT limits, but limited by your system RAM)
14 | 8. Beam Search for Decoding
15 | 
16 | 
17 | 
18 | This is really an extension of awesome work from the Element Research people (rnn), macournoyer's great project (neuralconvo), and some helpful code from karpathy's char-rnn.
19 | 
20 | This is technically in beta form, but I have confirmed that it is working.
21 | 
22 | ## Examples
23 | 
24 | I did a quick training run on the first 9 million examples of the OpenSubtitles dataset for three epochs
25 | (minibatch of size 1, adagrad learning rate .01, 25 words in, 25 words out).
26 | 
27 | These outputs are the top five beams (using th beam.lua):
28 | 
29 | **Ask: hi**
30 | 
31 | * Hi.
32 | * How are you?
33 | * What are you doing here?
34 | * What are you doing?
35 | * How are you doing?
36 | 
37 | 
38 | **Ask: where are you from ?**
39 | 
40 | * I dont know.
41 | * Im from ohio.
42 | * From the north.
43 | * I dont know...
44 | * I dont know...... but i dont know.
45 | 
46 | **Ask: how old are you?**
47 | * \<unknown\>.
48 | * Im \<unknown\>.
49 | * I dont know.
50 | * \<unknown\>?
51 | * \<unknown\>, \<unknown\>.
52 | 
53 | **Ask: goodbye**
54 | 
55 | * Goodbye
56 | * Goodbye.
57 | * What are you doing?
58 | * Goodbye...
59 | * What are you doing here?
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | ## Installation
68 | 
69 | ## Tests
70 | 
71 | To train, run: th trainoptim.lua --cuda
72 | 
73 | (You don't have to use CUDA, but training on a CPU would be painfully slow...)
74 | 
75 | When you want to test the model, run th beam.lua --cuda (or without the cuda flag if you trained it some other way).
76 | 
77 | ## Dataset
78 | 
79 | I put a small sample from the OpenSubtitles dataset up. Really, you can add any dataset in the form:
80 | 
81 | input | response
82 | 
83 | with the pipe ('|') dividing the two. You should preprocess your data a bit if you use it like this. (Lua isn't the greatest for writing this kind of preprocessing.)
84 | 
85 | Every new line is a new pair.
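For example, a couple of lines of input.txt might look like this (illustrative lines, not taken from the sample file):

    hi | hello . how are you ?
    where are you from ? | im from ohio .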
86 | 
87 | 
88 | 

--------------------------------------------------------------------------------
/beam.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Created by IntelliJ IDEA.
3 | -- User: user
4 | -- Date: 7/1/2016
5 | -- Time: 8:47 PM
6 | -- To change this template use File | Settings | File Templates.
7 | --
8 | 
9 | 
10 | require 'neuralconvo'
11 | require 'util.Tester'
12 | local tokenizer = require "tokenizer"
13 | local list = require "pl.List"
14 | require 'nn'
15 | local WordSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
16 | 
17 | 
18 | local options = {}
19 | 
20 | if loader == nil then
21 | cmd = torch.CmdLine()
22 | cmd:text('Options:')
23 | cmd:option('--cuda', false, 'use CUDA (required if the model was trained with CUDA)')
24 | cmd:option('--debug', false, 'show debug info')
25 | cmd:option('--dataset', "model.t7", 'model file to load (relative to data/)')
26 | cmd:option('--vocablocation', "data/opensubssmall/vocabwords.t7", 'location of the saved vocab mapping')
27 | cmd:text()
28 | options = cmd:parse(arg)
29 | 
30 | 
31 | -- Enable CUDA
32 | if options.cuda then
33 | require 'cutorch'
34 | require 'cunn'
35 | end
36 | 
37 | -- Data
38 | loader = WordSplitLMMinibatchLoader.createFromJustVocab(options.vocablocation)
39 | 
40 | end
41 | 
42 | if model == nil then
43 | print("-- Loading model")
44 | model = torch.load("data/"..options.dataset)
45 | end
46 | 
47 | 
48 | 
49 | function say(text)
50 | return getResponseBeam(text,loader,model,options.debug,5)
51 | end
52 | 
53 | 
54 | repeat
55 | io.write("Ask: ")
56 | io.flush()
57 | answer=io.read()
58 | 
59 | print(say(answer)) -- say returns the response string; printing nil would error
60 | 
61 | until answer=="end"

--------------------------------------------------------------------------------
/eval.lua:
--------------------------------------------------------------------------------
1 | require 'neuralconvo'
2 | require 'util.Tester'
3 | local tokenizer = require "tokenizer"
4 | local list = require "pl.List"
5 | require 'nn'
6 | local WordSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
7 | 
8 | 
9 | local options = {}
10 | 
11 | if loader == nil then
12 | cmd = torch.CmdLine()
13 | cmd:text('Options:')
14 | cmd:option('--cuda', false, 'use CUDA (required if the model was trained with CUDA)')
15 | cmd:option('--debug', false, 'show debug info')
16 | cmd:option('--dataset', "model.t7", 'model file to load (relative to data/)')
17 | cmd:option('--vocablocation', "data/opensubssmall/vocabwords.t7", 'location of the saved vocab mapping')
18 | cmd:text()
19 | options = cmd:parse(arg)
20 | 
21 | 
22 | -- Enable CUDA
23 | if options.cuda then
24 | require 'cutorch'
25 | require 'cunn'
26 | end
27 | 
28 | -- Data
29 | loader = WordSplitLMMinibatchLoader.createFromJustVocab(options.vocablocation)
30 | 
31 | end
32 | 
33 | if model == nil then
34 | print("-- Loading model")
35 | model = torch.load("data/"..options.dataset)
36 | end
37 | 
38 | 
39 | 
40 | function say(text)
41 | return getResponse(text,loader,model,options.debug)
42 | end
43 | 
44 | repeat
45 | io.write("Ask: ")
46 | io.flush()
47 | answer=io.read()
48 | 
49 | print(say(answer)) -- say returns the response string; printing nil would error
50 | 
51 | until answer=="end"

--------------------------------------------------------------------------------
/neuralconvo.lua:
--------------------------------------------------------------------------------
1 | require 'torch'
2 | require 'nn'
3 | require 'rnn'
4 | 
5 | neuralconvo = {}
6 | 
7 | 
8 | torch.include('neuralconvo', 'seq2seq.lua')
9 | 
10 | return neuralconvo

--------------------------------------------------------------------------------
/seq2seq.lua:
--------------------------------------------------------------------------------
1 | -- Based on https://github.com/Element-Research/rnn/blob/master/examples/encoder-decoder-coupling.lua
2 | local Seq2Seq = torch.class("neuralconvo.Seq2Seq")
3 | 
4 | function Seq2Seq:__init(vocabSize, hiddenSize,clipping,nlayers)
5 | require 'optim'
6 | self.vocabSize = assert(vocabSize, "vocabSize required at arg #1")
7 | self.hiddenSize = assert(hiddenSize, "hiddenSize required at arg #2")
8 | 
9 | print("Vocab Size: "..vocabSize)
10 | self.numLayers = nlayers or 1
11 | print("Nlayers: "..
self.numLayers) 12 | self.useSeqLSTM = true -- faster implementation of LSTM + Sequencer 13 | 14 | self:buildModel() 15 | 16 | self.gradientclipping = clipping 17 | end 18 | 19 | function Seq2Seq:buildModel() 20 | -- Encoder 21 | self.encoder = nn.Sequential() 22 | self.encoder:add(nn.LookupTableMaskZero(self.vocabSize, self.hiddenSize)) 23 | self.encoder.lstmLayers = {} 24 | for i=1,self.numLayers do 25 | if self.useSeqLSTM then 26 | self.encoder.lstmLayers[i] = nn.SeqLSTM(self.hiddenSize, self.hiddenSize) 27 | self.encoder.lstmLayers[i]:maskZero() 28 | self.encoder:add(self.encoder.lstmLayers[i]) 29 | else 30 | self.encoder.lstmLayers[i] = nn.LSTM(self.hiddenSize, self.hiddenSize):maskZero(1) 31 | self.encoder:add(nn.Sequencer(self.encoder.lstmLayers[i])) 32 | end 33 | end 34 | self.encoder:add(nn.Select(1, -1)) 35 | 36 | -- Decoder 37 | self.decoder = nn.Sequential() 38 | self.decoder:add(nn.LookupTableMaskZero(self.vocabSize, self.hiddenSize)) 39 | self.decoder.lstmLayers = {} 40 | for i=1,self.numLayers do 41 | if self.useSeqLSTM then 42 | self.decoder.lstmLayers[i] = nn.SeqLSTM(self.hiddenSize, self.hiddenSize) 43 | self.decoder.lstmLayers[i]:maskZero() 44 | self.decoder:add(self.decoder.lstmLayers[i]) 45 | else 46 | self.decoder.lstmLayers[i] = nn.LSTM(self.hiddenSize, self.hiddenSize):maskZero(1) 47 | self.decoder:add(nn.Sequencer(self.decoder.lstmLayers[i])) 48 | end 49 | end 50 | self.decoder:add(nn.Sequencer(nn.MaskZero(nn.Linear(self.hiddenSize, self.vocabSize), 1))) 51 | self.decoder:add(nn.Sequencer(nn.MaskZero(nn.LogSoftMax(), 1))) 52 | 53 | self.criterion = nn.SequencerCriterion(nn.MaskZeroCriterion(nn.ClassNLLCriterion(),1)) 54 | 55 | 56 | 57 | 58 | 59 | self.encoder:zeroGradParameters() 60 | self.decoder:zeroGradParameters() 61 | 62 | 63 | 64 | 65 | 66 | 67 | self.c=nn.Container() 68 | self.c:add(self.encoder) 69 | self.c:add(self.decoder) 70 | self.x,self.dl_dx = self.c:getParameters() 71 | self.optimState={} 72 | 73 | end 74 | 75 | 76 | function Seq2Seq:cuda() 77 | self.encoder:cuda() 78 | self.decoder:cuda() 79 | 80 | if self.criterion then 81 | self.criterion:cuda() 82 | end 83 | 84 | 85 | 86 | self.c:cuda(); 87 | self.x,self.dl_dx = self.c:getParameters() 88 | 89 | end 90 | 91 | --[[ Forward coupling: Copy encoder cell and output to decoder LSTM ]]-- 92 | function Seq2Seq:forwardConnect(enc, dec, seqLen) 93 | for i=1,#enc.lstmLayers do 94 | if self.useSeqLSTM then 95 | dec.lstmLayers[i].userPrevOutput = enc.lstmLayers[i].output[seqLen] 96 | dec.lstmLayers[i].userPrevCell = enc.lstmLayers[i].cell[seqLen] 97 | else 98 | dec.lstmLayers[i].userPrevOutput = nn.rnn.recursiveCopy(dec.lstmLayers[i].userPrevOutput, enc.lstmLayers[i].outputs[seqLen]) 99 | dec.lstmLayers[i].userPrevCell = nn.rnn.recursiveCopy(dec.lstmLayers[i].userPrevCell, enc.lstmLayers[i].cells[seqLen]) 100 | end 101 | end 102 | end 103 | 104 | --[[ Backward coupling: Copy decoder gradients to encoder LSTM ]]-- 105 | function Seq2Seq:backwardConnect(enc, dec) 106 | for i=1,#enc.lstmLayers do 107 | if self.useSeqLSTM then 108 | enc.lstmLayers[i].userNextGradCell = dec.lstmLayers[i].userGradPrevCell 109 | enc.lstmLayers[i].gradPrevOutput = dec.lstmLayers[i].userGradPrevOutput 110 | else 111 | enc.lstmLayers[i].userNextGradCell = nn.rnn.recursiveCopy(enc.lstmLayers[i].userNextGradCell, dec.lstmLayers[i].userGradPrevCell) 112 | enc.lstmLayers[i].gradPrevOutput = nn.rnn.recursiveCopy(enc.lstmLayers[i].gradPrevOutput, dec.lstmLayers[i].userGradPrevOutput) 113 | end 114 | end 115 | end 116 | 117 | 118 | 119 | 120 | 
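--[[ A note on the coupling above (descriptive only): nn.SeqLSTM keeps its
per-timestep hidden states in `output` and cell states in `cell`, indexed by
time. forwardConnect seeds each decoder layer's userPrevOutput/userPrevCell
with the encoder's state at the last input timestep, so the decoder starts
where the encoder left off. backwardConnect pushes the decoder's gradients
with respect to that seed (userGradPrevOutput/userGradPrevCell) back into the
encoder (gradPrevOutput/userNextGradCell), which is what lets the two
separate nn.Sequentials train together as one seq2seq model. ]]--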
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | function Seq2Seq:train(input, target,targety, learn)
128 | --input, target and targety are just 1d vectors with word ids
129 | local encoderInput = input
130 | local decoderInput = target
131 | local decoderTarget = targety
132 | 
133 | if learn == nil then learn =true end
134 | 
135 | -- Forward pass
136 | local encoderOutput = self.encoder:forward(encoderInput)
137 | self:forwardConnect(self.encoder,self.decoder,encoderInput:size(1))
138 | local decoderOutput = self.decoder:forward(decoderInput)
139 | local Edecoder = self.criterion:forward(decoderOutput, decoderTarget)
140 | 
141 | if Edecoder ~= Edecoder then -- NaN ~= NaN, so exit early on a bad loss
142 | return Edecoder
143 | end
144 | 
145 | 
146 | -- Backward pass
147 | if learn then
148 | local gEdec = self.criterion:backward(decoderOutput, decoderTarget)
149 | self.decoder:backward(decoderInput, gEdec)
150 | self:backwardConnect(self.encoder,self.decoder)
151 | self.encoder:backward(encoderInput, encoderOutput:clone():zero()) -- no gradient flows into the encoder output except through the coupling
152 | 
153 | end
154 | 
155 | 
156 | 
157 | self.decoder:forget()
158 | self.encoder:forget()
159 | 
160 | return Edecoder/decoderTarget:size(1)
161 | end
162 | 
163 | function Seq2Seq:update()
164 | 
165 | self.dl_dx:clamp(-self.gradientclipping,self.gradientclipping);
166 | 
167 | if self.momentum then self.encoder:updateGradParameters(self.momentum) end -- momentum is optional
168 | if self.momentum then self.decoder:updateGradParameters(self.momentum) end
169 | self.decoder:updateParameters(self.learningRate)
170 | self.encoder:updateParameters(self.learningRate)
171 | 
172 | self.encoder:zeroGradParameters()
173 | self.decoder:zeroGradParameters()
174 | 
175 | 
176 | --self.decoder:forget()
177 | --self.encoder:forget()
178 | 
179 | end
180 | 
181 | 
182 | function Seq2Seq:trainOptim(minibatch,optimizer)
183 | --each example holds 1d vectors with word ids
184 | 
185 | 
186 | local myseq = self
187 | local Edecoder;
188 | local err;
189 | 
190 | optimizer=optimizer or "adagrad"
191 | 
192 | local feval = function(x_new)
193 | local totalerr = 0
194 | local totaln = 0
195 | 
196 | for _,example in ipairs(minibatch) do
197 | 
198 | -- Note: includes a fix for an rnn versioning problem:
199 | -- https://github.com/macournoyer/neuralconvo/issues/17
200 | 
201 | local encoderInput = example["input"]
202 | local target = example["target"]
203 | local decoderInput = target
204 | local decoderTarget = example["targetout"]
205 | 
206 | local encoderOutput = myseq.encoder:forward(encoderInput)
207 | myseq:forwardConnect(myseq.encoder,myseq.decoder,encoderInput:size(1))
208 | local decoderOutput = myseq.decoder:forward(decoderInput)
209 | -- print(decoderOutput )
210 | -- print(decoderTarget)
211 | --io.read()
212 | 
213 | local thiserr=myseq.criterion:forward(decoderOutput, decoderTarget)
214 | local nonzeroinputs = example["nonzeroTargets"] -- (currently unused)
215 | 
216 | 
217 | totalerr = totalerr+thiserr
218 | totaln=totaln+decoderTarget:size(1)
219 | 
220 | 
221 | 
222 | local gEdec = myseq.criterion:backward(decoderOutput, decoderTarget)
223 | myseq.decoder:backward(decoderInput, gEdec)
224 | 
225 | myseq:backwardConnect(myseq.encoder,myseq.decoder)
226 | 
227 | myseq.encoder:backward(encoderInput, encoderOutput:clone():zero()) -- zero gradient of the same type/size as the output (works on CPU and GPU)
228 | 
229 | 
230 | myseq.decoder:forget()
231 | myseq.encoder:forget()
232 | 
233 | end
234 | if totaln==0 then err=0
235 | else err=totalerr/totaln
236 | end
237 | myseq.dl_dx:clamp(-self.gradientclipping,self.gradientclipping);
238 | return err, myseq.dl_dx
239 | 
240 | end
241 | 
242 | 
243 | 
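-- feval above follows the standard torch/optim closure contract: given a
-- parameter vector it returns (loss, gradient). x_new can be ignored here
-- because self.x is the very tensor the optimizer updates in place, and
-- self.optimState carries the per-weight history (e.g. the adagrad
-- accumulator) between calls, so it must be the same table on every update.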
244 | if(optimizer=="adagrad") then _,err=optim.adagrad(feval,self.x ,self.optimState)
245 | elseif (optimizer=="rmsprop") then _,err=optim.rmsprop(feval,self.x ,self.optimState)
246 | end
247 | self.encoder:zeroGradParameters()
248 | self.decoder:zeroGradParameters()
249 | 
250 | 
251 | self.decoder:forget()
252 | self.encoder:forget()
253 | 
254 | return err[1]
255 | end
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | local MAX_OUTPUT_SIZE = 20
267 | 
268 | function Seq2Seq:eval(input)
269 | assert(self.goToken, "No goToken specified")
270 | assert(self.eosToken, "No eosToken specified")
271 | 
272 | 
273 | self.encoder:forward(input)
274 | 
275 | 
276 | local predictions = {}
277 | local probabilities = {}
278 | 
279 | -- Feed the decoder its own output back in, one token at a time, starting from the go token
280 | local output = {self.goToken}
281 | 
282 | for i = 1, MAX_OUTPUT_SIZE do
283 | --wondering if we really need to forward connect before each run because we are
284 | --kind of starting over each run here
285 | self:forwardConnect(self.encoder,self.decoder,input:size(1))
286 | --#output selects the last prediction of the chain
287 | local prediction = self.decoder:forward(torch.Tensor({output}):t())[#output]
288 | --print(prediction)
289 | -- prediction holds the log-probability for each word ID
290 | -- (the index into the vector is the word ID).
291 | -- topk over dimension 2 (the vocabulary), sorted descending
292 | local prob, wordIds = prediction:topk(5, 2, true, true)
293 | 
294 | -- First one is the most likely.
295 | local next_output = wordIds[1][1]
296 | --use second guess if unk token
297 | if next_output==self.unknownToken then next_output = wordIds[1][2] end
298 | --print(wordIds)
299 | --print(next_output)
300 | --io.read()
301 | table.insert(output, next_output)
302 | 
303 | -- Terminate on EOS token
304 | if next_output == self.eosToken then
305 | break
306 | end
307 | 
308 | table.insert(predictions, wordIds)
309 | table.insert(probabilities, prob)
310 | end
311 | 
312 | self.decoder:forget()
313 | self.encoder:forget()
314 | self.encoder:zeroGradParameters()
315 | self.decoder:zeroGradParameters()
316 | self.decoder:training()
317 | self.encoder:training()
318 | 
319 | return output,predictions, probabilities
320 | end
321 | 
322 | 
323 | function Seq2Seq:evalBeam(input,beamsize)
324 | --run encoder
325 | self.encoder:forward(input)
326 | local beams = {}
327 | -- Seed the search with a single beam containing just the go token
328 | local beam1 = {};
329 | beam1.currentOutput={self.goToken}
330 | beam1.length=1;
331 | beam1.finished=false
332 | beam1.prob=1;
333 | beam1.problist = {}
334 | table.insert(beam1.problist,1)
335 | table.insert(beams,beam1)
336 | 
337 | local n=1
338 | 
339 | while(n<25) do -- expand for at most 25 steps
340 | local newbeams = {}
341 | --print(beams)
342 | for _, beam in pairs(beams) do
343 | io.write('.')
344 | 
345 | if(beam.finished==false) then
346 | local nb=self:runOneBeam(beam,beamsize,input:size(1))
347 | 
348 | for _,nbi in pairs(nb) do
349 | newbeams[nbi.prob]=nbi;
350 | end
351 | else
352 | newbeams[beam.prob]=beam;
353 | end
354 | 
355 | 
356 | 
357 | end
358 | --print("Full Beams")
359 | --print(newbeams)
360 | beams= self:shrinkBeam(newbeams,beamsize)
361 | --print("Shrunken Beams")
362 | --print(beams)
363 | --io.read()
364 | n=n+1
365 | 
366 | 
367 | end
368 | 
369 | 
370 | 
371 | 
372 | 
373 | self.decoder:forget()
374 | self.encoder:forget()
375 | self.encoder:zeroGradParameters()
376 | self.decoder:zeroGradParameters()
377 | self.decoder:training()
378 | self.encoder:training()
379 | 
380 | return beams
381 | 
382 | end
383 | 
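-- Each beam above is a plain Lua table: currentOutput is the token-id
-- sequence so far (starting with the go token), prob is the product of the
-- per-step probabilities (so longer beams naturally score lower), problist
-- records each step's probability so a per-word mean can also be reported,
-- and finished flips to true once the eos token is emitted. For example,
-- after two expansion steps a beam might look like:
--   { currentOutput={goToken, 42, 7}, prob=0.18,
--     problist={1, 0.6, 0.3}, length=3, finished=false }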
384 | --returns the top beamsize child beams expanded from this beam
385 | --inputsize tells forwardConnect which encoder timestep to couple from
386 | function Seq2Seq:runOneBeam(beam,beamsize,inputsize)
387 | --currentOutput is just a table of word ids
388 | local output = beam.currentOutput
389 | self:forwardConnect(self.encoder,self.decoder,inputsize)
390 | local prediction = self.decoder:forward(torch.Tensor({output}):t())[#output]
391 | local probs, wordIds = prediction:topk(beamsize, 2, true, true)
392 | local beams = {}
393 | 
394 | for i =1, beamsize do
395 | local newbeam = {};
396 | newbeam.length = beam.length+1
397 | newbeam.finished=false
398 | local newoutputs = {}
399 | local next_output = wordIds[1][i]
400 | local next_output_prob = torch.exp(probs[1][i])
401 | newbeam.prob = beam.prob*next_output_prob
402 | --store a list of all of the probabilities
403 | newbeam.problist = {}
404 | for k,v in ipairs(beam.problist) do
405 | table.insert(newbeam.problist, v)
406 | end
407 | table.insert(newbeam.problist, next_output_prob)
408 | 
409 | 
410 | 
411 | for k,v in ipairs(output) do
412 | table.insert(newoutputs, v)
413 | end
414 | 
415 | table.insert(newoutputs, next_output)
416 | newbeam.currentOutput=newoutputs;
417 | 
418 | --make the score zero if there is an unknown
419 | if next_output==self.unknownToken then newbeam.prob=0 end
420 | if next_output==self.eosToken then newbeam.finished=true end
421 | 
422 | table.insert(beams,newbeam);
423 | 
424 | 
425 | end
426 | 
427 | return beams;
428 | 
429 | 
430 | end
431 | 
432 | function Seq2Seq:shrinkBeam(beams,beamsize)
433 | -- beams is keyed by probability; keep only the top beamsize entries
434 | local i = 1
435 | local shrunkenBeam = {}
436 | 
437 | 
438 | for score,beam in self:pairsByKeys(beams,function(a, b) return a > b end) do
439 | shrunkenBeam[score]=beam;
440 | 
441 | if(i==beamsize)then return shrunkenBeam end
442 | i=i+1
443 | end
444 | return shrunkenBeam
445 | 
446 | end
447 | 
448 | 
449 | function Seq2Seq:pairsByKeys(t,f)
450 | -- returns an iterator over t sorted by key using comparator f
451 | local a = {}
452 | for n in pairs(t) do table.insert(a, n) end
453 | table.sort(a, f)
454 | local i = 0 -- iterator variable
455 | local iter = function () -- iterator function
456 | i = i + 1
457 | if a[i] == nil then return nil
458 | else return a[i], t[a[i]]
459 | end
460 | end
461 | return iter
462 | end

--------------------------------------------------------------------------------
/tokenizer.lua:
--------------------------------------------------------------------------------
1 | local lexer = require "pl.lexer"
2 | local yield = coroutine.yield
3 | local M = {}
4 | -- (the yield helpers below are leftovers from an earlier pl.lexer-based tokenizer; M.tokenize does not use them)
5 | local function word(token)
6 | return yield("word", token)
7 | end
8 | 
9 | local function quote(token)
10 | return yield("quote", token)
11 | end
12 | 
13 | local function space(token)
14 | return yield("space", token)
15 | end
16 | 
17 | local function tag(token)
18 | return yield("tag", token)
19 | end
20 | 
21 | local function punct(token)
22 | return yield("punct", token)
23 | end
24 | 
25 | local function endpunct(token)
26 | return yield("endpunct", token)
27 | end
28 | 
29 | local function unknown(token)
30 | return yield("unknown", token)
31 | end
32 | 
33 | function M.tokenize(text)
34 | --make sure there are spaces around certain characters so that we predict them as individual units
35 | local newtext =text
36 | newtext=newtext:lower()
37 | newtext = newtext:gsub("'", "")
38 | newtext = newtext:gsub('-', " ")
39 | newtext = newtext:gsub(',',' , ')
40 | newtext = newtext:gsub('%.',' . ')
41 | newtext = newtext:gsub('%:',' : ')
42 | newtext = newtext:gsub('%;',' ; ')
43 | newtext = newtext:gsub('%?',' ? ')
44 | newtext = newtext:gsub('%!',' ! ')
45 | newtext = newtext:gsub('\n',' \n ')
46 | local matchstring = "([^%s]+)"
47 | local words = newtext:gmatch(matchstring)
48 | return words
49 | 
50 | 
51 | 
52 | 
53 | end
54 | 
55 | 
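-- For example:
--   for w in M.tokenize("Hi, there!") do io.write(w, " ") end
-- prints: hi , there !
-- (punctuation comes out as separate tokens, and apostrophes are dropped)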
56 | function M.join(words)
57 | local s = table.concat(words, " ")
58 | s = s:gsub("^%l", string.upper)
59 | s = s:gsub(" (') ", "%1")
60 | s = s:gsub(" ([,:;%-%.%?!])", "%1")
61 | 
62 | return s
63 | end
64 | 
65 | return M

--------------------------------------------------------------------------------
/trainoptim.lua:
--------------------------------------------------------------------------------
1 | require 'neuralconvo'
2 | require 'xlua'
3 | require 'util.ModelTracker'
4 | require 'nn'
5 | require 'util.Tester'
6 | 
7 | local WordSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
8 | 
9 | torch.setheaptracking(true)
10 | 
11 | cmd = torch.CmdLine()
12 | cmd:text('Options:')
13 | cmd:option('--dataDir', 'data/opensubssmall/', 'data directory containing input.txt')
14 | cmd:option('--vocabSize', 15000, 'Vocab Size')
15 | cmd:option('--cuda', false, 'use CUDA')
16 | cmd:option('--hiddenSize', 1000, 'number of hidden units in LSTM')
17 | cmd:option('--nlayers', 2, 'Number of Layers')
18 | cmd:option('--learningRate', 0.01, 'learning rate at t=0')
19 | --cmd:option('--momentum', 0.9, 'momentum')
20 | cmd:option('--minLR', 0.00001, 'minimum learning rate')
21 | cmd:option('--saturateEpoch', 20, 'epoch at which linear decayed LR will reach minLR')
22 | cmd:option('--maxEpoch', 10, 'maximum number of epochs to run')
23 | cmd:option('--batchSize', 1, 'minibatch size')
24 | cmd:option('--seqLength',50,'Max Sequence Length');
25 | cmd:option('-seq_length_in',25,'length of sequence input')
26 | cmd:option('-seq_length_out',25,'length of sequence output')
27 | 
28 | 
29 | --Mike Additions
30 | 
31 | cmd:option('--grad_clip',5,'clip gradients at this value ')
32 | cmd:option('--track',0,'Use ModelTracker')
33 | cmd:option('--supermodelid',30627892,'Modeltracking- Supermodel ID')
34 | cmd:option('--rmsprop', false, 'use RMSProp')
35 | cmd:text()
36 | options = cmd:parse(arg)
37 | 
38 | if options.dataset == 0 then
39 | options.dataset = nil
40 | end
41 | 
42 | 
43 | --for modeltracking online
44 | local crossid=-99
45 | if(options.track==1) then
46 | local desc = ""
47 | for k, v in pairs( options ) do desc = desc..k..": "..tostring(v).." " end
48 | 
49 | local sm=ModelTracker.createSubmodel({["name"]="Neuraltalk lr:"..options.learningRate.." ",["description"]=desc,["supermodelid"]=options.supermodelid})
50 | local cross= ModelTracker.createCross({["name"]="Main",["description"]="Main Cross",["submodelid"]=sm.submodelid})
51 | crossid=cross.crossid
52 | end
53 | 
54 | 
55 | 
56 | 
57 | 
58 | -- Data
59 | print("-- Loading dataset")
60 | --[[
61 | dataset = neuralconvo.DataSet(neuralconvo.OpensubsDialogs("data/opensubs"),
62 | {
63 | loadFirst = options.dataset,
64 | minWordFreq = options.minWordFreq
65 | })
66 | ]]--
67 | local loader = WordSplitLMMinibatchLoader.create(options.dataDir, options.batchSize, options.seqLength, {.945, .0001, .05} ,options.vocabSize)
68 | 
69 | 
70 | -- Model
71 | model = neuralconvo.Seq2Seq(loader.vocab_size, options.hiddenSize,options.grad_clip,options.nlayers)
72 | model.goToken = loader.goToken
73 | model.eosToken = loader.eosToken
74 | model.unknownToken = loader.unknownToken
75 | 
76 | -- Training parameters
77 | model.criterion = nn.SequencerCriterion(nn.ClassNLLCriterion()) -- note: this replaces the mask-zero criterion built in Seq2Seq:buildModel()
78 | model.learningRate = options.learningRate
79 | --model.momentum = options.momentum
80 | local decayFactor = (options.minLR - options.learningRate) / options.saturateEpoch
81 | local minMeanError = nil
82 | -- with the defaults: (0.00001 - 0.01) / 20 ≈ -0.0005 per epoch, so the LR reaches minLR at the saturate epoch
83 | model.optimState.learningRate=options.learningRate
84 | 
85 | print('Loading Model')
86 | -- Enable CUDA
87 | if options.cuda then
88 | require 'cutorch'
89 | require 'cunn'
90 | model:cuda()
91 | end
92 | 
93 | 
94 | 
95 | 
96 | 
97 | 
98 | 
99 | function runValidationSet()
100 | local n = loader.nval
101 | local splitIndex = 2
102 | local testerr= torch.Tensor(n):fill(0)
103 | for i = 1,n do
104 | 
105 | local inputx,outputx,outputy=loader:getBatch(splitIndex ,i,options.seq_length_in,options.seq_length_out)
106 | if(inputx:nDimension()~=0 and outputx:nDimension()~=0 and outputy:nDimension()~=0 and outputx:nonzero():nDimension()~=0) then
107 | if options.cuda then
108 | inputx = inputx:cuda()
109 | outputx = outputx:cuda()
110 | outputy = outputy:cuda()
111 | 
112 | end
113 | 
114 | -- forward-only pass (learn=false) to get the validation loss
115 | local err = model:train(inputx, outputx,outputy,false)
116 | 
117 | 
118 | testerr[i]=err
119 | end
120 | xlua.progress(i, n)
121 | end
122 | 
123 | print("Validation Error: "..testerr:mean())
124 | collectgarbage()
125 | return testerr:mean()
126 | 
127 | 
128 | end
129 | 
130 | local reportEvery = 1000
131 | local testEvery=20000
132 | local first=true
133 | 
134 | -- Run the experiment
135 | local totalcount=1
136 | for epoch = 1, options.maxEpoch do
137 | print("\n-- Epoch " .. epoch .. " / " .. options.maxEpoch)
138 | print("")
139 | 
140 | --shuffle training batches
141 | loader.train = WordSplitLMMinibatchLoader.shuffleTensorRows(loader.train)
142 | 
143 | 
144 | local errorssmall = torch.Tensor(reportEvery):fill(0)
145 | local timer = torch.Timer()
146 | 
147 | 
148 | local j= 1
149 | 
150 | for i = 1, loader.ntrain do
151 | if(i%200==0) then collectgarbage() end
152 | local inputx,outputx,outputy =loader:getBatch(1 ,i,options.seq_length_in,options.seq_length_out)
153 | 
154 | local encInSeq = inputx;
155 | local decInSeq= outputx;
156 | local decOutSeq=outputy;
157 | 
158 | --[[
159 | print('EncIn')
160 | print(encInSeq)
161 | print(tensor2sent(encInSeq,loader))
162 | print('DecIn')
163 | print(tensor2sent(decInSeq,loader))
164 | print('DecOut')
165 | print(tensor2sent(decOutSeq,loader))
166 | 
167 | io.read()
168 | ]]--
169 | 
170 | if(inputx:nDimension()~=0 and outputx:nDimension()~=0 and outputy:nDimension()~=0 and outputx:nonzero():nDimension()~=0 ) then
171 | --have to do this before cuda'ing
172 | local nonzerot = decInSeq:nonzero():size(1)
173 | if options.cuda then -- ship the input arrays to GPU
174 | -- have to convert to float because integers can't be cuda()'d
175 | encInSeq = encInSeq:float():cuda()
176 | decInSeq = decInSeq:float():cuda()
177 | decOutSeq = decOutSeq:float():cuda()
178 | end
179 | local minibatch={}
180 | table.insert(minibatch,{input=encInSeq,target=decInSeq,targetout=decOutSeq,nonzeroTargets=nonzerot})
181 | 
182 | 
183 | local err
184 | if options.rmsprop then
185 | err= model:trainOptim(minibatch, "rmsprop")
186 | if(first) then
187 | print("Using RMSProp")
188 | first=false
189 | end
190 | 
191 | else err= model:trainOptim(minibatch, "adagrad")
192 | end
193 | 
194 | errorssmall[j]=err
195 | j=j+1
196 | end
197 | if j == reportEvery then
198 | 
199 | print(string.format("Error = %.3f", (errorssmall:mean()) )..string.format(" Progress = %d", totalcount) )
200 | if(options.track==1) then
201 | pcall(ModelTracker.sendStatistic, {["category"]="Next",["name"]="Loss",["group"]="train",["n"]=totalcount,["crossid"]=crossid,["value"]=errorssmall:mean()}) -- pass fn and arg separately so pcall can catch errors
202 | end
203 | errorssmall = torch.Tensor(reportEvery):fill(0)
204 | j=1
205 | end
206 | 
207 | xlua.progress(i, loader.ntrain)
208 | 
209 | --test set
210 | if i % testEvery==0 then
211 | local meanerr -- declared here so the tracking call below can see it
212 | if loader.nval>0 then
213 | meanerr=runValidationSet()
214 | end
215 | if(options.track==1 and meanerr~=nil) then
216 | pcall(ModelTracker.sendStatistic, {["category"]="Next",["name"]="Loss",["group"]="test",["n"]=totalcount,["crossid"]=crossid,["value"]=meanerr})
217 | end
218 | print("Hi : ".. getResponse("Hi",loader,model))
219 | print("What is your name : ".. getResponse("What is your name",loader,model))
220 | print("How old are you : ".. getResponse("How old are you ",loader,model))
221 | print("What is the meaning of life : ".. getResponse("What is the meaning of life ",loader,model))
222 | 
223 | end
224 | 
225 | if i% 1000 ==0 then
226 | print("Hi : ".. getResponse("Hi",loader,model))
227 | print("What is your name : ".. getResponse("What is your name",loader,model))
228 | print("How old are you : ".. getResponse("How old are you ",loader,model))
229 | 
230 | end
231 | 
232 | 
233 | if(totalcount % 1000000==0) then
234 | print("\n(Saving model ...)")
235 | torch.save("data/model.t7", model)
236 | 
237 | end
238 | 
239 | if(totalcount%100000==0 and options.track==1)then
240 | 
241 | 
242 | local report=""
243 | report=report.."<br><br>Hi : "..getResponse("Hi",loader,model).."<br><br>"

244 | report=report.."<br><br>What is your name : "..getResponse("What is your name ",loader,model).."<br><br>"
245 | report=report.."<br><br>How old are you : "..getResponse("How old are you ",loader,model).."<br><br>"
246 | report=report.."<br><br>What is the meaning of life : "..getResponse("What is the meaning of life",loader,model).."<br><br>"
247 | report=report.."<br><br>Do you like swimming : "..getResponse("Do you like swimming ",loader,model).."<br><br>"
248 | report=report.."<br><br>It's been a long day : "..getResponse("It's been a long day ",loader,model).."<br><br>"
249 | report=report.."<br><br>goodbye : "..getResponse("goodbye ",loader,model).."<br><br>"
250 | 
251 | ModelTracker.sendReport({["reportname"]="Dialogue At Epoch: "..epoch.." Iteration: "..i,["parentid"]=crossid,["report"]=report})
252 | 
253 | 
254 | end
255 | 
256 | 
257 | 
258 | 
259 | 
260 | totalcount=totalcount+1
261 | 
262 | 
263 | end
264 | 
265 | timer:stop()
266 | 
267 | 
268 | print("\nEpoch stats:")
269 | print(string.format("Epoch time: %.2f seconds", timer:time().real))
270 | -- Save the model if it improved.
271 | --if minMeanError == nil or errors:mean() < minMeanError then
272 | -- print("\n(Saving model ...)")
273 | -- torch.save("data/model.t7", model)
274 | -- minMeanError = errors:mean()
275 | --end
276 | print("\n(Saving model ...)")
277 | torch.save("data/model.t7", model)
278 | 
279 | 
280 | model.learningRate = model.learningRate + decayFactor
281 | model.learningRate = math.max(options.minLR, model.learningRate)
282 | end
283 | 
284 | 
285 | -- Load testing script
286 | require "eval"

--------------------------------------------------------------------------------
/util/ModelTracker.lua:
--------------------------------------------------------------------------------
1 | ModelTracker = {}
2 | 
3 | --(from mtanana) this is my own custom model tracker; I'd be happy to open source it if someone wants it
4 | 
5 | --table is the lua table that will become json and funct is the function id
6 | function ModelTracker.sendJsonObj(table,funct)
7 | 
8 | local endpoint = "http://camber:8080/modeltracker/tracking.jsp"
9 | -- load required modules
10 | local http = require("socket.http") --luasocket
11 | local ltn12 = require("ltn12")
12 | local mime = require("mime")
13 | local io = require("io")
14 | local json = require("json") -- luajson
15 | local url = require("socket.url")
16 | 
17 | -- Create a Lua table to represent our entity to save
18 | --- This is from our doc REST example: http://docs.kinvey.com/rest-appdata.html
19 | --jamesBond = { ["firstName"] = "James", ["lastName"] = "Bond", ["email"] = "james.bond@mi6.gov.uk", ["age"] = 34 }
20 | 
21 | -- Save the table to the backend
22 | --- convert to json
23 | local jsstr = url.escape(json.encode(table))
24 | 
25 | --- build a http request
26 | local request = endpoint.."?function="..funct.."&jsonobj="..jsstr
27 | 
28 | local response_body = { }
29 | --- send the request
30 | ok, code, headers = http.request{url = request, method = "POST", sink = ltn12.sink.table(response_body)}
31 | 
32 | --- show that we got a valid response
33 | -- print(code) -- should be 201 for POST success
34 | saveditem = response_body[1]; -- kinvey appdata responses return arrays (which are tables in Lua)
35 | --print(saveditem)
36 | 
37 | --- convert from json to lua object
38 | objAsTable = json.decode(saveditem)
39 | return objAsTable
40 | end
41 | 
42 | 
43 | 
44 | --CODE FOR BUILDING A NEW SUPERMODEL:
45 | 
46 | --require 'util.ModelTracker'
47 | --id=ModelTracker.createSupermodel("New SM","a testing sm");
48 | 
49 | --{["name"]="NewCross",["description"]="newcross",["submodelid"]=-99}
50 | function ModelTracker.createCross(newcross)
51 | return ModelTracker.sendJsonObj(newcross,"1001")
52 | end
53 | --{["name"]="NewSM",["description"]="newsm",["supermodelid"]=-99}
54 | function ModelTracker.createSubmodel(newsubmodel)
55 | return ModelTracker.sendJsonObj(newsubmodel,"1002")
56 | end
57 | --{["modelname"]="NewSM",["description"]="newsm"}
58 | function ModelTracker.createSupermodel(name,description)
59 | local newsupermodel = {["modelname"]=name,["modeldescription"]=description}
60 | return ModelTracker.sendJsonObj(newsupermodel,"1003")
61 | end
62 | 
63 | --{["category"]="changetalk",["group"]="train",["n"]=200,["crossid"]=-99,["value"]=3.45}
64 | function ModelTracker.sendStatistic(statistic)
65 | return ModelTracker.sendJsonObj(statistic,"1000")
66 | end
67 | --{["reportname"]="",["parentid"]=crossid,["report"]="The text of the report"}
68 | function ModelTracker.sendReport(report)
69 | return ModelTracker.sendJsonObj(report,"1010")
70 | end

--------------------------------------------------------------------------------
/util/Tester.lua:
--------------------------------------------------------------------------------
1 | --
2 | -- Created by IntelliJ IDEA.
3 | -- User: user
4 | -- Date: 3/13/2016
5 | -- Time: 2:55 PM
6 | -- To change this template use File | Settings | File Templates.
7 | --
8 | 
9 | local tokenizer = require "tokenizer"
10 | local list = require "pl.List"
11 | 
12 | 
13 | -- Word IDs to sentence
14 | function pred2sent(wordIds,dataset)
15 | local words = {}
16 | 
17 | 
18 | for _, wordId in ipairs(wordIds) do
19 | local id = wordId
20 | if id ~= 0 and id~=dataset.goToken and id~=dataset.eosToken then
21 | local word = dataset.id2word[id]
22 | table.insert(words, word)
23 | end
24 | end
25 | --print(words)
26 | return tokenizer.join(words)
27 | end
28 | 
29 | function tensor2sent(wordIds, dataset)
30 | local words = {}
31 | 
32 | 
33 | for i=1,wordIds:size(1) do
34 | local id = wordIds[i][1]
35 | if id ~= 0 then
36 | local word = dataset.id2word[id]
37 | table.insert(words, word)
38 | end
39 | end
40 | 
41 | return tokenizer.join(words)
42 | end
43 | 
44 | function printmytable(t)
45 | for i,v in ipairs(t) do
46 | print(v)
47 | end
48 | 
49 | end
50 | 
51 | --word ids and probabilities are both tables of the length of the final output
52 | 
53 | function printProbabilityTable(wordIds, predictions,probabilities, num,dataset)
54 | print(string.rep("-", num * 22))
55 | -- printmytable(wordIds)
56 | -- printmytable(probabilities)
57 | --p is the final output word id
58 | 
59 | 
60 | for p, probs in ipairs(probabilities) do
61 | --print(p)
62 | local line = "| "
63 | wordId = wordIds[p]
64 | local probs = probabilities[p];
65 | local preds = predictions[p];
66 | 
67 | for i = 1, num do
68 | 
69 | local pr = torch.exp(probs[1][i])
70 | --print(wordId)
71 | local w = preds[1][i]
72 | local word = dataset.id2word[w]
73 | -- print(word)
74 | -- local t = probabilities[1][p]
75 | -- print("prob.."..t)
76 | -- print("wordid.."..wordId[1][i])
77 | line = line .. string.format("%-10s(%4d%%)", word, pr * 100) .. 
" | " 78 | end 79 | print(line) 80 | end 81 | 82 | print(string.rep("-", num * 22)) 83 | end 84 | 85 | function getResponse(text,dataset,model,debug) 86 | debug = debug or false 87 | local wordIds = {} 88 | 89 | for word in tokenizer.tokenize(text) do 90 | local id = dataset.word2id[word] or dataset.unknownToken 91 | table.insert(wordIds, id) 92 | end 93 | 94 | local input = torch.Tensor({wordIds}):t() 95 | 96 | --predictions is a table of tensors of word ids 97 | --probabilities are the matching probs (well...log activations) 98 | local output,predictions, probabilities = model:eval(input) 99 | --print("Predictions") 100 | --print(predictions) 101 | --print(probabilities) 102 | local phrase = pred2sent(output,dataset) 103 | 104 | if debug then 105 | printProbabilityTable(output, predictions,probabilities, 4,dataset) 106 | end 107 | phrase = phrase or '' 108 | return phrase 109 | 110 | end 111 | 112 | function getResponseBeam(text,dataset,model,debug,beamsize) 113 | debug = debug or false 114 | local wordIds = {} 115 | 116 | for word in tokenizer.tokenize(text) do 117 | local id = dataset.word2id[word] or dataset.unknownToken 118 | table.insert(wordIds, id) 119 | end 120 | 121 | local input = torch.Tensor({wordIds}):t() 122 | 123 | --predictions is a table of tensors of word ids 124 | --probabilities are the matching probs (well...log activations) 125 | local beams = model:evalBeam(input,beamsize) 126 | --print("Predictions") 127 | --print(predictions) 128 | --print(probabilities) 129 | local phrase = '\n' 130 | for score,beam in model:pairsByKeys(beams,function(a, b) return a > b end) do 131 | 132 | local scoretensor = torch.Tensor(beam.problist) 133 | local meanscore = torch.mean(scoretensor) 134 | local sent = pred2sent(beam.currentOutput,dataset) 135 | local sscore = string.format("%.4f",score) 136 | local sscore2 = string.format("%.4f",meanscore) 137 | phrase = phrase..sscore..', '..sscore2..': ' ..sent.. 
'\n'
138 | end
139 | 
140 | 
141 | 
142 | 
143 | if debug then
144 | --printProbabilityTable(output, predictions,probabilities, 4,dataset)
145 | end
146 | phrase = phrase or ''
147 | return phrase
148 | 
149 | end
150 | 
151 | 
152 | 

--------------------------------------------------------------------------------
/util/WordSplitLMMinibatchLoader.lua:
--------------------------------------------------------------------------------
1 | 
2 | --Modified by Mike Tanana from Andrej Karpathy and Wojciech Zaremba
3 | --Changed to support word models and Seq2Seq
4 | 
5 | --input comes in the form of a file where each line has a speaker and response
6 | --in the form: speakerone utterance | speaker two response
7 | 
8 | 
9 | 
10 | local WordSplitLMMinibatchLoader = {}
11 | WordSplitLMMinibatchLoader.__index = WordSplitLMMinibatchLoader
12 | 
13 | WordSplitLMMinibatchLoader.tokenizer = require "tokenizer"
14 | 
15 | 
16 | 
17 | function WordSplitLMMinibatchLoader.shuffle(t)
18 | local n = #t
19 | while n >= 2 do
20 | local k = math.random(n)
21 | t[n], t[k] = t[k], t[n]
22 | n = n - 1
23 | end
24 | return t
25 | end
26 | 
27 | function WordSplitLMMinibatchLoader.createFromJustVocab(vocabfile)
28 | local self = {}
29 | setmetatable(self, WordSplitLMMinibatchLoader)
30 | self:loadExistingVocabFile(vocabfile)
31 | 
32 | return self
33 | 
34 | end
35 | 
36 | function WordSplitLMMinibatchLoader:loadExistingVocabFile(vocabfilename)
37 | 
38 | --this is word to index
39 | self.vocab_mapping = torch.load(vocabfilename)
40 | self.id2word = {}
41 | self.word2id=self.vocab_mapping
42 | 
43 | --count vocab and make reverse mapping
44 | self.vocab_size = 0
45 | for word,idx in pairs(self.vocab_mapping) do
46 | self.vocab_size = self.vocab_size + 1
47 | self.id2word[idx] = word
48 | end
49 | print('Vocab Size: '..self.vocab_size)
50 | 
51 | 
52 | -- special-token strings; these must match the ones written by text_to_tensor below
53 | self.goToken = self.vocab_mapping['<go>']
54 | self.eosToken = self.vocab_mapping['<eos>']
55 | self.unknownToken = self.vocab_mapping['<unknown>']
56 | 
57 | 
58 | end
59 | 
60 | function WordSplitLMMinibatchLoader.create(data_dir, batch_size,seq_length, split_fractions,vocabsize)
61 | -- split_fractions is e.g. {0.9, 0.05, 0.05}
62 | 
63 | local self = {}
64 | setmetatable(self, WordSplitLMMinibatchLoader)
65 | 
66 | local input_file = path.join(data_dir, 'input.txt')
67 | local vocab_file = path.join(data_dir, 'vocabwords.t7')
68 | local tensor_file = path.join(data_dir, 'datawords.t7')
69 | 
70 | -- fetch file attributes to determine if we need to rerun preprocessing
71 | local run_prepro = false
72 | if not (path.exists(vocab_file) or path.exists(tensor_file)) then
73 | -- prepro files do not exist, generate them
74 | print('vocabwords.t7 and datawords.t7 do not exist. Running preprocessing...')
75 | run_prepro = true
76 | else
77 | -- check if the input file was modified since last time we
78 | -- ran the prepro. if so, we have to rerun the preprocessing
79 | local input_attr = lfs.attributes(input_file)
80 | local vocab_attr = lfs.attributes(vocab_file)
81 | local tensor_attr = lfs.attributes(tensor_file)
82 | if input_attr.modification > vocab_attr.modification or input_attr.modification > tensor_attr.modification then
83 | print('vocabwords.t7 or datawords.t7 detected as stale. Re-running preprocessing...')
84 | run_prepro = true
85 | end
86 | end
87 | if run_prepro then
88 | -- construct a tensor with all the data, and vocab file
89 | print('one-time setup: preprocessing input text file ' .. input_file .. '...')
90 | self:text_to_tensor(input_file, vocab_file, tensor_file,vocabsize,split_fractions)
91 | end
92 | 
93 | print('loading data files...')
94 | --in this file rows are dialogue pairs: first half is speaker1 second half is speaker 2
95 | --the data should always store at least one more than you are going to predict (otherwise the final step will be incorrect)
96 | local data = torch.load(tensor_file)
97 | self.train = data.train
98 | self.val =data.val
99 | self.test = data.test
100 | 
101 | self:loadExistingVocabFile(vocab_file)
102 | 
103 | --shuffle rows (shuffleTensorRows returns a new tensor, so assign the result)
104 | self.train = WordSplitLMMinibatchLoader.shuffleTensorRows(self.train)
105 | 
106 | 
107 | 
108 | -- divide data to train/val and allocate rest to test
109 | self.ntrain = math.floor(self.train:size(1)/batch_size )-1
110 | 
111 | if self.val ~=nil then
112 | self.nval = math.floor(self.val:size(1)/batch_size)-1
113 | else
114 | self.nval=0
115 | end
116 | if self.test ~=nil then
117 | self.ntest = math.floor(self.test:size(1)/batch_size)-1
118 | else
119 | self.ntest=0
120 | end
121 | 
122 | self.batch_size = batch_size
123 | print ('Val Size: ' .. self.nval)
124 | 
125 | self.split_sizes = {self.ntrain, self.nval, self.ntest}
126 | self.batch_ix = {0,0,0 }
127 | 
128 | 
129 | 
130 | print(string.format('data load done. Number of data batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest))
131 | collectgarbage()
132 | 
133 | --self:writeTxtFile(1,self.ntrain,self.vocab_mapping,"train.txt");
134 | --self:writeTxtFile(2,self.nval,self.vocab_mapping,"test.txt");
135 | 
136 | 
137 | 
138 | 
139 | 
140 | return self
141 | end
142 | 
143 | function WordSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index)
144 | batch_index = batch_index or 0
145 | self.batch_ix[split_index] = batch_index
146 | end
147 | 
148 | function WordSplitLMMinibatchLoader:writeTxtFile(split_index,n,vocab,filename)
149 | print("Saving data "..filename)
150 | --get the numerically indexed vocab table
151 | local ivocab = {}
152 | for c,i in pairs(vocab) do ivocab[i] = c end
153 | 
154 | 
155 | local file = io.open(filename, "a")
156 | 
157 | 
158 | for key= 1,n do
159 | local inx,outx,outy=self:getBatch(split_index,key,25,25)
160 | self:writeBatch(file,outx,ivocab)
161 | 
162 | end
163 | 
164 | file:close()
165 | 
166 | end
167 | function WordSplitLMMinibatchLoader:writeBatch(file,batch,ivocab)
168 | 
169 | for row =1 , batch:size(1) do
170 | for col=1 , batch:size(2) do
171 | local word = ivocab[batch[row][col]]
172 | if(word==nil) then break end --zero padding has no vocab entry
173 | if(word=="<eos>") then
174 | file:write("<eos>")
175 | else
176 | file:write(word.." ")
177 | end
178 | 
179 | end
180 | file:write("\n")
181 | end
182 | 
183 | end
184 | 
185 | 
186 | 
187 | 
188 | function WordSplitLMMinibatchLoader:getBatch(split_index,batchid,insize,outsize)
189 | local set = {}
190 | if split_index ==1 then set = self.train
191 | elseif split_index==2 then set = self.val
192 | elseif split_index==3 then set = self.test
193 | end
194 | 
195 | 
196 | 
197 | -- pull out the correct next batch
198 | local start = (batchid*self.batch_size)+1
199 | 
200 | 
201 | 
202 | local x = set:narrow(1,start,self.batch_size)
203 | local y = getydataforx(x)
204 | --print(x)
205 | --print(y)
206 | --io.read()
207 | 
208 | --split the two sequences apart
209 | local length = x:size(2)
210 | local size = length/2
211 | local in_start = math.max(1,size-(insize-1))
212 | local in_usesize = math.min(size,insize)
213 | local out_usesize = math.min(size,outsize)
214 | 
215 | local inputx = x:narrow(2,in_start,in_usesize)
216 | --print(x)
217 | --print(in_start.." "..in_usesize)
218 | --print(inputx)
219 | local inputy = y:narrow(2,in_start,in_usesize)
220 | inputx,inputy = self:trimPaddingFromLeft(inputx,inputy)
221 | local outputx = x:narrow(2,size+1,out_usesize)
222 | local outputy = y:narrow(2,size+1,out_usesize):clone()
223 | outputx,outputy = self:trimPaddingFromRight(outputx,outputy)
224 | --need to do this because the criterion can't handle zeros even though they get masked
225 | outputy[outputy:lt(1)]=self.eosToken
226 | return inputx:t(),outputx:t(),outputy:t()
227 | end
228 | --trims to the first non-padding element
229 | --assumes t1 and t2 are the same dimensions
230 | function WordSplitLMMinibatchLoader:trimPaddingFromLeft(t1,t2)
231 | local firstValid = 0
232 | for i = 1, t1:size(2) do
233 | for j = 1, t1:size(1) do
234 | local val = t1[j][i]
235 | local val2 = t2[j][i]
236 | if(val~=0 or val2~=0) then
237 | firstValid=i
238 | break
239 | end
240 | end
241 | if(firstValid > 0) then break end
242 | end
243 | if(firstValid==0) then return t1,t2 end
244 | local length = t1:size(2)+1-firstValid
245 | local newt1 = t1:narrow(2,firstValid,length)
246 | local newt2 = t2:narrow(2,firstValid,length)
247 | 
248 | return newt1,newt2
249 | 
250 | end
251 | --trims off trailing all-zero columns, keeping everything up to the last non-padding element
252 | function WordSplitLMMinibatchLoader:trimPaddingFromRight(t1,t2)
253 | local firstValid = 0
254 | for i = t1:size(2), 1,-1 do
255 | for j = 1, t1:size(1) do
256 | local val = t1[j][i]
257 | local val2 = t2[j][i]
258 | if(val~=0 or val2~=0) then
259 | firstValid=i
260 | break
261 | end
262 | end
263 | if(firstValid > 0) then break end
264 | end
265 | if(firstValid==0) then return t1,t2 end
266 | local length = firstValid
267 | local newt1 = t1:narrow(2,1,length)
268 | local newt2 = t2:narrow(2,1,length)
269 | 
270 | return newt1,newt2
271 | end
272 | 
273 | 
274 | 
275 | 
276 | 
277 | function getydataforx(xdata)
278 | local xt = xdata:t()
279 | local yt = xt:clone() --watch out: a transpose is just a view on the same storage, so clone before shifting
280 | yt:sub(1,-2):copy(xt:sub(2,-1)) --shift everything down one
281 | yt[-1] = xt[1] --make the last item the same as the first (i.e. make sure you don't set a seq length that actually uses this)
282 | local y = yt:t() --put back into cols are seq length and rows are samples
283 | return y
284 | end
285 | 
286 | 
287 | --[[
288 | --
289 | -- deprecated: use WordSplitLMMinibatchLoader.tokenizer.tokenize(t) --
290 | function WordSplitLMMinibatchLoader.preprocess(alltext)
291 | --make sure there are spaces around certain characters so that we predict them as individual units
292 | local newtext
293 | newtext = alltext:gsub(',',' , ')
294 | newtext = newtext:gsub('%.',' . ')
295 | newtext = newtext:gsub('%:',' : ')
296 | newtext = newtext:gsub('%;',' ; ')
297 | newtext = newtext:gsub('%?',' ? ')
298 | newtext = newtext:gsub('%!',' ! ')
299 | newtext = newtext:gsub('\n',' \n ')
300 | 
301 | 
302 | return newtext
303 | end]]--
304 | 
305 | ---Makes sure we split on spaces
306 | ----return nil if we are at the end
307 | function getNextBatchFromFile(torchfile)
308 | --first get main buffer size and create a string
309 | local chars = torchfile:readByte(100000);
310 | if(chars:size()==0) then return nil end
311 | local text = chars:string();
312 | --now keep going until we get a space (or it is the end)
313 | local nospace=true;
314 | local extrachars = "";
315 | while nospace do
316 | local char =torchfile:readByte()
317 | if char==nil or string.char(char)==" " then break end
318 | extrachars=extrachars..string.char(char)
319 | end
320 | text=text..extrachars
321 | --io.write(text)
322 | --return text
323 | return text
324 | 
325 | end
326 | 
327 | function WordSplitLMMinibatchLoader.shuffleTensorRows(t)
328 | --shuffle tensor
329 | local indexes = torch.randperm(t:size(1)):type('torch.LongTensor')
330 | t = t:index(1,indexes)
331 | return t
332 | 
333 | end
334 | ---Makes sure we split on new lines
335 | ----return nil if we are at the end
336 | function WordSplitLMMinibatchLoader.getNextBatchFromFileStandard(file)
337 | --first get main buffer size and create a string
338 | local block= file:read(1000000);
339 | if not block then return nil end
340 | local text = block;
341 | --now keep going until we hit a newline (or the end)
342 | local nospace=true;
343 | local extrachars = "";
344 | while nospace do
345 | local char =file:read(1)
346 | if char==nil or char=="\n" or char=="\r" then break end
347 | extrachars=extrachars..char
348 | end
349 | text=text..extrachars
350 | --io.write(text)
351 | --return text
352 | return text
353 | 
354 | end
355 | 
356 | --input comes in the form of a file where each line has a speaker and response
357 | --in the form: speakerone utterance | speaker two response
358 | 
359 | -- *** STATIC method ***
360 | function WordSplitLMMinibatchLoader:text_to_tensor(in_textfile, out_vocabfile, out_tensorfile,vocabsize,split_fractions)
361 | --local timer = torch.Timer()
362 | local matchstring = "([^%s]+)"
363 | print('loading text file...')
364 | local wordcount = {}
365 | local rawdata
366 | local tot_len = 0
367 | local filein = io.open(in_textfile, "r")
368 | --local filein = torch.DiskFile(in_textfile, "r")
369 | --filein:quiet();
370 | local unknownword = "<unknown>"
371 | local padding = "<pad>" --pads are now just zeros
372 | local go = "<go>"
373 | local eos = "<eos>"
374 | 
375 | -- create vocabulary if it doesn't exist yet
376 | print('creating vocabulary mapping...')
377 | -- record all characters to a set
378 | local unordered = {}
379 | local count=0
380 | local t=true
381 | local nlines =0
382 | 
383 | 
384 | while(t ~= nil) do
385 | t=WordSplitLMMinibatchLoader.getNextBatchFromFileStandard(filein)
386 | if t ==nil then break end
387 | -- t=WordSplitLMMinibatchLoader.preprocess(t)
388 | 
389 | local words = WordSplitLMMinibatchLoader.tokenizer.tokenize(t)
390 | 
391 | 
392 | for word in words do
393 | word = word:lower()
394 | if word ~= "|" then --speaker change character
395 | if wordcount[word]==nil then
396 | wordcount[word]=1
397 | else
398 | wordcount[word]=wordcount[word]+1
399 | end
400 | tot_len=tot_len+1
401 | else --if word== "|" easy way to count n dialog elements
402 | nlines=nlines+1
403 | end
404 | end
405 | io.write(tot_len.."\n")
406 | 
407 | end
408 | 
409 | 
410 | filein:close()
411 | 
412 | 
413 | 
414 | 
415 | 
416 | 
417 | --------------------------------------------------------
418 | --trim vocabulary---------------------------------------
419 | --------------------------------------------------------
420 | 
421 | --basically start at some very high frequency and then go down until we have added the right number of words
422 | --the ties will kind of be added 'randomly' (really based on which were put in first)
423 | local frequency = 400 --start here and go down
424 | 
425 | local vocab_mapping = {}
426 | local index=1
427 | --add special words
428 | vocab_mapping[unknownword]=index;
429 | --index=index+1
430 | --vocab_mapping[padding]=index;
431 | index=index+1
432 | vocab_mapping[go]=index;
433 | index=index+1
434 | vocab_mapping[eos]=index;
435 | index=index+1
436 | local count=0
437 | while frequency >0 do
438 | for key,value in pairs(wordcount) do
439 | if(value>=frequency) then --trim dictionary for rare words
440 | vocab_mapping[key]=index;
441 | index=index+1
442 | count=count+1
443 | wordcount[key]=nil --remove from table
444 | if(count>=vocabsize) then break end
445 | end
446 | end
447 | if(count>=vocabsize) then break end
448 | frequency=frequency-1
449 | end
450 | 
451 | 
452 | 
453 | print("Count: "..count)
454 | print("Length: "..tot_len)
455 | 
456 | --------------------------------------------------------
457 | --build dataset---------------------------------------
458 | --------------------------------------------------------
459 | 
460 | local length = 50 --size to save of each utterance
461 | 
462 | -- construct a tensor with all the data
463 | print('putting data into tensor...')
464 | 
465 | --fill with pads (zeros) first
466 | --rows are dialogue examples, length*2 wide (utterance then response)
467 | local examples = torch.IntTensor(nlines,length*2):fill(0)
468 | 
469 | 
470 | 
471 | filein = io.open(in_textfile, "r")
472 | 
473 | t=true
474 | local row=1
475 | while(t ~= nil) do
476 | t=WordSplitLMMinibatchLoader.getNextBatchFromFileStandard(filein)
477 | if t ==nil then break end
478 | --break into lines
479 | local lines = t:gmatch("[^\r\n]+")
480 | for line in lines do
481 | 
482 | --line=WordSplitLMMinibatchLoader.preprocess(line)
483 | --break into words
484 | local words = WordSplitLMMinibatchLoader.tokenizer.tokenize(line)
485 | 
486 | local speaker1,speaker2 = WordSplitLMMinibatchLoader.getSpeakersForLine(words)
487 | 
488 | 
489 | --fill speaker 1 from middle->start
490 | --speaker 1: go from the last word spoken backward, right-aligned against the middle of the row
491 | local count=0
492 | for i =speaker1.size,1,-1 do
493 | local word = speaker1[i]
494 | if(count<length) then
495 | word = word:lower()
496 | local idx = vocab_mapping[word]
497 | if idx == nil then idx = vocab_mapping[unknownword] end
498 | examples[row][length-count]=idx
499 | count=count+1
500 | end
501 | end
502 | 
503 | 
504 | --fill speaker 2 start->end
505 | for i =1,speaker2.size do
506 | local word = speaker2[i]
507 | if(i<=length) then
508 | word = word:lower()
509 | local idx = vocab_mapping[word]
510 | if idx == nil then idx = vocab_mapping[unknownword] end
511 | local loc = length+i
512 | examples[row][loc]=idx
513 | end
514 | end
515 | --debugging here:
516 | --print(examples[row])
517 | --io.stdin:read'*l'
518 | 
519 | row=row+1
520 | 
521 | end
522 | 
523 | 
524 | end
525 | 
526 | --splits
527 | 
528 | 
529 | local ntrain = math.floor(examples:size(1) * split_fractions[1])
530 | local nval = math.floor(examples:size(1) * split_fractions[2])
531 | local ntest = examples:size(1) - nval - ntrain -- the rest goes to test (to ensure this adds up exactly)
532 | 
533 | --shuffle tensor
534 | local indexes = torch.randperm(examples:size(1)):type('torch.LongTensor')
535 | examples = examples:index(1,indexes)
536 | 
537 | local data = {}
538 | data.train = examples:narrow(1,1,ntrain)
539 | 
540 | if nval>0 then
541 | data.val = examples:narrow(1,ntrain+1,nval)
542 | end
543 | if ntest>0 then
544 | data.test = examples:narrow(1,ntrain+nval+1,ntest)
545 | end
546 | 
547 | -- save output preprocessed files
548 | print('saving ' .. out_vocabfile)
549 | torch.save(out_vocabfile, vocab_mapping)
550 | print('saving ' .. out_tensorfile)
551 | torch.save(out_tensorfile, data)
552 | 
553 | 
554 | 
555 | 
556 | end
557 | --extract speakers from a single line
558 | function WordSplitLMMinibatchLoader.getSpeakersForLine(words)
559 | local speaker1 = {}
560 | speaker1.size=0
561 | local speaker2 = {}
562 | speaker2.size=0
563 | local isspeaker1=true
564 | for word in words do
565 | if word=="|" then
566 | isspeaker1=false
567 | speaker2[speaker2.size+1]="<go>" --decoder input begins with the go token
568 | speaker2.size=speaker2.size+1
569 | else
570 | if isspeaker1==true then
571 | speaker1[speaker1.size+1]=word
572 | speaker1.size=speaker1.size+1
573 | else
574 | speaker2[speaker2.size+1]=word
575 | speaker2.size=speaker2.size+1
576 | end
577 | end
578 | end
579 | 
580 | --add end of speaker tag
581 | speaker2[speaker2.size+1]="<eos>"
582 | speaker2.size=speaker2.size+1
583 | 
584 | 
585 | return speaker1,speaker2
586 | 
587 | end
588 | 
589 | 
590 | 
591 | 
592 | 
593 | 
594 | 
595 | return WordSplitLMMinibatchLoader
--------------------------------------------------------------------------------