├── .gitignore ├── util ├── misc.lua ├── OneHot.lua ├── model_utils.lua └── CharSplitLMMinibatchLoader.lua ├── inspect_checkpoint.lua ├── model ├── RNN.lua ├── GRU.lua └── LSTM.lua ├── convert_gpu_cpu_checkpoint.lua ├── sample.lua ├── Readme.md └── train.lua /.gitignore: -------------------------------------------------------------------------------- 1 | *.t7 2 | -------------------------------------------------------------------------------- /util/misc.lua: -------------------------------------------------------------------------------- 1 | 2 | -- misc utilities 3 | 4 | function clone_list(tensor_list, zero_too) 5 | -- utility function. todo: move away to some utils file? 6 | -- takes a list of tensors and returns a list of cloned tensors 7 | local out = {} 8 | for k,v in pairs(tensor_list) do 9 | out[k] = v:clone() 10 | if zero_too then out[k]:zero() end 11 | end 12 | return out 13 | end -------------------------------------------------------------------------------- /util/OneHot.lua: -------------------------------------------------------------------------------- 1 | 2 | local OneHot, parent = torch.class('OneHot', 'nn.Module') 3 | 4 | function OneHot:__init(outputSize) 5 | parent.__init(self) 6 | self.outputSize = outputSize 7 | -- We'll construct one-hot encodings by using the index method to 8 | -- reshuffle the rows of an identity matrix. To avoid recreating 9 | -- it every iteration we'll cache it. 10 | self._eye = torch.eye(outputSize) 11 | end 12 | 13 | function OneHot:updateOutput(input) 14 | self.output:resize(input:size(1), self.outputSize):zero() 15 | if self._eye == nil then self._eye = torch.eye(self.outputSize) end 16 | self._eye = self._eye:float() 17 | local longInput = input:long() 18 | self.output:copy(self._eye:index(1, longInput)) 19 | return self.output 20 | end 21 | -------------------------------------------------------------------------------- /inspect_checkpoint.lua: -------------------------------------------------------------------------------- 1 | -- simple script that loads a checkpoint and prints its opts 2 | 3 | require 'torch' 4 | require 'nn' 5 | require 'nngraph' 6 | 7 | require 'util.OneHot' 8 | require 'util.misc' 9 | 10 | cmd = torch.CmdLine() 11 | cmd:text() 12 | cmd:text('Load a checkpoint and print its options and validation losses.') 13 | cmd:text() 14 | cmd:text('Options') 15 | cmd:argument('-model','model to load') 16 | cmd:option('-gpuid',0,'gpu to use') 17 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 18 | cmd:text() 19 | 20 | -- parse input params 21 | opt = cmd:parse(arg) 22 | 23 | if opt.gpuid >= 0 and opt.opencl == 0 then 24 | print('using CUDA on GPU ' .. opt.gpuid .. '...') 25 | require 'cutorch' 26 | require 'cunn' 27 | cutorch.setDevice(opt.gpuid + 1) 28 | end 29 | 30 | if opt.gpuid >= 0 and opt.opencl == 1 then 31 | print('using OpenCL on GPU ' .. opt.gpuid .. 
'...') 32 | require 'cltorch' 33 | require 'clnn' 34 | cltorch.setDevice(opt.gpuid + 1) 35 | end 36 | 37 | local model = torch.load(opt.model) 38 | 39 | print('opt:') 40 | print(model.opt) 41 | print('val losses:') 42 | print(model.val_losses) 43 | 44 | -------------------------------------------------------------------------------- /model/RNN.lua: -------------------------------------------------------------------------------- 1 | local RNN = {} 2 | 3 | function RNN.rnn(input_size, rnn_size, n, dropout) 4 | 5 | -- there are n+1 inputs (hiddens on each layer and x) 6 | local inputs = {} 7 | table.insert(inputs, nn.Identity()()) -- x 8 | for L = 1,n do 9 | table.insert(inputs, nn.Identity()()) -- prev_h[L] 10 | 11 | end 12 | 13 | local x, input_size_L 14 | local outputs = {} 15 | for L = 1,n do 16 | 17 | local prev_h = inputs[L+1] 18 | if L == 1 then 19 | x = OneHot(input_size)(inputs[1]) 20 | input_size_L = input_size 21 | else 22 | x = outputs[(L-1)] 23 | if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any 24 | input_size_L = rnn_size 25 | end 26 | 27 | -- RNN tick 28 | local i2h = nn.Linear(input_size_L, rnn_size)(x) 29 | local h2h = nn.Linear(rnn_size, rnn_size)(prev_h) 30 | local next_h = nn.Tanh()(nn.CAddTable(){i2h, h2h}) 31 | 32 | table.insert(outputs, next_h) 33 | end 34 | -- set up the decoder 35 | local top_h = outputs[#outputs] 36 | if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end 37 | local proj = nn.Linear(rnn_size, input_size)(top_h) 38 | local logsoft = nn.LogSoftMax()(proj) 39 | table.insert(outputs, logsoft) 40 | 41 | return nn.gModule(inputs, outputs) 42 | end 43 | 44 | return RNN 45 | -------------------------------------------------------------------------------- /model/GRU.lua: -------------------------------------------------------------------------------- 1 | 2 | local GRU = {} 3 | 4 | --[[ 5 | Creates one timestep of one GRU 6 | Paper reference: http://arxiv.org/pdf/1412.3555v1.pdf 7 | ]]-- 8 | function GRU.gru(input_size, rnn_size, n, dropout) 9 | dropout = dropout or 0 10 | -- there are n+1 inputs (hiddens on each layer and x) 11 | local inputs = {} 12 | table.insert(inputs, nn.Identity()()) -- x 13 | for L = 1,n do 14 | table.insert(inputs, nn.Identity()()) -- prev_h[L] 15 | end 16 | 17 | function new_input_sum(insize, xv, hv) 18 | local i2h = nn.Linear(insize, rnn_size)(xv) 19 | local h2h = nn.Linear(rnn_size, rnn_size)(hv) 20 | return nn.CAddTable()({i2h, h2h}) 21 | end 22 | 23 | local x, input_size_L 24 | local outputs = {} 25 | for L = 1,n do 26 | 27 | local prev_h = inputs[L+1] 28 | -- the input to this layer 29 | if L == 1 then 30 | x = OneHot(input_size)(inputs[1]) 31 | input_size_L = input_size 32 | else 33 | x = outputs[(L-1)] 34 | if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any 35 | input_size_L = rnn_size 36 | end 37 | -- GRU tick 38 | -- forward the update and reset gates 39 | local update_gate = nn.Sigmoid()(new_input_sum(input_size_L, x, prev_h)) 40 | local reset_gate = nn.Sigmoid()(new_input_sum(input_size_L, x, prev_h)) 41 | -- compute candidate hidden state 42 | local gated_hidden = nn.CMulTable()({reset_gate, prev_h}) 43 | local p2 = nn.Linear(rnn_size, rnn_size)(gated_hidden) 44 | local p1 = nn.Linear(input_size_L, rnn_size)(x) 45 | local hidden_candidate = nn.Tanh()(nn.CAddTable()({p1,p2})) 46 | -- compute new interpolated hidden state, based on the update gate 47 | local zh = nn.CMulTable()({update_gate, hidden_candidate}) 48 | local zhm1 = 
nn.CMulTable()({nn.AddConstant(1,false)(nn.MulConstant(-1,false)(update_gate)), prev_h}) 49 | local next_h = nn.CAddTable()({zh, zhm1}) 50 | 51 | table.insert(outputs, next_h) 52 | end 53 | -- set up the decoder 54 | local top_h = outputs[#outputs] 55 | if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end 56 | local proj = nn.Linear(rnn_size, input_size)(top_h) 57 | local logsoft = nn.LogSoftMax()(proj) 58 | table.insert(outputs, logsoft) 59 | 60 | return nn.gModule(inputs, outputs) 61 | end 62 | 63 | return GRU 64 | -------------------------------------------------------------------------------- /model/LSTM.lua: -------------------------------------------------------------------------------- 1 | 2 | local LSTM = {} 3 | function LSTM.lstm(input_size, rnn_size, n, dropout) 4 | dropout = dropout or 0 5 | 6 | -- there will be 2*n+1 inputs 7 | local inputs = {} 8 | table.insert(inputs, nn.Identity()()) -- x 9 | for L = 1,n do 10 | table.insert(inputs, nn.Identity()()) -- prev_c[L] 11 | table.insert(inputs, nn.Identity()()) -- prev_h[L] 12 | end 13 | 14 | local x, input_size_L 15 | local outputs = {} 16 | for L = 1,n do 17 | -- c,h from previos timesteps 18 | local prev_h = inputs[L*2+1] 19 | local prev_c = inputs[L*2] 20 | -- the input to this layer 21 | if L == 1 then 22 | x = OneHot(input_size)(inputs[1]) 23 | input_size_L = input_size 24 | else 25 | x = outputs[(L-1)*2] 26 | if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any 27 | input_size_L = rnn_size 28 | end 29 | -- evaluate the input sums at once for efficiency 30 | local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x):annotate{name='i2h_'..L} 31 | local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h):annotate{name='h2h_'..L} 32 | local all_input_sums = nn.CAddTable()({i2h, h2h}) 33 | 34 | local reshaped = nn.Reshape(4, rnn_size)(all_input_sums) 35 | local n1, n2, n3, n4 = nn.SplitTable(2)(reshaped):split(4) 36 | -- decode the gates 37 | local in_gate = nn.Sigmoid()(n1) 38 | local forget_gate = nn.Sigmoid()(n2) 39 | local out_gate = nn.Sigmoid()(n3) 40 | -- decode the write inputs 41 | local in_transform = nn.Tanh()(n4) 42 | -- perform the LSTM update 43 | local next_c = nn.CAddTable()({ 44 | nn.CMulTable()({forget_gate, prev_c}), 45 | nn.CMulTable()({in_gate, in_transform}) 46 | }) 47 | -- gated cells form the output 48 | local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)}) 49 | 50 | table.insert(outputs, next_c) 51 | table.insert(outputs, next_h) 52 | end 53 | 54 | -- set up the decoder 55 | local top_h = outputs[#outputs] 56 | if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end 57 | local proj = nn.Linear(rnn_size, input_size)(top_h):annotate{name='decoder'} 58 | local logsoft = nn.LogSoftMax()(proj) 59 | table.insert(outputs, logsoft) 60 | 61 | return nn.gModule(inputs, outputs) 62 | end 63 | 64 | return LSTM 65 | 66 | -------------------------------------------------------------------------------- /convert_gpu_cpu_checkpoint.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | A quick patch for converting GPU checkpoints to 3 | CPU checkpoints until I implement a more long-term 4 | solution. Takes the path to the model and creates 5 | a file in the same location and path, but with _cpu.t7 6 | appended. 
7 | ]]-- 8 | 9 | require 'torch' 10 | require 'nn' 11 | require 'nngraph' 12 | require 'lfs' 13 | 14 | require 'util.OneHot' 15 | require 'util.misc' 16 | 17 | cmd = torch.CmdLine() 18 | cmd:text() 19 | cmd:text('Sample from a character-level language model') 20 | cmd:text() 21 | cmd:text('Options') 22 | cmd:argument('-model','GPU model checkpoint to convert') 23 | cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU') 24 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 25 | cmd:text() 26 | 27 | -- parse input params 28 | opt = cmd:parse(arg) 29 | 30 | -- check that cunn/cutorch are installed if user wants to use the GPU 31 | if opt.gpuid >= 0 and opt.opencl == 0 then 32 | local ok, cunn = pcall(require, 'cunn') 33 | local ok2, cutorch = pcall(require, 'cutorch') 34 | if not ok then print('package cunn not found!') end 35 | if not ok2 then print('package cutorch not found!') end 36 | if ok and ok2 then 37 | print('using CUDA on GPU ' .. opt.gpuid .. '...') 38 | cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 39 | else 40 | print('Error, no GPU available?') 41 | os.exit() 42 | end 43 | end 44 | 45 | -- check that clnn/cltorch are installed if user wants to use OpenCL 46 | if opt.gpuid >= 0 and opt.opencl == 1 then 47 | local ok, cunn = pcall(require, 'clnn') 48 | local ok2, cutorch = pcall(require, 'cltorch') 49 | if not ok then print('package clnn not found!') end 50 | if not ok2 then print('package cltorch not found!') end 51 | if ok and ok2 then 52 | print('using OpenCL on GPU ' .. opt.gpuid .. '...') 53 | cltorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 54 | else 55 | print('Error, no GPU available?') 56 | os.exit() 57 | end 58 | end 59 | 60 | print('loading ' .. opt.model) 61 | checkpoint = torch.load(opt.model) 62 | protos = checkpoint.protos 63 | 64 | -- convert the networks to be CPU models 65 | for k,v in pairs(protos) do 66 | print('converting ' .. k .. ' to CPU') 67 | protos[k]:double() 68 | end 69 | 70 | local savefile = opt.model .. '_cpu.t7' -- append "cpu.t7" to filename 71 | torch.save(savefile, checkpoint) 72 | print('saved ' .. savefile) 73 | 74 | 75 | -------------------------------------------------------------------------------- /util/model_utils.lua: -------------------------------------------------------------------------------- 1 | 2 | -- adapted from https://github.com/wojciechz/learning_to_execute 3 | -- utilities for combining/flattening parameters in a model 4 | -- the code in this script is more general than it needs to be, which is 5 | -- why it is kind of a large 6 | 7 | require 'torch' 8 | local model_utils = {} 9 | function model_utils.combine_all_parameters(...) 
10 | --[[ like module:getParameters, but operates on many modules ]]-- 11 | 12 | -- get parameters 13 | local networks = {...} 14 | local parameters = {} 15 | local gradParameters = {} 16 | for i = 1, #networks do 17 | local net_params, net_grads = networks[i]:parameters() 18 | 19 | if net_params then 20 | for _, p in pairs(net_params) do 21 | parameters[#parameters + 1] = p 22 | end 23 | for _, g in pairs(net_grads) do 24 | gradParameters[#gradParameters + 1] = g 25 | end 26 | end 27 | end 28 | 29 | local function storageInSet(set, storage) 30 | local storageAndOffset = set[torch.pointer(storage)] 31 | if storageAndOffset == nil then 32 | return nil 33 | end 34 | local _, offset = unpack(storageAndOffset) 35 | return offset 36 | end 37 | 38 | -- this function flattens arbitrary lists of parameters, 39 | -- even complex shared ones 40 | local function flatten(parameters) 41 | if not parameters or #parameters == 0 then 42 | return torch.Tensor() 43 | end 44 | local Tensor = parameters[1].new 45 | 46 | local storages = {} 47 | local nParameters = 0 48 | for k = 1,#parameters do 49 | local storage = parameters[k]:storage() 50 | if not storageInSet(storages, storage) then 51 | storages[torch.pointer(storage)] = {storage, nParameters} 52 | nParameters = nParameters + storage:size() 53 | end 54 | end 55 | 56 | local flatParameters = Tensor(nParameters):fill(1) 57 | local flatStorage = flatParameters:storage() 58 | 59 | for k = 1,#parameters do 60 | local storageOffset = storageInSet(storages, parameters[k]:storage()) 61 | parameters[k]:set(flatStorage, 62 | storageOffset + parameters[k]:storageOffset(), 63 | parameters[k]:size(), 64 | parameters[k]:stride()) 65 | parameters[k]:zero() 66 | end 67 | 68 | local maskParameters= flatParameters:float():clone() 69 | local cumSumOfHoles = flatParameters:float():cumsum(1) 70 | local nUsedParameters = nParameters - cumSumOfHoles[#cumSumOfHoles] 71 | local flatUsedParameters = Tensor(nUsedParameters) 72 | local flatUsedStorage = flatUsedParameters:storage() 73 | 74 | for k = 1,#parameters do 75 | local offset = cumSumOfHoles[parameters[k]:storageOffset()] 76 | parameters[k]:set(flatUsedStorage, 77 | parameters[k]:storageOffset() - offset, 78 | parameters[k]:size(), 79 | parameters[k]:stride()) 80 | end 81 | 82 | for _, storageAndOffset in pairs(storages) do 83 | local k, v = unpack(storageAndOffset) 84 | flatParameters[{{v+1,v+k:size()}}]:copy(Tensor():set(k)) 85 | end 86 | 87 | if cumSumOfHoles:sum() == 0 then 88 | flatUsedParameters:copy(flatParameters) 89 | else 90 | local counter = 0 91 | for k = 1,flatParameters:nElement() do 92 | if maskParameters[k] == 0 then 93 | counter = counter + 1 94 | flatUsedParameters[counter] = flatParameters[counter+cumSumOfHoles[k]] 95 | end 96 | end 97 | assert (counter == nUsedParameters) 98 | end 99 | return flatUsedParameters 100 | end 101 | 102 | -- flatten parameters and gradients 103 | local flatParameters = flatten(parameters) 104 | local flatGradParameters = flatten(gradParameters) 105 | 106 | -- return new flat vector that contains all discrete parameters 107 | return flatParameters, flatGradParameters 108 | end 109 | 110 | 111 | 112 | 113 | function model_utils.clone_many_times(net, T) 114 | local clones = {} 115 | 116 | local params, gradParams 117 | if net.parameters then 118 | params, gradParams = net:parameters() 119 | if params == nil then 120 | params = {} 121 | end 122 | end 123 | 124 | local paramsNoGrad 125 | if net.parametersNoGrad then 126 | paramsNoGrad = net:parametersNoGrad() 127 | end 128 | 129 | 
local mem = torch.MemoryFile("w"):binary() 130 | mem:writeObject(net) 131 | 132 | for t = 1, T do 133 | -- We need to use a new reader for each clone. 134 | -- We don't want to use the pointers to already read objects. 135 | local reader = torch.MemoryFile(mem:storage(), "r"):binary() 136 | local clone = reader:readObject() 137 | reader:close() 138 | 139 | if net.parameters then 140 | local cloneParams, cloneGradParams = clone:parameters() 141 | local cloneParamsNoGrad 142 | for i = 1, #params do 143 | cloneParams[i]:set(params[i]) 144 | cloneGradParams[i]:set(gradParams[i]) 145 | end 146 | if paramsNoGrad then 147 | cloneParamsNoGrad = clone:parametersNoGrad() 148 | for i =1,#paramsNoGrad do 149 | cloneParamsNoGrad[i]:set(paramsNoGrad[i]) 150 | end 151 | end 152 | end 153 | 154 | clones[t] = clone 155 | collectgarbage() 156 | end 157 | 158 | mem:close() 159 | return clones 160 | end 161 | 162 | return model_utils 163 | -------------------------------------------------------------------------------- /sample.lua: -------------------------------------------------------------------------------- 1 | 2 | --[[ 3 | 4 | This file samples characters from a trained model 5 | 6 | Code is based on implementation in 7 | https://github.com/oxford-cs-ml-2015/practical6 8 | 9 | ]]-- 10 | 11 | require 'torch' 12 | require 'nn' 13 | require 'nngraph' 14 | require 'optim' 15 | require 'lfs' 16 | 17 | require 'util.OneHot' 18 | require 'util.misc' 19 | 20 | cmd = torch.CmdLine() 21 | cmd:text() 22 | cmd:text('Sample from a character-level language model') 23 | cmd:text() 24 | cmd:text('Options') 25 | -- required: 26 | cmd:argument('-model','model checkpoint to use for sampling') 27 | -- optional parameters 28 | cmd:option('-seed',123,'random number generator\'s seed') 29 | cmd:option('-sample',1,' 0 to use max at each timestep, 1 to sample at each timestep') 30 | cmd:option('-primetext',"",'used as a prompt to "seed" the state of the LSTM using a given sequence, before we sample.') 31 | cmd:option('-length',2000,'number of characters to sample') 32 | cmd:option('-temperature',1,'temperature of sampling') 33 | cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU') 34 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 35 | cmd:option('-verbose',1,'set to 0 to ONLY print the sampled text, no diagnostics') 36 | cmd:text() 37 | 38 | -- parse input params 39 | opt = cmd:parse(arg) 40 | 41 | -- gated print: simple utility function wrapping a print 42 | function gprint(str) 43 | if opt.verbose == 1 then print(str) end 44 | end 45 | 46 | -- check that cunn/cutorch are installed if user wants to use the GPU 47 | if opt.gpuid >= 0 and opt.opencl == 0 then 48 | local ok, cunn = pcall(require, 'cunn') 49 | local ok2, cutorch = pcall(require, 'cutorch') 50 | if not ok then gprint('package cunn not found!') end 51 | if not ok2 then gprint('package cutorch not found!') end 52 | if ok and ok2 then 53 | gprint('using CUDA on GPU ' .. opt.gpuid .. '...') 54 | gprint('Make sure that your saved checkpoint was also trained with GPU. If it was trained with CPU use -gpuid -1 for sampling as well') 55 | cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! 
sigh lua 56 | cutorch.manualSeed(opt.seed) 57 | else 58 | gprint('Falling back on CPU mode') 59 | opt.gpuid = -1 -- overwrite user setting 60 | end 61 | end 62 | 63 | -- check that clnn/cltorch are installed if user wants to use OpenCL 64 | if opt.gpuid >= 0 and opt.opencl == 1 then 65 | local ok, cunn = pcall(require, 'clnn') 66 | local ok2, cutorch = pcall(require, 'cltorch') 67 | if not ok then print('package clnn not found!') end 68 | if not ok2 then print('package cltorch not found!') end 69 | if ok and ok2 then 70 | gprint('using OpenCL on GPU ' .. opt.gpuid .. '...') 71 | gprint('Make sure that your saved checkpoint was also trained with GPU. If it was trained with CPU use -gpuid -1 for sampling as well') 72 | cltorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 73 | torch.manualSeed(opt.seed) 74 | else 75 | gprint('Falling back on CPU mode') 76 | opt.gpuid = -1 -- overwrite user setting 77 | end 78 | end 79 | 80 | torch.manualSeed(opt.seed) 81 | 82 | -- load the model checkpoint 83 | if not lfs.attributes(opt.model, 'mode') then 84 | gprint('Error: File ' .. opt.model .. ' does not exist. Are you sure you didn\'t forget to prepend cv/ ?') 85 | end 86 | checkpoint = torch.load(opt.model) 87 | protos = checkpoint.protos 88 | protos.rnn:evaluate() -- put in eval mode so that dropout works properly 89 | 90 | -- initialize the vocabulary (and its inverted version) 91 | local vocab = checkpoint.vocab 92 | local ivocab = {} 93 | for c,i in pairs(vocab) do ivocab[i] = c end 94 | 95 | -- initialize the rnn state to all zeros 96 | gprint('creating an ' .. checkpoint.opt.model .. '...') 97 | local current_state 98 | current_state = {} 99 | for L = 1,checkpoint.opt.num_layers do 100 | -- c and h for all layers 101 | local h_init = torch.zeros(1, checkpoint.opt.rnn_size):double() 102 | if opt.gpuid >= 0 and opt.opencl == 0 then h_init = h_init:cuda() end 103 | if opt.gpuid >= 0 and opt.opencl == 1 then h_init = h_init:cl() end 104 | table.insert(current_state, h_init:clone()) 105 | if checkpoint.opt.model == 'lstm' then 106 | table.insert(current_state, h_init:clone()) 107 | end 108 | end 109 | state_size = #current_state 110 | 111 | -- do a few seeded timesteps 112 | local seed_text = opt.primetext 113 | if string.len(seed_text) > 0 then 114 | gprint('seeding with ' .. seed_text) 115 | gprint('--------------------------') 116 | for c in seed_text:gmatch'.' do 117 | prev_char = torch.Tensor{vocab[c]} 118 | io.write(ivocab[prev_char[1]]) 119 | if opt.gpuid >= 0 and opt.opencl == 0 then prev_char = prev_char:cuda() end 120 | if opt.gpuid >= 0 and opt.opencl == 1 then prev_char = prev_char:cl() end 121 | local lst = protos.rnn:forward{prev_char, unpack(current_state)} 122 | -- lst is a list of [state1,state2,..stateN,output]. We want everything but last piece 123 | current_state = {} 124 | for i=1,state_size do table.insert(current_state, lst[i]) end 125 | prediction = lst[#lst] -- last element holds the log probabilities 126 | end 127 | else 128 | -- fill with uniform probabilities over characters (? 
hmm) 129 | gprint('missing seed text, using uniform probability over first character') 130 | gprint('--------------------------') 131 | prediction = torch.Tensor(1, #ivocab):fill(1)/(#ivocab) 132 | if opt.gpuid >= 0 and opt.opencl == 0 then prediction = prediction:cuda() end 133 | if opt.gpuid >= 0 and opt.opencl == 1 then prediction = prediction:cl() end 134 | end 135 | 136 | -- start sampling/argmaxing 137 | for i=1, opt.length do 138 | 139 | -- log probabilities from the previous timestep 140 | if opt.sample == 0 then 141 | -- use argmax 142 | local _, prev_char_ = prediction:max(2) 143 | prev_char = prev_char_:resize(1) 144 | else 145 | -- use sampling 146 | prediction:div(opt.temperature) -- scale by temperature 147 | local probs = torch.exp(prediction):squeeze() 148 | probs:div(torch.sum(probs)) -- renormalize so probs sum to one 149 | prev_char = torch.multinomial(probs:float(), 1):resize(1):float() 150 | end 151 | 152 | -- forward the rnn for next character 153 | local lst = protos.rnn:forward{prev_char, unpack(current_state)} 154 | current_state = {} 155 | for i=1,state_size do table.insert(current_state, lst[i]) end 156 | prediction = lst[#lst] -- last element holds the log probabilities 157 | 158 | io.write(ivocab[prev_char[1]]) 159 | end 160 | io.write('\n') io.flush() 161 | 162 | -------------------------------------------------------------------------------- /util/CharSplitLMMinibatchLoader.lua: -------------------------------------------------------------------------------- 1 | 2 | -- Modified from https://github.com/oxford-cs-ml-2015/practical6 3 | -- the modification included support for train/val/test splits 4 | 5 | local CharSplitLMMinibatchLoader = {} 6 | CharSplitLMMinibatchLoader.__index = CharSplitLMMinibatchLoader 7 | 8 | function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, split_fractions) 9 | -- split_fractions is e.g. {0.9, 0.05, 0.05} 10 | 11 | local self = {} 12 | setmetatable(self, CharSplitLMMinibatchLoader) 13 | 14 | local input_file = path.join(data_dir, 'input.txt') 15 | local vocab_file = path.join(data_dir, 'vocab.t7') 16 | local tensor_file = path.join(data_dir, 'data.t7') 17 | 18 | -- fetch file attributes to determine if we need to rerun preprocessing 19 | local run_prepro = false 20 | if not (path.exists(vocab_file) or path.exists(tensor_file)) then 21 | -- prepro files do not exist, generate them 22 | print('vocab.t7 and data.t7 do not exist. Running preprocessing...') 23 | run_prepro = true 24 | else 25 | -- check if the input file was modified since last time we 26 | -- ran the prepro. if so, we have to rerun the preprocessing 27 | local input_attr = lfs.attributes(input_file) 28 | local vocab_attr = lfs.attributes(vocab_file) 29 | local tensor_attr = lfs.attributes(tensor_file) 30 | if input_attr.modification > vocab_attr.modification or input_attr.modification > tensor_attr.modification then 31 | print('vocab.t7 or data.t7 detected as stale. Re-running preprocessing...') 32 | run_prepro = true 33 | end 34 | end 35 | if run_prepro then 36 | -- construct a tensor with all the data, and vocab file 37 | print('one-time setup: preprocessing input text file ' .. input_file .. 
'...') 38 | CharSplitLMMinibatchLoader.text_to_tensor(input_file, vocab_file, tensor_file) 39 | end 40 | 41 | print('loading data files...') 42 | local data = torch.load(tensor_file) 43 | self.vocab_mapping = torch.load(vocab_file) 44 | 45 | -- cut off the end so that it divides evenly 46 | local len = data:size(1) 47 | if len % (batch_size * seq_length) ~= 0 then 48 | print('cutting off end of data so that the batches/sequences divide evenly') 49 | data = data:sub(1, batch_size * seq_length 50 | * math.floor(len / (batch_size * seq_length))) 51 | end 52 | 53 | -- count vocab 54 | self.vocab_size = 0 55 | for _ in pairs(self.vocab_mapping) do 56 | self.vocab_size = self.vocab_size + 1 57 | end 58 | 59 | -- self.batches is a table of tensors 60 | print('reshaping tensor...') 61 | self.batch_size = batch_size 62 | self.seq_length = seq_length 63 | 64 | local ydata = data:clone() 65 | ydata:sub(1,-2):copy(data:sub(2,-1)) 66 | ydata[-1] = data[1] 67 | self.x_batches = data:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches 68 | self.nbatches = #self.x_batches 69 | self.y_batches = ydata:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches 70 | assert(#self.x_batches == #self.y_batches) 71 | 72 | -- lets try to be helpful here 73 | if self.nbatches < 50 then 74 | print('WARNING: less than 50 batches in the data in total? Looks like very small dataset. You probably want to use smaller batch_size and/or seq_length.') 75 | end 76 | 77 | -- perform safety checks on split_fractions 78 | assert(split_fractions[1] >= 0 and split_fractions[1] <= 1, 'bad split fraction ' .. split_fractions[1] .. ' for train, not between 0 and 1') 79 | assert(split_fractions[2] >= 0 and split_fractions[2] <= 1, 'bad split fraction ' .. split_fractions[2] .. ' for val, not between 0 and 1') 80 | assert(split_fractions[3] >= 0 and split_fractions[3] <= 1, 'bad split fraction ' .. split_fractions[3] .. ' for test, not between 0 and 1') 81 | if split_fractions[3] == 0 then 82 | -- catch a common special case where the user might not want a test set 83 | self.ntrain = math.floor(self.nbatches * split_fractions[1]) 84 | self.nval = self.nbatches - self.ntrain 85 | self.ntest = 0 86 | else 87 | -- divide data to train/val and allocate rest to test 88 | self.ntrain = math.floor(self.nbatches * split_fractions[1]) 89 | self.nval = math.floor(self.nbatches * split_fractions[2]) 90 | self.ntest = self.nbatches - self.nval - self.ntrain -- the rest goes to test (to ensure this adds up exactly) 91 | end 92 | 93 | self.split_sizes = {self.ntrain, self.nval, self.ntest} 94 | self.batch_ix = {0,0,0} 95 | 96 | print(string.format('data load done. Number of data batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest)) 97 | collectgarbage() 98 | return self 99 | end 100 | 101 | function CharSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index) 102 | batch_index = batch_index or 0 103 | self.batch_ix[split_index] = batch_index 104 | end 105 | 106 | function CharSplitLMMinibatchLoader:next_batch(split_index) 107 | if self.split_sizes[split_index] == 0 then 108 | -- perform a check here to make sure the user isn't screwing something up 109 | local split_names = {'train', 'val', 'test'} 110 | print('ERROR. Code requested a batch for split ' .. split_names[split_index] .. 
', but this split has no data.') 111 | os.exit() -- crash violently 112 | end 113 | -- split_index is integer: 1 = train, 2 = val, 3 = test 114 | self.batch_ix[split_index] = self.batch_ix[split_index] + 1 115 | if self.batch_ix[split_index] > self.split_sizes[split_index] then 116 | self.batch_ix[split_index] = 1 -- cycle around to beginning 117 | end 118 | -- pull out the correct next batch 119 | local ix = self.batch_ix[split_index] 120 | if split_index == 2 then ix = ix + self.ntrain end -- offset by train set size 121 | if split_index == 3 then ix = ix + self.ntrain + self.nval end -- offset by train + val 122 | return self.x_batches[ix], self.y_batches[ix] 123 | end 124 | 125 | -- *** STATIC method *** 126 | function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, out_tensorfile) 127 | local timer = torch.Timer() 128 | 129 | print('loading text file...') 130 | local cache_len = 10000 131 | local rawdata 132 | local tot_len = 0 133 | local f = assert(io.open(in_textfile, "r")) 134 | 135 | -- create vocabulary if it doesn't exist yet 136 | print('creating vocabulary mapping...') 137 | -- record all characters to a set 138 | local unordered = {} 139 | rawdata = f:read(cache_len) 140 | repeat 141 | for char in rawdata:gmatch'.' do 142 | if not unordered[char] then unordered[char] = true end 143 | end 144 | tot_len = tot_len + #rawdata 145 | rawdata = f:read(cache_len) 146 | until not rawdata 147 | f:close() 148 | -- sort into a table (i.e. keys become 1..N) 149 | local ordered = {} 150 | for char in pairs(unordered) do ordered[#ordered + 1] = char end 151 | table.sort(ordered) 152 | -- invert `ordered` to create the char->int mapping 153 | local vocab_mapping = {} 154 | for i, char in ipairs(ordered) do 155 | vocab_mapping[char] = i 156 | end 157 | -- construct a tensor with all the data 158 | print('putting data into tensor...') 159 | local data = torch.ByteTensor(tot_len) -- store it into 1D first, then rearrange 160 | f = assert(io.open(in_textfile, "r")) 161 | local currlen = 0 162 | rawdata = f:read(cache_len) 163 | repeat 164 | for i=1, #rawdata do 165 | data[currlen+i] = vocab_mapping[rawdata:sub(i, i)] -- lua has no string indexing using [] 166 | end 167 | currlen = currlen + #rawdata 168 | rawdata = f:read(cache_len) 169 | until not rawdata 170 | f:close() 171 | 172 | -- save output preprocessed files 173 | print('saving ' .. out_vocabfile) 174 | torch.save(out_vocabfile, vocab_mapping) 175 | print('saving ' .. out_tensorfile) 176 | torch.save(out_tensorfile, data) 177 | end 178 | 179 | return CharSplitLMMinibatchLoader 180 | 181 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # char-rnn 3 | 4 | This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. In other words the model takes one text file as input and trains a Recurrent Neural Network that learns to predict the next character in a sequence. The RNN can then be used to generate text character by character that will look like the original training data. The context of this code base is described in detail in my [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). 
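
To make "predict the next character" concrete: the training pairs are simply each character paired with the character that follows it. Below is a minimal illustrative sketch of those pairs (the real batching code in `util/CharSplitLMMinibatchLoader.lua` does the same thing on a whole tensor by shifting the data by one position):

```lua
-- Illustrative only: inputs are characters, targets are the next character.
local text = "hello"
for i = 1, #text - 1 do
  local x, y = text:sub(i, i), text:sub(i + 1, i + 1)
  print(string.format("input %q -> target %q", x, y))
end
-- input "h" -> target "e"
-- input "e" -> target "l"
-- input "l" -> target "l"
-- input "l" -> target "o"
```
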
5 | 6 | If you are new to Torch/Lua/Neural Nets, it might be helpful to know that this code is really just a slightly more fancy version of this [100-line gist](https://gist.github.com/karpathy/d4dee566867f8291f086) that I wrote in Python/numpy. The code in this repo additionally: allows for multiple layers, uses an LSTM instead of a vanilla RNN, has more supporting code for model checkpointing, and is of course much more efficient since it uses mini-batches and can run on a GPU. 7 | 8 | ## Update: torch-rnn 9 | 10 | [Justin Johnson](http://cs.stanford.edu/people/jcjohns/) (@jcjohnson) recently re-implemented char-rnn from scratch with a much nicer/smaller/cleaner/faster Torch code base. It's under the name [torch-rnn](https://github.com/jcjohnson/torch-rnn). It uses Adam for optimization and hard-codes the RNN/LSTM forward/backward passes for space/time efficiency. This also avoids headaches with cloning models in this repo. In other words, torch-rnn should be the default char-rnn implemention to use now instead of the one in this code base. 11 | 12 | ## Requirements 13 | 14 | This code is written in Lua and requires [Torch](http://torch.ch/). If you're on Ubuntu, installing Torch in your home directory may look something like: 15 | 16 | ```bash 17 | $ curl -s https://raw.githubusercontent.com/torch/ezinstall/master/install-deps | bash 18 | $ git clone https://github.com/torch/distro.git ~/torch --recursive 19 | $ cd ~/torch; 20 | $ ./install.sh # and enter "yes" at the end to modify your bashrc 21 | $ source ~/.bashrc 22 | ``` 23 | 24 | See the Torch installation documentation for more details. After Torch is installed we need to get a few more packages using [LuaRocks](https://luarocks.org/) (which already came with the Torch install). In particular: 25 | 26 | ```bash 27 | $ luarocks install nngraph 28 | $ luarocks install optim 29 | $ luarocks install nn 30 | ``` 31 | 32 | If you'd like to train on an NVIDIA GPU using CUDA (this can be to about 15x faster), you'll of course need the GPU, and you will have to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). Then get the `cutorch` and `cunn` packages: 33 | 34 | ```bash 35 | $ luarocks install cutorch 36 | $ luarocks install cunn 37 | ``` 38 | 39 | If you'd like to use OpenCL GPU instead (e.g. ATI cards), you will instead need to install the `cltorch` and `clnn` packages, and then use the option `-opencl 1` during training ([cltorch issues](https://github.com/hughperkins/cltorch/issues)): 40 | 41 | ```bash 42 | $ luarocks install cltorch 43 | $ luarocks install clnn 44 | ``` 45 | 46 | ## Usage 47 | 48 | ### Data 49 | 50 | All input data is stored inside the `data/` directory. You'll notice that there is an example dataset included in the repo (in folder `data/tinyshakespeare`) which consists of a subset of works of Shakespeare. I'm providing a few more datasets on [this page](http://cs.stanford.edu/people/karpathy/char-rnn/). 51 | 52 | **Your own data**: If you'd like to use your own data then create a single file `input.txt` and place it into a folder in the `data/` directory. For example, `data/some_folder/input.txt`. The first time you run the training script it will do some preprocessing and write two more convenience cache files into `data/some_folder`. 53 | 54 | **Dataset sizes**: Note that if your data is too small (1MB is already considered very small) the RNN won't learn very effectively. Remember that it has to learn everything completely from scratch. 
Conversely if your data is large (more than about 2MB), feel confident to increase `rnn_size` and train a bigger model (see details of training below). It will work *significantly better*. For example with 6MB you can easily go up to `rnn_size` 300 or even more. The biggest that fits on my GPU and that I've trained with this code is `rnn_size` 700 with `num_layers` 3 (2 is default). 55 | 56 | ### Training 57 | 58 | Start training the model using `train.lua`. As a sanity check, to run on the included example dataset simply try: 59 | 60 | ``` 61 | $ th train.lua -gpuid -1 62 | ``` 63 | 64 | Notice that here we are setting the flag `gpuid` to -1, which tells the code to train using CPU, otherwise it defaults to GPU 0. There are many other flags for various options. Consult `$ th train.lua -help` for comprehensive settings. Here's another example that trains a bigger network and also shows how you can run on your own custom dataset (this already assumes that `data/some_folder/input.txt` exists): 65 | 66 | ``` 67 | $ th train.lua -data_dir data/some_folder -rnn_size 512 -num_layers 2 -dropout 0.5 68 | ``` 69 | 70 | **Checkpoints.** While the model is training it will periodically write checkpoint files to the `cv` folder. The frequency with which these checkpoints are written is controlled with number of iterations, as specified with the `eval_val_every` option (e.g. if this is 1 then a checkpoint is written every iteration). The filename of these checkpoints contains a very important number: the **loss**. For example, a checkpoint with filename `lm_lstm_epoch0.95_2.0681.t7` indicates that at this point the model was on epoch 0.95 (i.e. it has almost done one full pass over the training data), and the loss on validation data was 2.0681. This number is very important because the lower it is, the better the checkpoint works. Once you start to generate data (discussed below), you will want to use the model checkpoint that reports the lowest validation loss. Notice that this might not necessarily be the last checkpoint at the end of training (due to possible overfitting). 71 | 72 | Another important quantities to be aware of are `batch_size` (call it B), `seq_length` (call it S), and the `train_frac` and `val_frac` settings. The batch size specifies how many streams of data are processed in parallel at one time. The sequence length specifies the length of each stream, which is also the limit at which the gradients can propagate backwards in time. For example, if `seq_length` is 20, then the gradient signal will never backpropagate more than 20 time steps, and the model might not *find* dependencies longer than this length in number of characters. Thus, if you have a very difficult dataset where there are a lot of long-term dependencies you will want to increase this setting. Now, if at runtime your input text file has N characters, these first all get split into chunks of size `BxS`. These chunks then get allocated across three splits: train/val/test according to the `frac` settings. By default `train_frac` is 0.95 and `val_frac` is 0.05, which means that 95% of our data chunks will be trained on and 5% of the chunks will be used to estimate the validation loss (and hence the generalization). If your data is small, it's possible that with the default settings you'll only have very few chunks in total (for example 100). This is bad: In these cases you may want to decrease batch size or sequence length. 
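
To make the chunk bookkeeping concrete, here is a small sketch of the arithmetic the loader performs with these settings (the numbers are purely illustrative; the actual implementation lives in `util/CharSplitLMMinibatchLoader.lua`):

```lua
-- Illustrative numbers only; the real logic is in util/CharSplitLMMinibatchLoader.lua.
local N = 1000000             -- characters in input.txt (a ~1MB file)
local B, S = 50, 50           -- batch_size and seq_length (the train.lua defaults)
local train_frac = 0.95       -- val_frac = 0.05, test_frac = 0

-- any tail that does not fill a complete B x S chunk is cut off
local nbatches = math.floor(N / (B * S))            -- 400 chunks
local ntrain   = math.floor(nbatches * train_frac)  -- 380 chunks for training
local nval     = nbatches - ntrain                  -- 20 chunks for validation
print(nbatches, ntrain, nval)                       --> 400  380  20
```

A 100KB file with the same settings would yield only 40 chunks, which is why the loader warns below 50 chunks in total and why smaller `batch_size`/`seq_length` values help on small datasets.
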
73 | 74 | Note that you can also initialize parameters from a previously saved checkpoint using `init_from`. 75 | 76 | ### Sampling 77 | 78 | Given a checkpoint file (such as those written to `cv`) we can generate new text. For example: 79 | 80 | ``` 81 | $ th sample.lua cv/some_checkpoint.t7 -gpuid -1 82 | ``` 83 | 84 | Make sure that if your checkpoint was trained with GPU it is also sampled from with GPU, or vice versa. Otherwise the code will (currently) complain. As with the train script, see `$ th sample.lua -help` for full options. One important one is (for example) `-length 10000` which would generate 10,000 characters (default = 2000). 85 | 86 | **Temperature**. An important parameter you may want to play with is `-temperature`, which takes a number in range \(0, 1\] (0 not included), default = 1. The temperature is dividing the predicted log probabilities before the Softmax, so lower temperature will cause the model to make more likely, but also more boring and conservative predictions. Higher temperatures cause the model to take more chances and increase diversity of results, but at a cost of more mistakes. 87 | 88 | **Priming**. It's also possible to prime the model with some starting text using `-primetext`. This starts out the RNN with some hardcoded characters to *warm* it up with some context before it starts generating text. E.g. a fun primetext might be `-primetext "the meaning of life is "`. 89 | 90 | **Training with GPU but sampling on CPU**. Right now the solution is to use the `convert_gpu_cpu_checkpoint.lua` script to convert your GPU checkpoint to a CPU checkpoint. In near future you will not have to do this explicitly. E.g.: 91 | 92 | ``` 93 | $ th convert_gpu_cpu_checkpoint.lua cv/lm_lstm_epoch30.00_1.3950.t7 94 | ``` 95 | 96 | will create a new file `cv/lm_lstm_epoch30.00_1.3950.t7_cpu.t7` that you can use with the sample script and with `-gpuid -1` for CPU mode. 97 | 98 | Happy sampling! 99 | 100 | ## Tips and Tricks 101 | 102 | ### Monitoring Validation Loss vs. Training Loss 103 | If you're somewhat new to Machine Learning or Neural Networks it can take a bit of expertise to get good models. The most important quantity to keep track of is the difference between your training loss (printed during training) and the validation loss (printed once in a while when the RNN is run on the validation data (by default every 1000 iterations)). In particular: 104 | 105 | - If your training loss is much lower than validation loss then this means the network might be **overfitting**. Solutions to this are to decrease your network size, or to increase dropout. For example you could try dropout of 0.5 and so on. 106 | - If your training/validation loss are about equal then your model is **underfitting**. Increase the size of your model (either number of layers or the raw number of neurons per layer) 107 | 108 | ### Approximate number of parameters 109 | 110 | The two most important parameters that control the model are `rnn_size` and `num_layers`. I would advise that you always use `num_layers` of either 2/3. The `rnn_size` can be adjusted based on how much data you have. The two important quantities to keep track of here are: 111 | 112 | - The number of parameters in your model. This is printed when you start training. 113 | - The size of your dataset. 1MB file is approximately 1 million characters. 114 | 115 | These two should be about the same order of magnitude. It's a little tricky to tell. 
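
A back-of-the-envelope way to get the first number without launching a run is to add up the `nn.Linear` weights and biases that `model/LSTM.lua` creates. The helper below is a hypothetical sketch that follows that construction (it is not part of this repo):

```lua
-- Hypothetical helper (not part of this repo): approximate the parameter
-- count of the LSTM that model/LSTM.lua constructs. Per layer it creates
-- i2h = Linear(insize, 4*rnn_size) and h2h = Linear(rnn_size, 4*rnn_size),
-- plus a final decoder Linear(rnn_size, vocab_size).
local function lstm_param_count(vocab_size, rnn_size, num_layers)
  local total = 0
  for L = 1, num_layers do
    local insize = (L == 1) and vocab_size or rnn_size
    total = total + 4 * rnn_size * (insize + rnn_size) -- i2h + h2h weights
    total = total + 2 * (4 * rnn_size)                 -- i2h + h2h biases
  end
  return total + vocab_size * rnn_size + vocab_size    -- decoder weights + bias
end

-- the exact figure depends on your dataset's vocabulary size, e.g.:
print(lstm_param_count(65, 128, 2))
```
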
Here are some examples: 116 | 117 | - I have a 100MB dataset and I'm using the default parameter settings (which currently print 150K parameters). My data size is significantly larger (100 mil >> 0.15 mil), so I expect to heavily underfit. I am thinking I can comfortably afford to make `rnn_size` larger. 118 | - I have a 10MB dataset and running a 10 million parameter model. I'm slightly nervous and I'm carefully monitoring my validation loss. If it's larger than my training loss then I may want to try to increase dropout a bit and see if that heps the validation loss. 119 | 120 | ### Best models strategy 121 | 122 | The winning strategy to obtaining very good models (if you have the compute time) is to always err on making the network larger (as large as you're willing to wait for it to compute) and then try different dropout values (between 0,1). Whatever model has the best validation performance (the loss, written in the checkpoint filename, low is good) is the one you should use in the end. 123 | 124 | It is very common in deep learning to run many different models with many different hyperparameter settings, and in the end take whatever checkpoint gave the best validation performance. 125 | 126 | By the way, the size of your training and validation splits are also parameters. Make sure you have a decent amount of data in your validation set or otherwise the validation performance will be noisy and not very informative. 127 | 128 | ## Additional Pointers and Acknowledgements 129 | 130 | This code was originally based on Oxford University Machine Learning class [practical 6](https://github.com/oxford-cs-ml-2015/practical6), which is in turn based on [learning to execute](https://github.com/wojciechz/learning_to_execute) code from Wojciech Zaremba. Chunks of it were also developed in collaboration with my labmate [Justin Johnson](http://cs.stanford.edu/people/jcjohns/). 131 | 132 | To learn more about RNN language models I recommend looking at: 133 | 134 | - [My recent talk](https://skillsmatter.com/skillscasts/6611-visualizing-and-understanding-recurrent-networks) on char-rnn 135 | - [Generating Sequences With Recurrent Neural Networks](http://arxiv.org/abs/1308.0850) by Alex Graves 136 | - [Generating Text with Recurrent Neural Networks](http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf) by Ilya Sutskever 137 | - [Tomas Mikolov's Thesis](http://www.fit.vutbr.cz/~imikolov/rnnlm/thesis.pdf) 138 | 139 | ## License 140 | 141 | MIT 142 | -------------------------------------------------------------------------------- /train.lua: -------------------------------------------------------------------------------- 1 | 2 | --[[ 3 | 4 | This file trains a character-level multi-layer RNN on text data 5 | 6 | Code is based on implementation in 7 | https://github.com/oxford-cs-ml-2015/practical6 8 | but modified to have multi-layer support, GPU support, as well as 9 | many other common model/optimization bells and whistles. 10 | The practical6 code is in turn based on 11 | https://github.com/wojciechz/learning_to_execute 12 | which is turn based on other stuff in Torch, etc... 
(long lineage) 13 | 14 | ]]-- 15 | 16 | require 'torch' 17 | require 'nn' 18 | require 'nngraph' 19 | require 'optim' 20 | require 'lfs' 21 | 22 | require 'util.OneHot' 23 | require 'util.misc' 24 | local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader' 25 | local model_utils = require 'util.model_utils' 26 | local LSTM = require 'model.LSTM' 27 | local GRU = require 'model.GRU' 28 | local RNN = require 'model.RNN' 29 | 30 | cmd = torch.CmdLine() 31 | cmd:text() 32 | cmd:text('Train a character-level language model') 33 | cmd:text() 34 | cmd:text('Options') 35 | -- data 36 | cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain the file input.txt with input data') 37 | -- model params 38 | cmd:option('-rnn_size', 128, 'size of LSTM internal state') 39 | cmd:option('-num_layers', 2, 'number of layers in the LSTM') 40 | cmd:option('-model', 'lstm', 'lstm,gru or rnn') 41 | -- optimization 42 | cmd:option('-learning_rate',2e-3,'learning rate') 43 | cmd:option('-learning_rate_decay',0.97,'learning rate decay') 44 | cmd:option('-learning_rate_decay_after',10,'in number of epochs, when to start decaying the learning rate') 45 | cmd:option('-decay_rate',0.95,'decay rate for rmsprop') 46 | cmd:option('-dropout',0,'dropout for regularization, used after each RNN hidden layer. 0 = no dropout') 47 | cmd:option('-seq_length',50,'number of timesteps to unroll for') 48 | cmd:option('-batch_size',50,'number of sequences to train on in parallel') 49 | cmd:option('-max_epochs',50,'number of full passes through the training data') 50 | cmd:option('-grad_clip',5,'clip gradients at this value') 51 | cmd:option('-train_frac',0.95,'fraction of data that goes into train set') 52 | cmd:option('-val_frac',0.05,'fraction of data that goes into validation set') 53 | -- test_frac will be computed as (1 - train_frac - val_frac) 54 | cmd:option('-init_from', '', 'initialize network parameters from checkpoint at this path') 55 | -- bookkeeping 56 | cmd:option('-seed',123,'torch manual random number generator seed') 57 | cmd:option('-print_every',1,'how many steps/minibatches between printing out the loss') 58 | cmd:option('-eval_val_every',1000,'every how many iterations should we evaluate on validation data?') 59 | cmd:option('-checkpoint_dir', 'cv', 'output directory where checkpoints get written') 60 | cmd:option('-savefile','lstm','filename to autosave the checkpont to. Will be inside checkpoint_dir/') 61 | cmd:option('-accurate_gpu_timing',0,'set this flag to 1 to get precise timings when using GPU. Might make code bit slower but reports accurate timings.') 62 | -- GPU/CPU 63 | cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU') 64 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 65 | cmd:text() 66 | 67 | -- parse input params 68 | opt = cmd:parse(arg) 69 | torch.manualSeed(opt.seed) 70 | -- train / val / test split for data, in fractions 71 | local test_frac = math.max(0, 1 - (opt.train_frac + opt.val_frac)) 72 | local split_sizes = {opt.train_frac, opt.val_frac, test_frac} 73 | 74 | -- initialize cunn/cutorch for training on the GPU and fall back to CPU gracefully 75 | if opt.gpuid >= 0 and opt.opencl == 0 then 76 | local ok, cunn = pcall(require, 'cunn') 77 | local ok2, cutorch = pcall(require, 'cutorch') 78 | if not ok then print('package cunn not found!') end 79 | if not ok2 then print('package cutorch not found!') end 80 | if ok and ok2 then 81 | print('using CUDA on GPU ' .. opt.gpuid .. 
'...') 82 | cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 83 | cutorch.manualSeed(opt.seed) 84 | else 85 | print('If cutorch and cunn are installed, your CUDA toolkit may be improperly configured.') 86 | print('Check your CUDA toolkit installation, rebuild cutorch and cunn, and try again.') 87 | print('Falling back on CPU mode') 88 | opt.gpuid = -1 -- overwrite user setting 89 | end 90 | end 91 | 92 | -- initialize clnn/cltorch for training on the GPU and fall back to CPU gracefully 93 | if opt.gpuid >= 0 and opt.opencl == 1 then 94 | local ok, cunn = pcall(require, 'clnn') 95 | local ok2, cutorch = pcall(require, 'cltorch') 96 | if not ok then print('package clnn not found!') end 97 | if not ok2 then print('package cltorch not found!') end 98 | if ok and ok2 then 99 | print('using OpenCL on GPU ' .. opt.gpuid .. '...') 100 | cltorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 101 | torch.manualSeed(opt.seed) 102 | else 103 | print('If cltorch and clnn are installed, your OpenCL driver may be improperly configured.') 104 | print('Check your OpenCL driver installation, check output of clinfo command, and try again.') 105 | print('Falling back on CPU mode') 106 | opt.gpuid = -1 -- overwrite user setting 107 | end 108 | end 109 | 110 | -- create the data loader class 111 | local loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes) 112 | local vocab_size = loader.vocab_size -- the number of distinct characters 113 | local vocab = loader.vocab_mapping 114 | print('vocab size: ' .. vocab_size) 115 | -- make sure output directory exists 116 | if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end 117 | 118 | -- define the model: prototypes for one timestep, then clone them in time 119 | local do_random_init = true 120 | if string.len(opt.init_from) > 0 then 121 | print('loading a model from checkpoint ' .. opt.init_from) 122 | local checkpoint = torch.load(opt.init_from) 123 | protos = checkpoint.protos 124 | -- make sure the vocabs are the same 125 | local vocab_compatible = true 126 | local checkpoint_vocab_size = 0 127 | for c,i in pairs(checkpoint.vocab) do 128 | if not (vocab[c] == i) then 129 | vocab_compatible = false 130 | end 131 | checkpoint_vocab_size = checkpoint_vocab_size + 1 132 | end 133 | if not (checkpoint_vocab_size == vocab_size) then 134 | vocab_compatible = false 135 | print('checkpoint_vocab_size: ' .. checkpoint_vocab_size) 136 | end 137 | assert(vocab_compatible, 'error, the character vocabulary for this dataset and the one in the saved checkpoint are not the same. This is trouble.') 138 | -- overwrite model settings based on checkpoint to ensure compatibility 139 | print('overwriting rnn_size=' .. checkpoint.opt.rnn_size .. ', num_layers=' .. checkpoint.opt.num_layers .. ', model=' .. checkpoint.opt.model .. ' based on the checkpoint.') 140 | opt.rnn_size = checkpoint.opt.rnn_size 141 | opt.num_layers = checkpoint.opt.num_layers 142 | opt.model = checkpoint.opt.model 143 | do_random_init = false 144 | else 145 | print('creating an ' .. opt.model .. ' with ' .. opt.num_layers .. 
' layers') 146 | protos = {} 147 | if opt.model == 'lstm' then 148 | protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) 149 | elseif opt.model == 'gru' then 150 | protos.rnn = GRU.gru(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) 151 | elseif opt.model == 'rnn' then 152 | protos.rnn = RNN.rnn(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) 153 | end 154 | protos.criterion = nn.ClassNLLCriterion() 155 | end 156 | 157 | -- the initial state of the cell/hidden states 158 | init_state = {} 159 | for L=1,opt.num_layers do 160 | local h_init = torch.zeros(opt.batch_size, opt.rnn_size) 161 | if opt.gpuid >=0 and opt.opencl == 0 then h_init = h_init:cuda() end 162 | if opt.gpuid >=0 and opt.opencl == 1 then h_init = h_init:cl() end 163 | table.insert(init_state, h_init:clone()) 164 | if opt.model == 'lstm' then 165 | table.insert(init_state, h_init:clone()) 166 | end 167 | end 168 | 169 | -- ship the model to the GPU if desired 170 | if opt.gpuid >= 0 and opt.opencl == 0 then 171 | for k,v in pairs(protos) do v:cuda() end 172 | end 173 | if opt.gpuid >= 0 and opt.opencl == 1 then 174 | for k,v in pairs(protos) do v:cl() end 175 | end 176 | 177 | -- put the above things into one flattened parameters tensor 178 | params, grad_params = model_utils.combine_all_parameters(protos.rnn) 179 | 180 | -- initialization 181 | if do_random_init then 182 | params:uniform(-0.08, 0.08) -- small uniform numbers 183 | end 184 | -- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning 185 | if opt.model == 'lstm' then 186 | for layer_idx = 1, opt.num_layers do 187 | for _,node in ipairs(protos.rnn.forwardnodes) do 188 | if node.data.annotations.name == "i2h_" .. layer_idx then 189 | print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx) 190 | -- the gates are, in order, i,f,o,g, so f is the 2nd block of weights 191 | node.data.module.bias[{{opt.rnn_size+1, 2*opt.rnn_size}}]:fill(1.0) 192 | end 193 | end 194 | end 195 | end 196 | 197 | print('number of parameters in the model: ' .. params:nElement()) 198 | -- make a bunch of clones after flattening, as that reallocates memory 199 | clones = {} 200 | for name,proto in pairs(protos) do 201 | print('cloning ' .. name) 202 | clones[name] = model_utils.clone_many_times(proto, opt.seq_length, not proto.parameters) 203 | end 204 | 205 | -- preprocessing helper function 206 | function prepro(x,y) 207 | x = x:transpose(1,2):contiguous() -- swap the axes for faster indexing 208 | y = y:transpose(1,2):contiguous() 209 | if opt.gpuid >= 0 and opt.opencl == 0 then -- ship the input arrays to GPU 210 | -- have to convert to float because integers can't be cuda()'d 211 | x = x:float():cuda() 212 | y = y:float():cuda() 213 | end 214 | if opt.gpuid >= 0 and opt.opencl == 1 then -- ship the input arrays to GPU 215 | x = x:cl() 216 | y = y:cl() 217 | end 218 | return x,y 219 | end 220 | 221 | -- evaluate the loss over an entire split 222 | function eval_split(split_index, max_batches) 223 | print('evaluating loss over split index ' .. 
split_index) 224 | local n = loader.split_sizes[split_index] 225 | if max_batches ~= nil then n = math.min(max_batches, n) end 226 | 227 | loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front 228 | local loss = 0 229 | local rnn_state = {[0] = init_state} 230 | 231 | for i = 1,n do -- iterate over batches in the split 232 | -- fetch a batch 233 | local x, y = loader:next_batch(split_index) 234 | x,y = prepro(x,y) 235 | -- forward pass 236 | for t=1,opt.seq_length do 237 | clones.rnn[t]:evaluate() -- for dropout proper functioning 238 | local lst = clones.rnn[t]:forward{x[t], unpack(rnn_state[t-1])} 239 | rnn_state[t] = {} 240 | for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end 241 | prediction = lst[#lst] 242 | loss = loss + clones.criterion[t]:forward(prediction, y[t]) 243 | end 244 | -- carry over lstm state 245 | rnn_state[0] = rnn_state[#rnn_state] 246 | print(i .. '/' .. n .. '...') 247 | end 248 | 249 | loss = loss / opt.seq_length / n 250 | return loss 251 | end 252 | 253 | -- do fwd/bwd and return loss, grad_params 254 | local init_state_global = clone_list(init_state) 255 | function feval(x) 256 | if x ~= params then 257 | params:copy(x) 258 | end 259 | grad_params:zero() 260 | 261 | ------------------ get minibatch ------------------- 262 | local x, y = loader:next_batch(1) 263 | x,y = prepro(x,y) 264 | ------------------- forward pass ------------------- 265 | local rnn_state = {[0] = init_state_global} 266 | local predictions = {} -- softmax outputs 267 | local loss = 0 268 | for t=1,opt.seq_length do 269 | clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag) 270 | local lst = clones.rnn[t]:forward{x[t], unpack(rnn_state[t-1])} 271 | rnn_state[t] = {} 272 | for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output 273 | predictions[t] = lst[#lst] -- last element is the prediction 274 | loss = loss + clones.criterion[t]:forward(predictions[t], y[t]) 275 | end 276 | loss = loss / opt.seq_length 277 | ------------------ backward pass ------------------- 278 | -- initialize gradient at time t to be zeros (there's no influence from future) 279 | local drnn_state = {[opt.seq_length] = clone_list(init_state, true)} -- true also zeros the clones 280 | for t=opt.seq_length,1,-1 do 281 | -- backprop through loss, and softmax/linear 282 | local doutput_t = clones.criterion[t]:backward(predictions[t], y[t]) 283 | table.insert(drnn_state[t], doutput_t) 284 | local dlst = clones.rnn[t]:backward({x[t], unpack(rnn_state[t-1])}, drnn_state[t]) 285 | drnn_state[t-1] = {} 286 | for k,v in pairs(dlst) do 287 | if k > 1 then -- k == 1 is gradient on x, which we dont need 288 | -- note we do k-1 because first item is dembeddings, and then follow the 289 | -- derivatives of the state, starting at index 2. I know... 290 | drnn_state[t-1][k-1] = v 291 | end 292 | end 293 | end 294 | ------------------------ misc ---------------------- 295 | -- transfer final state to initial state (BPTT) 296 | init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right? 297 | -- grad_params:div(opt.seq_length) -- this line should be here but since we use rmsprop it would have no effect. 
Removing for efficiency 298 | -- clip gradient element-wise 299 | grad_params:clamp(-opt.grad_clip, opt.grad_clip) 300 | return loss, grad_params 301 | end 302 | 303 | -- start optimization here 304 | train_losses = {} 305 | val_losses = {} 306 | local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate} 307 | local iterations = opt.max_epochs * loader.ntrain 308 | local iterations_per_epoch = loader.ntrain 309 | local loss0 = nil 310 | for i = 1, iterations do 311 | local epoch = i / loader.ntrain 312 | 313 | local timer = torch.Timer() 314 | local _, loss = optim.rmsprop(feval, params, optim_state) 315 | if opt.accurate_gpu_timing == 1 and opt.gpuid >= 0 then 316 | --[[ 317 | Note on timing: The reported time can be off because the GPU is invoked async. If one 318 | wants to have exactly accurate timings one must call cutorch.synchronize() right here. 319 | I will avoid doing so by default because this can incur computational overhead. 320 | --]] 321 | cutorch.synchronize() 322 | end 323 | local time = timer:time().real 324 | 325 | local train_loss = loss[1] -- the loss is inside a list, pop it 326 | train_losses[i] = train_loss 327 | 328 | -- exponential learning rate decay 329 | if i % loader.ntrain == 0 and opt.learning_rate_decay < 1 then 330 | if epoch >= opt.learning_rate_decay_after then 331 | local decay_factor = opt.learning_rate_decay 332 | optim_state.learningRate = optim_state.learningRate * decay_factor -- decay it 333 | print('decayed learning rate by a factor ' .. decay_factor .. ' to ' .. optim_state.learningRate) 334 | end 335 | end 336 | 337 | -- every now and then or on last iteration 338 | if i % opt.eval_val_every == 0 or i == iterations then 339 | -- evaluate loss on validation data 340 | local val_loss = eval_split(2) -- 2 = validation 341 | val_losses[i] = val_loss 342 | 343 | local savefile = string.format('%s/lm_%s_epoch%.2f_%.4f.t7', opt.checkpoint_dir, opt.savefile, epoch, val_loss) 344 | print('saving checkpoint to ' .. savefile) 345 | local checkpoint = {} 346 | checkpoint.protos = protos 347 | checkpoint.opt = opt 348 | checkpoint.train_losses = train_losses 349 | checkpoint.val_loss = val_loss 350 | checkpoint.val_losses = val_losses 351 | checkpoint.i = i 352 | checkpoint.epoch = epoch 353 | checkpoint.vocab = loader.vocab_mapping 354 | torch.save(savefile, checkpoint) 355 | end 356 | 357 | if i % opt.print_every == 0 then 358 | print(string.format("%d/%d (epoch %.3f), train_loss = %6.8f, grad/param norm = %6.4e, time/batch = %.4fs", i, iterations, epoch, train_loss, grad_params:norm() / params:norm(), time)) 359 | end 360 | 361 | if i % 10 == 0 then collectgarbage() end 362 | 363 | -- handle early stopping if things are going really bad 364 | if loss[1] ~= loss[1] then 365 | print('loss is NaN. This usually indicates a bug. Please check the issues page for existing issues, or create a new issue, if none exist. Ideally, please state: your operating system, 32-bit/64-bit, your blas version, cpu/cuda/cl?') 366 | break -- halt 367 | end 368 | if loss0 == nil then loss0 = loss[1] end 369 | if loss[1] > loss0 * 3 then 370 | print('loss is exploding, aborting.') 371 | break -- halt 372 | end 373 | end 374 | 375 | 376 | --------------------------------------------------------------------------------