├── .gitignore ├── util ├── misc.lua ├── OneHot.lua ├── model_utils.lua └── CharSplitLMMinibatchLoader.lua ├── inspect_checkpoint.lua ├── model ├── RNN.lua ├── GRU.lua └── LSTM.lua ├── convert_gpu_cpu_checkpoint.lua ├── sample.lua ├── Readme.md └── train.lua /.gitignore: -------------------------------------------------------------------------------- 1 | *.t7 2 | -------------------------------------------------------------------------------- /util/misc.lua: -------------------------------------------------------------------------------- 1 | 2 | -- misc utilities 3 | 4 | function clone_list(tensor_list, zero_too) 5 | -- utility function. todo: move away to some utils file? 6 | -- takes a list of tensors and returns a list of cloned tensors 7 | local out = {} 8 | for k,v in pairs(tensor_list) do 9 | out[k] = v:clone() 10 | if zero_too then out[k]:zero() end 11 | end 12 | return out 13 | end -------------------------------------------------------------------------------- /util/OneHot.lua: -------------------------------------------------------------------------------- 1 | 2 | local OneHot, parent = torch.class('OneHot', 'nn.Module') 3 | 4 | function OneHot:__init(outputSize) 5 | parent.__init(self) 6 | self.outputSize = outputSize 7 | -- We'll construct one-hot encodings by using the index method to 8 | -- reshuffle the rows of an identity matrix. To avoid recreating 9 | -- it every iteration we'll cache it. 10 | self._eye = torch.eye(outputSize) 11 | end 12 | 13 | function OneHot:updateOutput(input) 14 | self.output:resize(input:size(1), self.outputSize):zero() 15 | if self._eye == nil then self._eye = torch.eye(self.outputSize) end 16 | self._eye = self._eye:float() 17 | local longInput = input:long() 18 | self.output:copy(self._eye:index(1, longInput)) 19 | return self.output 20 | end 21 | -------------------------------------------------------------------------------- /inspect_checkpoint.lua: -------------------------------------------------------------------------------- 1 | -- simple script that loads a checkpoint and prints its opts 2 | 3 | require 'torch' 4 | require 'nn' 5 | require 'nngraph' 6 | 7 | require 'util.OneHot' 8 | require 'util.misc' 9 | 10 | cmd = torch.CmdLine() 11 | cmd:text() 12 | cmd:text('Load a checkpoint and print its options and validation losses.') 13 | cmd:text() 14 | cmd:text('Options') 15 | cmd:argument('-model','model to load') 16 | cmd:option('-gpuid',0,'gpu to use') 17 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 18 | cmd:text() 19 | 20 | -- parse input params 21 | opt = cmd:parse(arg) 22 | 23 | if opt.gpuid >= 0 and opt.opencl == 0 then 24 | print('using CUDA on GPU ' .. opt.gpuid .. '...') 25 | require 'cutorch' 26 | require 'cunn' 27 | cutorch.setDevice(opt.gpuid + 1) 28 | end 29 | 30 | if opt.gpuid >= 0 and opt.opencl == 1 then 31 | print('using OpenCL on GPU ' .. opt.gpuid .. 
'...') 32 | require 'cltorch' 33 | require 'clnn' 34 | cltorch.setDevice(opt.gpuid + 1) 35 | end 36 | 37 | local model = torch.load(opt.model) 38 | 39 | print('opt:') 40 | print(model.opt) 41 | print('val losses:') 42 | print(model.val_losses) 43 | 44 | -------------------------------------------------------------------------------- /model/RNN.lua: -------------------------------------------------------------------------------- 1 | local RNN = {} 2 | 3 | function RNN.rnn(input_size, rnn_size, n, dropout) 4 | 5 | -- there are n+1 inputs (hiddens on each layer and x) 6 | local inputs = {} 7 | table.insert(inputs, nn.Identity()()) -- x 8 | for L = 1,n do 9 | table.insert(inputs, nn.Identity()()) -- prev_h[L] 10 | 11 | end 12 | 13 | local x, input_size_L 14 | local outputs = {} 15 | for L = 1,n do 16 | 17 | local prev_h = inputs[L+1] 18 | if L == 1 then 19 | x = OneHot(input_size)(inputs[1]) 20 | input_size_L = input_size 21 | else 22 | x = outputs[(L-1)] 23 | if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any 24 | input_size_L = rnn_size 25 | end 26 | 27 | -- RNN tick 28 | local i2h = nn.Linear(input_size_L, rnn_size)(x) 29 | local h2h = nn.Linear(rnn_size, rnn_size)(prev_h) 30 | local next_h = nn.Tanh()(nn.CAddTable(){i2h, h2h}) 31 | 32 | table.insert(outputs, next_h) 33 | end 34 | -- set up the decoder 35 | local top_h = outputs[#outputs] 36 | if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end 37 | local proj = nn.Linear(rnn_size, input_size)(top_h) 38 | local logsoft = nn.LogSoftMax()(proj) 39 | table.insert(outputs, logsoft) 40 | 41 | return nn.gModule(inputs, outputs) 42 | end 43 | 44 | return RNN 45 | -------------------------------------------------------------------------------- /model/GRU.lua: -------------------------------------------------------------------------------- 1 | 2 | local GRU = {} 3 | 4 | --[[ 5 | Creates one timestep of one GRU 6 | Paper reference: http://arxiv.org/pdf/1412.3555v1.pdf 7 | ]]-- 8 | function GRU.gru(input_size, rnn_size, n, dropout) 9 | dropout = dropout or 0 10 | -- there are n+1 inputs (hiddens on each layer and x) 11 | local inputs = {} 12 | table.insert(inputs, nn.Identity()()) -- x 13 | for L = 1,n do 14 | table.insert(inputs, nn.Identity()()) -- prev_h[L] 15 | end 16 | 17 | function new_input_sum(insize, xv, hv) 18 | local i2h = nn.Linear(insize, rnn_size)(xv) 19 | local h2h = nn.Linear(rnn_size, rnn_size)(hv) 20 | return nn.CAddTable()({i2h, h2h}) 21 | end 22 | 23 | local x, input_size_L 24 | local outputs = {} 25 | for L = 1,n do 26 | 27 | local prev_h = inputs[L+1] 28 | -- the input to this layer 29 | if L == 1 then 30 | x = OneHot(input_size)(inputs[1]) 31 | input_size_L = input_size 32 | else 33 | x = outputs[(L-1)] 34 | if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any 35 | input_size_L = rnn_size 36 | end 37 | -- GRU tick 38 | -- forward the update and reset gates 39 | local update_gate = nn.Sigmoid()(new_input_sum(input_size_L, x, prev_h)) 40 | local reset_gate = nn.Sigmoid()(new_input_sum(input_size_L, x, prev_h)) 41 | -- compute candidate hidden state 42 | local gated_hidden = nn.CMulTable()({reset_gate, prev_h}) 43 | local p2 = nn.Linear(rnn_size, rnn_size)(gated_hidden) 44 | local p1 = nn.Linear(input_size_L, rnn_size)(x) 45 | local hidden_candidate = nn.Tanh()(nn.CAddTable()({p1,p2})) 46 | -- compute new interpolated hidden state, based on the update gate 47 | local zh = nn.CMulTable()({update_gate, hidden_candidate}) 48 | local zhm1 = 
nn.CMulTable()({nn.AddConstant(1,false)(nn.MulConstant(-1,false)(update_gate)), prev_h}) 49 | local next_h = nn.CAddTable()({zh, zhm1}) 50 | 51 | table.insert(outputs, next_h) 52 | end 53 | -- set up the decoder 54 | local top_h = outputs[#outputs] 55 | if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end 56 | local proj = nn.Linear(rnn_size, input_size)(top_h) 57 | local logsoft = nn.LogSoftMax()(proj) 58 | table.insert(outputs, logsoft) 59 | 60 | return nn.gModule(inputs, outputs) 61 | end 62 | 63 | return GRU 64 | -------------------------------------------------------------------------------- /model/LSTM.lua: -------------------------------------------------------------------------------- 1 | 2 | local LSTM = {} 3 | function LSTM.lstm(input_size, rnn_size, n, dropout) 4 | dropout = dropout or 0 5 | 6 | -- there will be 2*n+1 inputs 7 | local inputs = {} 8 | table.insert(inputs, nn.Identity()()) -- x 9 | for L = 1,n do 10 | table.insert(inputs, nn.Identity()()) -- prev_c[L] 11 | table.insert(inputs, nn.Identity()()) -- prev_h[L] 12 | end 13 | 14 | local x, input_size_L 15 | local outputs = {} 16 | for L = 1,n do 17 | -- c,h from previos timesteps 18 | local prev_h = inputs[L*2+1] 19 | local prev_c = inputs[L*2] 20 | -- the input to this layer 21 | if L == 1 then 22 | x = OneHot(input_size)(inputs[1]) 23 | input_size_L = input_size 24 | else 25 | x = outputs[(L-1)*2] 26 | if dropout > 0 then x = nn.Dropout(dropout)(x) end -- apply dropout, if any 27 | input_size_L = rnn_size 28 | end 29 | -- evaluate the input sums at once for efficiency 30 | local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x):annotate{name='i2h_'..L} 31 | local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h):annotate{name='h2h_'..L} 32 | local all_input_sums = nn.CAddTable()({i2h, h2h}) 33 | 34 | local reshaped = nn.Reshape(4, rnn_size)(all_input_sums) 35 | local n1, n2, n3, n4 = nn.SplitTable(2)(reshaped):split(4) 36 | -- decode the gates 37 | local in_gate = nn.Sigmoid()(n1) 38 | local forget_gate = nn.Sigmoid()(n2) 39 | local out_gate = nn.Sigmoid()(n3) 40 | -- decode the write inputs 41 | local in_transform = nn.Tanh()(n4) 42 | -- perform the LSTM update 43 | local next_c = nn.CAddTable()({ 44 | nn.CMulTable()({forget_gate, prev_c}), 45 | nn.CMulTable()({in_gate, in_transform}) 46 | }) 47 | -- gated cells form the output 48 | local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)}) 49 | 50 | table.insert(outputs, next_c) 51 | table.insert(outputs, next_h) 52 | end 53 | 54 | -- set up the decoder 55 | local top_h = outputs[#outputs] 56 | if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end 57 | local proj = nn.Linear(rnn_size, input_size)(top_h):annotate{name='decoder'} 58 | local logsoft = nn.LogSoftMax()(proj) 59 | table.insert(outputs, logsoft) 60 | 61 | return nn.gModule(inputs, outputs) 62 | end 63 | 64 | return LSTM 65 | 66 | -------------------------------------------------------------------------------- /convert_gpu_cpu_checkpoint.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | A quick patch for converting GPU checkpoints to 3 | CPU checkpoints until I implement a more long-term 4 | solution. Takes the path to the model and creates 5 | a file in the same location and path, but with _cpu.t7 6 | appended. 
7 | ]]-- 8 | 9 | require 'torch' 10 | require 'nn' 11 | require 'nngraph' 12 | require 'lfs' 13 | 14 | require 'util.OneHot' 15 | require 'util.misc' 16 | 17 | cmd = torch.CmdLine() 18 | cmd:text() 19 | cmd:text('Sample from a character-level language model') 20 | cmd:text() 21 | cmd:text('Options') 22 | cmd:argument('-model','GPU model checkpoint to convert') 23 | cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU') 24 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 25 | cmd:text() 26 | 27 | -- parse input params 28 | opt = cmd:parse(arg) 29 | 30 | -- check that cunn/cutorch are installed if user wants to use the GPU 31 | if opt.gpuid >= 0 and opt.opencl == 0 then 32 | local ok, cunn = pcall(require, 'cunn') 33 | local ok2, cutorch = pcall(require, 'cutorch') 34 | if not ok then print('package cunn not found!') end 35 | if not ok2 then print('package cutorch not found!') end 36 | if ok and ok2 then 37 | print('using CUDA on GPU ' .. opt.gpuid .. '...') 38 | cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 39 | else 40 | print('Error, no GPU available?') 41 | os.exit() 42 | end 43 | end 44 | 45 | -- check that clnn/cltorch are installed if user wants to use OpenCL 46 | if opt.gpuid >= 0 and opt.opencl == 1 then 47 | local ok, cunn = pcall(require, 'clnn') 48 | local ok2, cutorch = pcall(require, 'cltorch') 49 | if not ok then print('package clnn not found!') end 50 | if not ok2 then print('package cltorch not found!') end 51 | if ok and ok2 then 52 | print('using OpenCL on GPU ' .. opt.gpuid .. '...') 53 | cltorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 54 | else 55 | print('Error, no GPU available?') 56 | os.exit() 57 | end 58 | end 59 | 60 | print('loading ' .. opt.model) 61 | checkpoint = torch.load(opt.model) 62 | protos = checkpoint.protos 63 | 64 | -- convert the networks to be CPU models 65 | for k,v in pairs(protos) do 66 | print('converting ' .. k .. ' to CPU') 67 | protos[k]:double() 68 | end 69 | 70 | local savefile = opt.model .. '_cpu.t7' -- append "cpu.t7" to filename 71 | torch.save(savefile, checkpoint) 72 | print('saved ' .. savefile) 73 | 74 | 75 | -------------------------------------------------------------------------------- /util/model_utils.lua: -------------------------------------------------------------------------------- 1 | 2 | -- adapted from https://github.com/wojciechz/learning_to_execute 3 | -- utilities for combining/flattening parameters in a model 4 | -- the code in this script is more general than it needs to be, which is 5 | -- why it is kind of a large 6 | 7 | require 'torch' 8 | local model_utils = {} 9 | function model_utils.combine_all_parameters(...) 
10 | --[[ like module:getParameters, but operates on many modules ]]-- 11 | 12 | -- get parameters 13 | local networks = {...} 14 | local parameters = {} 15 | local gradParameters = {} 16 | for i = 1, #networks do 17 | local net_params, net_grads = networks[i]:parameters() 18 | 19 | if net_params then 20 | for _, p in pairs(net_params) do 21 | parameters[#parameters + 1] = p 22 | end 23 | for _, g in pairs(net_grads) do 24 | gradParameters[#gradParameters + 1] = g 25 | end 26 | end 27 | end 28 | 29 | local function storageInSet(set, storage) 30 | local storageAndOffset = set[torch.pointer(storage)] 31 | if storageAndOffset == nil then 32 | return nil 33 | end 34 | local _, offset = unpack(storageAndOffset) 35 | return offset 36 | end 37 | 38 | -- this function flattens arbitrary lists of parameters, 39 | -- even complex shared ones 40 | local function flatten(parameters) 41 | if not parameters or #parameters == 0 then 42 | return torch.Tensor() 43 | end 44 | local Tensor = parameters[1].new 45 | 46 | local storages = {} 47 | local nParameters = 0 48 | for k = 1,#parameters do 49 | local storage = parameters[k]:storage() 50 | if not storageInSet(storages, storage) then 51 | storages[torch.pointer(storage)] = {storage, nParameters} 52 | nParameters = nParameters + storage:size() 53 | end 54 | end 55 | 56 | local flatParameters = Tensor(nParameters):fill(1) 57 | local flatStorage = flatParameters:storage() 58 | 59 | for k = 1,#parameters do 60 | local storageOffset = storageInSet(storages, parameters[k]:storage()) 61 | parameters[k]:set(flatStorage, 62 | storageOffset + parameters[k]:storageOffset(), 63 | parameters[k]:size(), 64 | parameters[k]:stride()) 65 | parameters[k]:zero() 66 | end 67 | 68 | local maskParameters= flatParameters:float():clone() 69 | local cumSumOfHoles = flatParameters:float():cumsum(1) 70 | local nUsedParameters = nParameters - cumSumOfHoles[#cumSumOfHoles] 71 | local flatUsedParameters = Tensor(nUsedParameters) 72 | local flatUsedStorage = flatUsedParameters:storage() 73 | 74 | for k = 1,#parameters do 75 | local offset = cumSumOfHoles[parameters[k]:storageOffset()] 76 | parameters[k]:set(flatUsedStorage, 77 | parameters[k]:storageOffset() - offset, 78 | parameters[k]:size(), 79 | parameters[k]:stride()) 80 | end 81 | 82 | for _, storageAndOffset in pairs(storages) do 83 | local k, v = unpack(storageAndOffset) 84 | flatParameters[{{v+1,v+k:size()}}]:copy(Tensor():set(k)) 85 | end 86 | 87 | if cumSumOfHoles:sum() == 0 then 88 | flatUsedParameters:copy(flatParameters) 89 | else 90 | local counter = 0 91 | for k = 1,flatParameters:nElement() do 92 | if maskParameters[k] == 0 then 93 | counter = counter + 1 94 | flatUsedParameters[counter] = flatParameters[counter+cumSumOfHoles[k]] 95 | end 96 | end 97 | assert (counter == nUsedParameters) 98 | end 99 | return flatUsedParameters 100 | end 101 | 102 | -- flatten parameters and gradients 103 | local flatParameters = flatten(parameters) 104 | local flatGradParameters = flatten(gradParameters) 105 | 106 | -- return new flat vector that contains all discrete parameters 107 | return flatParameters, flatGradParameters 108 | end 109 | 110 | 111 | 112 | 113 | function model_utils.clone_many_times(net, T) 114 | local clones = {} 115 | 116 | local params, gradParams 117 | if net.parameters then 118 | params, gradParams = net:parameters() 119 | if params == nil then 120 | params = {} 121 | end 122 | end 123 | 124 | local paramsNoGrad 125 | if net.parametersNoGrad then 126 | paramsNoGrad = net:parametersNoGrad() 127 | end 128 | 129 | 
local mem = torch.MemoryFile("w"):binary() 130 | mem:writeObject(net) 131 | 132 | for t = 1, T do 133 | -- We need to use a new reader for each clone. 134 | -- We don't want to use the pointers to already read objects. 135 | local reader = torch.MemoryFile(mem:storage(), "r"):binary() 136 | local clone = reader:readObject() 137 | reader:close() 138 | 139 | if net.parameters then 140 | local cloneParams, cloneGradParams = clone:parameters() 141 | local cloneParamsNoGrad 142 | for i = 1, #params do 143 | cloneParams[i]:set(params[i]) 144 | cloneGradParams[i]:set(gradParams[i]) 145 | end 146 | if paramsNoGrad then 147 | cloneParamsNoGrad = clone:parametersNoGrad() 148 | for i =1,#paramsNoGrad do 149 | cloneParamsNoGrad[i]:set(paramsNoGrad[i]) 150 | end 151 | end 152 | end 153 | 154 | clones[t] = clone 155 | collectgarbage() 156 | end 157 | 158 | mem:close() 159 | return clones 160 | end 161 | 162 | return model_utils 163 | -------------------------------------------------------------------------------- /sample.lua: -------------------------------------------------------------------------------- 1 | 2 | --[[ 3 | 4 | This file samples characters from a trained model 5 | 6 | Code is based on implementation in 7 | https://github.com/oxford-cs-ml-2015/practical6 8 | 9 | ]]-- 10 | 11 | require 'torch' 12 | require 'nn' 13 | require 'nngraph' 14 | require 'optim' 15 | require 'lfs' 16 | 17 | require 'util.OneHot' 18 | require 'util.misc' 19 | 20 | cmd = torch.CmdLine() 21 | cmd:text() 22 | cmd:text('Sample from a character-level language model') 23 | cmd:text() 24 | cmd:text('Options') 25 | -- required: 26 | cmd:argument('-model','model checkpoint to use for sampling') 27 | -- optional parameters 28 | cmd:option('-seed',123,'random number generator\'s seed') 29 | cmd:option('-sample',1,' 0 to use max at each timestep, 1 to sample at each timestep') 30 | cmd:option('-primetext',"",'used as a prompt to "seed" the state of the LSTM using a given sequence, before we sample.') 31 | cmd:option('-length',2000,'number of characters to sample') 32 | cmd:option('-temperature',1,'temperature of sampling') 33 | cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU') 34 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 35 | cmd:option('-verbose',1,'set to 0 to ONLY print the sampled text, no diagnostics') 36 | cmd:text() 37 | 38 | -- parse input params 39 | opt = cmd:parse(arg) 40 | 41 | -- gated print: simple utility function wrapping a print 42 | function gprint(str) 43 | if opt.verbose == 1 then print(str) end 44 | end 45 | 46 | -- check that cunn/cutorch are installed if user wants to use the GPU 47 | if opt.gpuid >= 0 and opt.opencl == 0 then 48 | local ok, cunn = pcall(require, 'cunn') 49 | local ok2, cutorch = pcall(require, 'cutorch') 50 | if not ok then gprint('package cunn not found!') end 51 | if not ok2 then gprint('package cutorch not found!') end 52 | if ok and ok2 then 53 | gprint('using CUDA on GPU ' .. opt.gpuid .. '...') 54 | gprint('Make sure that your saved checkpoint was also trained with GPU. If it was trained with CPU use -gpuid -1 for sampling as well') 55 | cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! 
sigh lua 56 | cutorch.manualSeed(opt.seed) 57 | else 58 | gprint('Falling back on CPU mode') 59 | opt.gpuid = -1 -- overwrite user setting 60 | end 61 | end 62 | 63 | -- check that clnn/cltorch are installed if user wants to use OpenCL 64 | if opt.gpuid >= 0 and opt.opencl == 1 then 65 | local ok, cunn = pcall(require, 'clnn') 66 | local ok2, cutorch = pcall(require, 'cltorch') 67 | if not ok then print('package clnn not found!') end 68 | if not ok2 then print('package cltorch not found!') end 69 | if ok and ok2 then 70 | gprint('using OpenCL on GPU ' .. opt.gpuid .. '...') 71 | gprint('Make sure that your saved checkpoint was also trained with GPU. If it was trained with CPU use -gpuid -1 for sampling as well') 72 | cltorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 73 | torch.manualSeed(opt.seed) 74 | else 75 | gprint('Falling back on CPU mode') 76 | opt.gpuid = -1 -- overwrite user setting 77 | end 78 | end 79 | 80 | torch.manualSeed(opt.seed) 81 | 82 | -- load the model checkpoint 83 | if not lfs.attributes(opt.model, 'mode') then 84 | gprint('Error: File ' .. opt.model .. ' does not exist. Are you sure you didn\'t forget to prepend cv/ ?') 85 | end 86 | checkpoint = torch.load(opt.model) 87 | protos = checkpoint.protos 88 | protos.rnn:evaluate() -- put in eval mode so that dropout works properly 89 | 90 | -- initialize the vocabulary (and its inverted version) 91 | local vocab = checkpoint.vocab 92 | local ivocab = {} 93 | for c,i in pairs(vocab) do ivocab[i] = c end 94 | 95 | -- initialize the rnn state to all zeros 96 | gprint('creating an ' .. checkpoint.opt.model .. '...') 97 | local current_state 98 | current_state = {} 99 | for L = 1,checkpoint.opt.num_layers do 100 | -- c and h for all layers 101 | local h_init = torch.zeros(1, checkpoint.opt.rnn_size):double() 102 | if opt.gpuid >= 0 and opt.opencl == 0 then h_init = h_init:cuda() end 103 | if opt.gpuid >= 0 and opt.opencl == 1 then h_init = h_init:cl() end 104 | table.insert(current_state, h_init:clone()) 105 | if checkpoint.opt.model == 'lstm' then 106 | table.insert(current_state, h_init:clone()) 107 | end 108 | end 109 | state_size = #current_state 110 | 111 | -- do a few seeded timesteps 112 | local seed_text = opt.primetext 113 | if string.len(seed_text) > 0 then 114 | gprint('seeding with ' .. seed_text) 115 | gprint('--------------------------') 116 | for c in seed_text:gmatch'.' do 117 | prev_char = torch.Tensor{vocab[c]} 118 | io.write(ivocab[prev_char[1]]) 119 | if opt.gpuid >= 0 and opt.opencl == 0 then prev_char = prev_char:cuda() end 120 | if opt.gpuid >= 0 and opt.opencl == 1 then prev_char = prev_char:cl() end 121 | local lst = protos.rnn:forward{prev_char, unpack(current_state)} 122 | -- lst is a list of [state1,state2,..stateN,output]. We want everything but last piece 123 | current_state = {} 124 | for i=1,state_size do table.insert(current_state, lst[i]) end 125 | prediction = lst[#lst] -- last element holds the log probabilities 126 | end 127 | else 128 | -- fill with uniform probabilities over characters (? 
hmm) 129 | gprint('missing seed text, using uniform probability over first character') 130 | gprint('--------------------------') 131 | prediction = torch.Tensor(1, #ivocab):fill(1)/(#ivocab) 132 | if opt.gpuid >= 0 and opt.opencl == 0 then prediction = prediction:cuda() end 133 | if opt.gpuid >= 0 and opt.opencl == 1 then prediction = prediction:cl() end 134 | end 135 | 136 | -- start sampling/argmaxing 137 | for i=1, opt.length do 138 | 139 | -- log probabilities from the previous timestep 140 | if opt.sample == 0 then 141 | -- use argmax 142 | local _, prev_char_ = prediction:max(2) 143 | prev_char = prev_char_:resize(1) 144 | else 145 | -- use sampling 146 | prediction:div(opt.temperature) -- scale by temperature 147 | local probs = torch.exp(prediction):squeeze() 148 | probs:div(torch.sum(probs)) -- renormalize so probs sum to one 149 | prev_char = torch.multinomial(probs:float(), 1):resize(1):float() 150 | end 151 | 152 | -- forward the rnn for next character 153 | local lst = protos.rnn:forward{prev_char, unpack(current_state)} 154 | current_state = {} 155 | for i=1,state_size do table.insert(current_state, lst[i]) end 156 | prediction = lst[#lst] -- last element holds the log probabilities 157 | 158 | io.write(ivocab[prev_char[1]]) 159 | end 160 | io.write('\n') io.flush() 161 | 162 | -------------------------------------------------------------------------------- /util/CharSplitLMMinibatchLoader.lua: -------------------------------------------------------------------------------- 1 | 2 | -- Modified from https://github.com/oxford-cs-ml-2015/practical6 3 | -- the modification included support for train/val/test splits 4 | 5 | local CharSplitLMMinibatchLoader = {} 6 | CharSplitLMMinibatchLoader.__index = CharSplitLMMinibatchLoader 7 | 8 | function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, split_fractions) 9 | -- split_fractions is e.g. {0.9, 0.05, 0.05} 10 | 11 | local self = {} 12 | setmetatable(self, CharSplitLMMinibatchLoader) 13 | 14 | local input_file = path.join(data_dir, 'input.txt') 15 | local vocab_file = path.join(data_dir, 'vocab.t7') 16 | local tensor_file = path.join(data_dir, 'data.t7') 17 | 18 | -- fetch file attributes to determine if we need to rerun preprocessing 19 | local run_prepro = false 20 | if not (path.exists(vocab_file) or path.exists(tensor_file)) then 21 | -- prepro files do not exist, generate them 22 | print('vocab.t7 and data.t7 do not exist. Running preprocessing...') 23 | run_prepro = true 24 | else 25 | -- check if the input file was modified since last time we 26 | -- ran the prepro. if so, we have to rerun the preprocessing 27 | local input_attr = lfs.attributes(input_file) 28 | local vocab_attr = lfs.attributes(vocab_file) 29 | local tensor_attr = lfs.attributes(tensor_file) 30 | if input_attr.modification > vocab_attr.modification or input_attr.modification > tensor_attr.modification then 31 | print('vocab.t7 or data.t7 detected as stale. Re-running preprocessing...') 32 | run_prepro = true 33 | end 34 | end 35 | if run_prepro then 36 | -- construct a tensor with all the data, and vocab file 37 | print('one-time setup: preprocessing input text file ' .. input_file .. 
'...') 38 | CharSplitLMMinibatchLoader.text_to_tensor(input_file, vocab_file, tensor_file) 39 | end 40 | 41 | print('loading data files...') 42 | local data = torch.load(tensor_file) 43 | self.vocab_mapping = torch.load(vocab_file) 44 | 45 | -- cut off the end so that it divides evenly 46 | local len = data:size(1) 47 | if len % (batch_size * seq_length) ~= 0 then 48 | print('cutting off end of data so that the batches/sequences divide evenly') 49 | data = data:sub(1, batch_size * seq_length 50 | * math.floor(len / (batch_size * seq_length))) 51 | end 52 | 53 | -- count vocab 54 | self.vocab_size = 0 55 | for _ in pairs(self.vocab_mapping) do 56 | self.vocab_size = self.vocab_size + 1 57 | end 58 | 59 | -- self.batches is a table of tensors 60 | print('reshaping tensor...') 61 | self.batch_size = batch_size 62 | self.seq_length = seq_length 63 | 64 | local ydata = data:clone() 65 | ydata:sub(1,-2):copy(data:sub(2,-1)) 66 | ydata[-1] = data[1] 67 | self.x_batches = data:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches 68 | self.nbatches = #self.x_batches 69 | self.y_batches = ydata:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches 70 | assert(#self.x_batches == #self.y_batches) 71 | 72 | -- lets try to be helpful here 73 | if self.nbatches < 50 then 74 | print('WARNING: less than 50 batches in the data in total? Looks like very small dataset. You probably want to use smaller batch_size and/or seq_length.') 75 | end 76 | 77 | -- perform safety checks on split_fractions 78 | assert(split_fractions[1] >= 0 and split_fractions[1] <= 1, 'bad split fraction ' .. split_fractions[1] .. ' for train, not between 0 and 1') 79 | assert(split_fractions[2] >= 0 and split_fractions[2] <= 1, 'bad split fraction ' .. split_fractions[2] .. ' for val, not between 0 and 1') 80 | assert(split_fractions[3] >= 0 and split_fractions[3] <= 1, 'bad split fraction ' .. split_fractions[3] .. ' for test, not between 0 and 1') 81 | if split_fractions[3] == 0 then 82 | -- catch a common special case where the user might not want a test set 83 | self.ntrain = math.floor(self.nbatches * split_fractions[1]) 84 | self.nval = self.nbatches - self.ntrain 85 | self.ntest = 0 86 | else 87 | -- divide data to train/val and allocate rest to test 88 | self.ntrain = math.floor(self.nbatches * split_fractions[1]) 89 | self.nval = math.floor(self.nbatches * split_fractions[2]) 90 | self.ntest = self.nbatches - self.nval - self.ntrain -- the rest goes to test (to ensure this adds up exactly) 91 | end 92 | 93 | self.split_sizes = {self.ntrain, self.nval, self.ntest} 94 | self.batch_ix = {0,0,0} 95 | 96 | print(string.format('data load done. Number of data batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest)) 97 | collectgarbage() 98 | return self 99 | end 100 | 101 | function CharSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index) 102 | batch_index = batch_index or 0 103 | self.batch_ix[split_index] = batch_index 104 | end 105 | 106 | function CharSplitLMMinibatchLoader:next_batch(split_index) 107 | if self.split_sizes[split_index] == 0 then 108 | -- perform a check here to make sure the user isn't screwing something up 109 | local split_names = {'train', 'val', 'test'} 110 | print('ERROR. Code requested a batch for split ' .. split_names[split_index] .. 
', but this split has no data.') 111 | os.exit() -- crash violently 112 | end 113 | -- split_index is integer: 1 = train, 2 = val, 3 = test 114 | self.batch_ix[split_index] = self.batch_ix[split_index] + 1 115 | if self.batch_ix[split_index] > self.split_sizes[split_index] then 116 | self.batch_ix[split_index] = 1 -- cycle around to beginning 117 | end 118 | -- pull out the correct next batch 119 | local ix = self.batch_ix[split_index] 120 | if split_index == 2 then ix = ix + self.ntrain end -- offset by train set size 121 | if split_index == 3 then ix = ix + self.ntrain + self.nval end -- offset by train + val 122 | return self.x_batches[ix], self.y_batches[ix] 123 | end 124 | 125 | -- *** STATIC method *** 126 | function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, out_tensorfile) 127 | local timer = torch.Timer() 128 | 129 | print('loading text file...') 130 | local cache_len = 10000 131 | local rawdata 132 | local tot_len = 0 133 | local f = assert(io.open(in_textfile, "r")) 134 | 135 | -- create vocabulary if it doesn't exist yet 136 | print('creating vocabulary mapping...') 137 | -- record all characters to a set 138 | local unordered = {} 139 | rawdata = f:read(cache_len) 140 | repeat 141 | for char in rawdata:gmatch'.' do 142 | if not unordered[char] then unordered[char] = true end 143 | end 144 | tot_len = tot_len + #rawdata 145 | rawdata = f:read(cache_len) 146 | until not rawdata 147 | f:close() 148 | -- sort into a table (i.e. keys become 1..N) 149 | local ordered = {} 150 | for char in pairs(unordered) do ordered[#ordered + 1] = char end 151 | table.sort(ordered) 152 | -- invert `ordered` to create the char->int mapping 153 | local vocab_mapping = {} 154 | for i, char in ipairs(ordered) do 155 | vocab_mapping[char] = i 156 | end 157 | -- construct a tensor with all the data 158 | print('putting data into tensor...') 159 | local data = torch.ByteTensor(tot_len) -- store it into 1D first, then rearrange 160 | f = assert(io.open(in_textfile, "r")) 161 | local currlen = 0 162 | rawdata = f:read(cache_len) 163 | repeat 164 | for i=1, #rawdata do 165 | data[currlen+i] = vocab_mapping[rawdata:sub(i, i)] -- lua has no string indexing using [] 166 | end 167 | currlen = currlen + #rawdata 168 | rawdata = f:read(cache_len) 169 | until not rawdata 170 | f:close() 171 | 172 | -- save output preprocessed files 173 | print('saving ' .. out_vocabfile) 174 | torch.save(out_vocabfile, vocab_mapping) 175 | print('saving ' .. out_tensorfile) 176 | torch.save(out_tensorfile, data) 177 | end 178 | 179 | return CharSplitLMMinibatchLoader 180 | 181 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # char-rnn 3 | 4 | This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. In other words the model takes one text file as input and trains a Recurrent Neural Network that learns to predict the next character in a sequence. The RNN can then be used to generate text character by character that will look like the original training data. The context of this code base is described in detail in my [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). 
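
To make "predict the next character" concrete: the training pairs are simply each character paired with the character that follows it. Below is a minimal illustrative sketch of those pairs (the real batching code in `util/CharSplitLMMinibatchLoader.lua` does the same thing on a whole tensor by shifting the data by one position):

```lua
-- Illustrative only: inputs are characters, targets are the next character.
local text = "hello"
for i = 1, #text - 1 do
  local x, y = text:sub(i, i), text:sub(i + 1, i + 1)
  print(string.format("input %q -> target %q", x, y))
end
-- input "h" -> target "e"
-- input "e" -> target "l"
-- input "l" -> target "l"
-- input "l" -> target "o"
```
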
5 | 6 | If you are new to Torch/Lua/Neural Nets, it might be helpful to know that this code is really just a slightly more fancy version of this [100-line gist](https://gist.github.com/karpathy/d4dee566867f8291f086) that I wrote in Python/numpy. The code in this repo additionally: allows for multiple layers, uses an LSTM instead of a vanilla RNN, has more supporting code for model checkpointing, and is of course much more efficient since it uses mini-batches and can run on a GPU. 7 | 8 | ## Update: torch-rnn 9 | 10 | [Justin Johnson](http://cs.stanford.edu/people/jcjohns/) (@jcjohnson) recently re-implemented char-rnn from scratch with a much nicer/smaller/cleaner/faster Torch code base. It's under the name [torch-rnn](https://github.com/jcjohnson/torch-rnn). It uses Adam for optimization and hard-codes the RNN/LSTM forward/backward passes for space/time efficiency. This also avoids headaches with cloning models in this repo. In other words, torch-rnn should be the default char-rnn implemention to use now instead of the one in this code base. 11 | 12 | ## Requirements 13 | 14 | This code is written in Lua and requires [Torch](http://torch.ch/). If you're on Ubuntu, installing Torch in your home directory may look something like: 15 | 16 | ```bash 17 | $ curl -s https://raw.githubusercontent.com/torch/ezinstall/master/install-deps | bash 18 | $ git clone https://github.com/torch/distro.git ~/torch --recursive 19 | $ cd ~/torch; 20 | $ ./install.sh # and enter "yes" at the end to modify your bashrc 21 | $ source ~/.bashrc 22 | ``` 23 | 24 | See the Torch installation documentation for more details. After Torch is installed we need to get a few more packages using [LuaRocks](https://luarocks.org/) (which already came with the Torch install). In particular: 25 | 26 | ```bash 27 | $ luarocks install nngraph 28 | $ luarocks install optim 29 | $ luarocks install nn 30 | ``` 31 | 32 | If you'd like to train on an NVIDIA GPU using CUDA (this can be to about 15x faster), you'll of course need the GPU, and you will have to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). Then get the `cutorch` and `cunn` packages: 33 | 34 | ```bash 35 | $ luarocks install cutorch 36 | $ luarocks install cunn 37 | ``` 38 | 39 | If you'd like to use OpenCL GPU instead (e.g. ATI cards), you will instead need to install the `cltorch` and `clnn` packages, and then use the option `-opencl 1` during training ([cltorch issues](https://github.com/hughperkins/cltorch/issues)): 40 | 41 | ```bash 42 | $ luarocks install cltorch 43 | $ luarocks install clnn 44 | ``` 45 | 46 | ## Usage 47 | 48 | ### Data 49 | 50 | All input data is stored inside the `data/` directory. You'll notice that there is an example dataset included in the repo (in folder `data/tinyshakespeare`) which consists of a subset of works of Shakespeare. I'm providing a few more datasets on [this page](http://cs.stanford.edu/people/karpathy/char-rnn/). 51 | 52 | **Your own data**: If you'd like to use your own data then create a single file `input.txt` and place it into a folder in the `data/` directory. For example, `data/some_folder/input.txt`. The first time you run the training script it will do some preprocessing and write two more convenience cache files into `data/some_folder`. 53 | 54 | **Dataset sizes**: Note that if your data is too small (1MB is already considered very small) the RNN won't learn very effectively. Remember that it has to learn everything completely from scratch. 
Conversely if your data is large (more than about 2MB), feel confident to increase `rnn_size` and train a bigger model (see details of training below). It will work *significantly better*. For example with 6MB you can easily go up to `rnn_size` 300 or even more. The biggest that fits on my GPU and that I've trained with this code is `rnn_size` 700 with `num_layers` 3 (2 is default). 55 | 56 | ### Training 57 | 58 | Start training the model using `train.lua`. As a sanity check, to run on the included example dataset simply try: 59 | 60 | ``` 61 | $ th train.lua -gpuid -1 62 | ``` 63 | 64 | Notice that here we are setting the flag `gpuid` to -1, which tells the code to train using CPU, otherwise it defaults to GPU 0. There are many other flags for various options. Consult `$ th train.lua -help` for comprehensive settings. Here's another example that trains a bigger network and also shows how you can run on your own custom dataset (this already assumes that `data/some_folder/input.txt` exists): 65 | 66 | ``` 67 | $ th train.lua -data_dir data/some_folder -rnn_size 512 -num_layers 2 -dropout 0.5 68 | ``` 69 | 70 | **Checkpoints.** While the model is training it will periodically write checkpoint files to the `cv` folder. The frequency with which these checkpoints are written is controlled with number of iterations, as specified with the `eval_val_every` option (e.g. if this is 1 then a checkpoint is written every iteration). The filename of these checkpoints contains a very important number: the **loss**. For example, a checkpoint with filename `lm_lstm_epoch0.95_2.0681.t7` indicates that at this point the model was on epoch 0.95 (i.e. it has almost done one full pass over the training data), and the loss on validation data was 2.0681. This number is very important because the lower it is, the better the checkpoint works. Once you start to generate data (discussed below), you will want to use the model checkpoint that reports the lowest validation loss. Notice that this might not necessarily be the last checkpoint at the end of training (due to possible overfitting). 71 | 72 | Another important quantities to be aware of are `batch_size` (call it B), `seq_length` (call it S), and the `train_frac` and `val_frac` settings. The batch size specifies how many streams of data are processed in parallel at one time. The sequence length specifies the length of each stream, which is also the limit at which the gradients can propagate backwards in time. For example, if `seq_length` is 20, then the gradient signal will never backpropagate more than 20 time steps, and the model might not *find* dependencies longer than this length in number of characters. Thus, if you have a very difficult dataset where there are a lot of long-term dependencies you will want to increase this setting. Now, if at runtime your input text file has N characters, these first all get split into chunks of size `BxS`. These chunks then get allocated across three splits: train/val/test according to the `frac` settings. By default `train_frac` is 0.95 and `val_frac` is 0.05, which means that 95% of our data chunks will be trained on and 5% of the chunks will be used to estimate the validation loss (and hence the generalization). If your data is small, it's possible that with the default settings you'll only have very few chunks in total (for example 100). This is bad: In these cases you may want to decrease batch size or sequence length. 
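
To make the chunk bookkeeping concrete, here is a small sketch of the arithmetic the loader performs with these settings (the numbers are purely illustrative; the actual implementation lives in `util/CharSplitLMMinibatchLoader.lua`):

```lua
-- Illustrative numbers only; the real logic is in util/CharSplitLMMinibatchLoader.lua.
local N = 1000000             -- characters in input.txt (a ~1MB file)
local B, S = 50, 50           -- batch_size and seq_length (the train.lua defaults)
local train_frac = 0.95       -- val_frac = 0.05, test_frac = 0

-- any tail that does not fill a complete B x S chunk is cut off
local nbatches = math.floor(N / (B * S))            -- 400 chunks
local ntrain   = math.floor(nbatches * train_frac)  -- 380 chunks for training
local nval     = nbatches - ntrain                  -- 20 chunks for validation
print(nbatches, ntrain, nval)                       --> 400  380  20
```

A 100KB file with the same settings would yield only 40 chunks, which is why the loader warns below 50 chunks in total and why smaller `batch_size`/`seq_length` values help on small datasets.
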
73 | 74 | Note that you can also initialize parameters from a previously saved checkpoint using `init_from`. 75 | 76 | ### Sampling 77 | 78 | Given a checkpoint file (such as those written to `cv`) we can generate new text. For example: 79 | 80 | ``` 81 | $ th sample.lua cv/some_checkpoint.t7 -gpuid -1 82 | ``` 83 | 84 | Make sure that if your checkpoint was trained with GPU it is also sampled from with GPU, or vice versa. Otherwise the code will (currently) complain. As with the train script, see `$ th sample.lua -help` for full options. One important one is (for example) `-length 10000` which would generate 10,000 characters (default = 2000). 85 | 86 | **Temperature**. An important parameter you may want to play with is `-temperature`, which takes a number in range \(0, 1\] (0 not included), default = 1. The temperature is dividing the predicted log probabilities before the Softmax, so lower temperature will cause the model to make more likely, but also more boring and conservative predictions. Higher temperatures cause the model to take more chances and increase diversity of results, but at a cost of more mistakes. 87 | 88 | **Priming**. It's also possible to prime the model with some starting text using `-primetext`. This starts out the RNN with some hardcoded characters to *warm* it up with some context before it starts generating text. E.g. a fun primetext might be `-primetext "the meaning of life is "`. 89 | 90 | **Training with GPU but sampling on CPU**. Right now the solution is to use the `convert_gpu_cpu_checkpoint.lua` script to convert your GPU checkpoint to a CPU checkpoint. In near future you will not have to do this explicitly. E.g.: 91 | 92 | ``` 93 | $ th convert_gpu_cpu_checkpoint.lua cv/lm_lstm_epoch30.00_1.3950.t7 94 | ``` 95 | 96 | will create a new file `cv/lm_lstm_epoch30.00_1.3950.t7_cpu.t7` that you can use with the sample script and with `-gpuid -1` for CPU mode. 97 | 98 | Happy sampling! 99 | 100 | ## Tips and Tricks 101 | 102 | ### Monitoring Validation Loss vs. Training Loss 103 | If you're somewhat new to Machine Learning or Neural Networks it can take a bit of expertise to get good models. The most important quantity to keep track of is the difference between your training loss (printed during training) and the validation loss (printed once in a while when the RNN is run on the validation data (by default every 1000 iterations)). In particular: 104 | 105 | - If your training loss is much lower than validation loss then this means the network might be **overfitting**. Solutions to this are to decrease your network size, or to increase dropout. For example you could try dropout of 0.5 and so on. 106 | - If your training/validation loss are about equal then your model is **underfitting**. Increase the size of your model (either number of layers or the raw number of neurons per layer) 107 | 108 | ### Approximate number of parameters 109 | 110 | The two most important parameters that control the model are `rnn_size` and `num_layers`. I would advise that you always use `num_layers` of either 2/3. The `rnn_size` can be adjusted based on how much data you have. The two important quantities to keep track of here are: 111 | 112 | - The number of parameters in your model. This is printed when you start training. 113 | - The size of your dataset. 1MB file is approximately 1 million characters. 114 | 115 | These two should be about the same order of magnitude. It's a little tricky to tell. 
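
A back-of-the-envelope way to get the first number without launching a run is to add up the `nn.Linear` weights and biases that `model/LSTM.lua` creates. The helper below is a hypothetical sketch that follows that construction (it is not part of this repo):

```lua
-- Hypothetical helper (not part of this repo): approximate the parameter
-- count of the LSTM that model/LSTM.lua constructs. Per layer it creates
-- i2h = Linear(insize, 4*rnn_size) and h2h = Linear(rnn_size, 4*rnn_size),
-- plus a final decoder Linear(rnn_size, vocab_size).
local function lstm_param_count(vocab_size, rnn_size, num_layers)
  local total = 0
  for L = 1, num_layers do
    local insize = (L == 1) and vocab_size or rnn_size
    total = total + 4 * rnn_size * (insize + rnn_size) -- i2h + h2h weights
    total = total + 2 * (4 * rnn_size)                 -- i2h + h2h biases
  end
  return total + vocab_size * rnn_size + vocab_size    -- decoder weights + bias
end

-- the exact figure depends on your dataset's vocabulary size, e.g.:
print(lstm_param_count(65, 128, 2))
```
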
Here are some examples: 116 | 117 | - I have a 100MB dataset and I'm using the default parameter settings (which currently print 150K parameters). My data size is significantly larger (100 mil >> 0.15 mil), so I expect to heavily underfit. I am thinking I can comfortably afford to make `rnn_size` larger. 118 | - I have a 10MB dataset and running a 10 million parameter model. I'm slightly nervous and I'm carefully monitoring my validation loss. If it's larger than my training loss then I may want to try to increase dropout a bit and see if that heps the validation loss. 119 | 120 | ### Best models strategy 121 | 122 | The winning strategy to obtaining very good models (if you have the compute time) is to always err on making the network larger (as large as you're willing to wait for it to compute) and then try different dropout values (between 0,1). Whatever model has the best validation performance (the loss, written in the checkpoint filename, low is good) is the one you should use in the end. 123 | 124 | It is very common in deep learning to run many different models with many different hyperparameter settings, and in the end take whatever checkpoint gave the best validation performance. 125 | 126 | By the way, the size of your training and validation splits are also parameters. Make sure you have a decent amount of data in your validation set or otherwise the validation performance will be noisy and not very informative. 127 | 128 | ## Additional Pointers and Acknowledgements 129 | 130 | This code was originally based on Oxford University Machine Learning class [practical 6](https://github.com/oxford-cs-ml-2015/practical6), which is in turn based on [learning to execute](https://github.com/wojciechz/learning_to_execute) code from Wojciech Zaremba. Chunks of it were also developed in collaboration with my labmate [Justin Johnson](http://cs.stanford.edu/people/jcjohns/). 131 | 132 | To learn more about RNN language models I recommend looking at: 133 | 134 | - [My recent talk](https://skillsmatter.com/skillscasts/6611-visualizing-and-understanding-recurrent-networks) on char-rnn 135 | - [Generating Sequences With Recurrent Neural Networks](http://arxiv.org/abs/1308.0850) by Alex Graves 136 | - [Generating Text with Recurrent Neural Networks](http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf) by Ilya Sutskever 137 | - [Tomas Mikolov's Thesis](http://www.fit.vutbr.cz/~imikolov/rnnlm/thesis.pdf) 138 | 139 | ## License 140 | 141 | MIT 142 | -------------------------------------------------------------------------------- /train.lua: -------------------------------------------------------------------------------- 1 | 2 | --[[ 3 | 4 | This file trains a character-level multi-layer RNN on text data 5 | 6 | Code is based on implementation in 7 | https://github.com/oxford-cs-ml-2015/practical6 8 | but modified to have multi-layer support, GPU support, as well as 9 | many other common model/optimization bells and whistles. 10 | The practical6 code is in turn based on 11 | https://github.com/wojciechz/learning_to_execute 12 | which is turn based on other stuff in Torch, etc... 
(long lineage) 13 | 14 | ]]-- 15 | 16 | require 'torch' 17 | require 'nn' 18 | require 'nngraph' 19 | require 'optim' 20 | require 'lfs' 21 | 22 | require 'util.OneHot' 23 | require 'util.misc' 24 | local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader' 25 | local model_utils = require 'util.model_utils' 26 | local LSTM = require 'model.LSTM' 27 | local GRU = require 'model.GRU' 28 | local RNN = require 'model.RNN' 29 | 30 | cmd = torch.CmdLine() 31 | cmd:text() 32 | cmd:text('Train a character-level language model') 33 | cmd:text() 34 | cmd:text('Options') 35 | -- data 36 | cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain the file input.txt with input data') 37 | -- model params 38 | cmd:option('-rnn_size', 128, 'size of LSTM internal state') 39 | cmd:option('-num_layers', 2, 'number of layers in the LSTM') 40 | cmd:option('-model', 'lstm', 'lstm,gru or rnn') 41 | -- optimization 42 | cmd:option('-learning_rate',2e-3,'learning rate') 43 | cmd:option('-learning_rate_decay',0.97,'learning rate decay') 44 | cmd:option('-learning_rate_decay_after',10,'in number of epochs, when to start decaying the learning rate') 45 | cmd:option('-decay_rate',0.95,'decay rate for rmsprop') 46 | cmd:option('-dropout',0,'dropout for regularization, used after each RNN hidden layer. 0 = no dropout') 47 | cmd:option('-seq_length',50,'number of timesteps to unroll for') 48 | cmd:option('-batch_size',50,'number of sequences to train on in parallel') 49 | cmd:option('-max_epochs',50,'number of full passes through the training data') 50 | cmd:option('-grad_clip',5,'clip gradients at this value') 51 | cmd:option('-train_frac',0.95,'fraction of data that goes into train set') 52 | cmd:option('-val_frac',0.05,'fraction of data that goes into validation set') 53 | -- test_frac will be computed as (1 - train_frac - val_frac) 54 | cmd:option('-init_from', '', 'initialize network parameters from checkpoint at this path') 55 | -- bookkeeping 56 | cmd:option('-seed',123,'torch manual random number generator seed') 57 | cmd:option('-print_every',1,'how many steps/minibatches between printing out the loss') 58 | cmd:option('-eval_val_every',1000,'every how many iterations should we evaluate on validation data?') 59 | cmd:option('-checkpoint_dir', 'cv', 'output directory where checkpoints get written') 60 | cmd:option('-savefile','lstm','filename to autosave the checkpont to. Will be inside checkpoint_dir/') 61 | cmd:option('-accurate_gpu_timing',0,'set this flag to 1 to get precise timings when using GPU. Might make code bit slower but reports accurate timings.') 62 | -- GPU/CPU 63 | cmd:option('-gpuid',0,'which gpu to use. -1 = use CPU') 64 | cmd:option('-opencl',0,'use OpenCL (instead of CUDA)') 65 | cmd:text() 66 | 67 | -- parse input params 68 | opt = cmd:parse(arg) 69 | torch.manualSeed(opt.seed) 70 | -- train / val / test split for data, in fractions 71 | local test_frac = math.max(0, 1 - (opt.train_frac + opt.val_frac)) 72 | local split_sizes = {opt.train_frac, opt.val_frac, test_frac} 73 | 74 | -- initialize cunn/cutorch for training on the GPU and fall back to CPU gracefully 75 | if opt.gpuid >= 0 and opt.opencl == 0 then 76 | local ok, cunn = pcall(require, 'cunn') 77 | local ok2, cutorch = pcall(require, 'cutorch') 78 | if not ok then print('package cunn not found!') end 79 | if not ok2 then print('package cutorch not found!') end 80 | if ok and ok2 then 81 | print('using CUDA on GPU ' .. opt.gpuid .. 
'...') 82 | cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 83 | cutorch.manualSeed(opt.seed) 84 | else 85 | print('If cutorch and cunn are installed, your CUDA toolkit may be improperly configured.') 86 | print('Check your CUDA toolkit installation, rebuild cutorch and cunn, and try again.') 87 | print('Falling back on CPU mode') 88 | opt.gpuid = -1 -- overwrite user setting 89 | end 90 | end 91 | 92 | -- initialize clnn/cltorch for training on the GPU and fall back to CPU gracefully 93 | if opt.gpuid >= 0 and opt.opencl == 1 then 94 | local ok, cunn = pcall(require, 'clnn') 95 | local ok2, cutorch = pcall(require, 'cltorch') 96 | if not ok then print('package clnn not found!') end 97 | if not ok2 then print('package cltorch not found!') end 98 | if ok and ok2 then 99 | print('using OpenCL on GPU ' .. opt.gpuid .. '...') 100 | cltorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua 101 | torch.manualSeed(opt.seed) 102 | else 103 | print('If cltorch and clnn are installed, your OpenCL driver may be improperly configured.') 104 | print('Check your OpenCL driver installation, check output of clinfo command, and try again.') 105 | print('Falling back on CPU mode') 106 | opt.gpuid = -1 -- overwrite user setting 107 | end 108 | end 109 | 110 | -- create the data loader class 111 | local loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes) 112 | local vocab_size = loader.vocab_size -- the number of distinct characters 113 | local vocab = loader.vocab_mapping 114 | print('vocab size: ' .. vocab_size) 115 | -- make sure output directory exists 116 | if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end 117 | 118 | -- define the model: prototypes for one timestep, then clone them in time 119 | local do_random_init = true 120 | if string.len(opt.init_from) > 0 then 121 | print('loading a model from checkpoint ' .. opt.init_from) 122 | local checkpoint = torch.load(opt.init_from) 123 | protos = checkpoint.protos 124 | -- make sure the vocabs are the same 125 | local vocab_compatible = true 126 | local checkpoint_vocab_size = 0 127 | for c,i in pairs(checkpoint.vocab) do 128 | if not (vocab[c] == i) then 129 | vocab_compatible = false 130 | end 131 | checkpoint_vocab_size = checkpoint_vocab_size + 1 132 | end 133 | if not (checkpoint_vocab_size == vocab_size) then 134 | vocab_compatible = false 135 | print('checkpoint_vocab_size: ' .. checkpoint_vocab_size) 136 | end 137 | assert(vocab_compatible, 'error, the character vocabulary for this dataset and the one in the saved checkpoint are not the same. This is trouble.') 138 | -- overwrite model settings based on checkpoint to ensure compatibility 139 | print('overwriting rnn_size=' .. checkpoint.opt.rnn_size .. ', num_layers=' .. checkpoint.opt.num_layers .. ', model=' .. checkpoint.opt.model .. ' based on the checkpoint.') 140 | opt.rnn_size = checkpoint.opt.rnn_size 141 | opt.num_layers = checkpoint.opt.num_layers 142 | opt.model = checkpoint.opt.model 143 | do_random_init = false 144 | else 145 | print('creating an ' .. opt.model .. ' with ' .. opt.num_layers .. 
' layers') 146 | protos = {} 147 | if opt.model == 'lstm' then 148 | protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) 149 | elseif opt.model == 'gru' then 150 | protos.rnn = GRU.gru(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) 151 | elseif opt.model == 'rnn' then 152 | protos.rnn = RNN.rnn(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) 153 | end 154 | protos.criterion = nn.ClassNLLCriterion() 155 | end 156 | 157 | -- the initial state of the cell/hidden states 158 | init_state = {} 159 | for L=1,opt.num_layers do 160 | local h_init = torch.zeros(opt.batch_size, opt.rnn_size) 161 | if opt.gpuid >=0 and opt.opencl == 0 then h_init = h_init:cuda() end 162 | if opt.gpuid >=0 and opt.opencl == 1 then h_init = h_init:cl() end 163 | table.insert(init_state, h_init:clone()) 164 | if opt.model == 'lstm' then 165 | table.insert(init_state, h_init:clone()) 166 | end 167 | end 168 | 169 | -- ship the model to the GPU if desired 170 | if opt.gpuid >= 0 and opt.opencl == 0 then 171 | for k,v in pairs(protos) do v:cuda() end 172 | end 173 | if opt.gpuid >= 0 and opt.opencl == 1 then 174 | for k,v in pairs(protos) do v:cl() end 175 | end 176 | 177 | -- put the above things into one flattened parameters tensor 178 | params, grad_params = model_utils.combine_all_parameters(protos.rnn) 179 | 180 | -- initialization 181 | if do_random_init then 182 | params:uniform(-0.08, 0.08) -- small uniform numbers 183 | end 184 | -- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning 185 | if opt.model == 'lstm' then 186 | for layer_idx = 1, opt.num_layers do 187 | for _,node in ipairs(protos.rnn.forwardnodes) do 188 | if node.data.annotations.name == "i2h_" .. layer_idx then 189 | print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx) 190 | -- the gates are, in order, i,f,o,g, so f is the 2nd block of weights 191 | node.data.module.bias[{{opt.rnn_size+1, 2*opt.rnn_size}}]:fill(1.0) 192 | end 193 | end 194 | end 195 | end 196 | 197 | print('number of parameters in the model: ' .. params:nElement()) 198 | -- make a bunch of clones after flattening, as that reallocates memory 199 | clones = {} 200 | for name,proto in pairs(protos) do 201 | print('cloning ' .. name) 202 | clones[name] = model_utils.clone_many_times(proto, opt.seq_length, not proto.parameters) 203 | end 204 | 205 | -- preprocessing helper function 206 | function prepro(x,y) 207 | x = x:transpose(1,2):contiguous() -- swap the axes for faster indexing 208 | y = y:transpose(1,2):contiguous() 209 | if opt.gpuid >= 0 and opt.opencl == 0 then -- ship the input arrays to GPU 210 | -- have to convert to float because integers can't be cuda()'d 211 | x = x:float():cuda() 212 | y = y:float():cuda() 213 | end 214 | if opt.gpuid >= 0 and opt.opencl == 1 then -- ship the input arrays to GPU 215 | x = x:cl() 216 | y = y:cl() 217 | end 218 | return x,y 219 | end 220 | 221 | -- evaluate the loss over an entire split 222 | function eval_split(split_index, max_batches) 223 | print('evaluating loss over split index ' .. 
split_index) 224 | local n = loader.split_sizes[split_index] 225 | if max_batches ~= nil then n = math.min(max_batches, n) end 226 | 227 | loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front 228 | local loss = 0 229 | local rnn_state = {[0] = init_state} 230 | 231 | for i = 1,n do -- iterate over batches in the split 232 | -- fetch a batch 233 | local x, y = loader:next_batch(split_index) 234 | x,y = prepro(x,y) 235 | -- forward pass 236 | for t=1,opt.seq_length do 237 | clones.rnn[t]:evaluate() -- for dropout proper functioning 238 | local lst = clones.rnn[t]:forward{x[t], unpack(rnn_state[t-1])} 239 | rnn_state[t] = {} 240 | for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end 241 | prediction = lst[#lst] 242 | loss = loss + clones.criterion[t]:forward(prediction, y[t]) 243 | end 244 | -- carry over lstm state 245 | rnn_state[0] = rnn_state[#rnn_state] 246 | print(i .. '/' .. n .. '...') 247 | end 248 | 249 | loss = loss / opt.seq_length / n 250 | return loss 251 | end 252 | 253 | -- do fwd/bwd and return loss, grad_params 254 | local init_state_global = clone_list(init_state) 255 | function feval(x) 256 | if x ~= params then 257 | params:copy(x) 258 | end 259 | grad_params:zero() 260 | 261 | ------------------ get minibatch ------------------- 262 | local x, y = loader:next_batch(1) 263 | x,y = prepro(x,y) 264 | ------------------- forward pass ------------------- 265 | local rnn_state = {[0] = init_state_global} 266 | local predictions = {} -- softmax outputs 267 | local loss = 0 268 | for t=1,opt.seq_length do 269 | clones.rnn[t]:training() -- make sure we are in correct mode (this is cheap, sets flag) 270 | local lst = clones.rnn[t]:forward{x[t], unpack(rnn_state[t-1])} 271 | rnn_state[t] = {} 272 | for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end -- extract the state, without output 273 | predictions[t] = lst[#lst] -- last element is the prediction 274 | loss = loss + clones.criterion[t]:forward(predictions[t], y[t]) 275 | end 276 | loss = loss / opt.seq_length 277 | ------------------ backward pass ------------------- 278 | -- initialize gradient at time t to be zeros (there's no influence from future) 279 | local drnn_state = {[opt.seq_length] = clone_list(init_state, true)} -- true also zeros the clones 280 | for t=opt.seq_length,1,-1 do 281 | -- backprop through loss, and softmax/linear 282 | local doutput_t = clones.criterion[t]:backward(predictions[t], y[t]) 283 | table.insert(drnn_state[t], doutput_t) 284 | local dlst = clones.rnn[t]:backward({x[t], unpack(rnn_state[t-1])}, drnn_state[t]) 285 | drnn_state[t-1] = {} 286 | for k,v in pairs(dlst) do 287 | if k > 1 then -- k == 1 is gradient on x, which we dont need 288 | -- note we do k-1 because first item is dembeddings, and then follow the 289 | -- derivatives of the state, starting at index 2. I know... 290 | drnn_state[t-1][k-1] = v 291 | end 292 | end 293 | end 294 | ------------------------ misc ---------------------- 295 | -- transfer final state to initial state (BPTT) 296 | init_state_global = rnn_state[#rnn_state] -- NOTE: I don't think this needs to be a clone, right? 297 | -- grad_params:div(opt.seq_length) -- this line should be here but since we use rmsprop it would have no effect. 
Removing for efficiency 298 | -- clip gradient element-wise 299 | grad_params:clamp(-opt.grad_clip, opt.grad_clip) 300 | return loss, grad_params 301 | end 302 | 303 | -- start optimization here 304 | train_losses = {} 305 | val_losses = {} 306 | local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate} 307 | local iterations = opt.max_epochs * loader.ntrain 308 | local iterations_per_epoch = loader.ntrain 309 | local loss0 = nil 310 | for i = 1, iterations do 311 | local epoch = i / loader.ntrain 312 | 313 | local timer = torch.Timer() 314 | local _, loss = optim.rmsprop(feval, params, optim_state) 315 | if opt.accurate_gpu_timing == 1 and opt.gpuid >= 0 then 316 | --[[ 317 | Note on timing: The reported time can be off because the GPU is invoked async. If one 318 | wants to have exactly accurate timings one must call cutorch.synchronize() right here. 319 | I will avoid doing so by default because this can incur computational overhead. 320 | --]] 321 | cutorch.synchronize() 322 | end 323 | local time = timer:time().real 324 | 325 | local train_loss = loss[1] -- the loss is inside a list, pop it 326 | train_losses[i] = train_loss 327 | 328 | -- exponential learning rate decay 329 | if i % loader.ntrain == 0 and opt.learning_rate_decay < 1 then 330 | if epoch >= opt.learning_rate_decay_after then 331 | local decay_factor = opt.learning_rate_decay 332 | optim_state.learningRate = optim_state.learningRate * decay_factor -- decay it 333 | print('decayed learning rate by a factor ' .. decay_factor .. ' to ' .. optim_state.learningRate) 334 | end 335 | end 336 | 337 | -- every now and then or on last iteration 338 | if i % opt.eval_val_every == 0 or i == iterations then 339 | -- evaluate loss on validation data 340 | local val_loss = eval_split(2) -- 2 = validation 341 | val_losses[i] = val_loss 342 | 343 | local savefile = string.format('%s/lm_%s_epoch%.2f_%.4f.t7', opt.checkpoint_dir, opt.savefile, epoch, val_loss) 344 | print('saving checkpoint to ' .. savefile) 345 | local checkpoint = {} 346 | checkpoint.protos = protos 347 | checkpoint.opt = opt 348 | checkpoint.train_losses = train_losses 349 | checkpoint.val_loss = val_loss 350 | checkpoint.val_losses = val_losses 351 | checkpoint.i = i 352 | checkpoint.epoch = epoch 353 | checkpoint.vocab = loader.vocab_mapping 354 | torch.save(savefile, checkpoint) 355 | end 356 | 357 | if i % opt.print_every == 0 then 358 | print(string.format("%d/%d (epoch %.3f), train_loss = %6.8f, grad/param norm = %6.4e, time/batch = %.4fs", i, iterations, epoch, train_loss, grad_params:norm() / params:norm(), time)) 359 | end 360 | 361 | if i % 10 == 0 then collectgarbage() end 362 | 363 | -- handle early stopping if things are going really bad 364 | if loss[1] ~= loss[1] then 365 | print('loss is NaN. This usually indicates a bug. Please check the issues page for existing issues, or create a new issue, if none exist. Ideally, please state: your operating system, 32-bit/64-bit, your blas version, cpu/cuda/cl?') 366 | break -- halt 367 | end 368 | if loss0 == nil then loss0 = loss[1] end 369 | if loss[1] > loss0 * 3 then 370 | print('loss is exploding, aborting.') 371 | break -- halt 372 | end 373 | end 374 | 375 | 376 | --------------------------------------------------------------------------------