├── README.md
├── deepqlearn.lua
├── deepqlearn.moon
├── gpuqlearn.lua
├── gpuqlearn.moon
├── test.lua
└── test.moon

/README.md:
--------------------------------------------------------------------------------
1 | DeepQLearning
2 | =============
3 | 
4 | Written by Blake Milner and Jeff Soldate, with help from Eugenio Culurciello and his lab. Work was
5 | done as part of a project for BME495, a Computational Neuroscience course at Purdue. The original
6 | code, written in JavaScript, was developed by Andrej Karpathy, a Ph.D. student at Stanford University.
7 | 
8 | Deep Q-Learning is a powerful machine learning algorithm that combines Q-learning with a neural network: the network approximates the Q-function over the state space, bypassing inefficient static lookup tables. This application was implemented using Torch 7 and Lua.
9 | 
10 | In many practical engineering scenarios an algorithm must perform a
11 | series of decisions in order to accomplish a given task. However, the task itself is not always
12 | well-defined, and the intermediate decisions needed to accomplish it are often complex and ever-changing.
13 | Furthermore, information that contributes to accomplishing the task is often not available
14 | until critical intermediate decisions have already been made. Video games are a good example of
15 | situations in which a series of actions is required in order to accomplish a task.
16 | 
17 | This application also presents an AI-based approach to learning a game whose rules aren't
18 | known in advance. In recent years, very robust algorithms built on these concepts have been developed and applied
19 | successfully to retro Atari video games: http://arxiv.org/pdf/1312.5602v1.pdf.
20 | 
21 | Reinforcement learning methods that encourage both exploration and strategizing have been developed
22 | to address this problem. One of these methods, called Q-learning, uses a learned policy to
23 | select an optimal action.
24 | 
25 | The Q-learning algorithm hinges on a utility function called the Q-function. This function
26 | accepts a state, which contains all pertinent information about the playing field, along with a possible
27 | action, and returns a number describing the utility of taking that action. In Q-learning the utility
28 | of an action is evaluated from the immediate reward gained by taking it plus the discounted
29 | utility of the state it leads to. For large games with many states and possible
30 | actions, enumerating every state/action pair in a table is time-consuming and computationally intense. Using a neural network to
31 | represent the Q-function avoids this enumeration while still allowing the full
32 | state space to be explored.
33 | 
34 | An implementation of the method described above (written in JavaScript) exists and is freely available:
35 | http://cs.stanford.edu/people/karpathy/convnetjs/demo/rldemo.html
36 | 
37 | However, that package is designed for the browser and is used primarily as a learning tool. DeepQLearning is a
38 | partial port of its Q-learning component to the Lua scripting language. The neural network
39 | component is powered by Torch 7, a scientific computing framework used for machine learning. It is the hope
40 | of the authors that this package can be used to fuel further scientific inquiry into this topic.
41 | 
42 | That page also hosts a browser game that the JS Q-learning engine learns from scratch. With well-chosen settings,
43 | the agent learns to play the game flawlessly after about 15 minutes.
44 | 
45 | 
46 | Installation and Use
47 | ====================
48 | 
49 | Requirements:
50 | 
51 | * Torch7 (with the nnx and optim packages)
52 |   -- a scientific computing framework with wide support for machine learning algorithms (https://github.com/torch/torch7)
53 | 
54 | 
55 | Usage:
56 | 
57 | The DeepQLearning module can be included in a Lua script using:
58 | 
59 | ```lua
60 | Brain = require 'deepqlearn'
61 | ```
62 | 
63 | The brain must then be initialized with the number of expected inputs and outputs:
64 | 
65 | ```lua
66 | Brain.init(num_inputs, num_outputs)
67 | ```
68 | 
69 | An action can be selected for an input state using:
70 | 
71 | ```lua
72 | action = Brain.forward(state);
73 | ```
74 | 
75 | Learning is applied to the last state passed to Brain.forward by giving a reward value:
76 | 
77 | ```lua
78 | Brain.backward(reward);
79 | ```
80 | 
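The calls above are exercised end to end by the bundled test.lua, which teaches the brain a toy "guess the outcome" task. The condensed sketch below follows that script (progress reporting and the final accuracy loop are omitted); `randf` is a global helper that deepqlearn defines when loaded.

```lua
Brain = require 'deepqlearn'

local num_outcomes = 3
Brain.init(num_outcomes, num_outcomes)  -- 3 inputs, 3 possible actions

-- training: reward the brain whenever its action matches the random outcome
for k = 1, 1000 do
    local rand_outcome = math.random(1, num_outcomes)

    -- build a state whose values hint at the correct outcome
    local state = {}
    for i = 1, num_outcomes do
        state[i] = randf(rand_outcome, rand_outcome + 1)
    end

    local action = Brain.forward(state)                   -- index of the chosen action
    Brain.backward((action == rand_outcome) and 1 or 0)   -- learning magic happens
end

-- evaluation: no more random exploration, no more weight updates
Brain.epsilon_test_time = 0.0
Brain.learning = false
```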
--------------------------------------------------------------------------------
/deepqlearn.lua:
--------------------------------------------------------------------------------
1 | require 'math'
2 | require 'nnx'
3 | require 'os'
4 | require 'optim'
5 | 
6 | 
7 | math.randomseed( os.time() )
8 | torch.setdefaulttensortype('torch.FloatTensor')
9 | 
10 | local Brain = {}
11 | 
12 | --[[ HELPER FUNCTIONS --]]
13 | 
14 | function randf(s, e)
15 |     return (math.random(0,(e-s)*9999)/10000) + s;
16 | end
17 | 
18 | -- new methods for table
19 | 
20 | function table.merge(t1, t2)
21 |     local t = t1
22 |     for i = 1, #t2 do
23 |         t[#t+1] = t2[i]
24 |     end
25 |     return t
26 | end
27 | 
28 | function table.copy(t)
29 |     local u = { }
30 |     for k, v in pairs(t) do u[k] = v end
31 |     return setmetatable(u, getmetatable(t))
32 | end
33 | 
34 | function table.length(T)
35 |     local count = 0
36 |     for _ in pairs(T) do count = count + 1 end
37 |     return count
38 | end
39 | 
40 | -- BRAIN
41 | 
42 | function Brain.init(num_states, num_actions)
43 |     -- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :)
44 |     Brain.temporal_window = 2
45 |     -- Maximum number of experiences that we will save for training
46 |     Brain.experience_size = 30000
47 |     -- experience necessary to start learning
48 |     Brain.start_learn_threshold = 300
49 |     -- gamma is a crucial parameter that controls how much plan-ahead the agent does. In [0,1]
50 |     -- Determines the amount of weight placed on the utility of the state resulting from an action.
51 |     Brain.gamma = 0.9;
52 |     -- number of steps we will learn for
53 |     Brain.learning_steps_total = 100000
54 |     -- how many steps of the above to perform only random actions (in the beginning)?
55 |     Brain.learning_steps_burnin = 300;
56 |     -- controls exploration exploitation tradeoff. Will decay over time
57 |     -- a higher epsilon means we are more likely to choose random actions
58 |     Brain.epsilon = 1.0
59 |     -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at end
60 |     Brain.epsilon_min = 0.05;
61 |     -- what epsilon to use when learning is turned off. This is for testing
62 |     Brain.epsilon_test_time = 0.01;
63 | 
64 |     --[[ states and actions that go into neural net:
65 |          (state0,action0),(state1,action1), ... , (stateN)
66 |          this variable controls the size of that temporal window.
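         (for example, with num_states = 3, num_actions = 3, and temporal_window = 2,
          as in test.lua, this gives net_inputs = (3 + 3) * 2 + 3 = 15)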
67 | --]] 68 | Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states; 69 | Brain.hidden_nodes = 16 70 | Brain.num_states = num_states; 71 | Brain.num_actions = num_actions; 72 | Brain.net_outputs = Brain.num_actions; 73 | 74 | --[[ Window size dictates the number of states, actions, rewards, and net inputs that we 75 | save. The temporal window size is the number of time states/actions that are input 76 | to the network and must be smaller than or equal to window_size 77 | --]] 78 | Brain.window_size = math.max(Brain.temporal_window, 2); 79 | 80 | -- advanced feature. Sometimes a random action should be biased towards some values 81 | -- for example in flappy bird, we may want to choose to not flap more often 82 | Brain.random_action_distribution = {}; 83 | if(table.length(Brain.random_action_distribution) > 0) then 84 | -- this better sum to 1 by the way, and be of length this.num_actions 85 | if(table.length(Brain.random_action_distribution) ~= Brain.num_actions) then 86 | print('TROUBLE. random_action_distribution should be same length as num_actions.'); 87 | end 88 | 89 | local s = 0.0; 90 | 91 | for k = 1, table.length(Brain.random_action_distribution) do 92 | s = s + Brain.random_action_distribution[k] 93 | end 94 | 95 | if(math.abs(s - 1.0) > 0.0001) then 96 | print('TROUBLE. random_action_distribution should sum to 1!'); 97 | end 98 | end 99 | 100 | 101 | -- define architecture 102 | Brain.net = nn.Sequential() 103 | 104 | Brain.net:add(nn.Linear(Brain.net_inputs, Brain.hidden_nodes)) 105 | Brain.net:add(nn.Threshold(0,0)) 106 | 107 | Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.hidden_nodes)) 108 | Brain.net:add(nn.Threshold(0,0)) 109 | 110 | Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.net_outputs)) 111 | 112 | Brain.criterion = nn.MSECriterion() 113 | 114 | 115 | -- other learning parameters 116 | Brain.learning_rate = 0.01; 117 | Brain.learning_rate_decay = 5e-7 118 | Brain.batch_size = 16; 119 | Brain.momentum = 0.9; 120 | 121 | -- various housekeeping variables 122 | Brain.age = 0; -- incremented every backward() 123 | 124 | -- number of times we've called forward - lets us know when our input temporal 125 | -- window is filled up 126 | Brain.forward_passes = 0; 127 | Brain.learning = true; 128 | 129 | -- coefficients for regression 130 | Brain.coefL1 = 0.001 131 | Brain.coefL2 = 0.001 132 | 133 | -- parameters for optim.sgd 134 | Brain.parameters, Brain.gradParameters = Brain.net:getParameters() 135 | 136 | local exp_table_size = (Brain.experience_size + 1) * (Brain.net_inputs * 2 + 2) 137 | io.write(string.format('\nAllocating %.2f GB for experience table...\n\n', (4 * exp_table_size)/(1024^3))) 138 | -- experience table 139 | Brain.experience = torch.Tensor(exp_table_size) 140 | -- tracks number of experiences input into the experience table 141 | Brain.eCount = 0 142 | -- These windows track old experiences, states, actions, rewards, and net inputs 143 | -- over time. They should all start out as empty with a fixed size. 144 | -- This is a first in, last out data structure that is shifted along time 145 | Brain.state_window = {} 146 | Brain.action_window = {} 147 | Brain.reward_window = {} 148 | Brain.net_window = {} 149 | for i = 1, Brain.window_size do 150 | Brain.state_window[i] = {} 151 | Brain.action_window[i] = {} 152 | Brain.reward_window[i] = {} 153 | Brain.net_window[i] = {} 154 | end 155 | end 156 | 157 | -- a bit of a helper function. 
It returns a random action 158 | -- we are abstracting this away because in future we may want to 159 | -- do more sophisticated things. For example some actions could be more 160 | -- or less likely at "rest"/default state. 161 | function Brain.random_action() 162 | -- if we don't have a random action distribution defined then sample evenly 163 | if(table.length(Brain.random_action_distribution) == 0) then 164 | return (torch.random() % Brain.net_outputs) + 1 165 | 166 | -- okay, lets do some fancier sampling: 167 | else 168 | local p = randf(0, 1); 169 | local cumprob = 0.0; 170 | 171 | for k= 1, Brain.num_actions do 172 | cumprob = cumprob + Brain.random_action_distribution[k]; 173 | 174 | if(p < cumprob) then 175 | return k 176 | end 177 | end 178 | end 179 | end 180 | 181 | -- compute the value of doing any action in this state 182 | -- and return the argmax action and its value 183 | function Brain.policy(state) 184 | local action_values = Brain.net:forward(state); 185 | 186 | local maxval = action_values[1] 187 | local max_index = 1 188 | 189 | -- find maximum output and note its index and value 190 | for i = 2, Brain.net_outputs do 191 | if action_values[i] > maxval then 192 | maxval = action_values[i] 193 | max_index = i 194 | end 195 | end 196 | 197 | return {action = max_index, value = maxval}; 198 | end 199 | 200 | -- This function assembles the input to the network by concatenating 201 | -- old (state, chosen_action) pairs along with the current state 202 | -- return s = (x,a,x,a,x,a,xt) state vector. 203 | function Brain.getNetInput(xt) 204 | local w = {}; 205 | w = table.merge(w, xt); -- start with current state 206 | 207 | -- and now go backwards and append states and actions from history temporal_window times 208 | local n = Brain.window_size + 1; 209 | for k = 1, Brain.temporal_window do 210 | -- state 211 | w = table.merge(w, Brain.state_window[n-k]); 212 | -- action, encoded as 1-of-k indicator vector. We scale it up a bit because 213 | -- we dont want weight regularization to undervalue this information, as it only exists once 214 | local action1ofk = {}; 215 | for i = 1, Brain.num_actions do 216 | action1ofk[i] = 0 217 | end 218 | 219 | -- assign action taken for current state to be 1, all others are 0 220 | action1ofk[Brain.action_window[n-k]] = 1.0*Brain.num_states; 221 | 222 | w = table.merge(w, action1ofk); 223 | end 224 | 225 | return w; 226 | end 227 | 228 | --[[ This function computes an action by either: 229 | 1. Giving the current state and past (state, action) pairs to the network 230 | and letting it choose the best acction 231 | 2. 
Choosing a random action 232 | --]] 233 | function Brain.forward(input_array) 234 | Brain.forward_passes = Brain.forward_passes + 1; 235 | 236 | local action, net_input; 237 | 238 | -- if we have enough (state, action) pairs in our memory to fill up 239 | -- our network input then we'll proceed to let our network choose the action 240 | if(Brain.forward_passes > Brain.temporal_window ) then 241 | net_input = Brain.getNetInput(input_array); 242 | net_input = torch.Tensor(net_input) 243 | 244 | -- if learning is turned on then epsilon should be decaying 245 | if(Brain.learning) then 246 | -- compute (decaying) epsilon for the epsilon-greedy policy 247 | local new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin)/(Brain.learning_steps_total - Brain.learning_steps_burnin) 248 | 249 | -- don't let epsilon go above 1.0 250 | Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon)); 251 | else 252 | -- if learning is turned off then use the epsilon we've specified for testing 253 | Brain.epsilon = Brain.epsilon_test_time; 254 | end 255 | 256 | -- use epsilon probability to choose whether we use network action or random action 257 | if(randf(0, 1) < Brain.epsilon) then 258 | action = Brain.random_action(); 259 | else 260 | -- otherwise use our policy to make decision 261 | local best_action = Brain.policy(net_input); 262 | action = best_action.action; -- this is the action number 263 | end 264 | else 265 | -- pathological case that happens first few iterations when we can't 266 | -- fill up our network inputs. Just default to random action in this case 267 | net_input = {}; 268 | action = Brain.random_action(); 269 | end 270 | 271 | -- shift the network input, state, and action chosen into our windows 272 | table.remove( Brain.net_window, 1) 273 | table.insert( Brain.net_window, net_input) 274 | 275 | table.remove( Brain.state_window, 1) 276 | table.insert( Brain.state_window, input_array) 277 | 278 | table.remove( Brain.action_window, 1) 279 | table.insert( Brain.action_window, action) 280 | 281 | return action; 282 | end 283 | 284 | --[[ 285 | This function trains the network using the reward resulting from the last action 286 | It will save this past experience which consists of: 287 | the state, action chosen, whether a reward was obtained, and the 288 | state that resulted from the action 289 | After that, it will train the network (using a batch of experiences) using a 290 | random sampling of our entire experience history. 
291 | --]] 292 | function Brain.backward(reward) 293 | -- add reward to our history 294 | table.remove( Brain.reward_window, 1) 295 | table.insert( Brain.reward_window, reward) 296 | 297 | -- if learning is turned off then don't do anything 298 | if(not Brain.learning) then 299 | return; 300 | end 301 | 302 | -- sizes of tensors 303 | local e_size 304 | local state0_size 305 | local action0_size 306 | local reward0_size 307 | local state1_size 308 | 309 | Brain.age = Brain.age + 1; 310 | 311 | -- if we've had enough states and actions to fill up our net input then add 312 | -- this new experience to our history 313 | if(Brain.forward_passes > Brain.temporal_window + 1) then 314 | -- make experience and fill it up 315 | local n = Brain.window_size; 316 | local state0 = Brain.net_window[n-1]:clone(); 317 | state0_size = state0:size(1) 318 | local action0 = torch.Tensor({Brain.action_window[n-1]}); 319 | action0_size = action0:size(1) 320 | local reward0 = torch.Tensor({Brain.reward_window[n-1]}); 321 | reward0_size = reward0:size(1) 322 | local state1 = Brain.net_window[n]:clone(); 323 | state1_size = state1:size(1) 324 | 325 | local e = torch.cat({state0, action0, reward0, state1}) 326 | e_size = e:size(1) -- experience table size 327 | 328 | -- if the number of experiences isn't larger than the max then add more 329 | if(Brain.eCount < Brain.experience_size) then 330 | Brain.experience:sub(Brain.eCount*e_size + 1, (Brain.eCount + 1)*e_size):copy(e) 331 | Brain.eCount = Brain.eCount + 1 -- track number of experiences 332 | else 333 | -- Otherwise replace random experience due to finite allocated memory for the experience table 334 | local ri = torch.random(0, Brain.eCount-1); 335 | Brain.experience:sub(ri*e_size + 1, (ri + 1)*e_size):copy(e) 336 | end 337 | end 338 | 339 | -- if we have enough experience in memory then start training 340 | if(Brain.eCount > Brain.start_learn_threshold) then 341 | inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs) 342 | targets = torch.Tensor(Brain.batch_size, Brain.net_outputs) 343 | 344 | for k = 1, Brain.batch_size do 345 | -- choose random experience 346 | local re = math.random(0, Brain.eCount-1); 347 | local e = torch.Tensor(Brain.experience:sub(re*e_size + 1, (re + 1)*e_size)) 348 | 349 | -- copy state from experience 350 | local state0 = e:sub(1, state0_size):clone() 351 | 352 | -- compute best action for the new state 353 | local state1 = e:sub(state0_size + action0_size + reward0_size + 1, state0_size + action0_size + reward0_size + state1_size):clone() 354 | 355 | local best_action = Brain.policy(state1); 356 | 357 | --[[ get current action output values 358 | we want to make the target outputs the same as the actual outputs 359 | expect for the action that was chose - we want to replace this with 360 | the reward that was obtained + the utility of the resulting state 361 | --]] 362 | local all_outputs = Brain.net:forward(state0); 363 | inputs[k] = state0:clone(); 364 | targets[k] = all_outputs:clone(); 365 | local action0 = e:sub(state0_size + 1, state0_size + action0_size) 366 | local reward0 = e:sub(state0_size + action0_size + 1, state0_size + action0_size + reward0_size) 367 | targets[k][action0[1]] = reward0[1] + Brain.gamma * best_action.value; 368 | end 369 | 370 | -- create training function to give to optim.sgd 371 | local feval = function(x) 372 | collectgarbage() 373 | 374 | -- get new network parameters 375 | if x ~= Brain.parameters then 376 | Brain.parameters:copy(x) 377 | end 378 | 379 | -- reset gradients 380 | 
Brain.gradParameters:zero() 381 | 382 | -- evaluate function for complete mini batch 383 | local outputs = Brain.net:forward(inputs) 384 | local f = Brain.criterion:forward(outputs, targets) 385 | 386 | -- estimate df/dW 387 | local df_do = Brain.criterion:backward(outputs, targets) 388 | Brain.net:backward(inputs, df_do) 389 | 390 | -- penalties (L1 and L2): 391 | if Brain.coefL1 ~= 0 or Brain.coefL2 ~= 0 then 392 | -- locals: 393 | local norm,sign = torch.norm,torch.sign 394 | 395 | -- Loss: 396 | f = f + Brain.coefL1 * norm(Brain.parameters,1) 397 | f = f + Brain.coefL2 * norm(Brain.parameters,2)^2/2 398 | 399 | -- Gradients: 400 | Brain.gradParameters:add( sign(Brain.parameters):mul(Brain.coefL1) + Brain.parameters:clone():mul(Brain.coefL2) ) 401 | end 402 | 403 | -- return f and df/dX 404 | return f, Brain.gradParameters 405 | end 406 | 407 | -- fire up optim.sgd 408 | sgdState = { 409 | learningRate = Brain.learning_rate, 410 | momentum = Brain.momentum, 411 | learningRateDecay = Brain.learning_rate_decay 412 | } 413 | 414 | optim.sgd(feval, Brain.parameters, sgdState) 415 | 416 | end 417 | end 418 | 419 | 420 | 421 | -- export 422 | return Brain 423 | 424 | 425 | 426 | 427 | -------------------------------------------------------------------------------- /deepqlearn.moon: -------------------------------------------------------------------------------- 1 | require 'math' 2 | require 'nnx' 3 | require 'os' 4 | require 'optim' 5 | 6 | math.randomseed os.time! 7 | torch.setdefaulttensortype 'torch.FloatTensor' 8 | 9 | Brain = {} 10 | 11 | -- HELPER FUNCTIONS -- 12 | 13 | export randf = (s, e) -> 14 | return (math.random(0, (e - s) * 9999) / 10000) + s 15 | 16 | -- new methods for table 17 | 18 | table.merge = (t1, t2) -> 19 | t = t1 20 | for i = 1, #t2 21 | t[#t + 1] = t2[i] 22 | return t 23 | 24 | table.copy = (t) -> 25 | u = {k, v for k, v in pairs t} 26 | return setmetatable(u, getmetatable t) 27 | 28 | table.length = (T) -> 29 | count = 0 30 | count += 1 for _ in pairs T 31 | return count 32 | 33 | -- returns experience table for single network decision 34 | -- contains the state, action chosen, whether a reward was obtained, and the 35 | -- state that resulted from the action. This is later used to train the network 36 | -- Remember that the utility of an action is evaluated from the reward gained and 37 | -- the utility of the state it led to (recursive definition) 38 | export Experience = (state0, action0, reward0, state1) -> 39 | NewExperience = 40 | state0: state0 41 | action0: action0 42 | reward0: reward0 43 | state1: state1 44 | return NewExperience 45 | 46 | -- BRAIN 47 | 48 | Brain.init = (num_states, num_actions) -> 49 | -- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :) 50 | Brain.temporal_window = 2 51 | -- Maximum number of experiences that we will save for training 52 | Brain.experience_size = 30000 53 | -- experience necessary to start learning 54 | Brain.start_learn_threshold = 300 55 | -- gamma is a crucial parameter that controls how much plan-ahead the agent does. In [0,1] 56 | -- Determines the amount of weight placed on the utility of the state resulting from an action. 57 | Brain.gamma = 0.9 58 | -- number of steps we will learn for 59 | Brain.learning_steps_total = 100000 60 | -- how many steps of the above to perform only random actions (in the beginning)? 61 | Brain.learning_steps_burnin = 300 62 | -- controls exploration exploitation tradeoff. 
Will decay over time 63 | -- a higher epsilon means we are more likely to choose random actions 64 | Brain.epsilon = 1.0 65 | -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at end 66 | Brain.epsilon_min = 0.05 67 | -- what epsilon to use when learning is turned off. This is for testing 68 | Brain.epsilon_test_time = 0.01 69 | 70 | [[== states and actions that go into neural net: 71 | (state0,action0),(state1,action1), ... , (stateN) 72 | this variable controls the size of that temporal window. 73 | ]] 74 | Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states 75 | Brain.hidden_nodes = 16 76 | Brain.num_states = num_states 77 | Brain.num_actions = num_actions 78 | Brain.net_outputs = Brain.num_actions 79 | 80 | [[== Window size dictates the number of states, actions, rewards, and net inputs that we 81 | save. The temporal window size is the number of time states/actions that are input 82 | to the network and must be smaller than or equal to window_size 83 | ]] 84 | Brain.window_size = math.max Brain.temporal_window, 2 85 | 86 | -- advanced feature. Sometimes a random action should be biased towards some values 87 | -- for example in flappy bird, we may want to choose to not flap more often 88 | Brain.random_action_distribution = {} 89 | if table.length(Brain.random_action_distribution) > 0 90 | -- this better sum to 1 by the way, and be of length this.num_actions 91 | if table.length(Brain.random_action_distribution) != Brain.num_actions 92 | print 'TROUBLE. random_action_distribution should be same length as num_actions.' 93 | 94 | s = 0.0 95 | 96 | for k = 1, table.length Brain.random_action_distribution 97 | s += Brain.random_action_distribution[k] 98 | 99 | if math.abs(s - 1.0) > 0.0001 100 | print 'TROUBLE. random_action_distribution should sum to 1!' 101 | 102 | 103 | -- define architecture 104 | Brain.net = nn.Sequential! 105 | 106 | Brain.net\add nn.Linear Brain.net_inputs, Brain.hidden_nodes 107 | Brain.net\add nn.Threshold 0, 0 108 | 109 | Brain.net\add nn.Linear Brain.hidden_nodes, Brain.hidden_nodes 110 | Brain.net\add nn.Threshold 0, 0 111 | 112 | Brain.net\add nn.Linear Brain.hidden_nodes, Brain.net_outputs 113 | 114 | Brain.criterion = nn.MSECriterion! 115 | 116 | 117 | -- other learning parameters 118 | Brain.learning_rate = 0.01 119 | Brain.learning_rate_decay = 5e-7 120 | Brain.batch_size = 16 121 | Brain.momentum = 0.9 122 | 123 | -- various housekeeping variables 124 | Brain.age = 0 -- incremented every backward! 125 | 126 | -- number of times we've called forward - lets us know when our input temporal 127 | -- window is filled up 128 | Brain.forward_passes = 0 129 | Brain.learning = true 130 | 131 | -- coefficients for regression 132 | Brain.coefL1 = 0.001 133 | Brain.coefL2 = 0.001 134 | 135 | -- parameters for optim.sgd 136 | Brain.parameters, Brain.gradParameters = Brain.net\getParameters! 137 | 138 | -- These windows track old experiences, states, actions, rewards, and net inputs 139 | -- over time. They should all start out as empty with a fixed size. 140 | -- This is a first in, last out data structure that is shifted along time 141 | Brain.experience = {} 142 | Brain.state_window = {} 143 | Brain.action_window = {} 144 | Brain.reward_window = {} 145 | Brain.net_window = {} 146 | for i = 1, Brain.window_size 147 | Brain.state_window[i] = {} 148 | Brain.action_window[i] = {} 149 | Brain.reward_window[i] = {} 150 | Brain.net_window[i] = {} 151 | 152 | -- a bit of a helper function. 
It returns a random action 153 | -- we are abstracting this away because in future we may want to 154 | -- do more sophisticated things. For example some actions could be more 155 | -- or less likely at "rest"/default state. 156 | Brain.random_action = -> 157 | -- if we don't have a random action distribution defined then sample evenly 158 | if table.length(Brain.random_action_distribution) == 0 159 | return (torch.random! % Brain.net_outputs) + 1 160 | 161 | -- okay, lets do some fancier sampling: 162 | else 163 | p = randf 0, 1 164 | cumprob = 0.0 165 | 166 | for k = 1, Brain.num_actions 167 | cumprob += Brain.random_action_distribution[k] 168 | 169 | if p < cumprob 170 | return k 171 | 172 | -- compute the value of doing any action in this state 173 | -- and return the argmax action and its value 174 | Brain.policy = (state) -> 175 | tensor_state = torch.Tensor state 176 | action_values = Brain.net\forward tensor_state 177 | 178 | maxval = action_values[1] 179 | max_index = 1 180 | 181 | -- find maximum output and note its index and value 182 | --max_index = i for i = 2, Brain.net_outputs when action_values[i] > action_values[max_index] 183 | for i = 2, Brain.net_outputs 184 | if action_values[i] > maxval 185 | maxval = action_values[i] 186 | max_index = i 187 | 188 | return action: max_index, value: maxval 189 | 190 | -- This function assembles the input to the network by concatenating 191 | -- old (state, chosen_action) pairs along with the current state 192 | -- return s = (x,a,x,a,x,a,xt) state vector. 193 | Brain.getNetInput = (xt) -> 194 | w = {} 195 | w = table.merge(w, xt) -- start with current state 196 | 197 | -- and now go backwards and append states and actions from history temporal_window times 198 | n = Brain.window_size + 1 199 | for k = 1, Brain.temporal_window do 200 | -- state 201 | w = table.merge w, Brain.state_window[n - k] 202 | -- action, encoded as 1-of-k indicator vector. We scale it up a bit because 203 | -- we don't want weight regularization to undervalue this information, as it only exists once 204 | action1ofk = {} 205 | action1ofk[i] = 0 for i = 1, Brain.num_actions 206 | 207 | -- assign action taken for current state to be 1, all others are 0 208 | action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states 209 | 210 | w = table.merge w, action1ofk 211 | 212 | return w 213 | 214 | -- This function computes an action by either: 215 | -- 1. Giving the current state and past (state, action) pairs to the network 216 | -- and letting it choose the best acction 217 | -- 2. 
Choosing a random action 218 | Brain.forward = (input_array) -> 219 | Brain.forward_passes += 1 220 | 221 | local action, net_input 222 | 223 | -- if we have enough (state, action) pairs in our memory to fill up 224 | -- our network input then we'll proceed to let our network choose the action 225 | if Brain.forward_passes > Brain.temporal_window 226 | net_input = Brain.getNetInput input_array 227 | 228 | -- if learning is turned on then epsilon should be decaying 229 | if Brain.learning 230 | -- compute (decaying) epsilon for the epsilon-greedy policy 231 | new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin)/(Brain.learning_steps_total - Brain.learning_steps_burnin) 232 | 233 | -- don't let epsilon go above 1.0 234 | Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon)) 235 | else 236 | -- if learning is turned off then use the epsilon we've specified for testing 237 | Brain.epsilon = Brain.epsilon_test_time 238 | 239 | -- use epsilon probability to choose whether we use network action or random action 240 | if randf(0, 1) < Brain.epsilon 241 | action = Brain.random_action! 242 | else 243 | -- otherwise use our policy to make decision 244 | best_action = Brain.policy net_input 245 | action = best_action.action -- this is the action number 246 | else 247 | -- pathological case that happens first few iterations when we can't 248 | -- fill up our network inputs. Just default to random action in this case 249 | net_input = {} 250 | action = Brain.random_action! 251 | 252 | -- shift the network input, state, and action chosen into our windows 253 | table.remove Brain.net_window, 1 254 | table.insert Brain.net_window, net_input 255 | 256 | table.remove Brain.state_window, 1 257 | table.insert Brain.state_window, input_array 258 | 259 | table.remove Brain.action_window, 1 260 | table.insert Brain.action_window, action 261 | 262 | return action 263 | 264 | -- This function trains the network using the reward resulting from the last action 265 | -- It will save this past experience which consists of: 266 | -- the state, action chosen, whether a reward was obtained, and the 267 | -- state that resulted from the action 268 | -- After that, it will train the network (using a batch of experiences) using a 269 | -- random sampling of our entire experience history. 270 | Brain.backward = (reward) -> 271 | -- add reward to our history 272 | table.remove Brain.reward_window, 1 273 | table.insert Brain.reward_window, reward 274 | 275 | -- if learning is turned off then don't do anything 276 | return unless Brain.learning 277 | 278 | Brain.age += 1 279 | 280 | -- if we've had enough states and actions to fill up our net input then add 281 | -- this new experience to our history 282 | if Brain.forward_passes > Brain.temporal_window + 1 283 | -- make experience and fill it up 284 | e = Experience nil, nil, nil, nil 285 | n = Brain.window_size 286 | e.state0 = Brain.net_window[n - 1] 287 | e.action0 = Brain.action_window[n - 1] 288 | e.reward0 = Brain.reward_window[n - 1] 289 | e.state1 = Brain.net_window[n] 290 | 291 | -- if our experience table isn't larger than the max size then expand 292 | if table.length(Brain.experience) < Brain.experience_size 293 | table.insert Brain.experience, e 294 | -- Otherwise replace random experience. finite memory! 
295 | else 296 | ri = torch.random 1, Brain.experience_size 297 | Brain.experience[ri] = e 298 | 299 | -- if we have enough experience in memory then start training 300 | if table.length(Brain.experience) > Brain.start_learn_threshold 301 | inputs = torch.Tensor Brain.batch_size, Brain.net_inputs 302 | targets = torch.Tensor Brain.batch_size, Brain.net_outputs 303 | 304 | for k = 1, Brain.batch_size 305 | -- choose random experience 306 | re = math.random 1, table.length Brain.experience 307 | e = Brain.experience[re] 308 | 309 | -- copy state from experience 310 | x = torch.Tensor e.state0 311 | 312 | -- compute best action for the new state 313 | best_action = Brain.policy e.state1 314 | 315 | -- get current action output values 316 | -- we want to make the target outputs the same as the actual outputs 317 | -- expect for the action that was chose - we want to replace this with 318 | -- the reward that was obtained + the utility of the resulting state 319 | all_outputs = Brain.net\forward x 320 | inputs[k] = x\clone! 321 | targets[k] = all_outputs\clone! 322 | targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value 323 | 324 | -- create training function to give to optim.sgd 325 | feval = (x) -> 326 | collectgarbage! 327 | 328 | -- get new network parameters 329 | Brain.parameters\copy x unless x == Brain.parameters 330 | 331 | -- reset gradients 332 | Brain.gradParameters\zero! 333 | 334 | -- evaluate function for complete mini batch 335 | outputs = Brain.net\forward inputs 336 | f = Brain.criterion\forward outputs, targets 337 | 338 | -- estimate df/dW 339 | df_do = Brain.criterion\backward outputs, targets 340 | Brain.net\backward inputs, df_do 341 | 342 | -- penalties (L1 and L2): 343 | if Brain.coefL1 != 0 or Brain.coefL2 != 0 344 | -- locals: 345 | norm,sign = torch.norm, torch.sign 346 | 347 | -- Loss: 348 | f += Brain.coefL1 * norm Brain.parameters, 1 349 | f += Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2 350 | 351 | -- Gradients: 352 | Brain.gradParameters\add(sign(Brain.parameters)\mul(Brain.coefL1) + Brain.parameters\clone!\mul Brain.coefL2) 353 | 354 | -- return f and df/dX 355 | return f, Brain.gradParameters 356 | 357 | -- fire up optim.sgd 358 | sgdState = 359 | learningRate: Brain.learning_rate 360 | momentum: Brain.momentum 361 | learningRateDecay: Brain.learning_rate_decay 362 | 363 | optim.sgd feval, Brain.parameters, sgdState 364 | 365 | 366 | 367 | -- export 368 | return Brain -------------------------------------------------------------------------------- /gpuqlearn.lua: -------------------------------------------------------------------------------- 1 | require('math') 2 | require('nnx') 3 | require('os') 4 | require('optim') 5 | require('cutorch') 6 | require('cunn') 7 | math.randomseed(os.time()) 8 | torch.setdefaulttensortype('torch.FloatTensor') 9 | local Brain = { } 10 | randf = function(s, e) 11 | return (math.random(0, (e - s) * 9999) / 10000) + s 12 | end 13 | table.merge = function(t1, t2) 14 | local t = t1 15 | for i = 1, #t2 do 16 | t[#t + 1] = t2[i] 17 | end 18 | return t 19 | end 20 | table.copy = function(t) 21 | local u 22 | do 23 | local _tbl_0 = { } 24 | for k, v in pairs(t) do 25 | _tbl_0[k] = v 26 | end 27 | u = _tbl_0 28 | end 29 | return setmetatable(u, getmetatable(t)) 30 | end 31 | table.length = function(T) 32 | local count = 0 33 | for _ in pairs(T) do 34 | count = count + 1 35 | end 36 | return count 37 | end 38 | Experience = function(state0, action0, reward0, state1) 39 | local NewExperience = { 40 | state0 = state0, 
41 | action0 = action0, 42 | reward0 = reward0, 43 | state1 = state1 44 | } 45 | return NewExperience 46 | end 47 | Brain.init = function(num_states, num_actions) 48 | Brain.temporal_window = 2 49 | Brain.experience_size = 30000 50 | Brain.start_learn_threshold = 300 51 | Brain.gamma = 0.9 52 | Brain.learning_steps_total = 100000 53 | Brain.learning_steps_burnin = 300 54 | Brain.epsilon = 1.0 55 | Brain.epsilon_min = 0.05 56 | Brain.epsilon_test_time = 0.01 57 | local _ = [[== states and actions that go into neural net: 58 | (state0,action0),(state1,action1), ... , (stateN) 59 | this variable controls the size of that temporal window. 60 | ]] 61 | Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states 62 | Brain.hidden_nodes = 16 63 | Brain.num_states = num_states 64 | Brain.num_actions = num_actions 65 | Brain.net_outputs = Brain.num_actions 66 | _ = [[== Window size dictates the number of states, actions, rewards, and net inputs that we 67 | save. The temporal window size is the number of time states/actions that are input 68 | to the network and must be smaller than or equal to window_size 69 | ]] 70 | Brain.window_size = math.max(Brain.temporal_window, 2) 71 | Brain.random_action_distribution = { } 72 | if table.length(Brain.random_action_distribution) > 0 then 73 | if table.length(Brain.random_action_distribution) ~= Brain.num_actions then 74 | print('TROUBLE. random_action_distribution should be same length as num_actions.') 75 | end 76 | local s = 0.0 77 | for k = 1, table.length(Brain.random_action_distribution) do 78 | s = s + Brain.random_action_distribution[k] 79 | end 80 | if math.abs(s - 1.0) > 0.0001 then 81 | print('TROUBLE. random_action_distribution should sum to 1!') 82 | end 83 | end 84 | Brain.net = nn.Sequential() 85 | Brain.net:add(nn.Linear(Brain.net_inputs, Brain.hidden_nodes)) 86 | Brain.net:add(nn.Threshold(0, 0)) 87 | Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.hidden_nodes)) 88 | Brain.net:add(nn.Threshold(0, 0)) 89 | Brain.net:add(nn.Linear(Brain.hidden_nodes, Brain.net_outputs)) 90 | Brain.net:cuda() 91 | Brain.criterion = nn.MSECriterion():cuda() 92 | Brain.learning_rate = 0.01 93 | Brain.learning_rate_decay = 5e-7 94 | Brain.batch_size = 16 95 | Brain.momentum = 0.9 96 | Brain.age = 0 97 | Brain.forward_passes = 0 98 | Brain.learning = true 99 | Brain.coefL1 = 0.001 100 | Brain.coefL2 = 0.001 101 | Brain.parameters, Brain.gradParameters = Brain.net:getParameters() 102 | Brain.experience = { } 103 | Brain.state_window = { } 104 | Brain.action_window = { } 105 | Brain.reward_window = { } 106 | Brain.net_window = { } 107 | for i = 1, Brain.window_size do 108 | Brain.state_window[i] = { } 109 | Brain.action_window[i] = { } 110 | Brain.reward_window[i] = { } 111 | Brain.net_window[i] = { } 112 | end 113 | end 114 | Brain.random_action = function() 115 | if table.length(Brain.random_action_distribution) == 0 then 116 | return (torch.random() % Brain.net_outputs) + 1 117 | else 118 | local p = randf(0, 1) 119 | local cumprob = 0.0 120 | for k = 1, Brain.num_actions do 121 | cumprob = cumprob + Brain.random_action_distribution[k] 122 | if p < cumprob then 123 | return k 124 | end 125 | end 126 | end 127 | end 128 | Brain.policy = function(state) 129 | local tensor_state = torch.Tensor(state):cuda() 130 | local action_values = Brain.net:forward(tensor_state) 131 | local maxval = action_values[1] 132 | local max_index = 1 133 | for i = 2, Brain.net_outputs do 134 | if action_values[i] > maxval then 135 | maxval = action_values[i] 136 
| max_index = i 137 | end 138 | end 139 | return { 140 | action = max_index, 141 | value = maxval 142 | } 143 | end 144 | Brain.getNetInput = function(xt) 145 | local w = { } 146 | w = table.merge(w, xt) 147 | local n = Brain.window_size + 1 148 | for k = 1, Brain.temporal_window do 149 | w = table.merge(w, Brain.state_window[n - k]) 150 | local action1ofk = { } 151 | for i = 1, Brain.num_actions do 152 | action1ofk[i] = 0 153 | end 154 | action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states 155 | w = table.merge(w, action1ofk) 156 | end 157 | return w 158 | end 159 | Brain.forward = function(input_array) 160 | Brain.forward_passes = Brain.forward_passes + 1 161 | local action, net_input 162 | if Brain.forward_passes > Brain.temporal_window then 163 | net_input = Brain.getNetInput(input_array) 164 | if Brain.learning then 165 | local new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin) / (Brain.learning_steps_total - Brain.learning_steps_burnin) 166 | Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon)) 167 | else 168 | Brain.epsilon = Brain.epsilon_test_time 169 | end 170 | if randf(0, 1) < Brain.epsilon then 171 | action = Brain.random_action() 172 | else 173 | local best_action = Brain.policy(net_input) 174 | action = best_action.action 175 | end 176 | else 177 | net_input = { } 178 | action = Brain.random_action() 179 | end 180 | table.remove(Brain.net_window, 1) 181 | table.insert(Brain.net_window, net_input) 182 | table.remove(Brain.state_window, 1) 183 | table.insert(Brain.state_window, input_array) 184 | table.remove(Brain.action_window, 1) 185 | table.insert(Brain.action_window, action) 186 | return action 187 | end 188 | Brain.backward = function(reward) 189 | table.remove(Brain.reward_window, 1) 190 | table.insert(Brain.reward_window, reward) 191 | if not (Brain.learning) then 192 | return 193 | end 194 | Brain.age = Brain.age + 1 195 | if Brain.forward_passes > Brain.temporal_window + 1 then 196 | local e = Experience(nil, nil, nil, nil) 197 | local n = Brain.window_size 198 | e.state0 = Brain.net_window[n - 1] 199 | e.action0 = Brain.action_window[n - 1] 200 | e.reward0 = Brain.reward_window[n - 1] 201 | e.state1 = Brain.net_window[n] 202 | if table.length(Brain.experience) < Brain.experience_size then 203 | table.insert(Brain.experience, e) 204 | else 205 | local ri = torch.random(1, Brain.experience_size) 206 | Brain.experience[ri] = e 207 | end 208 | end 209 | if table.length(Brain.experience) > Brain.start_learn_threshold then 210 | local inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs):cuda() 211 | local targets = torch.Tensor(Brain.batch_size, Brain.net_outputs):cuda() 212 | for k = 1, Brain.batch_size do 213 | local re = math.random(1, table.length(Brain.experience)) 214 | local e = Brain.experience[re] 215 | local x = torch.Tensor(e.state0):cuda() 216 | local best_action = Brain.policy(e.state1) 217 | local all_outputs = Brain.net:forward(x) 218 | inputs[k] = x:clone() 219 | targets[k] = all_outputs:clone() 220 | targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value 221 | end 222 | local feval 223 | feval = function(x) 224 | collectgarbage() 225 | if not (x == Brain.parameters) then 226 | Brain.parameters:copy(x) 227 | end 228 | Brain.gradParameters:zero() 229 | local outputs = Brain.net:forward(inputs) 230 | local f = Brain.criterion:forward(outputs, targets) 231 | local df_do = Brain.criterion:backward(outputs, targets) 232 | Brain.net:backward(inputs, df_do) 233 | if Brain.coefL1 ~= 0 or Brain.coefL2 ~= 0 
then 234 | local norm, sign = torch.norm, torch.sign 235 | f = f + (Brain.coefL1 * norm(Brain.parameters, 1)) 236 | f = f + (Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2) 237 | Brain.gradParameters:add(sign(Brain.parameters):mul(Brain.coefL1) + Brain.parameters:clone():mul(Brain.coefL2)) 238 | end 239 | return f, Brain.gradParameters 240 | end 241 | local sgdState = { 242 | learningRate = Brain.learning_rate, 243 | momentum = Brain.momentum, 244 | learningRateDecay = Brain.learning_rate_decay 245 | } 246 | return optim.sgd(feval, Brain.parameters, sgdState) 247 | end 248 | end 249 | return Brain 250 | -------------------------------------------------------------------------------- /gpuqlearn.moon: -------------------------------------------------------------------------------- 1 | require 'math' 2 | require 'nnx' 3 | require 'os' 4 | require 'optim' 5 | require 'cutorch' 6 | require 'cunn' 7 | 8 | math.randomseed os.time! 9 | torch.setdefaulttensortype 'torch.FloatTensor' 10 | 11 | Brain = {} 12 | 13 | -- HELPER FUNCTIONS -- 14 | 15 | export randf = (s, e) -> 16 | return (math.random(0, (e - s) * 9999) / 10000) + s 17 | 18 | -- new methods for table 19 | 20 | table.merge = (t1, t2) -> 21 | t = t1 22 | for i = 1, #t2 23 | t[#t + 1] = t2[i] 24 | return t 25 | 26 | table.copy = (t) -> 27 | u = {k, v for k, v in pairs t} 28 | return setmetatable(u, getmetatable t) 29 | 30 | table.length = (T) -> 31 | count = 0 32 | count += 1 for _ in pairs T 33 | return count 34 | 35 | -- returns experience table for single network decision 36 | -- contains the state, action chosen, whether a reward was obtained, and the 37 | -- state that resulted from the action. This is later used to train the network 38 | -- Remember that the utility of an action is evaluated from the reward gained and 39 | -- the utility of the state it led to (recursive definition) 40 | export Experience = (state0, action0, reward0, state1) -> 41 | NewExperience = 42 | state0: state0 43 | action0: action0 44 | reward0: reward0 45 | state1: state1 46 | return NewExperience 47 | 48 | -- BRAIN 49 | 50 | Brain.init = (num_states, num_actions) -> 51 | -- Number of past state/action pairs input to the network. 0 = agent lives in-the-moment :) 52 | Brain.temporal_window = 2 53 | -- Maximum number of experiences that we will save for training 54 | Brain.experience_size = 30000 55 | -- experience necessary to start learning 56 | Brain.start_learn_threshold = 300 57 | -- gamma is a crucial parameter that controls how much plan-ahead the agent does. In [0,1] 58 | -- Determines the amount of weight placed on the utility of the state resulting from an action. 59 | Brain.gamma = 0.9 60 | -- number of steps we will learn for 61 | Brain.learning_steps_total = 100000 62 | -- how many steps of the above to perform only random actions (in the beginning)? 63 | Brain.learning_steps_burnin = 300 64 | -- controls exploration exploitation tradeoff. Will decay over time 65 | -- a higher epsilon means we are more likely to choose random actions 66 | Brain.epsilon = 1.0 67 | -- what epsilon value do we bottom out on? 0.0 => purely deterministic policy at end 68 | Brain.epsilon_min = 0.05 69 | -- what epsilon to use when learning is turned off. This is for testing 70 | Brain.epsilon_test_time = 0.01 71 | 72 | [[== states and actions that go into neural net: 73 | (state0,action0),(state1,action1), ... , (stateN) 74 | this variable controls the size of that temporal window. 
75 | ]] 76 | Brain.net_inputs = (num_states + num_actions) * Brain.temporal_window + num_states 77 | Brain.hidden_nodes = 16 78 | Brain.num_states = num_states 79 | Brain.num_actions = num_actions 80 | Brain.net_outputs = Brain.num_actions 81 | 82 | [[== Window size dictates the number of states, actions, rewards, and net inputs that we 83 | save. The temporal window size is the number of time states/actions that are input 84 | to the network and must be smaller than or equal to window_size 85 | ]] 86 | Brain.window_size = math.max Brain.temporal_window, 2 87 | 88 | -- advanced feature. Sometimes a random action should be biased towards some values 89 | -- for example in flappy bird, we may want to choose to not flap more often 90 | Brain.random_action_distribution = {} 91 | if table.length(Brain.random_action_distribution) > 0 92 | -- this better sum to 1 by the way, and be of length this.num_actions 93 | if table.length(Brain.random_action_distribution) != Brain.num_actions 94 | print 'TROUBLE. random_action_distribution should be same length as num_actions.' 95 | 96 | s = 0.0 97 | 98 | for k = 1, table.length Brain.random_action_distribution 99 | s += Brain.random_action_distribution[k] 100 | 101 | if math.abs(s - 1.0) > 0.0001 102 | print 'TROUBLE. random_action_distribution should sum to 1!' 103 | 104 | 105 | -- define architecture 106 | Brain.net = nn.Sequential! 107 | 108 | Brain.net\add nn.Linear Brain.net_inputs, Brain.hidden_nodes 109 | Brain.net\add nn.Threshold 0, 0 110 | 111 | Brain.net\add nn.Linear Brain.hidden_nodes, Brain.hidden_nodes 112 | Brain.net\add nn.Threshold 0, 0 113 | 114 | Brain.net\add nn.Linear Brain.hidden_nodes, Brain.net_outputs 115 | 116 | Brain.net\cuda! -- move network to GPU 117 | 118 | Brain.criterion = nn.MSECriterion!\cuda! 119 | 120 | 121 | -- other learning parameters 122 | Brain.learning_rate = 0.01 123 | Brain.learning_rate_decay = 5e-7 124 | Brain.batch_size = 16 125 | Brain.momentum = 0.9 126 | 127 | -- various housekeeping variables 128 | Brain.age = 0 -- incremented every backward! 129 | 130 | -- number of times we've called forward - lets us know when our input temporal 131 | -- window is filled up 132 | Brain.forward_passes = 0 133 | Brain.learning = true 134 | 135 | -- coefficients for regression 136 | Brain.coefL1 = 0.001 137 | Brain.coefL2 = 0.001 138 | 139 | -- parameters for optim.sgd 140 | Brain.parameters, Brain.gradParameters = Brain.net\getParameters! 141 | 142 | -- These windows track old experiences, states, actions, rewards, and net inputs 143 | -- over time. They should all start out as empty with a fixed size. 144 | -- This is a first in, last out data structure that is shifted along time 145 | Brain.experience = {} 146 | Brain.state_window = {} 147 | Brain.action_window = {} 148 | Brain.reward_window = {} 149 | Brain.net_window = {} 150 | for i = 1, Brain.window_size 151 | Brain.state_window[i] = {} 152 | Brain.action_window[i] = {} 153 | Brain.reward_window[i] = {} 154 | Brain.net_window[i] = {} 155 | 156 | -- a bit of a helper function. It returns a random action 157 | -- we are abstracting this away because in future we may want to 158 | -- do more sophisticated things. For example some actions could be more 159 | -- or less likely at "rest"/default state. 160 | Brain.random_action = -> 161 | -- if we don't have a random action distribution defined then sample evenly 162 | if table.length(Brain.random_action_distribution) == 0 163 | return (torch.random! 
% Brain.net_outputs) + 1 164 | 165 | -- okay, lets do some fancier sampling: 166 | else 167 | p = randf 0, 1 168 | cumprob = 0.0 169 | 170 | for k = 1, Brain.num_actions 171 | cumprob += Brain.random_action_distribution[k] 172 | 173 | if p < cumprob 174 | return k 175 | 176 | -- compute the value of doing any action in this state 177 | -- and return the argmax action and its value 178 | Brain.policy = (state) -> 179 | tensor_state = torch.Tensor(state)\cuda! 180 | action_values = Brain.net\forward tensor_state 181 | 182 | maxval = action_values[1] 183 | max_index = 1 184 | 185 | -- find maximum output and note its index and value 186 | --max_index = i for i = 2, Brain.net_outputs when action_values[i] > action_values[max_index] 187 | for i = 2, Brain.net_outputs 188 | if action_values[i] > maxval 189 | maxval = action_values[i] 190 | max_index = i 191 | 192 | return action: max_index, value: maxval 193 | 194 | -- This function assembles the input to the network by concatenating 195 | -- old (state, chosen_action) pairs along with the current state 196 | -- return s = (x,a,x,a,x,a,xt) state vector. 197 | Brain.getNetInput = (xt) -> 198 | w = {} 199 | w = table.merge(w, xt) -- start with current state 200 | 201 | -- and now go backwards and append states and actions from history temporal_window times 202 | n = Brain.window_size + 1 203 | for k = 1, Brain.temporal_window do 204 | -- state 205 | w = table.merge w, Brain.state_window[n - k] 206 | -- action, encoded as 1-of-k indicator vector. We scale it up a bit because 207 | -- we don't want weight regularization to undervalue this information, as it only exists once 208 | action1ofk = {} 209 | action1ofk[i] = 0 for i = 1, Brain.num_actions 210 | 211 | -- assign action taken for current state to be 1, all others are 0 212 | action1ofk[Brain.action_window[n - k]] = 1.0 * Brain.num_states 213 | 214 | w = table.merge w, action1ofk 215 | 216 | return w 217 | 218 | -- This function computes an action by either: 219 | -- 1. Giving the current state and past (state, action) pairs to the network 220 | -- and letting it choose the best acction 221 | -- 2. Choosing a random action 222 | Brain.forward = (input_array) -> 223 | Brain.forward_passes += 1 224 | 225 | local action, net_input 226 | 227 | -- if we have enough (state, action) pairs in our memory to fill up 228 | -- our network input then we'll proceed to let our network choose the action 229 | if Brain.forward_passes > Brain.temporal_window 230 | net_input = Brain.getNetInput input_array 231 | 232 | -- if learning is turned on then epsilon should be decaying 233 | if Brain.learning 234 | -- compute (decaying) epsilon for the epsilon-greedy policy 235 | new_epsilon = 1.0 - (Brain.age - Brain.learning_steps_burnin)/(Brain.learning_steps_total - Brain.learning_steps_burnin) 236 | 237 | -- don't let epsilon go above 1.0 238 | Brain.epsilon = math.min(1.0, math.max(Brain.epsilon_min, new_epsilon)) 239 | else 240 | -- if learning is turned off then use the epsilon we've specified for testing 241 | Brain.epsilon = Brain.epsilon_test_time 242 | 243 | -- use epsilon probability to choose whether we use network action or random action 244 | if randf(0, 1) < Brain.epsilon 245 | action = Brain.random_action! 246 | else 247 | -- otherwise use our policy to make decision 248 | best_action = Brain.policy net_input 249 | action = best_action.action -- this is the action number 250 | else 251 | -- pathological case that happens first few iterations when we can't 252 | -- fill up our network inputs. 
Just default to random action in this case 253 | net_input = {} 254 | action = Brain.random_action! 255 | 256 | -- shift the network input, state, and action chosen into our windows 257 | table.remove Brain.net_window, 1 258 | table.insert Brain.net_window, net_input 259 | 260 | table.remove Brain.state_window, 1 261 | table.insert Brain.state_window, input_array 262 | 263 | table.remove Brain.action_window, 1 264 | table.insert Brain.action_window, action 265 | 266 | return action 267 | 268 | -- This function trains the network using the reward resulting from the last action 269 | -- It will save this past experience which consists of: 270 | -- the state, action chosen, whether a reward was obtained, and the 271 | -- state that resulted from the action 272 | -- After that, it will train the network (using a batch of experiences) using a 273 | -- random sampling of our entire experience history. 274 | Brain.backward = (reward) -> 275 | -- add reward to our history 276 | table.remove Brain.reward_window, 1 277 | table.insert Brain.reward_window, reward 278 | 279 | -- if learning is turned off then don't do anything 280 | return unless Brain.learning 281 | 282 | Brain.age += 1 283 | 284 | -- if we've had enough states and actions to fill up our net input then add 285 | -- this new experience to our history 286 | if Brain.forward_passes > Brain.temporal_window + 1 287 | -- make experience and fill it up 288 | e = Experience nil, nil, nil, nil 289 | n = Brain.window_size 290 | e.state0 = Brain.net_window[n - 1] 291 | e.action0 = Brain.action_window[n - 1] 292 | e.reward0 = Brain.reward_window[n - 1] 293 | e.state1 = Brain.net_window[n] 294 | 295 | -- if our experience table isn't larger than the max size then expand 296 | if table.length(Brain.experience) < Brain.experience_size 297 | table.insert Brain.experience, e 298 | -- Otherwise replace random experience. finite memory! 299 | else 300 | ri = torch.random 1, Brain.experience_size 301 | Brain.experience[ri] = e 302 | 303 | -- if we have enough experience in memory then start training 304 | if table.length(Brain.experience) > Brain.start_learn_threshold 305 | inputs = torch.Tensor(Brain.batch_size, Brain.net_inputs)\cuda! 306 | targets = torch.Tensor(Brain.batch_size, Brain.net_outputs)\cuda! 307 | 308 | for k = 1, Brain.batch_size 309 | -- choose random experience 310 | re = math.random 1, table.length Brain.experience 311 | e = Brain.experience[re] 312 | 313 | -- copy state from experience 314 | x = torch.Tensor(e.state0)\cuda! 315 | 316 | -- compute best action for the new state 317 | best_action = Brain.policy e.state1 318 | 319 | -- get current action output values 320 | -- we want to make the target outputs the same as the actual outputs 321 | -- expect for the action that was chose - we want to replace this with 322 | -- the reward that was obtained + the utility of the resulting state 323 | all_outputs = Brain.net\forward x 324 | inputs[k] = x\clone! 325 | targets[k] = all_outputs\clone! 326 | targets[k][e.action0] = e.reward0 + Brain.gamma * best_action.value 327 | 328 | -- create training function to give to optim.sgd 329 | feval = (x) -> 330 | collectgarbage! 331 | 332 | -- get new network parameters 333 | Brain.parameters\copy x unless x == Brain.parameters 334 | 335 | -- reset gradients 336 | Brain.gradParameters\zero! 
337 | 338 | -- evaluate function for complete mini batch 339 | outputs = Brain.net\forward inputs 340 | f = Brain.criterion\forward outputs, targets 341 | 342 | -- estimate df/dW 343 | df_do = Brain.criterion\backward outputs, targets 344 | Brain.net\backward inputs, df_do 345 | 346 | -- penalties (L1 and L2): 347 | if Brain.coefL1 != 0 or Brain.coefL2 != 0 348 | -- locals: 349 | norm,sign = torch.norm, torch.sign 350 | 351 | -- Loss: 352 | f += Brain.coefL1 * norm Brain.parameters, 1 353 | f += Brain.coefL2 * 0.5 * norm(Brain.parameters, 2) ^ 2 354 | 355 | -- Gradients: 356 | Brain.gradParameters\add(sign(Brain.parameters)\mul(Brain.coefL1) + Brain.parameters\clone!\mul Brain.coefL2) 357 | 358 | -- return f and df/dX 359 | return f, Brain.gradParameters 360 | 361 | -- fire up optim.sgd 362 | sgdState = 363 | learningRate: Brain.learning_rate 364 | momentum: Brain.momentum 365 | learningRateDecay: Brain.learning_rate_decay 366 | 367 | optim.sgd feval, Brain.parameters, sgdState 368 | 369 | 370 | 371 | -- export 372 | return Brain -------------------------------------------------------------------------------- /test.lua: -------------------------------------------------------------------------------- 1 | require 'xlua' 2 | local Brain = require 'deepqlearn' 3 | 4 | function randtable(size, startnum, endnum) 5 | local rtable = {} 6 | for i = 1, size do 7 | rtable[i+1] = randf(startnum, endnum) 8 | end 9 | 10 | return rtable 11 | end 12 | 13 | -- simple test found in readme.md 14 | num_outcomes = 3 15 | 16 | 17 | Brain.init(num_outcomes, num_outcomes) 18 | nb_train = 1000 19 | nb_test = 1000 20 | 21 | for k = 0, nb_train do 22 | rand_outcome = math.random(1, num_outcomes) 23 | state = randtable(num_outcomes, rand_outcome, rand_outcome + 1) 24 | 25 | xlua.progress(k, nb_train) 26 | 27 | newstate = table.copy(state) -- make a deep copy 28 | action = Brain.forward(newstate); -- returns index of chosen action 29 | 30 | reward = (action == rand_outcome) and 1 or 0 31 | 32 | Brain.backward(reward); -- learning magic happens 33 | end 34 | 35 | Brain.epsilon_test_time = 0.0; -- don't make any more random choices 36 | Brain.learning = false; 37 | 38 | 39 | -- get an optimal action from the learned policy 40 | local cnt = 0 41 | for k = 1, nb_test do 42 | xlua.progress(k, nb_test) 43 | 44 | rand_outcome = math.random(1, num_outcomes) 45 | state = randtable(num_outcomes, rand_outcome, rand_outcome + 1) 46 | 47 | 48 | newstate = table.copy(state) 49 | output = Brain.forward(newstate) 50 | if rand_outcome == output then 51 | cnt = cnt + 1 52 | end 53 | 54 | end 55 | 56 | print("Test cases correct: " .. tostring(100 * cnt/nb_test) .. 
" %") 57 | 58 | -------------------------------------------------------------------------------- /test.moon: -------------------------------------------------------------------------------- 1 | require 'xlua' 2 | Brain = require 'deepqlearn' 3 | 4 | randtable = (size, startnum, endnum) -> 5 | rtable = {} 6 | for i = 1, size 7 | rtable[i + 1] = randf startnum, endnum 8 | 9 | return rtable 10 | 11 | -- simple test found in readme.md 12 | num_outcomes = 3 13 | 14 | 15 | Brain.init num_outcomes, num_outcomes 16 | nb_train = 1000 17 | nb_test = 1000 18 | 19 | for k = 0, nb_train 20 | rand_outcome = math.random 1, num_outcomes 21 | state = randtable num_outcomes, rand_outcome, rand_outcome + 1 22 | 23 | xlua.progress k, nb_train 24 | 25 | newstate = table.copy state -- make a deep copy 26 | action = Brain.forward newstate -- returns index of chosen action 27 | 28 | reward = (action == rand_outcome) and 1 or 0 29 | 30 | Brain.backward reward -- learning magic happens 31 | 32 | Brain.epsilon_test_time = 0.0 -- don't make any more random choices 33 | Brain.learning = false 34 | 35 | 36 | -- get an optimal action from the learned policy 37 | cnt = 0 38 | for k = 1, nb_test 39 | xlua.progress k, nb_test 40 | 41 | rand_outcome = math.random 1, num_outcomes 42 | state = randtable num_outcomes, rand_outcome, rand_outcome + 1 43 | 44 | newstate = table.copy state 45 | output = Brain.forward newstate 46 | 47 | cnt += 1 if rand_outcome == output 48 | 49 | print "Test cases correct: #{tostring(100 * cnt/nb_test)} %" --------------------------------------------------------------------------------