├── .gitignore
├── README.md
├── development.txt
├── results.txt
├── scripts
│   ├── __init__.py
│   ├── agent.py
│   ├── async_rl.py
│   ├── aws_s3_utility.py
│   ├── experiment.py
│   ├── file_utils.py
│   ├── learning_utils.py
│   ├── logger.py
│   ├── mdps.py
│   ├── policy.py
│   ├── qnetwork.py
│   ├── recurrent_qnetwork.py
│   ├── replay_memory.py
│   └── state_adapters.py
└── tests
    ├── __init__.py
    ├── run_tests.py
    ├── test_aws_s3_utility.py
    ├── test_build_network.py
    ├── test_experiment.py
    ├── test_learning_utils.py
    ├── test_logger.py
    ├── test_mdps.py
    ├── test_neural_agent.py
    ├── test_policy.py
    ├── test_qnetwork.py
    ├── test_recurrent_qnetwork.py
    ├── test_replay_memory.py
    └── test_state_adapters.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | 
5 | # C extensions
6 | *.so
7 | 
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | 
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 | 
37 | # Translations
38 | *.mo
39 | 
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | 
45 | # Rope
46 | .ropeproject
47 | 
48 | # Django stuff:
49 | *.log
50 | *.pot
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | 
55 | # mac
56 | .DS_Store
57 | 
58 | # binaries
59 | *.bin
60 | *.out
61 | 
62 | # emacs
63 | *~
64 | 
65 | # compile
66 | /compile
67 | 
68 | # aws keys
69 | *.key
70 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hierarchical_rl
2 | [paper](https://wulfebw.github.io/assets/CS239_Final_Paper.pdf)
--------------------------------------------------------------------------------
/development.txt:
--------------------------------------------------------------------------------
1 | tonight
2 | -------
3 | 1. decide between cnn vs flat
4 | 2. compressor network (compression loss as incentive, as temp for softmax)
5 | 3. recurrent network
6 | 4. stacked recurrent network
7 | 5. batchnorm
8 | 6. prioritized experience replay
9 | - sliding window -> if sufficiently large, and sampling higher-td experiences, may not need to save in memory for longer
10 | - this would require feedback from training to the memory
11 | 7. stacked recurrent with different lengths of BPTT
12 | 8. why does the value image have all the same values sometimes?
13 | 9. graph the weight updates vs the size of the weights -> should be a ratio of 1e-3
14 | 10. plot neural network weights / activations / figure out what visualizations work
15 | 11. maze just showing the current room
16 | - in the maze problem, just show it the current room!!! and it has to figure it out from that
17 | - need some distinguishing markers for each room? that's available in MR, but not with a one-hot room matrix
18 | 12. maze with key
19 | 13. read papers
20 | 14. should the loss behave differently? why does it go to zero so quickly?
21 | - why is it bimodal or something?
22 | - bimodal probably b/c sometimes the batch contains the reward pos and sometimes not?
23 | 15. improve visualizations
24 | - some method of showing how the different runs did
25 | 16. do they perform training after every step? or is it every once in a while or what?
26 | 17. should I think of a better mdp?
27 | 18. some notion of convergence in the test mdps
28 | - rather than just until "10 in a row or something"
29 | 19. clip TD error?
30 | - should keep track of it
31 | - isn't this just the loss?
32 | 20.
33 | 
34 | ---
35 | problems with qnet
36 | ---
37 | 1. values go to nan sometimes. why? (particularly with rmsprop, it does so immediately -> why?)
38 | 2. need regularization?
39 | 3. need to make tests checking outputs given set weights and inputs
40 | 5. maybe need to be taking random actions till the replay memory is full
41 | 6. check that the weights are sensible after training
42 | 
43 | ---
44 | papers to read (again)
45 | ---
46 | - prioritized experience replay
47 | - compressor network
48 | - hybrid arch
49 | - algorithmic information theory CM
50 | - Memory-based control with recurrent neural networks
51 | - Deep Recurrent Q-Learning for Partially Observable MDPs
52 | 
53 | 
--------------------------------------------------------------------------------
/results.txt:
--------------------------------------------------------------------------------
1 | ---
2 | results
3 | ---
4 | - this is a list of interesting observations made while training
5 | 
6 | ---
7 | sgd vs adam
8 | ---
9 | - on a one-room maze, sgd will fail to find the optimal policy whereas adam finds it quickly
10 | 
11 | ---
12 | small replay memory vs large replay memory
13 | ---
14 | - on the small maze, if you try out a capacity=1000 vs capacity=100000 replay memory, it makes a huge difference
15 | 
16 | ---
17 | numeric vs one-hot state representation
18 | ---
19 | - numeric is much worse than one-hot; one-hot is rote learning, however
20 | 
21 | ---
22 | 5 vs 10 size single room
23 | ---
24 | - how much more difficult is the 10 vs the 5?
25 | 
26 | ---
27 | 10 size room vs 5 size, 2 room maze
28 | ---
29 | - how much more difficult do walls make the task?
30 | 
31 | ---
32 | conv vs dense one-hot representations
33 | ---
34 | - how does passing the one-hot input as an array to a conv net compare in performance to the flattened-array, dense version?
35 | 
36 | ---
37 | number of hidden units
38 | ---
39 | - what impact does the number of hidden units have?
40 | - it seems that a lower number of units tends to get stuck in local optima that are relatively poor compared to the larger networks (both should be more than sufficient to express the value function?)
41 | 
42 | ---
43 | number of hidden layers
44 | ---
45 | - fewer seems to work better for the large maze - likely due to a poor learning setup for the larger networks
46 | - possible that it also just takes less time
47 | - batch norm might improve this
48 | - this was also run on cpu on a laptop, so it's also likely due to a lack of memories or something
49 | 
50 | ---
51 | prioritized experience replay vs none
52 | ---
53 | - how much better does it do?
54 | 
55 | ---
56 | increasing batch size increases stability
57 | ---
58 | - increasing the batch size improves learning
59 | 
60 | ---
61 | should you regularize the q network?
62 | ---
63 | - ? i'm not really sure at all
64 | - seems that it does not make a huge difference
65 | - you would think it might, given that it could overfit the data in the experience replay
66 | - a small value of 1e-4 seems to work well
67 | 
68 | ---
69 | clipping td error (i.e., the loss)
70 | ---
71 | - does clipping the td error make the loss graphs look pretty / smooth?
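- a minimal sketch of what clipping could look like (illustrative only; the function name and clip threshold are made up, and qnetwork.py may handle this differently):

    import numpy as np

    def clipped_td_loss(targets, predictions, clip=1.0):
        # bound the raw TD error so a few large transitions cannot dominate the gradient
        # (clipping the error before squaring is similar in spirit to a Huber loss)
        diff = np.clip(targets - predictions, -clip, clip)
        return 0.5 * np.mean(diff ** 2)
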
-------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wulfebw/hierarchical_rl/0156dd7b1675a0c3a3b7d81cb66721cbba406e28/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/agent.py: -------------------------------------------------------------------------------- 1 | 2 | import collections 3 | import numpy as np 4 | import random 5 | import theano 6 | 7 | import logger 8 | 9 | class Agent(object): 10 | 11 | def step(self, next_state, reward): 12 | """ 13 | :description: this method implements the agents deciding which action to take and updating its parameters 14 | """ 15 | raise NotImplementedError("Override me") 16 | 17 | def start_episode(self, state): 18 | """ 19 | :description: initializes an agent for an episode and returns an initial action to take 20 | """ 21 | raise NotImplementedError("Override me") 22 | 23 | def finish_episode(self, next_state, reward): 24 | """ 25 | :description: finalizes an episode for an agent 26 | """ 27 | 28 | def finish_epoch(self, epoch): 29 | """ 30 | :description: performs logging tasks at the end of an epoch 31 | """ 32 | raise NotImplementedError("Override me") 33 | 34 | def start_testing(self): 35 | pass 36 | 37 | def finish_testing(self): 38 | pass 39 | 40 | 41 | class TestAgent(Agent): 42 | 43 | def __init__(self, num_actions): 44 | self.actions = range(num_actions) 45 | self.steps = 0 46 | self.episodes = 0 47 | 48 | def step(self, next_state, reward): 49 | self.steps += 1 50 | return random.choice(self.actions) 51 | 52 | def start_episode(self, state): 53 | self.episodes += 1 54 | return random.choice(self.actions) 55 | 56 | def finish_episode(self, next_state, reward): 57 | pass 58 | 59 | def finish_epoch(self, epoch): 60 | pass 61 | 62 | 63 | class QLearningAgent(Agent): 64 | 65 | def __init__(self, num_actions, discount, exploration_prob, step_size, logging=True): 66 | self.actions = range(num_actions) 67 | self.discount = discount 68 | self.exploration_prob = exploration_prob 69 | self.step_size = step_size 70 | self.num_iters = 1 71 | self.weights = collections.Counter() 72 | self.logger = logger.Logger(agent_name='QLearningAgent', logging=logging) 73 | self.prev_state = None 74 | self.prev_action = None 75 | 76 | def step(self, next_state, reward): 77 | self.incorporate_feedback(self.prev_state, self.prev_action, reward, next_state, False) 78 | action = self.get_action(next_state) 79 | 80 | self.prev_state = next_state 81 | self.prev_action = action 82 | 83 | self.logger.log_action(action) 84 | self.logger.log_reward(reward) 85 | return action 86 | 87 | def feature_extractor(self, state, action): 88 | """ 89 | :description: this is the identity feature extractor, so we use tables here for the function 90 | """ 91 | return [((state, action), 1)] 92 | 93 | def getQ(self, state, action): 94 | """ 95 | :description: returns the Q value associated with this state-action pair 96 | 97 | :type state: numpy array 98 | :param state: the state of the game 99 | 100 | :type action: int 101 | :param action: the action for which to retrieve the Q-value 102 | """ 103 | score = 0 104 | for f, v in self.feature_extractor(state, action): 105 | score += self.weights[f] * v 106 | return score 107 | 108 | def get_action(self, state): 109 | """ 110 | :description: returns an action accoridng to epsilon-greedy policy 111 | 112 | :type state: 
dictionary 113 | :param state: the state of the game 114 | """ 115 | self.num_iters += 1 116 | 117 | if random.random() < self.exploration_prob: 118 | return random.choice(self.actions) 119 | else: 120 | max_action = max((self.getQ(state, action), action) for action in self.actions)[1] 121 | return max_action 122 | 123 | def incorporate_feedback(self, state, action, reward, next_state, terminal): 124 | """ 125 | :description: performs a Q-learning update 126 | 127 | :type reward: float 128 | :param reward: reward associated with transitioning to next_state 129 | 130 | :type next_state: numpy array 131 | :param next_state: the new state of the game 132 | """ 133 | step_size = self.step_size 134 | prediction = self.getQ(state, action) 135 | target = reward 136 | if not terminal: 137 | target += self.discount * max(self.getQ(next_state, next_action) for next_action in self.actions) 138 | 139 | diff = target - prediction 140 | loss = .5 * diff ** 2 141 | for f, v in self.feature_extractor(state, action): 142 | self.weights[f] = self.weights[f] + step_size * diff * v 143 | 144 | self.logger.log_loss(loss) 145 | self.logger.log_weights(self.weights) 146 | 147 | def start_episode(self, state): 148 | self.prev_state = state 149 | self.prev_action = self.get_action(state) 150 | 151 | self.logger.log_action(self.prev_action) 152 | return self.prev_action 153 | 154 | def finish_episode(self, next_state, reward): 155 | self.incorporate_feedback(self.prev_state, self.prev_action, reward, next_state, True) 156 | self.logger.finish_episode() 157 | 158 | def finish_epoch(self, epoch): 159 | self.logger.log_epoch(epoch) 160 | 161 | class NeuralAgent(Agent): 162 | """ 163 | :description: A class that wraps a network so it may more easily interact with an experiment. 164 | """ 165 | 166 | def __init__(self, network, policy, replay_memory, log, state_adapter): 167 | """ 168 | :type network: a network class (see e.g., qnetwork.py) 169 | :param network: the network the agent uses to evaluate states 170 | 171 | :type policy: a policy class (see policy.py) 172 | :param policy: a class that decides which action to take given the values of those actions 173 | 174 | :type replay_memory: replay memory class (see replay_memory.py) 175 | :param replay_memory: replay memory used to store dataset as it is gathered. 176 | """ 177 | 178 | self.network = network 179 | self.policy = policy 180 | self.replay_memory = replay_memory 181 | self.logger = log 182 | self.logger.log_hyperparameters(network, policy, replay_memory) 183 | self.state_adapter = state_adapter 184 | 185 | self.prev_state = None 186 | self.prev_action = None 187 | 188 | def step(self, next_state, reward): 189 | """ 190 | :description: the primary method of this class, which 'steps' the agent and network forward one time step. This includes selecting an action, making use of the new state and reward, and performing training. 
191 | 192 | :type next_state: tuple or array 193 | :param next_state: the next state observed (i.e., s') 194 | 195 | :type reward: int 196 | :param reward: the reward associated with having moved from the previous state to the current state 197 | 198 | :type rval: int 199 | :param rval: returns the action to next be taken within the environment 200 | """ 201 | # need to transform an external state format to an internal one 202 | next_state = self.state_adapter.convert_state_to_agent_format(next_state) 203 | 204 | # store current (s,a,r,s') tuple 205 | self.replay_memory.store((self.prev_state, self.prev_action, reward, next_state, 0)) 206 | 207 | # perform training 208 | self.train() 209 | 210 | # retrieve an action 211 | action = self.get_action(next_state) 212 | 213 | # set previous values 214 | self.prev_state = next_state 215 | self.prev_action = action 216 | 217 | # log information 218 | self.logger.log_reward(reward) 219 | self.logger.log_action(self.prev_action) 220 | 221 | return action 222 | 223 | def train(self): 224 | """ 225 | :description: collects a minibatch of experiences and passes them to the network to train 226 | """ 227 | # wait until replay memory has samples 228 | if not self.replay_memory.is_full(): 229 | return 230 | 231 | # collect minibatch 232 | states, actions, rewards, next_states, terminals = self.replay_memory.sample_batch() 233 | 234 | # pass to network to perform training 235 | loss = self.network.train(states, actions, rewards, next_states, terminals) 236 | self.logger.log_loss(loss) 237 | 238 | def get_action(self, state): 239 | """ 240 | :description: gets an action given the current state. Defers to the network for selecting the action. 241 | 242 | :type state: numpy array 243 | :param state: the state used to determine the action 244 | """ 245 | q_values = self.network.get_q_values(state) 246 | return self.policy.choose_action(q_values) 247 | 248 | def start_episode(self, state): 249 | """ 250 | description: determines the first action to take and initializes internal variables 251 | """ 252 | self.prev_state = self.state_adapter.convert_state_to_agent_format(state) 253 | self.prev_action = self.get_action(self.prev_state) 254 | 255 | self.logger.log_action(self.prev_action) 256 | return self.prev_action 257 | 258 | def finish_episode(self, next_state, reward): 259 | """ 260 | :description: perform tasks at the end of episode 261 | """ 262 | 263 | terminal = 1 264 | next_state = self.state_adapter.convert_state_to_agent_format(next_state) 265 | self.replay_memory.store((self.prev_state, self.prev_action, reward, next_state, terminal)) 266 | self.logger.log_reward(reward) 267 | self.logger.finish_episode() 268 | 269 | def finish_epoch(self, epoch): 270 | """ 271 | :description: perform tasks at the end of an epoch 272 | """ 273 | self.logger.log_epoch(epoch, self.network, self.policy) 274 | 275 | def get_q_values(self, state): 276 | """ 277 | :description: returns the q values associated with a given state. Used for printing out a representation of the mdp with the values included. 278 | """ 279 | state = self.state_adapter.convert_state_to_agent_format(state) 280 | q_values = self.network.get_q_values(state) 281 | return q_values 282 | 283 | class RecurrentNeuralAgent(Agent): 284 | """ 285 | :description: A class that wraps a recuurent network so it may more easily 286 | interact with an experiment. 
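    Unlike NeuralAgent, this agent stores (state, action, reward, terminal) tuples in its
    replay memory and rebuilds the most recent state sequence (via make_last_sequence)
    before querying the network, and it acts randomly until the replay memory is full.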
287 | """ 288 | def __init__(self, network, policy, replay_memory, state_adapter, log): 289 | self.network = network 290 | self.policy = policy 291 | self.replay_memory = replay_memory 292 | self.logger = log 293 | self.logger.log_hyperparameters(network, policy, replay_memory) 294 | self.state_adapter = state_adapter 295 | 296 | self.prev_state = None 297 | self.prev_action = None 298 | 299 | def step(self, next_state, reward): 300 | """ 301 | :description: the primary method of this class, which 'steps' the agent and network forward one time step. This includes selecting an action, making use of the new state and reward, and performing training. 302 | 303 | :type next_state: tuple or array 304 | :param next_state: the next state observed (i.e., s') 305 | 306 | :type reward: int 307 | :param reward: the reward associated with having moved from the previous state to the current state 308 | 309 | :type rval: int 310 | :param rval: returns the action to next be taken within the environment 311 | """ 312 | # need to transform an external state format to an internal one 313 | next_state = self.state_adapter.convert_state_to_agent_format(next_state) 314 | 315 | # store current (s,a,r,s') tuple 316 | self.replay_memory.store(self.prev_state, self.prev_action, reward, terminal=False) 317 | 318 | # perform training 319 | self.train() 320 | 321 | # retrieve an action 322 | action = self.get_action(next_state) 323 | 324 | # set previous values 325 | self.prev_state = next_state 326 | self.prev_action = action 327 | 328 | # log information 329 | self.logger.log_reward(reward) 330 | self.logger.log_action(self.prev_action) 331 | 332 | return action 333 | 334 | def train(self): 335 | """ 336 | :description: collects a minibatch of experiences and passes them to the network to train 337 | """ 338 | # wait until replay memory has samples 339 | if not self.replay_memory.is_full(): 340 | return 341 | 342 | # collect minibatch 343 | states, actions, rewards, next_states, terminals = self.replay_memory.sample_batch() 344 | 345 | # pass to network to perform training 346 | loss = self.network.train(states, actions, rewards, next_states, terminals) 347 | self.logger.log_loss(loss) 348 | 349 | def get_action(self, state): 350 | """ 351 | :description: gets an action given the current state. Defers to the network for selecting the action. 352 | 353 | :type state: numpy array 354 | :param state: the state used to determine the action 355 | """ 356 | # wait until agent starts learning to use network to decide action 357 | if not self.replay_memory.is_full(): 358 | return self.policy.random_action() 359 | 360 | sequence = self.replay_memory.make_last_sequence(state) 361 | q_values = self.network.get_q_values(sequence) 362 | return self.policy.choose_action(q_values) 363 | 364 | def start_episode(self, state): 365 | """ 366 | description: determines the first action to take and initializes internal variables 367 | """ 368 | self.prev_state = self.state_adapter.convert_state_to_agent_format(state) 369 | self.prev_action = self.get_action(self.prev_state) 370 | 371 | self.logger.log_action(self.prev_action) 372 | return self.prev_action 373 | 374 | def finish_episode(self, next_state, reward): 375 | """ 376 | :description: perform tasks at the end of episode. We don't store the next_state value 377 | because the previous state must have been a terminal one. It's in the method 378 | definition to stay consistent with the other replay memory implementation. 
379 | """ 380 | self.replay_memory.store(self.prev_state, self.prev_action, reward, True) 381 | self.logger.log_reward(reward) 382 | self.logger.finish_episode() 383 | 384 | def finish_epoch(self, epoch): 385 | """ 386 | :description: perform tasks at the end of an epoch 387 | """ 388 | self.logger.log_epoch(epoch, self.network, self.policy) 389 | 390 | def get_q_values(self, state): 391 | """ 392 | :description: returns the q values associated with a given state. Used for printing out a representation of the mdp with the values included. 393 | """ 394 | state = self.state_adapter.convert_state_to_agent_format(state) 395 | q_values = self.network.get_logging_q_values(state) 396 | return q_values 397 | 398 | -------------------------------------------------------------------------------- /scripts/async_rl.py: -------------------------------------------------------------------------------- 1 | 2 | import collections 3 | import copy 4 | import matplotlib.pyplot as plt 5 | from multiprocessing.pool import ThreadPool 6 | import numpy as np 7 | import random 8 | import sys 9 | import time 10 | 11 | import learning_utils 12 | 13 | # threading constants 14 | NUM_THREADS = 2 15 | 16 | # global variables for AsyncSarsa 17 | WEIGHTS = collections.defaultdict(lambda: 0) 18 | 19 | # global variables for AsyncAdvantageActorCritic 20 | WEIGHTS = collections.defaultdict(lambda: 0) 21 | VALUE_WEIGHTS = collections.defaultdict(lambda: 0) 22 | 23 | # logging 24 | REWARDS = [] 25 | START_STATE_VALUES = [] 26 | 27 | class MazeMDP(object): 28 | 29 | EXIT_REWARD = 1 30 | MOVE_REWARD = -.01 31 | ACTIONS = [(1,0),(-1,0),(0,1),(0,-1)] 32 | DISCOUNT = 1 33 | START_STATE = (0,0) 34 | 35 | def __init__(self, room_size, num_rooms): 36 | self.room_size = room_size 37 | self.num_rooms = num_rooms 38 | self.max_position = self.room_size * self.num_rooms - 1 39 | self.end_state = (self.max_position, self.max_position) 40 | self.computeStates() 41 | 42 | def calculate_next_state(self, state, action): 43 | return state[0] + action[0], state[1] + action[1] 44 | 45 | def runs_into_wall(self, state, action): 46 | next_state = self.calculate_next_state(state, action) 47 | 48 | # 1. check for leaving the maze 49 | if next_state[0] > self.max_position or next_state[0] < 0 \ 50 | or next_state[1] > self.max_position or next_state[1] < 0: 51 | return True 52 | 53 | # 2. check if movement was through doorway and if so return false 54 | doorway_position = (self.room_size) / 2 55 | # check horizontal movement through doorway 56 | if next_state[0] != state[0]: 57 | if next_state[1] % self.room_size == doorway_position: 58 | return False 59 | 60 | # check vertical movement through doorway 61 | if next_state[1] != state[1]: 62 | if next_state[0] % self.room_size == doorway_position: 63 | return False 64 | 65 | # 3. 
check if movement was through a wall 66 | room_size = self.room_size 67 | # move right to left through wall 68 | if state[0] % room_size == room_size - 1 and next_state[0] % room_size == 0: 69 | return True 70 | 71 | # move left to right through wall 72 | if next_state[0] % room_size == room_size - 1 and state[0] % room_size == 0: 73 | return True 74 | 75 | # move up through wall 76 | if state[1] % room_size == room_size - 1 and next_state[1] % room_size == 0: 77 | return True 78 | 79 | # move down through wall 80 | if next_state[1] % room_size == room_size - 1 and state[1] % room_size == 0: 81 | return True 82 | 83 | # if none of the above conditions meet, then have not passed through wall 84 | return False 85 | 86 | def succAndProbReward(self, state, action): 87 | 88 | # if we reach the end state then the episode ends 89 | if np.array_equal(state, self.end_state): 90 | return [] 91 | 92 | if self.runs_into_wall(state, action): 93 | # if the action runs us into a wall do nothing 94 | next_state = state 95 | else: 96 | # o/w determine the next position 97 | next_state = self.calculate_next_state(state, action) 98 | 99 | # if next state is exit, then set reward 100 | reward = self.MOVE_REWARD 101 | if np.array_equal(next_state, self.end_state): 102 | reward = self.EXIT_REWARD 103 | 104 | return [(next_state, 1, reward)] 105 | 106 | def computeStates(self): 107 | self.states = set() 108 | queue = [] 109 | self.states.add(self.START_STATE) 110 | queue.append(self.START_STATE) 111 | while len(queue) > 0: 112 | state = queue.pop() 113 | for action in self.ACTIONS: 114 | for newState, prob, reward in self.succAndProbReward(state, action): 115 | if newState not in self.states: 116 | self.states.add(newState) 117 | queue.append(newState) 118 | 119 | def print_state_values(self): 120 | V = {} 121 | for state in self.states: 122 | #state_value = max(WEIGHTS[(state, action)] for action in self.ACTIONS) 123 | state_value = VALUE_WEIGHTS[(state, None)] 124 | V[state] = state_value 125 | 126 | for ridx in reversed(range(self.max_position + 1)): 127 | for cidx in range(self.max_position + 1): 128 | if (ridx, cidx) in V: 129 | print '{0:.5f}'.format(V[(ridx, cidx)]), 130 | print('\n') 131 | 132 | class Experiment(object): 133 | 134 | def __init__(self, mdp, agent, num_episodes, max_steps): 135 | self.mdp = mdp 136 | self.agent = agent 137 | self.num_episodes = num_episodes 138 | self.max_steps = max_steps 139 | 140 | def run(self, agent_id): 141 | print 'running experiment with agent number {}...'.format(agent_id) 142 | 143 | total_rewards = [] 144 | total_reward = 0 145 | 146 | for episode in range(self.num_episodes): 147 | if episode % 100 == 0: 148 | print 'running episode {} for agent {}...'.format(episode, agent_id) 149 | state = self.mdp.START_STATE 150 | action = self.agent.get_action(state) 151 | 152 | for step in range(self.max_steps): 153 | transitions = self.mdp.succAndProbReward(state, action) 154 | 155 | if len(transitions) == 0: 156 | reward = 0 157 | new_state = None 158 | break 159 | 160 | new_state, prob, reward = transitions[0] 161 | total_reward += reward 162 | action = self.agent.incorporateFeedback(state, action, reward, new_state) 163 | state = new_state 164 | 165 | self.agent.incorporateFeedback(state, action, reward, new_state) 166 | total_rewards.append(total_reward) 167 | REWARDS.append(total_reward) 168 | #START_STATE_VALUES.append(max(WEIGHTS[((0,0), action)] for action in self.mdp.ACTIONS)) 169 | START_STATE_VALUES.append(VALUE_WEIGHTS[((0,0), None)]) 170 | total_reward = 0 
171 | 172 | print 'average reward of agent {}: {}'.format(agent_id, np.mean(total_rewards)) 173 | 174 | class MultithreadedExperiment(object): 175 | 176 | def __init__(self, experiment, num_agents): 177 | self.experiment = experiment 178 | self.num_agents = num_agents 179 | 180 | def run(self): 181 | pool = ThreadPool(self.num_agents) 182 | for idx in range(self.num_agents): 183 | pool.apply_async(self.run_experiement, args=(self.experiment, idx)) 184 | 185 | pool.close() 186 | pool.join() 187 | 188 | @staticmethod 189 | def run_experiement(experiment, agent_id): 190 | print 'starting experiment with agent number {}...'.format(agent_id) 191 | experiment_copy = copy.deepcopy(experiment) 192 | experiment.run(agent_id) 193 | 194 | class AsyncSarsa(object): 195 | 196 | def __init__(self, actions, discount, exploration_prob, learning_rate): 197 | self.actions = actions 198 | self.discount = discount 199 | self.exploration_prob = exploration_prob 200 | self.learning_rate = learning_rate 201 | self.num_iters = 0 202 | 203 | def feature_extractor(self, state, action): 204 | return [((state, action), 1)] 205 | 206 | def getQ(self, state, action): 207 | score = 0 208 | for f, v in self.feature_extractor(state, action): 209 | score += WEIGHTS[f] * v 210 | return score 211 | 212 | def get_action(self, state): 213 | self.num_iters += 1 214 | 215 | if self.exploration_prob > .05: 216 | self.exploration_prob -= 1e-8 217 | 218 | if random.random() < self.exploration_prob: 219 | action = random.choice(self.actions) 220 | else: 221 | action = max((self.getQ(state, action), action) for action in self.actions)[1] 222 | return action 223 | 224 | def incorporateFeedback(self, state, action, reward, new_state): 225 | prediction = self.getQ(state, action) 226 | target = reward 227 | new_action = None 228 | 229 | if new_state != None: 230 | new_action = self.get_action(new_state) 231 | target += self.discount * self.getQ(new_state, new_action) 232 | 233 | for f, v in self.feature_extractor(state, action): 234 | WEIGHTS[f] = WEIGHTS[f] + self.learning_rate * (target - prediction) * v 235 | 236 | return new_action 237 | 238 | class AsyncAdvantageActorCritic(object): 239 | 240 | def __init__(self, actions, discount, tau, learning_rate): 241 | self.actions = actions 242 | self.discount = discount 243 | self.tau = tau 244 | self.learning_rate = learning_rate 245 | self.num_iters = 0 246 | 247 | def feature_extractor(self, state, action=None): 248 | return [((state, action), 1)] 249 | 250 | def getV(self, state): 251 | score = 0 252 | for f, v in self.feature_extractor(state): 253 | score += VALUE_WEIGHTS[f] * v 254 | return score 255 | 256 | def getQ(self, state, action): 257 | score = 0 258 | for f, v in self.feature_extractor(state, action): 259 | score += WEIGHTS[f] * v 260 | return score 261 | 262 | def get_action(self, state): 263 | self.num_iters += 1 264 | # if self.tau > 1e-9: 265 | # self.tau *= .9999 266 | # print self.tau 267 | 268 | q_values = np.array([self.getQ(state, action) for action in self.actions]) 269 | exp_q_values = np.exp(q_values / (self.tau + 1e-2)) 270 | weights = dict() 271 | for idx, val in enumerate(exp_q_values): 272 | weights[idx] = val 273 | action_idx = learning_utils.weightedRandomChoice(weights) 274 | action = self.actions[action_idx] 275 | return action 276 | 277 | def incorporateFeedback(self, state, action, reward, new_state): 278 | prediction = self.getV(state) 279 | target = reward 280 | new_action = None 281 | 282 | if new_state != None: 283 | new_action = 
self.get_action(new_state) 284 | target += self.discount * self.getV(new_state) 285 | 286 | update = self.learning_rate * (target - prediction) 287 | for f, v in self.feature_extractor(state): 288 | VALUE_WEIGHTS[f] = VALUE_WEIGHTS[f] + 2 * update 289 | 290 | for f, v in self.feature_extractor(state, action): 291 | WEIGHTS[f] = WEIGHTS[f] + update * 1 292 | 293 | return new_action 294 | 295 | def plot_values(values, ylabel): 296 | values = np.mean(np.reshape(values, (-1, 4)), axis=1).reshape(-1) 297 | plt.scatter(range(len(values)), values) 298 | plt.xlabel('episodes (1 per actor-learner)') 299 | plt.ylabel(ylabel) 300 | plt.show() 301 | 302 | def run(): 303 | start = time.time() 304 | room_size = 5 305 | num_rooms = 2 306 | mdp = MazeMDP(room_size=room_size, num_rooms=num_rooms) 307 | # agent = AsyncSarsa(actions=mdp.ACTIONS, discount=mdp.DISCOUNT, 308 | # exploration_prob=0.3, learning_rate=.5) 309 | agent = AsyncAdvantageActorCritic(actions=mdp.ACTIONS, discount=mdp.DISCOUNT, 310 | tau=.3, learning_rate=.5) 311 | max_steps = (2 * room_size * num_rooms) ** 2 312 | experiment = Experiment(mdp=mdp, agent=agent, num_episodes=800, max_steps=max_steps) 313 | multiexperiment = MultithreadedExperiment(experiment=experiment, num_agents=NUM_THREADS) 314 | multiexperiment.run() 315 | end = time.time() 316 | print 'took {} seconds'.format(end - start) 317 | mdp.print_state_values() 318 | plot_values(REWARDS, 'rewards') 319 | plot_values(START_STATE_VALUES, 'start state value') 320 | 321 | 322 | 323 | if __name__ =='__main__': 324 | run() -------------------------------------------------------------------------------- /scripts/aws_s3_utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto 4 | 5 | from boto.s3.key import Key 6 | from boto.s3.connection import S3Connection 7 | 8 | class S3Utility(object): 9 | """ 10 | :description: An AWS S3 utility class. 11 | 12 | This is a class rather than a module because some state is required 13 | to avoid having to reestablish an s3 connection each transaction. 14 | 15 | :type access_key: string 16 | :param access_key: aws access key 17 | 18 | :type secret_key: string 19 | :param secret_key: aws secret key 20 | 21 | """ 22 | 23 | def __init__(self, access_key, secret_key, s3_bucket): 24 | self.access_key = access_key 25 | self.secret_key = secret_key 26 | self.s3_bucket = s3_bucket 27 | self._conn = None 28 | 29 | @property 30 | def conn(self): 31 | if self._conn is not None: 32 | return self._conn 33 | else: 34 | return S3Connection(self.access_key, self.secret_key) 35 | 36 | def download_file_list(self, prefix=''): 37 | """ 38 | :description: loads the name of the files in a bucket. 39 | Optionally returns only those filenames that start with prefix. 
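        A brief usage sketch (the key arguments, bucket name, and prefix here are
        hypothetical, illustrative values):

            util = S3Utility(access_key, secret_key, 'some-bucket')
            filenames = util.download_file_list(prefix='logs/')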
40 | """ 41 | 42 | # select the bucket, where input_s3_bucket takes the form 'bsdsdata' 43 | bucket = self.conn.get_bucket(self.s3_bucket) 44 | 45 | # collect the list of files to process - those that start with the data group id 46 | file_list = [] 47 | for key in bucket.list(): 48 | key_name = key.name.encode('utf-8') 49 | if key_name.startswith(prefix): 50 | file_list.append(key_name) 51 | 52 | return file_list 53 | 54 | def download_file(self, file_to_load, local_save_dir): 55 | """ 56 | :description: load a file from a given s3 bucket with a 57 | given name and save to a local dir 58 | 59 | :type s3_bucket: string 60 | :param s3_bucket: s3 bucket from which to load the file 61 | 62 | :type file_to_load: string 63 | :param file_to_load: the file to load 64 | 65 | :type local_save_dir: string 66 | :param local_save_dir: the local dir to which to save the downloaded file 67 | 68 | :return: the location where the file was saved 69 | """ 70 | 71 | # select the bucket, where input_s3_bucket takes the form 'bsdsdata' 72 | bucket = self.conn.get_bucket(self.s3_bucket) 73 | 74 | # set a key to the processed files list 75 | key = Key(bucket, file_to_load) 76 | key_name = key.name.encode('utf-8') 77 | 78 | # download the file to process and save in the input location 79 | save_location = os.path.join(local_save_dir, key_name) 80 | try: 81 | key.get_contents_to_filename(save_location) 82 | except boto.exception.S3ResponseError as e: 83 | raise boto.exception.S3ResponseError("key name: {} failed".format(key_name)) 84 | 85 | # return the location of the downloaded file 86 | return save_location 87 | 88 | def upload_file(self, filename_to_save_as, file_path): 89 | """ 90 | :description: uploads a single file to an s3 bucket 91 | """ 92 | # what is this? 93 | def percent_cb(complete, total): 94 | sys.stdout.write('.') 95 | sys.stdout.flush() 96 | 97 | # select the bucket, where input_s3_bucket takes the form 'bsdsdata' 98 | bucket = self.conn.get_bucket(self.s3_bucket) 99 | 100 | # send the file to the s3 bucket 101 | key = Key(bucket) 102 | key.key = filename_to_save_as 103 | key.set_contents_from_filename(file_path, cb=percent_cb, num_cb=50) 104 | 105 | def upload_directory(self, directory): 106 | """ 107 | :description: upload all the files in a directory to aws s3 108 | """ 109 | 110 | filepaths = [] 111 | for root, dirs, files in os.walk(directory): 112 | for filename in files: 113 | filepaths.append(os.path.join(root, filename)) 114 | 115 | upload_directory = os.path.basename(directory) 116 | for filepath in filepaths: 117 | dest_filepath = os.path.join(upload_directory, filepath.split(upload_directory)[-1][1:]) 118 | self.upload_file(dest_filepath, filepath) 119 | 120 | 121 | -------------------------------------------------------------------------------- /scripts/experiment.py: -------------------------------------------------------------------------------- 1 | 2 | import collections 3 | import numpy as np 4 | import random 5 | 6 | import learning_utils 7 | 8 | class Experiment(object): 9 | """ 10 | :description: Experiment is a class representing an online reinforcement learning experiment. This class orchestrates the interaction between an agent and an mdp. 
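    A minimal usage sketch (argument values are illustrative only; the mdp and agent
    objects are assumed to be constructed elsewhere, e.g. from mdps.py and agent.py):

        experiment = Experiment(mdp=mdp, agent=agent, num_epochs=10,
                                epoch_length=100, test_epoch_length=10,
                                max_steps=500, run_tests=False)
        experiment.run()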
11 | """ 12 | 13 | def __init__(self, mdp, agent, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, 14 | value_logging=False): 15 | """ 16 | :type mdp: object inheriting from MDP 17 | :param mdp: the markov decision process in which the agent acts 18 | 19 | :type agent: object inheriting from Agent 20 | :param agent: the agent that acts within the experiment 21 | 22 | :type num_epochs: int 23 | :param num_epochs: number of training epochs to run 24 | 25 | :type epoch_length: int 26 | :param epoch_length: length of each epoch in episodes 27 | 28 | :type test_epoch_length: int 29 | :param test_epoch_length: length of a test epoch in episodes 30 | 31 | :type max_steps: int 32 | :param max_steps: maximum number of steps allowed in a single episode 33 | 34 | :type run_tests: boolean 35 | :param run_tests: whether or not to run testing epochs 36 | 37 | :type value_logging: boolean 38 | :param value_logging: whether or not to write a representation of the value function to a file 39 | """ 40 | self.mdp = mdp 41 | self.agent = agent 42 | self.num_epochs = num_epochs 43 | self.epoch_length = epoch_length 44 | self.test_epoch_length = test_epoch_length 45 | self.max_steps = max_steps 46 | self.run_tests = run_tests 47 | self.mdp_actions = self.mdp.get_actions() 48 | self.value_logging = value_logging 49 | 50 | def run(self): 51 | """ 52 | :description: main method which runs the entire experiment 53 | """ 54 | for epoch in xrange(self.num_epochs): 55 | self.run_epoch(epoch, self.epoch_length) 56 | self.agent.finish_epoch(epoch) 57 | self.finish_epoch(epoch) 58 | 59 | if self.run_tests: 60 | self.agent.start_testing() 61 | self.run_epoch(self.test_epoch_length) 62 | self.agent.finish_testing(epoch) 63 | 64 | def run_epoch(self, epoch, epoch_length): 65 | """ 66 | :description: runs a single epoch 67 | 68 | :type epoch_length: int 69 | :param epoch_length: length of the current epoch in episodes 70 | """ 71 | for episode in xrange(epoch_length): 72 | self.run_episode() 73 | 74 | def run_episode(self): 75 | """ 76 | :description: runs a single episode 77 | """ 78 | state = self.mdp.get_start_state() 79 | action = self.agent.start_episode(state) 80 | reward = 0 81 | for step in xrange(self.max_steps): 82 | 83 | # get the next state and reward 84 | next_state, reward, terminal = self.step(state, action) 85 | 86 | # if episode has ended, then break 87 | if terminal: 88 | break 89 | 90 | # otherwise, inform the agent and get a new action 91 | action = self.agent.step(next_state, reward) 92 | state = next_state 93 | 94 | # store this experience as a terminal one regardless of the loop exit condition 95 | # because either way the next state will break continuity 96 | self.agent.finish_episode(next_state, reward) 97 | 98 | def step(self, state, action): 99 | """ 100 | :description: progresses the experiment forward one time step 101 | """ 102 | # convert to mdp action format and get transitions 103 | real_action = self.mdp_actions[action] 104 | transitions = self.mdp.succ_prob_reward(state, real_action) 105 | 106 | # randomly sample a transition 107 | i = learning_utils.sample([prob for newState, prob, reward in transitions]) 108 | next_state, prob, reward = transitions[i] 109 | 110 | # if the next state is terminal note that 111 | terminal = False 112 | if self.mdp.is_end_state(next_state): 113 | terminal = True 114 | 115 | return next_state, reward, terminal 116 | 117 | def finish_epoch(self, epoch): 118 | """ 119 | :description: finalize epoch 120 | """ 121 | if self.value_logging and 
self.agent.replay_memory.is_full(): 122 | self.log_value_string() 123 | # if epoch > 3: 124 | # self.log_trajectories() 125 | 126 | def log_trajectories(self): 127 | self.agent.logger.log_trajectories(self.mdp) 128 | 129 | def log_value_string(self): 130 | """ 131 | :description: collect the necessary components to print a representation of the optimal value 132 | of each state in the mdp. 133 | """ 134 | V = {} 135 | for state in self.mdp.states: 136 | V[state] = np.max(self.agent.get_q_values(state)) 137 | value_string = self.mdp.get_value_string(V) 138 | self.agent.logger.log_value_string(value_string) 139 | self.agent.logger.log_values(V) 140 | 141 | 142 | -------------------------------------------------------------------------------- /scripts/file_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import os 5 | 6 | import logger 7 | 8 | 9 | def is_valid(key): 10 | return key.replace('-','').isalnum() 11 | 12 | def load_key(filepath): 13 | assert os.path.exists(filepath), 'filepath: {} not found'.format(filepath) 14 | 15 | key = None 16 | with open(filepath, 'rb') as f: 17 | key = f.readline() 18 | if is_valid(key): 19 | return key 20 | else: 21 | raise ValueError('invalid key: {}'.format(key)) 22 | 23 | 24 | def graph_rewards_seq_len(filepaths): 25 | initrewards = [] 26 | min_len = 10000000 27 | for f in filepaths: 28 | r = np.load(f)['values'] 29 | mr = logger.moving_average(r, 5) 30 | if len(mr) < min_len: 31 | min_len = len(mr) 32 | initrewards.append(mr) 33 | 34 | rewards = [] 35 | for r in initrewards: 36 | rewards.append(r[:min_len]) 37 | 38 | r2 = plt.plot(rewards[0], label='length 2 sequence', color='orange') 39 | r4 = plt.plot(rewards[1], label='length 4 sequence', color='crimson') 40 | r8 = plt.plot(rewards[2], label='length 8 sequence', color='cyan') 41 | r12 = plt.plot(rewards[3], label='length 12 sequence', color='brown') 42 | r16 = plt.plot(rewards[4], label='length 16 sequence', color='blue') 43 | r20 = plt.plot(rewards[5], label='length 20 sequence', color='black') 44 | r24 = plt.plot(rewards[6], label='length 24 sequence', color='pink') 45 | 46 | plt.legend(loc='lower right') 47 | plt.ylabel('Episode Rewards') 48 | plt.xlabel('Epochs') 49 | plt.savefig('/Users/wulfe/Dropbox/School/Stanford/winter_2016/cs239/project/hierarchical_rl/results/seqlen_rewards.png') 50 | 51 | def graph_rewards(filepaths): 52 | rewards = [] 53 | for f in filepaths: 54 | r = np.load(f)['values'] 55 | mr = logger.moving_average(r, 10) 56 | rewards.append(mr) 57 | 58 | plt.plot(rewards[0], label='row/col + room', color='r') 59 | plt.plot(rewards[1], label='row/col only', color='g') 60 | plt.plot(rewards[2], label='tabular', color='b') 61 | plt.plot(rewards[3], label='coordinates', color='magenta') 62 | 63 | plt.legend(loc='upper left') 64 | plt.ylabel('Episode Rewards') 65 | plt.xlabel('Epochs') 66 | plt.savefig('/Users/wulfe/Dropbox/School/Stanford/winter_2016/cs239/project/hierarchical_rl/results/staterep_rewards.png') 67 | 68 | if __name__ =='__main__': 69 | root = '/Users/wulfe/Desktop/logs2/promise_hrlstaterep' 70 | 71 | rowcolroom = os.path.join(root, 'QNetwork_2016-03-02T02.56.25.325166', 'rewards.npz') 72 | rowcol = os.path.join(root, 'QNetwork_2016-03-02T03.12.14.506093', 'rewards.npz') 73 | tabular = os.path.join(root, 'QNetwork_2016-03-02T03.35.00.107253', 'rewards.npz') 74 | coords = os.path.join(root, 'QNetwork_2016-03-02T04.01.10.893242', 'rewards.npz') 75 | filepaths = 
[rowcolroom, rowcol, tabular, coords] 76 | graph_rewards(filepaths) 77 | 78 | # r2 = os.path.join(root, 'single_layer_lstm_2016-02-29T12.52.41.641967', 'rewards.npz') 79 | # r4 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T15.31.29.414573', 'rewards.npz') 80 | # r8 = os.path.join(root, 'single_layer_lstm_2016-02-29T15.16.25.802919', 'rewards.npz') 81 | # r12 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T12.42.03.151324', 'rewards.npz') 82 | # r16 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T12.30.01.583816', 'rewards.npz') 83 | # r20 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T12.35.43.236976', 'rewards.npz') 84 | # r24 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T17.31.08.383906', 'rewards.npz') 85 | # filepaths = [r2, r4, r8, r12, r16, r20, r24] 86 | # graph_rewards(filepaths) -------------------------------------------------------------------------------- /scripts/learning_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import glob 3 | from math import sqrt, ceil 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import random 8 | 9 | def sample(probs): 10 | """ 11 | :description: given a list of probabilities, randomly select an index into those probabilities 12 | """ 13 | if len(probs) < 1: 14 | raise ValueError('Sample received an empty list of probabilities. This should not happen. ') 15 | 16 | target = random.random() 17 | accum = 0 18 | for i, prob in enumerate(probs): 19 | accum += prob 20 | if accum >= target: return i 21 | raise ValueError('Invalid probabilities provided to sample method in experiment') 22 | 23 | # Function: Weighted Random Choice 24 | # -------------------------------- 25 | # Given a dictionary of the form element -> weight, selects an element 26 | # randomly based on distribution proportional to the weights. Weights can sum 27 | # up to be more than 1. 28 | # source: stanford.cs221.problem_set_6 29 | # may be beneficial to switch to a faster method 30 | def weightedRandomChoice(weightDict): 31 | weights = [] 32 | elems = [] 33 | for elem in weightDict: 34 | weights.append(weightDict[elem]) 35 | elems.append(elem) 36 | total = sum(weights) 37 | key = random.uniform(0, total) 38 | runningTotal = 0.0 39 | chosenIndex = None 40 | for i in range(len(weights)): 41 | weight = weights[i] 42 | runningTotal += weight 43 | if runningTotal > key: 44 | chosenIndex = i 45 | return elems[chosenIndex] 46 | raise Exception('Should not reach here') 47 | 48 | def visualize_grid(Xs, ubound=255.0, padding=1): 49 | """ 50 | Reshape a 4D tensor of image data to a grid for easy visualization. 
51 | 52 | Inputs: 53 | - Xs: Data of shape (N, H, W, C) 54 | - ubound: Output grid will have values scaled to the range [0, ubound] 55 | - padding: The number of blank pixels between elements of the grid 56 | """ 57 | (N, H, W, C) = Xs.shape 58 | grid_size = int(ceil(sqrt(N))) 59 | grid_height = H * grid_size + padding * (grid_size - 1) 60 | grid_width = W * grid_size + padding * (grid_size - 1) 61 | grid = np.zeros((grid_height, grid_width, C)) 62 | next_idx = 0 63 | y0, y1 = 0, H 64 | for y in xrange(grid_size): 65 | x0, x1 = 0, W 66 | for x in xrange(grid_size): 67 | if next_idx < N: 68 | img = Xs[next_idx] 69 | low, high = np.min(img), np.max(img) 70 | grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low) 71 | # grid[y0:y1, x0:x1] = Xs[next_idx] 72 | next_idx += 1 73 | x0 += W + padding 74 | x1 += W + padding 75 | y0 += H + padding 76 | y1 += H + padding 77 | # grid_max = np.max(grid) 78 | # grid_min = np.min(grid) 79 | # grid = ubound * (grid - grid_min) / (grid_max - grid_min) 80 | return grid 81 | 82 | def get_run_directory(filepath): 83 | return filepath[:filepath.rindex('/')] 84 | 85 | def get_value_array_from_value_image_file(filepath): 86 | lines = None 87 | with open(filepath, 'rb') as f: 88 | lines = f.readlines() 89 | lines = [line.replace('\n', '').replace('S', '0').replace('E', '1').split(' ') for line in lines] 90 | lines = [[val for val in line if val != ''] for line in lines] 91 | lines = [[float(val) for val in line] for line in lines] 92 | lines = np.array(lines) 93 | return lines[::-1] 94 | 95 | def make_heat_map(filepath, epoch): 96 | # convert value image to numeric array 97 | value_array = get_value_array_from_value_image_file(filepath) 98 | if value_array is None: 99 | print 'Value image could not be converted to heatmap' 100 | return 101 | 102 | # determine output filepath 103 | run_dir = get_run_directory(filepath) 104 | output_filepath = os.path.join(run_dir, 'heatmaps', 'value_heatmap_{}.png'.format(epoch)) 105 | 106 | # create and save heatmap 107 | heatmap = plt.pcolormesh(value_array, vmin=-0.25, vmax=1.25) 108 | plt.colorbar() 109 | plt.savefig(output_filepath) 110 | plt.close() 111 | 112 | def load_params(filepath): 113 | params = np.load(filepath)['params'] 114 | return params 115 | 116 | 117 | if __name__ =='__main__': 118 | make_heat_maps() 119 | 120 | -------------------------------------------------------------------------------- /scripts/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: These classes provide logging functionality for agents 3 | """ 4 | 5 | import collections 6 | import datetime 7 | import lasagne 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import os 13 | import pickle 14 | import sys 15 | 16 | import learning_utils 17 | 18 | LOGGING_DIRECTORY = '../logs' 19 | MAXIMUM_WEIGHT_MAGNITUDE = 1000 20 | 21 | def moving_average(values, window_size): 22 | """ 23 | :description: computes a moving average 24 | """ 25 | if len(values) == 0: 26 | print 'the list given to moving average cannot be empty but is' 27 | return [] 28 | 29 | window = np.ones(int(window_size))/float(window_size) 30 | values = np.hstack((np.repeat(values[0], int(window_size)), values, np.repeat(values[-1], int(window_size)))) 31 | average = np.convolve(values, window, 'same').tolist() 32 | return average[window_size:-window_size] 33 | 34 | class Logger(object): 35 | """ 36 | :description: tracks and logs information about an agent 37 
| """ 38 | 39 | def __init__(self, agent_name, logging=True, verbose=True): 40 | """ 41 | :type agent_name: string 42 | :param agent_name: name of the agent whose information is being logged 43 | 44 | :type logging: boolean 45 | :param logging: whether or not to actually record any information 46 | """ 47 | self.agent_name = agent_name 48 | self.actions = [] 49 | self.rewards = [] 50 | self.episode_rewards = [] 51 | self.episode_actions = [] 52 | self.action_start = 0 53 | self.losses = [] 54 | self.states = [] 55 | self.updates = 0 56 | self.epoch = 0 57 | self.state_values = collections.defaultdict(lambda: []) 58 | self.weights = None 59 | self.log_dir = None 60 | self.logging = logging 61 | self.verbose = verbose 62 | self.steps = 0 63 | self.prev_steps = 0 64 | self.episode_steps = [] 65 | 66 | 67 | def log_action(self, action): 68 | self.steps += 1 69 | self.actions.append(action) 70 | 71 | def log_reward(self, reward): 72 | self.rewards.append(reward) 73 | 74 | def log_loss(self, loss): 75 | self.updates += 1 76 | self.losses.append(loss) 77 | 78 | def log_weights(self, weights): 79 | self.weights = weights 80 | max_magnitude = np.max(np.abs(weights.values())) 81 | 82 | if max_magnitude > MAXIMUM_WEIGHT_MAGNITUDE: 83 | except_string = 'Agent weights have surpassed reasonable values. Max weight: {}'.format(max_magnitude) 84 | raise ValueError(except_string) 85 | 86 | def log_epoch(self, epoch): 87 | """ 88 | :description: records the information so far collected 89 | 90 | :type epoch: int 91 | :param epoch: the current epoch number 92 | """ 93 | if not self.logging: 94 | return 95 | 96 | self.epoch += 1 97 | if self.log_dir is None: 98 | self.create_log_dir() 99 | 100 | self.record_stat('actions', self.actions, epoch) 101 | self.record_stat('rewards', self.episode_rewards, epoch) 102 | 103 | self.record_stat('losses', self.losses, epoch) 104 | self.record_weights(self.weights, epoch) 105 | 106 | def finish_episode(self): 107 | """ 108 | :description: performs tasks associated with the ending of an episode 109 | """ 110 | self.episode_rewards.append(np.sum(self.rewards)) 111 | self.rewards = [] 112 | 113 | self.episode_actions.append(self.actions[self.action_start:]) 114 | self.action_start = len(self.actions) 115 | 116 | self.episode_steps.append(self.steps - self.prev_steps) 117 | self.prev_steps = self.steps 118 | 119 | def record_stat(self, name, values, epoch): 120 | """ 121 | :description: saves values to a file and also plots them 122 | 123 | :type name: string 124 | :param name: name of the value being recorded 125 | 126 | :type values: list 127 | :param values: values to record 128 | 129 | :type epoch: int 130 | :param epoch: current epoch number 131 | """ 132 | self.save_stat(name, values, epoch) 133 | self.plot_stat(name, values, epoch) 134 | 135 | def save_stat(self, name, values, epoch): 136 | """ 137 | :description: saves a set of values to a file in npz format under the name 'values' 138 | """ 139 | filename = '{}'.format(name) 140 | filepath = os.path.join(self.log_dir, filename) 141 | np.savez(filepath, values=values) 142 | 143 | def plot_stat(self, name, values, epoch): 144 | """ 145 | :description: plots the provided values 146 | """ 147 | if len(values) < 1: 148 | return 149 | 150 | filename = '{}_graph.png'.format(name) 151 | filepath = os.path.join(self.log_dir, filename) 152 | 153 | values = np.array(values) 154 | if len(values.shape) < 2: 155 | values = np.vstack((np.arange(len(values)), values)) 156 | else: 157 | values = values.T 158 | 159 | plt.figure() 160 | 
# x_max = 1.2 * max(values[0,:]) 161 | # x_min = -0.2 * max(values[0,:]) 162 | # y_max = 1.2 * max(values[1,:]) 163 | # y_min = -0.2 * max(values[0,:]) 164 | # plt.axis([x_min, x_max, y_min, y_max]) 165 | plt.scatter(values[0, :], values[1, :]) 166 | plt.plot(values[0, :], moving_average(values[1, :], 50), c='r') 167 | plt.xlabel('Updates') 168 | plt.ylabel(name) 169 | plt.savefig(filepath) 170 | plt.close() 171 | 172 | def record_weights(self, weights, epoch): 173 | """ 174 | :description: saves the weights to a file 175 | """ 176 | filename = 'weights_epoch_{}.pkl'.format(epoch) 177 | filepath = os.path.join(self.log_dir, filename) 178 | with open(filepath, 'wb') as f: 179 | pickle.dump(weights, f, pickle.HIGHEST_PROTOCOL) 180 | 181 | def create_log_dir(self): 182 | """ 183 | :description: creates a directory in which to log information for the current agent 184 | """ 185 | # make the main logging directory 186 | dir_name = '{}_{}'.format(self.agent_name, datetime.datetime.now().isoformat()) 187 | dir_path = os.path.join(LOGGING_DIRECTORY, dir_name) 188 | os.mkdir(dir_path) 189 | self.log_dir = dir_path 190 | 191 | # make a subdirectory for the network parameter files 192 | params_dir_path = os.path.join(self.log_dir, 'params') 193 | os.mkdir(params_dir_path) 194 | self.params_dir = params_dir_path 195 | 196 | # make a subdirectory for the heatmaps 197 | heatmap_dir_path = os.path.join(self.log_dir, 'heatmaps') 198 | os.mkdir(heatmap_dir_path) 199 | self.heatmap_dir = heatmap_dir_path 200 | 201 | def log_value_string(self, value_string): 202 | """ 203 | :description: prints a string to a file. The string, when formatted, gives the values of different states in the mdp. 204 | """ 205 | if self.log_dir is None: 206 | self.create_log_dir() 207 | 208 | filename = 'value_image.txt' 209 | filepath = os.path.join(self.log_dir, filename) 210 | with open(filepath, 'wb') as f: 211 | f.write(value_string) 212 | 213 | learning_utils.make_heat_map(filepath, self.epoch) 214 | 215 | def log_values(self, V): 216 | """ 217 | :description: keeps track of how the q_values change over time 218 | """ 219 | 220 | mean_value = np.mean(V.values()) 221 | max_value = np.max(V.values()) 222 | min_value = np.min(V.values()) 223 | self.state_values['mean'].append(mean_value) 224 | self.state_values['max'].append(max_value) 225 | self.state_values['min'].append(min_value) 226 | self.state_values['start'].append(V[(0,0)]) 227 | self.plot_values() 228 | self.save_values() 229 | 230 | def save_values(self): 231 | for state, values in self.state_values.iteritems(): 232 | filename = '{}'.format(state) 233 | filepath = os.path.join(self.log_dir, filename) 234 | np.savez(filepath, values=values) 235 | 236 | def plot_values(self): 237 | """ 238 | :description: plot mean, max, and min state values so far 239 | """ 240 | filename = 'state_values_graph.png' 241 | filepath = os.path.join(self.log_dir, filename) 242 | plt.figure() 243 | plt.xlabel('Updates') 244 | plt.ylabel('V(s)') 245 | count = 0 246 | plt.scatter(np.arange(len(self.state_values['mean'])), self.state_values['mean'], c='b') 247 | plt.scatter(np.arange(len(self.state_values['max'])), self.state_values['max'], c='r') 248 | plt.scatter(np.arange(len(self.state_values['min'])), self.state_values['min'], c='g') 249 | plt.scatter(np.arange(len(self.state_values['start'])), self.state_values['start'], marker='*') 250 | plt.savefig(filepath) 251 | plt.close() 252 | 253 | class NeuralLogger(Logger): 254 | """ 255 | :description: inherting class that accomodates a network 
based agent 256 | """ 257 | 258 | def __init__(self, agent_name, logging=True, verbose=True): 259 | super(NeuralLogger, self).__init__(agent_name, logging, verbose) 260 | self.weight_magnitudes = [] 261 | self.weight_variances = [] 262 | self.exploration_probs = [] 263 | 264 | def log_epoch(self, epoch, network, policy): 265 | 266 | if not self.logging: 267 | return 268 | 269 | self.epoch += 1 270 | if self.log_dir is None: 271 | self.create_log_dir() 272 | 273 | try: 274 | self.record_stat('actions', self.actions, epoch) 275 | self.record_stat('episode_rewards', self.episode_rewards, epoch) 276 | self.record_stat('losses', self.losses, epoch) 277 | self.record_stat('episode_steps', self.episode_steps, epoch) 278 | 279 | if self.verbose: 280 | print '\nEpoch: {}'.format(epoch) 281 | print 'Steps in last episode: {}'.format(self.episode_steps[-1]) 282 | if len(self.losses) > 0: 283 | print 'Losses: {}'.format(np.mean(self.losses[-self.episode_steps[-1]:])) 284 | 285 | self.record_weights(epoch, network) 286 | self.record_policy(epoch, policy) 287 | except Exception as e: 288 | print 'ERROR occurred during logging: ' 289 | print e 290 | 291 | def record_weights(self, epoch, network): 292 | """ 293 | :description: records weights by saving them to a file 294 | 295 | :type epoch: int 296 | :param epoch: current epoch 297 | 298 | :type network: any class implementing get_params() 299 | :param network: the networks whose weights should be saved 300 | """ 301 | params = network.get_params() 302 | self.save_params(params, epoch) 303 | self.plot_weights(params, epoch) 304 | 305 | def save_params(self, params, epoch): 306 | filename = 'network_file_epoch_{}.save'.format(epoch) 307 | filepath = os.path.join(self.params_dir, filename) 308 | np.savez(filepath, params=params) 309 | 310 | def plot_weights(self, params, epoch): 311 | means = [] 312 | variances = [] 313 | for param in params: 314 | means.append(np.mean(np.abs(param))) 315 | variances.append(np.var(param)) 316 | self.weight_magnitudes.append(np.mean(means)) 317 | self.record_stat('weight_magnitudes', self.weight_magnitudes, epoch) 318 | self.weight_variances.append(np.mean(variances)) 319 | self.record_stat('weight_variances', self.weight_variances, epoch) 320 | 321 | def record_policy(self, epoch, policy): 322 | self.exploration_probs.append(policy.exploration_prob) 323 | self.record_stat('exploration_probs', self.exploration_probs, epoch) 324 | 325 | def log_trajectories(self, mdp): 326 | for trajectory in self.episode_actions: 327 | mdp.print_trajectory(trajectory) 328 | 329 | def log_hyperparameters(self, network, policy, replay_memory): 330 | if self.log_dir is None: 331 | self.create_log_dir() 332 | 333 | filename = 'hyperparameters.txt' 334 | filepath = os.path.join(self.log_dir, filename) 335 | hyperparameters = {} 336 | hyperparameters['batch_size'] = network.batch_size 337 | hyperparameters['num_hidden'] = network.num_hidden 338 | hyperparameters['num_parameters'] = lasagne.layers.count_params(network.l_out) 339 | hyperparameters['discount'] = network.discount 340 | hyperparameters['learning_rate'] = network.learning_rate 341 | hyperparameters['regularization'] = network.regularization 342 | hyperparameters['update_rule'] = network.update_rule 343 | hyperparameters['freeze_interval'] = network.freeze_interval 344 | hyperparameters['replay_memory_capacity'] = replay_memory.capacity 345 | hyperparameters['actions_until_min'] = policy.actions_until_min 346 | hyperparameters['epsilon'] = policy.exploration_prob 347 | if 
hasattr(network, 'network_type'): 348 | hyperparameters['network_type'] = network.network_type 349 | if hasattr(replay_memory, 'sequence_length'): 350 | hyperparameters['sequence_length'] = replay_memory.sequence_length 351 | 352 | with open(filepath, 'wb') as f: 353 | for k, v in hyperparameters.iteritems(): 354 | f.write('{}: {}\n'.format(k, v)) 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /scripts/mdps.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: Markov Decision Process classes 3 | """ 4 | 5 | import collections 6 | import copy 7 | import numpy as np 8 | import random 9 | import sys 10 | 11 | class MDP(object): 12 | 13 | def get_start_state(self): 14 | raise NotImplementedError("Override me") 15 | 16 | def get_actions(self): 17 | raise NotImplementedError("Override me") 18 | 19 | def succ_prob_reward(self, state, action): 20 | """ 21 | :description: returns a _list_ of tuples containing (next_state, probability, reward). Where the probability denotes the probability of the next_state and reward. 22 | """ 23 | raise NotImplementedError("Override me") 24 | 25 | def get_discount(self): 26 | raise NotImplementedError("Override me") 27 | 28 | def compute_states(self): 29 | self.states = set() 30 | self.graph = collections.defaultdict(lambda: set()) 31 | queue = [] 32 | self.states.add(self.get_start_state()) 33 | queue.append(self.get_start_state()) 34 | while len(queue) > 0: 35 | state = queue.pop() 36 | for action in self.get_actions(state): 37 | for newState, prob, reward in self.succ_prob_reward(state, action): 38 | if newState != state: 39 | self.graph[state].add(newState) 40 | if newState not in self.states: 41 | self.states.add(newState) 42 | if not self.is_end_state(newState): 43 | queue.append(newState) 44 | 45 | self.graph = {k:list(v) for k, v in self.graph.iteritems()} 46 | 47 | ########################################################################### 48 | 49 | class LineMDP(MDP): 50 | """ 51 | :description: A line mdp is just an x axis. Here the rewards are all -1 except for the last state on the right which is +1. 52 | """ 53 | 54 | EXIT_REWARD = 1 55 | MOVE_REWARD = -.01 56 | 57 | def __init__(self, length): 58 | self.length = length 59 | 60 | def get_start_state(self): 61 | return 0 62 | 63 | def get_actions(self, state=None): 64 | return [-1, 1] 65 | 66 | def get_discount(self): 67 | return 1 68 | 69 | def is_end_state(self, state): 70 | return state == self.length 71 | 72 | def succ_prob_reward(self, state, action): 73 | if state == self.length: 74 | return [] 75 | 76 | next_state = max(-self.length, state + action) 77 | reward = 1 if next_state == self.length else -1 78 | return [(next_state, 1, reward)] 79 | 80 | def print_v(self, V): 81 | line = ['-'] * (self.length * 2) 82 | for vidx, lidx in zip(range(-self.length, self.length), range(self.length * 2)): 83 | if vidx in V: 84 | line[lidx] = round(V[vidx], 2) 85 | print line 86 | 87 | def print_pi(self, pi): 88 | line = ['-'] * (self.length * 2) 89 | for pidx, lidx in zip(range(-self.length, self.length), range(self.length * 2)): 90 | if pidx in pi: 91 | line[lidx] = round(pi[pidx], 2) 92 | print line 93 | 94 | ########################################################################### 95 | 96 | class MazeMDP(MDP): 97 | """ 98 | :description: an MDP specifying a maze, where that maze is a square, consists of num_rooms and each room having room_size discrete squares in it. 
So can have 1x1, 2x2, 3x3, etc size mazes. Rooms are separated by walls with a single entrance between them. The start state is always the bottom left of the maze, 1 position away from each wall of the first room. The end state is always in the top right room of the maze, again 1 position away from each wall. So the 1x1 maze looks like: 99 | 100 | _______ 101 | | | 102 | | E | 103 | | S | 104 | | | 105 | ------- 106 | 107 | the 2x2 maze would be 108 | 109 | _______ _______ 110 | | | | 111 | | E | 112 | | | 113 | | | | 114 | -- -- -- -- 115 | __ __ __ __ 116 | | | | 117 | | | 118 | | S | 119 | | | | 120 | ------- ------- 121 | 122 | 123 | state is represented in absolute terms, so the bottom left corner of all mazes is (0,0) and to top right corner of all mazes is (room_size * num_rooms - 1, room_size * num_rooms - 1). In other words, the state ignores the fact that there are rooms or walls or anything, it's just the coordinates. 124 | 125 | actions are N,E,S,W movement by 1 direction. No stochasticity for now. moving into a wall leaves agent in place. Rewards are nothing except finding the exit is worth a lot 126 | 127 | room_size must be odd 128 | """ 129 | 130 | EXIT_REWARD = 1 131 | MOVE_REWARD = -0.01 132 | TRUE_START_STATE_VALUE = 0.83 133 | 134 | def __init__(self, room_size, num_rooms): 135 | self.room_size = room_size 136 | self.num_rooms = num_rooms 137 | self.max_position = self.room_size * self.num_rooms - 1 138 | self.end_state = (self.max_position, self.max_position) 139 | 140 | def get_default_action(self): 141 | return (1,0) 142 | 143 | def get_actions(self, state=None): 144 | return [(1,0),(0,1),(-1,0),(0,-1)] 145 | 146 | def get_start_state(self): 147 | return (0,0) 148 | 149 | def is_end_state(self, state): 150 | return state == self.end_state 151 | 152 | def get_discount(self): 153 | return 0.9 154 | 155 | def get_mean_state_values(self): 156 | return np.repeat(self.max_position / 2., 2) 157 | 158 | def calculate_next_state(self, state, action): 159 | next_state = (state[0] + action[0], state[1] + action[1]) 160 | return next_state 161 | 162 | def runs_into_wall(self, state, action): 163 | next_state = self.calculate_next_state(state, action) 164 | 165 | # 1. check for leaving the maze 166 | if next_state[0] > self.max_position or next_state[0] < 0 \ 167 | or next_state[1] > self.max_position or next_state[1] < 0: 168 | return True 169 | 170 | # 2. check if movement was through doorway and if so return false 171 | doorway_position = (self.room_size) / 2 172 | # check horizontal movement through doorway 173 | if next_state[0] != state[0]: 174 | if next_state[1] % self.room_size == doorway_position: 175 | return False 176 | 177 | # check vertical movement through doorway 178 | if next_state[1] != state[1]: 179 | if next_state[0] % self.room_size == doorway_position: 180 | return False 181 | 182 | # 3. 
check if movement was through a wall 183 | room_size = self.room_size 184 | # move right to left through wall 185 | if state[0] % room_size == room_size - 1 and next_state[0] % room_size == 0: 186 | return True 187 | 188 | # move left to right through wall 189 | if next_state[0] % room_size == room_size - 1 and state[0] % room_size == 0: 190 | return True 191 | 192 | # move up through wall 193 | if state[1] % room_size == room_size - 1 and next_state[1] % room_size == 0: 194 | return True 195 | 196 | # move down through wall 197 | if next_state[1] % room_size == room_size - 1 and state[1] % room_size == 0: 198 | return True 199 | 200 | # if none of the above conditions meet, then have not passed through wall 201 | return False 202 | 203 | def succ_prob_reward(self, state, action): 204 | 205 | # if we reach the end state then the episode ends 206 | if np.array_equal(state, self.end_state): 207 | raise ValueError('Provided state equals end_state, should have stopped episode in experiment. state: {}\taction:{}'.format(state, action)) 208 | 209 | if self.runs_into_wall(state, action): 210 | # if the action runs us into a wall do nothing 211 | next_state = state 212 | else: 213 | # o/w determine the next position 214 | next_state = self.calculate_next_state(state, action) 215 | 216 | # if next state is exit, then set reward 217 | reward = self.MOVE_REWARD 218 | if np.array_equal(next_state, self.end_state): 219 | reward = self.EXIT_REWARD 220 | 221 | #return [(next_state, 0.9, reward), (state, 0.1, self.MOVE_REWARD)] 222 | return [(next_state, 1, reward)] 223 | 224 | def print_v(self, V): 225 | for ridx in reversed(range(self.max_position + 1)): 226 | for cidx in range(self.max_position + 1): 227 | if (ridx, cidx) in V: 228 | print round(V[(ridx, cidx)], 3), 229 | print('\n') 230 | 231 | def get_value_string(self, V): 232 | value_string = [] 233 | for ridx in reversed(range(self.max_position + 1)): 234 | for cidx in range(self.max_position + 1): 235 | if (ridx, cidx) in V: 236 | value_string.append(round(V[(ridx, cidx)], 5)) 237 | value_string.append(' ') 238 | value_string.append('\n') 239 | return ''.join([str(v) for v in value_string]) 240 | 241 | def print_maze(self, coordinates): 242 | for row in range(self.room_size): 243 | for col in range(self.room_size): 244 | if coordinates == (row,col): 245 | print '*', 246 | elif self.end_state == (row,col): 247 | print 'e', 248 | else: 249 | print '-', 250 | print '\n' 251 | print '\n' 252 | 253 | def print_trajectory(self, actions): 254 | coordinates = self.get_start_state() 255 | self.print_maze(coordinates) 256 | for action in actions: 257 | action = self.get_actions()[action] 258 | if not self.runs_into_wall(coordinates, action): 259 | coordinates = (coordinates[0] + action[0], coordinates[1] + action[1]) 260 | self.print_maze(coordinates) 261 | 262 | ########################################################################### 263 | 264 | -------------------------------------------------------------------------------- /scripts/policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: classes implementing action selection policies 3 | """ 4 | 5 | import numpy as np 6 | import random 7 | 8 | import learning_utils 9 | 10 | class Policy(object): 11 | 12 | def __init__(self, num_actions): 13 | self.actions = range(num_actions) 14 | 15 | def choose_action(self, q_values): 16 | raise NotImplementedError("Override me") 17 | 18 | def random_action(self): 19 | return random.choice(self.actions) 
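# ---------------------------------------------------------------------------
# Editor's note (not part of the original file): a minimal rollout sketch showing
# how a Policy-style random action choice drives the MazeMDP defined in
# scripts/mdps.py above. It assumes the scripts/ directory is on sys.path; the
# 50-step cap is arbitrary and purely illustrative.
#
#     import random
#     import mdps
#
#     mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
#     state = mdp.get_start_state()
#     for _ in range(50):
#         action = random.choice(mdp.get_actions(state))
#         # the maze is deterministic, so there is exactly one successor tuple
#         next_state, prob, reward = mdp.succ_prob_reward(state, action)[0]
#         print state, action, reward
#         state = next_state
#         if mdp.is_end_state(state):
#             break
# ---------------------------------------------------------------------------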
20 | 21 | class EpsilonGreedy(Policy): 22 | 23 | def __init__(self, num_actions, exploration_prob, min_exploration_prob, actions_until_min): 24 | super(EpsilonGreedy, self).__init__(num_actions) 25 | self.exploration_prob = exploration_prob 26 | self.min_exploration_prob = min_exploration_prob 27 | self.actions_until_min = actions_until_min 28 | assert actions_until_min != 0, 'actions_until_min must be positive' 29 | self.exploration_reduction = (exploration_prob - min_exploration_prob) / float(actions_until_min) 30 | 31 | def choose_action(self, q_values): 32 | self.update_parameters() 33 | if random.random() < self.exploration_prob: 34 | return random.choice(self.actions) 35 | else: 36 | return np.argmax(q_values) 37 | 38 | def update_parameters(self): 39 | updated_exploration_prob = self.exploration_prob - self.exploration_reduction 40 | self.exploration_prob = max(self.min_exploration_prob, updated_exploration_prob) 41 | 42 | class Softmax(Policy): 43 | 44 | def __init__(self, num_actions, tau, min_tau, actions_until_min): 45 | super(Softmax, self).__init__(num_actions) 46 | self.tau = float(tau) 47 | self.min_tau = min_tau 48 | self.actions_until_min = actions_until_min 49 | assert actions_until_min != 0, 'actions_until_min must be positive' 50 | self.tau_reduction = (tau - min_tau) / float(actions_until_min) 51 | 52 | def choose_action(self, q_values): 53 | self.update_parameters() 54 | exp_q_values = np.exp(q_values / (self.tau + 1e-2)) 55 | weights = dict() 56 | for idx, val in enumerate(exp_q_values): 57 | weights[idx] = val 58 | action = learning_utils.weightedRandomChoice(weights) 59 | return action 60 | 61 | def update_parameters(self): 62 | updated_tau = self.tau - self.tau_reduction 63 | self.tau = max(self.min_tau, updated_tau) 64 | 65 | -------------------------------------------------------------------------------- /scripts/qnetwork.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: This file contains the QNetwork class, which has a variable number of 3 | fully-connected hidden layers. It also contains a similar class called 4 | ConvQNetwork that implements the network with convolutional layers. 
5 | """ 6 | 7 | import lasagne 8 | from lasagne.regularization import regularize_network_params, l2 9 | import numpy as np 10 | import theano 11 | import theano.tensor as T 12 | 13 | import learning_utils 14 | 15 | class QNetwork(object): 16 | 17 | def __init__(self, input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng): 18 | """ 19 | :type input_shape: int 20 | :param input_shape: the dimension of the input representation of the state 21 | 22 | :type batch_size: int 23 | :param batch_size: number of samples to use in computing the loss / updates 24 | 25 | :type num_hidden_layers: int 26 | :param num_hidden_layers: number of hidden layers to use in the network 27 | 28 | :type num_actions: int 29 | :param num_actions: the output dimension of the network measured in number of possible actions 30 | 31 | :type num_hidden: int 32 | :param num_hidden: number of hidden nodes to use in each layer (const across layers) 33 | 34 | :type discount: float 35 | :param discount: discount factor to use in computing Q-learning target values 36 | 37 | :type learning_rate: float 38 | :param learning_rate: the learning rate to use (no decay schedule since ADAM update assumed) 39 | 40 | :type regularization: float 41 | :param regularization: l2 regularization constant applied to weights 42 | 43 | :type update_rule: string 44 | :param update_rule: the type of update rule to use, suggest using 'adam' 45 | 46 | :type freeze_interval: int 47 | :param freeze_interval: the number of updates between updating the target network weights 48 | 49 | :type rng: rng 50 | :param rng: rng for running deterministically, o/w just leave as None 51 | 52 | :example call: 53 | network = qnetwork.QNetwork(input_shape=20, batch_size=64, num_hidden_layers=2, num_actions=4, 54 | num_hidden=4, discount=1, learning_rate=1e-3, regularization=1e-4, 55 | update_rule='adam', freeze_interval=1e5, rng=None) 56 | 57 | """ 58 | self.input_shape = input_shape 59 | self.batch_size = batch_size 60 | self.num_hidden_layers = num_hidden_layers 61 | self.num_actions = num_actions 62 | self.num_hidden = num_hidden 63 | self.discount = discount 64 | self.learning_rate = learning_rate 65 | self.regularization = regularization 66 | self.update_rule = update_rule 67 | self.freeze_interval = freeze_interval 68 | self.rng = rng if rng else np.random.RandomState() 69 | self.initialize_network() 70 | self.update_counter = 0 71 | 72 | def train(self, states, actions, rewards, next_states, terminals): 73 | """ 74 | :description: Perform a q-learning update using the (s,a,r,s') tuples provided 75 | 76 | :type states: np.array(dtype=theano.config.floatX) 77 | :param states: batch of states, shape (N,D) = (batch_size, input_shape) 78 | 79 | :type actions: np.array(dtype='int32') 80 | :param actions: the actions taken by the agent in the corresponding state from states 81 | shape = (N,) 82 | 83 | :type rewards: np.array(dtype=theano.config.floatX) 84 | :param rewards: rewards associated with being in state s and taking action a, shape = (N,) 85 | 86 | :type next_states: np.array(dtype=theano.config.floatX) 87 | :param next_states: batch of next_states, shape (N,D) = (batch_size, input_shape) 88 | 89 | :type terminals: np.array(dtype='int32') 90 | :param terminals: whether the corresponding state was a terminal state. If so, this 91 | will cause the max_a' Q(s',a') term to be zero in the q-learning loss. 
92 | 93 | :example call: 94 | states = np.array([[1,0],[0,1]]) 95 | actions = np.array([1,1]) 96 | rewards = np.array([.2,-.1]) 97 | next_states = np.array([[0,1],[1,0]]) 98 | terminals = np.array([0,0]) 99 | network.train(states, actions, rewards, next_states, terminals) 100 | 101 | """ 102 | 103 | if self.update_counter % self.freeze_interval == 0: 104 | self.reset_target_network() 105 | self.update_counter += 1 106 | 107 | self.states_shared.set_value(states) 108 | self.actions_shared.set_value(actions.astype('int32')) 109 | self.rewards_shared.set_value(rewards) 110 | self.next_states_shared.set_value(next_states) 111 | self.terminals_shared.set_value(terminals.astype('int32')) 112 | 113 | loss, q_values = self._train() 114 | return loss 115 | 116 | def get_q_values(self, state): 117 | """ 118 | :description: Returns the q_values associated with a single state for the purposes of 119 | deciding which action to take. 120 | 121 | :type state: np.array(dtype=theano.config.floatX) 122 | :param state: state to compute q_values for, shape = (D,) 123 | 124 | :example call: 125 | state = np.array([1,2]) 126 | network.get_q_values(state) 127 | """ 128 | # create a fake batch 129 | states = np.zeros((self.batch_size, self.input_shape), dtype=theano.config.floatX) 130 | 131 | # set the first item in that batch to the passed in state and set the shared variables 132 | states[0] = state 133 | self.states_shared.set_value(states) 134 | 135 | # do a forward pass using the theano function 'get_q_values' and index and return the first item 136 | q_values = self._get_q_values()[0] 137 | return q_values 138 | 139 | def get_params(self): 140 | """ 141 | :description: Return a numpy array containing all of the parameters of the network. 142 | Used for retrieving weights to save. 143 | """ 144 | return lasagne.layers.helper.get_all_param_values(self.l_out) 145 | 146 | def set_params(self, params): 147 | """ 148 | :description: Set the parameters of the network to the provided parameters. Used for 149 | loading saved weights. 150 | """ 151 | lasagne.layers.set_all_param_values(self.l_out, params) 152 | self.reset_target_network() 153 | 154 | def reset_target_network(self): 155 | """ 156 | :description: Set the target weights to the current weights. 157 | """ 158 | all_params = lasagne.layers.helper.get_all_param_values(self.l_out) 159 | lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) 160 | 161 | def finish_episode(self): 162 | pass 163 | 164 | ########################################################################################## 165 | #### Network and Learning Initialization below 166 | ########################################################################################## 167 | 168 | def initialize_network(self): 169 | """ 170 | :description: this method initializes the network, updates, and theano functions for training and 171 | retrieving q values. Here's an outline: 172 | 173 | 1. build the q network and target q network 174 | 2. initialize theano symbolic variables used for compiling functions 175 | 3. initialize the theano numeric variables used as input to functions 176 | 4. formulate the symbolic loss 177 | 5. formulate the symbolic updates 178 | 6. compile theano functions for training and for getting q_values 179 | """ 180 | batch_size, input_shape = self.batch_size, self.input_shape 181 | lasagne.random.set_rng(self.rng) 182 | 183 | # 1. 
build the q network and target q network 184 | self.l_out = self.build_network(input_shape, self.num_actions, batch_size) 185 | self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) 186 | self.reset_target_network() 187 | 188 | # 2. initialize theano symbolic variables used for compiling functions 189 | states = T.matrix('states') 190 | actions = T.icol('actions') 191 | rewards = T.col('rewards') 192 | next_states = T.matrix('next_states') 193 | # terminals are used to indicate a terminal state in the episode and hence a mask over the future 194 | # q values i.e., Q(s',a') 195 | terminals = T.icol('terminals') 196 | 197 | # 3. initialize the theano numeric variables used as input to functions 198 | self.states_shared = theano.shared(np.zeros((batch_size, input_shape), dtype=theano.config.floatX)) 199 | self.next_states_shared = theano.shared(np.zeros((batch_size, input_shape), dtype=theano.config.floatX)) 200 | self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 201 | broadcastable=(False, True)) 202 | self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 203 | broadcastable=(False, True)) 204 | self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 205 | broadcastable=(False, True)) 206 | 207 | # 4. formulate the symbolic loss 208 | q_vals = lasagne.layers.get_output(self.l_out, states) 209 | next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) 210 | target = (rewards + 211 | (T.ones_like(terminals) - terminals) * 212 | self.discount * T.max(next_q_vals, axis=1, keepdims=True)) 213 | # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' 214 | diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) 215 | 216 | # a lot of the recent work clips the td error at 1 so we do that here 217 | # the problem is that gradient backpropagating through this minimum node 218 | # will be zero if diff is larger then 1.0 (because changing params before 219 | # the minimum does not impact the output of the minimum). To account for 220 | # this we take the part of the td error (magnitude) greater than 1.0 and simply 221 | # add it to the loss, which allows gradient to backprop but just linearly 222 | # in the td error rather than quadratically 223 | quadratic_part = T.minimum(abs(diff), 1.0) 224 | linear_part = abs(diff) - quadratic_part 225 | loss = 0.5 * quadratic_part ** 2 + linear_part 226 | loss = T.sum(loss) + self.regularization * regularize_network_params(self.l_out, l2) 227 | 228 | # 5. formulate the symbolic updates 229 | params = lasagne.layers.helper.get_all_params(self.l_out) 230 | updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) 231 | 232 | # 6. compile theano functions for training and for getting q_values 233 | givens = { 234 | states: self.states_shared, 235 | next_states: self.next_states_shared, 236 | rewards: self.rewards_shared, 237 | actions: self.actions_shared, 238 | terminals: self.terminals_shared 239 | } 240 | self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) 241 | self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared}) 242 | 243 | def initialize_updates(self, update_rule, loss, params, learning_rate): 244 | """ 245 | :description: This method decides which updates to apply. Suggest using 'adam'. 
246 | """ 247 | if update_rule == 'adam': 248 | updates = lasagne.updates.adam(loss, params, learning_rate) 249 | elif update_rule == 'rmsprop': 250 | updates = lasagne.updates.rmsprop(loss, params, learning_rate) 251 | elif update_rule == 'sgd': 252 | updates = lasagne.updates.sgd(loss, params, learning_rate) 253 | updates = lasagne.updates.apply_nesterov_momentum(updates) 254 | else: 255 | raise ValueError("Unrecognized update: {}".format(update_rule)) 256 | return updates 257 | 258 | def build_network(self, input_shape, output_shape, batch_size): 259 | """ 260 | :description: Builds the computational graph in lasagne. 261 | """ 262 | 263 | l_in = lasagne.layers.InputLayer( 264 | shape=(batch_size, input_shape) 265 | ) 266 | 267 | l_hid = l_in 268 | for hidden_idx in range(self.num_hidden_layers): 269 | l_hid = lasagne.layers.DenseLayer( 270 | l_in, 271 | num_units=self.num_hidden, 272 | nonlinearity=lasagne.nonlinearities.leaky_rectify, 273 | W=lasagne.init.HeNormal(), 274 | b=lasagne.init.Constant(.1) 275 | ) 276 | 277 | l_out = lasagne.layers.DenseLayer( 278 | l_hid, 279 | num_units=output_shape, 280 | nonlinearity=None, 281 | W=lasagne.init.HeNormal(), 282 | b=lasagne.init.Constant(0) 283 | ) 284 | 285 | return l_out 286 | 287 | 288 | ############################################################################################## 289 | ########################## Convolutional Q net below ################################ 290 | ############################################################################################## 291 | 292 | class ConvQNetwork(object): 293 | """ 294 | :description: This class is very similar to the QNetwork above, but uses convolutional 295 | layers and therefore requires some different input shape details. 296 | """ 297 | 298 | def __init__(self, input_shape, batch_size, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng): 299 | self.input_shape = input_shape 300 | self.batch_size = batch_size 301 | self.num_actions = num_actions 302 | self.num_hidden = num_hidden 303 | self.discount = discount 304 | self.learning_rate = learning_rate 305 | self.regularization = regularization 306 | self.update_rule = update_rule 307 | self.freeze_interval = freeze_interval 308 | self.rng = rng if rng else np.random.RandomState() 309 | self.initialize_network() 310 | self.update_counter = 0 311 | 312 | def train(self, states, actions, rewards, next_states, terminals): 313 | if self.update_counter % self.freeze_interval == 0: 314 | self.reset_target_network() 315 | self.update_counter += 1 316 | 317 | self.states_shared.set_value(states) 318 | self.actions_shared.set_value(actions.astype('int32')) 319 | self.rewards_shared.set_value(rewards) 320 | self.next_states_shared.set_value(next_states) 321 | self.terminals_shared.set_value(terminals.astype('int32')) 322 | 323 | loss, q_values = self._train() 324 | return loss 325 | 326 | def get_q_values(self, state): 327 | states = np.zeros(self.states_shape, dtype=theano.config.floatX) 328 | states[0] = state 329 | self.states_shared.set_value(states) 330 | q_values = self._get_q_values()[0] 331 | return q_values 332 | 333 | def get_params(self): 334 | return lasagne.layers.helper.get_all_param_values(self.l_out) 335 | 336 | def reset_target_network(self): 337 | all_params = lasagne.layers.helper.get_all_param_values(self.l_out) 338 | lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) 339 | 340 | 
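# ---------------------------------------------------------------------------
# Editor's note (not part of the original file): a plain-NumPy sketch of the
# arithmetic that the symbolic loss performs for a single batch element -- the
# same target computation and quadratic/linear TD-error split used in
# initialize_network below (and identically in QNetwork above), ignoring the L2
# regularization term and the sum/mean over the batch. All numbers are made up
# for illustration.
#
#     import numpy as np
#
#     discount = 0.9
#     reward, terminal = -0.01, 0
#     q_sa = 0.25                              # Q(s, a) for the action actually taken
#     next_q = np.array([0.1, 0.4, 0.3, 0.2])  # Q(s', a') from the frozen target network
#
#     target = reward + (1 - terminal) * discount * next_q.max()   # 0.35
#     diff = target - q_sa                                         # TD error = 0.10
#     quadratic = min(abs(diff), 1.0)   # clipped at 1 so gradients stay bounded
#     linear = abs(diff) - quadratic    # any excess error contributes linearly
#     loss = 0.5 * quadratic ** 2 + linear                         # 0.005
# ---------------------------------------------------------------------------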
########################################################################################## 341 | #### Network and Learning Initialization below 342 | ########################################################################################## 343 | 344 | def initialize_network(self): 345 | """ 346 | :description: this method initializes the network, updates, and theano functions for training and 347 | retrieving q values. Here's an outline: 348 | 349 | 1. build the q network and target q network 350 | 2. initialize theano symbolic variables used for compiling functions 351 | 3. initialize the theano numeric variables used as input to functions 352 | 4. formulate the symbolic loss 353 | 5. formulate the symbolic updates 354 | 6. compile theano functions for training and for getting q_values 355 | """ 356 | batch_size, input_shape = self.batch_size, self.input_shape 357 | lasagne.random.set_rng(self.rng) 358 | 359 | # 1. build the q network and target q network 360 | self.l_out = self.build_network(input_shape, self.num_actions, batch_size) 361 | self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) 362 | self.reset_target_network() 363 | 364 | # 2. initialize theano symbolic variables used for compiling functions 365 | states = T.tensor4('states') 366 | actions = T.icol('actions') 367 | rewards = T.col('rewards') 368 | next_states = T.tensor4('next_states') 369 | # terminals are used to indicate a terminal state in the episode and hence a mask over the future 370 | # q values i.e., Q(s',a') 371 | terminals = T.icol('terminals') 372 | 373 | # 3. initialize the theano numeric variables used as input to functions 374 | self.states_shape = (batch_size,) + (1,) + input_shape 375 | self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) 376 | self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) 377 | self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 378 | broadcastable=(False, True)) 379 | self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 380 | broadcastable=(False, True)) 381 | self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 382 | broadcastable=(False, True)) 383 | 384 | # 4. formulate the symbolic loss 385 | q_vals = lasagne.layers.get_output(self.l_out, states) 386 | next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) 387 | target = (rewards + 388 | (T.ones_like(terminals) - terminals) * 389 | self.discount * T.max(next_q_vals, axis=1, keepdims=True)) 390 | # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' 391 | diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) 392 | 393 | 394 | # a lot of the deepmind work clips the td error at 1 so we do that here 395 | # the problem is that gradient backpropagating through this minimum node 396 | # will be zero if diff is larger then 1.0 (because changing params before 397 | # the minimum does not impact the output of the minimum). 
To account for 398 | # this we take the part of the td error (magnitude) greater than 1.0 and simply 399 | # add it to the loss, which allows gradient to backprop but just linearly 400 | # in the td error rather than quadratically 401 | quadratic_part = T.minimum(abs(diff), 1.0) 402 | linear_part = abs(diff) - quadratic_part 403 | loss = 0.5 * quadratic_part ** 2 + linear_part 404 | loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2) 405 | 406 | # 5. formulate the symbolic updates 407 | params = lasagne.layers.helper.get_all_params(self.l_out) 408 | updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) 409 | 410 | # 6. compile theano functions for training and for getting q_values 411 | givens = { 412 | states: self.states_shared, 413 | next_states: self.next_states_shared, 414 | rewards: self.rewards_shared, 415 | actions: self.actions_shared, 416 | terminals: self.terminals_shared 417 | } 418 | self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) 419 | self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared}) 420 | 421 | def initialize_updates(self, update_rule, loss, params, learning_rate): 422 | if update_rule == 'adam': 423 | updates = lasagne.updates.adam(loss, params, learning_rate) 424 | elif update_rule == 'rmsprop': 425 | updates = lasagne.updates.rmsprop(loss, params, learning_rate) 426 | elif update_rule == 'sgd': 427 | updates = lasagne.updates.sgd(loss, params, learning_rate) 428 | updates = lasagne.updates.apply_nesterov_momentum(updates) 429 | else: 430 | raise ValueError("Unrecognized update: {}".format(update_rule)) 431 | return updates 432 | 433 | def build_network(self, input_shape, output_shape, batch_size): 434 | 435 | l_in = lasagne.layers.InputLayer( 436 | shape=(batch_size,) + (1,) + input_shape 437 | ) 438 | 439 | l_conv1 = lasagne.layers.Conv2DLayer( 440 | l_in, 441 | num_filters=self.num_hidden, 442 | filter_size=(1,1), 443 | stride = 1, 444 | pad = 'same', 445 | nonlinearity=lasagne.nonlinearities.leaky_rectify, 446 | W=lasagne.init.HeNormal(), 447 | b=lasagne.init.Constant(.1) 448 | ) 449 | 450 | l_out = lasagne.layers.DenseLayer( 451 | l_conv1, 452 | num_units=output_shape, 453 | nonlinearity=None, 454 | W=lasagne.init.HeNormal(), 455 | b=lasagne.init.Constant(0) 456 | ) 457 | 458 | return l_out 459 | 460 | -------------------------------------------------------------------------------- /scripts/replay_memory.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import random 4 | import theano 5 | 6 | DEFAULT_CAPACITY = 10000 7 | 8 | class ReplayMemory(object): 9 | 10 | def __init__(self, batch_size, capacity=DEFAULT_CAPACITY): 11 | self.memory = {} 12 | self.batch_size = batch_size 13 | self.first_index = -1 14 | self.last_index = -1 15 | self.capacity = capacity 16 | self.terminal_count = 0 17 | 18 | def store(self, sars_tuple): 19 | self.terminal_count += sars_tuple[-1] 20 | if self.first_index == -1: 21 | self.first_index = 0 22 | self.last_index += 1 23 | self.memory[self.last_index] = sars_tuple 24 | if (self.last_index + 1 - self.first_index) > self.capacity: 25 | self.discard_sample() 26 | 27 | def is_full(self): 28 | return self.last_index + 1 - self.first_index >= self.capacity 29 | 30 | def is_empty(self): 31 | return self.first_index == -1 32 | 33 | def discard_sample(self): 34 | rand_index = random.randint(self.first_index, self.last_index) 35 | first_tuple = 
self.memory[self.first_index] 36 | del self.memory[rand_index] 37 | if rand_index != self.first_index: 38 | del self.memory[self.first_index] 39 | self.memory[rand_index] = first_tuple 40 | self.first_index += 1 41 | 42 | def sample(self): 43 | if self.is_empty(): 44 | raise Exception('Unable to sample from replay memory when empty') 45 | rand_sample_index = random.randint(self.first_index, self.last_index) 46 | return self.memory[rand_sample_index] 47 | 48 | def sample_batch(self): 49 | # must insert data into replay memory before sampling 50 | if self.is_empty(): 51 | raise Exception('Unable to sample from replay memory when empty') 52 | 53 | # determine shape of states 54 | state_shape = np.shape(self.memory.values()[0][0]) 55 | states_shape = (self.batch_size,) + state_shape 56 | 57 | states = np.empty(states_shape) 58 | actions = np.empty((self.batch_size, 1)) 59 | rewards = np.empty((self.batch_size, 1)) 60 | next_states = np.empty(states_shape) 61 | terminals = np.empty((self.batch_size, 1)) 62 | 63 | # sample batch_size times from the memory 64 | for idx in range(self.batch_size): 65 | state, action, reward, next_state, terminal = self.sample() 66 | states[idx] = state 67 | actions[idx] = action 68 | rewards[idx] = reward 69 | next_states[idx] = next_state 70 | terminals[idx] = terminal 71 | 72 | return states.astype(theano.config.floatX), actions, \ 73 | rewards.astype(theano.config.floatX), \ 74 | next_states.astype(theano.config.floatX), terminals 75 | 76 | class SequenceReplayMemory(object): 77 | """ 78 | :description: this is from https://github.com/spragunr/deep_q_rl 79 | """ 80 | 81 | def __init__(self, input_shape, sequence_length, batch_size, capacity): 82 | """ 83 | :type input_shape: int or tuple 84 | :param: the shape of the state input to the network 85 | 86 | :type sequence_length: int 87 | :param sequence_length: the length of the sequence used by the network 88 | 89 | :type batch_size: int 90 | :param batch_size: the size of a minibatch 91 | 92 | :type capacity: int 93 | :param capacity: maximum size of the replay memory 94 | """ 95 | self.input_shape = input_shape 96 | self.sequence_length = sequence_length 97 | self.batch_size = batch_size 98 | self.capacity = capacity 99 | self.bottom = 0 100 | self.top = 0 101 | self.size = 0 102 | 103 | if type(self.input_shape) is int: 104 | self.input_shape = (self.input_shape, ) 105 | 106 | if self.sequence_length == 1: 107 | self.sequence_shape = self.input_shape 108 | else: 109 | self.sequence_shape = (self.sequence_length,) + self.input_shape 110 | self.batch_shape = (self.batch_size, ) + self.sequence_shape 111 | 112 | # Allocate the circular buffers 113 | self.states = np.zeros(((self.capacity, ) + self.input_shape), dtype='int32') 114 | self.actions = np.zeros(self.capacity, dtype='int32') 115 | self.rewards = np.zeros(self.capacity, dtype=theano.config.floatX) 116 | self.terminals = np.zeros(self.capacity, dtype='bool') 117 | 118 | def store(self, state, action, reward, terminal): 119 | """ 120 | :description: stores a state, the action taken in that state, and the reward received for 121 | for being the state (i.e., we use r(s) not r(s,a)) in the replay memory 122 | 123 | :type state: np.array 124 | :param state: the current state 125 | 126 | :type action: int 127 | :param action: the action taken in this state 128 | 129 | :type reward: float 130 | :param reward: the reward received for being in state 131 | """ 132 | 133 | self.states[self.top] = state 134 | self.actions[self.top] = action 135 | 
self.rewards[self.top] = reward 136 | self.terminals[self.top] = terminal 137 | 138 | if self.size == self.capacity: 139 | self.bottom = (self.bottom + 1) % self.capacity 140 | else: 141 | self.size += 1 142 | 143 | self.top = (self.top + 1) % self.capacity 144 | 145 | def make_last_sequence(self, next_state): 146 | """ 147 | :description: given a state, this method creates a sequence of sequence_length where 148 | the last state in that sequence is passed in state. This is used to get an action 149 | 150 | :type next_state: np.array 151 | :param next_state: the next state to be inserted last into the sequence 152 | """ 153 | 154 | # take states from the memory 155 | sequence = np.zeros(self.sequence_shape, dtype=theano.config.floatX) 156 | indexes = np.arange(self.top - self.sequence_length + 1, self.top) 157 | sequence[0:self.sequence_length - 1] = self.states.take(indexes, axis=0, mode='wrap') 158 | 159 | # set current states value in sequence 160 | sequence[-1] = next_state 161 | 162 | # take the same terminal values from the memory 163 | terminals = self.terminals.take(indexes, axis=0, mode='wrap') 164 | 165 | # if any of those terminals are true, then set indexes of the 166 | # sequence up to and including the index to zero 167 | true_terminals = np.argwhere(terminals == True) 168 | if len(true_terminals) > 0: 169 | real_start = true_terminals[-1] + 1 170 | sequence[:real_start] = 0 171 | 172 | return sequence 173 | 174 | def is_full(self): 175 | """ 176 | :description: is the replay memory full 177 | """ 178 | return self.size == self.capacity 179 | 180 | def sample_batch(self): 181 | """ 182 | :description: sample a minibatch of data 183 | """ 184 | 185 | # must insert sufficient data into replay memory before sampling 186 | if not self.is_full(): 187 | raise Exception('Unable to sample from replay memory when empty') 188 | 189 | # allocate batch containers 190 | states = np.empty(self.batch_shape) 191 | actions = np.empty((self.batch_size, 1)) 192 | rewards = np.empty((self.batch_size, 1)) 193 | next_states = np.empty(self.batch_shape) 194 | terminals = np.empty((self.batch_size, 1)) 195 | 196 | # sample batch_size times from the memory 197 | count = 0 198 | while count < self.batch_size: 199 | 200 | index = np.random.randint(self.bottom, self.bottom + self.size - self.sequence_length) 201 | initial_indices = np.arange(index, index + self.sequence_length) 202 | transition_indices = initial_indices + 1 203 | end_index = index + self.sequence_length - 1 204 | 205 | # original quote: 206 | # "Check that the initial state corresponds entirely to a 207 | # single episode, meaning none but the last frame may be 208 | # terminal. If the last frame of the initial state is 209 | # terminal, then the last frame of the transitioned state 210 | # will actually be the first frame of a new episode, which 211 | # the Q learner recognizes and handles correctly during 212 | # training by zeroing the discounted future reward estimate." 213 | if np.any(self.terminals.take(initial_indices[:-1], mode='wrap')): 214 | continue 215 | 216 | # Add the state transition to the response. 
217 | states[count] = self.states.take(initial_indices, axis=0, mode='wrap') 218 | actions[count] = self.actions.take([end_index], mode='wrap')[0] 219 | rewards[count] = self.rewards.take([end_index], mode='wrap')[0] 220 | terminals[count] = self.terminals.take([end_index], mode='wrap')[0] 221 | next_states[count] = self.states.take(transition_indices, axis=0, mode='wrap') 222 | count += 1 223 | 224 | return states.astype(theano.config.floatX), \ 225 | actions, \ 226 | rewards.astype(theano.config.floatX), \ 227 | next_states.astype(theano.config.floatX), \ 228 | terminals 229 | 230 | -------------------------------------------------------------------------------- /scripts/state_adapters.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class CoordinatesToSingleRoomRowColAdapter(object): 5 | 6 | def __init__(self, room_size): 7 | self.room_size = room_size 8 | 9 | def convert_state_to_agent_format(self, state): 10 | """ 11 | Convert states in format (x, y) to a single room, row, column one-hot vector 12 | example: 13 | >>>state = (4, 4) 14 | >>>adapter = CoordinatesToSingleRoomRowColAdapter(room_size=3) 15 | >>>adapter.convert_state_to_agent_format(state) 16 | [1,0,0,1,0,0] 17 | """ 18 | ridx, cidx = state 19 | 20 | # find where the agent is in the room 21 | row = np.zeros(self.room_size) 22 | row[ridx % self.room_size] = 1 23 | col = np.zeros(self.room_size) 24 | col[cidx % self.room_size] = 1 25 | 26 | # concat the two vectors 27 | formatted_state = np.hstack((row, col)) 28 | 29 | return formatted_state 30 | 31 | class CoordinatesToRowColAdapter(object): 32 | 33 | def __init__(self, room_size, num_rooms): 34 | self.room_size = room_size 35 | self.num_rooms = num_rooms 36 | 37 | def convert_state_to_agent_format(self, state): 38 | """ 39 | Convert states in format (x, y) to a single room, row, column one-hot vector 40 | example: 41 | >>>state = (4, 4) 42 | >>>adapter = CoordinatesToSingleRoomRowColAdapter(room_size=3, num_rooms=2) 43 | >>>adapter.convert_state_to_agent_format(state) 44 | [0,0,0,0,1,0,0,0,0,0,1,0] 45 | """ 46 | ridx, cidx = state 47 | 48 | # find where the agent is in the room 49 | row = np.zeros(self.room_size * self.num_rooms) 50 | row[ridx] = 1 51 | col = np.zeros(self.room_size * self.num_rooms) 52 | col[cidx] = 1 53 | 54 | # concat the two vectors 55 | formatted_state = np.hstack((row, col)) 56 | 57 | return formatted_state 58 | 59 | class CoordinatesToRowColRoomAdapter(object): 60 | 61 | def __init__(self, room_size, num_rooms): 62 | self.room_size = room_size 63 | self.num_rooms = num_rooms 64 | 65 | def convert_state_to_agent_format(self, state): 66 | """ 67 | Convert states in format (x, y) to a single room, row, column one-hot vector 68 | _with_ an additional one-hot vector identifying the room 69 | example: 70 | >>>state = (4, 4) 71 | >>>adapter = CoordinatesToSingleRoomRowColAdapter(room_size=3, num_rooms=2) 72 | >>>adapter.convert_state_to_agent_format(state) 73 | [1,0,0,1,0,0,0,0,0,1] 74 | """ 75 | ridx, cidx = state 76 | 77 | # find where the agent is in the room 78 | row = np.zeros(self.room_size) 79 | row[ridx % self.room_size] = 1 80 | col = np.zeros(self.room_size) 81 | col[cidx % self.room_size] = 1 82 | room = np.zeros(self.num_rooms ** 2) 83 | room_row = cidx / self.room_size 84 | room_col = ridx / self.room_size 85 | room_idx = room_row * self.num_rooms + room_col 86 | room[room_idx] = 1 87 | # concat the three vectors 88 | formatted_state = np.hstack((row, col, room)) 89 | 90 | 
return formatted_state 91 | 92 | class CoordinatesToFlattenedGridAdapter(object): 93 | 94 | def __init__(self, room_size): 95 | self.room_size = room_size 96 | self.num_rooms = num_rooms 97 | 98 | def convert_state_to_agent_format(self, state): 99 | """ 100 | Convert states in format (x, y) to the full grid 101 | """ 102 | ridx, cidx = state 103 | 104 | # find where the agent is in the room 105 | grid = np.zeros((room_size * num_rooms, room_size * num_rooms)) 106 | grid[ridx, cidx] = 1 107 | 108 | # flatten grid 109 | formatted_state = grid.flatten() 110 | 111 | return formatted_state 112 | 113 | class IdentityAdapter(object): 114 | 115 | def convert_state_to_agent_format(self, state): 116 | """ 117 | Returns the state as is. Exists to keep the interface consistent. 118 | """ 119 | return state 120 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wulfebw/hierarchical_rl/0156dd7b1675a0c3a3b7d81cb66721cbba406e28/tests/__init__.py -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | test_loader = unittest.defaultTestLoader.discover( '.' ) 4 | test_runner = unittest.TextTestRunner(verbosity=2) 5 | test_runner.run(test_loader) -------------------------------------------------------------------------------- /tests/test_aws_s3_utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 6 | 7 | import aws_s3_utility 8 | 9 | class TestAWSS3Utility(unittest.TestCase): 10 | 11 | def test_directory_upload(self): 12 | ak = '' 13 | sk = '' 14 | bucket = 'hierarchical' 15 | aws_util = aws_s3_utility.S3Utility(ak, sk, bucket) 16 | directory = '.' 
17 | aws_util.upload_directory('/Users/wulfe/Desktop/aws_run/NeuralAgent_2016-02-20T02:59:50.808542') 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | 22 | -------------------------------------------------------------------------------- /tests/test_build_network.py: -------------------------------------------------------------------------------- 1 | 2 | import lasagne 3 | from lasagne.regularization import regularize_network_params, l2 4 | import numpy as np 5 | import os 6 | import random 7 | import sys 8 | import theano 9 | import theano.tensor as T 10 | import unittest 11 | 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 13 | 14 | def build_hierachical_stacked_lstm_network_with_merge(input_shape, sequence_length, batch_size, output_shape, start=1, downsample=2): 15 | 16 | l_in = lasagne.layers.InputLayer( 17 | shape=(batch_size, sequence_length, input_shape) 18 | ) 19 | 20 | default_gate = lasagne.layers.recurrent.Gate( 21 | W_in=lasagne.init.HeNormal(), W_hid=lasagne.init.HeNormal(), 22 | b=lasagne.init.Constant(0.)) 23 | forget_gate = lasagne.layers.recurrent.Gate( 24 | W_in=lasagne.init.HeNormal(), W_hid=lasagne.init.HeNormal(), 25 | b=lasagne.init.Constant(5.)) 26 | l_lstm1 = lasagne.layers.LSTMLayer( 27 | l_in, 28 | num_units=10, 29 | nonlinearity=lasagne.nonlinearities.tanh, 30 | cell=default_gate, 31 | ingate=default_gate, 32 | outgate=default_gate, 33 | forgetgate=forget_gate, 34 | grad_clipping=2, 35 | only_return_final=False 36 | ) 37 | 38 | # does this slice out the correct values? 39 | l_slice1_up = lasagne.layers.SliceLayer(l_lstm1, slice(start, sequence_length, downsample), 1) 40 | 41 | l_lstm2 = lasagne.layers.LSTMLayer( 42 | l_slice1_up, 43 | num_units=10, 44 | nonlinearity=lasagne.nonlinearities.tanh, 45 | cell=default_gate, 46 | ingate=default_gate, 47 | outgate=default_gate, 48 | forgetgate=forget_gate, 49 | grad_clipping=2, 50 | only_return_final=True 51 | ) 52 | 53 | l_slice1_out = lasagne.layers.SliceLayer(l_lstm1, -1, 1) 54 | l_merge = lasagne.layers.ConcatLayer([l_slice1_out, l_lstm2]) 55 | l_out = lasagne.layers.DenseLayer( 56 | l_merge, 57 | num_units=output_shape, 58 | nonlinearity=None, 59 | W=lasagne.init.HeNormal(), 60 | b=lasagne.init.Constant(0) 61 | ) 62 | 63 | return l_out, l_lstm1, l_slice1_up 64 | 65 | class TestBuildHierarchicalStackedLSTMWithMerge(unittest.TestCase): 66 | 67 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice(self): 68 | input_shape = 14 69 | sequence_length = 4 70 | batch_size = 1 71 | _, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 72 | input_shape=input_shape, 73 | sequence_length=sequence_length, 74 | batch_size=batch_size, 75 | output_shape=4) 76 | 77 | states = T.tensor3('states') 78 | lstm_out = lasagne.layers.get_output(l_lstm, states) 79 | slice_out = lasagne.layers.get_output(l_slice, states) 80 | run = theano.function([states], [lstm_out, slice_out]) 81 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 82 | sample_lstm_out, sample_slice_out = run(sample_states) 83 | 84 | self.assertEquals(sample_lstm_out[:, 1::2, :].tolist(), sample_slice_out.tolist()) 85 | 86 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_short_seq(self): 87 | input_shape = 14 88 | sequence_length = 2 89 | batch_size = 1 90 | _, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 91 | input_shape=input_shape, 92 | sequence_length=sequence_length, 93 | batch_size=batch_size, 94 
| output_shape=4) 95 | 96 | states = T.tensor3('states') 97 | lstm_out = lasagne.layers.get_output(l_lstm, states) 98 | slice_out = lasagne.layers.get_output(l_slice, states) 99 | run = theano.function([states], [lstm_out, slice_out]) 100 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 101 | sample_lstm_out, sample_slice_out = run(sample_states) 102 | 103 | self.assertEquals(sample_lstm_out[:, 1::2, :].tolist(), sample_slice_out.tolist()) 104 | 105 | 106 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_len_1_seq(self): 107 | input_shape = 14 108 | sequence_length = 1 109 | batch_size = 1 110 | l_out, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 111 | input_shape=input_shape, 112 | sequence_length=sequence_length, 113 | batch_size=batch_size, 114 | output_shape=4, 115 | start=0, 116 | downsample=3) 117 | 118 | states = T.tensor3('states') 119 | l_out_out = lasagne.layers.get_output(l_out, states) 120 | lstm_out = lasagne.layers.get_output(l_lstm, states) 121 | slice_out = lasagne.layers.get_output(l_slice, states) 122 | run = theano.function([states], [l_out_out, lstm_out, slice_out]) 123 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 124 | sample_out, sample_lstm_out, sample_slice_out = run(sample_states) 125 | 126 | self.assertEquals(sample_lstm_out[:, 0::3, :].tolist(), sample_slice_out.tolist()) 127 | 128 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_longer_len_seq(self): 129 | input_shape = 14 130 | sequence_length = 7 131 | batch_size = 1 132 | l_out, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 133 | input_shape=input_shape, 134 | sequence_length=sequence_length, 135 | batch_size=batch_size, 136 | output_shape=4, 137 | start=0, 138 | downsample=3) 139 | 140 | states = T.tensor3('states') 141 | l_out_out = lasagne.layers.get_output(l_out, states) 142 | lstm_out = lasagne.layers.get_output(l_lstm, states) 143 | slice_out = lasagne.layers.get_output(l_slice, states) 144 | run = theano.function([states], [l_out_out, lstm_out, slice_out]) 145 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 146 | sample_out, sample_lstm_out, sample_slice_out = run(sample_states) 147 | 148 | self.assertEquals(sample_lstm_out[:, 0::3, :].tolist(), sample_slice_out.tolist()) 149 | 150 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_shared_var(self): 151 | input_shape = 14 152 | sequence_length = 1 153 | batch_size = 1 154 | _, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 155 | input_shape=input_shape, 156 | sequence_length=sequence_length, 157 | batch_size=batch_size, 158 | output_shape=4) 159 | 160 | states = T.tensor3('states') 161 | lstm_out = lasagne.layers.get_output(l_lstm, states) 162 | slice_out = lasagne.layers.get_output(l_slice, states) 163 | 164 | states_shared = theano.shared(np.zeros((batch_size, sequence_length, input_shape))) 165 | run = theano.function([], [lstm_out, slice_out], givens={states: states_shared}) 166 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 167 | states_shared.set_value(sample_states) 168 | sample_lstm_out, sample_slice_out = run() 169 | 170 | self.assertEquals(sample_lstm_out[:, 1::2, :].tolist(), sample_slice_out.tolist()) 171 | 172 | 173 | if __name__ == '__main__': 174 | unittest.main() 175 | -------------------------------------------------------------------------------- /tests/test_experiment.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 8 | 9 | import agent 10 | import experiment 11 | import logger 12 | import mdps 13 | import policy 14 | import recurrent_qnetwork 15 | import replay_memory 16 | import state_adapters 17 | 18 | def get_V(e): 19 | V = {} 20 | e.agent.exploration_prob = 0 21 | for state in e.mdp.states: 22 | qopt = max((e.agent.getQ(state, action), action) for action in e.agent.actions)[0] 23 | V[state] = qopt 24 | return V 25 | 26 | class TestExperiment(unittest.TestCase): 27 | 28 | def setUp(self): 29 | pass 30 | 31 | class TestExperimentBasicRuns(TestExperiment): 32 | 33 | def test_run_basic_mdp_and_agent_episodes(self): 34 | mdp = mdps.LineMDP(5) 35 | a = agent.TestAgent(len(mdp.get_actions())) 36 | num_epochs = 1 37 | epoch_length = 10 38 | test_epoch_length = 0 39 | max_steps = 100 40 | run_tests = False 41 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 42 | e.run() 43 | actual = e.agent.episodes 44 | expected = e.num_epochs * e.epoch_length 45 | self.assertEquals(actual, expected) 46 | 47 | def test_run_basic_mdp_and_agent_many_episodes(self): 48 | mdp = mdps.LineMDP(5) 49 | a = agent.TestAgent(len(mdp.get_actions())) 50 | num_epochs = 5 51 | epoch_length = 10 52 | test_epoch_length = 0 53 | max_steps = 100 54 | run_tests = False 55 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 56 | e.run() 57 | actual = e.agent.episodes 58 | expected = e.num_epochs * e.epoch_length 59 | self.assertEquals(actual, expected) 60 | 61 | class TestExperimentMazeSolving(TestExperiment): 62 | 63 | def test_run_with_maze_mdp_and_working_agent_completes(self): 64 | mdp = mdps.MazeMDP(5, 1) 65 | num_actions = len(mdp.get_actions(None)) 66 | discount = 1 67 | exploration_prob = .3 68 | step_size = 1e-2 69 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 70 | num_epochs = 1 71 | epoch_length = 1 72 | test_epoch_length = 0 73 | max_steps = 10000 74 | run_tests = False 75 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 76 | e.run() 77 | total_len = len(e.agent.logger.actions) 78 | self.assertTrue(total_len < max_steps * epoch_length * num_epochs) 79 | 80 | def test_run_with_small_maze_mdp_q_learning_agent_correct_V(self): 81 | mdp = mdps.MazeMDP(5, 1) 82 | mdp.compute_states() 83 | mdp.EXIT_REWARD = 1 84 | mdp.MOVE_REWARD = -0.1 85 | num_actions = len(mdp.get_actions(None)) 86 | discount = 1 87 | exploration_prob = .7 88 | step_size = 5e-1 89 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 90 | num_epochs = 20 91 | epoch_length = 100 92 | test_epoch_length = 0 93 | max_steps = 100 94 | run_tests = False 95 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 96 | e.run() 97 | 98 | V = get_V(e) 99 | expected = {(0,0):0.3, (1,0):0.4, (2,0):0.5, (3,0):0.6, (4,0):0.7, 100 | (0,1):0.4, (1,1):0.5, (2,1):0.6, (3,1):0.7, (4,1):0.8, 101 | (0,2):0.5, (1,2):0.6, (2,2):0.7, (3,2):0.8, (4,2):0.9, 102 | (0,3):0.6, (1,3):0.7, (2,3):0.8, (3,3):0.9, (4,3):1.0, 103 | (0,4):0.7, 
(1,4):0.8, (2,4):0.9, (3,4):1.0, (4,4):0.0} 104 | 105 | max_diff = 1e-1 106 | for k in expected.keys(): 107 | self.assertTrue(k in V) 108 | self.assertTrue(np.abs(V[k] - expected[k]) < max_diff) 109 | 110 | def test_run_with_large_maze_mdp_q_learning_agent_correct_V(self): 111 | mdp = mdps.MazeMDP(5, 3) 112 | mdp.compute_states() 113 | mdp.EXIT_REWARD = 1 114 | mdp.MOVE_REWARD = -0.1 115 | num_actions = len(mdp.get_actions(None)) 116 | discount = 1 117 | exploration_prob = .5 118 | step_size = .1 119 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 120 | num_epochs = 10 121 | epoch_length = 200 122 | test_epoch_length = 0 123 | max_steps = 300 124 | run_tests = False 125 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 126 | e.run() 127 | 128 | V = get_V(e) 129 | actual_total = 0 130 | for k, v in V.iteritems(): 131 | actual_total += v 132 | expected_total_min = -110 133 | expected_total_max = -40 134 | self.assertTrue(actual_total < expected_total_max) 135 | self.assertTrue(actual_total > expected_total_min) 136 | 137 | def test_run_with_standard_maze_mdp_q_learning_agent_correct_V(self): 138 | mdp = mdps.MazeMDP(5, 2) 139 | mdp.compute_states() 140 | mdp.EXIT_REWARD = 1 141 | mdp.MOVE_REWARD = -0.01 142 | num_actions = len(mdp.get_actions(None)) 143 | discount = 1 144 | exploration_prob = .5 145 | step_size = .1 146 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 147 | num_epochs = 10 148 | epoch_length = 200 149 | test_epoch_length = 0 150 | max_steps = 300 151 | run_tests = False 152 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 153 | e.run() 154 | 155 | V = get_V(e) 156 | actual_total = 0 157 | for k, v in V.iteritems(): 158 | actual_total += v 159 | expected_total_min = -110 160 | expected_total_max = -40 161 | self.assertTrue(actual_total < expected_total_max) 162 | self.assertTrue(actual_total > expected_total_min) 163 | 164 | class TestExperimentValueString(TestExperiment): 165 | 166 | def test_sequence_value_string(self): 167 | room_size = 3 168 | num_rooms = 3 169 | mdp = mdps.MazeMDP(room_size, num_rooms) 170 | mdp.compute_states() 171 | mdp.EXIT_REWARD = 1 172 | mdp.MOVE_REWARD = -0.1 173 | discount = 1 174 | sequence_length = 2 175 | batch_size = 10 176 | learning_rate = 1e-3 177 | freeze_interval = 10000 178 | num_hidden = 4 179 | eps = .5 180 | reg = 1e-8 181 | num_actions = len(mdp.get_actions(None)) 182 | batch_size = 100 183 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape=2 * room_size, 184 | sequence_length=sequence_length, batch_size=batch_size, 185 | num_actions=4, num_hidden=num_hidden, discount=discount, learning_rate= 186 | learning_rate, regularization=reg, update_rule='adam', freeze_interval= 187 | freeze_interval, network_type='single_layer_lstm', rng=None) 188 | num_epochs = 5 189 | epoch_length = 10 190 | test_epoch_length = 0 191 | max_steps = (room_size * num_rooms) ** 2 192 | epsilon_decay = (num_epochs * epoch_length * max_steps) / 2 193 | adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 194 | p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay) 195 | rm = replay_memory.SequenceReplayMemory(input_shape=2 * room_size, 196 | sequence_length=sequence_length, batch_size=batch_size, capacity=50000) 197 | log = 
logger.NeuralLogger(agent_name='RecurrentQNetwork') 198 | a = agent.RecurrentNeuralAgent(network=network, policy=p, replay_memory=rm, log=log, state_adapter=adapter) 199 | run_tests = False 200 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, 201 | max_steps, run_tests, value_logging=True) 202 | e.log_temporal_value_string() 203 | 204 | 205 | 206 | if __name__ == '__main__': 207 | unittest.main() 208 | 209 | 210 | -------------------------------------------------------------------------------- /tests/test_learning_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import os 4 | import sys 5 | import unittest 6 | 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 8 | 9 | import learning_utils 10 | 11 | 12 | class TestMakeHeatMap(unittest.TestCase): 13 | 14 | def test_make_heat_map(self): 15 | filepath = '/Users/wulfe/Dropbox/School/Stanford/winter_2016/cs239/project/hierarchical_rl/logs/rqn_4_step_stacked_2roomx5x5_row_col/value_image.txt' 16 | epoch = 1 17 | learning_utils.make_heat_map(filepath, epoch) 18 | 19 | if __name__ == '__main__': 20 | unittest.main() -------------------------------------------------------------------------------- /tests/test_logger.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import os 4 | import shutil 5 | import sys 6 | import unittest 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 9 | 10 | import agent 11 | import experiment 12 | import logger 13 | import mdps 14 | 15 | class TestMazeMDP(unittest.TestCase): 16 | 17 | def test_log_epoch_empty_log(self): 18 | l = logger.Logger(agent_name='test') 19 | l.log_epoch(epoch=0) 20 | log_dir = l.log_dir 21 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'actions.npz'))) 22 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'rewards.npz'))) 23 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'losses.npz'))) 24 | shutil.rmtree(log_dir) 25 | 26 | # class TestMovingAverage(unittest.TestCase): 27 | 28 | # def test_moving_average_single_item_window(self): 29 | # arr = [1,2,3] 30 | # actual = logger.moving_average(arr, 1) 31 | # self.assertSequenceEqual(actual, arr) 32 | 33 | # def test_moving_average_small_window(self): 34 | # arr = [1,2,3,4,5,6,7] 35 | # actual = logger.moving_average(arr, 2) 36 | # expected = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5] 37 | # self.assertSequenceEqual(actual, expected) 38 | 39 | # def test_moving_average_small_window_large_variance(self): 40 | # arr = [0,9,0,9,0] 41 | # actual = logger.moving_average(arr, 3) 42 | # expected = [3, 3, 6, 3, 3] 43 | # self.assertSequenceEqual(actual, expected) 44 | 45 | # def test_moving_average_large_window_large_variance(self): 46 | # arr = [0,9,0,9,0] 47 | # actual = logger.moving_average(arr, 4) 48 | # expected = [2.25, 2.25, 4.5, 4.5, 2.25] 49 | # self.assertSequenceEqual(actual, expected) 50 | 51 | 52 | class testLoggerGraphing(unittest.TestCase): 53 | 54 | def test_graphs_are_plotted_and_saved_during_experiment(self): 55 | mdp = mdps.MazeMDP(5, 3) 56 | mdp.compute_states() 57 | mdp.EXIT_REWARD = 1 58 | mdp.MOVE_REWARD = -0.1 59 | num_actions = len(mdp.get_actions(None)) 60 | discount = mdp.get_discount() 61 | exploration_prob = .5 62 | step_size = 1 63 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, 
exploration_prob=exploration_prob, step_size=step_size, logging=True) 64 | num_epochs = 1 65 | epoch_length = 100 66 | test_epoch_length = 0 67 | max_steps = 1000 68 | run_tests = False 69 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, False) 70 | e.run() 71 | 72 | log_dir = e.agent.logger.log_dir 73 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'actions_graph.png'))) 74 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'losses_graph.png'))) 75 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'rewards_graph.png'))) 76 | shutil.rmtree(log_dir) 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | 81 | -------------------------------------------------------------------------------- /tests/test_mdps.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 7 | 8 | import agent 9 | import experiment 10 | import mdps 11 | 12 | class TestMazeMDPLogic(unittest.TestCase): 13 | 14 | """ runs_into_wall tests """ 15 | def test_leave_maze_negative_x(self): 16 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 17 | state = (0,0) 18 | action = (-1,0) 19 | actual = mdp.runs_into_wall(state, action) 20 | expected = True 21 | self.assertEquals(actual, expected) 22 | 23 | def test_leave_maze_positive_x(self): 24 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 25 | state = (4,0) 26 | action = (1,0) 27 | actual = mdp.runs_into_wall(state, action) 28 | expected = True 29 | self.assertEquals(actual, expected) 30 | 31 | def test_leave_maze_negative_y(self): 32 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 33 | state = (0,0) 34 | action = (0,-1) 35 | actual = mdp.runs_into_wall(state, action) 36 | expected = True 37 | self.assertEquals(actual, expected) 38 | 39 | def test_leave_maze_positive_y(self): 40 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 41 | state = (0,4) 42 | action = (0,1) 43 | actual = mdp.runs_into_wall(state, action) 44 | expected = True 45 | self.assertEquals(actual, expected) 46 | 47 | def test_leave_maze_negative_x_false(self): 48 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 49 | state = (1,0) 50 | action = (-1,0) 51 | actual = mdp.runs_into_wall(state, action) 52 | expected = False 53 | self.assertEquals(actual, expected) 54 | 55 | def test_leave_maze_positive_x_false(self): 56 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 57 | state = (3,0) 58 | action = (1,0) 59 | actual = mdp.runs_into_wall(state, action) 60 | expected = False 61 | self.assertEquals(actual, expected) 62 | 63 | def test_leave_maze_negative_y_false(self): 64 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 65 | state = (0,1) 66 | action = (0,-1) 67 | actual = mdp.runs_into_wall(state, action) 68 | expected = False 69 | self.assertEquals(actual, expected) 70 | 71 | def test_leave_maze_positive_y_false(self): 72 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 73 | state = (0,3) 74 | action = (0,1) 75 | actual = mdp.runs_into_wall(state, action) 76 | expected = False 77 | self.assertEquals(actual, expected) 78 | 79 | def test_wall_cross_x_right_to_left(self): 80 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 81 | state = (4,0) 82 | action = (1,0) 83 | actual = mdp.runs_into_wall(state, action) 84 | expected = True 85 | self.assertEquals(actual, expected) 86 | 87 | def test_wall_cross_x_left_to_right(self): 88 | mdp = mdps.MazeMDP(room_size=5, 
num_rooms=2) 89 | state = (5,0) 90 | action = (-1,0) 91 | actual = mdp.runs_into_wall(state, action) 92 | expected = True 93 | self.assertEquals(actual, expected) 94 | 95 | def test_wall_cross_y_down_to_up(self): 96 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 97 | state = (0,4) 98 | action = (0,1) 99 | actual = mdp.runs_into_wall(state, action) 100 | expected = True 101 | self.assertEquals(actual, expected) 102 | 103 | def test_wall_cross_y_up_to_down(self): 104 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 105 | state = (0,5) 106 | action = (0,-1) 107 | actual = mdp.runs_into_wall(state, action) 108 | expected = True 109 | self.assertEquals(actual, expected) 110 | 111 | def test_wall_cross_x_right_to_left_false(self): 112 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 113 | state = (3,0) 114 | action = (1,0) 115 | actual = mdp.runs_into_wall(state, action) 116 | expected = False 117 | self.assertEquals(actual, expected) 118 | 119 | def test_wall_cross_x_left_to_right_false(self): 120 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 121 | state = (6,0) 122 | action = (-1,0) 123 | actual = mdp.runs_into_wall(state, action) 124 | expected = False 125 | self.assertEquals(actual, expected) 126 | 127 | def test_wall_cross_y_down_to_up_false(self): 128 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 129 | state = (0,3) 130 | action = (0,1) 131 | actual = mdp.runs_into_wall(state, action) 132 | expected = False 133 | self.assertEquals(actual, expected) 134 | 135 | def test_wall_cross_y_up_to_down_false(self): 136 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 137 | state = (0,6) 138 | action = (0,-1) 139 | actual = mdp.runs_into_wall(state, action) 140 | expected = False 141 | self.assertEquals(actual, expected) 142 | 143 | def test_wall_cross_through_doorway_x_right_to_left(self): 144 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 145 | state = (4,2) 146 | action = (1,0) 147 | actual = mdp.runs_into_wall(state, action) 148 | expected = False 149 | self.assertEquals(actual, expected) 150 | 151 | def test_wall_cross_through_doorway_x_left_to_right(self): 152 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 153 | state = (5,2) 154 | action = (-1,0) 155 | actual = mdp.runs_into_wall(state, action) 156 | expected = False 157 | self.assertEquals(actual, expected) 158 | 159 | def test_wall_cross_through_doorway_y_up(self): 160 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 161 | state = (2,4) 162 | action = (0,1) 163 | actual = mdp.runs_into_wall(state, action) 164 | expected = False 165 | self.assertEquals(actual, expected) 166 | 167 | def test_wall_cross_through_doorway_y_down(self): 168 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 169 | state = (2,5) 170 | action = (0,-1) 171 | actual = mdp.runs_into_wall(state, action) 172 | expected = False 173 | self.assertEquals(actual, expected) 174 | 175 | """ runs_into_wall tests on larger mazes """ 176 | def test_leave_maze_negative_x_larger(self): 177 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 178 | state = (0,0) 179 | action = (-1,0) 180 | actual = mdp.runs_into_wall(state, action) 181 | expected = True 182 | self.assertEquals(actual, expected) 183 | 184 | def test_leave_maze_negative_y_larger(self): 185 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 186 | state = (0,0) 187 | action = (0,-1) 188 | actual = mdp.runs_into_wall(state, action) 189 | expected = True 190 | self.assertEquals(actual, expected) 191 | 192 | def test_leave_maze_negative_x_false_larger(self): 193 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 194 | state = (1,0) 195 | action = 
(-1,0) 196 | actual = mdp.runs_into_wall(state, action) 197 | expected = False 198 | self.assertEquals(actual, expected) 199 | 200 | def test_leave_maze_positive_x_false_larger(self): 201 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 202 | state = (3,0) 203 | action = (1,0) 204 | actual = mdp.runs_into_wall(state, action) 205 | expected = False 206 | self.assertEquals(actual, expected) 207 | 208 | def test_leave_maze_negative_y_false_larger(self): 209 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 210 | state = (0,1) 211 | action = (0,-1) 212 | actual = mdp.runs_into_wall(state, action) 213 | expected = False 214 | self.assertEquals(actual, expected) 215 | 216 | def test_leave_maze_positive_y_false_larger(self): 217 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 218 | state = (0,3) 219 | action = (0,1) 220 | actual = mdp.runs_into_wall(state, action) 221 | expected = False 222 | self.assertEquals(actual, expected) 223 | 224 | def test_wall_cross_through_doorway_x_right_to_left_larger(self): 225 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 226 | state = (2,4) 227 | action = (1,0) 228 | actual = mdp.runs_into_wall(state, action) 229 | expected = False 230 | self.assertEquals(actual, expected) 231 | 232 | def test_wall_cross_through_doorway_x_left_to_right_larger(self): 233 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 234 | state = (3,4) 235 | action = (-1,0) 236 | actual = mdp.runs_into_wall(state, action) 237 | expected = False 238 | self.assertEquals(actual, expected) 239 | 240 | def test_wall_cross_through_doorway_y_up_larger(self): 241 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 242 | state = (4,2) 243 | action = (0,1) 244 | actual = mdp.runs_into_wall(state, action) 245 | expected = False 246 | self.assertEquals(actual, expected) 247 | 248 | def test_wall_cross_through_doorway_y_down_larger(self): 249 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 250 | state = (4,3) 251 | action = (0,-1) 252 | actual = mdp.runs_into_wall(state, action) 253 | expected = False 254 | self.assertEquals(actual, expected) 255 | 256 | """ runs_into_wall tests on different room sizes """ 257 | 258 | def test_wall_cross_x_right_to_left_larger_room_size(self): 259 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 260 | state = (6,0) 261 | action = (1,0) 262 | actual = mdp.runs_into_wall(state, action) 263 | expected = True 264 | self.assertEquals(actual, expected) 265 | 266 | def test_wall_cross_x_left_to_right_larger_room_size(self): 267 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 268 | state = (7,0) 269 | action = (-1,0) 270 | actual = mdp.runs_into_wall(state, action) 271 | expected = True 272 | self.assertEquals(actual, expected) 273 | 274 | def test_wall_cross_y_down_to_up_larger_room_size(self): 275 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 276 | state = (0,6) 277 | action = (0,1) 278 | actual = mdp.runs_into_wall(state, action) 279 | expected = True 280 | self.assertEquals(actual, expected) 281 | 282 | def test_wall_cross_y_up_to_down_larger_room_size(self): 283 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 284 | state = (0,7) 285 | action = (0,-1) 286 | actual = mdp.runs_into_wall(state, action) 287 | expected = True 288 | self.assertEquals(actual, expected) 289 | 290 | def test_wall_cross_x_right_to_left_false_larger_room_size(self): 291 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 292 | state = (3,0) 293 | action = (1,0) 294 | actual = mdp.runs_into_wall(state, action) 295 | expected = False 296 | self.assertEquals(actual, expected) 297 | 298 | def 
test_wall_cross_x_left_to_right_false_larger_room_size(self): 299 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 300 | state = (6,0) 301 | action = (-1,0) 302 | actual = mdp.runs_into_wall(state, action) 303 | expected = False 304 | self.assertEquals(actual, expected) 305 | 306 | def test_wall_cross_y_down_to_up_false_larger_room_size(self): 307 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 308 | state = (0,3) 309 | action = (0,1) 310 | actual = mdp.runs_into_wall(state, action) 311 | expected = False 312 | self.assertEquals(actual, expected) 313 | 314 | def test_wall_cross_y_up_to_down_false_larger_room_size(self): 315 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 316 | state = (0,6) 317 | action = (0,-1) 318 | actual = mdp.runs_into_wall(state, action) 319 | expected = False 320 | self.assertEquals(actual, expected) 321 | 322 | def test_wall_cross_through_doorway_x_right_to_left_larger_room_size(self): 323 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 324 | state = (6,3) 325 | action = (1,0) 326 | actual = mdp.runs_into_wall(state, action) 327 | expected = False 328 | self.assertEquals(actual, expected) 329 | 330 | def test_wall_cross_through_doorway_x_left_to_right_larger_room_size(self): 331 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 332 | state = (7,3) 333 | action = (-1,0) 334 | actual = mdp.runs_into_wall(state, action) 335 | expected = False 336 | self.assertEquals(actual, expected) 337 | 338 | def test_wall_cross_through_doorway_y_up_larger_room_size(self): 339 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 340 | state = (3,6) 341 | action = (0,1) 342 | actual = mdp.runs_into_wall(state, action) 343 | expected = False 344 | self.assertEquals(actual, expected) 345 | 346 | def test_wall_cross_through_doorway_y_down_larger_room_size(self): 347 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 348 | state = (3,7) 349 | action = (0,-1) 350 | actual = mdp.runs_into_wall(state, action) 351 | expected = False 352 | self.assertEquals(actual, expected) 353 | 354 | def test_corner_movement_up(self): 355 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 356 | state = (4,4) 357 | action = (0,1) 358 | actual = mdp.runs_into_wall(state, action) 359 | expected = True 360 | self.assertEquals(actual, expected) 361 | 362 | def test_corner_movement_right(self): 363 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 364 | state = (4,4) 365 | action = (1,0) 366 | actual = mdp.runs_into_wall(state, action) 367 | expected = True 368 | self.assertEquals(actual, expected) 369 | 370 | 371 | 372 | if __name__ == '__main__': 373 | unittest.main() 374 | -------------------------------------------------------------------------------- /tests/test_neural_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 8 | 9 | import agent 10 | import experiment 11 | import mdps 12 | import policy 13 | import qnetwork 14 | import replay_memory 15 | import state_adapters 16 | 17 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless this file is called directly") 18 | class TestNeuralAgent(unittest.TestCase): 19 | 20 | def test_agent(self): 21 | room_size = 5 22 | mdp = mdps.MazeMDP(room_size, 1) 23 | mdp.compute_states() 24 | mdp.EXIT_REWARD = 1 25 | mdp.MOVE_REWARD = -0.1 26 | discount = mdp.get_discount() 27 | num_actions = len(mdp.get_actions(None)) 28 | network = 
qnetwork.QNetwork(input_shape=2 * room_size, batch_size=1, num_actions=4, num_hidden=10, discount=discount, learning_rate=1e-3, update_rule='sgd', freeze_interval=10000, rng=None) 29 | p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000) 30 | rm = replay_memory.ReplayMemory(1) 31 | log = logger.NeuralLogger(agent_name='QNetwork') 32 | adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 33 | a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm, logger=log, state_adapter=adapter) 34 | num_epochs = 2 35 | epoch_length = 10 36 | test_epoch_length = 0 37 | max_steps = 10 38 | run_tests = False 39 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, value_logging=False) 40 | e.run() 41 | 42 | if __name__ == '__main__': 43 | unittest.main() -------------------------------------------------------------------------------- /tests/test_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 7 | 8 | import policy 9 | 10 | class TestEpsilonGreedy(unittest.TestCase): 11 | 12 | def test_deterministic_action_selection(self): 13 | p = policy.EpsilonGreedy(num_actions=4, exploration_prob=0, min_exploration_prob=0, actions_until_min=1) 14 | q_values = [1,2,3,4] 15 | actual = p.choose_action(q_values) 16 | expected = 3 17 | self.assertEquals(actual, expected) 18 | 19 | def test_reduction_decreses_exploration_prob(self): 20 | p = policy.EpsilonGreedy(num_actions=4, exploration_prob=1, min_exploration_prob=0, actions_until_min=2) 21 | q_values = [1,2,3,4] 22 | p.choose_action(q_values) 23 | self.assertEquals(p.exploration_prob, 0.5) 24 | 25 | def test_reduction_decreses_exploration_prob_completely(self): 26 | p = policy.EpsilonGreedy(num_actions=4, exploration_prob=1, min_exploration_prob=0, actions_until_min=2) 27 | q_values = [1,2,3,4] 28 | p.choose_action(q_values) 29 | p.choose_action(q_values) 30 | self.assertEquals(p.exploration_prob, 0) 31 | 32 | class TestSoftmax(unittest.TestCase): 33 | 34 | def test_deterministic_action_selection(self): 35 | p = policy.Softmax(num_actions=4, tau=1e-1, min_tau=0, actions_until_min=100) 36 | q_values = np.array([1,2,3,4]) 37 | actual = p.choose_action(q_values) 38 | expected = 3 39 | self.assertEquals(actual, expected) 40 | 41 | def test_stochastic_action_selection(self): 42 | p = policy.Softmax(num_actions=4, tau=1e1, min_tau=0, actions_until_min=1000) 43 | q_values = np.array([1,2,3,4]) 44 | actions = [] 45 | for i in range(1000): 46 | actions.append(p.choose_action(q_values)) 47 | actions = set(actions) 48 | expected = 4 49 | self.assertEquals(len(actions), expected) 50 | 51 | def test_reduction_decreses_exploration_prob(self): 52 | p = policy.Softmax(num_actions=4, tau=1, min_tau=0, actions_until_min=2) 53 | q_values = np.array([1,2,3,4]) 54 | p.choose_action(q_values) 55 | self.assertEquals(p.tau, 0.5) 56 | 57 | def test_reduction_decreses_exploration_prob_completely(self): 58 | p = policy.Softmax(num_actions=4, tau=1, min_tau=0, actions_until_min=2) 59 | q_values = np.array([1,2,3,4]) 60 | p.choose_action(q_values) 61 | p.choose_action(q_values) 62 | self.assertEquals(p.tau, 0) 63 | 64 | if __name__ == '__main__': 65 | unittest.main() -------------------------------------------------------------------------------- /tests/test_qnetwork.py: 
-------------------------------------------------------------------------------- 1 | import collections 2 | import lasagne 3 | import numpy as np 4 | import os 5 | import random 6 | import shutil 7 | import sys 8 | import theano 9 | import theano.tensor as T 10 | import unittest 11 | 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 13 | 14 | import agent 15 | import aws_s3_utility 16 | import experiment 17 | import file_utils 18 | import logger 19 | import mdps 20 | import policy 21 | import qnetwork 22 | import replay_memory 23 | import state_adapters 24 | 25 | class TestQNetworkConstruction(unittest.TestCase): 26 | 27 | def test_qnetwork_constructor_sgd(self): 28 | input_shape = 2 29 | batch_size = 100 30 | num_actions = 4 31 | num_hidden = 10 32 | discount = 1 33 | learning_rate = 1e-2 34 | update_rule = 'sgd' 35 | freeze_interval = 1000 36 | regularization = 0 37 | rng = None 38 | num_hidden_layers = 1 39 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 40 | 41 | def test_qnetwork_constructor_rmsprop(self): 42 | input_shape = 2 43 | batch_size = 100 44 | num_actions = 4 45 | num_hidden = 10 46 | discount = 1 47 | learning_rate = 1e-2 48 | update_rule = 'rmsprop' 49 | freeze_interval = 1000 50 | regularization = 0 51 | rng = None 52 | num_hidden_layers = 1 53 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 54 | 55 | def test_qnetwork_constructor_adam(self): 56 | input_shape = 2 57 | batch_size = 100 58 | num_actions = 4 59 | num_hidden = 10 60 | discount = 1 61 | learning_rate = 1e-2 62 | update_rule = 'adam' 63 | freeze_interval = 1000 64 | regularization = 0 65 | rng = None 66 | num_hidden_layers = 1 67 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 68 | 69 | class TestQNetworkGetQValues(unittest.TestCase): 70 | 71 | def test_that_q_values_are_retrievable(self): 72 | input_shape = 2 73 | batch_size = 100 74 | num_actions = 4 75 | num_hidden = 10 76 | discount = 1 77 | learning_rate = 1e-2 78 | update_rule = 'sgd' 79 | freeze_interval = 1000 80 | regularization = 0 81 | rng = None 82 | num_hidden_layers = 1 83 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 84 | 85 | state = np.array([1,1]) 86 | q_values = network.get_q_values(state) 87 | actual = np.shape(q_values) 88 | expected = (num_actions,) 89 | self.assertEquals(actual, expected) 90 | 91 | def test_that_initial_values_are_all_similar(self): 92 | input_shape = 2 93 | batch_size = 100 94 | num_actions = 4 95 | num_hidden = 10 96 | discount = 1 97 | learning_rate = 1e-2 98 | update_rule = 'sgd' 99 | freeze_interval = 1000 100 | regularization = 0 101 | rng = None 102 | num_hidden_layers = 1 103 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 104 | 105 | states = [[1,1],[-1,-1],[-1,1],[1,-1]] 106 | for state in states: 107 | q_values = network.get_q_values(state) 108 | self.assertTrue(max(abs(q_values)) < 2) 109 | 110 | class 
TestQNetworkGetParams(unittest.TestCase): 111 | 112 | def test_params_retrievable(self): 113 | input_shape = 2 114 | batch_size = 100 115 | num_actions = 4 116 | num_hidden = 10 117 | discount = 1 118 | learning_rate = 1e-2 119 | update_rule = 'sgd' 120 | freeze_interval = 1000 121 | regularization = 0 122 | rng = None 123 | num_hidden_layers = 1 124 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 125 | 126 | params = network.get_params() 127 | self.assertTrue(params is not None) 128 | 129 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless this file is called directly") 130 | class TestQNetworkTrain(unittest.TestCase): 131 | 132 | def test_loss_with_zero_reward_same_next_state_is_zero(self): 133 | # loss is still not zero because the selected action might not be the maximum value action 134 | input_shape = 2 135 | batch_size = 1 136 | num_actions = 4 137 | num_hidden = 10 138 | discount = 1 139 | learning_rate = 1e-2 140 | update_rule = 'sgd' 141 | freeze_interval = 1000 142 | regularization = 0 143 | rng = None 144 | num_hidden_layers = 1 145 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 146 | 147 | states = np.zeros((1,2)) 148 | actions = np.zeros((1,1), dtype='int32') 149 | rewards = np.zeros((1,1)) 150 | next_states = np.zeros((1,2)) 151 | terminals = np.zeros((1,1), dtype='int32') 152 | 153 | loss = network.train(states, actions, rewards, next_states, terminals) 154 | actual = loss 155 | expected = 2 156 | self.assertTrue(actual < expected) 157 | 158 | def test_loss_with_nonzero_reward_same_next_state_is_nonzero(self): 159 | input_shape = 2 160 | batch_size = 1 161 | num_actions = 4 162 | num_hidden = 10 163 | discount = 1 164 | learning_rate = 1e-2 165 | update_rule = 'sgd' 166 | freeze_interval = 1000 167 | regularization = 0 168 | rng = None 169 | num_hidden_layers = 1 170 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 171 | 172 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 173 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 174 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 175 | 176 | states = np.ones((1,2), dtype=float) 177 | actions = np.zeros((1,1), dtype='int32') 178 | rewards = np.ones((1,1), dtype='int32') 179 | next_states = np.ones((1,2), dtype=float) 180 | terminals = np.zeros((1,1), dtype='int32') 181 | 182 | loss = network.train(states, actions, rewards, next_states, terminals) 183 | actual = loss 184 | expected = 0.5 185 | self.assertEquals(actual, expected) 186 | 187 | def test_overfit_simple_artificial_dataset(self): 188 | input_shape = 1 189 | batch_size = 10 190 | num_actions = 2 191 | num_hidden = 2 192 | discount = 1 193 | learning_rate = 1 194 | update_rule = 'adam' 195 | freeze_interval = 100 196 | regularization = 0 197 | rng = None 198 | num_hidden_layers = 1 199 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 200 | 201 | rm = replay_memory.ReplayMemory(batch_size) 202 | # state 0 to state 1 reward +1 203 | for idx in range(20): 204 | state = 
np.array([0]) 205 | next_state = np.array([1]) 206 | action = 1 207 | reward = 1 208 | terminal = 1 209 | rm.store((state, action, reward, next_state, terminal)) 210 | 211 | # state 0 to state 0 reward -1 212 | for idx in range(20): 213 | switch = random.randint(0,1) 214 | state = np.array([0]) 215 | next_state = np.array([0]) 216 | action = 0 217 | reward = -1 218 | terminal = 0 219 | rm.store((state, action, reward, next_state, terminal)) 220 | 221 | print rm.terminal_count 222 | print_data = False 223 | l = logger.Logger('test') 224 | counter = 0 225 | while True: 226 | counter += 1 227 | states, actions, rewards, next_states, terminals = rm.sample_batch() 228 | loss = network.train(states, actions, rewards, next_states, terminals) 229 | l.log_loss(loss) 230 | 231 | 232 | if counter % 100 == 0: 233 | l.log_epoch(counter) 234 | Q = {} 235 | s0 = network.get_q_values(np.array([0])) 236 | Q['s0_a0'] = s0[0] 237 | Q['s0_a1'] = s0[1] 238 | s1 = network.get_q_values(np.array([1])) 239 | Q['s1_a0'] = s1[0] 240 | Q['s1_a1'] = s1[1] 241 | 242 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless this file is called directly") 243 | class TestQNetworkFullOperationFlattnedState(unittest.TestCase): 244 | 245 | def test_qnetwork_solves_small_mdp(self): 246 | 247 | 248 | def run(learning_rate, freeze_interval, num_hidden, reg): 249 | room_size = 5 250 | num_rooms = 2 251 | mdp = mdps.MazeMDP(room_size, num_rooms) 252 | mdp.compute_states() 253 | mdp.EXIT_REWARD = 1 254 | mdp.MOVE_REWARD = -0.01 255 | discount = 1 256 | num_actions = len(mdp.get_actions(None)) 257 | batch_size = 100 258 | print 'building network...' 259 | network = qnetwork.QNetwork(input_shape=2 * room_size + num_rooms ** 2, batch_size=batch_size, num_hidden_layers=2, num_actions=4, num_hidden=num_hidden, discount=discount, learning_rate=learning_rate, regularization=reg, update_rule='adam', freeze_interval=freeze_interval, rng=None) 260 | num_epochs = 50 261 | epoch_length = 2 262 | test_epoch_length = 0 263 | max_steps = 4 * (room_size * num_rooms) ** 2 264 | epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5 265 | print 'building policy...' 266 | p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay) 267 | print 'building memory...' 268 | rm = replay_memory.ReplayMemory(batch_size, capacity=50000) 269 | print 'building logger...' 270 | log = logger.NeuralLogger(agent_name='QNetwork') 271 | print 'building state adapter...' 272 | adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=room_size, num_rooms=num_rooms) 273 | # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms) 274 | # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms) 275 | # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms) 276 | # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 277 | print 'building agent...' 
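# The input_shape used for the network above works out to
#   2 * room_size + num_rooms ** 2 = 2 * 5 + 2 ** 2 = 14,
# which should match the length of the adapted state (assuming
# CoordinatesToRowColRoomAdapter one-hot encodes the row within a room, the
# column within a room, and the room index over the num_rooms ** 2 rooms --
# an assumption about the adapter, not taken from its implementation).
# The commented-out adapters above would call for different input_shape values.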
278 | a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm, log=log, state_adapter=adapter) 279 | run_tests = False 280 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, value_logging=True) 281 | e.run() 282 | 283 | ak = file_utils.load_key('../access_key.key') 284 | sk = file_utils.load_key('../secret_key.key') 285 | bucket = 'hierarchical' 286 | try: 287 | aws_util = aws_s3_utility.S3Utility(ak, sk, bucket) 288 | aws_util.upload_directory(e.agent.logger.log_dir) 289 | except Exception as e: 290 | print 'error uploading to s3: {}'.format(e) 291 | 292 | for idx in range(2): 293 | lr = random.choice([.007, .006, .005]) # learning rate 294 | fi = random.choice([200, 300, 400, 500, 600, 700, 800]) # freeze interval 295 | nh = random.choice([4]) # num hidden 296 | reg = random.choice([5e-4]) # regularization 297 | print 'run number: {}'.format(idx) 298 | print lr, fi, nh, reg 299 | run(lr, fi, nh, reg) 300 | 301 | if __name__ == '__main__': 302 | unittest.main() 303 | -------------------------------------------------------------------------------- /tests/test_recurrent_qnetwork.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | import numpy as np 3 | import os 4 | import random 5 | import shutil 6 | import sys 7 | import theano 8 | import theano.tensor as T 9 | import unittest 10 | 11 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 12 | 13 | import agent 14 | import aws_s3_utility 15 | import experiment 16 | import file_utils 17 | import learning_utils 18 | import logger 19 | import mdps 20 | import policy 21 | import recurrent_qnetwork 22 | import replay_memory 23 | import state_adapters 24 | 25 | class TestRecurrentQNetworkConstruction(unittest.TestCase): 26 | 27 | def test_qnetwork_constructor_sgd(self): 28 | input_shape = 2 29 | batch_size = 10 30 | sequence_length = 1 31 | num_actions = 4 32 | num_hidden = 10 33 | discount = 1 34 | learning_rate = 1e-2 35 | update_rule = 'sgd' 36 | freeze_interval = 1000 37 | regularization = 1e-4 38 | network_type = 'single_layer_rnn' 39 | rng = None 40 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 41 | sequence_length, batch_size, num_actions, num_hidden, 42 | discount, learning_rate, regularization, update_rule, 43 | freeze_interval, network_type, rng) 44 | 45 | class TestRecurrentQNetworkTrain(unittest.TestCase): 46 | 47 | def test_loss_with_zero_reward_same_next_state_is_zero(self): 48 | input_shape = 2 49 | batch_size = 1 50 | sequence_length = 1 51 | num_actions = 4 52 | num_hidden = 5 53 | discount = 1 54 | learning_rate = 1e-2 55 | update_rule = 'sgd' 56 | freeze_interval = 1000 57 | regularization = 1e-4 58 | network_type = 'single_layer_rnn' 59 | rng = None 60 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 61 | sequence_length, batch_size, num_actions, num_hidden, 62 | discount, learning_rate, regularization, update_rule, 63 | freeze_interval, network_type, rng) 64 | 65 | states = np.zeros((1,1,2)) 66 | actions = np.zeros((1,1), dtype='int32') 67 | rewards = np.zeros((1,1)) 68 | next_states = np.zeros((1,1,2)) 69 | terminals = np.zeros((1,1), dtype='int32') 70 | 71 | loss = network.train(states, actions, rewards, next_states, terminals) 72 | actual = loss 73 | expected = 2 74 | self.assertTrue(actual < expected) 75 | 76 | def test_loss_with_nonzero_reward_same_next_state_is_nonzero(self): 77 | input_shape = 2 78 | batch_size = 1 79 | 
sequence_length = 1 80 | num_actions = 4 81 | num_hidden = 10 82 | discount = 1 83 | learning_rate = 1e-2 84 | update_rule = 'sgd' 85 | freeze_interval = 1000 86 | regularization = 1e-4 87 | network_type = 'single_layer_rnn' 88 | rng = None 89 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 90 | sequence_length, batch_size, num_actions, num_hidden, 91 | discount, learning_rate, regularization, update_rule, 92 | freeze_interval, network_type, rng) 93 | 94 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 95 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 96 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 97 | 98 | states = np.ones((1,1,2), dtype=float) 99 | actions = np.zeros((1,1), dtype='int32') 100 | rewards = np.ones((1,1), dtype='int32') 101 | next_states = np.ones((1,1,2), dtype=float) 102 | terminals = np.zeros((1,1), dtype='int32') 103 | 104 | loss = network.train(states, actions, rewards, next_states, terminals) 105 | actual = loss 106 | expected = 0.5 107 | self.assertEquals(actual, expected) 108 | 109 | def test_loss_with_nonzero_reward_same_next_state_is_nonzero_large_batch_size(self): 110 | input_shape = 2 111 | batch_size = 10 112 | sequence_length = 1 113 | num_actions = 4 114 | num_hidden = 10 115 | discount = 1 116 | learning_rate = 1e-2 117 | update_rule = 'sgd' 118 | freeze_interval = 1000 119 | regularization = 1e-4 120 | network_type = 'single_layer_rnn' 121 | rng = None 122 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 123 | sequence_length, batch_size, num_actions, num_hidden, 124 | discount, learning_rate, regularization, update_rule, 125 | freeze_interval, network_type, rng) 126 | 127 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 128 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 129 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 130 | 131 | states = np.ones((10,1,2), dtype=float) 132 | actions = np.zeros((10,1), dtype='int32') 133 | rewards = np.ones((10,1), dtype='int32') 134 | next_states = np.ones((10,1,2), dtype=float) 135 | terminals = np.zeros((10,1), dtype='int32') 136 | 137 | loss = network.train(states, actions, rewards, next_states, terminals) 138 | actual = loss 139 | expected = 5.0 140 | self.assertEquals(actual, expected) 141 | 142 | def test_loss_not_impacted_by_hid_init(self): 143 | input_shape = 2 144 | batch_size = 10 145 | sequence_length = 1 146 | num_actions = 4 147 | num_hidden = 10 148 | discount = 1 149 | learning_rate = 0 150 | update_rule = 'sgd' 151 | freeze_interval = 1000 152 | regularization = 1e-4 153 | network_type = 'single_layer_rnn' 154 | rng = None 155 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 156 | sequence_length, batch_size, num_actions, num_hidden, 157 | discount, learning_rate, regularization, update_rule, 158 | freeze_interval, network_type, rng) 159 | 160 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 161 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 162 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 163 | 164 | states = np.ones((10,1,2), dtype=float) 165 | actions = np.zeros((10,1), dtype='int32') 166 | rewards = np.ones((10,1), dtype='int32') 167 | next_states = np.ones((10,1,2), dtype=float) 168 | terminals = np.zeros((10,1), dtype='int32') 169 | 170 | loss_before_q_values = network.train(states, actions, rewards, next_states, 
terminals) 171 | 172 | state = np.ones((1,1,2), dtype=float) 173 | q_values_without_hid_init = network.get_q_values(state).tolist() 174 | 175 | loss_after_q_values = network.train(states, actions, rewards, next_states, terminals) 176 | 177 | self.assertEquals(loss_before_q_values, loss_after_q_values) 178 | 179 | class TestRecurrentQNetworkGetQValues(unittest.TestCase): 180 | 181 | def test_get_q_values_hid_init_impacts_q_values(self): 182 | input_shape = 2 183 | batch_size = 10 184 | sequence_length = 1 185 | num_actions = 4 186 | num_hidden = 10 187 | discount = 1 188 | learning_rate = 1e-2 189 | update_rule = 'sgd' 190 | freeze_interval = 1000 191 | regularization = 1e-4 192 | network_type = 'single_layer_rnn' 193 | rng = None 194 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 195 | sequence_length, batch_size, num_actions, num_hidden, 196 | discount, learning_rate, regularization, update_rule, 197 | freeze_interval, network_type, rng) 198 | 199 | state = np.ones((1,1,2), dtype=float) 200 | q_values_without_hid_init = network.get_q_values(state).tolist() 201 | q_values_with_hid_init = network.get_q_values(state).tolist() 202 | self.assertNotEquals(q_values_without_hid_init, q_values_with_hid_init) 203 | 204 | def test_get_q_values_hid_init_does_not_impact_q_values(self): 205 | input_shape = 2 206 | batch_size = 10 207 | sequence_length = 1 208 | num_actions = 4 209 | num_hidden = 10 210 | discount = 1 211 | learning_rate = 1e-2 212 | update_rule = 'sgd' 213 | freeze_interval = 1000 214 | regularization = 1e-4 215 | network_type = 'single_layer_rnn' 216 | rng = None 217 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 218 | sequence_length, batch_size, num_actions, num_hidden, 219 | discount, learning_rate, regularization, update_rule, 220 | freeze_interval, network_type, rng) 221 | 222 | state = np.ones((1,1,2), dtype=float) 223 | network.finish_episode() 224 | q_values_without_hid_init = network.get_q_values(state).tolist() 225 | network.finish_episode() 226 | q_values_after_hid_init = network.get_q_values(state).tolist() 227 | self.assertEquals(q_values_without_hid_init, q_values_after_hid_init) 228 | 229 | def test_initial_q_values(self): 230 | # if just one of these is 1, (or two are 1) why does a pattern arise? 231 | input_shape = 20 232 | batch_size = 10 233 | sequence_length = 2 234 | num_actions = 4 235 | num_hidden = 4 236 | discount = 1 237 | learning_rate = 1e-2 238 | update_rule = 'adam' 239 | freeze_interval = 1000 240 | regularization = 1e-4 241 | network_type = 'single_layer_lstm' 242 | rng = None 243 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 244 | sequence_length, batch_size, num_actions, num_hidden, 245 | discount, learning_rate, regularization, update_rule, 246 | freeze_interval, network_type, rng) 247 | 248 | values = [] 249 | for r in range(10): 250 | row_values = [] 251 | for c in range(10): 252 | r_state = np.zeros(10, dtype=float) 253 | c_state = np.zeros(10, dtype=float) 254 | r_state[r] = 1 255 | c_state[c] = 1 256 | state = np.hstack((r_state, c_state)) 257 | max_q_value = max(network.get_q_values(state).tolist()) 258 | row_values.append(max_q_value) 259 | values.append(row_values) 260 | 261 | 262 | # why is cell init nonzero? 
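# One way to inspect the pattern asked about in test_initial_q_values above is
# to render the collected `values` grid as an image. This is a sketch only, not
# part of the original test; it assumes matplotlib is available, as it is
# elsewhere in these tests:
#
#   import matplotlib.pyplot as plt
#   plt.imshow(values, interpolation='nearest')
#   plt.colorbar()
#   plt.savefig('initial_max_q_values.png')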
263 | def test_for_zero_cell_init_with_len_1_sequences(self): 264 | input_shape = 2 265 | batch_size = 2 266 | sequence_length = 1 267 | num_actions = 2 268 | num_hidden = 1 269 | discount = 1 270 | learning_rate = 1 271 | update_rule = 'adam' 272 | freeze_interval = 1 273 | regularization = 1e-4 274 | network_type = 'single_layer_lstm' 275 | rng = None 276 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 277 | sequence_length, batch_size, num_actions, num_hidden, 278 | discount, learning_rate, regularization, update_rule, 279 | freeze_interval, network_type, rng) 280 | 281 | print 'BEFORE' 282 | params = lasagne.layers.get_all_params(network.l_out) 283 | param_values = lasagne.layers.get_all_param_values(network.l_out) 284 | for p, v in zip(params, param_values): 285 | print p 286 | print v 287 | print '\n' 288 | 289 | states = np.ones((batch_size, sequence_length, input_shape)) 290 | actions = np.ones((batch_size, 1), dtype='int32') 291 | rewards = np.ones((batch_size, 1)) 292 | next_states = np.ones((batch_size, sequence_length, input_shape)) 293 | terminals = np.zeros((batch_size, 1), dtype='int32') 294 | network.train(states, actions, rewards, next_states, terminals) 295 | 296 | print 'AFTER 1' 297 | params = lasagne.layers.get_all_params(network.l_out) 298 | param_values = lasagne.layers.get_all_param_values(network.l_out) 299 | for p, v in zip(params, param_values): 300 | print p 301 | print v 302 | print '\n' 303 | 304 | network.train(states, actions, rewards, next_states, terminals) 305 | 306 | print 'AFTER 2' 307 | params = lasagne.layers.get_all_params(network.l_out) 308 | param_values = lasagne.layers.get_all_param_values(network.l_out) 309 | for p, v in zip(params, param_values): 310 | print p 311 | print v 312 | print '\n' 313 | 314 | class TestRecurrentQNetworkSaturation(unittest.TestCase): 315 | 316 | def test_negative_saturation_rnn(self): 317 | input_shape = 2 318 | batch_size = 2 319 | sequence_length = 2 320 | num_actions = 2 321 | num_hidden = 1 322 | discount = 1 323 | learning_rate = 1 324 | update_rule = 'adam' 325 | freeze_interval = 1 326 | regularization = 1e-4 327 | network_type = 'single_layer_rnn' 328 | rng = None 329 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 330 | sequence_length, batch_size, num_actions, num_hidden, 331 | discount, learning_rate, regularization, update_rule, 332 | freeze_interval, network_type, rng) 333 | 334 | reward_multiplier = -10000 335 | 336 | for idx in range(100): 337 | states = np.ones((batch_size, sequence_length, input_shape)) 338 | 339 | action_multiplier = random.choice([0,1]) 340 | actions = np.ones((batch_size, 1), dtype='int32') * action_multiplier 341 | rewards = np.ones((batch_size, 1)) * reward_multiplier 342 | next_states = np.ones((batch_size, sequence_length, input_shape)) 343 | terminals = np.zeros((batch_size, 1), dtype='int32') 344 | network.train(states, actions, rewards, next_states, terminals) 345 | 346 | q_values = network.get_q_values(states[0]).tolist() 347 | print q_values 348 | self.assertTrue(sum(q_values) < 0) 349 | 350 | def test_negative_saturation_lstm(self): 351 | input_shape = 2 352 | batch_size = 2 353 | sequence_length = 2 354 | num_actions = 2 355 | num_hidden = 1 356 | discount = 1 357 | learning_rate = 1 358 | update_rule = 'adam' 359 | freeze_interval = 1 360 | regularization = 1e-4 361 | network_type = 'single_layer_lstm' 362 | rng = None 363 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 364 | sequence_length, batch_size, num_actions, num_hidden, 
365 | discount, learning_rate, regularization, update_rule, 366 | freeze_interval, network_type, rng) 367 | 368 | reward_multiplier = -10000 369 | 370 | for idx in range(100): 371 | states = np.ones((batch_size, sequence_length, input_shape)) 372 | 373 | action_multiplier = random.choice([0,1]) 374 | actions = np.ones((batch_size, 1), dtype='int32') * action_multiplier 375 | rewards = np.ones((batch_size, 1)) * reward_multiplier 376 | next_states = np.ones((batch_size, sequence_length, input_shape)) 377 | terminals = np.zeros((batch_size, 1), dtype='int32') 378 | network.train(states, actions, rewards, next_states, terminals) 379 | 380 | # all the params in the lstm layer become positive 381 | # all the params in linear output layer become negative 382 | # params = lasagne.layers.get_all_params(network.l_out) 383 | # param_values = lasagne.layers.get_all_param_values(network.l_out) 384 | # for p, v in zip(params, param_values): 385 | # print p 386 | # print v 387 | # print '\n' 388 | 389 | q_values = network.get_q_values(states[0]).tolist() 390 | self.assertTrue(sum(q_values) < 0) 391 | 392 | def test_positive_saturation_lstm(self): 393 | input_shape = 2 394 | batch_size = 2 395 | sequence_length = 2 396 | num_actions = 2 397 | num_hidden = 1 398 | discount = 1 399 | learning_rate = 1 400 | update_rule = 'adam' 401 | freeze_interval = 1 402 | regularization = 1e-4 403 | network_type = 'single_layer_lstm' 404 | rng = None 405 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 406 | sequence_length, batch_size, num_actions, num_hidden, 407 | discount, learning_rate, regularization, update_rule, 408 | freeze_interval, network_type, rng) 409 | 410 | reward_multiplier = 10000 411 | 412 | for idx in range(100): 413 | states = np.ones((batch_size, sequence_length, input_shape)) 414 | 415 | action_multiplier = random.choice([0,1]) 416 | actions = np.ones((batch_size, 1), dtype='int32') * action_multiplier 417 | rewards = np.ones((batch_size, 1)) * reward_multiplier 418 | next_states = np.ones((batch_size, sequence_length, input_shape)) 419 | terminals = np.zeros((batch_size, 1), dtype='int32') 420 | network.train(states, actions, rewards, next_states, terminals) 421 | 422 | # # everything becomes positive 423 | # params = lasagne.layers.get_all_params(network.l_out) 424 | # param_values = lasagne.layers.get_all_param_values(network.l_out) 425 | # for p, v in zip(params, param_values): 426 | # print p 427 | # print v 428 | # print '\n' 429 | 430 | q_values = network.get_q_values(states[0]).tolist() 431 | self.assertTrue(sum(q_values) > 0) 432 | 433 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless \ 434 | this file is called directly") 435 | class TestRecurrentQNetworkFullOperationFlattnedState(unittest.TestCase): 436 | 437 | def test_qnetwork_solves_small_mdp(self): 438 | 439 | def run(learning_rate, freeze_interval, num_hidden, reg, seq_len, eps, nt, update): 440 | room_size = 5 441 | num_rooms = 2 442 | input_shape = 2 * room_size 443 | print 'building mdp...' 444 | mdp = mdps.MazeMDP(room_size, num_rooms) 445 | mdp.compute_states() 446 | mdp.EXIT_REWARD = 1 447 | mdp.MOVE_REWARD = -0.01 448 | network_type = nt 449 | discount = 1 450 | sequence_length = seq_len 451 | num_actions = len(mdp.get_actions(None)) 452 | batch_size = 100 453 | update_rule = update 454 | print 'building network...' 
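# For intuition only: an encoding consistent with input_shape = 2 * room_size
# (this is an assumption about what the CoordinatesToSingleRoomRowColAdapter
# built below produces, not taken from its implementation) is a one-hot row and
# a one-hot column *within* the current room, with no explicit room indicator,
# so the recurrent network presumably has to rely on its history of the last
# sequence_length observations to tell the rooms apart.
def encode_single_room_row_col(state):
    # hypothetical helper, only illustrating the assumed encoding; unused below
    row, col = state[0] % room_size, state[1] % room_size
    encoding = np.zeros(2 * room_size)
    encoding[row] = 1.0
    encoding[room_size + col] = 1.0
    return encoding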
455 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape=input_shape, 456 | sequence_length=sequence_length, batch_size=batch_size, 457 | num_actions=4, num_hidden=num_hidden, discount=discount, 458 | learning_rate=learning_rate, regularization=reg, 459 | update_rule=update_rule, freeze_interval=freeze_interval, 460 | network_type=network_type, rng=None) 461 | 462 | # take this many steps because (very loosely): 463 | # let l be the step length 464 | # let d be the difference in start and end locations 465 | # let N be the number of steps for the agent to travel a distance d 466 | # then N ~ (d/l)^2 // assuming this is a random walk 467 | # with l = 1, this gives d^2 in order to make it N steps away 468 | # the desired distance here is to walk along both dimensions of the maze 469 | # which is equal to two times the num_rooms * room_size 470 | # so squaring that gives a loose approximation to the number of 471 | # steps needed (discounting that this is actually a lattice (does it really matter?)) 472 | # (also discounting the walls) 473 | # see: http://mathworld.wolfram.com/RandomWalk2-Dimensional.html 474 | max_steps = (2 * room_size * num_rooms) ** 2 475 | num_epochs = 500 476 | epoch_length = 1 477 | test_epoch_length = 0 478 | epsilon_decay = (num_epochs * epoch_length * max_steps) / 4 479 | print 'building adapter...' 480 | adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 481 | print 'building policy...' 482 | p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay) 483 | print 'building replay memory...' 484 | # want to track at minimum the last 50 episodes 485 | capacity = max_steps * 50 486 | rm = replay_memory.SequenceReplayMemory(input_shape=input_shape, 487 | sequence_length=sequence_length, batch_size=batch_size, capacity=capacity) 488 | print 'building logger...' 489 | log = logger.NeuralLogger(agent_name=network_type) 490 | print 'building agent...' 491 | a = agent.RecurrentNeuralAgent(network=network, policy=p, 492 | replay_memory=rm, log=log, state_adapter=adapter) 493 | run_tests = False 494 | print 'building experiment...' 495 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, 496 | test_epoch_length, max_steps, run_tests, value_logging=True) 497 | print 'running experiment...' 
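# Plugging the values configured above into the random-walk estimate:
#   d = 2 * room_size * num_rooms = 2 * 5 * 2 = 20
#   max_steps = d ** 2 = 400
#   replay capacity = max_steps * 50 = 20,000 transitions
#   epsilon_decay = (num_epochs * epoch_length * max_steps) / 4
#                 = (500 * 1 * 400) / 4 = 50,000 actions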
498 | e.run() 499 | 500 | ak = file_utils.load_key('../access_key.key') 501 | sk = file_utils.load_key('../secret_key.key') 502 | bucket = 'hierarchical9' 503 | try: 504 | aws_util = aws_s3_utility.S3Utility(ak, sk, bucket) 505 | aws_util.upload_directory(e.agent.logger.log_dir) 506 | except Exception as e: 507 | print 'error uploading to s3: {}'.format(e) 508 | 509 | # net_types = ['single_layer_lstm', 'stacked_lstm', 'stacked_lstm_with_merge', 'hierarchical_stacked_lstm_with_merge'] 510 | net_types = ['connected_clockwork_lstm', 'disconnected_clockwork_lstm'] 511 | for idx in range(10): 512 | lr = random.choice([.01]) 513 | fi = random.choice([100]) 514 | nh = random.choice([64]) 515 | reg = random.choice([1e-4]) 516 | seq_len = random.choice([16]) 517 | eps = random.choice([.5]) 518 | nt = net_types[idx % len(net_types)] 519 | up = random.choice(['sgd+nesterov']) 520 | 521 | print 'run number: {}'.format(idx) 522 | print 'learning_rate: {} frozen_interval: \ 523 | {} num_hidden: {} reg: {} sequence_length: \ 524 | {} eps: {} network_type: {}'.format(lr,fi,nh, reg, seq_len, eps, nt) 525 | run(lr, fi, nh, reg, seq_len, eps, nt, up) 526 | 527 | if __name__ == '__main__': 528 | unittest.main() 529 | -------------------------------------------------------------------------------- /tests/test_replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 7 | 8 | import replay_memory 9 | 10 | class TestReplayMemorySampleBatch(unittest.TestCase): 11 | 12 | def test_minibatch_sample_shapes_1D_state(self): 13 | batch_size = 100 14 | state_shape = 2 15 | rm = replay_memory.ReplayMemory(batch_size) 16 | for idx in range(1000): 17 | state = np.ones(state_shape) 18 | action = 0 19 | reward = 0 20 | next_state = np.ones(state_shape) 21 | terminal = 0 22 | rm.store((state, action, reward, next_state, terminal)) 23 | 24 | states, actions, rewards, next_states, terminals = rm.sample_batch() 25 | self.assertEquals(states.shape, (batch_size, state_shape)) 26 | self.assertEquals(actions.shape, (batch_size, 1)) 27 | self.assertEquals(rewards.shape, (batch_size, 1)) 28 | self.assertEquals(next_states.shape, (batch_size, state_shape)) 29 | self.assertEquals(terminals.shape, (batch_size, 1)) 30 | 31 | def test_minibatch_sample_shapes_multidimensional_state(self): 32 | batch_size = 100 33 | state_shape = (1,2,2) 34 | rm = replay_memory.ReplayMemory(batch_size) 35 | for idx in range(1000): 36 | state = np.ones(state_shape) 37 | action = 0 38 | reward = 0 39 | next_state = np.ones(state_shape) 40 | terminal = 0 41 | rm.store((state, action, reward, next_state, terminal)) 42 | 43 | states, actions, rewards, next_states, terminals = rm.sample_batch() 44 | expected_states_shape = (batch_size,) + state_shape 45 | 46 | self.assertEquals(states.shape, expected_states_shape) 47 | self.assertEquals(actions.shape, (batch_size, 1)) 48 | self.assertEquals(rewards.shape, (batch_size, 1)) 49 | self.assertEquals(next_states.shape, expected_states_shape) 50 | self.assertEquals(terminals.shape, (batch_size, 1)) 51 | 52 | 53 | def test_minibatch_sample_shapes_multidimensional_state_broadcast_check(self): 54 | batch_size = 100 55 | state_shape = (1,2,1) 56 | rm = replay_memory.ReplayMemory(batch_size) 57 | for idx in range(1000): 58 | state = np.ones(state_shape) 59 | action = 0 60 | reward = 0 61 | next_state = 
np.ones(state_shape) 62 | terminal = 0 63 | rm.store((state, action, reward, next_state, terminal)) 64 | 65 | states, actions, rewards, next_states, terminals = rm.sample_batch() 66 | expected_states_shape = (batch_size,) + state_shape 67 | 68 | self.assertEquals(states.shape, expected_states_shape) 69 | self.assertEquals(actions.shape, (batch_size, 1)) 70 | self.assertEquals(rewards.shape, (batch_size, 1)) 71 | self.assertEquals(next_states.shape, expected_states_shape) 72 | self.assertEquals(terminals.shape, (batch_size, 1)) 73 | 74 | class TestSequenceReplayMemorySampleBatch(unittest.TestCase): 75 | 76 | def test_minibatch_sample_shapes_1D_state_sequence_length_1(self): 77 | batch_size = 100 78 | state_shape = 2 79 | sequence_length = 1 80 | capacity = 1000 81 | rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity) 82 | for idx in range(1000): 83 | state = np.ones(state_shape) 84 | action = 0 85 | reward = 0 86 | next_state = np.ones(state_shape) 87 | terminal = False 88 | rm.store(state, action, reward, terminal) 89 | 90 | states, actions, rewards, next_states, terminals = rm.sample_batch() 91 | self.assertEquals(states.shape, (batch_size, sequence_length, state_shape)) 92 | self.assertEquals(actions.shape, (batch_size, 1)) 93 | self.assertEquals(rewards.shape, (batch_size, 1)) 94 | self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape)) 95 | self.assertEquals(terminals.shape, (batch_size, 1)) 96 | 97 | def test_minibatch_sample_shapes_1D_state_sequence_length_2(self): 98 | batch_size = 10 99 | state_shape = 2 100 | sequence_length = 2 101 | capacity = 1000 102 | rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity) 103 | for idx in range(1000): 104 | state = np.ones(state_shape) 105 | action = 0 106 | reward = 0 107 | next_state = np.ones(state_shape) 108 | terminal = False 109 | rm.store(state, action, reward, terminal) 110 | 111 | states, actions, rewards, next_states, terminals = rm.sample_batch() 112 | self.assertEquals(states.shape, (batch_size, sequence_length, state_shape)) 113 | self.assertEquals(states.sum(), batch_size * sequence_length * state_shape) 114 | self.assertEquals(actions.shape, (batch_size, 1)) 115 | self.assertEquals(rewards.shape, (batch_size, 1)) 116 | self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape)) 117 | self.assertEquals(next_states.sum(), batch_size * sequence_length * state_shape) 118 | self.assertEquals(terminals.shape, (batch_size, 1)) 119 | 120 | def test_minibatch_sample_shapes_1D_state_terminal(self): 121 | batch_size = 200 122 | state_shape = 2 123 | sequence_length = 2 124 | capacity = 1000 125 | rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity) 126 | prev_state_terminal = False 127 | for idx in range(1, 1001): 128 | action = 0 129 | reward = 0 130 | state = np.ones(state_shape) * idx 131 | state = state if not prev_state_terminal else np.zeros(state_shape) 132 | prev_state_terminal = False if np.random.random() < .8 else True 133 | rm.store(state, action, reward, prev_state_terminal) 134 | 135 | states, actions, rewards, next_states, terminals = rm.sample_batch() 136 | for state, next_state, terminal in zip(states, next_states, terminals): 137 | if terminal: 138 | self.assertEquals(next_state.tolist()[-1], np.zeros(state_shape).tolist()) 139 | 140 | def test_minibatch_sample_shapes_multidimensional_state_sequence_length_1(self): 141 | batch_size = 100 142 | state_shape = 
74 | class TestSequenceReplayMemorySampleBatch(unittest.TestCase):
75 | 
76 |     def test_minibatch_sample_shapes_1D_state_sequence_length_1(self):
77 |         batch_size = 100
78 |         state_shape = 2
79 |         sequence_length = 1
80 |         capacity = 1000
81 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
82 |         for idx in range(1000):
83 |             state = np.ones(state_shape)
84 |             action = 0
85 |             reward = 0
86 |             next_state = np.ones(state_shape)
87 |             terminal = False
88 |             rm.store(state, action, reward, terminal)
89 | 
90 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
91 |         self.assertEquals(states.shape, (batch_size, sequence_length, state_shape))
92 |         self.assertEquals(actions.shape, (batch_size, 1))
93 |         self.assertEquals(rewards.shape, (batch_size, 1))
94 |         self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape))
95 |         self.assertEquals(terminals.shape, (batch_size, 1))
96 | 
97 |     def test_minibatch_sample_shapes_1D_state_sequence_length_2(self):
98 |         batch_size = 10
99 |         state_shape = 2
100 |         sequence_length = 2
101 |         capacity = 1000
102 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
103 |         for idx in range(1000):
104 |             state = np.ones(state_shape)
105 |             action = 0
106 |             reward = 0
107 |             next_state = np.ones(state_shape)
108 |             terminal = False
109 |             rm.store(state, action, reward, terminal)
110 | 
111 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
112 |         self.assertEquals(states.shape, (batch_size, sequence_length, state_shape))
113 |         self.assertEquals(states.sum(), batch_size * sequence_length * state_shape)
114 |         self.assertEquals(actions.shape, (batch_size, 1))
115 |         self.assertEquals(rewards.shape, (batch_size, 1))
116 |         self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape))
117 |         self.assertEquals(next_states.sum(), batch_size * sequence_length * state_shape)
118 |         self.assertEquals(terminals.shape, (batch_size, 1))
119 | 
120 |     def test_minibatch_sample_shapes_1D_state_terminal(self):
121 |         batch_size = 200
122 |         state_shape = 2
123 |         sequence_length = 2
124 |         capacity = 1000
125 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
126 |         prev_state_terminal = False
127 |         for idx in range(1, 1001):
128 |             action = 0
129 |             reward = 0
130 |             state = np.ones(state_shape) * idx
131 |             state = state if not prev_state_terminal else np.zeros(state_shape)
132 |             prev_state_terminal = False if np.random.random() < .8 else True
133 |             rm.store(state, action, reward, prev_state_terminal)
134 | 
135 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
136 |         for state, next_state, terminal in zip(states, next_states, terminals):
137 |             if terminal:
138 |                 self.assertEquals(next_state.tolist()[-1], np.zeros(state_shape).tolist())
139 | 
140 |     def test_minibatch_sample_shapes_multidimensional_state_sequence_length_1(self):
141 |         batch_size = 100
142 |         state_shape = (1,2,2)
143 |         sequence_length = 1
144 |         capacity = 1000
145 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
146 |         for idx in range(1000):
147 |             state = np.ones(state_shape)
148 |             action = 0
149 |             reward = 0
150 |             next_state = np.ones(state_shape)
151 |             terminal = False
152 |             rm.store(state, action, reward, terminal)
153 | 
154 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
155 |         expected_states_shape = (batch_size,) + (sequence_length,) + state_shape
156 | 
157 |         self.assertEquals(states.shape, expected_states_shape)
158 |         self.assertEquals(actions.shape, (batch_size, 1))
159 |         self.assertEquals(rewards.shape, (batch_size, 1))
160 |         self.assertEquals(next_states.shape, expected_states_shape)
161 |         self.assertEquals(terminals.shape, (batch_size, 1))
162 | 
163 |     def test_minibatch_sample_shapes_multidimensional_state_sequence_length_2(self):
164 |         batch_size = 100
165 |         state_shape = (1,2,2)
166 |         sequence_length = 2
167 |         capacity = 1000
168 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
169 |         for idx in range(1000):
170 |             state = np.ones(state_shape)
171 |             action = 0
172 |             reward = 0
173 |             next_state = np.ones(state_shape)
174 |             terminal = False
175 |             rm.store(state, action, reward, terminal)
176 | 
177 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
178 |         expected_states_shape = (batch_size,) + (sequence_length,) + state_shape
179 | 
180 |         self.assertEquals(states.shape, expected_states_shape)
181 |         self.assertEquals(actions.shape, (batch_size, 1))
182 |         self.assertEquals(rewards.shape, (batch_size, 1))
183 |         self.assertEquals(next_states.shape, expected_states_shape)
184 |         self.assertEquals(terminals.shape, (batch_size, 1))
185 | 
186 | 
187 |     def test_minibatch_sample_shapes_multidimensional_state_broadcast_check(self):
188 |         batch_size = 100
189 |         state_shape = (1,2,1)
190 |         sequence_length = 2
191 |         capacity = 1000
192 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
193 |         for idx in range(1000):
194 |             state = np.ones(state_shape)
195 |             action = 0
196 |             reward = 0
197 |             next_state = np.ones(state_shape)
198 |             terminal = False
199 |             rm.store(state, action, reward, terminal)
200 | 
201 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
202 |         expected_states_shape = (batch_size,) + (sequence_length,) + state_shape
203 | 
204 |         self.assertEquals(states.shape, expected_states_shape)
205 |         self.assertEquals(actions.shape, (batch_size, 1))
206 |         self.assertEquals(rewards.shape, (batch_size, 1))
207 |         self.assertEquals(next_states.shape, expected_states_shape)
208 |         self.assertEquals(terminals.shape, (batch_size, 1))
209 | 
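The make_last_sequence tests below fix the behavior used at action-selection time: given the current state, build a window of the last sequence_length - 1 stored states followed by that state, zero out every position at or before the most recent terminal state in the window, and left-pad with zero states when too few steps have been stored. The following standalone sketch applies that rule to a plain list of (state, terminal) pairs; it is an editor's reconstruction from the expected values in the tests, not the circular-buffer implementation in scripts/replay_memory.py.

import numpy as np

def make_last_sequence_sketch(stored, new_state, sequence_length, state_shape):
    # stored: list of (state, terminal) pairs, oldest first
    n = sequence_length - 1
    window = stored[-n:] if n > 0 else []
    pad = n - len(window)
    # left-pad with zero states when fewer than n steps have been stored
    seq = [np.zeros(state_shape) for _ in range(pad)]
    seq += [np.asarray(s) for s, _ in window]
    seq.append(np.asarray(new_state))
    # zero everything at or before the most recent terminal in the window
    terminals = [t for _, t in window]
    if any(terminals):
        last_terminal = pad + max(i for i, t in enumerate(terminals) if t)
        for i in range(last_terminal + 1):
            seq[i] = np.zeros(state_shape)
    return np.array(seq)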
210 | class TestSequenceReplayMemoryMakeLastSequence(unittest.TestCase):
211 | 
212 |     def test_make_last_sequence_basic_operation(self):
213 |         batch_size = 10
214 |         state_shape = 2
215 |         sequence_length = 3
216 |         capacity = 30
217 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
218 | 
219 |         for idx in range(4):
220 |             state = np.ones(state_shape)
221 |             action = 0
222 |             reward = 0
223 |             next_state = np.ones(state_shape)
224 |             terminal = False
225 |             rm.store(state, action, reward, terminal)
226 | 
227 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
228 |         expected = [[1, 1], [1, 1], [0, 1]]
229 |         self.assertEquals(actual, expected)
230 | 
231 |     def test_make_last_sequence_preceding_state_terminal(self):
232 |         batch_size = 10
233 |         state_shape = 2
234 |         sequence_length = 3
235 |         capacity = 30
236 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
237 | 
238 |         state = np.ones(state_shape)
239 |         action = 0
240 |         reward = 0
241 |         next_state = np.ones(state_shape)
242 |         terminal = False
243 |         rm.store(state, action, reward, terminal)
244 |         terminal = True
245 |         rm.store(state, action, reward, terminal)
246 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
247 |         expected = [[0, 0], [0, 0], [0, 1]]
248 |         self.assertEquals(actual, expected)
249 | 
250 |     def test_make_last_sequence_some_previous_state_terminal_not_in_sequence(self):
251 |         batch_size = 10
252 |         state_shape = 2
253 |         sequence_length = 3
254 |         capacity = 30
255 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
256 | 
257 |         state = np.ones(state_shape)
258 |         action = 0
259 |         reward = 0
260 |         next_state = np.ones(state_shape)
261 |         terminal = True
262 |         rm.store(state, action, reward, terminal)
263 |         terminal = False
264 |         for idx in range(10):
265 |             rm.store(state, action, reward, terminal)
266 | 
267 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
268 |         expected = [[1, 1], [1, 1], [0, 1]]
269 |         self.assertEquals(actual, expected)
270 | 
271 |     def test_make_last_sequence_terminal_state_within_sequence_but_not_preceding(self):
272 |         batch_size = 10
273 |         state_shape = 2
274 |         sequence_length = 4
275 |         capacity = 30
276 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
277 | 
278 |         # tuple 1
279 |         state = np.ones(state_shape)
280 |         action = 0
281 |         reward = 0
282 |         next_state = np.ones(state_shape)
283 |         terminal = False
284 |         rm.store(state, action, reward, terminal)
285 | 
286 |         # tuple 2
287 |         terminal = True
288 |         rm.store(state, action, reward, terminal)
289 | 
290 |         # tuple 3
291 |         terminal = False
292 |         rm.store(state, action, reward, terminal)
293 | 
294 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
295 |         expected = [[0, 0], [0, 0], [1, 1], [0, 1]]
296 |         self.assertEquals(actual, expected)
297 | 
298 |     def test_make_last_sequence_terminal_state_first_in_made_sequence(self):
299 |         batch_size = 10
300 |         state_shape = 2
301 |         sequence_length = 4
302 |         capacity = 30
303 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
304 | 
305 |         # tuple 1
306 |         state = np.ones(state_shape)
307 |         action = 0
308 |         reward = 0
309 |         next_state = np.ones(state_shape)
310 |         terminal = True
311 |         rm.store(state, action, reward, terminal)
312 | 
313 |         # tuple 2
314 |         terminal = False
315 |         rm.store(state, action, reward, terminal)
316 | 
317 |         # tuple 3
318 |         terminal = False
319 |         rm.store(state, action, reward, terminal)
320 | 
321 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
322 |         expected = [[0, 0], [1, 1], [1, 1], [0, 1]]
323 |         self.assertEquals(actual, expected)
324 | 
325 |     def test_make_last_sequence_terminal_state_first_in_made_sequence_wrap(self):
326 |         batch_size = 10
327 |         state_shape = 2
328 |         sequence_length = 4
329 |         capacity = 30
330 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
331 | 
332 |         # tuple 1
333 |         state = np.ones(state_shape)
334 |         action = 0
335 |         reward = 0
336 |         next_state = np.ones(state_shape)
337 |         terminal = False
338 |         for i in range(capacity - 1):
339 |             rm.store(state, action, reward, terminal)
340 | 
341 | 
342 |         terminal = True
343 |         rm.store(state, action, reward, terminal)
344 | 
345 |         # tuple 2
346 |         terminal = False
347 |         rm.store(state, action, reward, terminal)
348 | 
349 |         # tuple 3
350 |         terminal = False
351 |         rm.store(state, action, reward, terminal)
352 | 
353 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
354 |         expected = [[0, 0], [1, 1], [1, 1], [0, 1]]
355 |         self.assertEquals(actual, expected)
356 | 
357 | 
358 |     def test_make_last_sequence_insufficient_samples_for_full_sequence(self):
359 |         batch_size = 10
360 |         state_shape = 2
361 |         sequence_length = 4
362 |         capacity = 30
363 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
364 | 
365 |         # tuple 1
366 |         state = np.ones(state_shape)
367 |         action = 0
368 |         reward = 0
369 |         next_state = np.ones(state_shape)
370 |         terminal = False
371 |         rm.store(state, action, reward, terminal)
372 | 
373 |         # tuple 2
374 |         terminal = False
375 |         rm.store(state, action, reward, terminal)
376 | 
377 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
378 |         expected = [[0, 0], [1, 1], [1, 1], [0, 1]]
379 |         self.assertEquals(actual, expected)
380 | 
381 |     def test_make_last_sequence_empty(self):
382 |         batch_size = 10
383 |         state_shape = 2
384 |         sequence_length = 4
385 |         capacity = 30
386 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
387 | 
388 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
389 |         expected = [[0, 0], [0, 0], [0, 0], [0, 1]]
390 |         self.assertEquals(actual, expected)
391 | 
392 | 
393 | if __name__ == '__main__':
394 |     unittest.main()
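For reference, the replay-memory tests above can be run on their own with the standard unittest discovery machinery from the repository root; tests/run_tests.py presumably wraps something similar, but its interface is not shown here.

import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='test_replay_memory.py')
unittest.TextTestRunner(verbosity=2).run(suite)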
--------------------------------------------------------------------------------
/tests/test_state_adapters.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import sys
4 | import unittest
5 | 
6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts')))
7 | 
8 | import state_adapters
9 | 
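The adapter tests that follow document how maze coordinates are one-hot encoded for the agent. For the simplest adapter, CoordinatesToSingleRoomRowColAdapter, the expected vectors correspond to reducing the row and column modulo room_size and concatenating the two one-hot encodings; the other adapters additionally one-hot the absolute row/column or append a one-hot room index, with the exact orderings given by the expected lists in the tests. A small illustrative helper for the single-room case (an editor's sketch, not scripts/state_adapters.py):

import numpy as np

def single_room_row_col_encoding(state, room_size):
    # one-hot the within-room row, then the within-room column
    row, col = state
    encoding = np.zeros(2 * room_size)
    encoding[row % room_size] = 1
    encoding[room_size + (col % room_size)] = 1
    return encoding

# e.g. with room_size=3, state (4, 4) sits at (1, 1) within its room:
# single_room_row_col_encoding((4, 4), 3) -> [0, 1, 0, 0, 1, 0]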
10 | class TestCoordinatesToSingleRoomRowColAdapter(unittest.TestCase):
11 | 
12 |     def test_convert_state_to_agent_format_first_room(self):
13 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
14 |         mdp_formatted_state = (2, 2)
15 |         expected = [0,0,1,0,0,1]
16 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
17 |         self.assertEquals(actual, expected)
18 | 
19 |     def test_convert_state_to_agent_format_fourth_room(self):
20 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
21 |         mdp_formatted_state = (4, 4)
22 |         expected = [0,1,0,0,1,0]
23 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
24 |         self.assertEquals(actual, expected)
25 | 
26 |     def test_convert_state_to_agent_format_off_diagonal_room(self):
27 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
28 |         mdp_formatted_state = (0, 4)
29 |         expected = [1,0,0,0,1,0]
30 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
31 |         self.assertEquals(actual, expected)
32 | 
33 |     def test_convert_state_to_agent_format_off_fourth_room_first_square(self):
34 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
35 |         mdp_formatted_state = (3, 3)
36 |         expected = [1,0,0,1,0,0]
37 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
38 |         self.assertEquals(actual, expected)
39 | 
40 | class TestCoordinatesToRowColAdapter(unittest.TestCase):
41 | 
42 |     def test_convert_state_to_agent_format_first_room(self):
43 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
44 |         mdp_formatted_state = (2, 2)
45 |         expected = [0,0,1,0,0,0,0,0,1,0,0,0]
46 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
47 |         self.assertEquals(actual, expected)
48 | 
49 |     def test_convert_state_to_agent_format_fourth_room(self):
50 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
51 |         mdp_formatted_state = (4, 4)
52 |         expected = [0,0,0,0,1,0,0,0,0,0,1,0]
53 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
54 |         self.assertEquals(actual, expected)
55 | 
56 |     def test_convert_state_to_agent_format_off_diagonal_room(self):
57 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
58 |         mdp_formatted_state = (0, 4)
59 |         expected = [1,0,0,0,0,0,0,0,0,0,1,0]
60 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
61 |         self.assertEquals(actual, expected)
62 | 
63 |     def test_convert_state_to_agent_format_off_fourth_room_first_square(self):
64 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
65 |         mdp_formatted_state = (3, 3)
66 |         expected = [0,0,0,1,0,0,0,0,0,1,0,0]
67 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
68 |         self.assertEquals(actual, expected)
69 | 
70 | class TestCoordinatesToRowColRoomAdapter(unittest.TestCase):
71 | 
72 |     def test_convert_state_to_agent_format_first_room(self):
73 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
74 |         mdp_formatted_state = (2, 2)
75 |         expected = [0,0,1,0,0,1,1,0,0,0]
76 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
77 |         self.assertEquals(actual, expected)
78 | 
79 |     def test_convert_state_to_agent_format_fourth_room(self):
80 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
81 |         mdp_formatted_state = (4, 4)
82 |         expected = [0,1,0,0,1,0,0,0,0,1]
83 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
84 |         self.assertEquals(actual, expected)
85 | 
86 |     def test_convert_state_to_agent_format_off_diagonal_room_top(self):
87 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
88 |         mdp_formatted_state = (0, 4)
89 |         expected = [1,0,0,0,1,0,0,0,1,0]
90 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
91 |         self.assertEquals(actual, expected)
92 | 
93 |     def test_convert_state_to_agent_format_off_diagonal_room_bottom(self):
94 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
95 |         mdp_formatted_state = (4, 0)
96 |         expected = [0,1,0,1,0,0,0,1,0,0]
97 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
98 |         self.assertEquals(actual, expected)
99 | 
100 |     def test_convert_state_to_agent_format_off_fourth_room_first_square(self):
101 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
102 |         mdp_formatted_state = (3, 3)
103 |         expected = [1,0,0,1,0,0,0,0,0,1]
104 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
105 |         self.assertEquals(actual, expected)
106 | 
107 | 
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     unittest.main()
112 | 
--------------------------------------------------------------------------------