├── .gitignore
├── README.md
├── development.txt
├── results.txt
├── scripts
│   ├── __init__.py
│   ├── agent.py
│   ├── async_rl.py
│   ├── aws_s3_utility.py
│   ├── experiment.py
│   ├── file_utils.py
│   ├── learning_utils.py
│   ├── logger.py
│   ├── mdps.py
│   ├── policy.py
│   ├── qnetwork.py
│   ├── recurrent_qnetwork.py
│   ├── replay_memory.py
│   └── state_adapters.py
└── tests
    ├── __init__.py
    ├── run_tests.py
    ├── test_aws_s3_utility.py
    ├── test_build_network.py
    ├── test_experiment.py
    ├── test_learning_utils.py
    ├── test_logger.py
    ├── test_mdps.py
    ├── test_neural_agent.py
    ├── test_policy.py
    ├── test_qnetwork.py
    ├── test_recurrent_qnetwork.py
    ├── test_replay_memory.py
    └── test_state_adapters.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | 
5 | # C extensions
6 | *.so
7 | 
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | 
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 | 
37 | # Translations
38 | *.mo
39 | 
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | 
45 | # Rope
46 | .ropeproject
47 | 
48 | # Django stuff:
49 | *.log
50 | *.pot
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | 
55 | # mac
56 | .DS_Store
57 | 
58 | # binaries
59 | *.bin
60 | *.out
61 | 
62 | # emacs
63 | *~
64 | 
65 | # compile
66 | /compile
67 | 
68 | # aws keys
69 | *.key
70 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hierarchical_rl
2 | [paper](https://wulfebw.github.io/assets/CS239_Final_Paper.pdf)
--------------------------------------------------------------------------------
/development.txt:
--------------------------------------------------------------------------------
1 | tonight
2 | -------
3 | 1. decide between cnn vs flat
4 | 2. compressor network (compression loss as incentive, as temp for softmax)
5 | 3. recurrent network
6 | 4. stacked recurrent network
7 | 5. batchnorm
8 | 6. prioritized experience replay
9 | - sliding window -> if sufficiently large, and sampling higher-td experiences, may not need to save in memory for longer
10 | - this would require feedback from training to the memory
11 | 7. stacked recurrent with different lengths of BPTT
12 | 8. why does the value image have all the same values sometimes?
13 | 9. graph the weight updates vs the size of the weights -> should be a ratio of 1e-3
14 | 10. plot neural network weights / activations / figure out what visualizations work
15 | 11. maze just showing the current room
16 | - in the maze problem, just show it the current room!!! and it has to figure it out from that
17 | - need some distinguishing markers for each room? that's available in MR, but not with a one-hot room matrix
18 | 12. maze with key
19 | 13. read papers
20 | 14. should the loss behave differently? why does it go to zero so quickly?
21 | - why is it bimodal or something?
22 | - bimodal probably b/c sometimes the batch contains the reward pos and sometimes not?
23 | 15. improve visualizations
24 | - some method of showing how the different runs did
25 | 16. do they perform training after every step? or is it every once in a while or what?
26 | 17. should I think of a better mdp?
27 | 18. some notion of convergence in the test mdps
28 | - rather than just until "10 in a row or something"
29 | 19. clip TD error?
30 | - should keep track of it
31 | - isn't this just the loss?
32 | 20.
33 | 
34 | ---
35 | problems with qnet
36 | ---
37 | 1. values go to nan sometimes. why? (particularly with rmsprop, it does so immediately -> why?)
38 | 2. need regularization?
39 | 3. need to make tests checking outputs given set weights and inputs
40 | 5. maybe need to be taking random actions till the replay memory is full
41 | 6. check that the weights are sensible after training
42 | 
43 | ---
44 | papers to read (again)
45 | ---
46 | - prioritized experience replay
47 | - compressor network
48 | - hybrid arch
49 | - algorithmic information theory CM
50 | - Memory-based control with recurrent neural networks
51 | - Deep Recurrent Q-Learning for Partially Observable MDPs
52 | 
53 | 
--------------------------------------------------------------------------------
/results.txt:
--------------------------------------------------------------------------------
1 | ---
2 | results
3 | ---
4 | - this is a list of interesting observations made while training
5 | 
6 | ---
7 | sgd vs adam
8 | ---
9 | - on a one-room maze, sgd will fail to find the optimal policy whereas adam finds it quickly
10 | 
11 | ---
12 | small replay memory vs large replay memory
13 | ---
14 | - on the small maze, if you try out a capacity=1000 vs capacity=100000 replay memory, it makes a huge difference
15 | 
16 | ---
17 | numeric vs one-hot state representation
18 | ---
19 | - numeric is much worse than one-hot; one-hot is rote learning, however
20 | 
21 | ---
22 | 5 vs 10 size single room
23 | ---
24 | - how much more difficult is the 10 vs the 5?
25 | 
26 | ---
27 | 10 size room vs 5 size, 2 room maze
28 | ---
29 | - how much more difficult do walls make the task?
30 | 
31 | ---
32 | conv vs dense one-hot representations
33 | ---
34 | - how does passing the one-hot input as an array to a conv net compare in performance to the flattened-array, dense version?
35 | 
36 | ---
37 | number of hidden units
38 | ---
39 | - what impact does the number of hidden units have?
40 | - it seems that a lower number of units tends to get stuck in local optima that are relatively poor compared to the larger networks (both should be more than sufficient to express the value function?)
41 | 
42 | ---
43 | number of hidden layers
44 | ---
45 | - fewer seems to work better for the large maze - likely due to a poor learning setup for the larger networks
46 | - possible that it also just takes less time
47 | - batch norm might improve this
48 | - this was also run on cpu on a laptop, so it's also likely due to a lack of memories or something
49 | 
50 | ---
51 | prioritized experience replay vs none
52 | ---
53 | - how much better does it do?
54 | 
55 | ---
56 | increasing batch size increases stability
57 | ---
58 | - increasing the batch size improves learning
59 | 
60 | ---
61 | should you regularize the q network?
62 | ---
63 | - ? i'm not really sure at all
64 | - seems that it does not make a huge difference
65 | - you would think it might, given that it could overfit the data in the experience replay
66 | - a small value of 1e-4 seems to work well
67 | 
68 | ---
69 | clipping td error (i.e., the loss)
70 | ---
71 | - does clipping the td error make the loss graphs look pretty / smooth?
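- a minimal sketch of what clipping could look like (illustrative only; the function name and clip threshold are made up, and qnetwork.py may handle this differently):

    import numpy as np

    def clipped_td_loss(targets, predictions, clip=1.0):
        # bound the raw TD error so a few large transitions cannot dominate the gradient
        # (clipping the error before squaring is similar in spirit to a Huber loss)
        diff = np.clip(targets - predictions, -clip, clip)
        return 0.5 * np.mean(diff ** 2)
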
-------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wulfebw/hierarchical_rl/0156dd7b1675a0c3a3b7d81cb66721cbba406e28/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/agent.py: -------------------------------------------------------------------------------- 1 | 2 | import collections 3 | import numpy as np 4 | import random 5 | import theano 6 | 7 | import logger 8 | 9 | class Agent(object): 10 | 11 | def step(self, next_state, reward): 12 | """ 13 | :description: this method implements the agents deciding which action to take and updating its parameters 14 | """ 15 | raise NotImplementedError("Override me") 16 | 17 | def start_episode(self, state): 18 | """ 19 | :description: initializes an agent for an episode and returns an initial action to take 20 | """ 21 | raise NotImplementedError("Override me") 22 | 23 | def finish_episode(self, next_state, reward): 24 | """ 25 | :description: finalizes an episode for an agent 26 | """ 27 | 28 | def finish_epoch(self, epoch): 29 | """ 30 | :description: performs logging tasks at the end of an epoch 31 | """ 32 | raise NotImplementedError("Override me") 33 | 34 | def start_testing(self): 35 | pass 36 | 37 | def finish_testing(self): 38 | pass 39 | 40 | 41 | class TestAgent(Agent): 42 | 43 | def __init__(self, num_actions): 44 | self.actions = range(num_actions) 45 | self.steps = 0 46 | self.episodes = 0 47 | 48 | def step(self, next_state, reward): 49 | self.steps += 1 50 | return random.choice(self.actions) 51 | 52 | def start_episode(self, state): 53 | self.episodes += 1 54 | return random.choice(self.actions) 55 | 56 | def finish_episode(self, next_state, reward): 57 | pass 58 | 59 | def finish_epoch(self, epoch): 60 | pass 61 | 62 | 63 | class QLearningAgent(Agent): 64 | 65 | def __init__(self, num_actions, discount, exploration_prob, step_size, logging=True): 66 | self.actions = range(num_actions) 67 | self.discount = discount 68 | self.exploration_prob = exploration_prob 69 | self.step_size = step_size 70 | self.num_iters = 1 71 | self.weights = collections.Counter() 72 | self.logger = logger.Logger(agent_name='QLearningAgent', logging=logging) 73 | self.prev_state = None 74 | self.prev_action = None 75 | 76 | def step(self, next_state, reward): 77 | self.incorporate_feedback(self.prev_state, self.prev_action, reward, next_state, False) 78 | action = self.get_action(next_state) 79 | 80 | self.prev_state = next_state 81 | self.prev_action = action 82 | 83 | self.logger.log_action(action) 84 | self.logger.log_reward(reward) 85 | return action 86 | 87 | def feature_extractor(self, state, action): 88 | """ 89 | :description: this is the identity feature extractor, so we use tables here for the function 90 | """ 91 | return [((state, action), 1)] 92 | 93 | def getQ(self, state, action): 94 | """ 95 | :description: returns the Q value associated with this state-action pair 96 | 97 | :type state: numpy array 98 | :param state: the state of the game 99 | 100 | :type action: int 101 | :param action: the action for which to retrieve the Q-value 102 | """ 103 | score = 0 104 | for f, v in self.feature_extractor(state, action): 105 | score += self.weights[f] * v 106 | return score 107 | 108 | def get_action(self, state): 109 | """ 110 | :description: returns an action accoridng to epsilon-greedy policy 111 | 112 | :type state: 
dictionary 113 | :param state: the state of the game 114 | """ 115 | self.num_iters += 1 116 | 117 | if random.random() < self.exploration_prob: 118 | return random.choice(self.actions) 119 | else: 120 | max_action = max((self.getQ(state, action), action) for action in self.actions)[1] 121 | return max_action 122 | 123 | def incorporate_feedback(self, state, action, reward, next_state, terminal): 124 | """ 125 | :description: performs a Q-learning update 126 | 127 | :type reward: float 128 | :param reward: reward associated with transitioning to next_state 129 | 130 | :type next_state: numpy array 131 | :param next_state: the new state of the game 132 | """ 133 | step_size = self.step_size 134 | prediction = self.getQ(state, action) 135 | target = reward 136 | if not terminal: 137 | target += self.discount * max(self.getQ(next_state, next_action) for next_action in self.actions) 138 | 139 | diff = target - prediction 140 | loss = .5 * diff ** 2 141 | for f, v in self.feature_extractor(state, action): 142 | self.weights[f] = self.weights[f] + step_size * diff * v 143 | 144 | self.logger.log_loss(loss) 145 | self.logger.log_weights(self.weights) 146 | 147 | def start_episode(self, state): 148 | self.prev_state = state 149 | self.prev_action = self.get_action(state) 150 | 151 | self.logger.log_action(self.prev_action) 152 | return self.prev_action 153 | 154 | def finish_episode(self, next_state, reward): 155 | self.incorporate_feedback(self.prev_state, self.prev_action, reward, next_state, True) 156 | self.logger.finish_episode() 157 | 158 | def finish_epoch(self, epoch): 159 | self.logger.log_epoch(epoch) 160 | 161 | class NeuralAgent(Agent): 162 | """ 163 | :description: A class that wraps a network so it may more easily interact with an experiment. 164 | """ 165 | 166 | def __init__(self, network, policy, replay_memory, log, state_adapter): 167 | """ 168 | :type network: a network class (see e.g., qnetwork.py) 169 | :param network: the network the agent uses to evaluate states 170 | 171 | :type policy: a policy class (see policy.py) 172 | :param policy: a class that decides which action to take given the values of those actions 173 | 174 | :type replay_memory: replay memory class (see replay_memory.py) 175 | :param replay_memory: replay memory used to store dataset as it is gathered. 176 | """ 177 | 178 | self.network = network 179 | self.policy = policy 180 | self.replay_memory = replay_memory 181 | self.logger = log 182 | self.logger.log_hyperparameters(network, policy, replay_memory) 183 | self.state_adapter = state_adapter 184 | 185 | self.prev_state = None 186 | self.prev_action = None 187 | 188 | def step(self, next_state, reward): 189 | """ 190 | :description: the primary method of this class, which 'steps' the agent and network forward one time step. This includes selecting an action, making use of the new state and reward, and performing training. 
191 | 192 | :type next_state: tuple or array 193 | :param next_state: the next state observed (i.e., s') 194 | 195 | :type reward: int 196 | :param reward: the reward associated with having moved from the previous state to the current state 197 | 198 | :type rval: int 199 | :param rval: returns the action to next be taken within the environment 200 | """ 201 | # need to transform an external state format to an internal one 202 | next_state = self.state_adapter.convert_state_to_agent_format(next_state) 203 | 204 | # store current (s,a,r,s') tuple 205 | self.replay_memory.store((self.prev_state, self.prev_action, reward, next_state, 0)) 206 | 207 | # perform training 208 | self.train() 209 | 210 | # retrieve an action 211 | action = self.get_action(next_state) 212 | 213 | # set previous values 214 | self.prev_state = next_state 215 | self.prev_action = action 216 | 217 | # log information 218 | self.logger.log_reward(reward) 219 | self.logger.log_action(self.prev_action) 220 | 221 | return action 222 | 223 | def train(self): 224 | """ 225 | :description: collects a minibatch of experiences and passes them to the network to train 226 | """ 227 | # wait until replay memory has samples 228 | if not self.replay_memory.is_full(): 229 | return 230 | 231 | # collect minibatch 232 | states, actions, rewards, next_states, terminals = self.replay_memory.sample_batch() 233 | 234 | # pass to network to perform training 235 | loss = self.network.train(states, actions, rewards, next_states, terminals) 236 | self.logger.log_loss(loss) 237 | 238 | def get_action(self, state): 239 | """ 240 | :description: gets an action given the current state. Defers to the network for selecting the action. 241 | 242 | :type state: numpy array 243 | :param state: the state used to determine the action 244 | """ 245 | q_values = self.network.get_q_values(state) 246 | return self.policy.choose_action(q_values) 247 | 248 | def start_episode(self, state): 249 | """ 250 | description: determines the first action to take and initializes internal variables 251 | """ 252 | self.prev_state = self.state_adapter.convert_state_to_agent_format(state) 253 | self.prev_action = self.get_action(self.prev_state) 254 | 255 | self.logger.log_action(self.prev_action) 256 | return self.prev_action 257 | 258 | def finish_episode(self, next_state, reward): 259 | """ 260 | :description: perform tasks at the end of episode 261 | """ 262 | 263 | terminal = 1 264 | next_state = self.state_adapter.convert_state_to_agent_format(next_state) 265 | self.replay_memory.store((self.prev_state, self.prev_action, reward, next_state, terminal)) 266 | self.logger.log_reward(reward) 267 | self.logger.finish_episode() 268 | 269 | def finish_epoch(self, epoch): 270 | """ 271 | :description: perform tasks at the end of an epoch 272 | """ 273 | self.logger.log_epoch(epoch, self.network, self.policy) 274 | 275 | def get_q_values(self, state): 276 | """ 277 | :description: returns the q values associated with a given state. Used for printing out a representation of the mdp with the values included. 278 | """ 279 | state = self.state_adapter.convert_state_to_agent_format(state) 280 | q_values = self.network.get_q_values(state) 281 | return q_values 282 | 283 | class RecurrentNeuralAgent(Agent): 284 | """ 285 | :description: A class that wraps a recuurent network so it may more easily 286 | interact with an experiment. 
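    Unlike NeuralAgent, this agent stores (state, action, reward, terminal) tuples in its
    replay memory and rebuilds the most recent state sequence (via make_last_sequence)
    before querying the network, and it acts randomly until the replay memory is full.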
287 | """ 288 | def __init__(self, network, policy, replay_memory, state_adapter, log): 289 | self.network = network 290 | self.policy = policy 291 | self.replay_memory = replay_memory 292 | self.logger = log 293 | self.logger.log_hyperparameters(network, policy, replay_memory) 294 | self.state_adapter = state_adapter 295 | 296 | self.prev_state = None 297 | self.prev_action = None 298 | 299 | def step(self, next_state, reward): 300 | """ 301 | :description: the primary method of this class, which 'steps' the agent and network forward one time step. This includes selecting an action, making use of the new state and reward, and performing training. 302 | 303 | :type next_state: tuple or array 304 | :param next_state: the next state observed (i.e., s') 305 | 306 | :type reward: int 307 | :param reward: the reward associated with having moved from the previous state to the current state 308 | 309 | :type rval: int 310 | :param rval: returns the action to next be taken within the environment 311 | """ 312 | # need to transform an external state format to an internal one 313 | next_state = self.state_adapter.convert_state_to_agent_format(next_state) 314 | 315 | # store current (s,a,r,s') tuple 316 | self.replay_memory.store(self.prev_state, self.prev_action, reward, terminal=False) 317 | 318 | # perform training 319 | self.train() 320 | 321 | # retrieve an action 322 | action = self.get_action(next_state) 323 | 324 | # set previous values 325 | self.prev_state = next_state 326 | self.prev_action = action 327 | 328 | # log information 329 | self.logger.log_reward(reward) 330 | self.logger.log_action(self.prev_action) 331 | 332 | return action 333 | 334 | def train(self): 335 | """ 336 | :description: collects a minibatch of experiences and passes them to the network to train 337 | """ 338 | # wait until replay memory has samples 339 | if not self.replay_memory.is_full(): 340 | return 341 | 342 | # collect minibatch 343 | states, actions, rewards, next_states, terminals = self.replay_memory.sample_batch() 344 | 345 | # pass to network to perform training 346 | loss = self.network.train(states, actions, rewards, next_states, terminals) 347 | self.logger.log_loss(loss) 348 | 349 | def get_action(self, state): 350 | """ 351 | :description: gets an action given the current state. Defers to the network for selecting the action. 352 | 353 | :type state: numpy array 354 | :param state: the state used to determine the action 355 | """ 356 | # wait until agent starts learning to use network to decide action 357 | if not self.replay_memory.is_full(): 358 | return self.policy.random_action() 359 | 360 | sequence = self.replay_memory.make_last_sequence(state) 361 | q_values = self.network.get_q_values(sequence) 362 | return self.policy.choose_action(q_values) 363 | 364 | def start_episode(self, state): 365 | """ 366 | description: determines the first action to take and initializes internal variables 367 | """ 368 | self.prev_state = self.state_adapter.convert_state_to_agent_format(state) 369 | self.prev_action = self.get_action(self.prev_state) 370 | 371 | self.logger.log_action(self.prev_action) 372 | return self.prev_action 373 | 374 | def finish_episode(self, next_state, reward): 375 | """ 376 | :description: perform tasks at the end of episode. We don't store the next_state value 377 | because the previous state must have been a terminal one. It's in the method 378 | definition to stay consistent with the other replay memory implementation. 
379 | """ 380 | self.replay_memory.store(self.prev_state, self.prev_action, reward, True) 381 | self.logger.log_reward(reward) 382 | self.logger.finish_episode() 383 | 384 | def finish_epoch(self, epoch): 385 | """ 386 | :description: perform tasks at the end of an epoch 387 | """ 388 | self.logger.log_epoch(epoch, self.network, self.policy) 389 | 390 | def get_q_values(self, state): 391 | """ 392 | :description: returns the q values associated with a given state. Used for printing out a representation of the mdp with the values included. 393 | """ 394 | state = self.state_adapter.convert_state_to_agent_format(state) 395 | q_values = self.network.get_logging_q_values(state) 396 | return q_values 397 | 398 | -------------------------------------------------------------------------------- /scripts/async_rl.py: -------------------------------------------------------------------------------- 1 | 2 | import collections 3 | import copy 4 | import matplotlib.pyplot as plt 5 | from multiprocessing.pool import ThreadPool 6 | import numpy as np 7 | import random 8 | import sys 9 | import time 10 | 11 | import learning_utils 12 | 13 | # threading constants 14 | NUM_THREADS = 2 15 | 16 | # global variables for AsyncSarsa 17 | WEIGHTS = collections.defaultdict(lambda: 0) 18 | 19 | # global variables for AsyncAdvantageActorCritic 20 | WEIGHTS = collections.defaultdict(lambda: 0) 21 | VALUE_WEIGHTS = collections.defaultdict(lambda: 0) 22 | 23 | # logging 24 | REWARDS = [] 25 | START_STATE_VALUES = [] 26 | 27 | class MazeMDP(object): 28 | 29 | EXIT_REWARD = 1 30 | MOVE_REWARD = -.01 31 | ACTIONS = [(1,0),(-1,0),(0,1),(0,-1)] 32 | DISCOUNT = 1 33 | START_STATE = (0,0) 34 | 35 | def __init__(self, room_size, num_rooms): 36 | self.room_size = room_size 37 | self.num_rooms = num_rooms 38 | self.max_position = self.room_size * self.num_rooms - 1 39 | self.end_state = (self.max_position, self.max_position) 40 | self.computeStates() 41 | 42 | def calculate_next_state(self, state, action): 43 | return state[0] + action[0], state[1] + action[1] 44 | 45 | def runs_into_wall(self, state, action): 46 | next_state = self.calculate_next_state(state, action) 47 | 48 | # 1. check for leaving the maze 49 | if next_state[0] > self.max_position or next_state[0] < 0 \ 50 | or next_state[1] > self.max_position or next_state[1] < 0: 51 | return True 52 | 53 | # 2. check if movement was through doorway and if so return false 54 | doorway_position = (self.room_size) / 2 55 | # check horizontal movement through doorway 56 | if next_state[0] != state[0]: 57 | if next_state[1] % self.room_size == doorway_position: 58 | return False 59 | 60 | # check vertical movement through doorway 61 | if next_state[1] != state[1]: 62 | if next_state[0] % self.room_size == doorway_position: 63 | return False 64 | 65 | # 3. 
check if movement was through a wall 66 | room_size = self.room_size 67 | # move right to left through wall 68 | if state[0] % room_size == room_size - 1 and next_state[0] % room_size == 0: 69 | return True 70 | 71 | # move left to right through wall 72 | if next_state[0] % room_size == room_size - 1 and state[0] % room_size == 0: 73 | return True 74 | 75 | # move up through wall 76 | if state[1] % room_size == room_size - 1 and next_state[1] % room_size == 0: 77 | return True 78 | 79 | # move down through wall 80 | if next_state[1] % room_size == room_size - 1 and state[1] % room_size == 0: 81 | return True 82 | 83 | # if none of the above conditions meet, then have not passed through wall 84 | return False 85 | 86 | def succAndProbReward(self, state, action): 87 | 88 | # if we reach the end state then the episode ends 89 | if np.array_equal(state, self.end_state): 90 | return [] 91 | 92 | if self.runs_into_wall(state, action): 93 | # if the action runs us into a wall do nothing 94 | next_state = state 95 | else: 96 | # o/w determine the next position 97 | next_state = self.calculate_next_state(state, action) 98 | 99 | # if next state is exit, then set reward 100 | reward = self.MOVE_REWARD 101 | if np.array_equal(next_state, self.end_state): 102 | reward = self.EXIT_REWARD 103 | 104 | return [(next_state, 1, reward)] 105 | 106 | def computeStates(self): 107 | self.states = set() 108 | queue = [] 109 | self.states.add(self.START_STATE) 110 | queue.append(self.START_STATE) 111 | while len(queue) > 0: 112 | state = queue.pop() 113 | for action in self.ACTIONS: 114 | for newState, prob, reward in self.succAndProbReward(state, action): 115 | if newState not in self.states: 116 | self.states.add(newState) 117 | queue.append(newState) 118 | 119 | def print_state_values(self): 120 | V = {} 121 | for state in self.states: 122 | #state_value = max(WEIGHTS[(state, action)] for action in self.ACTIONS) 123 | state_value = VALUE_WEIGHTS[(state, None)] 124 | V[state] = state_value 125 | 126 | for ridx in reversed(range(self.max_position + 1)): 127 | for cidx in range(self.max_position + 1): 128 | if (ridx, cidx) in V: 129 | print '{0:.5f}'.format(V[(ridx, cidx)]), 130 | print('\n') 131 | 132 | class Experiment(object): 133 | 134 | def __init__(self, mdp, agent, num_episodes, max_steps): 135 | self.mdp = mdp 136 | self.agent = agent 137 | self.num_episodes = num_episodes 138 | self.max_steps = max_steps 139 | 140 | def run(self, agent_id): 141 | print 'running experiment with agent number {}...'.format(agent_id) 142 | 143 | total_rewards = [] 144 | total_reward = 0 145 | 146 | for episode in range(self.num_episodes): 147 | if episode % 100 == 0: 148 | print 'running episode {} for agent {}...'.format(episode, agent_id) 149 | state = self.mdp.START_STATE 150 | action = self.agent.get_action(state) 151 | 152 | for step in range(self.max_steps): 153 | transitions = self.mdp.succAndProbReward(state, action) 154 | 155 | if len(transitions) == 0: 156 | reward = 0 157 | new_state = None 158 | break 159 | 160 | new_state, prob, reward = transitions[0] 161 | total_reward += reward 162 | action = self.agent.incorporateFeedback(state, action, reward, new_state) 163 | state = new_state 164 | 165 | self.agent.incorporateFeedback(state, action, reward, new_state) 166 | total_rewards.append(total_reward) 167 | REWARDS.append(total_reward) 168 | #START_STATE_VALUES.append(max(WEIGHTS[((0,0), action)] for action in self.mdp.ACTIONS)) 169 | START_STATE_VALUES.append(VALUE_WEIGHTS[((0,0), None)]) 170 | total_reward = 0 
171 | 172 | print 'average reward of agent {}: {}'.format(agent_id, np.mean(total_rewards)) 173 | 174 | class MultithreadedExperiment(object): 175 | 176 | def __init__(self, experiment, num_agents): 177 | self.experiment = experiment 178 | self.num_agents = num_agents 179 | 180 | def run(self): 181 | pool = ThreadPool(self.num_agents) 182 | for idx in range(self.num_agents): 183 | pool.apply_async(self.run_experiement, args=(self.experiment, idx)) 184 | 185 | pool.close() 186 | pool.join() 187 | 188 | @staticmethod 189 | def run_experiement(experiment, agent_id): 190 | print 'starting experiment with agent number {}...'.format(agent_id) 191 | experiment_copy = copy.deepcopy(experiment) 192 | experiment.run(agent_id) 193 | 194 | class AsyncSarsa(object): 195 | 196 | def __init__(self, actions, discount, exploration_prob, learning_rate): 197 | self.actions = actions 198 | self.discount = discount 199 | self.exploration_prob = exploration_prob 200 | self.learning_rate = learning_rate 201 | self.num_iters = 0 202 | 203 | def feature_extractor(self, state, action): 204 | return [((state, action), 1)] 205 | 206 | def getQ(self, state, action): 207 | score = 0 208 | for f, v in self.feature_extractor(state, action): 209 | score += WEIGHTS[f] * v 210 | return score 211 | 212 | def get_action(self, state): 213 | self.num_iters += 1 214 | 215 | if self.exploration_prob > .05: 216 | self.exploration_prob -= 1e-8 217 | 218 | if random.random() < self.exploration_prob: 219 | action = random.choice(self.actions) 220 | else: 221 | action = max((self.getQ(state, action), action) for action in self.actions)[1] 222 | return action 223 | 224 | def incorporateFeedback(self, state, action, reward, new_state): 225 | prediction = self.getQ(state, action) 226 | target = reward 227 | new_action = None 228 | 229 | if new_state != None: 230 | new_action = self.get_action(new_state) 231 | target += self.discount * self.getQ(new_state, new_action) 232 | 233 | for f, v in self.feature_extractor(state, action): 234 | WEIGHTS[f] = WEIGHTS[f] + self.learning_rate * (target - prediction) * v 235 | 236 | return new_action 237 | 238 | class AsyncAdvantageActorCritic(object): 239 | 240 | def __init__(self, actions, discount, tau, learning_rate): 241 | self.actions = actions 242 | self.discount = discount 243 | self.tau = tau 244 | self.learning_rate = learning_rate 245 | self.num_iters = 0 246 | 247 | def feature_extractor(self, state, action=None): 248 | return [((state, action), 1)] 249 | 250 | def getV(self, state): 251 | score = 0 252 | for f, v in self.feature_extractor(state): 253 | score += VALUE_WEIGHTS[f] * v 254 | return score 255 | 256 | def getQ(self, state, action): 257 | score = 0 258 | for f, v in self.feature_extractor(state, action): 259 | score += WEIGHTS[f] * v 260 | return score 261 | 262 | def get_action(self, state): 263 | self.num_iters += 1 264 | # if self.tau > 1e-9: 265 | # self.tau *= .9999 266 | # print self.tau 267 | 268 | q_values = np.array([self.getQ(state, action) for action in self.actions]) 269 | exp_q_values = np.exp(q_values / (self.tau + 1e-2)) 270 | weights = dict() 271 | for idx, val in enumerate(exp_q_values): 272 | weights[idx] = val 273 | action_idx = learning_utils.weightedRandomChoice(weights) 274 | action = self.actions[action_idx] 275 | return action 276 | 277 | def incorporateFeedback(self, state, action, reward, new_state): 278 | prediction = self.getV(state) 279 | target = reward 280 | new_action = None 281 | 282 | if new_state != None: 283 | new_action = 
self.get_action(new_state) 284 | target += self.discount * self.getV(new_state) 285 | 286 | update = self.learning_rate * (target - prediction) 287 | for f, v in self.feature_extractor(state): 288 | VALUE_WEIGHTS[f] = VALUE_WEIGHTS[f] + 2 * update 289 | 290 | for f, v in self.feature_extractor(state, action): 291 | WEIGHTS[f] = WEIGHTS[f] + update * 1 292 | 293 | return new_action 294 | 295 | def plot_values(values, ylabel): 296 | values = np.mean(np.reshape(values, (-1, 4)), axis=1).reshape(-1) 297 | plt.scatter(range(len(values)), values) 298 | plt.xlabel('episodes (1 per actor-learner)') 299 | plt.ylabel(ylabel) 300 | plt.show() 301 | 302 | def run(): 303 | start = time.time() 304 | room_size = 5 305 | num_rooms = 2 306 | mdp = MazeMDP(room_size=room_size, num_rooms=num_rooms) 307 | # agent = AsyncSarsa(actions=mdp.ACTIONS, discount=mdp.DISCOUNT, 308 | # exploration_prob=0.3, learning_rate=.5) 309 | agent = AsyncAdvantageActorCritic(actions=mdp.ACTIONS, discount=mdp.DISCOUNT, 310 | tau=.3, learning_rate=.5) 311 | max_steps = (2 * room_size * num_rooms) ** 2 312 | experiment = Experiment(mdp=mdp, agent=agent, num_episodes=800, max_steps=max_steps) 313 | multiexperiment = MultithreadedExperiment(experiment=experiment, num_agents=NUM_THREADS) 314 | multiexperiment.run() 315 | end = time.time() 316 | print 'took {} seconds'.format(end - start) 317 | mdp.print_state_values() 318 | plot_values(REWARDS, 'rewards') 319 | plot_values(START_STATE_VALUES, 'start state value') 320 | 321 | 322 | 323 | if __name__ =='__main__': 324 | run() -------------------------------------------------------------------------------- /scripts/aws_s3_utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import boto 4 | 5 | from boto.s3.key import Key 6 | from boto.s3.connection import S3Connection 7 | 8 | class S3Utility(object): 9 | """ 10 | :description: An AWS S3 utility class. 11 | 12 | This is a class rather than a module because some state is required 13 | to avoid having to reestablish an s3 connection each transaction. 14 | 15 | :type access_key: string 16 | :param access_key: aws access key 17 | 18 | :type secret_key: string 19 | :param secret_key: aws secret key 20 | 21 | """ 22 | 23 | def __init__(self, access_key, secret_key, s3_bucket): 24 | self.access_key = access_key 25 | self.secret_key = secret_key 26 | self.s3_bucket = s3_bucket 27 | self._conn = None 28 | 29 | @property 30 | def conn(self): 31 | if self._conn is not None: 32 | return self._conn 33 | else: 34 | return S3Connection(self.access_key, self.secret_key) 35 | 36 | def download_file_list(self, prefix=''): 37 | """ 38 | :description: loads the name of the files in a bucket. 39 | Optionally returns only those filenames that start with prefix. 
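        A brief usage sketch (the key arguments, bucket name, and prefix here are
        hypothetical, illustrative values):

            util = S3Utility(access_key, secret_key, 'some-bucket')
            filenames = util.download_file_list(prefix='logs/')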
40 | """ 41 | 42 | # select the bucket, where input_s3_bucket takes the form 'bsdsdata' 43 | bucket = self.conn.get_bucket(self.s3_bucket) 44 | 45 | # collect the list of files to process - those that start with the data group id 46 | file_list = [] 47 | for key in bucket.list(): 48 | key_name = key.name.encode('utf-8') 49 | if key_name.startswith(prefix): 50 | file_list.append(key_name) 51 | 52 | return file_list 53 | 54 | def download_file(self, file_to_load, local_save_dir): 55 | """ 56 | :description: load a file from a given s3 bucket with a 57 | given name and save to a local dir 58 | 59 | :type s3_bucket: string 60 | :param s3_bucket: s3 bucket from which to load the file 61 | 62 | :type file_to_load: string 63 | :param file_to_load: the file to load 64 | 65 | :type local_save_dir: string 66 | :param local_save_dir: the local dir to which to save the downloaded file 67 | 68 | :return: the location where the file was saved 69 | """ 70 | 71 | # select the bucket, where input_s3_bucket takes the form 'bsdsdata' 72 | bucket = self.conn.get_bucket(self.s3_bucket) 73 | 74 | # set a key to the processed files list 75 | key = Key(bucket, file_to_load) 76 | key_name = key.name.encode('utf-8') 77 | 78 | # download the file to process and save in the input location 79 | save_location = os.path.join(local_save_dir, key_name) 80 | try: 81 | key.get_contents_to_filename(save_location) 82 | except boto.exception.S3ResponseError as e: 83 | raise boto.exception.S3ResponseError("key name: {} failed".format(key_name)) 84 | 85 | # return the location of the downloaded file 86 | return save_location 87 | 88 | def upload_file(self, filename_to_save_as, file_path): 89 | """ 90 | :description: uploads a single file to an s3 bucket 91 | """ 92 | # what is this? 93 | def percent_cb(complete, total): 94 | sys.stdout.write('.') 95 | sys.stdout.flush() 96 | 97 | # select the bucket, where input_s3_bucket takes the form 'bsdsdata' 98 | bucket = self.conn.get_bucket(self.s3_bucket) 99 | 100 | # send the file to the s3 bucket 101 | key = Key(bucket) 102 | key.key = filename_to_save_as 103 | key.set_contents_from_filename(file_path, cb=percent_cb, num_cb=50) 104 | 105 | def upload_directory(self, directory): 106 | """ 107 | :description: upload all the files in a directory to aws s3 108 | """ 109 | 110 | filepaths = [] 111 | for root, dirs, files in os.walk(directory): 112 | for filename in files: 113 | filepaths.append(os.path.join(root, filename)) 114 | 115 | upload_directory = os.path.basename(directory) 116 | for filepath in filepaths: 117 | dest_filepath = os.path.join(upload_directory, filepath.split(upload_directory)[-1][1:]) 118 | self.upload_file(dest_filepath, filepath) 119 | 120 | 121 | -------------------------------------------------------------------------------- /scripts/experiment.py: -------------------------------------------------------------------------------- 1 | 2 | import collections 3 | import numpy as np 4 | import random 5 | 6 | import learning_utils 7 | 8 | class Experiment(object): 9 | """ 10 | :description: Experiment is a class representing an online reinforcement learning experiment. This class orchestrates the interaction between an agent and an mdp. 
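    A minimal usage sketch (argument values are illustrative only; the mdp and agent
    objects are assumed to be constructed elsewhere, e.g. from mdps.py and agent.py):

        experiment = Experiment(mdp=mdp, agent=agent, num_epochs=10,
                                epoch_length=100, test_epoch_length=10,
                                max_steps=500, run_tests=False)
        experiment.run()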
11 | """ 12 | 13 | def __init__(self, mdp, agent, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, 14 | value_logging=False): 15 | """ 16 | :type mdp: object inheriting from MDP 17 | :param mdp: the markov decision process in which the agent acts 18 | 19 | :type agent: object inheriting from Agent 20 | :param agent: the agent that acts within the experiment 21 | 22 | :type num_epochs: int 23 | :param num_epochs: number of training epochs to run 24 | 25 | :type epoch_length: int 26 | :param epoch_length: length of each epoch in episodes 27 | 28 | :type test_epoch_length: int 29 | :param test_epoch_length: length of a test epoch in episodes 30 | 31 | :type max_steps: int 32 | :param max_steps: maximum number of steps allowed in a single episode 33 | 34 | :type run_tests: boolean 35 | :param run_tests: whether or not to run testing epochs 36 | 37 | :type value_logging: boolean 38 | :param value_logging: whether or not to write a representation of the value function to a file 39 | """ 40 | self.mdp = mdp 41 | self.agent = agent 42 | self.num_epochs = num_epochs 43 | self.epoch_length = epoch_length 44 | self.test_epoch_length = test_epoch_length 45 | self.max_steps = max_steps 46 | self.run_tests = run_tests 47 | self.mdp_actions = self.mdp.get_actions() 48 | self.value_logging = value_logging 49 | 50 | def run(self): 51 | """ 52 | :description: main method which runs the entire experiment 53 | """ 54 | for epoch in xrange(self.num_epochs): 55 | self.run_epoch(epoch, self.epoch_length) 56 | self.agent.finish_epoch(epoch) 57 | self.finish_epoch(epoch) 58 | 59 | if self.run_tests: 60 | self.agent.start_testing() 61 | self.run_epoch(self.test_epoch_length) 62 | self.agent.finish_testing(epoch) 63 | 64 | def run_epoch(self, epoch, epoch_length): 65 | """ 66 | :description: runs a single epoch 67 | 68 | :type epoch_length: int 69 | :param epoch_length: length of the current epoch in episodes 70 | """ 71 | for episode in xrange(epoch_length): 72 | self.run_episode() 73 | 74 | def run_episode(self): 75 | """ 76 | :description: runs a single episode 77 | """ 78 | state = self.mdp.get_start_state() 79 | action = self.agent.start_episode(state) 80 | reward = 0 81 | for step in xrange(self.max_steps): 82 | 83 | # get the next state and reward 84 | next_state, reward, terminal = self.step(state, action) 85 | 86 | # if episode has ended, then break 87 | if terminal: 88 | break 89 | 90 | # otherwise, inform the agent and get a new action 91 | action = self.agent.step(next_state, reward) 92 | state = next_state 93 | 94 | # store this experience as a terminal one regardless of the loop exit condition 95 | # because either way the next state will break continuity 96 | self.agent.finish_episode(next_state, reward) 97 | 98 | def step(self, state, action): 99 | """ 100 | :description: progresses the experiment forward one time step 101 | """ 102 | # convert to mdp action format and get transitions 103 | real_action = self.mdp_actions[action] 104 | transitions = self.mdp.succ_prob_reward(state, real_action) 105 | 106 | # randomly sample a transition 107 | i = learning_utils.sample([prob for newState, prob, reward in transitions]) 108 | next_state, prob, reward = transitions[i] 109 | 110 | # if the next state is terminal note that 111 | terminal = False 112 | if self.mdp.is_end_state(next_state): 113 | terminal = True 114 | 115 | return next_state, reward, terminal 116 | 117 | def finish_epoch(self, epoch): 118 | """ 119 | :description: finalize epoch 120 | """ 121 | if self.value_logging and 
self.agent.replay_memory.is_full(): 122 | self.log_value_string() 123 | # if epoch > 3: 124 | # self.log_trajectories() 125 | 126 | def log_trajectories(self): 127 | self.agent.logger.log_trajectories(self.mdp) 128 | 129 | def log_value_string(self): 130 | """ 131 | :description: collect the necessary components to print a representation of the optimal value 132 | of each state in the mdp. 133 | """ 134 | V = {} 135 | for state in self.mdp.states: 136 | V[state] = np.max(self.agent.get_q_values(state)) 137 | value_string = self.mdp.get_value_string(V) 138 | self.agent.logger.log_value_string(value_string) 139 | self.agent.logger.log_values(V) 140 | 141 | 142 | -------------------------------------------------------------------------------- /scripts/file_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import os 5 | 6 | import logger 7 | 8 | 9 | def is_valid(key): 10 | return key.replace('-','').isalnum() 11 | 12 | def load_key(filepath): 13 | assert os.path.exists(filepath), 'filepath: {} not found'.format(filepath) 14 | 15 | key = None 16 | with open(filepath, 'rb') as f: 17 | key = f.readline() 18 | if is_valid(key): 19 | return key 20 | else: 21 | raise ValueError('invalid key: {}'.format(key)) 22 | 23 | 24 | def graph_rewards_seq_len(filepaths): 25 | initrewards = [] 26 | min_len = 10000000 27 | for f in filepaths: 28 | r = np.load(f)['values'] 29 | mr = logger.moving_average(r, 5) 30 | if len(mr) < min_len: 31 | min_len = len(mr) 32 | initrewards.append(mr) 33 | 34 | rewards = [] 35 | for r in initrewards: 36 | rewards.append(r[:min_len]) 37 | 38 | r2 = plt.plot(rewards[0], label='length 2 sequence', color='orange') 39 | r4 = plt.plot(rewards[1], label='length 4 sequence', color='crimson') 40 | r8 = plt.plot(rewards[2], label='length 8 sequence', color='cyan') 41 | r12 = plt.plot(rewards[3], label='length 12 sequence', color='brown') 42 | r16 = plt.plot(rewards[4], label='length 16 sequence', color='blue') 43 | r20 = plt.plot(rewards[5], label='length 20 sequence', color='black') 44 | r24 = plt.plot(rewards[6], label='length 24 sequence', color='pink') 45 | 46 | plt.legend(loc='lower right') 47 | plt.ylabel('Episode Rewards') 48 | plt.xlabel('Epochs') 49 | plt.savefig('/Users/wulfe/Dropbox/School/Stanford/winter_2016/cs239/project/hierarchical_rl/results/seqlen_rewards.png') 50 | 51 | def graph_rewards(filepaths): 52 | rewards = [] 53 | for f in filepaths: 54 | r = np.load(f)['values'] 55 | mr = logger.moving_average(r, 10) 56 | rewards.append(mr) 57 | 58 | plt.plot(rewards[0], label='row/col + room', color='r') 59 | plt.plot(rewards[1], label='row/col only', color='g') 60 | plt.plot(rewards[2], label='tabular', color='b') 61 | plt.plot(rewards[3], label='coordinates', color='magenta') 62 | 63 | plt.legend(loc='upper left') 64 | plt.ylabel('Episode Rewards') 65 | plt.xlabel('Epochs') 66 | plt.savefig('/Users/wulfe/Dropbox/School/Stanford/winter_2016/cs239/project/hierarchical_rl/results/staterep_rewards.png') 67 | 68 | if __name__ =='__main__': 69 | root = '/Users/wulfe/Desktop/logs2/promise_hrlstaterep' 70 | 71 | rowcolroom = os.path.join(root, 'QNetwork_2016-03-02T02.56.25.325166', 'rewards.npz') 72 | rowcol = os.path.join(root, 'QNetwork_2016-03-02T03.12.14.506093', 'rewards.npz') 73 | tabular = os.path.join(root, 'QNetwork_2016-03-02T03.35.00.107253', 'rewards.npz') 74 | coords = os.path.join(root, 'QNetwork_2016-03-02T04.01.10.893242', 'rewards.npz') 75 | filepaths = 
[rowcolroom, rowcol, tabular, coords] 76 | graph_rewards(filepaths) 77 | 78 | # r2 = os.path.join(root, 'single_layer_lstm_2016-02-29T12.52.41.641967', 'rewards.npz') 79 | # r4 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T15.31.29.414573', 'rewards.npz') 80 | # r8 = os.path.join(root, 'single_layer_lstm_2016-02-29T15.16.25.802919', 'rewards.npz') 81 | # r12 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T12.42.03.151324', 'rewards.npz') 82 | # r16 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T12.30.01.583816', 'rewards.npz') 83 | # r20 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T12.35.43.236976', 'rewards.npz') 84 | # r24 = os.path.join(root + '_hrltimestep', 'single_layer_lstm_2016-03-01T17.31.08.383906', 'rewards.npz') 85 | # filepaths = [r2, r4, r8, r12, r16, r20, r24] 86 | # graph_rewards(filepaths) -------------------------------------------------------------------------------- /scripts/learning_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import glob 3 | from math import sqrt, ceil 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import random 8 | 9 | def sample(probs): 10 | """ 11 | :description: given a list of probabilities, randomly select an index into those probabilities 12 | """ 13 | if len(probs) < 1: 14 | raise ValueError('Sample received an empty list of probabilities. This should not happen. ') 15 | 16 | target = random.random() 17 | accum = 0 18 | for i, prob in enumerate(probs): 19 | accum += prob 20 | if accum >= target: return i 21 | raise ValueError('Invalid probabilities provided to sample method in experiment') 22 | 23 | # Function: Weighted Random Choice 24 | # -------------------------------- 25 | # Given a dictionary of the form element -> weight, selects an element 26 | # randomly based on distribution proportional to the weights. Weights can sum 27 | # up to be more than 1. 28 | # source: stanford.cs221.problem_set_6 29 | # may be beneficial to switch to a faster method 30 | def weightedRandomChoice(weightDict): 31 | weights = [] 32 | elems = [] 33 | for elem in weightDict: 34 | weights.append(weightDict[elem]) 35 | elems.append(elem) 36 | total = sum(weights) 37 | key = random.uniform(0, total) 38 | runningTotal = 0.0 39 | chosenIndex = None 40 | for i in range(len(weights)): 41 | weight = weights[i] 42 | runningTotal += weight 43 | if runningTotal > key: 44 | chosenIndex = i 45 | return elems[chosenIndex] 46 | raise Exception('Should not reach here') 47 | 48 | def visualize_grid(Xs, ubound=255.0, padding=1): 49 | """ 50 | Reshape a 4D tensor of image data to a grid for easy visualization. 
51 | 52 | Inputs: 53 | - Xs: Data of shape (N, H, W, C) 54 | - ubound: Output grid will have values scaled to the range [0, ubound] 55 | - padding: The number of blank pixels between elements of the grid 56 | """ 57 | (N, H, W, C) = Xs.shape 58 | grid_size = int(ceil(sqrt(N))) 59 | grid_height = H * grid_size + padding * (grid_size - 1) 60 | grid_width = W * grid_size + padding * (grid_size - 1) 61 | grid = np.zeros((grid_height, grid_width, C)) 62 | next_idx = 0 63 | y0, y1 = 0, H 64 | for y in xrange(grid_size): 65 | x0, x1 = 0, W 66 | for x in xrange(grid_size): 67 | if next_idx < N: 68 | img = Xs[next_idx] 69 | low, high = np.min(img), np.max(img) 70 | grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low) 71 | # grid[y0:y1, x0:x1] = Xs[next_idx] 72 | next_idx += 1 73 | x0 += W + padding 74 | x1 += W + padding 75 | y0 += H + padding 76 | y1 += H + padding 77 | # grid_max = np.max(grid) 78 | # grid_min = np.min(grid) 79 | # grid = ubound * (grid - grid_min) / (grid_max - grid_min) 80 | return grid 81 | 82 | def get_run_directory(filepath): 83 | return filepath[:filepath.rindex('/')] 84 | 85 | def get_value_array_from_value_image_file(filepath): 86 | lines = None 87 | with open(filepath, 'rb') as f: 88 | lines = f.readlines() 89 | lines = [line.replace('\n', '').replace('S', '0').replace('E', '1').split(' ') for line in lines] 90 | lines = [[val for val in line if val != ''] for line in lines] 91 | lines = [[float(val) for val in line] for line in lines] 92 | lines = np.array(lines) 93 | return lines[::-1] 94 | 95 | def make_heat_map(filepath, epoch): 96 | # convert value image to numeric array 97 | value_array = get_value_array_from_value_image_file(filepath) 98 | if value_array is None: 99 | print 'Value image could not be converted to heatmap' 100 | return 101 | 102 | # determine output filepath 103 | run_dir = get_run_directory(filepath) 104 | output_filepath = os.path.join(run_dir, 'heatmaps', 'value_heatmap_{}.png'.format(epoch)) 105 | 106 | # create and save heatmap 107 | heatmap = plt.pcolormesh(value_array, vmin=-0.25, vmax=1.25) 108 | plt.colorbar() 109 | plt.savefig(output_filepath) 110 | plt.close() 111 | 112 | def load_params(filepath): 113 | params = np.load(filepath)['params'] 114 | return params 115 | 116 | 117 | if __name__ =='__main__': 118 | make_heat_maps() 119 | 120 | -------------------------------------------------------------------------------- /scripts/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: These classes provide logging functionality for agents 3 | """ 4 | 5 | import collections 6 | import datetime 7 | import lasagne 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import os 13 | import pickle 14 | import sys 15 | 16 | import learning_utils 17 | 18 | LOGGING_DIRECTORY = '../logs' 19 | MAXIMUM_WEIGHT_MAGNITUDE = 1000 20 | 21 | def moving_average(values, window_size): 22 | """ 23 | :description: computes a moving average 24 | """ 25 | if len(values) == 0: 26 | print 'the list given to moving average cannot be empty but is' 27 | return [] 28 | 29 | window = np.ones(int(window_size))/float(window_size) 30 | values = np.hstack((np.repeat(values[0], int(window_size)), values, np.repeat(values[-1], int(window_size)))) 31 | average = np.convolve(values, window, 'same').tolist() 32 | return average[window_size:-window_size] 33 | 34 | class Logger(object): 35 | """ 36 | :description: tracks and logs information about an agent 37 
| """ 38 | 39 | def __init__(self, agent_name, logging=True, verbose=True): 40 | """ 41 | :type agent_name: string 42 | :param agent_name: name of the agent whose information is being logged 43 | 44 | :type logging: boolean 45 | :param logging: whether or not to actually record any information 46 | """ 47 | self.agent_name = agent_name 48 | self.actions = [] 49 | self.rewards = [] 50 | self.episode_rewards = [] 51 | self.episode_actions = [] 52 | self.action_start = 0 53 | self.losses = [] 54 | self.states = [] 55 | self.updates = 0 56 | self.epoch = 0 57 | self.state_values = collections.defaultdict(lambda: []) 58 | self.weights = None 59 | self.log_dir = None 60 | self.logging = logging 61 | self.verbose = verbose 62 | self.steps = 0 63 | self.prev_steps = 0 64 | self.episode_steps = [] 65 | 66 | 67 | def log_action(self, action): 68 | self.steps += 1 69 | self.actions.append(action) 70 | 71 | def log_reward(self, reward): 72 | self.rewards.append(reward) 73 | 74 | def log_loss(self, loss): 75 | self.updates += 1 76 | self.losses.append(loss) 77 | 78 | def log_weights(self, weights): 79 | self.weights = weights 80 | max_magnitude = np.max(np.abs(weights.values())) 81 | 82 | if max_magnitude > MAXIMUM_WEIGHT_MAGNITUDE: 83 | except_string = 'Agent weights have surpassed reasonable values. Max weight: {}'.format(max_magnitude) 84 | raise ValueError(except_string) 85 | 86 | def log_epoch(self, epoch): 87 | """ 88 | :description: records the information so far collected 89 | 90 | :type epoch: int 91 | :param epoch: the current epoch number 92 | """ 93 | if not self.logging: 94 | return 95 | 96 | self.epoch += 1 97 | if self.log_dir is None: 98 | self.create_log_dir() 99 | 100 | self.record_stat('actions', self.actions, epoch) 101 | self.record_stat('rewards', self.episode_rewards, epoch) 102 | 103 | self.record_stat('losses', self.losses, epoch) 104 | self.record_weights(self.weights, epoch) 105 | 106 | def finish_episode(self): 107 | """ 108 | :description: performs tasks associated with the ending of an episode 109 | """ 110 | self.episode_rewards.append(np.sum(self.rewards)) 111 | self.rewards = [] 112 | 113 | self.episode_actions.append(self.actions[self.action_start:]) 114 | self.action_start = len(self.actions) 115 | 116 | self.episode_steps.append(self.steps - self.prev_steps) 117 | self.prev_steps = self.steps 118 | 119 | def record_stat(self, name, values, epoch): 120 | """ 121 | :description: saves values to a file and also plots them 122 | 123 | :type name: string 124 | :param name: name of the value being recorded 125 | 126 | :type values: list 127 | :param values: values to record 128 | 129 | :type epoch: int 130 | :param epoch: current epoch number 131 | """ 132 | self.save_stat(name, values, epoch) 133 | self.plot_stat(name, values, epoch) 134 | 135 | def save_stat(self, name, values, epoch): 136 | """ 137 | :description: saves a set of values to a file in npz format under the name 'values' 138 | """ 139 | filename = '{}'.format(name) 140 | filepath = os.path.join(self.log_dir, filename) 141 | np.savez(filepath, values=values) 142 | 143 | def plot_stat(self, name, values, epoch): 144 | """ 145 | :description: plots the provided values 146 | """ 147 | if len(values) < 1: 148 | return 149 | 150 | filename = '{}_graph.png'.format(name) 151 | filepath = os.path.join(self.log_dir, filename) 152 | 153 | values = np.array(values) 154 | if len(values.shape) < 2: 155 | values = np.vstack((np.arange(len(values)), values)) 156 | else: 157 | values = values.T 158 | 159 | plt.figure() 160 | 
# x_max = 1.2 * max(values[0,:]) 161 | # x_min = -0.2 * max(values[0,:]) 162 | # y_max = 1.2 * max(values[1,:]) 163 | # y_min = -0.2 * max(values[0,:]) 164 | # plt.axis([x_min, x_max, y_min, y_max]) 165 | plt.scatter(values[0, :], values[1, :]) 166 | plt.plot(values[0, :], moving_average(values[1, :], 50), c='r') 167 | plt.xlabel('Updates') 168 | plt.ylabel(name) 169 | plt.savefig(filepath) 170 | plt.close() 171 | 172 | def record_weights(self, weights, epoch): 173 | """ 174 | :description: saves the weights to a file 175 | """ 176 | filename = 'weights_epoch_{}.pkl'.format(epoch) 177 | filepath = os.path.join(self.log_dir, filename) 178 | with open(filepath, 'wb') as f: 179 | pickle.dump(weights, f, pickle.HIGHEST_PROTOCOL) 180 | 181 | def create_log_dir(self): 182 | """ 183 | :description: creates a directory in which to log information for the current agent 184 | """ 185 | # make the main logging directory 186 | dir_name = '{}_{}'.format(self.agent_name, datetime.datetime.now().isoformat()) 187 | dir_path = os.path.join(LOGGING_DIRECTORY, dir_name) 188 | os.mkdir(dir_path) 189 | self.log_dir = dir_path 190 | 191 | # make a subdirectory for the network parameter files 192 | params_dir_path = os.path.join(self.log_dir, 'params') 193 | os.mkdir(params_dir_path) 194 | self.params_dir = params_dir_path 195 | 196 | # make a subdirectory for the heatmaps 197 | heatmap_dir_path = os.path.join(self.log_dir, 'heatmaps') 198 | os.mkdir(heatmap_dir_path) 199 | self.heatmap_dir = heatmap_dir_path 200 | 201 | def log_value_string(self, value_string): 202 | """ 203 | :description: prints a string to a file. The string, when formatted, gives the values of different states in the mdp. 204 | """ 205 | if self.log_dir is None: 206 | self.create_log_dir() 207 | 208 | filename = 'value_image.txt' 209 | filepath = os.path.join(self.log_dir, filename) 210 | with open(filepath, 'wb') as f: 211 | f.write(value_string) 212 | 213 | learning_utils.make_heat_map(filepath, self.epoch) 214 | 215 | def log_values(self, V): 216 | """ 217 | :description: keeps track of how the q_values change over time 218 | """ 219 | 220 | mean_value = np.mean(V.values()) 221 | max_value = np.max(V.values()) 222 | min_value = np.min(V.values()) 223 | self.state_values['mean'].append(mean_value) 224 | self.state_values['max'].append(max_value) 225 | self.state_values['min'].append(min_value) 226 | self.state_values['start'].append(V[(0,0)]) 227 | self.plot_values() 228 | self.save_values() 229 | 230 | def save_values(self): 231 | for state, values in self.state_values.iteritems(): 232 | filename = '{}'.format(state) 233 | filepath = os.path.join(self.log_dir, filename) 234 | np.savez(filepath, values=values) 235 | 236 | def plot_values(self): 237 | """ 238 | :description: plot mean, max, and min state values so far 239 | """ 240 | filename = 'state_values_graph.png' 241 | filepath = os.path.join(self.log_dir, filename) 242 | plt.figure() 243 | plt.xlabel('Updates') 244 | plt.ylabel('V(s)') 245 | count = 0 246 | plt.scatter(np.arange(len(self.state_values['mean'])), self.state_values['mean'], c='b') 247 | plt.scatter(np.arange(len(self.state_values['max'])), self.state_values['max'], c='r') 248 | plt.scatter(np.arange(len(self.state_values['min'])), self.state_values['min'], c='g') 249 | plt.scatter(np.arange(len(self.state_values['start'])), self.state_values['start'], marker='*') 250 | plt.savefig(filepath) 251 | plt.close() 252 | 253 | class NeuralLogger(Logger): 254 | """ 255 | :description: inherting class that accomodates a network 
based agent 256 | """ 257 | 258 | def __init__(self, agent_name, logging=True, verbose=True): 259 | super(NeuralLogger, self).__init__(agent_name, logging, verbose) 260 | self.weight_magnitudes = [] 261 | self.weight_variances = [] 262 | self.exploration_probs = [] 263 | 264 | def log_epoch(self, epoch, network, policy): 265 | 266 | if not self.logging: 267 | return 268 | 269 | self.epoch += 1 270 | if self.log_dir is None: 271 | self.create_log_dir() 272 | 273 | try: 274 | self.record_stat('actions', self.actions, epoch) 275 | self.record_stat('episode_rewards', self.episode_rewards, epoch) 276 | self.record_stat('losses', self.losses, epoch) 277 | self.record_stat('episode_steps', self.episode_steps, epoch) 278 | 279 | if self.verbose: 280 | print '\nEpoch: {}'.format(epoch) 281 | print 'Steps in last episode: {}'.format(self.episode_steps[-1]) 282 | if len(self.losses) > 0: 283 | print 'Losses: {}'.format(np.mean(self.losses[-self.episode_steps[-1]:])) 284 | 285 | self.record_weights(epoch, network) 286 | self.record_policy(epoch, policy) 287 | except Exception as e: 288 | print 'ERROR occurred during logging: ' 289 | print e 290 | 291 | def record_weights(self, epoch, network): 292 | """ 293 | :description: records weights by saving them to a file 294 | 295 | :type epoch: int 296 | :param epoch: current epoch 297 | 298 | :type network: any class implementing get_params() 299 | :param network: the networks whose weights should be saved 300 | """ 301 | params = network.get_params() 302 | self.save_params(params, epoch) 303 | self.plot_weights(params, epoch) 304 | 305 | def save_params(self, params, epoch): 306 | filename = 'network_file_epoch_{}.save'.format(epoch) 307 | filepath = os.path.join(self.params_dir, filename) 308 | np.savez(filepath, params=params) 309 | 310 | def plot_weights(self, params, epoch): 311 | means = [] 312 | variances = [] 313 | for param in params: 314 | means.append(np.mean(np.abs(param))) 315 | variances.append(np.var(param)) 316 | self.weight_magnitudes.append(np.mean(means)) 317 | self.record_stat('weight_magnitudes', self.weight_magnitudes, epoch) 318 | self.weight_variances.append(np.mean(variances)) 319 | self.record_stat('weight_variances', self.weight_variances, epoch) 320 | 321 | def record_policy(self, epoch, policy): 322 | self.exploration_probs.append(policy.exploration_prob) 323 | self.record_stat('exploration_probs', self.exploration_probs, epoch) 324 | 325 | def log_trajectories(self, mdp): 326 | for trajectory in self.episode_actions: 327 | mdp.print_trajectory(trajectory) 328 | 329 | def log_hyperparameters(self, network, policy, replay_memory): 330 | if self.log_dir is None: 331 | self.create_log_dir() 332 | 333 | filename = 'hyperparameters.txt' 334 | filepath = os.path.join(self.log_dir, filename) 335 | hyperparameters = {} 336 | hyperparameters['batch_size'] = network.batch_size 337 | hyperparameters['num_hidden'] = network.num_hidden 338 | hyperparameters['num_parameters'] = lasagne.layers.count_params(network.l_out) 339 | hyperparameters['discount'] = network.discount 340 | hyperparameters['learning_rate'] = network.learning_rate 341 | hyperparameters['regularization'] = network.regularization 342 | hyperparameters['update_rule'] = network.update_rule 343 | hyperparameters['freeze_interval'] = network.freeze_interval 344 | hyperparameters['replay_memory_capacity'] = replay_memory.capacity 345 | hyperparameters['actions_until_min'] = policy.actions_until_min 346 | hyperparameters['epsilon'] = policy.exploration_prob 347 | if 
hasattr(network, 'network_type'): 348 | hyperparameters['network_type'] = network.network_type 349 | if hasattr(replay_memory, 'sequence_length'): 350 | hyperparameters['sequence_length'] = replay_memory.sequence_length 351 | 352 | with open(filepath, 'wb') as f: 353 | for k, v in hyperparameters.iteritems(): 354 | f.write('{}: {}\n'.format(k, v)) 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /scripts/mdps.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: Markov Decision Process classes 3 | """ 4 | 5 | import collections 6 | import copy 7 | import numpy as np 8 | import random 9 | import sys 10 | 11 | class MDP(object): 12 | 13 | def get_start_state(self): 14 | raise NotImplementedError("Override me") 15 | 16 | def get_actions(self): 17 | raise NotImplementedError("Override me") 18 | 19 | def succ_prob_reward(self, state, action): 20 | """ 21 | :description: returns a _list_ of tuples containing (next_state, probability, reward). Where the probability denotes the probability of the next_state and reward. 22 | """ 23 | raise NotImplementedError("Override me") 24 | 25 | def get_discount(self): 26 | raise NotImplementedError("Override me") 27 | 28 | def compute_states(self): 29 | self.states = set() 30 | self.graph = collections.defaultdict(lambda: set()) 31 | queue = [] 32 | self.states.add(self.get_start_state()) 33 | queue.append(self.get_start_state()) 34 | while len(queue) > 0: 35 | state = queue.pop() 36 | for action in self.get_actions(state): 37 | for newState, prob, reward in self.succ_prob_reward(state, action): 38 | if newState != state: 39 | self.graph[state].add(newState) 40 | if newState not in self.states: 41 | self.states.add(newState) 42 | if not self.is_end_state(newState): 43 | queue.append(newState) 44 | 45 | self.graph = {k:list(v) for k, v in self.graph.iteritems()} 46 | 47 | ########################################################################### 48 | 49 | class LineMDP(MDP): 50 | """ 51 | :description: A line mdp is just an x axis. Here the rewards are all -1 except for the last state on the right which is +1. 52 | """ 53 | 54 | EXIT_REWARD = 1 55 | MOVE_REWARD = -.01 56 | 57 | def __init__(self, length): 58 | self.length = length 59 | 60 | def get_start_state(self): 61 | return 0 62 | 63 | def get_actions(self, state=None): 64 | return [-1, 1] 65 | 66 | def get_discount(self): 67 | return 1 68 | 69 | def is_end_state(self, state): 70 | return state == self.length 71 | 72 | def succ_prob_reward(self, state, action): 73 | if state == self.length: 74 | return [] 75 | 76 | next_state = max(-self.length, state + action) 77 | reward = 1 if next_state == self.length else -1 78 | return [(next_state, 1, reward)] 79 | 80 | def print_v(self, V): 81 | line = ['-'] * (self.length * 2) 82 | for vidx, lidx in zip(range(-self.length, self.length), range(self.length * 2)): 83 | if vidx in V: 84 | line[lidx] = round(V[vidx], 2) 85 | print line 86 | 87 | def print_pi(self, pi): 88 | line = ['-'] * (self.length * 2) 89 | for pidx, lidx in zip(range(-self.length, self.length), range(self.length * 2)): 90 | if pidx in pi: 91 | line[lidx] = round(pi[pidx], 2) 92 | print line 93 | 94 | ########################################################################### 95 | 96 | class MazeMDP(MDP): 97 | """ 98 | :description: an MDP specifying a maze, where that maze is a square, consists of num_rooms and each room having room_size discrete squares in it. 
So can have 1x1, 2x2, 3x3, etc size mazes. Rooms are separated by walls with a single entrance between them. The start state is always the bottom left of the maze, 1 position away from each wall of the first room. The end state is always in the top right room of the maze, again 1 position away from each wall. So the 1x1 maze looks like: 99 | 100 | _______ 101 | | | 102 | | E | 103 | | S | 104 | | | 105 | ------- 106 | 107 | the 2x2 maze would be 108 | 109 | _______ _______ 110 | | | | 111 | | E | 112 | | | 113 | | | | 114 | -- -- -- -- 115 | __ __ __ __ 116 | | | | 117 | | | 118 | | S | 119 | | | | 120 | ------- ------- 121 | 122 | 123 | state is represented in absolute terms, so the bottom left corner of all mazes is (0,0) and to top right corner of all mazes is (room_size * num_rooms - 1, room_size * num_rooms - 1). In other words, the state ignores the fact that there are rooms or walls or anything, it's just the coordinates. 124 | 125 | actions are N,E,S,W movement by 1 direction. No stochasticity for now. moving into a wall leaves agent in place. Rewards are nothing except finding the exit is worth a lot 126 | 127 | room_size must be odd 128 | """ 129 | 130 | EXIT_REWARD = 1 131 | MOVE_REWARD = -0.01 132 | TRUE_START_STATE_VALUE = 0.83 133 | 134 | def __init__(self, room_size, num_rooms): 135 | self.room_size = room_size 136 | self.num_rooms = num_rooms 137 | self.max_position = self.room_size * self.num_rooms - 1 138 | self.end_state = (self.max_position, self.max_position) 139 | 140 | def get_default_action(self): 141 | return (1,0) 142 | 143 | def get_actions(self, state=None): 144 | return [(1,0),(0,1),(-1,0),(0,-1)] 145 | 146 | def get_start_state(self): 147 | return (0,0) 148 | 149 | def is_end_state(self, state): 150 | return state == self.end_state 151 | 152 | def get_discount(self): 153 | return 0.9 154 | 155 | def get_mean_state_values(self): 156 | return np.repeat(self.max_position / 2., 2) 157 | 158 | def calculate_next_state(self, state, action): 159 | next_state = (state[0] + action[0], state[1] + action[1]) 160 | return next_state 161 | 162 | def runs_into_wall(self, state, action): 163 | next_state = self.calculate_next_state(state, action) 164 | 165 | # 1. check for leaving the maze 166 | if next_state[0] > self.max_position or next_state[0] < 0 \ 167 | or next_state[1] > self.max_position or next_state[1] < 0: 168 | return True 169 | 170 | # 2. check if movement was through doorway and if so return false 171 | doorway_position = (self.room_size) / 2 172 | # check horizontal movement through doorway 173 | if next_state[0] != state[0]: 174 | if next_state[1] % self.room_size == doorway_position: 175 | return False 176 | 177 | # check vertical movement through doorway 178 | if next_state[1] != state[1]: 179 | if next_state[0] % self.room_size == doorway_position: 180 | return False 181 | 182 | # 3. 
check if movement was through a wall 183 | room_size = self.room_size 184 | # move right to left through wall 185 | if state[0] % room_size == room_size - 1 and next_state[0] % room_size == 0: 186 | return True 187 | 188 | # move left to right through wall 189 | if next_state[0] % room_size == room_size - 1 and state[0] % room_size == 0: 190 | return True 191 | 192 | # move up through wall 193 | if state[1] % room_size == room_size - 1 and next_state[1] % room_size == 0: 194 | return True 195 | 196 | # move down through wall 197 | if next_state[1] % room_size == room_size - 1 and state[1] % room_size == 0: 198 | return True 199 | 200 | # if none of the above conditions meet, then have not passed through wall 201 | return False 202 | 203 | def succ_prob_reward(self, state, action): 204 | 205 | # if we reach the end state then the episode ends 206 | if np.array_equal(state, self.end_state): 207 | raise ValueError('Provided state equals end_state, should have stopped episode in experiment. state: {}\taction:{}'.format(state, action)) 208 | 209 | if self.runs_into_wall(state, action): 210 | # if the action runs us into a wall do nothing 211 | next_state = state 212 | else: 213 | # o/w determine the next position 214 | next_state = self.calculate_next_state(state, action) 215 | 216 | # if next state is exit, then set reward 217 | reward = self.MOVE_REWARD 218 | if np.array_equal(next_state, self.end_state): 219 | reward = self.EXIT_REWARD 220 | 221 | #return [(next_state, 0.9, reward), (state, 0.1, self.MOVE_REWARD)] 222 | return [(next_state, 1, reward)] 223 | 224 | def print_v(self, V): 225 | for ridx in reversed(range(self.max_position + 1)): 226 | for cidx in range(self.max_position + 1): 227 | if (ridx, cidx) in V: 228 | print round(V[(ridx, cidx)], 3), 229 | print('\n') 230 | 231 | def get_value_string(self, V): 232 | value_string = [] 233 | for ridx in reversed(range(self.max_position + 1)): 234 | for cidx in range(self.max_position + 1): 235 | if (ridx, cidx) in V: 236 | value_string.append(round(V[(ridx, cidx)], 5)) 237 | value_string.append(' ') 238 | value_string.append('\n') 239 | return ''.join([str(v) for v in value_string]) 240 | 241 | def print_maze(self, coordinates): 242 | for row in range(self.room_size): 243 | for col in range(self.room_size): 244 | if coordinates == (row,col): 245 | print '*', 246 | elif self.end_state == (row,col): 247 | print 'e', 248 | else: 249 | print '-', 250 | print '\n' 251 | print '\n' 252 | 253 | def print_trajectory(self, actions): 254 | coordinates = self.get_start_state() 255 | self.print_maze(coordinates) 256 | for action in actions: 257 | action = self.get_actions()[action] 258 | if not self.runs_into_wall(coordinates, action): 259 | coordinates = (coordinates[0] + action[0], coordinates[1] + action[1]) 260 | self.print_maze(coordinates) 261 | 262 | ########################################################################### 263 | 264 | -------------------------------------------------------------------------------- /scripts/policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: classes implementing action selection policies 3 | """ 4 | 5 | import numpy as np 6 | import random 7 | 8 | import learning_utils 9 | 10 | class Policy(object): 11 | 12 | def __init__(self, num_actions): 13 | self.actions = range(num_actions) 14 | 15 | def choose_action(self, q_values): 16 | raise NotImplementedError("Override me") 17 | 18 | def random_action(self): 19 | return random.choice(self.actions) 
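# ---------------------------------------------------------------------------
# Editor's note (not part of the original file): a minimal rollout sketch showing
# how a Policy-style random action choice drives the MazeMDP defined in
# scripts/mdps.py above. It assumes the scripts/ directory is on sys.path; the
# 50-step cap is arbitrary and purely illustrative.
#
#     import random
#     import mdps
#
#     mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
#     state = mdp.get_start_state()
#     for _ in range(50):
#         action = random.choice(mdp.get_actions(state))
#         # the maze is deterministic, so there is exactly one successor tuple
#         next_state, prob, reward = mdp.succ_prob_reward(state, action)[0]
#         print state, action, reward
#         state = next_state
#         if mdp.is_end_state(state):
#             break
# ---------------------------------------------------------------------------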
20 | 21 | class EpsilonGreedy(Policy): 22 | 23 | def __init__(self, num_actions, exploration_prob, min_exploration_prob, actions_until_min): 24 | super(EpsilonGreedy, self).__init__(num_actions) 25 | self.exploration_prob = exploration_prob 26 | self.min_exploration_prob = min_exploration_prob 27 | self.actions_until_min = actions_until_min 28 | assert actions_until_min != 0, 'actions_until_min must be positive' 29 | self.exploration_reduction = (exploration_prob - min_exploration_prob) / float(actions_until_min) 30 | 31 | def choose_action(self, q_values): 32 | self.update_parameters() 33 | if random.random() < self.exploration_prob: 34 | return random.choice(self.actions) 35 | else: 36 | return np.argmax(q_values) 37 | 38 | def update_parameters(self): 39 | updated_exploration_prob = self.exploration_prob - self.exploration_reduction 40 | self.exploration_prob = max(self.min_exploration_prob, updated_exploration_prob) 41 | 42 | class Softmax(Policy): 43 | 44 | def __init__(self, num_actions, tau, min_tau, actions_until_min): 45 | super(Softmax, self).__init__(num_actions) 46 | self.tau = float(tau) 47 | self.min_tau = min_tau 48 | self.actions_until_min = actions_until_min 49 | assert actions_until_min != 0, 'actions_until_min must be positive' 50 | self.tau_reduction = (tau - min_tau) / float(actions_until_min) 51 | 52 | def choose_action(self, q_values): 53 | self.update_parameters() 54 | exp_q_values = np.exp(q_values / (self.tau + 1e-2)) 55 | weights = dict() 56 | for idx, val in enumerate(exp_q_values): 57 | weights[idx] = val 58 | action = learning_utils.weightedRandomChoice(weights) 59 | return action 60 | 61 | def update_parameters(self): 62 | updated_tau = self.tau - self.tau_reduction 63 | self.tau = max(self.min_tau, updated_tau) 64 | 65 | -------------------------------------------------------------------------------- /scripts/qnetwork.py: -------------------------------------------------------------------------------- 1 | """ 2 | :description: This file contains the QNetwork class, which has a variable number of 3 | fully-connected hidden layers. It also contains a similar class called 4 | ConvQNetwork that implements the network with convolutional layers. 
5 | """ 6 | 7 | import lasagne 8 | from lasagne.regularization import regularize_network_params, l2 9 | import numpy as np 10 | import theano 11 | import theano.tensor as T 12 | 13 | import learning_utils 14 | 15 | class QNetwork(object): 16 | 17 | def __init__(self, input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng): 18 | """ 19 | :type input_shape: int 20 | :param input_shape: the dimension of the input representation of the state 21 | 22 | :type batch_size: int 23 | :param batch_size: number of samples to use in computing the loss / updates 24 | 25 | :type num_hidden_layers: int 26 | :param num_hidden_layers: number of hidden layers to use in the network 27 | 28 | :type num_actions: int 29 | :param num_actions: the output dimension of the network measured in number of possible actions 30 | 31 | :type num_hidden: int 32 | :param num_hidden: number of hidden nodes to use in each layer (const across layers) 33 | 34 | :type discount: float 35 | :param discount: discount factor to use in computing Q-learning target values 36 | 37 | :type learning_rate: float 38 | :param learning_rate: the learning rate to use (no decay schedule since ADAM update assumed) 39 | 40 | :type regularization: float 41 | :param regularization: l2 regularization constant applied to weights 42 | 43 | :type update_rule: string 44 | :param update_rule: the type of update rule to use, suggest using 'adam' 45 | 46 | :type freeze_interval: int 47 | :param freeze_interval: the number of updates between updating the target network weights 48 | 49 | :type rng: rng 50 | :param rng: rng for running deterministically, o/w just leave as None 51 | 52 | :example call: 53 | network = qnetwork.QNetwork(input_shape=20, batch_size=64, num_hidden_layers=2, num_actions=4, 54 | num_hidden=4, discount=1, learning_rate=1e-3, regularization=1e-4, 55 | update_rule='adam', freeze_interval=1e5, rng=None) 56 | 57 | """ 58 | self.input_shape = input_shape 59 | self.batch_size = batch_size 60 | self.num_hidden_layers = num_hidden_layers 61 | self.num_actions = num_actions 62 | self.num_hidden = num_hidden 63 | self.discount = discount 64 | self.learning_rate = learning_rate 65 | self.regularization = regularization 66 | self.update_rule = update_rule 67 | self.freeze_interval = freeze_interval 68 | self.rng = rng if rng else np.random.RandomState() 69 | self.initialize_network() 70 | self.update_counter = 0 71 | 72 | def train(self, states, actions, rewards, next_states, terminals): 73 | """ 74 | :description: Perform a q-learning update using the (s,a,r,s') tuples provided 75 | 76 | :type states: np.array(dtype=theano.config.floatX) 77 | :param states: batch of states, shape (N,D) = (batch_size, input_shape) 78 | 79 | :type actions: np.array(dtype='int32') 80 | :param actions: the actions taken by the agent in the corresponding state from states 81 | shape = (N,) 82 | 83 | :type rewards: np.array(dtype=theano.config.floatX) 84 | :param rewards: rewards associated with being in state s and taking action a, shape = (N,) 85 | 86 | :type next_states: np.array(dtype=theano.config.floatX) 87 | :param next_states: batch of next_states, shape (N,D) = (batch_size, input_shape) 88 | 89 | :type terminals: np.array(dtype='int32') 90 | :param terminals: whether the corresponding state was a terminal state. If so, this 91 | will cause the max_a' Q(s',a') term to be zero in the q-learning loss. 
92 | 93 | :example call: 94 | states = np.array([[1,0],[0,1]]) 95 | actions = np.array([1,1]) 96 | rewards = np.array([.2,-.1]) 97 | next_states = np.array([[0,1],[1,0]]) 98 | terminals = np.array([0,0]) 99 | network.train(states, actions, rewards, next_states, terminals) 100 | 101 | """ 102 | 103 | if self.update_counter % self.freeze_interval == 0: 104 | self.reset_target_network() 105 | self.update_counter += 1 106 | 107 | self.states_shared.set_value(states) 108 | self.actions_shared.set_value(actions.astype('int32')) 109 | self.rewards_shared.set_value(rewards) 110 | self.next_states_shared.set_value(next_states) 111 | self.terminals_shared.set_value(terminals.astype('int32')) 112 | 113 | loss, q_values = self._train() 114 | return loss 115 | 116 | def get_q_values(self, state): 117 | """ 118 | :description: Returns the q_values associated with a single state for the purposes of 119 | deciding which action to take. 120 | 121 | :type state: np.array(dtype=theano.config.floatX) 122 | :param state: state to compute q_values for, shape = (D,) 123 | 124 | :example call: 125 | state = np.array([1,2]) 126 | network.get_q_values(state) 127 | """ 128 | # create a fake batch 129 | states = np.zeros((self.batch_size, self.input_shape), dtype=theano.config.floatX) 130 | 131 | # set the first item in that batch to the passed in state and set the shared variables 132 | states[0] = state 133 | self.states_shared.set_value(states) 134 | 135 | # do a forward pass using the theano function 'get_q_values' and index and return the first item 136 | q_values = self._get_q_values()[0] 137 | return q_values 138 | 139 | def get_params(self): 140 | """ 141 | :description: Return a numpy array containing all of the parameters of the network. 142 | Used for retrieving weights to save. 143 | """ 144 | return lasagne.layers.helper.get_all_param_values(self.l_out) 145 | 146 | def set_params(self, params): 147 | """ 148 | :description: Set the parameters of the network to the provided parameters. Used for 149 | loading saved weights. 150 | """ 151 | lasagne.layers.set_all_param_values(self.l_out, params) 152 | self.reset_target_network() 153 | 154 | def reset_target_network(self): 155 | """ 156 | :description: Set the target weights to the current weights. 157 | """ 158 | all_params = lasagne.layers.helper.get_all_param_values(self.l_out) 159 | lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) 160 | 161 | def finish_episode(self): 162 | pass 163 | 164 | ########################################################################################## 165 | #### Network and Learning Initialization below 166 | ########################################################################################## 167 | 168 | def initialize_network(self): 169 | """ 170 | :description: this method initializes the network, updates, and theano functions for training and 171 | retrieving q values. Here's an outline: 172 | 173 | 1. build the q network and target q network 174 | 2. initialize theano symbolic variables used for compiling functions 175 | 3. initialize the theano numeric variables used as input to functions 176 | 4. formulate the symbolic loss 177 | 5. formulate the symbolic updates 178 | 6. compile theano functions for training and for getting q_values 179 | """ 180 | batch_size, input_shape = self.batch_size, self.input_shape 181 | lasagne.random.set_rng(self.rng) 182 | 183 | # 1. 
build the q network and target q network 184 | self.l_out = self.build_network(input_shape, self.num_actions, batch_size) 185 | self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) 186 | self.reset_target_network() 187 | 188 | # 2. initialize theano symbolic variables used for compiling functions 189 | states = T.matrix('states') 190 | actions = T.icol('actions') 191 | rewards = T.col('rewards') 192 | next_states = T.matrix('next_states') 193 | # terminals are used to indicate a terminal state in the episode and hence a mask over the future 194 | # q values i.e., Q(s',a') 195 | terminals = T.icol('terminals') 196 | 197 | # 3. initialize the theano numeric variables used as input to functions 198 | self.states_shared = theano.shared(np.zeros((batch_size, input_shape), dtype=theano.config.floatX)) 199 | self.next_states_shared = theano.shared(np.zeros((batch_size, input_shape), dtype=theano.config.floatX)) 200 | self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 201 | broadcastable=(False, True)) 202 | self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 203 | broadcastable=(False, True)) 204 | self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 205 | broadcastable=(False, True)) 206 | 207 | # 4. formulate the symbolic loss 208 | q_vals = lasagne.layers.get_output(self.l_out, states) 209 | next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) 210 | target = (rewards + 211 | (T.ones_like(terminals) - terminals) * 212 | self.discount * T.max(next_q_vals, axis=1, keepdims=True)) 213 | # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' 214 | diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) 215 | 216 | # a lot of the recent work clips the td error at 1 so we do that here 217 | # the problem is that gradient backpropagating through this minimum node 218 | # will be zero if diff is larger then 1.0 (because changing params before 219 | # the minimum does not impact the output of the minimum). To account for 220 | # this we take the part of the td error (magnitude) greater than 1.0 and simply 221 | # add it to the loss, which allows gradient to backprop but just linearly 222 | # in the td error rather than quadratically 223 | quadratic_part = T.minimum(abs(diff), 1.0) 224 | linear_part = abs(diff) - quadratic_part 225 | loss = 0.5 * quadratic_part ** 2 + linear_part 226 | loss = T.sum(loss) + self.regularization * regularize_network_params(self.l_out, l2) 227 | 228 | # 5. formulate the symbolic updates 229 | params = lasagne.layers.helper.get_all_params(self.l_out) 230 | updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) 231 | 232 | # 6. compile theano functions for training and for getting q_values 233 | givens = { 234 | states: self.states_shared, 235 | next_states: self.next_states_shared, 236 | rewards: self.rewards_shared, 237 | actions: self.actions_shared, 238 | terminals: self.terminals_shared 239 | } 240 | self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) 241 | self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared}) 242 | 243 | def initialize_updates(self, update_rule, loss, params, learning_rate): 244 | """ 245 | :description: This method decides which updates to apply. Suggest using 'adam'. 
246 | """ 247 | if update_rule == 'adam': 248 | updates = lasagne.updates.adam(loss, params, learning_rate) 249 | elif update_rule == 'rmsprop': 250 | updates = lasagne.updates.rmsprop(loss, params, learning_rate) 251 | elif update_rule == 'sgd': 252 | updates = lasagne.updates.sgd(loss, params, learning_rate) 253 | updates = lasagne.updates.apply_nesterov_momentum(updates) 254 | else: 255 | raise ValueError("Unrecognized update: {}".format(update_rule)) 256 | return updates 257 | 258 | def build_network(self, input_shape, output_shape, batch_size): 259 | """ 260 | :description: Builds the computational graph in lasagne. 261 | """ 262 | 263 | l_in = lasagne.layers.InputLayer( 264 | shape=(batch_size, input_shape) 265 | ) 266 | 267 | l_hid = l_in 268 | for hidden_idx in range(self.num_hidden_layers): 269 | l_hid = lasagne.layers.DenseLayer( 270 | l_in, 271 | num_units=self.num_hidden, 272 | nonlinearity=lasagne.nonlinearities.leaky_rectify, 273 | W=lasagne.init.HeNormal(), 274 | b=lasagne.init.Constant(.1) 275 | ) 276 | 277 | l_out = lasagne.layers.DenseLayer( 278 | l_hid, 279 | num_units=output_shape, 280 | nonlinearity=None, 281 | W=lasagne.init.HeNormal(), 282 | b=lasagne.init.Constant(0) 283 | ) 284 | 285 | return l_out 286 | 287 | 288 | ############################################################################################## 289 | ########################## Convolutional Q net below ################################ 290 | ############################################################################################## 291 | 292 | class ConvQNetwork(object): 293 | """ 294 | :description: This class is very similar to the QNetwork above, but uses convolutional 295 | layers and therefore requires some different input shape details. 296 | """ 297 | 298 | def __init__(self, input_shape, batch_size, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng): 299 | self.input_shape = input_shape 300 | self.batch_size = batch_size 301 | self.num_actions = num_actions 302 | self.num_hidden = num_hidden 303 | self.discount = discount 304 | self.learning_rate = learning_rate 305 | self.regularization = regularization 306 | self.update_rule = update_rule 307 | self.freeze_interval = freeze_interval 308 | self.rng = rng if rng else np.random.RandomState() 309 | self.initialize_network() 310 | self.update_counter = 0 311 | 312 | def train(self, states, actions, rewards, next_states, terminals): 313 | if self.update_counter % self.freeze_interval == 0: 314 | self.reset_target_network() 315 | self.update_counter += 1 316 | 317 | self.states_shared.set_value(states) 318 | self.actions_shared.set_value(actions.astype('int32')) 319 | self.rewards_shared.set_value(rewards) 320 | self.next_states_shared.set_value(next_states) 321 | self.terminals_shared.set_value(terminals.astype('int32')) 322 | 323 | loss, q_values = self._train() 324 | return loss 325 | 326 | def get_q_values(self, state): 327 | states = np.zeros(self.states_shape, dtype=theano.config.floatX) 328 | states[0] = state 329 | self.states_shared.set_value(states) 330 | q_values = self._get_q_values()[0] 331 | return q_values 332 | 333 | def get_params(self): 334 | return lasagne.layers.helper.get_all_param_values(self.l_out) 335 | 336 | def reset_target_network(self): 337 | all_params = lasagne.layers.helper.get_all_param_values(self.l_out) 338 | lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params) 339 | 340 | 
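# ---------------------------------------------------------------------------
# Editor's note (not part of the original file): a plain-NumPy sketch of the
# arithmetic that the symbolic loss performs for a single batch element -- the
# same target computation and quadratic/linear TD-error split used in
# initialize_network below (and identically in QNetwork above), ignoring the L2
# regularization term and the sum/mean over the batch. All numbers are made up
# for illustration.
#
#     import numpy as np
#
#     discount = 0.9
#     reward, terminal = -0.01, 0
#     q_sa = 0.25                              # Q(s, a) for the action actually taken
#     next_q = np.array([0.1, 0.4, 0.3, 0.2])  # Q(s', a') from the frozen target network
#
#     target = reward + (1 - terminal) * discount * next_q.max()   # 0.35
#     diff = target - q_sa                                         # TD error = 0.10
#     quadratic = min(abs(diff), 1.0)   # clipped at 1 so gradients stay bounded
#     linear = abs(diff) - quadratic    # any excess error contributes linearly
#     loss = 0.5 * quadratic ** 2 + linear                         # 0.005
# ---------------------------------------------------------------------------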
########################################################################################## 341 | #### Network and Learning Initialization below 342 | ########################################################################################## 343 | 344 | def initialize_network(self): 345 | """ 346 | :description: this method initializes the network, updates, and theano functions for training and 347 | retrieving q values. Here's an outline: 348 | 349 | 1. build the q network and target q network 350 | 2. initialize theano symbolic variables used for compiling functions 351 | 3. initialize the theano numeric variables used as input to functions 352 | 4. formulate the symbolic loss 353 | 5. formulate the symbolic updates 354 | 6. compile theano functions for training and for getting q_values 355 | """ 356 | batch_size, input_shape = self.batch_size, self.input_shape 357 | lasagne.random.set_rng(self.rng) 358 | 359 | # 1. build the q network and target q network 360 | self.l_out = self.build_network(input_shape, self.num_actions, batch_size) 361 | self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) 362 | self.reset_target_network() 363 | 364 | # 2. initialize theano symbolic variables used for compiling functions 365 | states = T.tensor4('states') 366 | actions = T.icol('actions') 367 | rewards = T.col('rewards') 368 | next_states = T.tensor4('next_states') 369 | # terminals are used to indicate a terminal state in the episode and hence a mask over the future 370 | # q values i.e., Q(s',a') 371 | terminals = T.icol('terminals') 372 | 373 | # 3. initialize the theano numeric variables used as input to functions 374 | self.states_shape = (batch_size,) + (1,) + input_shape 375 | self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) 376 | self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) 377 | self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 378 | broadcastable=(False, True)) 379 | self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 380 | broadcastable=(False, True)) 381 | self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), 382 | broadcastable=(False, True)) 383 | 384 | # 4. formulate the symbolic loss 385 | q_vals = lasagne.layers.get_output(self.l_out, states) 386 | next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) 387 | target = (rewards + 388 | (T.ones_like(terminals) - terminals) * 389 | self.discount * T.max(next_q_vals, axis=1, keepdims=True)) 390 | # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' 391 | diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) 392 | 393 | 394 | # a lot of the deepmind work clips the td error at 1 so we do that here 395 | # the problem is that gradient backpropagating through this minimum node 396 | # will be zero if diff is larger then 1.0 (because changing params before 397 | # the minimum does not impact the output of the minimum). 
To account for 398 | # this we take the part of the td error (magnitude) greater than 1.0 and simply 399 | # add it to the loss, which allows gradient to backprop but just linearly 400 | # in the td error rather than quadratically 401 | quadratic_part = T.minimum(abs(diff), 1.0) 402 | linear_part = abs(diff) - quadratic_part 403 | loss = 0.5 * quadratic_part ** 2 + linear_part 404 | loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2) 405 | 406 | # 5. formulate the symbolic updates 407 | params = lasagne.layers.helper.get_all_params(self.l_out) 408 | updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) 409 | 410 | # 6. compile theano functions for training and for getting q_values 411 | givens = { 412 | states: self.states_shared, 413 | next_states: self.next_states_shared, 414 | rewards: self.rewards_shared, 415 | actions: self.actions_shared, 416 | terminals: self.terminals_shared 417 | } 418 | self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) 419 | self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared}) 420 | 421 | def initialize_updates(self, update_rule, loss, params, learning_rate): 422 | if update_rule == 'adam': 423 | updates = lasagne.updates.adam(loss, params, learning_rate) 424 | elif update_rule == 'rmsprop': 425 | updates = lasagne.updates.rmsprop(loss, params, learning_rate) 426 | elif update_rule == 'sgd': 427 | updates = lasagne.updates.sgd(loss, params, learning_rate) 428 | updates = lasagne.updates.apply_nesterov_momentum(updates) 429 | else: 430 | raise ValueError("Unrecognized update: {}".format(update_rule)) 431 | return updates 432 | 433 | def build_network(self, input_shape, output_shape, batch_size): 434 | 435 | l_in = lasagne.layers.InputLayer( 436 | shape=(batch_size,) + (1,) + input_shape 437 | ) 438 | 439 | l_conv1 = lasagne.layers.Conv2DLayer( 440 | l_in, 441 | num_filters=self.num_hidden, 442 | filter_size=(1,1), 443 | stride = 1, 444 | pad = 'same', 445 | nonlinearity=lasagne.nonlinearities.leaky_rectify, 446 | W=lasagne.init.HeNormal(), 447 | b=lasagne.init.Constant(.1) 448 | ) 449 | 450 | l_out = lasagne.layers.DenseLayer( 451 | l_conv1, 452 | num_units=output_shape, 453 | nonlinearity=None, 454 | W=lasagne.init.HeNormal(), 455 | b=lasagne.init.Constant(0) 456 | ) 457 | 458 | return l_out 459 | 460 | -------------------------------------------------------------------------------- /scripts/replay_memory.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import random 4 | import theano 5 | 6 | DEFAULT_CAPACITY = 10000 7 | 8 | class ReplayMemory(object): 9 | 10 | def __init__(self, batch_size, capacity=DEFAULT_CAPACITY): 11 | self.memory = {} 12 | self.batch_size = batch_size 13 | self.first_index = -1 14 | self.last_index = -1 15 | self.capacity = capacity 16 | self.terminal_count = 0 17 | 18 | def store(self, sars_tuple): 19 | self.terminal_count += sars_tuple[-1] 20 | if self.first_index == -1: 21 | self.first_index = 0 22 | self.last_index += 1 23 | self.memory[self.last_index] = sars_tuple 24 | if (self.last_index + 1 - self.first_index) > self.capacity: 25 | self.discard_sample() 26 | 27 | def is_full(self): 28 | return self.last_index + 1 - self.first_index >= self.capacity 29 | 30 | def is_empty(self): 31 | return self.first_index == -1 32 | 33 | def discard_sample(self): 34 | rand_index = random.randint(self.first_index, self.last_index) 35 | first_tuple = 
self.memory[self.first_index] 36 | del self.memory[rand_index] 37 | if rand_index != self.first_index: 38 | del self.memory[self.first_index] 39 | self.memory[rand_index] = first_tuple 40 | self.first_index += 1 41 | 42 | def sample(self): 43 | if self.is_empty(): 44 | raise Exception('Unable to sample from replay memory when empty') 45 | rand_sample_index = random.randint(self.first_index, self.last_index) 46 | return self.memory[rand_sample_index] 47 | 48 | def sample_batch(self): 49 | # must insert data into replay memory before sampling 50 | if self.is_empty(): 51 | raise Exception('Unable to sample from replay memory when empty') 52 | 53 | # determine shape of states 54 | state_shape = np.shape(self.memory.values()[0][0]) 55 | states_shape = (self.batch_size,) + state_shape 56 | 57 | states = np.empty(states_shape) 58 | actions = np.empty((self.batch_size, 1)) 59 | rewards = np.empty((self.batch_size, 1)) 60 | next_states = np.empty(states_shape) 61 | terminals = np.empty((self.batch_size, 1)) 62 | 63 | # sample batch_size times from the memory 64 | for idx in range(self.batch_size): 65 | state, action, reward, next_state, terminal = self.sample() 66 | states[idx] = state 67 | actions[idx] = action 68 | rewards[idx] = reward 69 | next_states[idx] = next_state 70 | terminals[idx] = terminal 71 | 72 | return states.astype(theano.config.floatX), actions, \ 73 | rewards.astype(theano.config.floatX), \ 74 | next_states.astype(theano.config.floatX), terminals 75 | 76 | class SequenceReplayMemory(object): 77 | """ 78 | :description: this is from https://github.com/spragunr/deep_q_rl 79 | """ 80 | 81 | def __init__(self, input_shape, sequence_length, batch_size, capacity): 82 | """ 83 | :type input_shape: int or tuple 84 | :param: the shape of the state input to the network 85 | 86 | :type sequence_length: int 87 | :param sequence_length: the length of the sequence used by the network 88 | 89 | :type batch_size: int 90 | :param batch_size: the size of a minibatch 91 | 92 | :type capacity: int 93 | :param capacity: maximum size of the replay memory 94 | """ 95 | self.input_shape = input_shape 96 | self.sequence_length = sequence_length 97 | self.batch_size = batch_size 98 | self.capacity = capacity 99 | self.bottom = 0 100 | self.top = 0 101 | self.size = 0 102 | 103 | if type(self.input_shape) is int: 104 | self.input_shape = (self.input_shape, ) 105 | 106 | if self.sequence_length == 1: 107 | self.sequence_shape = self.input_shape 108 | else: 109 | self.sequence_shape = (self.sequence_length,) + self.input_shape 110 | self.batch_shape = (self.batch_size, ) + self.sequence_shape 111 | 112 | # Allocate the circular buffers 113 | self.states = np.zeros(((self.capacity, ) + self.input_shape), dtype='int32') 114 | self.actions = np.zeros(self.capacity, dtype='int32') 115 | self.rewards = np.zeros(self.capacity, dtype=theano.config.floatX) 116 | self.terminals = np.zeros(self.capacity, dtype='bool') 117 | 118 | def store(self, state, action, reward, terminal): 119 | """ 120 | :description: stores a state, the action taken in that state, and the reward received for 121 | for being the state (i.e., we use r(s) not r(s,a)) in the replay memory 122 | 123 | :type state: np.array 124 | :param state: the current state 125 | 126 | :type action: int 127 | :param action: the action taken in this state 128 | 129 | :type reward: float 130 | :param reward: the reward received for being in state 131 | """ 132 | 133 | self.states[self.top] = state 134 | self.actions[self.top] = action 135 | 
self.rewards[self.top] = reward 136 | self.terminals[self.top] = terminal 137 | 138 | if self.size == self.capacity: 139 | self.bottom = (self.bottom + 1) % self.capacity 140 | else: 141 | self.size += 1 142 | 143 | self.top = (self.top + 1) % self.capacity 144 | 145 | def make_last_sequence(self, next_state): 146 | """ 147 | :description: given a state, this method creates a sequence of sequence_length where 148 | the last state in that sequence is passed in state. This is used to get an action 149 | 150 | :type next_state: np.array 151 | :param next_state: the next state to be inserted last into the sequence 152 | """ 153 | 154 | # take states from the memory 155 | sequence = np.zeros(self.sequence_shape, dtype=theano.config.floatX) 156 | indexes = np.arange(self.top - self.sequence_length + 1, self.top) 157 | sequence[0:self.sequence_length - 1] = self.states.take(indexes, axis=0, mode='wrap') 158 | 159 | # set current states value in sequence 160 | sequence[-1] = next_state 161 | 162 | # take the same terminal values from the memory 163 | terminals = self.terminals.take(indexes, axis=0, mode='wrap') 164 | 165 | # if any of those terminals are true, then set indexes of the 166 | # sequence up to and including the index to zero 167 | true_terminals = np.argwhere(terminals == True) 168 | if len(true_terminals) > 0: 169 | real_start = true_terminals[-1] + 1 170 | sequence[:real_start] = 0 171 | 172 | return sequence 173 | 174 | def is_full(self): 175 | """ 176 | :description: is the replay memory full 177 | """ 178 | return self.size == self.capacity 179 | 180 | def sample_batch(self): 181 | """ 182 | :description: sample a minibatch of data 183 | """ 184 | 185 | # must insert sufficient data into replay memory before sampling 186 | if not self.is_full(): 187 | raise Exception('Unable to sample from replay memory when empty') 188 | 189 | # allocate batch containers 190 | states = np.empty(self.batch_shape) 191 | actions = np.empty((self.batch_size, 1)) 192 | rewards = np.empty((self.batch_size, 1)) 193 | next_states = np.empty(self.batch_shape) 194 | terminals = np.empty((self.batch_size, 1)) 195 | 196 | # sample batch_size times from the memory 197 | count = 0 198 | while count < self.batch_size: 199 | 200 | index = np.random.randint(self.bottom, self.bottom + self.size - self.sequence_length) 201 | initial_indices = np.arange(index, index + self.sequence_length) 202 | transition_indices = initial_indices + 1 203 | end_index = index + self.sequence_length - 1 204 | 205 | # original quote: 206 | # "Check that the initial state corresponds entirely to a 207 | # single episode, meaning none but the last frame may be 208 | # terminal. If the last frame of the initial state is 209 | # terminal, then the last frame of the transitioned state 210 | # will actually be the first frame of a new episode, which 211 | # the Q learner recognizes and handles correctly during 212 | # training by zeroing the discounted future reward estimate." 213 | if np.any(self.terminals.take(initial_indices[:-1], mode='wrap')): 214 | continue 215 | 216 | # Add the state transition to the response. 
217 | states[count] = self.states.take(initial_indices, axis=0, mode='wrap') 218 | actions[count] = self.actions.take([end_index], mode='wrap')[0] 219 | rewards[count] = self.rewards.take([end_index], mode='wrap')[0] 220 | terminals[count] = self.terminals.take([end_index], mode='wrap')[0] 221 | next_states[count] = self.states.take(transition_indices, axis=0, mode='wrap') 222 | count += 1 223 | 224 | return states.astype(theano.config.floatX), \ 225 | actions, \ 226 | rewards.astype(theano.config.floatX), \ 227 | next_states.astype(theano.config.floatX), \ 228 | terminals 229 | 230 | -------------------------------------------------------------------------------- /scripts/state_adapters.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class CoordinatesToSingleRoomRowColAdapter(object): 5 | 6 | def __init__(self, room_size): 7 | self.room_size = room_size 8 | 9 | def convert_state_to_agent_format(self, state): 10 | """ 11 | Convert states in format (x, y) to a single room, row, column one-hot vector 12 | example: 13 | >>>state = (4, 4) 14 | >>>adapter = CoordinatesToSingleRoomRowColAdapter(room_size=3) 15 | >>>adapter.convert_state_to_agent_format(state) 16 | [1,0,0,1,0,0] 17 | """ 18 | ridx, cidx = state 19 | 20 | # find where the agent is in the room 21 | row = np.zeros(self.room_size) 22 | row[ridx % self.room_size] = 1 23 | col = np.zeros(self.room_size) 24 | col[cidx % self.room_size] = 1 25 | 26 | # concat the two vectors 27 | formatted_state = np.hstack((row, col)) 28 | 29 | return formatted_state 30 | 31 | class CoordinatesToRowColAdapter(object): 32 | 33 | def __init__(self, room_size, num_rooms): 34 | self.room_size = room_size 35 | self.num_rooms = num_rooms 36 | 37 | def convert_state_to_agent_format(self, state): 38 | """ 39 | Convert states in format (x, y) to a single room, row, column one-hot vector 40 | example: 41 | >>>state = (4, 4) 42 | >>>adapter = CoordinatesToSingleRoomRowColAdapter(room_size=3, num_rooms=2) 43 | >>>adapter.convert_state_to_agent_format(state) 44 | [0,0,0,0,1,0,0,0,0,0,1,0] 45 | """ 46 | ridx, cidx = state 47 | 48 | # find where the agent is in the room 49 | row = np.zeros(self.room_size * self.num_rooms) 50 | row[ridx] = 1 51 | col = np.zeros(self.room_size * self.num_rooms) 52 | col[cidx] = 1 53 | 54 | # concat the two vectors 55 | formatted_state = np.hstack((row, col)) 56 | 57 | return formatted_state 58 | 59 | class CoordinatesToRowColRoomAdapter(object): 60 | 61 | def __init__(self, room_size, num_rooms): 62 | self.room_size = room_size 63 | self.num_rooms = num_rooms 64 | 65 | def convert_state_to_agent_format(self, state): 66 | """ 67 | Convert states in format (x, y) to a single room, row, column one-hot vector 68 | _with_ an additional one-hot vector identifying the room 69 | example: 70 | >>>state = (4, 4) 71 | >>>adapter = CoordinatesToSingleRoomRowColAdapter(room_size=3, num_rooms=2) 72 | >>>adapter.convert_state_to_agent_format(state) 73 | [1,0,0,1,0,0,0,0,0,1] 74 | """ 75 | ridx, cidx = state 76 | 77 | # find where the agent is in the room 78 | row = np.zeros(self.room_size) 79 | row[ridx % self.room_size] = 1 80 | col = np.zeros(self.room_size) 81 | col[cidx % self.room_size] = 1 82 | room = np.zeros(self.num_rooms ** 2) 83 | room_row = cidx / self.room_size 84 | room_col = ridx / self.room_size 85 | room_idx = room_row * self.num_rooms + room_col 86 | room[room_idx] = 1 87 | # concat the three vectors 88 | formatted_state = np.hstack((row, col, room)) 89 | 90 | 
return formatted_state 91 | 92 | class CoordinatesToFlattenedGridAdapter(object): 93 | 94 | def __init__(self, room_size): 95 | self.room_size = room_size 96 | self.num_rooms = num_rooms 97 | 98 | def convert_state_to_agent_format(self, state): 99 | """ 100 | Convert states in format (x, y) to the full grid 101 | """ 102 | ridx, cidx = state 103 | 104 | # find where the agent is in the room 105 | grid = np.zeros((room_size * num_rooms, room_size * num_rooms)) 106 | grid[ridx, cidx] = 1 107 | 108 | # flatten grid 109 | formatted_state = grid.flatten() 110 | 111 | return formatted_state 112 | 113 | class IdentityAdapter(object): 114 | 115 | def convert_state_to_agent_format(self, state): 116 | """ 117 | Returns the state as is. Exists to keep the interface consistent. 118 | """ 119 | return state 120 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wulfebw/hierarchical_rl/0156dd7b1675a0c3a3b7d81cb66721cbba406e28/tests/__init__.py -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | test_loader = unittest.defaultTestLoader.discover( '.' ) 4 | test_runner = unittest.TextTestRunner(verbosity=2) 5 | test_runner.run(test_loader) -------------------------------------------------------------------------------- /tests/test_aws_s3_utility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import unittest 4 | 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 6 | 7 | import aws_s3_utility 8 | 9 | class TestAWSS3Utility(unittest.TestCase): 10 | 11 | def test_directory_upload(self): 12 | ak = '' 13 | sk = '' 14 | bucket = 'hierarchical' 15 | aws_util = aws_s3_utility.S3Utility(ak, sk, bucket) 16 | directory = '.' 
17 | aws_util.upload_directory('/Users/wulfe/Desktop/aws_run/NeuralAgent_2016-02-20T02:59:50.808542') 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | 22 | -------------------------------------------------------------------------------- /tests/test_build_network.py: -------------------------------------------------------------------------------- 1 | 2 | import lasagne 3 | from lasagne.regularization import regularize_network_params, l2 4 | import numpy as np 5 | import os 6 | import random 7 | import sys 8 | import theano 9 | import theano.tensor as T 10 | import unittest 11 | 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 13 | 14 | def build_hierachical_stacked_lstm_network_with_merge(input_shape, sequence_length, batch_size, output_shape, start=1, downsample=2): 15 | 16 | l_in = lasagne.layers.InputLayer( 17 | shape=(batch_size, sequence_length, input_shape) 18 | ) 19 | 20 | default_gate = lasagne.layers.recurrent.Gate( 21 | W_in=lasagne.init.HeNormal(), W_hid=lasagne.init.HeNormal(), 22 | b=lasagne.init.Constant(0.)) 23 | forget_gate = lasagne.layers.recurrent.Gate( 24 | W_in=lasagne.init.HeNormal(), W_hid=lasagne.init.HeNormal(), 25 | b=lasagne.init.Constant(5.)) 26 | l_lstm1 = lasagne.layers.LSTMLayer( 27 | l_in, 28 | num_units=10, 29 | nonlinearity=lasagne.nonlinearities.tanh, 30 | cell=default_gate, 31 | ingate=default_gate, 32 | outgate=default_gate, 33 | forgetgate=forget_gate, 34 | grad_clipping=2, 35 | only_return_final=False 36 | ) 37 | 38 | # does this slice out the correct values? 39 | l_slice1_up = lasagne.layers.SliceLayer(l_lstm1, slice(start, sequence_length, downsample), 1) 40 | 41 | l_lstm2 = lasagne.layers.LSTMLayer( 42 | l_slice1_up, 43 | num_units=10, 44 | nonlinearity=lasagne.nonlinearities.tanh, 45 | cell=default_gate, 46 | ingate=default_gate, 47 | outgate=default_gate, 48 | forgetgate=forget_gate, 49 | grad_clipping=2, 50 | only_return_final=True 51 | ) 52 | 53 | l_slice1_out = lasagne.layers.SliceLayer(l_lstm1, -1, 1) 54 | l_merge = lasagne.layers.ConcatLayer([l_slice1_out, l_lstm2]) 55 | l_out = lasagne.layers.DenseLayer( 56 | l_merge, 57 | num_units=output_shape, 58 | nonlinearity=None, 59 | W=lasagne.init.HeNormal(), 60 | b=lasagne.init.Constant(0) 61 | ) 62 | 63 | return l_out, l_lstm1, l_slice1_up 64 | 65 | class TestBuildHierarchicalStackedLSTMWithMerge(unittest.TestCase): 66 | 67 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice(self): 68 | input_shape = 14 69 | sequence_length = 4 70 | batch_size = 1 71 | _, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 72 | input_shape=input_shape, 73 | sequence_length=sequence_length, 74 | batch_size=batch_size, 75 | output_shape=4) 76 | 77 | states = T.tensor3('states') 78 | lstm_out = lasagne.layers.get_output(l_lstm, states) 79 | slice_out = lasagne.layers.get_output(l_slice, states) 80 | run = theano.function([states], [lstm_out, slice_out]) 81 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 82 | sample_lstm_out, sample_slice_out = run(sample_states) 83 | 84 | self.assertEquals(sample_lstm_out[:, 1::2, :].tolist(), sample_slice_out.tolist()) 85 | 86 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_short_seq(self): 87 | input_shape = 14 88 | sequence_length = 2 89 | batch_size = 1 90 | _, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 91 | input_shape=input_shape, 92 | sequence_length=sequence_length, 93 | batch_size=batch_size, 94 
| output_shape=4) 95 | 96 | states = T.tensor3('states') 97 | lstm_out = lasagne.layers.get_output(l_lstm, states) 98 | slice_out = lasagne.layers.get_output(l_slice, states) 99 | run = theano.function([states], [lstm_out, slice_out]) 100 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 101 | sample_lstm_out, sample_slice_out = run(sample_states) 102 | 103 | self.assertEquals(sample_lstm_out[:, 1::2, :].tolist(), sample_slice_out.tolist()) 104 | 105 | 106 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_len_1_seq(self): 107 | input_shape = 14 108 | sequence_length = 1 109 | batch_size = 1 110 | l_out, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 111 | input_shape=input_shape, 112 | sequence_length=sequence_length, 113 | batch_size=batch_size, 114 | output_shape=4, 115 | start=0, 116 | downsample=3) 117 | 118 | states = T.tensor3('states') 119 | l_out_out = lasagne.layers.get_output(l_out, states) 120 | lstm_out = lasagne.layers.get_output(l_lstm, states) 121 | slice_out = lasagne.layers.get_output(l_slice, states) 122 | run = theano.function([states], [l_out_out, lstm_out, slice_out]) 123 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 124 | sample_out, sample_lstm_out, sample_slice_out = run(sample_states) 125 | 126 | self.assertEquals(sample_lstm_out[:, 0::3, :].tolist(), sample_slice_out.tolist()) 127 | 128 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_longer_len_seq(self): 129 | input_shape = 14 130 | sequence_length = 7 131 | batch_size = 1 132 | l_out, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 133 | input_shape=input_shape, 134 | sequence_length=sequence_length, 135 | batch_size=batch_size, 136 | output_shape=4, 137 | start=0, 138 | downsample=3) 139 | 140 | states = T.tensor3('states') 141 | l_out_out = lasagne.layers.get_output(l_out, states) 142 | lstm_out = lasagne.layers.get_output(l_lstm, states) 143 | slice_out = lasagne.layers.get_output(l_slice, states) 144 | run = theano.function([states], [l_out_out, lstm_out, slice_out]) 145 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 146 | sample_out, sample_lstm_out, sample_slice_out = run(sample_states) 147 | 148 | self.assertEquals(sample_lstm_out[:, 0::3, :].tolist(), sample_slice_out.tolist()) 149 | 150 | def test_build_hierachical_stacked_lstm_network_with_merge_correct_slice_shared_var(self): 151 | input_shape = 14 152 | sequence_length = 1 153 | batch_size = 1 154 | _, l_lstm, l_slice = build_hierachical_stacked_lstm_network_with_merge( 155 | input_shape=input_shape, 156 | sequence_length=sequence_length, 157 | batch_size=batch_size, 158 | output_shape=4) 159 | 160 | states = T.tensor3('states') 161 | lstm_out = lasagne.layers.get_output(l_lstm, states) 162 | slice_out = lasagne.layers.get_output(l_slice, states) 163 | 164 | states_shared = theano.shared(np.zeros((batch_size, sequence_length, input_shape))) 165 | run = theano.function([], [lstm_out, slice_out], givens={states: states_shared}) 166 | sample_states = np.zeros((batch_size, sequence_length, input_shape)) 167 | states_shared.set_value(sample_states) 168 | sample_lstm_out, sample_slice_out = run() 169 | 170 | self.assertEquals(sample_lstm_out[:, 1::2, :].tolist(), sample_slice_out.tolist()) 171 | 172 | 173 | if __name__ == '__main__': 174 | unittest.main() 175 | -------------------------------------------------------------------------------- /tests/test_experiment.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 8 | 9 | import agent 10 | import experiment 11 | import logger 12 | import mdps 13 | import policy 14 | import recurrent_qnetwork 15 | import replay_memory 16 | import state_adapters 17 | 18 | def get_V(e): 19 | V = {} 20 | e.agent.exploration_prob = 0 21 | for state in e.mdp.states: 22 | qopt = max((e.agent.getQ(state, action), action) for action in e.agent.actions)[0] 23 | V[state] = qopt 24 | return V 25 | 26 | class TestExperiment(unittest.TestCase): 27 | 28 | def setUp(self): 29 | pass 30 | 31 | class TestExperimentBasicRuns(TestExperiment): 32 | 33 | def test_run_basic_mdp_and_agent_episodes(self): 34 | mdp = mdps.LineMDP(5) 35 | a = agent.TestAgent(len(mdp.get_actions())) 36 | num_epochs = 1 37 | epoch_length = 10 38 | test_epoch_length = 0 39 | max_steps = 100 40 | run_tests = False 41 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 42 | e.run() 43 | actual = e.agent.episodes 44 | expected = e.num_epochs * e.epoch_length 45 | self.assertEquals(actual, expected) 46 | 47 | def test_run_basic_mdp_and_agent_many_episodes(self): 48 | mdp = mdps.LineMDP(5) 49 | a = agent.TestAgent(len(mdp.get_actions())) 50 | num_epochs = 5 51 | epoch_length = 10 52 | test_epoch_length = 0 53 | max_steps = 100 54 | run_tests = False 55 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 56 | e.run() 57 | actual = e.agent.episodes 58 | expected = e.num_epochs * e.epoch_length 59 | self.assertEquals(actual, expected) 60 | 61 | class TestExperimentMazeSolving(TestExperiment): 62 | 63 | def test_run_with_maze_mdp_and_working_agent_completes(self): 64 | mdp = mdps.MazeMDP(5, 1) 65 | num_actions = len(mdp.get_actions(None)) 66 | discount = 1 67 | exploration_prob = .3 68 | step_size = 1e-2 69 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 70 | num_epochs = 1 71 | epoch_length = 1 72 | test_epoch_length = 0 73 | max_steps = 10000 74 | run_tests = False 75 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 76 | e.run() 77 | total_len = len(e.agent.logger.actions) 78 | self.assertTrue(total_len < max_steps * epoch_length * num_epochs) 79 | 80 | def test_run_with_small_maze_mdp_q_learning_agent_correct_V(self): 81 | mdp = mdps.MazeMDP(5, 1) 82 | mdp.compute_states() 83 | mdp.EXIT_REWARD = 1 84 | mdp.MOVE_REWARD = -0.1 85 | num_actions = len(mdp.get_actions(None)) 86 | discount = 1 87 | exploration_prob = .7 88 | step_size = 5e-1 89 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 90 | num_epochs = 20 91 | epoch_length = 100 92 | test_epoch_length = 0 93 | max_steps = 100 94 | run_tests = False 95 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 96 | e.run() 97 | 98 | V = get_V(e) 99 | expected = {(0,0):0.3, (1,0):0.4, (2,0):0.5, (3,0):0.6, (4,0):0.7, 100 | (0,1):0.4, (1,1):0.5, (2,1):0.6, (3,1):0.7, (4,1):0.8, 101 | (0,2):0.5, (1,2):0.6, (2,2):0.7, (3,2):0.8, (4,2):0.9, 102 | (0,3):0.6, (1,3):0.7, (2,3):0.8, (3,3):0.9, (4,3):1.0, 103 | (0,4):0.7, 
(1,4):0.8, (2,4):0.9, (3,4):1.0, (4,4):0.0} 104 | 105 | max_diff = 1e-1 106 | for k in expected.keys(): 107 | self.assertTrue(k in V) 108 | self.assertTrue(np.abs(V[k] - expected[k]) < max_diff) 109 | 110 | def test_run_with_large_maze_mdp_q_learning_agent_correct_V(self): 111 | mdp = mdps.MazeMDP(5, 3) 112 | mdp.compute_states() 113 | mdp.EXIT_REWARD = 1 114 | mdp.MOVE_REWARD = -0.1 115 | num_actions = len(mdp.get_actions(None)) 116 | discount = 1 117 | exploration_prob = .5 118 | step_size = .1 119 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 120 | num_epochs = 10 121 | epoch_length = 200 122 | test_epoch_length = 0 123 | max_steps = 300 124 | run_tests = False 125 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 126 | e.run() 127 | 128 | V = get_V(e) 129 | actual_total = 0 130 | for k, v in V.iteritems(): 131 | actual_total += v 132 | expected_total_min = -110 133 | expected_total_max = -40 134 | self.assertTrue(actual_total < expected_total_max) 135 | self.assertTrue(actual_total > expected_total_min) 136 | 137 | def test_run_with_standard_maze_mdp_q_learning_agent_correct_V(self): 138 | mdp = mdps.MazeMDP(5, 2) 139 | mdp.compute_states() 140 | mdp.EXIT_REWARD = 1 141 | mdp.MOVE_REWARD = -0.01 142 | num_actions = len(mdp.get_actions(None)) 143 | discount = 1 144 | exploration_prob = .5 145 | step_size = .1 146 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, exploration_prob=exploration_prob, step_size=step_size, logging=False) 147 | num_epochs = 10 148 | epoch_length = 200 149 | test_epoch_length = 0 150 | max_steps = 300 151 | run_tests = False 152 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests) 153 | e.run() 154 | 155 | V = get_V(e) 156 | actual_total = 0 157 | for k, v in V.iteritems(): 158 | actual_total += v 159 | expected_total_min = -110 160 | expected_total_max = -40 161 | self.assertTrue(actual_total < expected_total_max) 162 | self.assertTrue(actual_total > expected_total_min) 163 | 164 | class TestExperimentValueString(TestExperiment): 165 | 166 | def test_sequence_value_string(self): 167 | room_size = 3 168 | num_rooms = 3 169 | mdp = mdps.MazeMDP(room_size, num_rooms) 170 | mdp.compute_states() 171 | mdp.EXIT_REWARD = 1 172 | mdp.MOVE_REWARD = -0.1 173 | discount = 1 174 | sequence_length = 2 175 | batch_size = 10 176 | learning_rate = 1e-3 177 | freeze_interval = 10000 178 | num_hidden = 4 179 | eps = .5 180 | reg = 1e-8 181 | num_actions = len(mdp.get_actions(None)) 182 | batch_size = 100 183 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape=2 * room_size, 184 | sequence_length=sequence_length, batch_size=batch_size, 185 | num_actions=4, num_hidden=num_hidden, discount=discount, learning_rate= 186 | learning_rate, regularization=reg, update_rule='adam', freeze_interval= 187 | freeze_interval, network_type='single_layer_lstm', rng=None) 188 | num_epochs = 5 189 | epoch_length = 10 190 | test_epoch_length = 0 191 | max_steps = (room_size * num_rooms) ** 2 192 | epsilon_decay = (num_epochs * epoch_length * max_steps) / 2 193 | adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 194 | p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay) 195 | rm = replay_memory.SequenceReplayMemory(input_shape=2 * room_size, 196 | sequence_length=sequence_length, batch_size=batch_size, capacity=50000) 197 | log = 
logger.NeuralLogger(agent_name='RecurrentQNetwork') 198 | a = agent.RecurrentNeuralAgent(network=network, policy=p, replay_memory=rm, log=log, state_adapter=adapter) 199 | run_tests = False 200 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, 201 | max_steps, run_tests, value_logging=True) 202 | e.log_temporal_value_string() 203 | 204 | 205 | 206 | if __name__ == '__main__': 207 | unittest.main() 208 | 209 | 210 | -------------------------------------------------------------------------------- /tests/test_learning_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import os 4 | import sys 5 | import unittest 6 | 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 8 | 9 | import learning_utils 10 | 11 | 12 | class TestMakeHeatMap(unittest.TestCase): 13 | 14 | def test_make_heat_map(self): 15 | filepath = '/Users/wulfe/Dropbox/School/Stanford/winter_2016/cs239/project/hierarchical_rl/logs/rqn_4_step_stacked_2roomx5x5_row_col/value_image.txt' 16 | epoch = 1 17 | learning_utils.make_heat_map(filepath, epoch) 18 | 19 | if __name__ == '__main__': 20 | unittest.main() -------------------------------------------------------------------------------- /tests/test_logger.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import os 4 | import shutil 5 | import sys 6 | import unittest 7 | 8 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 9 | 10 | import agent 11 | import experiment 12 | import logger 13 | import mdps 14 | 15 | class TestMazeMDP(unittest.TestCase): 16 | 17 | def test_log_epoch_empty_log(self): 18 | l = logger.Logger(agent_name='test') 19 | l.log_epoch(epoch=0) 20 | log_dir = l.log_dir 21 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'actions.npz'))) 22 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'rewards.npz'))) 23 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'losses.npz'))) 24 | shutil.rmtree(log_dir) 25 | 26 | # class TestMovingAverage(unittest.TestCase): 27 | 28 | # def test_moving_average_single_item_window(self): 29 | # arr = [1,2,3] 30 | # actual = logger.moving_average(arr, 1) 31 | # self.assertSequenceEqual(actual, arr) 32 | 33 | # def test_moving_average_small_window(self): 34 | # arr = [1,2,3,4,5,6,7] 35 | # actual = logger.moving_average(arr, 2) 36 | # expected = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5] 37 | # self.assertSequenceEqual(actual, expected) 38 | 39 | # def test_moving_average_small_window_large_variance(self): 40 | # arr = [0,9,0,9,0] 41 | # actual = logger.moving_average(arr, 3) 42 | # expected = [3, 3, 6, 3, 3] 43 | # self.assertSequenceEqual(actual, expected) 44 | 45 | # def test_moving_average_large_window_large_variance(self): 46 | # arr = [0,9,0,9,0] 47 | # actual = logger.moving_average(arr, 4) 48 | # expected = [2.25, 2.25, 4.5, 4.5, 2.25] 49 | # self.assertSequenceEqual(actual, expected) 50 | 51 | 52 | class testLoggerGraphing(unittest.TestCase): 53 | 54 | def test_graphs_are_plotted_and_saved_during_experiment(self): 55 | mdp = mdps.MazeMDP(5, 3) 56 | mdp.compute_states() 57 | mdp.EXIT_REWARD = 1 58 | mdp.MOVE_REWARD = -0.1 59 | num_actions = len(mdp.get_actions(None)) 60 | discount = mdp.get_discount() 61 | exploration_prob = .5 62 | step_size = 1 63 | a = agent.QLearningAgent(num_actions=num_actions, discount=discount, 
exploration_prob=exploration_prob, step_size=step_size, logging=True) 64 | num_epochs = 1 65 | epoch_length = 100 66 | test_epoch_length = 0 67 | max_steps = 1000 68 | run_tests = False 69 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, False) 70 | e.run() 71 | 72 | log_dir = e.agent.logger.log_dir 73 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'actions_graph.png'))) 74 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'losses_graph.png'))) 75 | self.assertTrue(os.path.isfile(os.path.join(log_dir, 'rewards_graph.png'))) 76 | shutil.rmtree(log_dir) 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | 81 | -------------------------------------------------------------------------------- /tests/test_mdps.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 7 | 8 | import agent 9 | import experiment 10 | import mdps 11 | 12 | class TestMazeMDPLogic(unittest.TestCase): 13 | 14 | """ runs_into_wall tests """ 15 | def test_leave_maze_negative_x(self): 16 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 17 | state = (0,0) 18 | action = (-1,0) 19 | actual = mdp.runs_into_wall(state, action) 20 | expected = True 21 | self.assertEquals(actual, expected) 22 | 23 | def test_leave_maze_positive_x(self): 24 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 25 | state = (4,0) 26 | action = (1,0) 27 | actual = mdp.runs_into_wall(state, action) 28 | expected = True 29 | self.assertEquals(actual, expected) 30 | 31 | def test_leave_maze_negative_y(self): 32 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 33 | state = (0,0) 34 | action = (0,-1) 35 | actual = mdp.runs_into_wall(state, action) 36 | expected = True 37 | self.assertEquals(actual, expected) 38 | 39 | def test_leave_maze_positive_y(self): 40 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 41 | state = (0,4) 42 | action = (0,1) 43 | actual = mdp.runs_into_wall(state, action) 44 | expected = True 45 | self.assertEquals(actual, expected) 46 | 47 | def test_leave_maze_negative_x_false(self): 48 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 49 | state = (1,0) 50 | action = (-1,0) 51 | actual = mdp.runs_into_wall(state, action) 52 | expected = False 53 | self.assertEquals(actual, expected) 54 | 55 | def test_leave_maze_positive_x_false(self): 56 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 57 | state = (3,0) 58 | action = (1,0) 59 | actual = mdp.runs_into_wall(state, action) 60 | expected = False 61 | self.assertEquals(actual, expected) 62 | 63 | def test_leave_maze_negative_y_false(self): 64 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 65 | state = (0,1) 66 | action = (0,-1) 67 | actual = mdp.runs_into_wall(state, action) 68 | expected = False 69 | self.assertEquals(actual, expected) 70 | 71 | def test_leave_maze_positive_y_false(self): 72 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 73 | state = (0,3) 74 | action = (0,1) 75 | actual = mdp.runs_into_wall(state, action) 76 | expected = False 77 | self.assertEquals(actual, expected) 78 | 79 | def test_wall_cross_x_right_to_left(self): 80 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 81 | state = (4,0) 82 | action = (1,0) 83 | actual = mdp.runs_into_wall(state, action) 84 | expected = True 85 | self.assertEquals(actual, expected) 86 | 87 | def test_wall_cross_x_left_to_right(self): 88 | mdp = mdps.MazeMDP(room_size=5, 
num_rooms=2) 89 | state = (5,0) 90 | action = (-1,0) 91 | actual = mdp.runs_into_wall(state, action) 92 | expected = True 93 | self.assertEquals(actual, expected) 94 | 95 | def test_wall_cross_y_down_to_up(self): 96 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 97 | state = (0,4) 98 | action = (0,1) 99 | actual = mdp.runs_into_wall(state, action) 100 | expected = True 101 | self.assertEquals(actual, expected) 102 | 103 | def test_wall_cross_y_up_to_down(self): 104 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 105 | state = (0,5) 106 | action = (0,-1) 107 | actual = mdp.runs_into_wall(state, action) 108 | expected = True 109 | self.assertEquals(actual, expected) 110 | 111 | def test_wall_cross_x_right_to_left_false(self): 112 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 113 | state = (3,0) 114 | action = (1,0) 115 | actual = mdp.runs_into_wall(state, action) 116 | expected = False 117 | self.assertEquals(actual, expected) 118 | 119 | def test_wall_cross_x_left_to_right_false(self): 120 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 121 | state = (6,0) 122 | action = (-1,0) 123 | actual = mdp.runs_into_wall(state, action) 124 | expected = False 125 | self.assertEquals(actual, expected) 126 | 127 | def test_wall_cross_y_down_to_up_false(self): 128 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 129 | state = (0,3) 130 | action = (0,1) 131 | actual = mdp.runs_into_wall(state, action) 132 | expected = False 133 | self.assertEquals(actual, expected) 134 | 135 | def test_wall_cross_y_up_to_down_false(self): 136 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 137 | state = (0,6) 138 | action = (0,-1) 139 | actual = mdp.runs_into_wall(state, action) 140 | expected = False 141 | self.assertEquals(actual, expected) 142 | 143 | def test_wall_cross_through_doorway_x_right_to_left(self): 144 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 145 | state = (4,2) 146 | action = (1,0) 147 | actual = mdp.runs_into_wall(state, action) 148 | expected = False 149 | self.assertEquals(actual, expected) 150 | 151 | def test_wall_cross_through_doorway_x_left_to_right(self): 152 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 153 | state = (5,2) 154 | action = (-1,0) 155 | actual = mdp.runs_into_wall(state, action) 156 | expected = False 157 | self.assertEquals(actual, expected) 158 | 159 | def test_wall_cross_through_doorway_y_up(self): 160 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 161 | state = (2,4) 162 | action = (0,1) 163 | actual = mdp.runs_into_wall(state, action) 164 | expected = False 165 | self.assertEquals(actual, expected) 166 | 167 | def test_wall_cross_through_doorway_y_down(self): 168 | mdp = mdps.MazeMDP(room_size=5, num_rooms=2) 169 | state = (2,5) 170 | action = (0,-1) 171 | actual = mdp.runs_into_wall(state, action) 172 | expected = False 173 | self.assertEquals(actual, expected) 174 | 175 | """ runs_into_wall tests on larger mazes """ 176 | def test_leave_maze_negative_x_larger(self): 177 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 178 | state = (0,0) 179 | action = (-1,0) 180 | actual = mdp.runs_into_wall(state, action) 181 | expected = True 182 | self.assertEquals(actual, expected) 183 | 184 | def test_leave_maze_negative_y_larger(self): 185 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 186 | state = (0,0) 187 | action = (0,-1) 188 | actual = mdp.runs_into_wall(state, action) 189 | expected = True 190 | self.assertEquals(actual, expected) 191 | 192 | def test_leave_maze_negative_x_false_larger(self): 193 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 194 | state = (1,0) 195 | action = 
(-1,0) 196 | actual = mdp.runs_into_wall(state, action) 197 | expected = False 198 | self.assertEquals(actual, expected) 199 | 200 | def test_leave_maze_positive_x_false_larger(self): 201 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 202 | state = (3,0) 203 | action = (1,0) 204 | actual = mdp.runs_into_wall(state, action) 205 | expected = False 206 | self.assertEquals(actual, expected) 207 | 208 | def test_leave_maze_negative_y_false_larger(self): 209 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 210 | state = (0,1) 211 | action = (0,-1) 212 | actual = mdp.runs_into_wall(state, action) 213 | expected = False 214 | self.assertEquals(actual, expected) 215 | 216 | def test_leave_maze_positive_y_false_larger(self): 217 | mdp = mdps.MazeMDP(room_size=5, num_rooms=5) 218 | state = (0,3) 219 | action = (0,1) 220 | actual = mdp.runs_into_wall(state, action) 221 | expected = False 222 | self.assertEquals(actual, expected) 223 | 224 | def test_wall_cross_through_doorway_x_right_to_left_larger(self): 225 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 226 | state = (2,4) 227 | action = (1,0) 228 | actual = mdp.runs_into_wall(state, action) 229 | expected = False 230 | self.assertEquals(actual, expected) 231 | 232 | def test_wall_cross_through_doorway_x_left_to_right_larger(self): 233 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 234 | state = (3,4) 235 | action = (-1,0) 236 | actual = mdp.runs_into_wall(state, action) 237 | expected = False 238 | self.assertEquals(actual, expected) 239 | 240 | def test_wall_cross_through_doorway_y_up_larger(self): 241 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 242 | state = (4,2) 243 | action = (0,1) 244 | actual = mdp.runs_into_wall(state, action) 245 | expected = False 246 | self.assertEquals(actual, expected) 247 | 248 | def test_wall_cross_through_doorway_y_down_larger(self): 249 | mdp = mdps.MazeMDP(room_size=3, num_rooms=2) 250 | state = (4,3) 251 | action = (0,-1) 252 | actual = mdp.runs_into_wall(state, action) 253 | expected = False 254 | self.assertEquals(actual, expected) 255 | 256 | """ runs_into_wall tests on different room sizes """ 257 | 258 | def test_wall_cross_x_right_to_left_larger_room_size(self): 259 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 260 | state = (6,0) 261 | action = (1,0) 262 | actual = mdp.runs_into_wall(state, action) 263 | expected = True 264 | self.assertEquals(actual, expected) 265 | 266 | def test_wall_cross_x_left_to_right_larger_room_size(self): 267 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 268 | state = (7,0) 269 | action = (-1,0) 270 | actual = mdp.runs_into_wall(state, action) 271 | expected = True 272 | self.assertEquals(actual, expected) 273 | 274 | def test_wall_cross_y_down_to_up_larger_room_size(self): 275 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 276 | state = (0,6) 277 | action = (0,1) 278 | actual = mdp.runs_into_wall(state, action) 279 | expected = True 280 | self.assertEquals(actual, expected) 281 | 282 | def test_wall_cross_y_up_to_down_larger_room_size(self): 283 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 284 | state = (0,7) 285 | action = (0,-1) 286 | actual = mdp.runs_into_wall(state, action) 287 | expected = True 288 | self.assertEquals(actual, expected) 289 | 290 | def test_wall_cross_x_right_to_left_false_larger_room_size(self): 291 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 292 | state = (3,0) 293 | action = (1,0) 294 | actual = mdp.runs_into_wall(state, action) 295 | expected = False 296 | self.assertEquals(actual, expected) 297 | 298 | def 
test_wall_cross_x_left_to_right_false_larger_room_size(self): 299 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 300 | state = (6,0) 301 | action = (-1,0) 302 | actual = mdp.runs_into_wall(state, action) 303 | expected = False 304 | self.assertEquals(actual, expected) 305 | 306 | def test_wall_cross_y_down_to_up_false_larger_room_size(self): 307 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 308 | state = (0,3) 309 | action = (0,1) 310 | actual = mdp.runs_into_wall(state, action) 311 | expected = False 312 | self.assertEquals(actual, expected) 313 | 314 | def test_wall_cross_y_up_to_down_false_larger_room_size(self): 315 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 316 | state = (0,6) 317 | action = (0,-1) 318 | actual = mdp.runs_into_wall(state, action) 319 | expected = False 320 | self.assertEquals(actual, expected) 321 | 322 | def test_wall_cross_through_doorway_x_right_to_left_larger_room_size(self): 323 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 324 | state = (6,3) 325 | action = (1,0) 326 | actual = mdp.runs_into_wall(state, action) 327 | expected = False 328 | self.assertEquals(actual, expected) 329 | 330 | def test_wall_cross_through_doorway_x_left_to_right_larger_room_size(self): 331 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 332 | state = (7,3) 333 | action = (-1,0) 334 | actual = mdp.runs_into_wall(state, action) 335 | expected = False 336 | self.assertEquals(actual, expected) 337 | 338 | def test_wall_cross_through_doorway_y_up_larger_room_size(self): 339 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 340 | state = (3,6) 341 | action = (0,1) 342 | actual = mdp.runs_into_wall(state, action) 343 | expected = False 344 | self.assertEquals(actual, expected) 345 | 346 | def test_wall_cross_through_doorway_y_down_larger_room_size(self): 347 | mdp = mdps.MazeMDP(room_size=7, num_rooms=2) 348 | state = (3,7) 349 | action = (0,-1) 350 | actual = mdp.runs_into_wall(state, action) 351 | expected = False 352 | self.assertEquals(actual, expected) 353 | 354 | def test_corner_movement_up(self): 355 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 356 | state = (4,4) 357 | action = (0,1) 358 | actual = mdp.runs_into_wall(state, action) 359 | expected = True 360 | self.assertEquals(actual, expected) 361 | 362 | def test_corner_movement_right(self): 363 | mdp = mdps.MazeMDP(room_size=5, num_rooms=1) 364 | state = (4,4) 365 | action = (1,0) 366 | actual = mdp.runs_into_wall(state, action) 367 | expected = True 368 | self.assertEquals(actual, expected) 369 | 370 | 371 | 372 | if __name__ == '__main__': 373 | unittest.main() 374 | -------------------------------------------------------------------------------- /tests/test_neural_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import shutil 4 | import sys 5 | import unittest 6 | 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 8 | 9 | import agent 10 | import experiment 11 | import mdps 12 | import policy 13 | import qnetwork 14 | import replay_memory 15 | import state_adapters 16 | 17 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless this file is called directly") 18 | class TestNeuralAgent(unittest.TestCase): 19 | 20 | def test_agent(self): 21 | room_size = 5 22 | mdp = mdps.MazeMDP(room_size, 1) 23 | mdp.compute_states() 24 | mdp.EXIT_REWARD = 1 25 | mdp.MOVE_REWARD = -0.1 26 | discount = mdp.get_discount() 27 | num_actions = len(mdp.get_actions(None)) 28 | network = 
qnetwork.QNetwork(input_shape=2 * room_size, batch_size=1, num_actions=4, num_hidden=10, discount=discount, learning_rate=1e-3, update_rule='sgd', freeze_interval=10000, rng=None) 29 | p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000) 30 | rm = replay_memory.ReplayMemory(1) 31 | log = logger.NeuralLogger(agent_name='QNetwork') 32 | adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 33 | a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm, logger=log, state_adapter=adapter) 34 | num_epochs = 2 35 | epoch_length = 10 36 | test_epoch_length = 0 37 | max_steps = 10 38 | run_tests = False 39 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, value_logging=False) 40 | e.run() 41 | 42 | if __name__ == '__main__': 43 | unittest.main() -------------------------------------------------------------------------------- /tests/test_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 7 | 8 | import policy 9 | 10 | class TestEpsilonGreedy(unittest.TestCase): 11 | 12 | def test_deterministic_action_selection(self): 13 | p = policy.EpsilonGreedy(num_actions=4, exploration_prob=0, min_exploration_prob=0, actions_until_min=1) 14 | q_values = [1,2,3,4] 15 | actual = p.choose_action(q_values) 16 | expected = 3 17 | self.assertEquals(actual, expected) 18 | 19 | def test_reduction_decreses_exploration_prob(self): 20 | p = policy.EpsilonGreedy(num_actions=4, exploration_prob=1, min_exploration_prob=0, actions_until_min=2) 21 | q_values = [1,2,3,4] 22 | p.choose_action(q_values) 23 | self.assertEquals(p.exploration_prob, 0.5) 24 | 25 | def test_reduction_decreses_exploration_prob_completely(self): 26 | p = policy.EpsilonGreedy(num_actions=4, exploration_prob=1, min_exploration_prob=0, actions_until_min=2) 27 | q_values = [1,2,3,4] 28 | p.choose_action(q_values) 29 | p.choose_action(q_values) 30 | self.assertEquals(p.exploration_prob, 0) 31 | 32 | class TestSoftmax(unittest.TestCase): 33 | 34 | def test_deterministic_action_selection(self): 35 | p = policy.Softmax(num_actions=4, tau=1e-1, min_tau=0, actions_until_min=100) 36 | q_values = np.array([1,2,3,4]) 37 | actual = p.choose_action(q_values) 38 | expected = 3 39 | self.assertEquals(actual, expected) 40 | 41 | def test_stochastic_action_selection(self): 42 | p = policy.Softmax(num_actions=4, tau=1e1, min_tau=0, actions_until_min=1000) 43 | q_values = np.array([1,2,3,4]) 44 | actions = [] 45 | for i in range(1000): 46 | actions.append(p.choose_action(q_values)) 47 | actions = set(actions) 48 | expected = 4 49 | self.assertEquals(len(actions), expected) 50 | 51 | def test_reduction_decreses_exploration_prob(self): 52 | p = policy.Softmax(num_actions=4, tau=1, min_tau=0, actions_until_min=2) 53 | q_values = np.array([1,2,3,4]) 54 | p.choose_action(q_values) 55 | self.assertEquals(p.tau, 0.5) 56 | 57 | def test_reduction_decreses_exploration_prob_completely(self): 58 | p = policy.Softmax(num_actions=4, tau=1, min_tau=0, actions_until_min=2) 59 | q_values = np.array([1,2,3,4]) 60 | p.choose_action(q_values) 61 | p.choose_action(q_values) 62 | self.assertEquals(p.tau, 0) 63 | 64 | if __name__ == '__main__': 65 | unittest.main() -------------------------------------------------------------------------------- /tests/test_qnetwork.py: 
-------------------------------------------------------------------------------- 1 | import collections 2 | import lasagne 3 | import numpy as np 4 | import os 5 | import random 6 | import shutil 7 | import sys 8 | import theano 9 | import theano.tensor as T 10 | import unittest 11 | 12 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 13 | 14 | import agent 15 | import aws_s3_utility 16 | import experiment 17 | import file_utils 18 | import logger 19 | import mdps 20 | import policy 21 | import qnetwork 22 | import replay_memory 23 | import state_adapters 24 | 25 | class TestQNetworkConstruction(unittest.TestCase): 26 | 27 | def test_qnetwork_constructor_sgd(self): 28 | input_shape = 2 29 | batch_size = 100 30 | num_actions = 4 31 | num_hidden = 10 32 | discount = 1 33 | learning_rate = 1e-2 34 | update_rule = 'sgd' 35 | freeze_interval = 1000 36 | regularization = 0 37 | rng = None 38 | num_hidden_layers = 1 39 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 40 | 41 | def test_qnetwork_constructor_rmsprop(self): 42 | input_shape = 2 43 | batch_size = 100 44 | num_actions = 4 45 | num_hidden = 10 46 | discount = 1 47 | learning_rate = 1e-2 48 | update_rule = 'rmsprop' 49 | freeze_interval = 1000 50 | regularization = 0 51 | rng = None 52 | num_hidden_layers = 1 53 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 54 | 55 | def test_qnetwork_constructor_adam(self): 56 | input_shape = 2 57 | batch_size = 100 58 | num_actions = 4 59 | num_hidden = 10 60 | discount = 1 61 | learning_rate = 1e-2 62 | update_rule = 'adam' 63 | freeze_interval = 1000 64 | regularization = 0 65 | rng = None 66 | num_hidden_layers = 1 67 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 68 | 69 | class TestQNetworkGetQValues(unittest.TestCase): 70 | 71 | def test_that_q_values_are_retrievable(self): 72 | input_shape = 2 73 | batch_size = 100 74 | num_actions = 4 75 | num_hidden = 10 76 | discount = 1 77 | learning_rate = 1e-2 78 | update_rule = 'sgd' 79 | freeze_interval = 1000 80 | regularization = 0 81 | rng = None 82 | num_hidden_layers = 1 83 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 84 | 85 | state = np.array([1,1]) 86 | q_values = network.get_q_values(state) 87 | actual = np.shape(q_values) 88 | expected = (num_actions,) 89 | self.assertEquals(actual, expected) 90 | 91 | def test_that_initial_values_are_all_similar(self): 92 | input_shape = 2 93 | batch_size = 100 94 | num_actions = 4 95 | num_hidden = 10 96 | discount = 1 97 | learning_rate = 1e-2 98 | update_rule = 'sgd' 99 | freeze_interval = 1000 100 | regularization = 0 101 | rng = None 102 | num_hidden_layers = 1 103 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 104 | 105 | states = [[1,1],[-1,-1],[-1,1],[1,-1]] 106 | for state in states: 107 | q_values = network.get_q_values(state) 108 | self.assertTrue(max(abs(q_values)) < 2) 109 | 110 | class 
TestQNetworkGetParams(unittest.TestCase): 111 | 112 | def test_params_retrievable(self): 113 | input_shape = 2 114 | batch_size = 100 115 | num_actions = 4 116 | num_hidden = 10 117 | discount = 1 118 | learning_rate = 1e-2 119 | update_rule = 'sgd' 120 | freeze_interval = 1000 121 | regularization = 0 122 | rng = None 123 | num_hidden_layers = 1 124 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 125 | 126 | params = network.get_params() 127 | self.assertTrue(params is not None) 128 | 129 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless this file is called directly") 130 | class TestQNetworkTrain(unittest.TestCase): 131 | 132 | def test_loss_with_zero_reward_same_next_state_is_zero(self): 133 | # loss is still not zero because the selected action might not be the maximum value action 134 | input_shape = 2 135 | batch_size = 1 136 | num_actions = 4 137 | num_hidden = 10 138 | discount = 1 139 | learning_rate = 1e-2 140 | update_rule = 'sgd' 141 | freeze_interval = 1000 142 | regularization = 0 143 | rng = None 144 | num_hidden_layers = 1 145 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 146 | 147 | states = np.zeros((1,2)) 148 | actions = np.zeros((1,1), dtype='int32') 149 | rewards = np.zeros((1,1)) 150 | next_states = np.zeros((1,2)) 151 | terminals = np.zeros((1,1), dtype='int32') 152 | 153 | loss = network.train(states, actions, rewards, next_states, terminals) 154 | actual = loss 155 | expected = 2 156 | self.assertTrue(actual < expected) 157 | 158 | def test_loss_with_nonzero_reward_same_next_state_is_nonzero(self): 159 | input_shape = 2 160 | batch_size = 1 161 | num_actions = 4 162 | num_hidden = 10 163 | discount = 1 164 | learning_rate = 1e-2 165 | update_rule = 'sgd' 166 | freeze_interval = 1000 167 | regularization = 0 168 | rng = None 169 | num_hidden_layers = 1 170 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 171 | 172 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 173 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 174 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 175 | 176 | states = np.ones((1,2), dtype=float) 177 | actions = np.zeros((1,1), dtype='int32') 178 | rewards = np.ones((1,1), dtype='int32') 179 | next_states = np.ones((1,2), dtype=float) 180 | terminals = np.zeros((1,1), dtype='int32') 181 | 182 | loss = network.train(states, actions, rewards, next_states, terminals) 183 | actual = loss 184 | expected = 0.5 185 | self.assertEquals(actual, expected) 186 | 187 | def test_overfit_simple_artificial_dataset(self): 188 | input_shape = 1 189 | batch_size = 10 190 | num_actions = 2 191 | num_hidden = 2 192 | discount = 1 193 | learning_rate = 1 194 | update_rule = 'adam' 195 | freeze_interval = 100 196 | regularization = 0 197 | rng = None 198 | num_hidden_layers = 1 199 | network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers, num_actions, num_hidden, discount, learning_rate, regularization, update_rule, freeze_interval, rng) 200 | 201 | rm = replay_memory.ReplayMemory(batch_size) 202 | # state 0 to state 1 reward +1 203 | for idx in range(20): 204 | state = 
np.array([0]) 205 | next_state = np.array([1]) 206 | action = 1 207 | reward = 1 208 | terminal = 1 209 | rm.store((state, action, reward, next_state, terminal)) 210 | 211 | # state 0 to state 0 reward -1 212 | for idx in range(20): 213 | switch = random.randint(0,1) 214 | state = np.array([0]) 215 | next_state = np.array([0]) 216 | action = 0 217 | reward = -1 218 | terminal = 0 219 | rm.store((state, action, reward, next_state, terminal)) 220 | 221 | print rm.terminal_count 222 | print_data = False 223 | l = logger.Logger('test') 224 | counter = 0 225 | while True: 226 | counter += 1 227 | states, actions, rewards, next_states, terminals = rm.sample_batch() 228 | loss = network.train(states, actions, rewards, next_states, terminals) 229 | l.log_loss(loss) 230 | 231 | 232 | if counter % 100 == 0: 233 | l.log_epoch(counter) 234 | Q = {} 235 | s0 = network.get_q_values(np.array([0])) 236 | Q['s0_a0'] = s0[0] 237 | Q['s0_a1'] = s0[1] 238 | s1 = network.get_q_values(np.array([1])) 239 | Q['s1_a0'] = s1[0] 240 | Q['s1_a1'] = s1[1] 241 | 242 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless this file is called directly") 243 | class TestQNetworkFullOperationFlattnedState(unittest.TestCase): 244 | 245 | def test_qnetwork_solves_small_mdp(self): 246 | 247 | 248 | def run(learning_rate, freeze_interval, num_hidden, reg): 249 | room_size = 5 250 | num_rooms = 2 251 | mdp = mdps.MazeMDP(room_size, num_rooms) 252 | mdp.compute_states() 253 | mdp.EXIT_REWARD = 1 254 | mdp.MOVE_REWARD = -0.01 255 | discount = 1 256 | num_actions = len(mdp.get_actions(None)) 257 | batch_size = 100 258 | print 'building network...' 259 | network = qnetwork.QNetwork(input_shape=2 * room_size + num_rooms ** 2, batch_size=batch_size, num_hidden_layers=2, num_actions=4, num_hidden=num_hidden, discount=discount, learning_rate=learning_rate, regularization=reg, update_rule='adam', freeze_interval=freeze_interval, rng=None) 260 | num_epochs = 50 261 | epoch_length = 2 262 | test_epoch_length = 0 263 | max_steps = 4 * (room_size * num_rooms) ** 2 264 | epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5 265 | print 'building policy...' 266 | p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay) 267 | print 'building memory...' 268 | rm = replay_memory.ReplayMemory(batch_size, capacity=50000) 269 | print 'building logger...' 270 | log = logger.NeuralLogger(agent_name='QNetwork') 271 | print 'building state adapter...' 272 | adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=room_size, num_rooms=num_rooms) 273 | # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms) 274 | # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms) 275 | # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms) 276 | # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 277 | print 'building agent...' 
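# The input_shape used for the network above works out to
#   2 * room_size + num_rooms ** 2 = 2 * 5 + 2 ** 2 = 14,
# which should match the length of the adapted state (assuming
# CoordinatesToRowColRoomAdapter one-hot encodes the row within a room, the
# column within a room, and the room index over the num_rooms ** 2 rooms --
# an assumption about the adapter, not taken from its implementation).
# The commented-out adapters above would call for different input_shape values.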
278 | a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm, log=log, state_adapter=adapter) 279 | run_tests = False 280 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length, max_steps, run_tests, value_logging=True) 281 | e.run() 282 | 283 | ak = file_utils.load_key('../access_key.key') 284 | sk = file_utils.load_key('../secret_key.key') 285 | bucket = 'hierarchical' 286 | try: 287 | aws_util = aws_s3_utility.S3Utility(ak, sk, bucket) 288 | aws_util.upload_directory(e.agent.logger.log_dir) 289 | except Exception as e: 290 | print 'error uploading to s3: {}'.format(e) 291 | 292 | for idx in range(2): 293 | lr = random.choice([.007, .006, .005]) # learning rate 294 | fi = random.choice([200, 300, 400, 500, 600, 700, 800]) # freeze interval 295 | nh = random.choice([4]) # num hidden 296 | reg = random.choice([5e-4]) # regularization 297 | print 'run number: {}'.format(idx) 298 | print lr, fi, nh, reg 299 | run(lr, fi, nh, reg) 300 | 301 | if __name__ == '__main__': 302 | unittest.main() 303 | -------------------------------------------------------------------------------- /tests/test_recurrent_qnetwork.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | import numpy as np 3 | import os 4 | import random 5 | import shutil 6 | import sys 7 | import theano 8 | import theano.tensor as T 9 | import unittest 10 | 11 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 12 | 13 | import agent 14 | import aws_s3_utility 15 | import experiment 16 | import file_utils 17 | import learning_utils 18 | import logger 19 | import mdps 20 | import policy 21 | import recurrent_qnetwork 22 | import replay_memory 23 | import state_adapters 24 | 25 | class TestRecurrentQNetworkConstruction(unittest.TestCase): 26 | 27 | def test_qnetwork_constructor_sgd(self): 28 | input_shape = 2 29 | batch_size = 10 30 | sequence_length = 1 31 | num_actions = 4 32 | num_hidden = 10 33 | discount = 1 34 | learning_rate = 1e-2 35 | update_rule = 'sgd' 36 | freeze_interval = 1000 37 | regularization = 1e-4 38 | network_type = 'single_layer_rnn' 39 | rng = None 40 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 41 | sequence_length, batch_size, num_actions, num_hidden, 42 | discount, learning_rate, regularization, update_rule, 43 | freeze_interval, network_type, rng) 44 | 45 | class TestRecurrentQNetworkTrain(unittest.TestCase): 46 | 47 | def test_loss_with_zero_reward_same_next_state_is_zero(self): 48 | input_shape = 2 49 | batch_size = 1 50 | sequence_length = 1 51 | num_actions = 4 52 | num_hidden = 5 53 | discount = 1 54 | learning_rate = 1e-2 55 | update_rule = 'sgd' 56 | freeze_interval = 1000 57 | regularization = 1e-4 58 | network_type = 'single_layer_rnn' 59 | rng = None 60 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 61 | sequence_length, batch_size, num_actions, num_hidden, 62 | discount, learning_rate, regularization, update_rule, 63 | freeze_interval, network_type, rng) 64 | 65 | states = np.zeros((1,1,2)) 66 | actions = np.zeros((1,1), dtype='int32') 67 | rewards = np.zeros((1,1)) 68 | next_states = np.zeros((1,1,2)) 69 | terminals = np.zeros((1,1), dtype='int32') 70 | 71 | loss = network.train(states, actions, rewards, next_states, terminals) 72 | actual = loss 73 | expected = 2 74 | self.assertTrue(actual < expected) 75 | 76 | def test_loss_with_nonzero_reward_same_next_state_is_nonzero(self): 77 | input_shape = 2 78 | batch_size = 1 79 | 
sequence_length = 1 80 | num_actions = 4 81 | num_hidden = 10 82 | discount = 1 83 | learning_rate = 1e-2 84 | update_rule = 'sgd' 85 | freeze_interval = 1000 86 | regularization = 1e-4 87 | network_type = 'single_layer_rnn' 88 | rng = None 89 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 90 | sequence_length, batch_size, num_actions, num_hidden, 91 | discount, learning_rate, regularization, update_rule, 92 | freeze_interval, network_type, rng) 93 | 94 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 95 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 96 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 97 | 98 | states = np.ones((1,1,2), dtype=float) 99 | actions = np.zeros((1,1), dtype='int32') 100 | rewards = np.ones((1,1), dtype='int32') 101 | next_states = np.ones((1,1,2), dtype=float) 102 | terminals = np.zeros((1,1), dtype='int32') 103 | 104 | loss = network.train(states, actions, rewards, next_states, terminals) 105 | actual = loss 106 | expected = 0.5 107 | self.assertEquals(actual, expected) 108 | 109 | def test_loss_with_nonzero_reward_same_next_state_is_nonzero_large_batch_size(self): 110 | input_shape = 2 111 | batch_size = 10 112 | sequence_length = 1 113 | num_actions = 4 114 | num_hidden = 10 115 | discount = 1 116 | learning_rate = 1e-2 117 | update_rule = 'sgd' 118 | freeze_interval = 1000 119 | regularization = 1e-4 120 | network_type = 'single_layer_rnn' 121 | rng = None 122 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 123 | sequence_length, batch_size, num_actions, num_hidden, 124 | discount, learning_rate, regularization, update_rule, 125 | freeze_interval, network_type, rng) 126 | 127 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 128 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 129 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 130 | 131 | states = np.ones((10,1,2), dtype=float) 132 | actions = np.zeros((10,1), dtype='int32') 133 | rewards = np.ones((10,1), dtype='int32') 134 | next_states = np.ones((10,1,2), dtype=float) 135 | terminals = np.zeros((10,1), dtype='int32') 136 | 137 | loss = network.train(states, actions, rewards, next_states, terminals) 138 | actual = loss 139 | expected = 5.0 140 | self.assertEquals(actual, expected) 141 | 142 | def test_loss_not_impacted_by_hid_init(self): 143 | input_shape = 2 144 | batch_size = 10 145 | sequence_length = 1 146 | num_actions = 4 147 | num_hidden = 10 148 | discount = 1 149 | learning_rate = 0 150 | update_rule = 'sgd' 151 | freeze_interval = 1000 152 | regularization = 1e-4 153 | network_type = 'single_layer_rnn' 154 | rng = None 155 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 156 | sequence_length, batch_size, num_actions, num_hidden, 157 | discount, learning_rate, regularization, update_rule, 158 | freeze_interval, network_type, rng) 159 | 160 | values = np.array(lasagne.layers.helper.get_all_param_values(network.l_out)) * 0 161 | lasagne.layers.helper.set_all_param_values(network.l_out, values) 162 | lasagne.layers.helper.set_all_param_values(network.next_l_out, values) 163 | 164 | states = np.ones((10,1,2), dtype=float) 165 | actions = np.zeros((10,1), dtype='int32') 166 | rewards = np.ones((10,1), dtype='int32') 167 | next_states = np.ones((10,1,2), dtype=float) 168 | terminals = np.zeros((10,1), dtype='int32') 169 | 170 | loss_before_q_values = network.train(states, actions, rewards, next_states, 
terminals) 171 | 172 | state = np.ones((1,1,2), dtype=float) 173 | q_values_without_hid_init = network.get_q_values(state).tolist() 174 | 175 | loss_after_q_values = network.train(states, actions, rewards, next_states, terminals) 176 | 177 | self.assertEquals(loss_before_q_values, loss_after_q_values) 178 | 179 | class TestRecurrentQNetworkGetQValues(unittest.TestCase): 180 | 181 | def test_get_q_values_hid_init_impacts_q_values(self): 182 | input_shape = 2 183 | batch_size = 10 184 | sequence_length = 1 185 | num_actions = 4 186 | num_hidden = 10 187 | discount = 1 188 | learning_rate = 1e-2 189 | update_rule = 'sgd' 190 | freeze_interval = 1000 191 | regularization = 1e-4 192 | network_type = 'single_layer_rnn' 193 | rng = None 194 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 195 | sequence_length, batch_size, num_actions, num_hidden, 196 | discount, learning_rate, regularization, update_rule, 197 | freeze_interval, network_type, rng) 198 | 199 | state = np.ones((1,1,2), dtype=float) 200 | q_values_without_hid_init = network.get_q_values(state).tolist() 201 | q_values_with_hid_init = network.get_q_values(state).tolist() 202 | self.assertNotEquals(q_values_without_hid_init, q_values_with_hid_init) 203 | 204 | def test_get_q_values_hid_init_does_not_impact_q_values(self): 205 | input_shape = 2 206 | batch_size = 10 207 | sequence_length = 1 208 | num_actions = 4 209 | num_hidden = 10 210 | discount = 1 211 | learning_rate = 1e-2 212 | update_rule = 'sgd' 213 | freeze_interval = 1000 214 | regularization = 1e-4 215 | network_type = 'single_layer_rnn' 216 | rng = None 217 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 218 | sequence_length, batch_size, num_actions, num_hidden, 219 | discount, learning_rate, regularization, update_rule, 220 | freeze_interval, network_type, rng) 221 | 222 | state = np.ones((1,1,2), dtype=float) 223 | network.finish_episode() 224 | q_values_without_hid_init = network.get_q_values(state).tolist() 225 | network.finish_episode() 226 | q_values_after_hid_init = network.get_q_values(state).tolist() 227 | self.assertEquals(q_values_without_hid_init, q_values_after_hid_init) 228 | 229 | def test_initial_q_values(self): 230 | # if just one of these is 1, (or two are 1) why does a pattern arise? 231 | input_shape = 20 232 | batch_size = 10 233 | sequence_length = 2 234 | num_actions = 4 235 | num_hidden = 4 236 | discount = 1 237 | learning_rate = 1e-2 238 | update_rule = 'adam' 239 | freeze_interval = 1000 240 | regularization = 1e-4 241 | network_type = 'single_layer_lstm' 242 | rng = None 243 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 244 | sequence_length, batch_size, num_actions, num_hidden, 245 | discount, learning_rate, regularization, update_rule, 246 | freeze_interval, network_type, rng) 247 | 248 | values = [] 249 | for r in range(10): 250 | row_values = [] 251 | for c in range(10): 252 | r_state = np.zeros(10, dtype=float) 253 | c_state = np.zeros(10, dtype=float) 254 | r_state[r] = 1 255 | c_state[c] = 1 256 | state = np.hstack((r_state, c_state)) 257 | max_q_value = max(network.get_q_values(state).tolist()) 258 | row_values.append(max_q_value) 259 | values.append(row_values) 260 | 261 | 262 | # why is cell init nonzero? 
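# One way to inspect the pattern asked about in test_initial_q_values above is
# to render the collected `values` grid as an image. This is a sketch only, not
# part of the original test; it assumes matplotlib is available, as it is
# elsewhere in these tests:
#
#   import matplotlib.pyplot as plt
#   plt.imshow(values, interpolation='nearest')
#   plt.colorbar()
#   plt.savefig('initial_max_q_values.png')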
263 | def test_for_zero_cell_init_with_len_1_sequences(self): 264 | input_shape = 2 265 | batch_size = 2 266 | sequence_length = 1 267 | num_actions = 2 268 | num_hidden = 1 269 | discount = 1 270 | learning_rate = 1 271 | update_rule = 'adam' 272 | freeze_interval = 1 273 | regularization = 1e-4 274 | network_type = 'single_layer_lstm' 275 | rng = None 276 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 277 | sequence_length, batch_size, num_actions, num_hidden, 278 | discount, learning_rate, regularization, update_rule, 279 | freeze_interval, network_type, rng) 280 | 281 | print 'BEFORE' 282 | params = lasagne.layers.get_all_params(network.l_out) 283 | param_values = lasagne.layers.get_all_param_values(network.l_out) 284 | for p, v in zip(params, param_values): 285 | print p 286 | print v 287 | print '\n' 288 | 289 | states = np.ones((batch_size, sequence_length, input_shape)) 290 | actions = np.ones((batch_size, 1), dtype='int32') 291 | rewards = np.ones((batch_size, 1)) 292 | next_states = np.ones((batch_size, sequence_length, input_shape)) 293 | terminals = np.zeros((batch_size, 1), dtype='int32') 294 | network.train(states, actions, rewards, next_states, terminals) 295 | 296 | print 'AFTER 1' 297 | params = lasagne.layers.get_all_params(network.l_out) 298 | param_values = lasagne.layers.get_all_param_values(network.l_out) 299 | for p, v in zip(params, param_values): 300 | print p 301 | print v 302 | print '\n' 303 | 304 | network.train(states, actions, rewards, next_states, terminals) 305 | 306 | print 'AFTER 2' 307 | params = lasagne.layers.get_all_params(network.l_out) 308 | param_values = lasagne.layers.get_all_param_values(network.l_out) 309 | for p, v in zip(params, param_values): 310 | print p 311 | print v 312 | print '\n' 313 | 314 | class TestRecurrentQNetworkSaturation(unittest.TestCase): 315 | 316 | def test_negative_saturation_rnn(self): 317 | input_shape = 2 318 | batch_size = 2 319 | sequence_length = 2 320 | num_actions = 2 321 | num_hidden = 1 322 | discount = 1 323 | learning_rate = 1 324 | update_rule = 'adam' 325 | freeze_interval = 1 326 | regularization = 1e-4 327 | network_type = 'single_layer_rnn' 328 | rng = None 329 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 330 | sequence_length, batch_size, num_actions, num_hidden, 331 | discount, learning_rate, regularization, update_rule, 332 | freeze_interval, network_type, rng) 333 | 334 | reward_multiplier = -10000 335 | 336 | for idx in range(100): 337 | states = np.ones((batch_size, sequence_length, input_shape)) 338 | 339 | action_multiplier = random.choice([0,1]) 340 | actions = np.ones((batch_size, 1), dtype='int32') * action_multiplier 341 | rewards = np.ones((batch_size, 1)) * reward_multiplier 342 | next_states = np.ones((batch_size, sequence_length, input_shape)) 343 | terminals = np.zeros((batch_size, 1), dtype='int32') 344 | network.train(states, actions, rewards, next_states, terminals) 345 | 346 | q_values = network.get_q_values(states[0]).tolist() 347 | print q_values 348 | self.assertTrue(sum(q_values) < 0) 349 | 350 | def test_negative_saturation_lstm(self): 351 | input_shape = 2 352 | batch_size = 2 353 | sequence_length = 2 354 | num_actions = 2 355 | num_hidden = 1 356 | discount = 1 357 | learning_rate = 1 358 | update_rule = 'adam' 359 | freeze_interval = 1 360 | regularization = 1e-4 361 | network_type = 'single_layer_lstm' 362 | rng = None 363 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 364 | sequence_length, batch_size, num_actions, num_hidden, 
365 | discount, learning_rate, regularization, update_rule, 366 | freeze_interval, network_type, rng) 367 | 368 | reward_multiplier = -10000 369 | 370 | for idx in range(100): 371 | states = np.ones((batch_size, sequence_length, input_shape)) 372 | 373 | action_multiplier = random.choice([0,1]) 374 | actions = np.ones((batch_size, 1), dtype='int32') * action_multiplier 375 | rewards = np.ones((batch_size, 1)) * reward_multiplier 376 | next_states = np.ones((batch_size, sequence_length, input_shape)) 377 | terminals = np.zeros((batch_size, 1), dtype='int32') 378 | network.train(states, actions, rewards, next_states, terminals) 379 | 380 | # all the params in the lstm layer become positive 381 | # all the params in linear output layer become negative 382 | # params = lasagne.layers.get_all_params(network.l_out) 383 | # param_values = lasagne.layers.get_all_param_values(network.l_out) 384 | # for p, v in zip(params, param_values): 385 | # print p 386 | # print v 387 | # print '\n' 388 | 389 | q_values = network.get_q_values(states[0]).tolist() 390 | self.assertTrue(sum(q_values) < 0) 391 | 392 | def test_positive_saturation_lstm(self): 393 | input_shape = 2 394 | batch_size = 2 395 | sequence_length = 2 396 | num_actions = 2 397 | num_hidden = 1 398 | discount = 1 399 | learning_rate = 1 400 | update_rule = 'adam' 401 | freeze_interval = 1 402 | regularization = 1e-4 403 | network_type = 'single_layer_lstm' 404 | rng = None 405 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape, 406 | sequence_length, batch_size, num_actions, num_hidden, 407 | discount, learning_rate, regularization, update_rule, 408 | freeze_interval, network_type, rng) 409 | 410 | reward_multiplier = 10000 411 | 412 | for idx in range(100): 413 | states = np.ones((batch_size, sequence_length, input_shape)) 414 | 415 | action_multiplier = random.choice([0,1]) 416 | actions = np.ones((batch_size, 1), dtype='int32') * action_multiplier 417 | rewards = np.ones((batch_size, 1)) * reward_multiplier 418 | next_states = np.ones((batch_size, sequence_length, input_shape)) 419 | terminals = np.zeros((batch_size, 1), dtype='int32') 420 | network.train(states, actions, rewards, next_states, terminals) 421 | 422 | # # everything becomes positive 423 | # params = lasagne.layers.get_all_params(network.l_out) 424 | # param_values = lasagne.layers.get_all_param_values(network.l_out) 425 | # for p, v in zip(params, param_values): 426 | # print p 427 | # print v 428 | # print '\n' 429 | 430 | q_values = network.get_q_values(states[0]).tolist() 431 | self.assertTrue(sum(q_values) > 0) 432 | 433 | @unittest.skipIf(__name__ != '__main__', "this test class does not run unless \ 434 | this file is called directly") 435 | class TestRecurrentQNetworkFullOperationFlattnedState(unittest.TestCase): 436 | 437 | def test_qnetwork_solves_small_mdp(self): 438 | 439 | def run(learning_rate, freeze_interval, num_hidden, reg, seq_len, eps, nt, update): 440 | room_size = 5 441 | num_rooms = 2 442 | input_shape = 2 * room_size 443 | print 'building mdp...' 444 | mdp = mdps.MazeMDP(room_size, num_rooms) 445 | mdp.compute_states() 446 | mdp.EXIT_REWARD = 1 447 | mdp.MOVE_REWARD = -0.01 448 | network_type = nt 449 | discount = 1 450 | sequence_length = seq_len 451 | num_actions = len(mdp.get_actions(None)) 452 | batch_size = 100 453 | update_rule = update 454 | print 'building network...' 
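# For intuition only: an encoding consistent with input_shape = 2 * room_size
# (this is an assumption about what the CoordinatesToSingleRoomRowColAdapter
# built below produces, not taken from its implementation) is a one-hot row and
# a one-hot column *within* the current room, with no explicit room indicator,
# so the recurrent network presumably has to rely on its history of the last
# sequence_length observations to tell the rooms apart.
def encode_single_room_row_col(state):
    # hypothetical helper, only illustrating the assumed encoding; unused below
    row, col = state[0] % room_size, state[1] % room_size
    encoding = np.zeros(2 * room_size)
    encoding[row] = 1.0
    encoding[room_size + col] = 1.0
    return encoding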
455 | network = recurrent_qnetwork.RecurrentQNetwork(input_shape=input_shape, 456 | sequence_length=sequence_length, batch_size=batch_size, 457 | num_actions=4, num_hidden=num_hidden, discount=discount, 458 | learning_rate=learning_rate, regularization=reg, 459 | update_rule=update_rule, freeze_interval=freeze_interval, 460 | network_type=network_type, rng=None) 461 | 462 | # take this many steps because (very loosely): 463 | # let l be the step length 464 | # let d be the difference in start and end locations 465 | # let N be the number of steps for the agent to travel a distance d 466 | # then N ~ (d/l)^2 // assuming this is a random walk 467 | # with l = 1, this gives d^2 in order to make it N steps away 468 | # the desired distance here is to walk along both dimensions of the maze 469 | # which is equal to two times the num_rooms * room_size 470 | # so squaring that gives a loose approximation to the number of 471 | # steps needed (discounting that this is actually a lattice (does it really matter?)) 472 | # (also discounting the walls) 473 | # see: http://mathworld.wolfram.com/RandomWalk2-Dimensional.html 474 | max_steps = (2 * room_size * num_rooms) ** 2 475 | num_epochs = 500 476 | epoch_length = 1 477 | test_epoch_length = 0 478 | epsilon_decay = (num_epochs * epoch_length * max_steps) / 4 479 | print 'building adapter...' 480 | adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size) 481 | print 'building policy...' 482 | p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay) 483 | print 'building replay memory...' 484 | # want to track at minimum the last 50 episodes 485 | capacity = max_steps * 50 486 | rm = replay_memory.SequenceReplayMemory(input_shape=input_shape, 487 | sequence_length=sequence_length, batch_size=batch_size, capacity=capacity) 488 | print 'building logger...' 489 | log = logger.NeuralLogger(agent_name=network_type) 490 | print 'building agent...' 491 | a = agent.RecurrentNeuralAgent(network=network, policy=p, 492 | replay_memory=rm, log=log, state_adapter=adapter) 493 | run_tests = False 494 | print 'building experiment...' 495 | e = experiment.Experiment(mdp, a, num_epochs, epoch_length, 496 | test_epoch_length, max_steps, run_tests, value_logging=True) 497 | print 'running experiment...' 
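# Plugging the values configured above into the random-walk estimate:
#   d = 2 * room_size * num_rooms = 2 * 5 * 2 = 20
#   max_steps = d ** 2 = 400
#   replay capacity = max_steps * 50 = 20,000 transitions
#   epsilon_decay = (num_epochs * epoch_length * max_steps) / 4
#                 = (500 * 1 * 400) / 4 = 50,000 actions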
498 | e.run() 499 | 500 | ak = file_utils.load_key('../access_key.key') 501 | sk = file_utils.load_key('../secret_key.key') 502 | bucket = 'hierarchical9' 503 | try: 504 | aws_util = aws_s3_utility.S3Utility(ak, sk, bucket) 505 | aws_util.upload_directory(e.agent.logger.log_dir) 506 | except Exception as e: 507 | print 'error uploading to s3: {}'.format(e) 508 | 509 | # net_types = ['single_layer_lstm', 'stacked_lstm', 'stacked_lstm_with_merge', 'hierarchical_stacked_lstm_with_merge'] 510 | net_types = ['connected_clockwork_lstm', 'disconnected_clockwork_lstm'] 511 | for idx in range(10): 512 | lr = random.choice([.01]) 513 | fi = random.choice([100]) 514 | nh = random.choice([64]) 515 | reg = random.choice([1e-4]) 516 | seq_len = random.choice([16]) 517 | eps = random.choice([.5]) 518 | nt = net_types[idx % len(net_types)] 519 | up = random.choice(['sgd+nesterov']) 520 | 521 | print 'run number: {}'.format(idx) 522 | print 'learning_rate: {} frozen_interval: \ 523 | {} num_hidden: {} reg: {} sequence_length: \ 524 | {} eps: {} network_type: {}'.format(lr,fi,nh, reg, seq_len, eps, nt) 525 | run(lr, fi, nh, reg, seq_len, eps, nt, up) 526 | 527 | if __name__ == '__main__': 528 | unittest.main() 529 | -------------------------------------------------------------------------------- /tests/test_replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import sys 4 | import unittest 5 | 6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts'))) 7 | 8 | import replay_memory 9 | 10 | class TestReplayMemorySampleBatch(unittest.TestCase): 11 | 12 | def test_minibatch_sample_shapes_1D_state(self): 13 | batch_size = 100 14 | state_shape = 2 15 | rm = replay_memory.ReplayMemory(batch_size) 16 | for idx in range(1000): 17 | state = np.ones(state_shape) 18 | action = 0 19 | reward = 0 20 | next_state = np.ones(state_shape) 21 | terminal = 0 22 | rm.store((state, action, reward, next_state, terminal)) 23 | 24 | states, actions, rewards, next_states, terminals = rm.sample_batch() 25 | self.assertEquals(states.shape, (batch_size, state_shape)) 26 | self.assertEquals(actions.shape, (batch_size, 1)) 27 | self.assertEquals(rewards.shape, (batch_size, 1)) 28 | self.assertEquals(next_states.shape, (batch_size, state_shape)) 29 | self.assertEquals(terminals.shape, (batch_size, 1)) 30 | 31 | def test_minibatch_sample_shapes_multidimensional_state(self): 32 | batch_size = 100 33 | state_shape = (1,2,2) 34 | rm = replay_memory.ReplayMemory(batch_size) 35 | for idx in range(1000): 36 | state = np.ones(state_shape) 37 | action = 0 38 | reward = 0 39 | next_state = np.ones(state_shape) 40 | terminal = 0 41 | rm.store((state, action, reward, next_state, terminal)) 42 | 43 | states, actions, rewards, next_states, terminals = rm.sample_batch() 44 | expected_states_shape = (batch_size,) + state_shape 45 | 46 | self.assertEquals(states.shape, expected_states_shape) 47 | self.assertEquals(actions.shape, (batch_size, 1)) 48 | self.assertEquals(rewards.shape, (batch_size, 1)) 49 | self.assertEquals(next_states.shape, expected_states_shape) 50 | self.assertEquals(terminals.shape, (batch_size, 1)) 51 | 52 | 53 | def test_minibatch_sample_shapes_multidimensional_state_broadcast_check(self): 54 | batch_size = 100 55 | state_shape = (1,2,1) 56 | rm = replay_memory.ReplayMemory(batch_size) 57 | for idx in range(1000): 58 | state = np.ones(state_shape) 59 | action = 0 60 | reward = 0 61 | next_state = 
np.ones(state_shape) 62 | terminal = 0 63 | rm.store((state, action, reward, next_state, terminal)) 64 | 65 | states, actions, rewards, next_states, terminals = rm.sample_batch() 66 | expected_states_shape = (batch_size,) + state_shape 67 | 68 | self.assertEquals(states.shape, expected_states_shape) 69 | self.assertEquals(actions.shape, (batch_size, 1)) 70 | self.assertEquals(rewards.shape, (batch_size, 1)) 71 | self.assertEquals(next_states.shape, expected_states_shape) 72 | self.assertEquals(terminals.shape, (batch_size, 1)) 73 | 74 | class TestSequenceReplayMemorySampleBatch(unittest.TestCase): 75 | 76 | def test_minibatch_sample_shapes_1D_state_sequence_length_1(self): 77 | batch_size = 100 78 | state_shape = 2 79 | sequence_length = 1 80 | capacity = 1000 81 | rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity) 82 | for idx in range(1000): 83 | state = np.ones(state_shape) 84 | action = 0 85 | reward = 0 86 | next_state = np.ones(state_shape) 87 | terminal = False 88 | rm.store(state, action, reward, terminal) 89 | 90 | states, actions, rewards, next_states, terminals = rm.sample_batch() 91 | self.assertEquals(states.shape, (batch_size, sequence_length, state_shape)) 92 | self.assertEquals(actions.shape, (batch_size, 1)) 93 | self.assertEquals(rewards.shape, (batch_size, 1)) 94 | self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape)) 95 | self.assertEquals(terminals.shape, (batch_size, 1)) 96 | 97 | def test_minibatch_sample_shapes_1D_state_sequence_length_2(self): 98 | batch_size = 10 99 | state_shape = 2 100 | sequence_length = 2 101 | capacity = 1000 102 | rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity) 103 | for idx in range(1000): 104 | state = np.ones(state_shape) 105 | action = 0 106 | reward = 0 107 | next_state = np.ones(state_shape) 108 | terminal = False 109 | rm.store(state, action, reward, terminal) 110 | 111 | states, actions, rewards, next_states, terminals = rm.sample_batch() 112 | self.assertEquals(states.shape, (batch_size, sequence_length, state_shape)) 113 | self.assertEquals(states.sum(), batch_size * sequence_length * state_shape) 114 | self.assertEquals(actions.shape, (batch_size, 1)) 115 | self.assertEquals(rewards.shape, (batch_size, 1)) 116 | self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape)) 117 | self.assertEquals(next_states.sum(), batch_size * sequence_length * state_shape) 118 | self.assertEquals(terminals.shape, (batch_size, 1)) 119 | 120 | def test_minibatch_sample_shapes_1D_state_terminal(self): 121 | batch_size = 200 122 | state_shape = 2 123 | sequence_length = 2 124 | capacity = 1000 125 | rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity) 126 | prev_state_terminal = False 127 | for idx in range(1, 1001): 128 | action = 0 129 | reward = 0 130 | state = np.ones(state_shape) * idx 131 | state = state if not prev_state_terminal else np.zeros(state_shape) 132 | prev_state_terminal = False if np.random.random() < .8 else True 133 | rm.store(state, action, reward, prev_state_terminal) 134 | 135 | states, actions, rewards, next_states, terminals = rm.sample_batch() 136 | for state, next_state, terminal in zip(states, next_states, terminals): 137 | if terminal: 138 | self.assertEquals(next_state.tolist()[-1], np.zeros(state_shape).tolist()) 139 | 140 | def test_minibatch_sample_shapes_multidimensional_state_sequence_length_1(self): 141 | batch_size = 100 142 | state_shape = 
74 | class TestSequenceReplayMemorySampleBatch(unittest.TestCase):
75 | 
76 |     def test_minibatch_sample_shapes_1D_state_sequence_length_1(self):
77 |         batch_size = 100
78 |         state_shape = 2
79 |         sequence_length = 1
80 |         capacity = 1000
81 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
82 |         for idx in range(1000):
83 |             state = np.ones(state_shape)
84 |             action = 0
85 |             reward = 0
86 |             next_state = np.ones(state_shape)
87 |             terminal = False
88 |             rm.store(state, action, reward, terminal)
89 | 
90 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
91 |         self.assertEquals(states.shape, (batch_size, sequence_length, state_shape))
92 |         self.assertEquals(actions.shape, (batch_size, 1))
93 |         self.assertEquals(rewards.shape, (batch_size, 1))
94 |         self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape))
95 |         self.assertEquals(terminals.shape, (batch_size, 1))
96 | 
97 |     def test_minibatch_sample_shapes_1D_state_sequence_length_2(self):
98 |         batch_size = 10
99 |         state_shape = 2
100 |         sequence_length = 2
101 |         capacity = 1000
102 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
103 |         for idx in range(1000):
104 |             state = np.ones(state_shape)
105 |             action = 0
106 |             reward = 0
107 |             next_state = np.ones(state_shape)
108 |             terminal = False
109 |             rm.store(state, action, reward, terminal)
110 | 
111 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
112 |         self.assertEquals(states.shape, (batch_size, sequence_length, state_shape))
113 |         self.assertEquals(states.sum(), batch_size * sequence_length * state_shape)
114 |         self.assertEquals(actions.shape, (batch_size, 1))
115 |         self.assertEquals(rewards.shape, (batch_size, 1))
116 |         self.assertEquals(next_states.shape, (batch_size, sequence_length, state_shape))
117 |         self.assertEquals(next_states.sum(), batch_size * sequence_length * state_shape)
118 |         self.assertEquals(terminals.shape, (batch_size, 1))
119 | 
120 |     def test_minibatch_sample_shapes_1D_state_terminal(self):
121 |         batch_size = 200
122 |         state_shape = 2
123 |         sequence_length = 2
124 |         capacity = 1000
125 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
126 |         prev_state_terminal = False
127 |         for idx in range(1, 1001):
128 |             action = 0
129 |             reward = 0
130 |             state = np.ones(state_shape) * idx
131 |             state = state if not prev_state_terminal else np.zeros(state_shape)
132 |             prev_state_terminal = False if np.random.random() < .8 else True
133 |             rm.store(state, action, reward, prev_state_terminal)
134 | 
135 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
136 |         for state, next_state, terminal in zip(states, next_states, terminals):
137 |             if terminal:
138 |                 self.assertEquals(next_state.tolist()[-1], np.zeros(state_shape).tolist())
139 | 
140 |     def test_minibatch_sample_shapes_multidimensional_state_sequence_length_1(self):
141 |         batch_size = 100
142 |         state_shape = (1,2,2)
143 |         sequence_length = 1
144 |         capacity = 1000
145 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
146 |         for idx in range(1000):
147 |             state = np.ones(state_shape)
148 |             action = 0
149 |             reward = 0
150 |             next_state = np.ones(state_shape)
151 |             terminal = False
152 |             rm.store(state, action, reward, terminal)
153 | 
154 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
155 |         expected_states_shape = (batch_size,) + (sequence_length,) + state_shape
156 | 
157 |         self.assertEquals(states.shape, expected_states_shape)
158 |         self.assertEquals(actions.shape, (batch_size, 1))
159 |         self.assertEquals(rewards.shape, (batch_size, 1))
160 |         self.assertEquals(next_states.shape, expected_states_shape)
161 |         self.assertEquals(terminals.shape, (batch_size, 1))
162 | 
163 |     def test_minibatch_sample_shapes_multidimensional_state_sequence_length_2(self):
164 |         batch_size = 100
165 |         state_shape = (1,2,2)
166 |         sequence_length = 2
167 |         capacity = 1000
168 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
169 |         for idx in range(1000):
170 |             state = np.ones(state_shape)
171 |             action = 0
172 |             reward = 0
173 |             next_state = np.ones(state_shape)
174 |             terminal = False
175 |             rm.store(state, action, reward, terminal)
176 | 
177 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
178 |         expected_states_shape = (batch_size,) + (sequence_length,) + state_shape
179 | 
180 |         self.assertEquals(states.shape, expected_states_shape)
181 |         self.assertEquals(actions.shape, (batch_size, 1))
182 |         self.assertEquals(rewards.shape, (batch_size, 1))
183 |         self.assertEquals(next_states.shape, expected_states_shape)
184 |         self.assertEquals(terminals.shape, (batch_size, 1))
185 | 
186 | 
187 |     def test_minibatch_sample_shapes_multidimensional_state_broadcast_check(self):
188 |         batch_size = 100
189 |         state_shape = (1,2,1)
190 |         sequence_length = 2
191 |         capacity = 1000
192 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
193 |         for idx in range(1000):
194 |             state = np.ones(state_shape)
195 |             action = 0
196 |             reward = 0
197 |             next_state = np.ones(state_shape)
198 |             terminal = False
199 |             rm.store(state, action, reward, terminal)
200 | 
201 |         states, actions, rewards, next_states, terminals = rm.sample_batch()
202 |         expected_states_shape = (batch_size,) + (sequence_length,) + state_shape
203 | 
204 |         self.assertEquals(states.shape, expected_states_shape)
205 |         self.assertEquals(actions.shape, (batch_size, 1))
206 |         self.assertEquals(rewards.shape, (batch_size, 1))
207 |         self.assertEquals(next_states.shape, expected_states_shape)
208 |         self.assertEquals(terminals.shape, (batch_size, 1))
209 | 
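The make_last_sequence tests below fix the behavior used at action-selection time: given the current state, build a window of the last sequence_length - 1 stored states followed by that state, zero out every position at or before the most recent terminal state in the window, and left-pad with zero states when too few steps have been stored. The following standalone sketch applies that rule to a plain list of (state, terminal) pairs; it is an editor's reconstruction from the expected values in the tests, not the circular-buffer implementation in scripts/replay_memory.py.

import numpy as np

def make_last_sequence_sketch(stored, new_state, sequence_length, state_shape):
    # stored: list of (state, terminal) pairs, oldest first
    n = sequence_length - 1
    window = stored[-n:] if n > 0 else []
    pad = n - len(window)
    # left-pad with zero states when fewer than n steps have been stored
    seq = [np.zeros(state_shape) for _ in range(pad)]
    seq += [np.asarray(s) for s, _ in window]
    seq.append(np.asarray(new_state))
    # zero everything at or before the most recent terminal in the window
    terminals = [t for _, t in window]
    if any(terminals):
        last_terminal = pad + max(i for i, t in enumerate(terminals) if t)
        for i in range(last_terminal + 1):
            seq[i] = np.zeros(state_shape)
    return np.array(seq)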
210 | class TestSequenceReplayMemoryMakeLastSequence(unittest.TestCase):
211 | 
212 |     def test_make_last_sequence_basic_operation(self):
213 |         batch_size = 10
214 |         state_shape = 2
215 |         sequence_length = 3
216 |         capacity = 30
217 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
218 | 
219 |         for idx in range(4):
220 |             state = np.ones(state_shape)
221 |             action = 0
222 |             reward = 0
223 |             next_state = np.ones(state_shape)
224 |             terminal = False
225 |             rm.store(state, action, reward, terminal)
226 | 
227 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
228 |         expected = [[1, 1], [1, 1], [0, 1]]
229 |         self.assertEquals(actual, expected)
230 | 
231 |     def test_make_last_sequence_preceding_state_terminal(self):
232 |         batch_size = 10
233 |         state_shape = 2
234 |         sequence_length = 3
235 |         capacity = 30
236 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
237 | 
238 |         state = np.ones(state_shape)
239 |         action = 0
240 |         reward = 0
241 |         next_state = np.ones(state_shape)
242 |         terminal = False
243 |         rm.store(state, action, reward, terminal)
244 |         terminal = True
245 |         rm.store(state, action, reward, terminal)
246 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
247 |         expected = [[0, 0], [0, 0], [0, 1]]
248 |         self.assertEquals(actual, expected)
249 | 
250 |     def test_make_last_sequence_some_previous_state_terminal_not_in_sequence(self):
251 |         batch_size = 10
252 |         state_shape = 2
253 |         sequence_length = 3
254 |         capacity = 30
255 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
256 | 
257 |         state = np.ones(state_shape)
258 |         action = 0
259 |         reward = 0
260 |         next_state = np.ones(state_shape)
261 |         terminal = True
262 |         rm.store(state, action, reward, terminal)
263 |         terminal = False
264 |         for idx in range(10):
265 |             rm.store(state, action, reward, terminal)
266 | 
267 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
268 |         expected = [[1, 1], [1, 1], [0, 1]]
269 |         self.assertEquals(actual, expected)
270 | 
271 |     def test_make_last_sequence_terminal_state_within_sequence_but_not_preceding(self):
272 |         batch_size = 10
273 |         state_shape = 2
274 |         sequence_length = 4
275 |         capacity = 30
276 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
277 | 
278 |         # tuple 1
279 |         state = np.ones(state_shape)
280 |         action = 0
281 |         reward = 0
282 |         next_state = np.ones(state_shape)
283 |         terminal = False
284 |         rm.store(state, action, reward, terminal)
285 | 
286 |         # tuple 2
287 |         terminal = True
288 |         rm.store(state, action, reward, terminal)
289 | 
290 |         # tuple 3
291 |         terminal = False
292 |         rm.store(state, action, reward, terminal)
293 | 
294 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
295 |         expected = [[0, 0], [0, 0], [1, 1], [0, 1]]
296 |         self.assertEquals(actual, expected)
297 | 
298 |     def test_make_last_sequence_terminal_state_first_in_made_sequence(self):
299 |         batch_size = 10
300 |         state_shape = 2
301 |         sequence_length = 4
302 |         capacity = 30
303 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
304 | 
305 |         # tuple 1
306 |         state = np.ones(state_shape)
307 |         action = 0
308 |         reward = 0
309 |         next_state = np.ones(state_shape)
310 |         terminal = True
311 |         rm.store(state, action, reward, terminal)
312 | 
313 |         # tuple 2
314 |         terminal = False
315 |         rm.store(state, action, reward, terminal)
316 | 
317 |         # tuple 3
318 |         terminal = False
319 |         rm.store(state, action, reward, terminal)
320 | 
321 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
322 |         expected = [[0, 0], [1, 1], [1, 1], [0, 1]]
323 |         self.assertEquals(actual, expected)
324 | 
325 |     def test_make_last_sequence_terminal_state_first_in_made_sequence_wrap(self):
326 |         batch_size = 10
327 |         state_shape = 2
328 |         sequence_length = 4
329 |         capacity = 30
330 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
331 | 
332 |         # tuple 1
333 |         state = np.ones(state_shape)
334 |         action = 0
335 |         reward = 0
336 |         next_state = np.ones(state_shape)
337 |         terminal = False
338 |         for i in range(capacity - 1):
339 |             rm.store(state, action, reward, terminal)
340 | 
341 | 
342 |         terminal = True
343 |         rm.store(state, action, reward, terminal)
344 | 
345 |         # tuple 2
346 |         terminal = False
347 |         rm.store(state, action, reward, terminal)
348 | 
349 |         # tuple 3
350 |         terminal = False
351 |         rm.store(state, action, reward, terminal)
352 | 
353 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
354 |         expected = [[0, 0], [1, 1], [1, 1], [0, 1]]
355 |         self.assertEquals(actual, expected)
356 | 
357 | 
358 |     def test_make_last_sequence_insufficient_samples_for_full_sequence(self):
359 |         batch_size = 10
360 |         state_shape = 2
361 |         sequence_length = 4
362 |         capacity = 30
363 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
364 | 
365 |         # tuple 1
366 |         state = np.ones(state_shape)
367 |         action = 0
368 |         reward = 0
369 |         next_state = np.ones(state_shape)
370 |         terminal = False
371 |         rm.store(state, action, reward, terminal)
372 | 
373 |         # tuple 2
374 |         terminal = False
375 |         rm.store(state, action, reward, terminal)
376 | 
377 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
378 |         expected = [[0, 0], [1, 1], [1, 1], [0, 1]]
379 |         self.assertEquals(actual, expected)
380 | 
381 |     def test_make_last_sequence_empty(self):
382 |         batch_size = 10
383 |         state_shape = 2
384 |         sequence_length = 4
385 |         capacity = 30
386 |         rm = replay_memory.SequenceReplayMemory(state_shape, sequence_length, batch_size, capacity)
387 | 
388 |         actual = rm.make_last_sequence(np.arange(state_shape)).tolist()
389 |         expected = [[0, 0], [0, 0], [0, 0], [0, 1]]
390 |         self.assertEquals(actual, expected)
391 | 
392 | 
393 | if __name__ == '__main__':
394 |     unittest.main()
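For reference, the replay-memory tests above can be run on their own with the standard unittest discovery machinery from the repository root; tests/run_tests.py presumably wraps something similar, but its interface is not shown here.

import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='test_replay_memory.py')
unittest.TextTestRunner(verbosity=2).run(suite)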
--------------------------------------------------------------------------------
/tests/test_state_adapters.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import sys
4 | import unittest
5 | 
6 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'scripts')))
7 | 
8 | import state_adapters
9 | 
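The adapter tests that follow document how maze coordinates are one-hot encoded for the agent. For the simplest adapter, CoordinatesToSingleRoomRowColAdapter, the expected vectors correspond to reducing the row and column modulo room_size and concatenating the two one-hot encodings; the other adapters additionally one-hot the absolute row/column or append a one-hot room index, with the exact orderings given by the expected lists in the tests. A small illustrative helper for the single-room case (an editor's sketch, not scripts/state_adapters.py):

import numpy as np

def single_room_row_col_encoding(state, room_size):
    # one-hot the within-room row, then the within-room column
    row, col = state
    encoding = np.zeros(2 * room_size)
    encoding[row % room_size] = 1
    encoding[room_size + (col % room_size)] = 1
    return encoding

# e.g. with room_size=3, state (4, 4) sits at (1, 1) within its room:
# single_room_row_col_encoding((4, 4), 3) -> [0, 1, 0, 0, 1, 0]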
10 | class TestCoordinatesToSingleRoomRowColAdapter(unittest.TestCase):
11 | 
12 |     def test_convert_state_to_agent_format_first_room(self):
13 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
14 |         mdp_formatted_state = (2, 2)
15 |         expected = [0,0,1,0,0,1]
16 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
17 |         self.assertEquals(actual, expected)
18 | 
19 |     def test_convert_state_to_agent_format_fourth_room(self):
20 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
21 |         mdp_formatted_state = (4, 4)
22 |         expected = [0,1,0,0,1,0]
23 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
24 |         self.assertEquals(actual, expected)
25 | 
26 |     def test_convert_state_to_agent_format_off_diagonal_room(self):
27 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
28 |         mdp_formatted_state = (0, 4)
29 |         expected = [1,0,0,0,1,0]
30 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
31 |         self.assertEquals(actual, expected)
32 | 
33 |     def test_convert_state_to_agent_format_off_fourth_room_first_square(self):
34 |         adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=3)
35 |         mdp_formatted_state = (3, 3)
36 |         expected = [1,0,0,1,0,0]
37 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
38 |         self.assertEquals(actual, expected)
39 | 
40 | class TestCoordinatesToRowColAdapter(unittest.TestCase):
41 | 
42 |     def test_convert_state_to_agent_format_first_room(self):
43 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
44 |         mdp_formatted_state = (2, 2)
45 |         expected = [0,0,1,0,0,0,0,0,1,0,0,0]
46 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
47 |         self.assertEquals(actual, expected)
48 | 
49 |     def test_convert_state_to_agent_format_fourth_room(self):
50 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
51 |         mdp_formatted_state = (4, 4)
52 |         expected = [0,0,0,0,1,0,0,0,0,0,1,0]
53 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
54 |         self.assertEquals(actual, expected)
55 | 
56 |     def test_convert_state_to_agent_format_off_diagonal_room(self):
57 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
58 |         mdp_formatted_state = (0, 4)
59 |         expected = [1,0,0,0,0,0,0,0,0,0,1,0]
60 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
61 |         self.assertEquals(actual, expected)
62 | 
63 |     def test_convert_state_to_agent_format_off_fourth_room_first_square(self):
64 |         adapter = state_adapters.CoordinatesToRowColAdapter(room_size=3, num_rooms=2)
65 |         mdp_formatted_state = (3, 3)
66 |         expected = [0,0,0,1,0,0,0,0,0,1,0,0]
67 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
68 |         self.assertEquals(actual, expected)
69 | 
70 | class TestCoordinatesToRowColRoomAdapter(unittest.TestCase):
71 | 
72 |     def test_convert_state_to_agent_format_first_room(self):
73 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
74 |         mdp_formatted_state = (2, 2)
75 |         expected = [0,0,1,0,0,1,1,0,0,0]
76 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
77 |         self.assertEquals(actual, expected)
78 | 
79 |     def test_convert_state_to_agent_format_fourth_room(self):
80 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
81 |         mdp_formatted_state = (4, 4)
82 |         expected = [0,1,0,0,1,0,0,0,0,1]
83 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
84 |         self.assertEquals(actual, expected)
85 | 
86 |     def test_convert_state_to_agent_format_off_diagonal_room_top(self):
87 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
88 |         mdp_formatted_state = (0, 4)
89 |         expected = [1,0,0,0,1,0,0,0,1,0]
90 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
91 |         self.assertEquals(actual, expected)
92 | 
93 |     def test_convert_state_to_agent_format_off_diagonal_room_bottom(self):
94 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
95 |         mdp_formatted_state = (4, 0)
96 |         expected = [0,1,0,1,0,0,0,1,0,0]
97 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
98 |         self.assertEquals(actual, expected)
99 | 
100 |     def test_convert_state_to_agent_format_off_fourth_room_first_square(self):
101 |         adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=3, num_rooms=2)
102 |         mdp_formatted_state = (3, 3)
103 |         expected = [1,0,0,1,0,0,0,0,0,1]
104 |         actual = adapter.convert_state_to_agent_format(mdp_formatted_state).tolist()
105 |         self.assertEquals(actual, expected)
106 | 
107 | 
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     unittest.main()
112 | 
--------------------------------------------------------------------------------