├── LICENSE
├── README.md
├── Report.pdf
├── explore1.py
├── explore2.py
├── full_training.py
├── learning_agent.py
├── model_eval.py
├── models
│   ├── checkpoint
│   ├── model.ckpt
│   └── model.ckpt.meta
├── random_avg_score.py
├── search_params.csv
└── search_params.py

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Jou-ching Sung
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Using Deep Reinforcement Learning to Solve Acrobot
2 |
3 | **NOTE**: This project was done a long time ago when I first started with reinforcement learning, so please excuse some conceptual inaccuracies, e.g. I'm not actually using the DDPG algorithm, and I'm not doing the "asynchronous" part of A3C :) Otherwise, I *am* using policy gradients with actor/critic networks, with advantage (A2C).
4 |
5 | This project uses policy gradients with actor/critic networks and parallel environments to solve OpenAI Gym's Acrobot-v1 environment. As of September 20, 2016, the final learned model placed 3rd on the OpenAI Gym Acrobot-v1 leaderboard, with a score of -80.69 ± 1.06 (see "georgesung's algorithm"): https://gym.openai.com/envs/Acrobot-v1
6 |
7 | This is my capstone project for Udacity's Machine Learning Engineer Nanodegree. For the full capstone project report, please see 'Report.pdf'.
8 |
9 | ## Dependencies
10 | The following dependencies are required:
11 |
12 | * Python 2.7/3.5+
13 | * NumPy
14 | * Matplotlib
15 | * OpenAI Gym
16 | * TensorFlow 0.10.0
17 |
18 | ## How to run
19 | To run the learning agent with pre-set parameter values, run 'python learning_agent.py'. The main reinforcement learning code is located in this file.
20 |
21 | To run the parameter search, run 'python search_params.py'. In this file, you can modify the parameter values over which to search.
22 |
23 | Once you know your optimal parameters, enter them in 'full_training.py', and run 'python full_training.py'. This will perform the full training process on the model.
24 |
25 | To validate your model (make sure results are consistent), run 'python model_eval.py'.
26 |
27 | ## Detailed report
28 | A full, detailed report can be found in 'Report.pdf'.
29 |
30 | ## Final model
31 | My final trained model is available at 'models/model.ckpt', saved as a TensorFlow checkpoint.
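To load it in your own script, here is a minimal sketch (it assumes the TensorFlow 0.x API listed under Dependencies, and rebuilds the actor/critic graphs from `learning_agent.py` before restoring, which is what `model_eval.py` does via `learning_agent.model_evaluation()`):

```python
import tensorflow as tf
import learning_agent

with tf.Graph().as_default(), tf.Session() as sess:
    # The checkpoint only stores weights, so the graph must be rebuilt first
    actor = learning_agent.actor_network()
    critic = learning_agent.critic_network()

    saver = tf.train.Saver()
    saver.restore(sess, 'models/model.ckpt')
    # sess now holds the trained actor/critic weights
```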
32 |
--------------------------------------------------------------------------------
/Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/georgesung/deep_rl_acrobot/ef4d266f2fb16bb6cccf393404047542b3eb9ea2/Report.pdf
--------------------------------------------------------------------------------
/explore1.py:
--------------------------------------------------------------------------------
1 | '''
2 | Exploration:
3 | Run one episode of Acrobot by uniformly sampling random actions.
4 | Prints the observation, action, and reward at each timestep.
5 | '''
6 | import gym
7 |
8 | env = gym.make('Acrobot-v1')
9 |
10 | observation = env.reset()
11 |
12 | t = 0
13 | while True:
14 |     old_obs = observation
15 |     action = env.action_space.sample()
16 |     observation, reward, done, info = env.step(action)
17 |
18 |     print('obs: %s, action: %s, reward: %s' % (old_obs, action, reward))
19 |
20 |     t += 1
21 |     if done:
22 |         print('Episode finished after %d timesteps' % t)
23 |         break
24 |
--------------------------------------------------------------------------------
/explore2.py:
--------------------------------------------------------------------------------
1 | '''
2 | Exploration:
3 | Run 10 episodes of Acrobot by uniformly sampling random actions.
4 | Save all observations in memory, and plot a histogram for all 6 dimensions of the observation.
5 | '''
6 | import gym
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 |
10 | # Helper function to plot numpy histogram
11 | def plot_hist(hist_bins, ylabel, filename):
12 |     hist, bins = hist_bins
13 |     width = 0.7 * (bins[1] - bins[0])
14 |     center = (bins[:-1] + bins[1:]) / 2
15 |
16 |     plt.bar(center, hist, align='center', width=width)
17 |     plt.ylabel(ylabel)
18 |     plt.xlabel('Value')
19 |     #plt.show()
20 |     plt.savefig(filename, bbox_inches='tight')
21 |     plt.clf()
22 |
23 | env = gym.make('Acrobot-v1')
24 | observation = env.reset()
25 |
26 | # List of all observations across 10 episodes
27 | observations = []
28 |
29 | # Run 10 episodes
30 | for i in range(10):
31 |     t = 0
32 |     while True:
33 |         observations.append(observation)
34 |
35 |         action = env.action_space.sample()
36 |         observation, reward, done, info = env.step(action)
37 |
38 |         t += 1
39 |         if done:
40 |             observation = env.reset()  # re-assign so the next episode starts from the reset state
41 |             print('Episode finished after %d timesteps' % t)
42 |             break
43 |
44 | # 'observations' is an Nx6 matrix, where N is the total number of observations
45 | # To create the histograms, take a transpose of observations so we get a 6xN matrix instead
46 | observations_t = np.transpose(observations)
47 |
48 | # For each dimension of the observation, create a histogram
49 | # np.histogram returns (hist, bins)
50 | hist_bins = [np.histogram(dim, bins=20) for dim in observations_t]
51 |
52 | # Plot a histogram for each dimension of the observations
53 | dim_num = 1
54 | for hb in hist_bins:
55 |     plot_hist(hb, 'Dimension %d' % dim_num, 'dim%d.png' % dim_num)
56 |     dim_num += 1
57 |
--------------------------------------------------------------------------------
/full_training.py:
--------------------------------------------------------------------------------
1 | '''
2 | Given the optimal parameter combination, run 1500 episodes of training from scratch,
3 | followed by 8500 episodes of training with the learning rate reduced by a factor of 10.
4 |
5 | More details in the "Refinement" section of the report.
6 | ''' 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import tensorflow as tf 12 | import tensorflow.contrib.slim as slim 13 | import numpy as np 14 | import random 15 | import gym 16 | import math 17 | import matplotlib.pyplot as plt 18 | import pickle 19 | 20 | import learning_agent 21 | 22 | # Set parameters in learning_agent 23 | learning_agent.ACTOR_LR = 0.05 24 | learning_agent.CRITIC_LR_SCALE = 0.5 25 | learning_agent.REWARD_DISCOUNT = 0.97 26 | learning_agent.A_REG_SCALE = 0.00005 27 | learning_agent.C_REG_SCALE = 0.0005 28 | 29 | # Enable saving the model to disk 30 | learning_agent.SAVE_MODEL = True 31 | 32 | ################################################################### 33 | # Phase 1: Run training over 1500 episodes, save the trained model 34 | ################################################################### 35 | 36 | # Configure learning_agent to run 1500 training episodes, from scratch 37 | learning_agent.NUM_EPISODES = 1500 38 | learning_agent.RESUME = False 39 | 40 | # Run RL algorithm until it does not return an error 41 | while True: 42 | avg_rewards1, score1 = learning_agent.run_rl() 43 | if avg_rewards1 is not None: 44 | break 45 | 46 | print('Phase 1 complete, score: %f' % score1) 47 | 48 | ################################################################### 49 | # Phase 2: Load model from Phase 1, reduce the learning rate by 50 | # a factor of 10, run another 8500 training episodes 51 | ################################################################### 52 | 53 | # Configure learning_agent appropriately 54 | learning_agent.NUM_EPISODES = 8500 55 | learning_agent.RESUME = True 56 | learning_agent.ACTOR_LR /= 10 57 | 58 | # Run RL algorithm until it does not return an error 59 | while True: 60 | avg_rewards2, score2 = learning_agent.run_rl() 61 | if avg_rewards2 is not None: 62 | break 63 | 64 | print('Phase 2 complete, final score: %f' % score2) 65 | print('Final model saved at %s' % learning_agent.MODEL_LOC) 66 | 67 | avg_rewards = np.concatenate((avg_rewards1, avg_rewards2)) 68 | 69 | # Save the avg_rewards list just in case we need it later 70 | # Maybe our plot was unclear, and we need to re-plot w/ same data 71 | print('Saving avg_rewards to avg_rewards.p') 72 | with open('avg_rewards.p', 'wb') as avg_rewards_out: 73 | pickle.dump(avg_rewards, avg_rewards_out) 74 | 75 | print('Plotting avg rewards over episodes') 76 | plt.plot(avg_rewards) 77 | plt.title('Average Reward over Episodes') 78 | plt.ylabel('Average Reward') 79 | plt.xlabel('Episode') 80 | plt.show() 81 | -------------------------------------------------------------------------------- /learning_agent.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Implementing policy gradient actor/critic reinforcement learning method ("Deep Deterministic Policy Gradients"), 3 | with parallel environments ("Asynchronous Methods for Deep Reinforcement Learning"). 4 | 5 | See "Implementation" section of the report for more details. 
6 | ''' 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import tensorflow as tf 12 | import tensorflow.contrib.slim as slim 13 | import numpy as np 14 | import random 15 | import gym 16 | import math 17 | import matplotlib.pyplot as plt 18 | 19 | # Environment parameters 20 | ENV = 'Acrobot-v1' 21 | OBS_WIDTH = 6 22 | NUM_ACTIONS = 3 23 | 24 | # Save/restore previously trained models 25 | RESUME = False # resume from previously trained model? 26 | SAVE_MODEL = False # save final trained model? 27 | MODEL_LOC = 'models/model.ckpt' # final model save location 28 | SAVE_THRESHOLD = -200 # must achieve score above this threshold to save model 29 | 30 | # Save results here to upload to OpenAI Gym leaderboard 31 | # Only applicable in model evaluation phase 32 | RECORD_LOC = 'openai_data' 33 | 34 | # Overall parameters 35 | NUM_EPISODES = 1000 36 | MAX_ITER = 3000 # max number of timesteps to run per episode 37 | NUM_ENVS = 5 # number of environments to run in parallel 38 | # NOTE: Not using replay buffer, so set below two parameters to value 1 39 | EPISODES_PER_UPDATE = 1 # i.e. how many episodes per replay buffer 40 | DS_FACTOR = 1 # replay buffer downsample factor (num_samples = buffer_size // DS_FACTOR) 41 | 42 | # Model hyper-parameters 43 | ACTOR_LR = 0.005 # actor network learning rate 44 | CRITIC_LR_SCALE = 0.5 # scaling factor of critic network learning rate, relative to actor 45 | CRITIC_LR = ACTOR_LR * CRITIC_LR_SCALE # do not tune this parameter, tune the above 46 | REWARD_DISCOUNT = 0.97 47 | A_REG_SCALE = 0.0005 # actor network regularization strength 48 | C_REG_SCALE = 0.0005 # critic network regularization strength 49 | 50 | ######################################## 51 | # Helper functions 52 | ######################################## 53 | def discount_rewards(r): 54 | ''' 55 | Take 1D float array of rewards and compute discounted reward 56 | Slightly modified from https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5 57 | ''' 58 | discounted_r = np.zeros_like(r) 59 | running_add = 0. 60 | for t in reversed(range(len(r))): 61 | running_add = running_add * REWARD_DISCOUNT + r[t] 62 | discounted_r[t] = running_add 63 | return discounted_r 64 | 65 | def sample_action(probs): 66 | ''' 67 | Sample action (0/1/2/etc.) from probability distribution probs 68 | ''' 69 | num_actions = len(probs) 70 | threshold = random.uniform(0,1) 71 | cumulative_prob = 0. 72 | for action in range(num_actions): 73 | cumulative_prob += probs[action] 74 | if cumulative_prob > threshold: 75 | return action 76 | return num_actions - 1 # might need this for strange corner case? 
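# A more compact equivalent of sample_action() using NumPy -- an illustrative
# sketch only, nothing else in this file calls it. It assumes probs is a 1-D
# array of action probabilities summing to approximately 1; the explicit loop
# above is kept because its final fallback return tolerates small
# floating-point drift in the softmax output.
def sample_action_np(probs):
    probs = np.asarray(probs, dtype=np.float64)
    return int(np.random.choice(len(probs), p=probs / probs.sum()))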
77 | 78 | ######################################## 79 | # Actor and Critic networks 80 | ######################################## 81 | def actor_network(): 82 | ''' 83 | Actor network, including policy gradient equation and optimizer 84 | ''' 85 | with tf.variable_scope('policy'): 86 | # Inputs 87 | state = tf.placeholder('float', [None, OBS_WIDTH]) # batch_size x obs_width 88 | actions = tf.placeholder('float', [None, NUM_ACTIONS]) # batch_size x num_actions 89 | advantages = tf.placeholder('float', [None, 1]) # batch_size x 1 90 | 91 | # 3-layer fully-connected neural network 92 | mlp_out = slim.stack(state, slim.fully_connected, [6, NUM_ACTIONS], weights_regularizer=slim.l2_regularizer(scale=A_REG_SCALE)) 93 | 94 | # Network output 95 | probabilities = tf.nn.softmax(mlp_out) 96 | 97 | good_probabilities = tf.reduce_sum(tf.mul(probabilities, actions), reduction_indices=[1]) 98 | eligibility = tf.log(good_probabilities) * advantages 99 | 100 | # Loss & optimizer 101 | data_loss = -tf.reduce_sum(eligibility) 102 | reg_losses = slim.losses.get_regularization_losses(scope='policy') 103 | reg_loss = tf.reduce_sum(reg_losses) 104 | total_loss = data_loss + reg_loss 105 | 106 | optimizer = tf.train.AdamOptimizer(ACTOR_LR).minimize(total_loss) 107 | 108 | return probabilities, state, actions, advantages, optimizer 109 | 110 | def critic_network(): 111 | ''' 112 | Critic network, including loss and optimizer 113 | ''' 114 | with tf.variable_scope('value'): 115 | # Inputs 116 | state = tf.placeholder('float', [None, OBS_WIDTH]) # batch_size x obs_width 117 | newvals = tf.placeholder('float', [None, 1]) # batch_size x 1 118 | 119 | # 4-layer fully-connected neural network 120 | calculated = slim.stack(state, slim.fully_connected, [6, 6, 1], weights_regularizer=slim.l2_regularizer(scale=C_REG_SCALE)) 121 | 122 | # Error value 123 | diffs = calculated - newvals 124 | 125 | # Loss & optimizer 126 | data_loss = tf.nn.l2_loss(diffs) 127 | reg_losses = slim.losses.get_regularization_losses(scope='value') 128 | reg_loss = tf.reduce_sum(reg_losses) 129 | total_loss = data_loss + reg_loss 130 | 131 | optimizer = tf.train.AdamOptimizer(CRITIC_LR).minimize(total_loss) 132 | 133 | return calculated, state, newvals, optimizer, total_loss 134 | 135 | ######################################## 136 | # Training and inference processes 137 | ######################################## 138 | def train_networks(replay_buffer, actor, critic, sess): 139 | ''' 140 | Run training on a random subset of experiences in replay buffer 141 | Arguments: 142 | replay_buffer: 2D array-like of the form 143 | [(states, actions, advantages, update_vals)_0, (states, actions, advantages, update_vals)_1, ...] 144 | ''' 145 | actor_calculated, actor_state, actor_actions, actor_advantages, actor_optimizer = actor 146 | critic_calculated, critic_state, critic_newvals, critic_optimizer, critic_loss = critic 147 | 148 | # Down-sample the replay buffer 149 | training_batch_size = len(replay_buffer) // DS_FACTOR 150 | training_batch = np.array(replay_buffer)[np.random.choice(len(replay_buffer), training_batch_size, False)] 151 | 152 | # "Un-zip" training_batch 153 | states, actions, advantages, update_vals = list(zip(*training_batch)) 154 | 155 | print('Average advantage: %s' % np.mean(advantages)) 156 | 157 | # Train critic network (i.e. 
value network) 158 | update_vals_vector = np.expand_dims(update_vals, axis=1) 159 | sess.run(critic_optimizer, feed_dict={critic_state: states, critic_newvals: update_vals_vector}) 160 | 161 | # Train actor network (i.e. policy network) 162 | advantages_vector = np.expand_dims(advantages, axis=1) 163 | sess.run(actor_optimizer, feed_dict={actor_state: states, actor_advantages: advantages_vector, actor_actions: actions}) 164 | # END train_networks 165 | 166 | def run_episode(envs, actor, critic, sess): 167 | ''' 168 | Run a single episode 169 | ''' 170 | # Actor and critic networks 171 | actor_calculated, actor_state, actor_actions, actor_advantages, actor_optimizer = actor 172 | critic_calculated, critic_state, critic_newvals, critic_optimizer, critic_loss = critic 173 | 174 | # Reset env 175 | observation = [env.reset() for env in envs] 176 | 177 | # Total undiscounted reward for each env 178 | totalreward = [0 for _ in range(NUM_ENVS)] 179 | 180 | # States, actions, rewards across all timesteps in episode, across all envs 181 | states = [] 182 | actions = [] 183 | rewards = [] 184 | 185 | # Keep track of which envs are done 186 | done_mask = [False for _ in range(NUM_ENVS)] 187 | 188 | # Interact with the environment 189 | for _ in range(MAX_ITER): 190 | # Actor network calculates policy 191 | probs = sess.run(actor_calculated, feed_dict={actor_state: observation}) 192 | 193 | # Sample action from stochastic policy 194 | action = [sample_action(prob) for prob in probs] 195 | 196 | # Record state and action if applicable. Record None object if particular env is already done. 197 | states.append([observation[i] if not done_mask[i] else None for i in range(NUM_ENVS)]) 198 | 199 | action_onehot = [np.zeros(NUM_ACTIONS) for _ in range(NUM_ENVS)] 200 | for i in range(NUM_ENVS): 201 | action_onehot[i][action[i]] = 1 202 | actions.append([action_onehot[i] if not done_mask[i] else None for i in range(NUM_ENVS)]) 203 | 204 | # Reset envs that are already done 205 | for i in range(NUM_ENVS): 206 | if done_mask[i]: 207 | envs[i].reset() 208 | 209 | # Take action in each environment, and store the feedback 210 | # If env is already done, we will ignore it's result later 211 | observation, reward, done, info = list(zip(*[envs[i].step(action[i]) for i in range(NUM_ENVS)])) 212 | 213 | # Record the reward, but record None if env is already done 214 | rewards.append([reward[i] if not done_mask[i] else None for i in range(NUM_ENVS)]) 215 | 216 | # Check which env(s) are done in this iteration 217 | for i in range(NUM_ENVS): 218 | if done[i]: 219 | done_mask[i] = True 220 | 221 | # If all envs are done, break 222 | if all(done_mask): 223 | break 224 | 225 | # Convert states, actions, and rewards tensor w/ shape num_iters x NUM_ENVS into NUM_ENVS x num_iters (i.e. 
matrix transpose) 226 | states_per_env = list(zip(*states)) 227 | actions_per_env = list(zip(*actions)) 228 | rewards_per_env = list(zip(*rewards)) 229 | 230 | # For all envs, for all applicable timesteps, do necessary calculations to add this experience 231 | experiences = [] 232 | for env_idx in range(NUM_ENVS): 233 | # Some envs finished earlier than others, so remove the None objects in lists 234 | filtered_states = [s for s in states_per_env[env_idx] if s is not None] 235 | filtered_actions = [a for a in actions_per_env[env_idx] if a is not None] 236 | filtered_rewards = [r for r in rewards_per_env[env_idx] if r is not None] 237 | 238 | # Compute discounted rewards for this env 239 | disc_rewards = discount_rewards(filtered_rewards) 240 | 241 | # Critic network computes the estimated value of the state/observation 242 | baseline = sess.run(critic_calculated,feed_dict={critic_state: filtered_states}).ravel() 243 | 244 | # Advantage: How much better is the observed discounted reward vs. baseline computed by critic 245 | advantages = disc_rewards - baseline 246 | 247 | # Record this experience 248 | experiences += zip(filtered_states, filtered_actions, advantages, disc_rewards) 249 | 250 | # For book-keeping, record total undiscounted reward for this env 251 | totalreward[env_idx] = sum(filtered_rewards) 252 | 253 | return totalreward, experiences 254 | # END run_episode 255 | 256 | ######################################## 257 | # Top-level RL algorithm 258 | ######################################## 259 | def run_rl(): 260 | ''' 261 | Run the reinforcement learning process 262 | ''' 263 | # Manage the TensorFlow context, i.e. the default graph and session 264 | with tf.Graph().as_default(), tf.Session() as sess: 265 | # Create multiple parallel environments, per "Asynchronous Methods" paper 266 | envs = [gym.make(ENV) for _ in range(NUM_ENVS)] 267 | 268 | # "Instantiate" actor/critic networks 269 | actor = actor_network() 270 | critic = critic_network() 271 | 272 | # TF saver to save/restore model 273 | saver = tf.train.Saver() 274 | 275 | # Initialize or restore model 276 | if RESUME: 277 | print('Restoring model from %s' % MODEL_LOC) 278 | saver.restore(sess, MODEL_LOC) 279 | else: 280 | sess.run(tf.initialize_all_variables()) 281 | 282 | # Variables for book-keeping and replay buffer 283 | total_reward = 0. 
284 | avg_rewards = [] 285 | max_avg_reward = None 286 | replay_buffer = [] 287 | 288 | # Run the reinforcement learning algorithm 289 | for episode_count in range(NUM_EPISODES): 290 | reward, experiences = run_episode(envs, actor, critic, sess) 291 | 292 | # If more than 1/2 of the envs exceeded MAX_ITER timesteps, 293 | # generally this means the learning process becomes unstable, since we have incomplete experiences 294 | # Return None to indicate such an error 295 | exceed_count = 0 296 | for r in reward: 297 | if r <= -MAX_ITER: 298 | exceed_count += 1 299 | if exceed_count > NUM_ENVS//2: 300 | print('ERROR: More than 1/2 of envs exceeded %s timesteps, aborting this run' % MAX_ITER) 301 | return None, None 302 | 303 | # If no such error as above, proceed 304 | total_reward += np.mean(reward) 305 | replay_buffer += experiences 306 | print('Episode %s, reward (max_iter=%s): %s' % (episode_count, MAX_ITER, reward)) 307 | 308 | if (episode_count+1) % EPISODES_PER_UPDATE == 0: 309 | avg_reward = total_reward / EPISODES_PER_UPDATE 310 | avg_rewards.append(avg_reward) 311 | print('Average reward for past %s episodes: %s' % (EPISODES_PER_UPDATE, avg_reward)) 312 | 313 | # If the average reward is good enough, then save the model 314 | if max_avg_reward is None: 315 | max_avg_reward = avg_reward 316 | 317 | if avg_reward > max_avg_reward: 318 | max_avg_reward = avg_reward 319 | print('New max average reward') 320 | 321 | if avg_reward > SAVE_THRESHOLD and SAVE_MODEL: 322 | save_path = saver.save(sess, MODEL_LOC) 323 | print('Model saved in file: %s' % save_path) 324 | 325 | print('Running training after episode %s...' % str(episode_count+1)) 326 | train_networks(replay_buffer, actor, critic, sess) 327 | print('Training complete') 328 | 329 | total_reward = 0. 330 | replay_buffer = [] 331 | 332 | # Calculate final score 333 | # Run 100 episodes, no training steps, and only look at a single envs[0] 334 | print('Calculating final score (avg reward over 100 episodes)') 335 | score = 0.0 336 | for episode_count in range(100): 337 | reward, experiences = run_episode(envs, actor, critic, sess) 338 | score += reward[0] 339 | score /= 100 340 | print('Final score: %f' % score) 341 | 342 | # Save final model 343 | if SAVE_MODEL: 344 | save_path = saver.save(sess, MODEL_LOC) 345 | print('Final model saved in file: %s' % save_path) 346 | 347 | # Return list of average total rewards 348 | return avg_rewards, score 349 | 350 | ######################################## 351 | # Model evaluation 352 | # Only run this after model is trained 353 | ######################################## 354 | def model_evaluation(): 355 | ''' 356 | Run 1000 consecutive episodes using a pre-trained model 357 | Also runs the OpenAI Gym environment monitor, so upload results 358 | The OpenAI Gym results are stored in RECORD_LOC 359 | 360 | Returns a list of total rewards over 1000 episodes 361 | ''' 362 | 363 | # Rewards over episodes 364 | rewards = [] 365 | 366 | with tf.Graph().as_default(), tf.Session() as sess: 367 | # Similar code as run_rl(), except we only have 1 env, and perform no training 368 | envs = [gym.make(ENV)] # create a list of only 1 env 369 | 370 | actor = actor_network() 371 | critic = critic_network() 372 | 373 | print('Restoring model from %s' % MODEL_LOC) 374 | saver = tf.train.Saver() 375 | saver.restore(sess, MODEL_LOC) 376 | 377 | print('Running 1000 episodes. 
Recording experiment data at %s' % RECORD_LOC) 378 | 379 | envs[0].monitor.start(RECORD_LOC, force=True) # start OpenAI Gym environment monitor 380 | 381 | for episode_count in range(1000): 382 | reward, experiences = run_episode(envs, actor, critic, sess) 383 | rewards.append(reward[0]) 384 | 385 | envs[0].monitor.close() # close OpenAI Gym environment monitor 386 | 387 | return rewards 388 | 389 | ######################################## 390 | # If executing this python file stand-alone 391 | ######################################## 392 | if __name__ == '__main__': 393 | # Run RL algorithm until it does not return an error 394 | while True: 395 | avg_rewards, score = run_rl() 396 | 397 | if avg_rewards is not None: 398 | break 399 | 400 | # Plot average rewards over each batch 401 | print('Plotting avg rewards over episodes') 402 | plt.plot(avg_rewards) 403 | plt.ylabel('Average Reward') 404 | plt.xlabel('Episode') 405 | plt.show() 406 | -------------------------------------------------------------------------------- /model_eval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluate and validate the final model. 3 | More details in the "Model Evaluation and Validation" section of the report. 4 | ''' 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import tensorflow as tf 10 | import tensorflow.contrib.slim as slim 11 | import numpy as np 12 | import random 13 | import gym 14 | import math 15 | import matplotlib.pyplot as plt 16 | import pickle 17 | 18 | import learning_agent 19 | 20 | # Run model evaluation function from the learning_agent 21 | learning_agent.NUM_ENVS = 1 22 | rewards = learning_agent.model_evaluation() 23 | 24 | # Print mean and standard deviation of rewards 25 | print('Rewards mean: %f\nRewards std-dev: %f' % (np.mean(rewards), np.std(rewards))) 26 | 27 | # Avg reward for final 100 episodes 28 | print('Average reward for final 100 episodes: %f' % np.mean(rewards[-100:])) 29 | 30 | # Save the rewards list just in case we need it later 31 | print('Saving rewards to eval_rewards.p') 32 | with open('eval_rewards.p', 'wb') as rewards_out: 33 | pickle.dump(rewards, rewards_out) 34 | 35 | # Plot rewards over episodes for visualization 36 | plt.plot(rewards) 37 | plt.title('Reward over Episodes') 38 | plt.ylabel('Reward') 39 | plt.xlabel('Episode') 40 | plt.show() -------------------------------------------------------------------------------- /models/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /models/model.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/deep_rl_acrobot/ef4d266f2fb16bb6cccf393404047542b3eb9ea2/models/model.ckpt -------------------------------------------------------------------------------- /models/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/deep_rl_acrobot/ef4d266f2fb16bb6cccf393404047542b3eb9ea2/models/model.ckpt.meta -------------------------------------------------------------------------------- /random_avg_score.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Find the average score 
over 100 episodes, by taking random actions at each timestep 3 | ''' 4 | import gym 5 | import numpy as np 6 | 7 | env = gym.make('Acrobot-v1') 8 | observation = env.reset() 9 | 10 | # List of rewards after each episode 11 | rewards = [] 12 | 13 | # Run 100 episodes 14 | for i in range(100): 15 | total_reward = 0 16 | 17 | while True: 18 | # Take a random action in the environment 19 | action = env.action_space.sample() 20 | observation, reward, done, info = env.step(action) 21 | 22 | total_reward += reward 23 | 24 | if done: 25 | print('Episode %s finished with total reward of %s' % (i+1, total_reward)) 26 | break 27 | 28 | rewards.append(total_reward) 29 | observation = env.reset() 30 | 31 | # Calculate the average reward over 100 episodes 32 | avg_reward = np.mean(rewards) 33 | print('Average reward over 100 episodes: %s' % avg_reward) -------------------------------------------------------------------------------- /search_params.csv: -------------------------------------------------------------------------------- 1 | ACTOR_LR,CRITIC_LR_SCALE,REWARD_DISCOUNT,A_REG_SCALE,C_REG_SCALE,score 2 | 0.050000,0.750000,0.990000,0.000050,0.005000,-163.780000 3 | 0.001000,0.750000,0.950000,0.000500,0.000050,-374.634000 4 | 0.050000,0.250000,0.970000,0.005000,0.000050,-93.812000 5 | 0.050000,1.000000,0.950000,0.000050,0.005000,-158.192000 6 | 0.001000,0.750000,0.990000,0.000500,0.000500,-118.540000 7 | 0.005000,1.250000,0.950000,0.000500,0.000050,-174.730000 8 | 0.010000,0.500000,0.990000,0.000050,0.000500,-173.232000 9 | 0.001000,0.750000,0.970000,0.000500,0.000050,-341.042000 10 | 0.001000,0.250000,0.970000,0.005000,0.000050,-180.224000 11 | 0.010000,1.250000,0.970000,0.000050,0.000500,-279.272000 12 | 0.005000,0.250000,0.950000,0.000050,0.000050,-220.402000 13 | 0.001000,0.250000,0.970000,0.005000,0.000500,-602.762000 14 | 0.050000,0.500000,0.930000,0.005000,0.005000,-351.888000 15 | 0.050000,1.250000,0.930000,0.005000,0.000500,-183.476000 16 | 0.050000,0.500000,0.970000,0.000050,0.000500,-92.944000 17 | 0.010000,0.500000,0.950000,0.005000,0.000500,-108.762000 18 | 0.001000,0.750000,0.990000,0.000500,0.000050,-201.282000 19 | 0.005000,1.250000,0.930000,0.000050,0.000050,-234.574000 20 | 0.010000,0.500000,0.990000,0.000500,0.000500,-574.722000 21 | 0.001000,1.250000,0.930000,0.005000,0.005000,-454.514000 22 | -------------------------------------------------------------------------------- /search_params.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Searching for the optimal parameter combinations. 3 | More details in the "Refinement" section of the report. 
4 | ''' 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import tensorflow as tf 10 | import tensorflow.contrib.slim as slim 11 | import numpy as np 12 | import random 13 | import gym 14 | import math 15 | 16 | import learning_agent 17 | 18 | # Configure the parameter search 19 | NUM_ITERS = 20 # how many iterations of random parameter search 20 | NUM_REPEAT = 3 # how many times to repeat the learning process for a given parameter set 21 | REPORT_FILE = 'search_params.csv' # save results to this file 22 | 23 | # Lists of hyper-parameter values on which to perform random search (w/o replacement) 24 | ACTOR_LR = [0.05, 0.01, 0.005, 0.001, 0.0005] # 0.005 -> 0.0005 25 | CRITIC_LR_SCALE = [1.25, 1.0, 0.75, 0.5, 0.25] 26 | REWARD_DISCOUNT = [0.99, 0.97, 0.95, 0.93] 27 | A_REG_SCALE = [0.005, 0.0005, 0.00005] 28 | C_REG_SCALE = [0.005, 0.0005, 0.00005] 29 | 30 | # Helper functions 31 | def choose_params(results): 32 | ''' 33 | Choose random parameter combination w/o replacement 34 | results is the dict that maps (param1, param2, ...) --> score 35 | Returns tuple of chosen parameters 36 | ''' 37 | # Keep trying random parameter combinations until we get a unique combination 38 | while True: 39 | actor_lr = np.random.choice(ACTOR_LR) 40 | critic_lr_scale = np.random.choice(CRITIC_LR_SCALE) 41 | reward_discount = np.random.choice(REWARD_DISCOUNT) 42 | a_reg_scale = np.random.choice(A_REG_SCALE) 43 | c_reg_scale = np.random.choice(C_REG_SCALE) 44 | 45 | params = (actor_lr, critic_lr_scale, reward_discount, a_reg_scale, c_reg_scale) 46 | 47 | if params not in results: 48 | break 49 | 50 | return params 51 | 52 | def set_params(params): 53 | ''' 54 | Sets the parameters specified in params tuple to the learning_agent 55 | ''' 56 | learning_agent.ACTOR_LR = params[0] 57 | learning_agent.CRITIC_LR_SCALE = params[1] 58 | learning_agent.REWARD_DISCOUNT = params[2] 59 | learning_agent.A_REG_SCALE = params[3] 60 | learning_agent.C_REG_SCALE = params[4] 61 | 62 | 63 | ########################################## 64 | # Main script to perform parameter search 65 | ########################################## 66 | 67 | # Dictionary to store results 68 | # (param1, param2, ...) --> score 69 | results = {} 70 | 71 | # Write csv header of report file 72 | report = open(REPORT_FILE, 'w') 73 | report.write('ACTOR_LR,CRITIC_LR_SCALE,REWARD_DISCOUNT,A_REG_SCALE,C_REG_SCALE,score\n') 74 | report.close() 75 | 76 | for _ in range(NUM_ITERS): 77 | params = choose_params(results) # choose random parameter combination w/o replacement 78 | set_params(params) # set the parameters 79 | 80 | for _ in range(NUM_REPEAT): 81 | # Run RL algorithm until it does not return an error 82 | while True: 83 | avg_rewards, score = learning_agent.run_rl() 84 | if avg_rewards is not None: 85 | break 86 | 87 | # Save the max score for this parameter combination 88 | if params in results: 89 | if score > results[params]: 90 | results[params] = score 91 | else: 92 | results[params] = score 93 | 94 | # Append results to report file 95 | report = open(REPORT_FILE, 'a') 96 | report.write('%f,%f,%f,%f,%f,%f\n' % (*params, score)) 97 | report.close() 98 | 99 | --------------------------------------------------------------------------------
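As a convenience when moving from the parameter search to full training, the best-scoring row of search_params.csv can be pulled out with a few lines of NumPy. This is a minimal sketch (assuming the CSV sits in the working directory and keeps the column order of its header); the printed values would then be copied into full_training.py by hand:

import numpy as np

# Load the results written by search_params.py, skipping the header row
rows = np.genfromtxt('search_params.csv', delimiter=',', skip_header=1)

# The score is the last column; higher (less negative) is better
best = rows[np.argmax(rows[:, -1])]

print('ACTOR_LR=%g, CRITIC_LR_SCALE=%g, REWARD_DISCOUNT=%g, A_REG_SCALE=%g, C_REG_SCALE=%g (score=%g)' % tuple(best))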