├── README.md
├── cartpole_a3c.py
├── ac_network.py
└── worker.py

/README.md:
--------------------------------------------------------------------------------
# Implementation of Asynchronous Advantage Actor-Critic algorithm using Long Short Term Memory Networks (A3C-LSTM)

>### Important note: The model as it is shown here does not converge on this environment. To see a converging model, please look at the [DDPG implementation](https://github.com/liampetti/DDPG)

Modified from the work of Arthur Juliani: [Simple Reinforcement Learning with Tensorflow Part 8: Asynchronous Actor-Critic Agents (A3C)](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)

Paper can be found here: ["Asynchronous Methods for Deep Reinforcement Learning" - Mnih et al., 2016](https://arxiv.org/abs/1602.01783)

Tested on [CartPole](https://gym.openai.com/envs/CartPole-v0)

### Requirements
[Gym](https://github.com/openai/gym#installation) and [TensorFlow](https://www.tensorflow.org/install/).

### Usage

Training only happens on full mini-batches of 30 steps, which effectively prevents short, poorly performing episodes from influencing training. Rewards are scaled by a small reward factor to allow effective training at higher learning rates.

Models are saved every 100 episodes. A saved model can be reloaded for further training or visualised for testing by setting the corresponding global parameter in `cartpole_a3c.py` to `True`.

This is just example code to test an A3C-LSTM implementation. It should not be considered the optimal way to learn this environment!
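
As a minimal sketch of a typical workflow (assuming the requirements above are installed), edit the global parameters at the top of `cartpole_a3c.py` and run the script directly:

```python
# In cartpole_a3c.py
LOAD_MODEL = False  # True: restore the latest checkpoint from MODEL_DIR and continue training
TEST_MODEL = False  # True: load a checkpoint, run a single worker and render the environment
```

Running `python cartpole_a3c.py` then starts one worker per available CPU thread; checkpoints are written to `./model/`, Gym monitor output to `./results/CartPole-v0`, and TensorBoard summaries to the `train_<worker number>` directories.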
--------------------------------------------------------------------------------
/cartpole_a3c.py:
--------------------------------------------------------------------------------
# ================================================
# Modified from the work of Arthur Juliani:
# Simple Reinforcement Learning with Tensorflow Part 8: Asynchronous Advantage Actor-Critic (A3C)
# https://github.com/awjuliani/DeepRL-Agents/blob/master/A3C-Doom.ipynb
#
# Implementation of Asynchronous Methods for Deep Reinforcement Learning
# Algorithm details can be found here:
# https://arxiv.org/pdf/1602.01783.pdf
#
# Modified to work with OpenAI Gym environments (currently working with CartPole)
# Author: Liam Pettigrew
# =================================================

import os
import threading
import multiprocessing
import numpy as np
import tensorflow as tf

from worker import Worker
from ac_network import AC_Network

# ===========================
# Gym Utility Parameters
# ===========================
# Gym environment
ENV_NAME = 'CartPole-v0'  # Discrete (4, 2)
STATE_DIM = 4
ACTION_DIM = 2
# Directory for storing gym results
MONITOR_DIR = './results/' + ENV_NAME

# ==========================
# Training Parameters
# ==========================
RANDOM_SEED = 1234
# Load previously trained model
LOAD_MODEL = False
# Test and visualise a trained model
TEST_MODEL = False
# Directory for storing session model
MODEL_DIR = './model/'
# Learning rate
LEARNING_RATE = 0.0001
# Discount rate for advantage estimation and reward discounting
GAMMA = 0.99

def main(_):
    global master_network
    global global_episodes

    tf.reset_default_graph()

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    with tf.device("/cpu:0"):
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
        trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        master_network = AC_Network(STATE_DIM, ACTION_DIM, 'global', None)  # Generate global network
        num_workers = multiprocessing.cpu_count()  # Set workers to number of available CPU threads

        # For testing and visualisation we only need one worker
        if TEST_MODEL:
            num_workers = 1

        workers = []
        # Create worker classes
        for i in range(num_workers):
            workers.append(Worker(i, STATE_DIM, ACTION_DIM, trainer, MODEL_DIR, global_episodes,
                                  ENV_NAME, RANDOM_SEED, TEST_MODEL))
        saver = tf.train.Saver(max_to_keep=5)

        # Gym monitor
        if not TEST_MODEL:
            env = workers[0].get_env()
            env.monitor.start(MONITOR_DIR, video_callable=False, force=True)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if LOAD_MODEL or TEST_MODEL:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        if TEST_MODEL:
            env = workers[0].get_env()
            env.monitor.start(MONITOR_DIR, force=True)
            workers[0].work(GAMMA, sess, coord, saver)
        else:
            # This is where the asynchronous magic happens.
            # Start the "work" process for each worker in a separate thread.
            worker_threads = []
            for worker in workers:
                # Bind the loop variable as a default argument so each thread runs its own worker
                worker_work = lambda worker=worker: worker.work(GAMMA, sess, coord, saver)
                t = threading.Thread(target=worker_work)
                t.start()
                worker_threads.append(t)
            coord.join(worker_threads)

if __name__ == '__main__':
    tf.app.run()
--------------------------------------------------------------------------------
/ac_network.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import tensorflow.contrib.slim as slim

# Clipping ratio for gradients
CLIP_NORM = 40.0
# Cell units
CELL_UNITS = 128

# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

class AC_Network():
    def __init__(self, s_size, a_size, scope, trainer):
        with tf.variable_scope(scope):
            # Input
            self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)

            # Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(CELL_UNITS, state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = [c_in, h_in]
            rnn_in = tf.expand_dims(self.inputs, [0])
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in,
                initial_state=state_in,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, CELL_UNITS])

            # Output layers for policy and value estimations
            self.policy = slim.fully_connected(rnn_out, a_size,
                                               activation_fn=tf.nn.softmax,
                                               weights_initializer=normalized_columns_initializer(0.01),
                                               biases_initializer=None)
            self.value = slim.fully_connected(rnn_out, 1,
                                              activation_fn=None,
                                              weights_initializer=normalized_columns_initializer(1.0),
                                              biases_initializer=None)

            # Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None, a_size], dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions, [1])

                # Value loss function
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))

                # Softmax policy loss function
                self.policy_loss = -tf.reduce_sum(tf.log(tf.maximum(self.responsible_outputs, 1e-12)) * self.advantages)

                # Softmax entropy function
                self.entropy = - tf.reduce_sum(self.policy * tf.log(tf.maximum(self.policy, 1e-12)))

                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                # Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, CLIP_NORM)

                # Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
--------------------------------------------------------------------------------
/worker.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import scipy.signal
import numpy as np
import gym
from ac_network import AC_Network

# Size of mini batches to run training on
MINI_BATCH = 30
REWARD_FACTOR = 0.001

# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var, to_var in zip(from_vars, to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

# Weighted random selection returns n_picks random indexes.
# The chance to pick index i is given by the weight weights[i].
def weighted_pick(weights, n_picks):
    t = np.cumsum(weights)
    s = np.sum(weights)
    return np.searchsorted(t, np.random.rand(n_picks)*s)

# Discounting function used to calculate discounted returns.
def discounting(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
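# For example, a quick sanity check of the filter form (assuming gamma = 0.99):
#   discounting(np.array([1., 1., 1.]), 0.99)  ->  [2.9701, 1.99, 1.]
# i.e. each entry is the discounted sum r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...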

# Normalization of inputs and outputs
def norm(x, upper, lower=0.):
    return (x-lower)/max((upper-lower), 1e-12)

class Worker():
    def __init__(self, name, s_size, a_size, trainer, model_path, global_episodes, env_name, seed, test):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
        self.is_test = test
        self.a_size = a_size

        # Create the local copy of the network and the tensorflow op to copy global parameters to local network
        self.local_AC = AC_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)

        self.env = gym.make(env_name)
        self.env.seed(seed)

    def get_env(self):
        return self.env

    def train(self, rollout, sess, gamma, r):
        rollout = np.array(rollout)
        states = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        rewards_list = np.asarray(rewards.tolist()+[r])*REWARD_FACTOR
        discounted_rewards = discounting(rewards_list, gamma)[:-1]

        # Advantage estimation
        # J Schulman, P Moritz, S Levine, M Jordan, P Abbeel,
        # "High-dimensional continuous control using generalized advantage estimation."
        # arXiv preprint arXiv:1506.02438 (2015).
        values_list = np.asarray(values.tolist()+[r])*REWARD_FACTOR
        advantages = rewards + gamma * values_list[1:] - values_list[:-1]
        discounted_advantages = discounting(advantages, gamma)
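        # The lines above form TD residuals delta_t ~ r_t + gamma*V(s_{t+1}) - V(s_t)
        # and then discount them by gamma, i.e. generalized advantage estimation with
        # lambda = 1 (up to the reward scaling applied to the value estimates here).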

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        # sess.run(self.local_AC.reset_state_op)
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.inputs: np.vstack(states),
                     self.local_AC.actions: np.vstack(actions),
                     self.local_AC.advantages: discounted_advantages,
                     self.local_AC.state_in[0]: rnn_state[0],
                     self.local_AC.state_in[1]: rnn_state[1]}
        v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.local_AC.value_loss,
                                               self.local_AC.policy_loss,
                                               self.local_AC.entropy,
                                               self.local_AC.grad_norms,
                                               self.local_AC.var_norms,
                                               self.local_AC.apply_grads],
                                              feed_dict=feed_dict)
        return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n

    def work(self, gamma, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_mini_buffer = []
                episode_values = []
                episode_states = []
                episode_reward = 0
                episode_step_count = 0

                # Restart environment
                terminal = False
                s = self.env.reset()

                rnn_state = self.local_AC.state_init

                # Run an episode
                while not terminal:
                    episode_states.append(s)
                    if self.is_test:
                        self.env.render()

                    # Get preferred action distribution
                    a_dist, v, rnn_state = sess.run([self.local_AC.policy, self.local_AC.value, self.local_AC.state_out],
                                                    feed_dict={self.local_AC.inputs: [s],
                                                               self.local_AC.state_in[0]: rnn_state[0],
                                                               self.local_AC.state_in[1]: rnn_state[1]})

                    a0 = weighted_pick(a_dist[0], 1)  # Use stochastic distribution sampling
                    if self.is_test:
                        a0 = np.argmax(a_dist[0])  # Use maximum when testing
                    a = np.zeros(self.a_size)
                    a[a0] = 1

                    s2, r, terminal, info = self.env.step(np.argmax(a))

                    episode_reward += r

                    episode_buffer.append([s, a, r, s2, terminal, v[0, 0]])
                    episode_mini_buffer.append([s, a, r, s2, terminal, v[0, 0]])

                    episode_values.append(v[0, 0])

                    # Train on mini batches from episode
                    if len(episode_mini_buffer) == MINI_BATCH and not self.is_test:
                        v1 = sess.run([self.local_AC.value],
                                      feed_dict={self.local_AC.inputs: [s],
                                                 self.local_AC.state_in[0]: rnn_state[0],
                                                 self.local_AC.state_in[1]: rnn_state[1]})
                        v_l, p_l, e_l, g_n, v_n = self.train(episode_mini_buffer, sess, gamma, v1[0][0])
                        episode_mini_buffer = []

                    # Set previous state for next step
                    s = s2
                    total_steps += 1
                    episode_step_count += 1

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                if episode_count % 10 == 0 and not episode_count % 100 == 0 and not self.is_test:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    if episode_count % 100 == 0 and not self.is_test:
                        saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.cptk')

                    print("| Reward: " + str(episode_reward), " | Episode", episode_count)
                    sess.run(self.increment)  # Next global episode

                episode_count += 1
--------------------------------------------------------------------------------