├── README.md
├── cartpole_a3c.py
├── ac_network.py
└── worker.py

/README.md:
--------------------------------------------------------------------------------
# Implementation of Asynchronous Advantage Actor-Critic algorithm using Long Short Term Memory Networks (A3C-LSTM)

>### Important note: The model as it is shown here does not converge on this environment. To see a converging model, please look at the [DDPG implementation](https://github.com/liampetti/DDPG)

Modified from the work of Arthur Juliani: [Simple Reinforcement Learning with Tensorflow Part 8: Asynchronous Actor-Critic Agents (A3C)](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)

Paper can be found here: ["Asynchronous Methods for Deep Reinforcement Learning" - Mnih et al., 2016](https://arxiv.org/abs/1602.01783)

Tested on [CartPole](https://gym.openai.com/envs/CartPole-v0)

### Requirements
[Gym](https://github.com/openai/gym#installation) and [TensorFlow](https://www.tensorflow.org/install/).

### Usage

Training only happens on full mini-batches of 30 steps, which effectively prevents short, poorly performing episodes from influencing training. Rewards are scaled by a small reward factor to allow effective training at higher learning rates.

Models are saved every 100 episodes. A saved model can be reloaded for further training or visualised for testing by setting the corresponding global parameter in `cartpole_a3c.py` to `True`.

This is just example code to test an A3C-LSTM implementation. It should not be considered the optimal way to learn this environment!
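
As a minimal sketch of a typical workflow (assuming the requirements above are installed), edit the global parameters at the top of `cartpole_a3c.py` and run the script directly:

```python
# In cartpole_a3c.py
LOAD_MODEL = False  # True: restore the latest checkpoint from MODEL_DIR and continue training
TEST_MODEL = False  # True: load a checkpoint, run a single worker and render the environment
```

Running `python cartpole_a3c.py` then starts one worker per available CPU thread; checkpoints are written to `./model/`, Gym monitor output to `./results/CartPole-v0`, and TensorBoard summaries to the `train_<worker number>` directories.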
--------------------------------------------------------------------------------
/cartpole_a3c.py:
--------------------------------------------------------------------------------
# ================================================
# Modified from the work of Arthur Juliani:
# Simple Reinforcement Learning with Tensorflow Part 8: Asynchronous Advantage Actor-Critic (A3C)
# https://github.com/awjuliani/DeepRL-Agents/blob/master/A3C-Doom.ipynb
#
# Implementation of Asynchronous Methods for Deep Reinforcement Learning
# Algorithm details can be found here:
# https://arxiv.org/pdf/1602.01783.pdf
#
# Modified to work with OpenAI Gym environments (currently working with CartPole)
# Author: Liam Pettigrew
# =================================================

import os
import threading
import multiprocessing
import numpy as np
import tensorflow as tf

from worker import Worker
from ac_network import AC_Network

# ===========================
# Gym Utility Parameters
# ===========================
# Gym environment
ENV_NAME = 'CartPole-v0'  # Discrete (4, 2)
STATE_DIM = 4
ACTION_DIM = 2
# Directory for storing gym results
MONITOR_DIR = './results/' + ENV_NAME

# ==========================
# Training Parameters
# ==========================
RANDOM_SEED = 1234
# Load previously trained model
LOAD_MODEL = False
# Test and visualise a trained model
TEST_MODEL = False
# Directory for storing session model
MODEL_DIR = './model/'
# Learning rate
LEARNING_RATE = 0.0001
# Discount rate for advantage estimation and reward discounting
GAMMA = 0.99

def main(_):
    global master_network
    global global_episodes

    tf.reset_default_graph()

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    with tf.device("/cpu:0"):
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
        trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        master_network = AC_Network(STATE_DIM, ACTION_DIM, 'global', None)  # Generate global network
        num_workers = multiprocessing.cpu_count()  # Set workers to number of available CPU threads

        # For testing and visualisation we only need one worker
        if TEST_MODEL:
            num_workers = 1

        workers = []
        # Create worker classes
        for i in range(num_workers):
            workers.append(Worker(i, STATE_DIM, ACTION_DIM, trainer, MODEL_DIR, global_episodes,
                                  ENV_NAME, RANDOM_SEED, TEST_MODEL))
        saver = tf.train.Saver(max_to_keep=5)

        # Gym monitor
        if not TEST_MODEL:
            env = workers[0].get_env()
            env.monitor.start(MONITOR_DIR, video_callable=False, force=True)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if LOAD_MODEL or TEST_MODEL:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        if TEST_MODEL:
            env = workers[0].get_env()
            env.monitor.start(MONITOR_DIR, force=True)
            workers[0].work(GAMMA, sess, coord, saver)
        else:
            # This is where the asynchronous magic happens.
            # Start the "work" process for each worker in a separate thread.
            worker_threads = []
            for worker in workers:
                # Bind the loop variable as a default argument so each thread runs its own worker
                worker_work = lambda worker=worker: worker.work(GAMMA, sess, coord, saver)
                t = threading.Thread(target=worker_work)
                t.start()
                worker_threads.append(t)
            coord.join(worker_threads)

if __name__ == '__main__':
    tf.app.run()
--------------------------------------------------------------------------------
/ac_network.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import tensorflow.contrib.slim as slim

# Clipping ratio for gradients
CLIP_NORM = 40.0
# Cell units
CELL_UNITS = 128

# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer

class AC_Network():
    def __init__(self, s_size, a_size, scope, trainer):
        with tf.variable_scope(scope):
            # Input
            self.inputs = tf.placeholder(shape=[None, s_size], dtype=tf.float32)

            # Recurrent network for temporal dependencies
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(CELL_UNITS, state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = [c_in, h_in]
            rnn_in = tf.expand_dims(self.inputs, [0])
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in,
                initial_state=state_in,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, CELL_UNITS])

            # Output layers for policy and value estimations
            self.policy = slim.fully_connected(rnn_out, a_size,
                                               activation_fn=tf.nn.softmax,
                                               weights_initializer=normalized_columns_initializer(0.01),
                                               biases_initializer=None)
            self.value = slim.fully_connected(rnn_out, 1,
                                              activation_fn=None,
                                              weights_initializer=normalized_columns_initializer(1.0),
                                              biases_initializer=None)

            # Only the worker networks need ops for loss functions and gradient updating.
            if scope != 'global':
                self.actions = tf.placeholder(shape=[None, a_size], dtype=tf.float32)
                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None], dtype=tf.float32)

                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions, [1])

                # Value loss function
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1])))

                # Softmax policy loss function
                self.policy_loss = -tf.reduce_sum(tf.log(tf.maximum(self.responsible_outputs, 1e-12)) * self.advantages)

                # Softmax entropy function
                self.entropy = - tf.reduce_sum(self.policy * tf.log(tf.maximum(self.policy, 1e-12)))

                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                # Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads, self.grad_norms = tf.clip_by_global_norm(self.gradients, CLIP_NORM)

                # Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads, global_vars))
--------------------------------------------------------------------------------
/worker.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import scipy.signal
import numpy as np
import gym
from ac_network import AC_Network

# Size of mini batches to run training on
MINI_BATCH = 30
REWARD_FACTOR = 0.001

# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var, to_var in zip(from_vars, to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

# Weighted random selection returns n_picks random indexes.
# The chance to pick index i is given by the weight weights[i].
def weighted_pick(weights, n_picks):
    t = np.cumsum(weights)
    s = np.sum(weights)
    return np.searchsorted(t, np.random.rand(n_picks)*s)

# Discounting function used to calculate discounted returns.
def discounting(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
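# For example, a quick sanity check of the filter form (assuming gamma = 0.99):
#   discounting(np.array([1., 1., 1.]), 0.99)  ->  [2.9701, 1.99, 1.]
# i.e. each entry is the discounted sum r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...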

# Normalization of inputs and outputs
def norm(x, upper, lower=0.):
    return (x-lower)/max((upper-lower), 1e-12)

class Worker():
    def __init__(self, name, s_size, a_size, trainer, model_path, global_episodes, env_name, seed, test):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
        self.is_test = test
        self.a_size = a_size

        # Create the local copy of the network and the tensorflow op to copy global parameters to local network
        self.local_AC = AC_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)

        self.env = gym.make(env_name)
        self.env.seed(seed)

    def get_env(self):
        return self.env

    def train(self, rollout, sess, gamma, r):
        rollout = np.array(rollout)
        states = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        rewards_list = np.asarray(rewards.tolist()+[r])*REWARD_FACTOR
        discounted_rewards = discounting(rewards_list, gamma)[:-1]

        # Advantage estimation
        # J Schulman, P Moritz, S Levine, M Jordan, P Abbeel,
        # "High-dimensional continuous control using generalized advantage estimation."
        # arXiv preprint arXiv:1506.02438 (2015).
        values_list = np.asarray(values.tolist()+[r])*REWARD_FACTOR
        advantages = rewards + gamma * values_list[1:] - values_list[:-1]
        discounted_advantages = discounting(advantages, gamma)
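        # The lines above form TD residuals delta_t ~ r_t + gamma*V(s_{t+1}) - V(s_t)
        # and then discount them by gamma, i.e. generalized advantage estimation with
        # lambda = 1 (up to the reward scaling applied to the value estimates here).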

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        # sess.run(self.local_AC.reset_state_op)
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.inputs: np.vstack(states),
                     self.local_AC.actions: np.vstack(actions),
                     self.local_AC.advantages: discounted_advantages,
                     self.local_AC.state_in[0]: rnn_state[0],
                     self.local_AC.state_in[1]: rnn_state[1]}
        v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.local_AC.value_loss,
                                               self.local_AC.policy_loss,
                                               self.local_AC.entropy,
                                               self.local_AC.grad_norms,
                                               self.local_AC.var_norms,
                                               self.local_AC.apply_grads],
                                              feed_dict=feed_dict)
        return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n

    def work(self, gamma, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_mini_buffer = []
                episode_values = []
                episode_states = []
                episode_reward = 0
                episode_step_count = 0

                # Restart environment
                terminal = False
                s = self.env.reset()

                rnn_state = self.local_AC.state_init

                # Run an episode
                while not terminal:
                    episode_states.append(s)
                    if self.is_test:
                        self.env.render()

                    # Get preferred action distribution
                    a_dist, v, rnn_state = sess.run([self.local_AC.policy, self.local_AC.value, self.local_AC.state_out],
                                                    feed_dict={self.local_AC.inputs: [s],
                                                               self.local_AC.state_in[0]: rnn_state[0],
                                                               self.local_AC.state_in[1]: rnn_state[1]})

                    a0 = weighted_pick(a_dist[0], 1)  # Use stochastic distribution sampling
                    if self.is_test:
                        a0 = np.argmax(a_dist[0])  # Use maximum when testing
                    a = np.zeros(self.a_size)
                    a[a0] = 1

                    s2, r, terminal, info = self.env.step(np.argmax(a))

                    episode_reward += r

                    episode_buffer.append([s, a, r, s2, terminal, v[0, 0]])
                    episode_mini_buffer.append([s, a, r, s2, terminal, v[0, 0]])

                    episode_values.append(v[0, 0])

                    # Train on mini batches from episode
                    if len(episode_mini_buffer) == MINI_BATCH and not self.is_test:
                        v1 = sess.run([self.local_AC.value],
                                      feed_dict={self.local_AC.inputs: [s],
                                                 self.local_AC.state_in[0]: rnn_state[0],
                                                 self.local_AC.state_in[1]: rnn_state[1]})
                        v_l, p_l, e_l, g_n, v_n = self.train(episode_mini_buffer, sess, gamma, v1[0][0])
                        episode_mini_buffer = []

                    # Set previous state for next step
                    s = s2
                    total_steps += 1
                    episode_step_count += 1

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                if episode_count % 10 == 0 and not episode_count % 100 == 0 and not self.is_test:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    if episode_count % 100 == 0 and not self.is_test:
                        saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.cptk')

                    print("| Reward: " + str(episode_reward), " | Episode", episode_count)
                    sess.run(self.increment)  # Next global episode

                episode_count += 1
--------------------------------------------------------------------------------