├── .gitignore ├── README.md ├── a3c.py ├── agent.py ├── environment.py ├── setup.py ├── thread.py └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Generated directories. 92 | models -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A3C 2 | Deep reinforcement learning using an asynchronous advantage actor-critic (A3C) model written in [TensorFlow](https://www.tensorflow.org/). 3 | 4 | This AI does not rely on hand-engineered rules or features. Instead, it masters the environment by looking at raw pixels and learning from experience, just as humans do. 5 | 6 | ## Dependencies 7 | * NumPy 8 | * OpenAI Gym 0.10 9 | * Pillow 10 | * SciPy 11 | * TensorFlow 1.0 12 | 13 | ## Learning Environment 14 | Uses environments provided by [OpenAI Gym](https://gym.openai.com/). 15 | 16 | ## Preprocessing 17 | Each frame is transformed into a 47×47 grayscale image with 32-bit float values between 0 and 1. No image cropping is performed. Reward signals are restricted to -1, 0 and 1. 18 | 19 | ## Network Architecture 20 | The input layer consists of a 47×47 grayscale image. 21 | 22 | Four convolutional layers follow, each with 32 filters of size 3×3 and stride 2 and each applying the rectifier nonlinearity. 23 | 24 | A recurrent layer follows, consisting of 256 LSTM units. 25 | 26 | Lastly, the network diverges into two output layers – one is a probability distribution over actions (represented as logits), the other is a single linear output representing the value function. 27 | 28 | ## Acknowledgements 29 | Implementation inspired by the [OpenAI Universe](https://universe.openai.com/) reference agent. 30 | 31 | Heavily influenced by DeepMind's seminal paper ['Asynchronous Methods for Deep Reinforcement Learning' (Mnih et al., 2016)](https://arxiv.org/abs/1602.01783). 
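## Training
A minimal sketch of a training run using the included `train.sh`, assuming `tmux`, `htop` and `python3` are installed:

```sh
# Start the master, one worker per CPU core, TensorBoard and htop inside a tmux session named 'a3c'.
./train.sh

# TensorBoard listens on port 15000; open http://localhost:15000 to monitor progress.
tmux attach -t a3c        # Inspect the individual threads.
tmux kill-session -t a3c  # End the session.
```

Checkpoints and TensorBoard event files are written to `./models/$ENV_NAME`.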
32 | -------------------------------------------------------------------------------- /a3c.py: -------------------------------------------------------------------------------- 1 | """Defines policy networks for asynchronous advantage actor-critic architectures. 2 | 3 | Heavily influenced by DeepMind's seminal paper 'Asynchronous Methods for Deep Reinforcement 4 | Learning' (Mnih et al., 2016). 5 | """ 6 | 7 | import math 8 | import numpy as np 9 | import tensorflow as tf 10 | 11 | 12 | def _convolutional_layer(x, shape, stride, activation_fn): 13 | if len(shape) != 4: 14 | raise ValueError('Shape "{}" is invalid. Must have length 4.'.format(shape)) 15 | 16 | num_input_params = shape[0] * shape[1] * shape[2] 17 | num_output_params = shape[0] * shape[1] * shape[3] 18 | maxval = math.sqrt(6 / (num_input_params + num_output_params)) 19 | W = tf.Variable(tf.random_uniform(shape, -maxval, maxval), name='Weights') 20 | b = tf.Variable(tf.constant(0, tf.float32, [shape[3]]), name='Bias') 21 | conv = tf.nn.conv2d(x, W, [1, stride, stride, 1], 'VALID') 22 | 23 | return activation_fn(tf.nn.bias_add(conv, b)) 24 | 25 | 26 | def _fully_connected_layer(x, shape, activation_fn): 27 | if len(shape) != 2: 28 | raise ValueError('Shape "{}" is invalid. Must have length 2.'.format(shape)) 29 | 30 | maxval = math.sqrt(6 / (shape[0] + shape[1])) 31 | W = tf.Variable(tf.random_uniform(shape, -maxval, maxval), name='Weights') 32 | b = tf.Variable(tf.constant(0, tf.float32, [shape[1]]), name='Bias') 33 | 34 | return activation_fn(tf.matmul(x, W) + b) 35 | 36 | 37 | class PolicyNetwork(): 38 | def __init__(self, num_actions, state_shape): 39 | """Defines a policy network implemented as a convolutional recurrent neural network. 40 | 41 | Args: 42 | num_actions: Number of possible actions. 43 | state_shape: A vector with three values, representing the width, height and depth of 44 | input states. For example, the shape of 100x80 RGB images is [100, 80, 3]. 45 | """ 46 | 47 | width, height, depth = state_shape 48 | self.x = tf.placeholder(tf.float32, [None, width, height, depth], name='Input_States') 49 | 50 | with tf.name_scope('Convolutional_Layer_1'): 51 | h_conv1 = _convolutional_layer(self.x, [3, 3, depth, 32], 2, tf.nn.relu) 52 | 53 | with tf.name_scope('Convolutional_Layer_2'): 54 | h_conv2 = _convolutional_layer(h_conv1, [3, 3, 32, 32], 2, tf.nn.relu) 55 | 56 | with tf.name_scope('Convolutional_Layer_3'): 57 | h_conv3 = _convolutional_layer(h_conv2, [3, 3, 32, 32], 2, tf.nn.relu) 58 | 59 | with tf.name_scope('Convolutional_Layer_4'): 60 | h_conv4 = _convolutional_layer(h_conv3, [3, 3, 32, 32], 2, tf.nn.relu) 61 | 62 | # Flatten the output to feed it into the LSTM layer. 63 | num_params = np.prod(h_conv4.get_shape().as_list()[1:]) 64 | h_flat = tf.reshape(h_conv4, [-1, num_params]) 65 | 66 | with tf.name_scope('LSTM_Layer'): 67 | self.lstm_state = (tf.placeholder(tf.float32, [1, 256]), 68 | tf.placeholder(tf.float32, [1, 256])) 69 | 70 | self.initial_lstm_state = (np.zeros([1, 256], np.float32), 71 | np.zeros([1, 256], np.float32)) 72 | 73 | lstm_state = tf.contrib.rnn.LSTMStateTuple(*self.lstm_state) 74 | lstm = tf.contrib.rnn.BasicLSTMCell(256) 75 | 76 | # tf.nn.dynamic_rnn expects inputs of shape [batch_size, time, features], but the shape 77 | # of h_flat is [batch_size, features]. We want the batch_size dimension to be treated as 78 | # the time dimension, so the input is redundantly expanded to [1, batch_size, features]. 
79 | # The LSTM layer will assume it has 1 batch with a time dimension of length batch_size. 80 | batch_size = tf.shape(h_flat)[:1] # [:1] is a trick to correctly get the dynamic shape. 81 | lstm_input = tf.expand_dims(h_flat, [0]) 82 | lstm_output, self.new_lstm_state = tf.nn.dynamic_rnn(lstm, 83 | lstm_input, 84 | batch_size, 85 | lstm_state) 86 | # Delete the fake batch dimension. 87 | lstm_output = tf.squeeze(lstm_output, [0]) 88 | 89 | self.action_logits = _fully_connected_layer(lstm_output, [256, num_actions], tf.identity) 90 | self.value = tf.squeeze(_fully_connected_layer(lstm_output, [256, 1], tf.identity)) 91 | self.action = tf.squeeze(tf.multinomial( 92 | self.action_logits - tf.reduce_max(self.action_logits, 1, keep_dims=True), 1)) 93 | self.parameters = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 94 | tf.get_variable_scope().name) 95 | 96 | def get_initial_lstm_state(self): 97 | """Returns a value that can be used as the initial state of the LSTM unit of the network.""" 98 | 99 | return self.initial_lstm_state 100 | 101 | def sample_action(self, state, lstm_state): 102 | """Samples an action for the specified state from the learned mixed strategy. 103 | 104 | Args: 105 | state: State of the environment. 106 | lstm_state: The state of the long short-term memory unit of the network. Use the 107 | get_initial_lstm_state method when unknown. 108 | 109 | Returns: 110 | An action, the value of the specified state and the new state of the LSTM unit. 111 | """ 112 | 113 | sess = tf.get_default_session() 114 | feed_dict = {self.x: [state], self.lstm_state: lstm_state} 115 | return sess.run((self.action, self.value, self.new_lstm_state), feed_dict) 116 | 117 | def estimate_value(self, state, lstm_state): 118 | """Estimates the value of the specified state. 119 | 120 | Args: 121 | state: State of the environment. 122 | 123 | Returns: 124 | The value of the specified state. 125 | """ 126 | 127 | sess = tf.get_default_session() 128 | return sess.run(self.value, {self.x: [state], self.lstm_state: lstm_state}) 129 | -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | """Defines an agent that learns to play Atari games using an A3C architecture. 2 | 3 | Heavily influenced by DeepMind's seminal paper 'Asynchronous Methods for Deep Reinforcement 4 | Learning' (Mnih et al., 2016). 5 | """ 6 | 7 | import a3c 8 | import logging 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | from scipy import signal 13 | 14 | 15 | LOGGER = logging.getLogger(__name__) 16 | LOGGER.setLevel(logging.INFO) 17 | 18 | 19 | def _apply_discount(rewards, discount): 20 | """Discounts the specified rewards exponentially. 21 | 22 | Given rewards = [r0, r1, r2, r3] and discount = 0.99, the result is: 23 | [r0 + 0.99 * (r1 + 0.99 * (r2 + 0.99 * r3)), 24 | r1 + 0.99 * (r2 + 0.99 * r3), 25 | r2 + 0.99 * r3, 26 | r3] 27 | 28 | Example: rewards = [10, 20, 30, 40] and discount = 0.99 -> [98.01496, 88.904, 69.6, 40]. 29 | 30 | Returns: 31 | The discounted rewards. 32 | """ 33 | 34 | return signal.lfilter([1], [1, -discount], rewards[::-1])[::-1] 35 | 36 | 37 | class Agent(): 38 | def __init__(self, 39 | worker_index, 40 | env, 41 | render, 42 | num_local_steps, 43 | learning_rate, 44 | entropy_regularization, 45 | max_gradient_norm, 46 | discount, 47 | summary_writer, 48 | summary_update_interval): 49 | """An agent that learns to play Atari games using an A3C architecture. 
50 | 51 | Args: 52 | worker_index: Index of the worker thread that is running this agent. 53 | env: An AtariWrapper object (see 'environment.py') that wraps over an OpenAI Gym Atari 54 | environment. 55 | render: Determines whether to display the game screen. 56 | num_local_steps: Number of experiences used per worker when updating the model. 57 | learning_rate: The speed with which the network learns from new examples. 58 | entropy_regularization: The strength of the entropy regularization term. 59 | max_gradient_norm: Maximum value allowed for the L2-norms of gradients. Gradients with 60 | norms that would otherwise surpass this value are scaled down. 61 | discount: Discount factor for future rewards. 62 | summary_writer: A TensorFlow object that writes summaries. 63 | summary_update_interval: Number of training steps needed to update the summary data. 64 | """ 65 | 66 | self.worker_index = worker_index 67 | self.env = env 68 | self.render = render 69 | self.num_local_steps = num_local_steps 70 | self.discount = discount 71 | self.summary_writer = summary_writer 72 | self.summary_update_interval = summary_update_interval 73 | self.num_times_trained = 0 74 | 75 | worker_device = '/job:thread/task:{}/cpu:0'.format(worker_index) 76 | 77 | with tf.device(tf.train.replica_device_setter(1, '/job:master', worker_device)): 78 | with tf.variable_scope('global'): 79 | self.global_network = a3c.PolicyNetwork(len(env.action_space), 80 | env.observation_space) 81 | self.global_step = tf.get_variable('global_step', 82 | [], 83 | tf.int32, 84 | tf.constant_initializer(0, tf.int32), 85 | trainable=False) 86 | with tf.device(worker_device): 87 | with tf.variable_scope('local'): 88 | self.local_network = a3c.PolicyNetwork(len(env.action_space), env.observation_space) 89 | self.local_network.global_step = self.global_step 90 | 91 | self.action = tf.placeholder(tf.int32, [None], 'Action') 92 | self.advantage = tf.placeholder(tf.float32, [None], 'Advantage') 93 | self.discounted_reward = tf.placeholder(tf.float32, [None], 'Discounted_Reward') 94 | 95 | # Estimate the policy loss using the cross-entropy loss function. 96 | action_logits = self.local_network.action_logits 97 | policy_loss = tf.reduce_sum( 98 | self.advantage * tf.nn.sparse_softmax_cross_entropy_with_logits(logits=action_logits, 99 | labels=self.action)) 100 | 101 | # Regularize the policy loss by adding uncertainty (subtracting entropy). High entropy means 102 | # the agent is uncertain (meaning, it assigns similar probabilities to multiple actions). 103 | # Low entropy means the agent is sure of which action it should perform next. 104 | entropy = -tf.reduce_sum(tf.nn.softmax(action_logits) * tf.nn.log_softmax(action_logits)) 105 | policy_loss -= entropy_regularization * entropy 106 | 107 | # Estimate the value loss using the sum of squared errors. 108 | value_loss = tf.nn.l2_loss(self.local_network.value - self.discounted_reward) 109 | 110 | # Estimate the final loss. 111 | self.loss = policy_loss + 0.5 * value_loss 112 | 113 | # Fetch and clip the gradients of the local network. 114 | gradients = tf.gradients(self.loss, self.local_network.parameters) 115 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm) 116 | 117 | # Update the global network using the clipped gradients. 
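            # Note that the gradients are computed with respect to the local network's parameters
            # but applied to the corresponding global parameters, and the global step advances by
            # the number of experiences in the batch.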
118 | batch_size = tf.shape(self.local_network.x)[0] 119 | grads_and_vars = list(zip(clipped_gradients, self.global_network.parameters)) 120 | self.train_step = [tf.train.AdamOptimizer(learning_rate).apply_gradients(grads_and_vars), 121 | self.global_step.assign_add(batch_size)] 122 | 123 | # Synchronize the local network with the global network. 124 | self.reset_local_network = [local_p.assign(global_p) 125 | for local_p, global_p in zip(self.local_network.parameters, 126 | self.global_network.parameters)] 127 | 128 | tf.summary.scalar('model/loss', self.loss / tf.to_float(batch_size)) 129 | tf.summary.scalar('model/policy_loss', policy_loss / tf.to_float(batch_size)) 130 | tf.summary.scalar('model/value_loss', value_loss / tf.to_float(batch_size)) 131 | tf.summary.scalar('model/entropy', entropy / tf.to_float(batch_size)) 132 | tf.summary.scalar('model/global_norm', tf.global_norm(self.local_network.parameters)) 133 | tf.summary.scalar('model/gradient_global_norm', tf.global_norm(gradients)) 134 | self.summary_step = tf.summary.merge_all() 135 | 136 | def _get_experiences(self): 137 | states = [] 138 | actions = [] 139 | rewards = [] 140 | values = [] 141 | 142 | if self.env.done: 143 | self.env.reset() 144 | 145 | lstm_state = self.local_network.get_initial_lstm_state() 146 | 147 | for _ in range(self.num_local_steps): 148 | state = self.env.get_state() 149 | action, value, lstm_state = self.local_network.sample_action(state, lstm_state) 150 | reward = self.env.step(self.env.action_space[action]) 151 | 152 | if self.render: 153 | self.env.render() 154 | 155 | # Store this experience. 156 | states.append(state) 157 | actions.append(action) 158 | rewards.append(reward) 159 | values.append(value) 160 | 161 | if self.env.done: 162 | LOGGER.info('Finished episode. Total reward: %d. Length: %d.', 163 | self.env.episode_reward, 164 | self.env.episode_length) 165 | 166 | summary = tf.Summary() 167 | summary.value.add(tag='environment/episode_length', 168 | simple_value=self.env.episode_length) 169 | summary.value.add(tag='environment/episode_reward', 170 | simple_value=self.env.episode_reward) 171 | summary.value.add(tag='environment/fps', 172 | simple_value=self.env.episode_length / self.env.episode_run_time) 173 | 174 | self.summary_writer.add_summary(summary, self.global_step.eval()) 175 | self.summary_writer.flush() 176 | break 177 | 178 | # Estimate discounted rewards. 179 | rewards = np.array(rewards) 180 | next_value = 0 if self.env.done else self.local_network.estimate_value(self.env.get_state(), 181 | lstm_state) 182 | discounted_rewards = _apply_discount(np.append(rewards, next_value), self.discount)[:-1] 183 | 184 | # Estimate advantages. 185 | values = np.array(values + [next_value]) 186 | advantages = _apply_discount(rewards + self.discount * values[1:] - values[:-1], 187 | self.discount) 188 | 189 | return np.array(states), np.array(actions), advantages, discounted_rewards 190 | 191 | def train(self, sess): 192 | """Performs a single learning step. 193 | 194 | Args: 195 | sess: A TensorFlow session. 196 | """ 197 | 198 | sess.run(self.reset_local_network) 199 | states, actions, advantages, discounted_rewards = self._get_experiences() 200 | feed_dict = {self.local_network.x: states, 201 | self.action: actions, 202 | self.advantage: advantages, 203 | self.discounted_reward: discounted_rewards, 204 | self.local_network.lstm_state: self.local_network.get_initial_lstm_state()} 205 | 206 | # Occasionally write summaries. 
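        # Only the chief worker (index 0) writes summaries, and only once every
        # summary_update_interval training steps.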
207 | if self.worker_index == 0 and self.num_times_trained % self.summary_update_interval == 0: 208 | _, global_step, summary = sess.run( 209 | [self.train_step, self.global_step, self.summary_step], feed_dict) 210 | self.summary_writer.add_summary(tf.Summary.FromString(summary), global_step) 211 | self.summary_writer.flush() 212 | else: 213 | _, global_step = sess.run([self.train_step, self.global_step], feed_dict) 214 | 215 | self.num_times_trained += 1 216 | 217 | return global_step 218 | -------------------------------------------------------------------------------- /environment.py: -------------------------------------------------------------------------------- 1 | """Augments OpenAI Gym Atari environments by preprocessing observations. 2 | 3 | Heavily influenced by DeepMind's seminal paper 'Playing Atari with Deep Reinforcement Learning' 4 | (Mnih et al., 2013) and 'Human-level control through deep reinforcement learning' (Mnih et al., 5 | 2015). 6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | import time 11 | 12 | from scipy import misc 13 | 14 | 15 | # Specifies restricted action spaces. For games not in this dictionary, all actions are enabled. 16 | ACTION_SPACE = {'Pong-v0': [0, 2, 3], # NONE, UP and DOWN. 17 | 'Breakout-v0': [1, 2, 3]} # FIRE (respawn ball, otherwise NOOP), UP and DOWN. 18 | 19 | TESTING = 0 20 | TRAINING = 1 21 | 22 | 23 | def _preprocess_observation(observation): 24 | """Transforms the specified observation into a 47x47x1 grayscale image. 25 | 26 | Returns: 27 | A 47x47x1 tensor with float32 values between 0 and 1. 28 | """ 29 | 30 | # Transform the observation into a grayscale image with values between 0 and 1. Use the simple 31 | # np.mean method instead of sophisticated luminance extraction techniques since they do not seem 32 | # to improve training. 33 | grayscale_observation = observation.mean(2) 34 | 35 | # Resize grayscale frame to a 47x47 matrix of 32-bit floats. 36 | resized_observation = misc.imresize(grayscale_observation, (47, 47)).astype(np.float32) 37 | 38 | return np.expand_dims(resized_observation, 2) 39 | 40 | 41 | class AtariWrapper: 42 | """Wraps over an Atari environment from OpenAI Gym and preprocesses observations.""" 43 | 44 | def __init__(self, env_name, mode, action_space=None): 45 | """Creates the wrapper. 46 | 47 | Args: 48 | env_name: Name of an OpenAI Gym Atari environment. 49 | action_space: A list of possible actions. If 'action_space' is 'None' and no default 50 | configuration exists for this environment, all actions will be allowed. 51 | mode: The context in which the environment is used. Can be either environment.TESTING or 52 | environment.TRAINING. 53 | """ 54 | 55 | if mode is not TESTING and mode is not TRAINING: 56 | raise ValueError(('Mode is invalid. 
Must be either environment.TESTING or ' 57 | 'environment.TRAINING.')) 58 | 59 | self.env = gym.make(env_name) 60 | self.mode = mode 61 | self.observation_space = [47, 47, 1] 62 | self.reset() 63 | 64 | if action_space: 65 | self.action_space = list(action_space) 66 | elif env_name in ACTION_SPACE: 67 | self.action_space = ACTION_SPACE[env_name] 68 | else: 69 | self.action_space = list(range(self.env.action_space.n)) 70 | 71 | def reset(self): 72 | """Resets the environment.""" 73 | 74 | self.done = False 75 | self.episode_reward = 0 76 | self.episode_length = 0 77 | self.state = _preprocess_observation(self.env.reset()) 78 | self.episode_start_time = time.time() 79 | self.episode_run_time = 0 80 | self.lives = None 81 | 82 | def step(self, action): 83 | """Performs the specified action. 84 | 85 | Returns: 86 | A reward signal which is either -1, 0 or 1. 87 | 88 | Raises: 89 | Exception: If the game ended. 90 | ValueError: If the action is not valid. 91 | """ 92 | 93 | if self.done: 94 | raise Exception('Game finished.') 95 | 96 | if action not in self.action_space: 97 | raise ValueError('Action "{}" is invalid. Valid actions: {}.'.format(action, 98 | self.action_space)) 99 | 100 | observation, reward, self.done, info = self.env.step(action) 101 | 102 | if self.mode is TRAINING and self.lives is not None and info['ale.lives'] < self.lives: 103 | # While training, treat loss of life as end of episode. 104 | self.done = True 105 | 106 | self.episode_reward += reward 107 | self.episode_length += 1 108 | self.state = _preprocess_observation(observation) 109 | self.episode_run_time = time.time() - self.episode_start_time 110 | self.lives = info['ale.lives'] 111 | 112 | return -1 if reward < 0 else 1 if reward > 0 else 0 113 | 114 | def render(self): 115 | """Draws the environment.""" 116 | 117 | self.env.render() 118 | 119 | def sample_action(self): 120 | """Samples a random action.""" 121 | 122 | return np.random.choice(self.action_space) 123 | 124 | def get_state(self): 125 | """Gets the current state. 126 | 127 | Returns: 128 | An observation (47x47x1 tensor with float32 values between 0 and 1). 129 | """ 130 | 131 | return self.state 132 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Installs the modules required to run the package.""" 2 | 3 | from setuptools import setup 4 | 5 | 6 | setup( 7 | name='Asynchronous Advantage Actor-Critic', 8 | version='1.0.0', 9 | url='https://github.com/andreimuntean/a3c', 10 | description='Deep reinforcement learning using an asynchronous advantage actor-critic model.', 11 | author='Andrei Muntean', 12 | keywords='deep learning machine reinforcement neural network a3c actor-critic openai', 13 | install_requires=['gym[atari]', 'numpy', 'pillow', 'scipy', 'tensorflow'] 14 | ) 15 | -------------------------------------------------------------------------------- /thread.py: -------------------------------------------------------------------------------- 1 | """Trains an agent to play Atari games from OpenAI Gym. 2 | 3 | Heavily influenced by DeepMind's seminal paper 'Asynchronous Methods for Deep Reinforcement 4 | Learning' (Mnih et al., 2016). 
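
Run this script once without --worker_index to start the master (parameter server), then once per
worker thread with --worker_index set. The included train.sh automates this inside a tmux session.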
5 | """ 6 | 7 | import agent 8 | import argparse 9 | import environment 10 | import logging 11 | import multiprocessing 12 | import os 13 | import signal 14 | import sys 15 | import tensorflow as tf 16 | import time 17 | 18 | LOGGER = logging.getLogger(__name__) 19 | LOGGER.setLevel(logging.INFO) 20 | PARSER = argparse.ArgumentParser(description='Train an agent to play Atari games.') 21 | 22 | PARSER.add_argument('--env_name', 23 | metavar='ENVIRONMENT', 24 | help='name of an OpenAI Gym Atari environment on which to train', 25 | default='Pong-v0') 26 | 27 | PARSER.add_argument('--worker_index', 28 | help='the index of this worker thread (if it is the master, leave it None)', 29 | type=int, 30 | default=None) 31 | 32 | PARSER.add_argument('--render', 33 | help='determines whether to display the game screen of each agent', 34 | type=bool, 35 | default=False) 36 | 37 | PARSER.add_argument('--action_space', 38 | nargs='+', 39 | help='restricts the number of possible actions', 40 | type=int) 41 | 42 | PARSER.add_argument('--log_dir', 43 | metavar='PATH', 44 | help='path to a directory where to save & restore the model and log events', 45 | default='models/tmp') 46 | 47 | PARSER.add_argument('--num_threads', 48 | metavar='THREADS', 49 | help='number of learning threads', 50 | type=int, 51 | default=multiprocessing.cpu_count()) 52 | 53 | PARSER.add_argument('--num_local_steps', 54 | metavar='TIME STEPS', 55 | help='number of experiences used per worker when updating the model', 56 | type=int, 57 | default=20) 58 | 59 | PARSER.add_argument('--num_global_steps', 60 | metavar='TIME STEPS', 61 | help='number of time steps trained for in total', 62 | type=int, 63 | default=50000000) 64 | 65 | PARSER.add_argument('--learning_rate', 66 | metavar='LAMBDA', 67 | help='rate at which the network learns from new examples', 68 | type=float, 69 | default=1e-4) 70 | 71 | PARSER.add_argument('--entropy_regularization', 72 | metavar='BETA', 73 | help='the strength of the entropy regularization term', 74 | type=float, 75 | default=0.01) 76 | 77 | PARSER.add_argument('--max_gradient_norm', 78 | metavar='DELTA', 79 | help='maximum value allowed for the L2-norms of gradients', 80 | type=float, 81 | default=40) 82 | 83 | PARSER.add_argument('--discount', 84 | metavar='GAMMA', 85 | help='discount factor for future rewards', 86 | type=float, 87 | default=0.99) 88 | 89 | PARSER.add_argument('--summary_update_interval', 90 | metavar='TRAINING STEPS', 91 | help='frequency at which summary data is updated when training', 92 | type=int, 93 | default=10) 94 | 95 | 96 | def get_cluster_def(num_threads): 97 | """Creates a cluster definition for 1 master (parameter server) and num_threads workers.""" 98 | 99 | port = 14000 100 | localhost = '127.0.0.1' 101 | cluster = {'master': ['{}:{}'.format(localhost, port)], 102 | 'thread': []} 103 | 104 | for _ in range(num_threads): 105 | port += 1 106 | cluster['thread'].append('{}:{}'.format(localhost, port)) 107 | 108 | return tf.train.ClusterSpec(cluster).as_cluster_def() 109 | 110 | 111 | def run_worker(args): 112 | """Starts a worker thread that learns how to play the specified Atari game.""" 113 | 114 | cluster_def = get_cluster_def(args.num_threads) 115 | config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=2) 116 | server = tf.train.Server(cluster_def, 'thread', args.worker_index, config=config) 117 | 118 | # Configure the supervisor. 
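    # Worker 0 acts as the chief: it initializes the shared model variables and saves checkpoints.
    # Every worker writes its own event files to a thread-specific subdirectory of log_dir.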
119 | is_chief = args.worker_index == 0 120 | checkpoint_dir = os.path.join(args.log_dir, 'checkpoint') 121 | thread_dir = os.path.join(args.log_dir, 'thread-{}'.format(args.worker_index)) 122 | summary_writer = tf.summary.FileWriter(thread_dir) 123 | global_variables_initializer = tf.global_variables_initializer() 124 | init_fn = lambda sess: sess.run(global_variables_initializer) 125 | 126 | # Initialize the model. 127 | env = environment.AtariWrapper(args.env_name, environment.TRAINING, args.action_space) 128 | player = agent.Agent(args.worker_index, 129 | env, 130 | args.render, 131 | args.num_local_steps, 132 | args.learning_rate, 133 | args.entropy_regularization, 134 | args.max_gradient_norm, 135 | args.discount, 136 | summary_writer, 137 | args.summary_update_interval) 138 | 139 | # Local copies of the model will not be saved. 140 | model_variables = [var for var in tf.global_variables() if not var.name.startswith('local')] 141 | 142 | supervisor = tf.train.Supervisor(ready_op=tf.report_uninitialized_variables(model_variables), 143 | is_chief=is_chief, 144 | init_op=tf.variables_initializer(model_variables), 145 | logdir=checkpoint_dir, 146 | summary_op=None, 147 | saver=tf.train.Saver(model_variables), 148 | global_step=player.global_step, 149 | save_summaries_secs=30, 150 | save_model_secs=30, 151 | summary_writer=summary_writer, 152 | init_fn=init_fn) 153 | 154 | config = tf.ConfigProto(device_filters=['/job:master', 155 | '/job:thread/task:{}/cpu:0'.format(args.worker_index)]) 156 | 157 | LOGGER.info('Starting worker. This may take a while.') 158 | with supervisor.managed_session(server.target, config=config) as sess, sess.as_default(): 159 | global_step = 0 160 | while not supervisor.should_stop() and global_step < args.num_global_steps: 161 | global_step = player.train(sess) 162 | 163 | supervisor.stop() 164 | LOGGER.info('Stopped after %d global steps.', player.global_step) 165 | 166 | 167 | def main(args): 168 | """Trains an agent to play Atari games.""" 169 | 170 | # Ensure that threads are terminated gracefully. 171 | shutdown_thread = lambda signal, stack_frame: sys.exit(signal + 128) 172 | signal.signal(signal.SIGHUP, shutdown_thread) 173 | 174 | is_master = args.worker_index is None 175 | 176 | if is_master: 177 | cluster_def = get_cluster_def(args.num_threads) 178 | config = tf.ConfigProto(device_filters=['/job:master']) 179 | server = tf.train.Server(cluster_def, 'master', config=config) 180 | LOGGER.info('Started master thread.') 181 | 182 | # Keep master thread running since worker threads depend on it. 183 | while True: 184 | time.sleep(1000) 185 | 186 | # Otherwise, this is a worker. 187 | run_worker(args) 188 | 189 | 190 | if __name__ == '__main__': 191 | main(PARSER.parse_args()) 192 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ENV_NAME="Pong-v0" 4 | LOG_DIR="./models/$ENV_NAME" 5 | TMUX_SESSION_NAME="a3c" 6 | NUM_THREADS=$(nproc --all) 7 | TENSORBOARD_PORT=15000 8 | 9 | # Create the log directory. 10 | mkdir -p $LOG_DIR 11 | 12 | # Kill previous tmux session. Ignore potential "can't find session" messages. 13 | tmux kill-session -t $TMUX_SESSION_NAME 2> /dev/null 14 | 15 | # Initialize a new tmux session. 16 | tmux new-session -s $TMUX_SESSION_NAME -n master -d 17 | 18 | # Create a window for each learning thread. 
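# Windows are numbered 0..NUM_THREADS-1, matching the --worker_index flag passed to thread.py below.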
19 | for thread_id in $(seq 0 $(($NUM_THREADS - 1))); do 20 | tmux new-window -t $TMUX_SESSION_NAME -n thread-$thread_id 21 | done 22 | 23 | # Create a window for TensorBoard. 24 | tmux new-window -t $TMUX_SESSION_NAME -n tensorboard 25 | 26 | # Create a window for observing hardware usage. 27 | tmux new-window -t $TMUX_SESSION_NAME -n htop 28 | 29 | # Wait for tmux to finish setting up. 30 | sleep 1 31 | 32 | # Start the master thread, which synchronizes worker threads. 33 | tmux send-keys -t $TMUX_SESSION_NAME:master "/usr/bin/python3 thread.py" \ 34 | " --env_name=$ENV_NAME" \ 35 | " --log_dir=$LOG_DIR" \ 36 | " --num_threads=$NUM_THREADS" \ 37 | " $@" Enter 38 | 39 | # Start worker threads. 40 | for thread_id in $(seq 0 $(($NUM_THREADS - 1))); do 41 | tmux send-keys -t $TMUX_SESSION_NAME:thread-$thread_id \ 42 | "/usr/bin/python3 thread.py" \ 43 | " --env_name=$ENV_NAME" \ 44 | " --log_dir=$LOG_DIR" \ 45 | " --num_threads=$NUM_THREADS" \ 46 | " --worker_index=$thread_id" \ 47 | " $@" Enter 48 | done 49 | 50 | # Start TensorBoard. 51 | tmux send-keys -t $TMUX_SESSION_NAME:tensorboard "tensorboard" \ 52 | " --port $TENSORBOARD_PORT" \ 53 | " --logdir $LOG_DIR" Enter 54 | 55 | # Start htop. 56 | tmux send-keys -t $TMUX_SESSION_NAME:htop htop Enter 57 | 58 | echo "Started the learning session." 59 | echo "Started TensorBoard at localhost:$TENSORBOARD_PORT." 60 | echo "Use 'tmux attach -t $TMUX_SESSION_NAME' to connect to the session." 61 | echo "Use 'tmux kill-session -t $TMUX_SESSION_NAME' to end the session." 62 | --------------------------------------------------------------------------------
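As a rough alternative to train.sh, the same cluster can be launched by hand. A minimal sketch, assuming `python3` and two worker threads (the master binds port 14000, the workers bind 14001 and 14002):

```sh
# Master (parameter server): started without --worker_index, it blocks and serves the shared model.
python3 thread.py --env_name=Pong-v0 --log_dir=models/Pong-v0 --num_threads=2 &

# Workers: each one needs the same --num_threads so that every process builds the same cluster spec.
python3 thread.py --env_name=Pong-v0 --log_dir=models/Pong-v0 --num_threads=2 --worker_index=0 &
python3 thread.py --env_name=Pong-v0 --log_dir=models/Pong-v0 --num_threads=2 --worker_index=1 &
```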