├── README.md
├── reward.py
├── noise.py
├── replay_buffer.py
├── actor.py
├── critic.py
└── ddpg.py

/README.md:
--------------------------------------------------------------------------------
# Implementation of DDPG - Deep Deterministic Policy Gradient

Modified from the work of Patrick Emami: [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html)

Algorithm and hyperparameter details can be found here: ["Continuous control with deep reinforcement learning" - TP Lillicrap, JJ Hunt et al., 2015](http://arxiv.org/abs/1509.02971)

Tested on [CartPole](https://gym.openai.com/envs/CartPole-v0) & [Pendulum](https://gym.openai.com/envs/Pendulum-v0)

### Requirements
[Gym](https://github.com/openai/gym#installation) and [TensorFlow](https://www.tensorflow.org/install/).

### Modifications
- Removed TFLearn dependency
- Added Ornstein-Uhlenbeck noise function
- Added reward discounting
- Works with discrete and continuous action spaces
--------------------------------------------------------------------------------

/reward.py:
--------------------------------------------------------------------------------
import scipy.signal

# ===========================
# Set rewards
# ===========================
class Reward(object):

    def __init__(self, factor, gamma):
        # Reward parameters
        self.factor = factor
        self.gamma = gamma

    # Set every step reward to the (scaled) total episode reward
    def total(self, ep_batch, tot_reward):
        for step in ep_batch:
            step[2] = tot_reward * self.factor
        return ep_batch

    # Replace each step reward with the discounted return from that step
    def discount(self, ep_batch):
        x = ep_batch[:, 2]

        # Filter the reversed reward sequence: y[t] = x[t] + gamma * y[t+1]
        discounted = scipy.signal.lfilter([1], [1, -self.gamma], x[::-1], axis=0)[::-1]
        discounted *= self.factor

        for i in range(len(discounted)):
            ep_batch[i, 2] = discounted[i]

        return ep_batch
--------------------------------------------------------------------------------
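
To make the reward shaping concrete, here is a short usage sketch of `Reward.discount` (this snippet is not part of the original repo): it replaces each step reward with the discounted return, scaled by `factor`. The batch layout mirrors the episode buffer built in ddpg.py (columns s, a, r, terminal, s2); all values are arbitrary.

import numpy as np
from reward import Reward

reward = Reward(factor=1.0, gamma=0.99)

# Toy episode with columns (s, a, r, terminal, s2); only the reward
# column (index 2) matters to discount().
ep_batch = np.zeros((4, 5))
ep_batch[:, 2] = [0.0, 0.0, 0.0, 1.0]

ep_batch = reward.discount(ep_batch)
print(ep_batch[:, 2])  # roughly [0.970, 0.980, 0.990, 1.0]: the discounted return from each step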

/noise.py:
--------------------------------------------------------------------------------
# ===========================
# Noise - Ornstein-Uhlenbeck
# Modified from: http://www.turingfinance.com/random-walks-down-wall-street-stochastic-processes-in-python/
# Author: Liam Pettigrew
# ===========================
import numpy as np

class Noise(object):

    def __init__(self, delta, sigma, ou_a, ou_mu):
        # Noise parameters
        self.delta = delta
        self.sigma = sigma
        self.ou_a = ou_a
        self.ou_mu = ou_mu

    def brownian_motion_log_returns(self):
        """
        This method returns a Wiener process. The Wiener process is also called Brownian motion. For more information
        about the Wiener process check out the Wikipedia page: http://en.wikipedia.org/wiki/Wiener_process
        :return: brownian motion log returns
        """
        sqrt_delta_sigma = np.sqrt(self.delta) * self.sigma
        return np.random.normal(loc=0, scale=sqrt_delta_sigma, size=None)

    def ornstein_uhlenbeck_level(self, prev_ou_level):
        """
        This method returns the rate levels of a mean-reverting Ornstein-Uhlenbeck process.
        :return: the Ornstein-Uhlenbeck level
        """
        drift = self.ou_a * (self.ou_mu - prev_ou_level) * self.delta
        randomness = self.brownian_motion_log_returns()
        return prev_ou_level + drift + randomness
--------------------------------------------------------------------------------
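
A quick usage sketch of the `Noise` class (not part of the original repo): the level is a random walk that is pulled back towards `ou_mu` at rate `ou_a` on every step. The parameter values simply mirror the defaults used in ddpg.py.

from noise import Noise

noise = Noise(delta=0.5, sigma=0.5, ou_a=3.0, ou_mu=0.0)

ou_level = 0.0
for _ in range(5):
    ou_level = noise.ornstein_uhlenbeck_level(ou_level)
    print(ou_level)  # mean-reverting exploration signal around ou_mu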

/replay_buffer.py:
--------------------------------------------------------------------------------
"""
Data structure for implementing experience replay
Author: Patrick Emami
"""
from collections import deque
import random
import numpy as np

class ReplayBuffer(object):

    def __init__(self, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            # Buffer is full: drop the oldest experience
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch])
        t_batch = np.array([_[3] for _ in batch])
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        self.buffer.clear()
        self.count = 0
--------------------------------------------------------------------------------
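
A small usage sketch of the replay buffer (not from the original repo; the 3-dimensional states and 1-dimensional actions are illustrative only):

import numpy as np
from replay_buffer import ReplayBuffer

buffer = ReplayBuffer(buffer_size=1000)
for _ in range(10):
    s = np.random.rand(3)
    s2 = np.random.rand(3)
    a = np.random.rand(1)
    buffer.add(s, a, 0.0, False, s2)

s_b, a_b, r_b, t_b, s2_b = buffer.sample_batch(4)
print(s_b.shape, a_b.shape)  # (4, 3) (4, 1)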

/actor.py:
--------------------------------------------------------------------------------
# ===========================
# Actor DNN
# ===========================
import tensorflow as tf

# Network Parameters - Hidden layers
n_hidden_1 = 400
n_hidden_2 = 300

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.03, shape=shape)
    return tf.Variable(initial)

class ActorNetwork(object):
    """
    Input to the network is the state, output is the action
    under a deterministic policy.
    The output layer activation is a tanh, which keeps the action
    between -1 and 1 before it is scaled by action_bound.
    """

    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # Actor Network
        self.inputs, self.out, self.scaled_out = self.create_actor_network()

        self.network_params = tf.trainable_variables()

        # Target Network
        self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network()

        self.target_network_params = tf.trainable_variables()[len(self.network_params):]

        # Op for periodically updating target network with online network weights
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # This gradient will be provided by the critic network
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

        # Combine the gradients here
        self.actor_gradients = tf.gradients(self.scaled_out, self.network_params, -self.action_gradient)

        # Optimization Op by applying gradient, variable pairs
        self.optimize = tf.train.AdamOptimizer(self.learning_rate). \
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = len(self.network_params) + len(self.target_network_params)

    def create_actor_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])

        # Input -> Hidden Layer
        w1 = weight_variable([self.s_dim, n_hidden_1])
        b1 = bias_variable([n_hidden_1])
        # Hidden Layer -> Hidden Layer
        w2 = weight_variable([n_hidden_1, n_hidden_2])
        b2 = bias_variable([n_hidden_2])
        # Hidden Layer -> Output
        w3 = weight_variable([n_hidden_2, self.a_dim])
        b3 = bias_variable([self.a_dim])

        # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h2 = tf.nn.relu(tf.matmul(h1, w2) + b2)

        # Run tanh on output to get -1 to 1
        out = tf.nn.tanh(tf.matmul(h2, w3) + b3)

        # Scale output to -action_bound to action_bound
        scaled_out = tf.multiply(out, self.action_bound)
        return inputs, out, scaled_out

    def train(self, inputs, a_gradient):
        self.sess.run(self.optimize, feed_dict={
            self.inputs: inputs,
            self.action_gradient: a_gradient
        })

    def predict(self, inputs):
        return self.sess.run(self.scaled_out, feed_dict={
            self.inputs: inputs
        })

    def predict_target(self, inputs):
        return self.sess.run(self.target_scaled_out, feed_dict={
            self.target_inputs: inputs
        })

    def update_target_network(self):
        self.sess.run(self.update_target_network_params)

    def get_num_trainable_vars(self):
        return self.num_trainable_vars
--------------------------------------------------------------------------------
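
The key training detail above is `tf.gradients(self.scaled_out, self.network_params, -self.action_gradient)`: the critic's gradient of Q with respect to the action is back-propagated through the policy network, and the minus sign turns Adam's descent into ascent on Q, which gives the deterministic policy gradient update. The sketch below (not part of the repo) only shows construction and a forward pass; it assumes TensorFlow 1.x, and the dimensions (3-D state, 1-D action, bound 2.0, roughly Pendulum-v0) are illustrative.

import numpy as np
import tensorflow as tf
from actor import ActorNetwork

with tf.Session() as sess:
    actor = ActorNetwork(sess, state_dim=3, action_dim=1, action_bound=2.0,
                         learning_rate=0.0001, tau=0.001)
    sess.run(tf.global_variables_initializer())
    actor.update_target_network()        # blend online weights into the target network
    a = actor.predict(np.zeros((1, 3)))  # deterministic action for a batch of one state
    print(a.shape)                       # (1, 1), values scaled into [-2, 2]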

/critic.py:
--------------------------------------------------------------------------------
# ===========================
# Critic DNN
# ===========================
import tensorflow as tf

# Network Parameters - Hidden layers
n_hidden_1 = 400
n_hidden_2 = 300

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.03, shape=shape)
    return tf.Variable(initial)

class CriticNetwork(object):
    """
    Input to the network is the state and action, output is Q(s,a).
    The action must be obtained from the output of the Actor network.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()

        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network weights
        self.update_target_network_params = \
            [self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss (RMSE between the target y_i and the predicted Q value) and optimization Op
        self.loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(self.predicted_q_value, self.out))))
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action
        self.action_grads = tf.gradients(self.out, self.action)

    def create_critic_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
        action = tf.placeholder(tf.float32, [None, self.a_dim])

        # Input -> Hidden Layer
        w1 = weight_variable([self.s_dim, n_hidden_1])
        b1 = bias_variable([n_hidden_1])
        # Hidden Layer -> Hidden Layer + Action
        w2 = weight_variable([n_hidden_1, n_hidden_2])
        w2a = weight_variable([self.a_dim, n_hidden_2])
        b2 = bias_variable([n_hidden_2])
        # Hidden Layer -> Output (Q)
        w3 = weight_variable([n_hidden_2, 1])
        b3 = bias_variable([1])

        # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        # Action inserted here
        h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action, w2a) + b2)

        out = tf.matmul(h2, w3) + b3

        return inputs, action, out

    def train(self, inputs, action, predicted_q_value):
        return self.sess.run([self.out, self.optimize], feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value
        })

    def predict(self, inputs, action):
        return self.sess.run(self.out, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def predict_target(self, inputs, action):
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs,
            self.target_action: action
        })

    def action_gradients(self, inputs, actions):
        return self.sess.run(self.action_grads, feed_dict={
            self.inputs: inputs,
            self.action: actions
        })

    def update_target_network(self):
        self.sess.run(self.update_target_network_params)
--------------------------------------------------------------------------------
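
One ordering subtlety: the critic locates its own variables by slicing `tf.trainable_variables()` past the actor's variable count, so the actor must be built first and `actor.get_num_trainable_vars()` passed in, exactly as ddpg.py does. A minimal construction sketch (not part of the repo, TensorFlow 1.x assumed, dimensions illustrative):

import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from critic import CriticNetwork

with tf.Session() as sess:
    # Actor first, then critic, so the critic's variable slicing is correct
    actor = ActorNetwork(sess, 3, 1, 2.0, 0.0001, 0.001)
    critic = CriticNetwork(sess, 3, 1, 0.001, 0.001,
                           actor.get_num_trainable_vars())
    sess.run(tf.global_variables_initializer())
    critic.update_target_network()
    q = critic.predict(np.zeros((1, 3)), np.zeros((1, 1)))
    print(q.shape)  # (1, 1): Q(s, a) for a single state-action pair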

/ddpg.py:
--------------------------------------------------------------------------------
# ================================================
# Modified from the work of Patrick Emami:
# Implementation of DDPG - Deep Deterministic Policy Gradient
# Algorithm and hyperparameter details can be found here:
# http://arxiv.org/pdf/1509.02971v2.pdf
#
# Removed TFLearn dependency
# Added Ornstein-Uhlenbeck noise function
# Added reward discounting
# Works with discrete action spaces (CartPole)
# Tested on CartPole-v0 & -v1 & Pendulum-v0
# Author: Liam Pettigrew
# ================================================
import tensorflow as tf
import numpy as np
import gym

from replay_buffer import ReplayBuffer
from noise import Noise
from reward import Reward
from actor import ActorNetwork
from critic import CriticNetwork


# ==========================
# Training Parameters
# ==========================
# Maximum episodes run
MAX_EPISODES = 1000
# Max episode length
MAX_EP_STEPS = 1000
# Episodes with noise
NOISE_MAX_EP = 200
# Noise parameters - Ornstein-Uhlenbeck
DELTA = 0.5  # The rate of change (time)
SIGMA = 0.5  # Volatility of the stochastic processes
OU_A = 3.    # The rate of mean reversion
OU_MU = 0.   # The long run average interest rate
# Reward parameters
REWARD_FACTOR = 0.1  # Total episode reward factor
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.0001
# Base learning rate for the Critic network
CRITIC_LEARNING_RATE = 0.001
# Discount factor
GAMMA = 0.99
# Soft target update param
TAU = 0.001

# ===========================
# Utility Parameters
# ===========================
# Render gym env during training
RENDER_ENV = False
# Use Gym Monitor
GYM_MONITOR_EN = True
# Gym environment
ENV_NAME = 'CartPole-v0'   # Discrete: Reward factor = 0.1
#ENV_NAME = 'CartPole-v1'  # Discrete: Reward factor = 0.1
#ENV_NAME = 'Pendulum-v0'  # Continuous: Reward factor = 0.01
# Directory for storing gym results
MONITOR_DIR = './results/' + ENV_NAME
# Directory for storing tensorboard summary results
SUMMARY_DIR = './results/tf_ddpg'
RANDOM_SEED = 1234
# Size of replay buffer
BUFFER_SIZE = 100000
MINIBATCH_SIZE = 100

# ===========================
# Tensorflow Summary Ops
# ===========================
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()

    return summary_ops, summary_vars

# ===========================
# Agent Training
# ===========================
def train(sess, env, actor, critic, noise, reward, discrete):
    # Set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        # Clear episode buffer
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Accumulate the raw episode reward
            ep_reward += r

            episode_buffer = np.append(episode_buffer, [[s, a, r, terminal, s2]], axis=0)

            # Start training once the replay buffer holds more than
            # minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2

            if terminal:
                # Reward system for episode
                #episode_buffer = reward.total(episode_buffer, ep_reward)
                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim,)), np.reshape(step[1], (actor.a_dim,)), step[2],
                                      step[3], np.reshape(step[4], (actor.s_dim,)))

                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward', simple_value=float(ep_reward))
                summary.value.add(tag='Perf/Qmax', simple_value=float(ep_ave_max_q / float(j)))
                summary_writer.add_summary(summary, i)

                summary_writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
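

# Note on the replay scheme above: unlike the per-step insertion used in
# Lillicrap et al., transitions are first collected in episode_buffer and only
# pushed into the replay buffer once the episode terminates, after
# Reward.discount() has replaced each step reward with its discounted return.
# Minibatch updates therefore only ever sample from previously completed episodes.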


def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert np.all(env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except AttributeError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env.monitor.start(MONITOR_DIR, video_callable=False, force=True)
            else:
                env.monitor.start(MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.monitor.close()


if __name__ == '__main__':
    tf.app.run()
--------------------------------------------------------------------------------