├── README.md
├── reward.py
├── noise.py
├── replay_buffer.py
├── actor.py
├── critic.py
└── ddpg.py

/README.md:
--------------------------------------------------------------------------------
# Implementation of DDPG - Deep Deterministic Policy Gradient

Modified from the work of Patrick Emami: [Deep Deterministic Policy Gradients in TensorFlow](http://pemami4911.github.io/blog/2016/08/21/ddpg-rl.html)

Algorithm and hyperparameter details can be found here: ["Continuous control with deep reinforcement learning" - TP Lillicrap, JJ Hunt et al., 2015](http://arxiv.org/abs/1509.02971)

Tested on [CartPole](https://gym.openai.com/envs/CartPole-v0) & [Pendulum](https://gym.openai.com/envs/Pendulum-v0)

### Requirements
[Gym](https://github.com/openai/gym#installation) and [TensorFlow](https://www.tensorflow.org/install/).

### Modifications
- Removed TFLearn dependency
- Added Ornstein-Uhlenbeck noise function
- Added reward discounting
- Works with discrete and continuous action spaces
--------------------------------------------------------------------------------

/reward.py:
--------------------------------------------------------------------------------
import scipy.signal

# ===========================
# Set rewards
# ===========================
class Reward(object):

    def __init__(self, factor, gamma):
        # Reward parameters
        self.factor = factor
        self.gamma = gamma

    # Set every step reward to the (scaled) total episode reward
    def total(self, ep_batch, tot_reward):
        for step in ep_batch:
            step[2] = tot_reward * self.factor
        return ep_batch

    # Replace each step reward with the discounted return from that step
    def discount(self, ep_batch):
        x = ep_batch[:, 2]

        # Filter the reversed reward sequence: y[t] = x[t] + gamma * y[t+1]
        discounted = scipy.signal.lfilter([1], [1, -self.gamma], x[::-1], axis=0)[::-1]
        discounted *= self.factor

        for i in range(len(discounted)):
            ep_batch[i, 2] = discounted[i]

        return ep_batch
--------------------------------------------------------------------------------
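
To make the reward shaping concrete, here is a short usage sketch of `Reward.discount` (this snippet is not part of the original repo): it replaces each step reward with the discounted return, scaled by `factor`. The batch layout mirrors the episode buffer built in ddpg.py (columns s, a, r, terminal, s2); all values are arbitrary.

import numpy as np
from reward import Reward

reward = Reward(factor=1.0, gamma=0.99)

# Toy episode with columns (s, a, r, terminal, s2); only the reward
# column (index 2) matters to discount().
ep_batch = np.zeros((4, 5))
ep_batch[:, 2] = [0.0, 0.0, 0.0, 1.0]

ep_batch = reward.discount(ep_batch)
print(ep_batch[:, 2])  # roughly [0.970, 0.980, 0.990, 1.0]: the discounted return from each step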

/noise.py:
--------------------------------------------------------------------------------
# ===========================
# Noise - Ornstein-Uhlenbeck
# Modified from: http://www.turingfinance.com/random-walks-down-wall-street-stochastic-processes-in-python/
# Author: Liam Pettigrew
# ===========================
import numpy as np

class Noise(object):

    def __init__(self, delta, sigma, ou_a, ou_mu):
        # Noise parameters
        self.delta = delta
        self.sigma = sigma
        self.ou_a = ou_a
        self.ou_mu = ou_mu

    def brownian_motion_log_returns(self):
        """
        This method returns a Wiener process. The Wiener process is also called Brownian motion. For more information
        about the Wiener process check out the Wikipedia page: http://en.wikipedia.org/wiki/Wiener_process
        :return: brownian motion log returns
        """
        sqrt_delta_sigma = np.sqrt(self.delta) * self.sigma
        return np.random.normal(loc=0, scale=sqrt_delta_sigma, size=None)

    def ornstein_uhlenbeck_level(self, prev_ou_level):
        """
        This method returns the rate levels of a mean-reverting Ornstein-Uhlenbeck process.
        :return: the Ornstein-Uhlenbeck level
        """
        drift = self.ou_a * (self.ou_mu - prev_ou_level) * self.delta
        randomness = self.brownian_motion_log_returns()
        return prev_ou_level + drift + randomness
--------------------------------------------------------------------------------
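
A quick usage sketch of the `Noise` class (not part of the original repo): the level is a random walk that is pulled back towards `ou_mu` at rate `ou_a` on every step. The parameter values simply mirror the defaults used in ddpg.py.

from noise import Noise

noise = Noise(delta=0.5, sigma=0.5, ou_a=3.0, ou_mu=0.0)

ou_level = 0.0
for _ in range(5):
    ou_level = noise.ornstein_uhlenbeck_level(ou_level)
    print(ou_level)  # mean-reverting exploration signal around ou_mu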

/replay_buffer.py:
--------------------------------------------------------------------------------
"""
Data structure for implementing experience replay
Author: Patrick Emami
"""
from collections import deque
import random
import numpy as np

class ReplayBuffer(object):

    def __init__(self, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            # Buffer is full: drop the oldest experience
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch])
        t_batch = np.array([_[3] for _ in batch])
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def clear(self):
        self.buffer.clear()
        self.count = 0
--------------------------------------------------------------------------------
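
A small usage sketch of the replay buffer (not from the original repo; the 3-dimensional states and 1-dimensional actions are illustrative only):

import numpy as np
from replay_buffer import ReplayBuffer

buffer = ReplayBuffer(buffer_size=1000)
for _ in range(10):
    s = np.random.rand(3)
    s2 = np.random.rand(3)
    a = np.random.rand(1)
    buffer.add(s, a, 0.0, False, s2)

s_b, a_b, r_b, t_b, s2_b = buffer.sample_batch(4)
print(s_b.shape, a_b.shape)  # (4, 3) (4, 1)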

/actor.py:
--------------------------------------------------------------------------------
# ===========================
# Actor DNN
# ===========================
import tensorflow as tf

# Network Parameters - Hidden layers
n_hidden_1 = 400
n_hidden_2 = 300

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.03, shape=shape)
    return tf.Variable(initial)

class ActorNetwork(object):
    """
    Input to the network is the state, output is the action
    under a deterministic policy.
    The output layer activation is a tanh, which keeps the action
    between -1 and 1 before it is scaled by action_bound.
    """

    def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        # Actor Network
        self.inputs, self.out, self.scaled_out = self.create_actor_network()

        self.network_params = tf.trainable_variables()

        # Target Network
        self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network()

        self.target_network_params = tf.trainable_variables()[len(self.network_params):]

        # Op for periodically updating target network with online network weights
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # This gradient will be provided by the critic network
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

        # Combine the gradients here
        self.actor_gradients = tf.gradients(self.scaled_out, self.network_params, -self.action_gradient)

        # Optimization Op by applying gradient, variable pairs
        self.optimize = tf.train.AdamOptimizer(self.learning_rate). \
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = len(self.network_params) + len(self.target_network_params)

    def create_actor_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])

        # Input -> Hidden Layer
        w1 = weight_variable([self.s_dim, n_hidden_1])
        b1 = bias_variable([n_hidden_1])
        # Hidden Layer -> Hidden Layer
        w2 = weight_variable([n_hidden_1, n_hidden_2])
        b2 = bias_variable([n_hidden_2])
        # Hidden Layer -> Output
        w3 = weight_variable([n_hidden_2, self.a_dim])
        b3 = bias_variable([self.a_dim])

        # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h2 = tf.nn.relu(tf.matmul(h1, w2) + b2)

        # Run tanh on output to get -1 to 1
        out = tf.nn.tanh(tf.matmul(h2, w3) + b3)

        # Scale output to -action_bound to action_bound
        scaled_out = tf.multiply(out, self.action_bound)
        return inputs, out, scaled_out

    def train(self, inputs, a_gradient):
        self.sess.run(self.optimize, feed_dict={
            self.inputs: inputs,
            self.action_gradient: a_gradient
        })

    def predict(self, inputs):
        return self.sess.run(self.scaled_out, feed_dict={
            self.inputs: inputs
        })

    def predict_target(self, inputs):
        return self.sess.run(self.target_scaled_out, feed_dict={
            self.target_inputs: inputs
        })

    def update_target_network(self):
        self.sess.run(self.update_target_network_params)

    def get_num_trainable_vars(self):
        return self.num_trainable_vars
--------------------------------------------------------------------------------
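
The key training detail above is `tf.gradients(self.scaled_out, self.network_params, -self.action_gradient)`: the critic's gradient of Q with respect to the action is back-propagated through the policy network, and the minus sign turns Adam's descent into ascent on Q, which gives the deterministic policy gradient update. The sketch below (not part of the repo) only shows construction and a forward pass; it assumes TensorFlow 1.x, and the dimensions (3-D state, 1-D action, bound 2.0, roughly Pendulum-v0) are illustrative.

import numpy as np
import tensorflow as tf
from actor import ActorNetwork

with tf.Session() as sess:
    actor = ActorNetwork(sess, state_dim=3, action_dim=1, action_bound=2.0,
                         learning_rate=0.0001, tau=0.001)
    sess.run(tf.global_variables_initializer())
    actor.update_target_network()        # blend online weights into the target network
    a = actor.predict(np.zeros((1, 3)))  # deterministic action for a batch of one state
    print(a.shape)                       # (1, 1), values scaled into [-2, 2]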

/critic.py:
--------------------------------------------------------------------------------
# ===========================
# Critic DNN
# ===========================
import tensorflow as tf

# Network Parameters - Hidden layers
n_hidden_1 = 400
n_hidden_2 = 300

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.03, shape=shape)
    return tf.Variable(initial)

class CriticNetwork(object):
    """
    Input to the network is the state and action, output is Q(s,a).
    The action must be obtained from the output of the Actor network.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network()

        self.target_network_params = tf.trainable_variables()[(len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network weights
        self.update_target_network_params = \
            [self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) + tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss (RMSE between the target y_i and the predicted Q value) and optimization Op
        self.loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(self.predicted_q_value, self.out))))
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action
        self.action_grads = tf.gradients(self.out, self.action)

    def create_critic_network(self):
        inputs = tf.placeholder(tf.float32, [None, self.s_dim])
        action = tf.placeholder(tf.float32, [None, self.a_dim])

        # Input -> Hidden Layer
        w1 = weight_variable([self.s_dim, n_hidden_1])
        b1 = bias_variable([n_hidden_1])
        # Hidden Layer -> Hidden Layer + Action
        w2 = weight_variable([n_hidden_1, n_hidden_2])
        w2a = weight_variable([self.a_dim, n_hidden_2])
        b2 = bias_variable([n_hidden_2])
        # Hidden Layer -> Output (Q)
        w3 = weight_variable([n_hidden_2, 1])
        b3 = bias_variable([1])

        # 1st Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        h1 = tf.nn.relu(tf.matmul(inputs, w1) + b1)
        # 2nd Hidden layer, OPTION: Softmax, relu, tanh or sigmoid
        # Action inserted here
        h2 = tf.nn.relu(tf.matmul(h1, w2) + tf.matmul(action, w2a) + b2)

        out = tf.matmul(h2, w3) + b3

        return inputs, action, out

    def train(self, inputs, action, predicted_q_value):
        return self.sess.run([self.out, self.optimize], feed_dict={
            self.inputs: inputs,
            self.action: action,
            self.predicted_q_value: predicted_q_value
        })

    def predict(self, inputs, action):
        return self.sess.run(self.out, feed_dict={
            self.inputs: inputs,
            self.action: action
        })

    def predict_target(self, inputs, action):
        return self.sess.run(self.target_out, feed_dict={
            self.target_inputs: inputs,
            self.target_action: action
        })

    def action_gradients(self, inputs, actions):
        return self.sess.run(self.action_grads, feed_dict={
            self.inputs: inputs,
            self.action: actions
        })

    def update_target_network(self):
        self.sess.run(self.update_target_network_params)
--------------------------------------------------------------------------------
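
One ordering subtlety: the critic locates its own variables by slicing `tf.trainable_variables()` past the actor's variable count, so the actor must be built first and `actor.get_num_trainable_vars()` passed in, exactly as ddpg.py does. A minimal construction sketch (not part of the repo, TensorFlow 1.x assumed, dimensions illustrative):

import numpy as np
import tensorflow as tf
from actor import ActorNetwork
from critic import CriticNetwork

with tf.Session() as sess:
    # Actor first, then critic, so the critic's variable slicing is correct
    actor = ActorNetwork(sess, 3, 1, 2.0, 0.0001, 0.001)
    critic = CriticNetwork(sess, 3, 1, 0.001, 0.001,
                           actor.get_num_trainable_vars())
    sess.run(tf.global_variables_initializer())
    critic.update_target_network()
    q = critic.predict(np.zeros((1, 3)), np.zeros((1, 1)))
    print(q.shape)  # (1, 1): Q(s, a) for a single state-action pair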

/ddpg.py:
--------------------------------------------------------------------------------
# ================================================
# Modified from the work of Patrick Emami:
# Implementation of DDPG - Deep Deterministic Policy Gradient
# Algorithm and hyperparameter details can be found here:
# http://arxiv.org/pdf/1509.02971v2.pdf
#
# Removed TFLearn dependency
# Added Ornstein-Uhlenbeck noise function
# Added reward discounting
# Works with discrete action spaces (CartPole)
# Tested on CartPole-v0 & -v1 & Pendulum-v0
# Author: Liam Pettigrew
# ================================================
import tensorflow as tf
import numpy as np
import gym

from replay_buffer import ReplayBuffer
from noise import Noise
from reward import Reward
from actor import ActorNetwork
from critic import CriticNetwork


# ==========================
# Training Parameters
# ==========================
# Maximum episodes run
MAX_EPISODES = 1000
# Max episode length
MAX_EP_STEPS = 1000
# Episodes with noise
NOISE_MAX_EP = 200
# Noise parameters - Ornstein-Uhlenbeck
DELTA = 0.5  # The rate of change (time)
SIGMA = 0.5  # Volatility of the stochastic processes
OU_A = 3.    # The rate of mean reversion
OU_MU = 0.   # The long run average interest rate
# Reward parameters
REWARD_FACTOR = 0.1  # Total episode reward factor
# Base learning rate for the Actor network
ACTOR_LEARNING_RATE = 0.0001
# Base learning rate for the Critic network
CRITIC_LEARNING_RATE = 0.001
# Discount factor
GAMMA = 0.99
# Soft target update param
TAU = 0.001

# ===========================
# Utility Parameters
# ===========================
# Render gym env during training
RENDER_ENV = False
# Use Gym Monitor
GYM_MONITOR_EN = True
# Gym environment
ENV_NAME = 'CartPole-v0'   # Discrete: Reward factor = 0.1
#ENV_NAME = 'CartPole-v1'  # Discrete: Reward factor = 0.1
#ENV_NAME = 'Pendulum-v0'  # Continuous: Reward factor = 0.01
# Directory for storing gym results
MONITOR_DIR = './results/' + ENV_NAME
# Directory for storing tensorboard summary results
SUMMARY_DIR = './results/tf_ddpg'
RANDOM_SEED = 1234
# Size of replay buffer
BUFFER_SIZE = 100000
MINIBATCH_SIZE = 100

# ===========================
# Tensorflow Summary Ops
# ===========================
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()

    return summary_ops, summary_vars

# ===========================
# Agent Training
# ===========================
def train(sess, env, actor, critic, noise, reward, discrete):
    # Set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        # Clear episode buffer
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Accumulate the raw episode reward
            ep_reward += r

            episode_buffer = np.append(episode_buffer, [[s, a, r, terminal, s2]], axis=0)

            # Start training once the replay buffer holds more than
            # minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2

            if terminal:
                # Reward system for episode
                #episode_buffer = reward.total(episode_buffer, ep_reward)
                episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay buffer
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim,)), np.reshape(step[1], (actor.a_dim,)), step[2],
                                      step[3], np.reshape(step[4], (actor.s_dim,)))

                summary = tf.Summary()
                summary.value.add(tag='Perf/Reward', simple_value=float(ep_reward))
                summary.value.add(tag='Perf/Qmax', simple_value=float(ep_ave_max_q / float(j)))
                summary_writer.add_summary(summary, i)

                summary_writer.flush()

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))

                break
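

# Note on the replay scheme above: unlike the per-step insertion used in
# Lillicrap et al., transitions are first collected in episode_buffer and only
# pushed into the replay buffer once the episode terminates, after
# Reward.discount() has replaced each step reward with its discounted return.
# Minibatch updates therefore only ever sample from previously completed episodes.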


def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert np.all(env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except AttributeError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env.monitor.start(MONITOR_DIR, video_callable=False, force=True)
            else:
                env.monitor.start(MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.monitor.close()


if __name__ == '__main__':
    tf.app.run()
--------------------------------------------------------------------------------