├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── _config.yml ├── agent ├── __init__.py ├── ddpg.py └── reinforce.py ├── mechanism ├── __init__.py ├── ou_process.py └── replay_buffer.py ├── model ├── __init__.py ├── ddpg_actor.py ├── ddpg_critic.py ├── ddpg_model.py └── reinforce_model.py ├── run_ddpg.bat ├── run_ddpg.py └── run_reinforce.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # gym video 104 | /CartPole-v0* 105 | /Pendulum-v0* 106 | 107 | # test scripts 108 | test* 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 ligh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TensorAgent 2 | Author: Guohao Li 3 | 4 | Email: lightaime@gmail.com 5 | 6 | Deep reinforcement learning agents implemented by tensorflow 7 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/__init__.py -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/agent/__init__.py -------------------------------------------------------------------------------- /agent/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(object): 4 | def __init__(self, model, replay_buffer, exploration_noise, discout_factor, verbose=False): 5 | self.model = model 6 | self.replay_buffer = replay_buffer 7 | self.exploration_noise = exploration_noise 8 | self.discout_factor = discout_factor 9 | self.verbose = verbose 10 | 11 | def predict_action(self, observation): 12 | return self.model.predict_action(observation) 13 | 14 | def select_action(self, observation, p=None): 15 | pred_action = self.predict_action(observation) 16 | noise = self.exploration_noise.return_noise() 17 | if p is not None: 18 | return pred_action * p + noise * (1 - p) 19 | else: 20 | return pred_action + noise 21 | 22 | def store_transition(self, transition): 23 | self.replay_buffer.store_transition(transition) 24 | 25 | def init_process(self): 26 | self.exploration_noise.init_process() 27 | 28 | def get_transition_batch(self): 29 | batch = self.replay_buffer.get_batch() 30 | transpose_batch = list(zip(*batch)) 31 | s_batch = np.vstack(transpose_batch[0]) 32 | a_batch = np.vstack(transpose_batch[1]) 33 | r_batch = np.vstack(transpose_batch[2]) 34 | next_s_batch = np.vstack(transpose_batch[3]) 35 | done_batch = np.vstack(transpose_batch[4]) 36 | return s_batch, a_batch, r_batch, next_s_batch, done_batch 37 | 38 | def preprocess_batch(self, s_batch, a_batch, r_batch, next_s_batch, done_batch): 39 | target_actor_net_pred_action = self.model.actor.predict_action_target_net(next_s_batch) 40 | target_critic_net_pred_q = self.model.critic.predict_q_target_net(next_s_batch, target_actor_net_pred_action) 41 | y_batch = r_batch + self.discout_factor * target_critic_net_pred_q * (1 - done_batch) 42 | return s_batch, a_batch, y_batch 43 | 44 | def train_model(self): 45 | s_batch, a_batch, r_batch, next_s_batch, done_batch = self.get_transition_batch() 46 | self.model.update(*self.preprocess_batch(s_batch, a_batch, r_batch, next_s_batch, done_batch)) 47 | 48 | 49 | -------------------------------------------------------------------------------- /agent/reinforce.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | class Agent(object): 5 | def __init__(self, model, 
discout_factor, verbose=False): 6 | self.model = model 7 | self.discout_factor = discout_factor 8 | self.verbose = verbose 9 | self.sess = tf.Session() 10 | self.sess.run(tf.global_variables_initializer()) 11 | self.state_rollout = [] 12 | self.action_rollout = [] 13 | self.reward_rollout = [] 14 | self.done_rollout = [] 15 | 16 | def state_append(self, state): 17 | self.state_rollout.append(state) 18 | 19 | def action_append(self, action): 20 | self.action_rollout.append(action) 21 | 22 | def reward_append(self, reward): 23 | self.reward_rollout.append(reward) 24 | 25 | def predict_policy(self, observation): 26 | return self.model.predict_policy([observation], self.sess) 27 | 28 | def train_model(self): 29 | w = np.array([]) 30 | b = np.array([]) 31 | for i, sar in enumerate(zip(self.state_rollout, self.action_rollout, self.reward_rollout)): 32 | s, a, r = sar 33 | _, total_loss, policy_loss, base_line_loss = self.model.update([s], 34 | [a], 35 | [[sum(self.discout_factor**i_ * rwd for i_, rwd in enumerate(self.reward_rollout[i:]))]], 36 | self.sess) 37 | if i%10 == 0: 38 | print(base_line_loss) 39 | if self.verbose: 40 | if i%10 == 0: 41 | print(base_line_loss) 42 | print(total_loss) 43 | w_p, b_p = w.copy(), b.copy() 44 | w, b = model.run_layer_weight() 45 | if i > 0: 46 | print(w-w_p) 47 | print(b-b_p) 48 | 49 | def clear_rollout(self): 50 | del self.state_rollout[:] 51 | del self.action_rollout[:] 52 | del self.reward_rollout[:] 53 | 54 | -------------------------------------------------------------------------------- /mechanism/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/mechanism/__init__.py -------------------------------------------------------------------------------- /mechanism/ou_process.py: -------------------------------------------------------------------------------- 1 | ''' 2 | refer to openai 3 | https://github.com/rll/rllab/blob/master/rllab/exploration_strategies/ou_strategy.py 4 | ''' 5 | 6 | import numpy as np 7 | 8 | class OU_Process(object): 9 | def __init__(self, action_dim, theta=0.15, mu=0, sigma=0.2): 10 | self.action_dim = action_dim 11 | self.theta = theta 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.current_x = None 15 | 16 | self.init_process() 17 | 18 | def init_process(self): 19 | self.current_x = np.ones(self.action_dim) * self.mu 20 | 21 | def update_process(self): 22 | dx = self.theta * (self.mu - self.current_x) + self.sigma * np.random.randn(self.action_dim) 23 | self.current_x = self.current_x + dx 24 | 25 | def return_noise(self): 26 | self.update_process() 27 | return self.current_x 28 | 29 | if __name__ == "__main__": 30 | ou = OU_Process(3, theta=0.15, mu=0, sigma=0.2) 31 | states = [] 32 | for i in range(10000): 33 | states.append(ou.return_noise()[0]) 34 | import matplotlib.pyplot as plt 35 | 36 | plt.plot(states) 37 | plt.show() 38 | 39 | -------------------------------------------------------------------------------- /mechanism/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import random 3 | 4 | class Replay_Buffer(object): 5 | def __init__(self, buffer_size=10e6, batch_size=1): 6 | self.buffer_size = buffer_size 7 | self.batch_size = batch_size 8 | self.memory = deque(maxlen=buffer_size) 9 | 10 | def __call__(self): 11 | return self.memory 12 | 13 | def store_transition(self, transition): 14 | 
self.memory.append(transition) 15 | 16 | def store_transitions(self, transitions): 17 | self.memory.extend(transitions) 18 | 19 | def get_batch(self, batch_size=None): 20 | b_s = batch_size or self.batch_size 21 | cur_men_size = len(self.memory) 22 | if cur_men_size < b_s: 23 | return random.sample(list(self.memory), cur_men_size) 24 | else: 25 | return random.sample(list(self.memory), b_s) 26 | 27 | def memory_state(self): 28 | return {"buffer_size": self.buffer_size, 29 | "current_size": len(self.memory), 30 | "full": len(self.memory)==self.buffer_size} 31 | 32 | def empty_transition(self): 33 | self.memory.clear() 34 | 35 | if __name__ == '__main__': 36 | import numpy as np 37 | replay_buffer = Replay_Buffer(buffer_size=4) 38 | print(replay_buffer.memory_state()) 39 | replay_buffer.store_transition([1, 2, 3, 4, False]) 40 | print(replay_buffer.memory_state()) 41 | replay_buffer.store_transition([2, 2, 3, 4, False]) 42 | print(replay_buffer.memory_state()) 43 | replay_buffer.store_transition([3, 2, 3, 4, True]) 44 | print(replay_buffer.memory_state()) 45 | print(replay_buffer()) 46 | 47 | replay_buffer.store_transition([4, 2, 3, 4, True]) 48 | print(replay_buffer.memory_state()) 49 | print(replay_buffer()) 50 | 51 | replay_buffer.store_transitions([[5, 2, 3, 4, False], 52 | [6, 2, 3, 4, True]]) 53 | print(replay_buffer.memory_state()) 54 | print(replay_buffer()) 55 | 56 | batch = replay_buffer.get_batch(3) 57 | print("batch", batch) 58 | transpose_batch = list(zip(*batch)) 59 | print("transpose_batch", transpose_batch) 60 | s_batch = np.array(transpose_batch[0]) 61 | a_batch = list(transpose_batch[1]) 62 | r_batch = list(transpose_batch[2]) 63 | next_s_batch = list(transpose_batch[3]) 64 | done_batch = np.array(transpose_batch[4]) 65 | print("s_batch", s_batch) 66 | print("a_batch", a_batch) 67 | print("r_batch", r_batch) 68 | print("next_s_batch", next_s_batch) 69 | print("done_batch", done_batch) 70 | print((1-done_batch)*s_batch) 71 | 72 | replay_buffer.empty_transition() 73 | print(replay_buffer.memory_state()) 74 | print(replay_buffer()) 75 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightaime/TensorAgent/7176d7fa5cbc20d3d31e9c01f6c1424bd3501ecc/model/__init__.py -------------------------------------------------------------------------------- /model/ddpg_actor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from math import sqrt 3 | 4 | class DDPG_Actor(object): 5 | def __init__(self, state_dim, action_dim, optimizer=None, learning_rate=0.001, tau=0.001, scope="", sess=None): 6 | self.scope = scope 7 | self.sess = sess 8 | self.state_dim = state_dim 9 | self.action_dim = action_dim 10 | self.learning_rate = learning_rate 11 | self.l2_reg = 0.01 12 | self.optimizer = optimizer or tf.train.AdamOptimizer(self.learning_rate) 13 | self.tau = tau 14 | self.h1_dim = 400 15 | self.h2_dim = 300 16 | # self.h3_dim = 200 17 | self.activation = tf.nn.relu 18 | self.kernel_initializer = tf.contrib.layers.variance_scaling_initializer() 19 | # fan-out uniform initializer which is different from original paper 20 | self.kernel_initializer_1 = tf.random_uniform_initializer(minval=-1/sqrt(self.h1_dim), maxval=1/sqrt(self.h1_dim)) 21 | self.kernel_initializer_2 = tf.random_uniform_initializer(minval=-1/sqrt(self.h2_dim), maxval=1/sqrt(self.h2_dim)) 22 | 
self.kernel_initializer_3 = tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3) 23 | self.kernel_regularizer = tf.contrib.layers.l2_regularizer(self.l2_reg) 24 | 25 | with tf.name_scope("actor_input"): 26 | self.input_state = tf.placeholder(tf.float32, shape=[None, self.state_dim], name="states") 27 | 28 | with tf.name_scope("actor_label"): 29 | self.actions_grad = tf.placeholder(tf.float32, shape=[None, self.action_dim], name="actions_grad") 30 | 31 | self.source_var_scope = "ddpg/" + "actor_net" 32 | with tf.variable_scope(self.source_var_scope): 33 | self.action_output = self.__create_actor_network() 34 | 35 | self.target_var_scope = "ddpg/" + "actor_target_net" 36 | with tf.variable_scope(self.target_var_scope): 37 | self.target_net_actions_output = self.__create_target_network() 38 | 39 | with tf.name_scope("compute_policy_gradients"): 40 | self.__create_loss() 41 | 42 | self.train_op_scope = "actor_train_op" 43 | with tf.variable_scope(self.train_op_scope): 44 | self.__create_train_op() 45 | 46 | with tf.name_scope("actor_target_update_train_op"): 47 | self.__create_update_target_net_op() 48 | 49 | self.__create_get_layer_weight_op_source() 50 | self.__create_get_layer_weight_op_target() 51 | 52 | def __create_actor_network(self): 53 | h1 = tf.layers.dense(self.input_state, 54 | units=self.h1_dim, 55 | activation=self.activation, 56 | kernel_initializer=self.kernel_initializer_1, 57 | # kernel_initializer=self.kernel_initializer, 58 | kernel_regularizer=self.kernel_regularizer, 59 | name="hidden_1") 60 | 61 | h2 = tf.layers.dense(h1, 62 | units=self.h2_dim, 63 | activation=self.activation, 64 | kernel_initializer=self.kernel_initializer_2, 65 | # kernel_initializer=self.kernel_initializer, 66 | kernel_regularizer=self.kernel_regularizer, 67 | name="hidden_2") 68 | 69 | # h3 = tf.layers.dense(h2, 70 | # units=self.h3_dim, 71 | # activation=self.activation, 72 | # kernel_initializer=self.kernel_initializer, 73 | # kernel_regularizer=self.kernel_regularizer, 74 | # name="hidden_3") 75 | 76 | action_output = tf.layers.dense(h2, 77 | units=self.action_dim, 78 | activation=tf.nn.tanh, 79 | # activation=tf.nn.tanh, 80 | kernel_initializer=self.kernel_initializer_3, 81 | # kernel_initializer=self.kernel_initializer, 82 | kernel_regularizer=self.kernel_regularizer, 83 | use_bias=False, 84 | name="action_outputs") 85 | 86 | return action_output 87 | 88 | def __create_target_network(self): 89 | # get source variales and initialize 90 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 91 | self.sess.run(tf.variables_initializer(source_vars)) 92 | 93 | # create target network and initialize it by source network 94 | action_output = self.__create_actor_network() 95 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 96 | 97 | target_init_op_list = [target_vars[i].assign(source_vars[i]) for i in range(len(source_vars))] 98 | self.sess.run(target_init_op_list) 99 | 100 | return action_output 101 | 102 | def __create_loss(self): 103 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 104 | self.policy_gradient = tf.gradients(self.action_output, source_vars, -self.actions_grad) 105 | self.grads_and_vars = zip(self.policy_gradient, source_vars) 106 | 107 | def __create_train_op(self): 108 | self.train_policy_op = self.optimizer.apply_gradients(self.grads_and_vars, global_step=tf.contrib.framework.get_global_step()) 109 | train_op_vars = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope= self.scope + "/" + self.train_op_scope) # to do: remove prefix 110 | train_op_vars.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.train_op_scope)) 111 | self.sess.run(tf.variables_initializer(train_op_vars)) 112 | 113 | def __create_update_target_net_op(self): 114 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 115 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 116 | update_target_net_op_list = [target_vars[i].assign(self.tau*source_vars[i] + (1-self.tau)*target_vars[i]) for i in range(len(source_vars))] 117 | 118 | # source_net_dict = {var.name[len(self.source_var_scope):]: var for var in source_vars} 119 | # target_net_dict = {var.name[len(self.target_var_scope):]: var for var in target_vars} 120 | # keys = source_net_dict.keys() 121 | # update_target_net_op_list = [target_net_dict[key].assign((1-self.tau)*target_net_dict[key]+self.tau*source_net_dict[key]) \ 122 | # for key in keys] 123 | 124 | # for s_v, t_v in zip(source_vars, target_vars): 125 | # update_target_net_op_list.append(t_v.assign(self.tau*s_v - (1-self.tau)*t_v)) 126 | 127 | self.update_target_net_op = tf.group(*update_target_net_op_list) 128 | 129 | def predict_action_source_net(self, feed_state, sess=None): 130 | sess = sess or self.sess 131 | return sess.run(self.action_output, {self.input_state: feed_state}) 132 | 133 | def predict_action_target_net(self, feed_state, sess=None): 134 | sess = sess or self.sess 135 | return sess.run(self.target_net_actions_output, {self.input_state: feed_state}) 136 | 137 | def update_source_actor_net(self, feed_state, actions_grad, sess=None): 138 | sess = sess or self.sess 139 | batch_size = len(actions_grad) 140 | return sess.run([self. 
train_policy_op], 141 | {self.input_state: feed_state, 142 | self.actions_grad: actions_grad/batch_size}) 143 | 144 | def update_target_actor_net(self, sess=None): 145 | sess = sess or self.sess 146 | return sess.run(self.update_target_net_op) 147 | 148 | def __create_get_layer_weight_op_source(self): 149 | with tf.variable_scope(self.source_var_scope, reuse=True): 150 | self.h1_weight_source = tf.get_variable("hidden_1/kernel") 151 | self.h1_bias_source = tf.get_variable("hidden_1/bias") 152 | 153 | def run_layer_weight_source(self, sess=None): 154 | sess = sess or self.sess 155 | return sess.run([self.h1_weight_source, self.h1_bias_source]) 156 | 157 | def __create_get_layer_weight_op_target(self): 158 | with tf.variable_scope(self.target_var_scope, reuse=True): 159 | self.h1_weight_target = tf.get_variable("hidden_1/kernel") 160 | self.h1_bias_target = tf.get_variable("hidden_1/bias") 161 | 162 | def run_layer_weight_target(self, sess=None): 163 | sess = sess or self.sess 164 | return sess.run([self.h1_weight_target, self.h1_bias_target]) 165 | 166 | 167 | if __name__ == '__main__': 168 | import numpy as np 169 | state_dim = 40 170 | action_dim = 3 171 | learning_rate = np.random.rand(1) 172 | print("learning_rate: ", learning_rate) 173 | tau = np.random.rand(1) 174 | print("tau: ", tau) 175 | sess = tf.Session() 176 | actor = DDPG_Actor(state_dim, action_dim, sess=sess, tau=tau, learning_rate=learning_rate[0]) 177 | # actor = DDPG_Actor(state_dim, action_dim, sess=sess, tau=tau) 178 | random_state = np.random.normal(size=state_dim) 179 | print("random_state", random_state) 180 | 181 | # check forward 182 | action = actor.predict_action_source_net([random_state], sess) 183 | print("predict action", action) 184 | 185 | # check update_source_net 186 | h1_weight, h1_bias = actor.run_layer_weight_source(sess) 187 | random_actions_grad = np.random.normal(size=action_dim) 188 | actor.update_source_actor_net([random_state], [random_actions_grad], sess) 189 | h1_weight_trained, h1_bias_trained = actor.run_layer_weight_source(sess) 190 | print("h1_weight_difference", (h1_weight_trained-h1_weight)) 191 | print("h1_bias_difference", (h1_bias_trained-h1_bias)) 192 | 193 | # check update target net 194 | h1_weight_target, h1_bias_target = actor.run_layer_weight_target(sess) 195 | actor.update_target_actor_net(sess) 196 | h1_weight_trained_target, h1_bias_trained_target = actor.run_layer_weight_target(sess) 197 | print("source_target_differece_weight", (h1_weight_trained - h1_weight_trained_target)) 198 | print("source_target_differece_bias", (h1_bias_trained - h1_bias_trained_target)) 199 | print("weight_error", h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target) 200 | print("bias_error", h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target) 201 | print(np.sum(np.abs(h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target))) 202 | print(np.sum(np.abs(h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target))) 203 | -------------------------------------------------------------------------------- /model/ddpg_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from math import sqrt 3 | 4 | class DDPG_Critic(object): 5 | def __init__(self, state_dim, action_dim, optimizer=None, learning_rate=0.001, tau=0.001, scope="", sess=None): 6 | self.scope = scope 7 | self.sess = sess 8 | self.state_dim = state_dim 9 | self.action_dim = action_dim 10 | 
self.learning_rate = learning_rate 11 | self.l2_reg = 0.01 12 | self.optimizer = optimizer or tf.train.AdamOptimizer(self.learning_rate) 13 | self.tau = tau 14 | self.h1_dim = 400 15 | self.h2_dim = 100 16 | self.h3_dim = 300 17 | self.activation = tf.nn.relu 18 | self.kernel_initializer = tf.contrib.layers.variance_scaling_initializer() 19 | # fan-out uniform initializer which is different from original paper 20 | self.kernel_initializer_1 = tf.random_uniform_initializer(minval=-1/sqrt(self.h1_dim), maxval=1/sqrt(self.h1_dim)) 21 | self.kernel_initializer_2 = tf.random_uniform_initializer(minval=-1/sqrt(self.h2_dim), maxval=1/sqrt(self.h2_dim)) 22 | self.kernel_initializer_3 = tf.random_uniform_initializer(minval=-1/sqrt(self.h3_dim), maxval=1/sqrt(self.h3_dim)) 23 | self.kernel_initializer_4 = tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3) 24 | self.kernel_regularizer = tf.contrib.layers.l2_regularizer(self.l2_reg) 25 | 26 | with tf.name_scope("critic_input"): 27 | self.input_state = tf.placeholder(tf.float32, shape=[None, self.state_dim], name="states") 28 | self.input_action = tf.placeholder(tf.float32, shape=[None, self.action_dim], name="actions") 29 | 30 | with tf.name_scope("critic_label"): 31 | self.y = tf.placeholder(tf.float32, shape=[None, 1], name="y") 32 | 33 | self.source_var_scope = "ddpg/" + "critic_net" 34 | with tf.variable_scope(self.source_var_scope): 35 | self.q_output = self.__create_critic_network() 36 | 37 | self.target_var_scope = "ddpg/" + "critic_target_net" 38 | with tf.variable_scope(self.target_var_scope): 39 | self.target_net_q_output = self.__create_target_network() 40 | 41 | with tf.name_scope("compute_critic_loss"): 42 | self.__create_loss() 43 | 44 | self.train_op_scope = "critic_train_op" 45 | with tf.variable_scope(self.train_op_scope): 46 | self.__create_train_op() 47 | 48 | with tf.name_scope("critic_target_update_train_op"): 49 | self.__create_update_target_net_op() 50 | 51 | with tf.name_scope("get_action_grad_op"): 52 | self.__create_get_action_grad_op() 53 | 54 | self.__create_get_layer_weight_op_source() 55 | self.__create_get_layer_weight_op_target() 56 | 57 | def __create_critic_network(self): 58 | h1 = tf.layers.dense(self.input_state, 59 | units=self.h1_dim, 60 | activation=self.activation, 61 | kernel_initializer=self.kernel_initializer_1, 62 | # kernel_initializer=self.kernel_initializer, 63 | kernel_regularizer=self.kernel_regularizer, 64 | name="hidden_1") 65 | 66 | # h1_with_action = tf.concat([h1, self.input_action], 1, name="hidden_1_with_action") 67 | 68 | h2 = tf.layers.dense(self.input_action, 69 | units=self.h2_dim, 70 | activation=self.activation, 71 | kernel_initializer=self.kernel_initializer_2, 72 | # kernel_initializer=self.kernel_initializer, 73 | kernel_regularizer=self.kernel_regularizer, 74 | name="hidden_2") 75 | 76 | h_concat = tf.concat([h1, h2], 1, name="h_concat") 77 | 78 | h3 = tf.layers.dense(h_concat, 79 | units=self.h3_dim, 80 | activation=self.activation, 81 | kernel_initializer=self.kernel_initializer_3, 82 | # kernel_initializer=self.kernel_initializer, 83 | kernel_regularizer=self.kernel_regularizer, 84 | name="hidden_3") 85 | 86 | # h2_with_action = tf.concat([h2, self.input_action], 1, name="hidden_3_with_action") 87 | 88 | q_output = tf.layers.dense(h3, 89 | units=1, 90 | # activation=tf.nn.sigmoid, 91 | activation = None, 92 | kernel_initializer=self.kernel_initializer_4, 93 | # kernel_initializer=self.kernel_initializer, 94 | kernel_regularizer=self.kernel_regularizer, 95 | name="q_output") 96 
| 97 | return q_output 98 | 99 | def __create_target_network(self): 100 | # get source variales and initialize 101 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 102 | self.sess.run(tf.variables_initializer(source_vars)) 103 | 104 | # create target network and initialize it by source network 105 | q_output = self.__create_critic_network() 106 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 107 | 108 | target_init_op_list = [target_vars[i].assign(source_vars[i]) for i in range(len(source_vars))] 109 | self.sess.run(target_init_op_list) 110 | 111 | return q_output 112 | 113 | def __create_loss(self): 114 | self.loss = tf.losses.mean_squared_error(self.y, self.q_output) 115 | 116 | def __create_train_op(self): 117 | self.train_q_op = self.optimizer.minimize(self.loss) 118 | train_op_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope= self.scope + "/" + self.train_op_scope) # to do: remove prefix 119 | train_op_vars.extend(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.train_op_scope)) 120 | self.sess.run(tf.variables_initializer(train_op_vars)) 121 | 122 | def __create_update_target_net_op(self): 123 | source_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.source_var_scope) 124 | target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_var_scope) 125 | update_target_net_op_list = [target_vars[i].assign(self.tau*source_vars[i] + (1-self.tau)*target_vars[i]) for i in range(len(source_vars))] 126 | # source_net_dict = {var.name[len(self.source_var_scope):]: var for var in source_vars} 127 | # target_net_dict = {var.name[len(self.target_var_scope):]: var for var in target_vars} 128 | # keys = source_net_dict.keys() 129 | # update_target_net_op_list = [target_net_dict[key].assign((1-self.tau)*target_net_dict[key]+self.tau*source_net_dict[key]) \ 130 | # for key in keys] 131 | 132 | # for s_v, t_v in zip(source_vars, target_vars): 133 | # update_target_net_op_list.append(t_v.assign(self.tau*s_v - (1-self.tau)*t_v)) 134 | 135 | self.update_target_net_op = tf.group(*update_target_net_op_list) 136 | 137 | def __create_get_action_grad_op(self): 138 | self.get_action_grad_op = tf.gradients(self.q_output, self.input_action) 139 | 140 | def predict_q_source_net(self, feed_state, feed_action, sess=None): 141 | sess = sess or self.sess 142 | return sess.run(self.q_output, {self.input_state: feed_state, 143 | self.input_action: feed_action}) 144 | 145 | def predict_q_target_net(self, feed_state, feed_action, sess=None): 146 | sess = sess or self.sess 147 | return sess.run(self.target_net_q_output, {self.input_state: feed_state, 148 | self.input_action: feed_action}) 149 | 150 | def update_source_critic_net(self, feed_state, feed_action, feed_y, sess=None): 151 | sess = sess or self.sess 152 | return sess.run([self.train_q_op], 153 | {self.input_state: feed_state, 154 | self.input_action: feed_action, 155 | self.y: feed_y}) 156 | 157 | def update_target_critic_net(self, sess=None): 158 | sess = sess or self.sess 159 | return sess.run(self.update_target_net_op) 160 | 161 | def get_action_grads(self, feed_state, feed_action, sess=None): 162 | sess = sess or self.sess 163 | return (sess.run(self.get_action_grad_op, {self.input_state: feed_state, 164 | self.input_action: feed_action}))[0] 165 | 166 | def __create_get_layer_weight_op_source(self): 167 | with tf.variable_scope(self.source_var_scope, reuse=True): 168 | self.h1_weight_source = 
tf.get_variable("hidden_1/kernel") 169 | self.h1_bias_source = tf.get_variable("hidden_1/bias") 170 | 171 | def run_layer_weight_source(self, sess=None): 172 | sess = sess or self.sess 173 | return sess.run([self.h1_weight_source, self.h1_bias_source]) 174 | 175 | def __create_get_layer_weight_op_target(self): 176 | with tf.variable_scope(self.target_var_scope, reuse=True): 177 | self.h1_weight_target = tf.get_variable("hidden_1/kernel") 178 | self.h1_bias_target = tf.get_variable("hidden_1/bias") 179 | 180 | def run_layer_weight_target(self, sess=None): 181 | sess = sess or self.sess 182 | return sess.run([self.h1_weight_target, self.h1_bias_target]) 183 | 184 | if __name__ == '__main__': 185 | import numpy as np 186 | state_dim = 40 187 | action_dim = 3 188 | learning_rate = np.random.rand(1) 189 | print("learning_rate: ", learning_rate) 190 | tau = np.random.rand(1) 191 | print("tau: ", tau) 192 | sess = tf.Session() 193 | critic = DDPG_Critic(state_dim, action_dim, sess=sess, tau=tau, learning_rate=learning_rate[0]) 194 | # critic = DDPG_Actor(state_dim, action_dim, sess=sess, tau=tau) 195 | random_state = np.random.normal(size=state_dim) 196 | print("random_state", random_state) 197 | 198 | random_action = np.random.random(size=action_dim) 199 | print("random_action", random_action) 200 | 201 | # check forward 202 | target_q = critic.predict_q_target_net([random_state], [random_action], sess) 203 | print("predict target q", target_q) 204 | 205 | # check update_source_net 206 | y = target_q[0] + 1 207 | h1_weight, h1_bias = critic.run_layer_weight_source(sess) 208 | random_actions_grad = np.random.normal(size=action_dim) 209 | critic.update_source_critic_net([random_state], [random_action], [y], sess) 210 | h1_weight_trained, h1_bias_trained = critic.run_layer_weight_source(sess) 211 | print("h1_weight_difference", (h1_weight_trained-h1_weight)) 212 | print("h1_bias_difference", (h1_bias_trained-h1_bias)) 213 | 214 | # check update target net 215 | h1_weight_target, h1_bias_target = critic.run_layer_weight_target(sess) 216 | critic.update_target_critic_net(sess) 217 | h1_weight_trained_target, h1_bias_trained_target = critic.run_layer_weight_target(sess) 218 | print("source_target_differece_weight", (h1_weight_trained - h1_weight_trained_target)) 219 | print("source_target_differece_bias", (h1_bias_trained - h1_bias_trained_target)) 220 | print("weight_error", h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target) 221 | print("bias_error", h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target) 222 | print(np.sum(np.abs(h1_weight_trained_target - tau*h1_weight_trained + (1-tau)*h1_weight_target))) 223 | print(np.sum(np.abs(h1_bias_trained_target - tau*h1_bias_trained + (1-tau)*h1_bias_target))) 224 | 225 | # check get action grad 226 | random_action_for_grad = np.random.random(size=action_dim) 227 | print("random_actions_grad", random_action_for_grad) 228 | action_grad = critic.get_action_grads([random_state], [random_action_for_grad], sess) 229 | # print("action_grad", action_grad) 230 | for i in action_grad: 231 | print(i) 232 | -------------------------------------------------------------------------------- /model/ddpg_model.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | lib_path = os.path.abspath(os.path.dirname(__file__)) 3 | sys.path.append(lib_path) 4 | 5 | import tensorflow as tf 6 | from ddpg_actor import DDPG_Actor 7 | from ddpg_critic import DDPG_Critic 8 | 9 | 10 | class 
Model(object): 11 | def __init__(self, 12 | state_dim, 13 | action_dim, 14 | optimizer=None, 15 | actor_learning_rate=1e-4, 16 | critic_learning_rate=1e-3, 17 | tau = 0.001, 18 | sess=None): 19 | self.state_dim = state_dim 20 | self.action_dim = action_dim 21 | self.actor_learning_rate = actor_learning_rate 22 | self.critic_learning_rate = critic_learning_rate 23 | self.tau = tau 24 | 25 | #tf.reset_default_graph() 26 | self.sess = sess or tf.Session() 27 | 28 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 29 | global_step_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="global_step") 30 | self.sess.run(tf.variables_initializer(global_step_vars)) 31 | 32 | self.actor_scope = "actor_net" 33 | with tf.name_scope(self.actor_scope): 34 | self.actor = DDPG_Actor(self.state_dim, 35 | self.action_dim, 36 | learning_rate=self.actor_learning_rate, 37 | tau=self.tau, 38 | scope=self.actor_scope, 39 | sess=self.sess) 40 | 41 | self.critic_scope = "critic_net" 42 | with tf.name_scope(self.critic_scope): 43 | self.critic = DDPG_Critic(self.state_dim, 44 | self.action_dim, 45 | learning_rate=self.critic_learning_rate, 46 | tau=self.tau, 47 | scope=self.critic_scope, 48 | sess=self.sess) 49 | 50 | def update(self, state_batch, action_batch, y_batch, sess=None): 51 | sess = sess or self.sess 52 | self.critic.update_source_critic_net(state_batch, action_batch, y_batch, sess) 53 | action_batch_for_grad = self.actor.predict_action_source_net(state_batch, sess) 54 | action_grad_batch = self.critic.get_action_grads(state_batch, action_batch_for_grad, sess) 55 | self.actor.update_source_actor_net(state_batch, action_grad_batch, sess) 56 | 57 | self.critic.update_target_critic_net(sess) 58 | self.actor.update_target_actor_net(sess) 59 | 60 | def predict_action(self, observation, sess=None): 61 | sess = sess or self.sess 62 | return self.actor.predict_action_source_net(observation, sess) 63 | 64 | if __name__ == '__main__': 65 | import numpy as np 66 | state_dim = 40 67 | action_dim = 3 68 | actor_learning_rate = np.random.rand(1) 69 | print("actor_learning_rate: ", actor_learning_rate) 70 | critic_learning_rate = np.random.rand(1) 71 | print("critic_learning_rate: ", critic_learning_rate) 72 | tau = np.random.rand(1) 73 | print("tau: ", tau) 74 | sess = tf.Session() 75 | model = Model(state_dim, 76 | action_dim, 77 | tau=tau, 78 | actor_learning_rate=actor_learning_rate[0], 79 | critic_learning_rate=critic_learning_rate[0], 80 | sess=sess) 81 | random_state = np.random.normal(size=state_dim) 82 | print("random_state", random_state) 83 | 84 | random_action = np.random.random(size=action_dim) 85 | print("random_action", random_action) 86 | 87 | # check prediction 88 | pred_action = model.predict_action(random_state) 89 | print("predict_action", pred_action) 90 | 91 | # check forward 92 | target_q = model.critic.predict_q_target_net([random_state], [random_action], sess) 93 | print("predict target q", target_q) 94 | y = target_q[0] + 1 95 | 96 | model.update([random_state], [random_action], [y]) 97 | -------------------------------------------------------------------------------- /model/reinforce_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class Model(object): 4 | def __init__(self, state_dim, action_dim, entropy_beta=1e-3, optimizer=None, learning_rate=0.001): 5 | self.state_dim = state_dim 6 | self.action_dim = action_dim 7 | self.entropy_beta = entropy_beta 8 | self.learning_rate = 
learning_rate 9 | tf.reset_default_graph() 10 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 11 | self.optimizer = optimizer or tf.train.RMSPropOptimizer(self.learning_rate) 12 | 13 | with tf.name_scope("model_input"): 14 | self.input_state = tf.placeholder(tf.float32, shape=[None, self.state_dim], name="states") 15 | 16 | with tf.name_scope("model_target"): 17 | self.taken_actions = tf.placeholder(tf.int32, shape=[None, 1], name="taken_actions") 18 | self.future_rewards = tf.placeholder(tf.float32, shape=[None, 1], name="future_rewards") 19 | 20 | with tf.name_scope("model"): 21 | self.__create_model() 22 | 23 | def __create_policy_network(self): 24 | with tf.variable_scope("shared_network"): 25 | h1 = tf.layers.dense(self.input_state, 26 | units=32, 27 | activation=tf.nn.relu, 28 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 29 | name="hidden_1") 30 | 31 | h2 = tf.layers.dense(h1, 32 | units=32, 33 | activation=tf.nn.relu, 34 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 35 | name="hidden_2") 36 | 37 | with tf.variable_scope("policy_network"): 38 | self.policy_outputs = tf.layers.dense(h2, 39 | units=self.action_dim, 40 | activation=tf.nn.softmax, 41 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 42 | name="policy_outputs") 43 | 44 | def __creat_base_line_network(self): 45 | with tf.variable_scope("shared_network"): 46 | h1 = tf.layers.dense(self.input_state, 47 | units=32, 48 | activation=tf.nn.relu, 49 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 50 | name="hidden_1", 51 | reuse=True) 52 | 53 | h2 = tf.layers.dense(h1, 54 | units=32, 55 | activation=tf.nn.relu, 56 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 57 | name="hidden_2", 58 | reuse=True) 59 | 60 | with tf.variable_scope("base_line_network"): 61 | self.base_line_outputs = tf.layers.dense(h2, 62 | units=1, 63 | activation=None, 64 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 65 | name="base_line_outputs") 66 | 67 | def __create_loss(self): 68 | with tf.name_scope("compute_policy_gradients"): 69 | self.log_probs = tf.log(tf.clip_by_value(self.policy_outputs, 1e-20, 1.0)) 70 | 71 | # entropy loss of exploration 72 | self.entropy_loss = -tf.reduce_sum(self.policy_outputs * self.log_probs, reduction_indices=1) 73 | self.policy_loss = -tf.reduce_sum(tf.multiply(self.log_probs, tf.squeeze(tf.cast(tf.one_hot(self.taken_actions, self.action_dim), tf.float32))) * \ 74 | (self.future_rewards - tf.stop_gradient(self.base_line_outputs)), reduction_indices=1) 75 | # self.policy_loss = -tf.reduce_sum(tf.multiply(self.log_probs, tf.squeeze(tf.cast(tf.one_hot(self.taken_actions, self.action_dim), tf.float32))) * \ 76 | # (self.future_rewards), reduction_indices=1) 77 | td = self.base_line_outputs - self.future_rewards 78 | td = tf.clip_by_value(td, -5.0, 5.0) 79 | # self.base_line_loss = tf.nn.l2_loss(self.base_line_outputs - self.future_rewards) 80 | self.base_line_loss = tf.reduce_sum(td**2 / 2, reduction_indices=1) 81 | self.total_loss = tf.add_n([self.policy_loss, 82 | self.entropy_beta * self.entropy_loss, 83 | self.base_line_loss]) 84 | 85 | def __create_train_policy_op(self): 86 | self.train_policy_op = self.optimizer.minimize(self.policy_loss, 87 | global_step=tf.contrib.framework.get_global_step()) 88 | 89 | def __create_train_op(self): 90 | self.train_op = self.optimizer.minimize(self.total_loss, 91 | global_step=tf.contrib.framework.get_global_step()) 92 | 93 | def __create_model(self): 94 | 
self.__create_policy_network() 95 | self.__creat_base_line_network() 96 | self.__create_loss() 97 | self.__create_train_policy_op() 98 | self.__create_train_op() 99 | self.__create_get_layer_weight_op() 100 | 101 | def predict_policy(self, feed_state, sess=None): 102 | sess = sess or tf.get_default_session() 103 | return sess.run(self.policy_outputs, {self.input_state: feed_state}) 104 | 105 | def update_policy(self, feed_state, feed_taken_actions, feed_future_rewards, sess=None): 106 | sess = sess or tf.get_default_session() 107 | return sess.run([self. train_policy_op, self.policy_loss], 108 | {self.input_state: feed_state, 109 | self.taken_actions: feed_taken_actions, 110 | self.future_rewards: feed_future_rewards}) 111 | 112 | def predict_baseline(self, feed_state, sess=None): 113 | sess = sess or tf.get_default_session() 114 | return sess.run(self.base_line_outputs, {self.input_state: feed_state}) 115 | 116 | def update_baseline(self, feed_state, feed_future_rewards, sess=None): 117 | sess = sess or tf.get_default_session() 118 | return sess.run(self.base_line_loss, {self.input_state: feed_state, 119 | self.future_rewards: feed_future_rewards}) 120 | 121 | def update(self, feed_state, feed_taken_actions, feed_future_rewards, sess=None): 122 | sess = sess or tf.get_default_session() 123 | return sess.run([self.train_op, self.total_loss, self.policy_loss, self.base_line_loss], 124 | {self.input_state: feed_state, 125 | self.taken_actions: feed_taken_actions, 126 | self.future_rewards: feed_future_rewards}) 127 | 128 | def __create_get_layer_weight_op(self): 129 | with tf.name_scope("model"): 130 | with tf.variable_scope("shared_network", reuse=True): 131 | self.h1_weiget = tf.get_variable("hidden_1/kernel") 132 | self.h1_bias = tf.get_variable("hidden_1/bias") 133 | 134 | def run_layer_weight(self, sess=None): 135 | sess = sess or tf.get_default_session() 136 | return sess.run([self.h1_weiget, self.h1_bias]) 137 | 138 | 139 | if __name__ == '__main__': 140 | model = Model(40, 5) 141 | -------------------------------------------------------------------------------- /run_ddpg.bat: -------------------------------------------------------------------------------- 1 | mode con: cols=80 lines=100 2 | 3 | 4 | if NOT "%ComputerName%" == "PC-KW-60002" ( 5 | set CUDA_VISIBLE_DEVICES=0 & activate tensorflow & python run_ddpg.py 6 | ) else ( 7 | set CUDA_VISIBLE_DEVICES=0 & activate deep-fpv-racer & python run_ddpg.py & set /p temp="Hit enter to exit" 8 | ) -------------------------------------------------------------------------------- /run_ddpg.py: -------------------------------------------------------------------------------- 1 | from model.ddpg_model import Model 2 | from agent.ddpg import Agent 3 | from mechanism.replay_buffer import Replay_Buffer 4 | from mechanism.ou_process import OU_Process 5 | from gym import wrappers 6 | import gym 7 | import numpy as np 8 | 9 | ENV_NAME = 'Pendulum-v0' 10 | EPISODES = 100000 11 | MAX_EXPLORE_EPS = 100 12 | TEST_EPS = 1 13 | BATCH_SIZE = 64 14 | BUFFER_SIZE = 1e6 15 | WARM_UP_MEN = 5 * BATCH_SIZE 16 | DISCOUNT_FACTOR = 0.99 17 | ACTOR_LEARNING_RATE = 1e-4 18 | CRITIC_LEARNING_RATE = 1e-3 19 | TAU = 0.001 20 | 21 | def main(): 22 | env = gym.make(ENV_NAME) 23 | env = wrappers.Monitor(env, ENV_NAME+"experiment-1", force=True) 24 | state_dim = env.observation_space.shape[0] 25 | action_dim = env.action_space.shape[0] 26 | model = Model(state_dim, 27 | action_dim, 28 | actor_learning_rate=ACTOR_LEARNING_RATE, 29 | critic_learning_rate=CRITIC_LEARNING_RATE, 
30 | tau=TAU) 31 | replay_buffer = Replay_Buffer(buffer_size=int(BUFFER_SIZE) ,batch_size=BATCH_SIZE) 32 | exploration_noise = OU_Process(action_dim) 33 | agent = Agent(model, replay_buffer, exploration_noise, discout_factor=DISCOUNT_FACTOR) 34 | 35 | action_mean = 0 36 | i = 0 37 | for episode in range(EPISODES): 38 | state = env.reset() 39 | agent.init_process() 40 | # Training: 41 | for step in range(env.spec.timestep_limit): 42 | # env.render() 43 | state = np.reshape(state, (1, -1)) 44 | if episode < MAX_EXPLORE_EPS: 45 | p = episode / MAX_EXPLORE_EPS 46 | action = np.clip(agent.select_action(state, p), -1.0, 1.0) 47 | else: 48 | action = agent.predict_action(state) 49 | action_ = action * 2 50 | next_state, reward, done, _ = env.step(action_) 51 | next_state = np.reshape(next_state, (1, -1)) 52 | agent.store_transition([state, action, reward, next_state, done]) 53 | if agent.replay_buffer.memory_state()["current_size"] > WARM_UP_MEN: 54 | agent.train_model() 55 | else: 56 | i += 1 57 | action_mean = action_mean + (action - action_mean) / i 58 | print("running action mean: {}".format(action_mean)) 59 | state = next_state 60 | if done: 61 | break 62 | 63 | # Testing: 64 | if episode % 2 == 0 and episode > 10: 65 | total_reward = 0 66 | for i in range(TEST_EPS): 67 | state = env.reset() 68 | for j in range(env.spec.timestep_limit): 69 | # env.render() 70 | state = np.reshape(state, (1, 3)) 71 | action = agent.predict_action(state) 72 | action_ = action * 2 73 | state, reward, done, _ = env.step(action_) 74 | total_reward += reward 75 | if done: 76 | break 77 | avg_reward = total_reward/TEST_EPS 78 | print("episode: {}, Evaluation Average Reward: {}".format(episode, avg_reward)) 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /run_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import numpy as np 4 | import random 5 | from model.reinforce_model import Model 6 | from agent.reinforce import Agent 7 | 8 | import logging 9 | 10 | # config logger 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.INFO) 13 | 14 | # make environment 15 | env_name = 'CartPole-v0' 16 | env = gym.make(env_name) 17 | env = wrappers.Monitor(env, env_name+"experiment-1", force=True) 18 | logger.info("{} is made".format(env_name)) 19 | action_dim = env.action_space.n 20 | state_dim = env.observation_space.shape[0] 21 | logger.info("action dimension of env is {}".format(action_dim)) 22 | logger.info("state dimension of env is {}".format(state_dim)) 23 | 24 | # parameters 25 | MAX_EPISODE = 100000 26 | MAX_STEP = 1000 27 | DISCOUNT_FACTOR = 0.97 28 | ENTROPY_BETA = 1e-3 29 | LEARNING_RATE = 0.001 30 | VERBOSE = False 31 | 32 | model = Model(state_dim, action_dim, entropy_beta=ENTROPY_BETA, learning_rate=LEARNING_RATE) 33 | agent = Agent(model, DISCOUNT_FACTOR, VERBOSE) 34 | last_100_epi_red = [] 35 | for i_episode in xrange(MAX_EPISODE): 36 | observation = env.reset() 37 | episode_reward = 0 38 | for t in xrange(MAX_STEP): 39 | agent.state_append(observation) 40 | # env.render() 41 | p = agent.predict_policy(observation) 42 | action = np.random.choice(action_dim, 1, p=p[0]) 43 | observation, reward, done, info = env.step(action[0]) 44 | episode_reward += reward 45 | 46 | if done and episode_reward != 200: 47 | reward = -10 48 | elif done and episode_reward == 200: 49 | reward = 10 50 | print("positive done!") 51 | 
agent.action_append(action) 52 | agent.reward_append(reward) 53 | 54 | if done: 55 | last_100_epi_red.insert(0, episode_reward) 56 | if len(last_100_epi_red) > 100: 57 | last_100_epi_red.pop() 58 | logger.info("episode {} finished after {} timesteps with total reward {}".format(i_episode, t+1, episode_reward)) 59 | avg_reward = sum(last_100_epi_red) / float(len(last_100_epi_red)) 60 | logger.info("last 100 episodes average reward is {}".format(sum(last_100_epi_red) / float(len(last_100_epi_red)))) 61 | if avg_reward >= 195.0: 62 | print("problem solved!") 63 | exit() 64 | break 65 | agent.train_model() 66 | agent.clear_rollout() 67 | --------------------------------------------------------------------------------
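A note on the math the code above implements. agent/ddpg.py builds the critic target as y = r + γ·(1 − done)·Q′(s′, μ′(s′)) in preprocess_batch, and both DDPG networks track their targets with the soft update θ′ ← τ·θ + (1 − τ)·θ′. In agent/reinforce.py, train_model recomputes the discounted return Σ_k γ^k · r_{t+k} with an inline sum for every step t, which is O(T²) per rollout; the same returns can be obtained in a single reverse pass. A minimal NumPy sketch of that reverse pass — the function name compute_returns is illustrative and not part of this repository:

import numpy as np

def compute_returns(rewards, gamma):
    # Discounted returns G_t = sum_k gamma**k * r_{t+k}, accumulated back to front.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# With DISCOUNT_FACTOR = 0.97 as in run_reinforce.py:
print(compute_returns([1.0, 1.0, 1.0], gamma=0.97))  # [2.9109, 1.97, 1.0]

Returns computed this way match, value for value, what Agent.train_model in agent/reinforce.py passes to Model.update as the future_rewards feed, one scalar per visited state.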