├── __init__.py
├── ddpg-movan
│   ├── DDPG.py
│   ├── DDPG_per.py
│   └── __init__.py
├── ddpg_sp
│   ├── DDPG_class.py
│   ├── DDPG_per_class.py
│   ├── DDPG_sp.py
│   ├── __init__.py
│   ├── core.py
│   ├── ddpg_class_HalfCheetah-v2_epochs200_seed553.png
│   └── ddpg_class_HalfCheetah-v2_epochs3000_seed485.png
├── memory
│   ├── __init__.py
│   ├── per_memory.py
│   ├── simple_memory.py
│   └── sp_per_memory.py
├── noise
│   ├── __init__.py
│   ├── ou_noise.py
│   └── simple_noise.py
├── readme.md
├── run_in_gym
│   ├── __init__.py
│   ├── launch_with_gym.py
│   └── run_gym_sac_class.py
├── sac_auto
│   ├── __init__.py
│   ├── core.py
│   ├── sac_auto_class.py
│   └── sac_auto_per_class.py
├── sac_sp
│   ├── SAC_class.py
│   ├── SAC_sp.py
│   ├── __init__.py
│   ├── core.py
│   ├── exp_images
│   │   ├── HalfCheetah-v2-sac-class-300k-test.png
│   │   ├── HalfCheetah-v2-sac-class-300k.png
│   │   ├── Hopper-v2-sac-class-3000k-test.png
│   │   ├── Hopper-v2-sac-class-3000k-train.png
│   │   └── Hopper-v2-sac-sp-5000k-train.png
│   └── test_gym_sac_sp_class.py
├── sp_utils
│   ├── __init__.py
│   ├── logx.py
│   ├── mpi_tools.py
│   ├── plot.py
│   └── serialization_utils.py
└── td3_sp
    ├── TD3_class.py
    ├── TD3_per_class.py
    ├── TD3_sp.py
    ├── __init__.py
    ├── core.py
    ├── td3_origin.py
    └── test_gym_td3_sp_class.py

/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from ddpg_sp.DDPG_per_class import DDPG as DDPG
3 |     from sac_auto.sac_auto_per_class import SAC as SAC_AUTO
4 |     from td3_sp.TD3_per_class import TD3 as TD3
5 |     from sac_sp.SAC_class import SAC as SAC
6 | except ImportError:
7 |     from rl_algorithms.ddpg_sp.DDPG_per_class import DDPG as DDPG
8 |     from rl_algorithms.sac_auto.sac_auto_per_class import SAC as SAC_AUTO
9 |     from rl_algorithms.td3_sp.TD3_per_class import TD3 as TD3
10 |     from rl_algorithms.sac_sp.SAC_class import SAC as SAC
11 | 
12 | 
--------------------------------------------------------------------------------
/ddpg-movan/DDPG.py:
--------------------------------------------------------------------------------
1 | """
2 | Extracted from the original, working DDPG_OU_noise_memory_list.py.
3 | The noise part has been removed.
4 | The memory part has been stripped out.
5 | This makes it easier to add PER later.
6 | """ 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import sys 11 | from memory.simple_memory import Memory 12 | 13 | 14 | ##################### hyper parameters #################### 15 | GAMMA = 0.9 # reward discount 16 | TAU = 0.01 # soft replacement 17 | 18 | 19 | ############################### DDPG #################################### 20 | 21 | class DDPG(object): 22 | def __init__(self, a_dim, s_dim, a_bound, transition_num=4, restore_flag=False, batch_size=32, memory_size=100000): 23 | self.transition_num = transition_num 24 | self.memory = Memory(memory_size=memory_size, 25 | batch_size=batch_size, 26 | transition_num=transition_num, 27 | ) 28 | self.batch_size = batch_size 29 | 30 | self.pointer = 0 31 | self.learn_step = 0 32 | self.restore_flag = restore_flag 33 | 34 | self.sess = tf.Session() 35 | 36 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 37 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 38 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 39 | 40 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's') 41 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') 42 | self.R = tf.placeholder(tf.float32, [None, 1], 'r') 43 | 44 | with tf.variable_scope('Actor'): 45 | self.a = self._build_a(self.S, scope='eval', trainable=True) 46 | a_ = self._build_a(self.S_, scope='target', trainable=False) 47 | with tf.variable_scope('Critic'): 48 | # assign self.a = a in memory when calculating q for td_error, 49 | # otherwise the self.a is from Actor when updating Actor 50 | q = self._build_c(self.S, self.a, scope='eval', trainable=True) 51 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False) 52 | 53 | # networks parameters 54 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') 55 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') 56 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval') 57 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target') 58 | 59 | # hard_replace 60 | self.hard_replace = [tf.assign(t, e) 61 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 62 | 63 | # target net replacement 64 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 65 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 66 | 67 | q_target = self.R + GAMMA * q_ 68 | # in the feed_dic for the td_error, the self.a should change to actions in memory 69 | td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 70 | self.c_loss = td_error 71 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(td_error, var_list=self.ce_params) 72 | 73 | a_loss = - tf.reduce_mean(q) # maximize the q 74 | self.a_loss = a_loss 75 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(a_loss, var_list=self.ae_params) 76 | 77 | self.sess.run(tf.global_variables_initializer()) 78 | 79 | def choose_action(self, s): 80 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 81 | 82 | def store_transition(self, transition): 83 | self.memory.store(transition) 84 | 85 | def learn(self, actor_lr_input, critic_lr_input, output_loss_flag=False): 86 | # soft target replacement 87 | self.sess.run(self.soft_replace) 88 | # 加上terminal信息 89 | if self.transition_num==5: 90 | bs, ba, br, bs_, bt = self.memory.sample() 91 | if self.transition_num==4: 92 | bs, ba, br, bs_ = 
self.memory.sample() 93 | 94 | self.learn_step += 1 95 | 96 | if output_loss_flag: 97 | _, a_loss = self.sess.run([self.atrain, self.a_loss], {self.S: bs, self.actor_lr: actor_lr_input}) 98 | _, c_loss = self.sess.run([self.ctrain, self.c_loss], 99 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 100 | return a_loss, c_loss 101 | else: 102 | self.sess.run(self.atrain, {self.S: bs, self.actor_lr: actor_lr_input}) 103 | self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 104 | 105 | 106 | 107 | def _build_a(self, s, scope, trainable): 108 | with tf.variable_scope(scope): 109 | net = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) 110 | new_actor_layer = tf.layers.dense(net, 200, activation=tf.nn.relu, name='new_actor_layer', trainable=trainable) 111 | a = tf.layers.dense(new_actor_layer, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) 112 | return tf.multiply(a, self.a_bound, name='scaled_a') 113 | 114 | def _build_c(self, s, a, scope, trainable): 115 | with tf.variable_scope(scope): 116 | n_l1 = 400 117 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) 118 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) 119 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) 120 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 121 | new_critic_layer = tf.layers.dense(net, 300, activation=tf.nn.relu, name='new_critic_layer', 122 | trainable=trainable) 123 | return tf.layers.dense(new_critic_layer, 1, trainable=trainable) # Q(s,a) 124 | 125 | def load_step_network(self, saver, load_path): 126 | checkpoint = tf.train.get_checkpoint_state(load_path) 127 | if checkpoint and checkpoint.model_checkpoint_path: 128 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 129 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 130 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 131 | 132 | else: 133 | print("Could not find old network weights") 134 | 135 | def save_step_network(self, time_step, saver, save_path): 136 | saver.save(self.sess, save_path + 'network', global_step=time_step, 137 | write_meta_graph=False) 138 | 139 | def load_simple_network(self, path): 140 | saver = tf.train.Saver() 141 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 142 | print("restore model successful") 143 | 144 | def save_simple_network(self, save_path): 145 | saver = tf.train.Saver() 146 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 147 | -------------------------------------------------------------------------------- /ddpg-movan/DDPG_per.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. 3 | DDPG is Actor Critic based algorithm. 4 | Pendulum example. 
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 6 | Using: 7 | tensorflow 1.0 8 | gym 0.8.0 9 | """ 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | 14 | 15 | ##################### hyper parameters #################### 16 | 17 | 18 | LR_A = 0.001 # learning rate for actor 19 | LR_C = 0.002 # learning rate for critic 20 | GAMMA = 0.9 # reward discount 21 | TAU = 0.01 # soft replacement 22 | 23 | 24 | class OU_noise(object): 25 | def __init__(self, num_actions, action_low_bound, action_high_bound, dt, 26 | mu=0.0, theta=0.15, max_sigma=2.0, min_sigma=0.1): 27 | self.mu = mu # 0.0 28 | self.theta = theta # 0.15 29 | self.sigma = max_sigma # 0.3 30 | self.max_sigma = max_sigma # 0.3 31 | self.min_sigma = min_sigma # 0.1 32 | self.dt = dt # 0.001 33 | self.num_actions = num_actions # 1 34 | self.action_low = action_low_bound # -2 35 | self.action_high = action_high_bound # 2 36 | self.reset() 37 | 38 | def reset(self): 39 | self.state = np.zeros(self.num_actions) 40 | 41 | # self.state = np.zeros(self.num_actions) 42 | def state_update(self): 43 | x = self.state 44 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.num_actions) # np.random.randn()生成0,1的随机数 45 | self.state = x + dx 46 | 47 | def add_noise(self, action): 48 | self.state_update() 49 | state = self.state 50 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, self.dt) 51 | return np.clip(action + state, self.action_low, self.action_high) 52 | 53 | 54 | class SumTree(object): 55 | """ 56 | This SumTree code is a modified version and the original code is from: 57 | https://github.com/jaara/AI-blog/blob/master/SumTree.py 58 | 59 | Story data with its priority in the tree. 60 | """ 61 | data_pointer = 0 62 | 63 | def __init__(self, capacity): 64 | self.capacity = capacity # for all priority values 65 | self.tree = np.zeros(2 * capacity - 1) 66 | # [--------------Parent nodes-------------][-------leaves to recode priority-------] 67 | # size: capacity - 1 size: capacity 68 | self.data = list(np.zeros(capacity, dtype=object)) # for all transitions 69 | # [--------------data frame-------------] 70 | # size: capacity 71 | 72 | def add(self, p, transition): 73 | tree_idx = self.data_pointer + self.capacity - 1 74 | self.data[self.data_pointer] = transition # update data_frame 75 | self.update(tree_idx, p) # update tree_frame 76 | 77 | self.data_pointer += 1 78 | if self.data_pointer >= self.capacity: # replace when exceed the capacity 79 | self.data_pointer = 0 80 | 81 | def update(self, tree_idx, p): 82 | change = p - self.tree[tree_idx] 83 | self.tree[tree_idx] = p 84 | # then propagate the change through tree 85 | while tree_idx != 0: # this method is faster than the recursive loop in the reference code 86 | tree_idx = (tree_idx - 1) // 2 87 | self.tree[tree_idx] += change 88 | 89 | def get_leaf(self, v): 90 | """ 91 | Tree structure and array storage: 92 | 93 | Tree index: 94 | 0 -> storing priority sum 95 | / \ 96 | 1 2 97 | / \ / \ 98 | 3 4 5 6 -> storing priority for transitions 99 | 100 | Array type for storing: 101 | [0,1,2,3,4,5,6] 102 | """ 103 | parent_idx = 0 104 | while True: # the while loop is faster than the method in the reference code 105 | cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 106 | cr_idx = cl_idx + 1 107 | if cl_idx >= len(self.tree): # reach bottom, end search 108 | leaf_idx = parent_idx 109 | break 110 | else: # downward search, always search for a higher priority node 111 | if v <= self.tree[cl_idx]: 112 | 
parent_idx = cl_idx 113 | else: 114 | v -= self.tree[cl_idx] 115 | parent_idx = cr_idx 116 | 117 | data_idx = leaf_idx - self.capacity + 1 118 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 119 | 120 | @property 121 | def total_p(self): 122 | return self.tree[0] # the root 123 | 124 | 125 | class Memory(object): # stored as ( s, a, r, s_ ) in SumTree 126 | """ 127 | This Memory class is modified based on the original code from: 128 | https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 129 | """ 130 | epsilon = 0.01 # small amount to avoid zero priority 131 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 132 | beta = 0.4 # importance-sampling, from initial value increasing to 1 133 | beta_increment_per_sampling = 0.001 134 | abs_err_upper = 1. # clipped abs error 135 | 136 | def __init__(self, capacity): 137 | self.tree = SumTree(capacity) 138 | self.full_flag = False 139 | 140 | def store(self, transition): 141 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 142 | if max_p == 0: 143 | max_p = self.abs_err_upper 144 | self.tree.add(max_p, transition) # set the max p for new p 145 | 146 | def sample(self, n): 147 | # n就是batch size! 148 | # np.empty()这是一个随机初始化的一个矩阵! 149 | b_idx, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1)) 150 | b_memory = [] 151 | pri_seg = self.tree.total_p / n # priority segment 152 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1 153 | 154 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight 155 | if min_prob == 0: 156 | min_prob = 0.00001 157 | for i in range(n): 158 | a, b = pri_seg * i, pri_seg * (i + 1) 159 | v = np.random.uniform(a, b) 160 | idx, p, data = self.tree.get_leaf(v) 161 | prob = p / self.tree.total_p 162 | ISWeights[i, 0] = np.power(prob/min_prob, -self.beta) 163 | b_idx[i] = idx 164 | b_memory.append(data) 165 | 166 | return b_idx, b_memory, ISWeights 167 | 168 | def batch_update(self, tree_idx, abs_errors): 169 | abs_errors += self.epsilon # convert to abs and avoid 0 170 | clipped_errors = np.minimum(abs_errors, self.abs_err_upper) 171 | ps = np.power(clipped_errors, self.alpha) 172 | for ti, p in zip(tree_idx, ps): 173 | self.tree.update(ti, p) 174 | 175 | ###############################DDPG#################################### 176 | 177 | 178 | class DDPG(object): 179 | def __init__(self, a_dim, s_dim, a_bound, exp_path, 180 | restore_flag=False, 181 | batch_size=512, 182 | per_batch_size=32, 183 | memory_size=100000, 184 | per_memory_size=20000): 185 | self.memory_size = memory_size 186 | self.memory = [] 187 | self.per_memory = Memory(capacity=per_memory_size) 188 | self.per_memory_size = self.per_memory.tree.capacity 189 | self.pointer = 0 190 | self.per_pointer = 0 191 | 192 | self.batch_size = batch_size 193 | self.per_batch_size = per_batch_size 194 | self.exp_path = exp_path 195 | print("self.exp_path", self.exp_path) 196 | 197 | self.learn_step = 0 198 | self.restore_flag = restore_flag 199 | 200 | self.sess = tf.Session() 201 | 202 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 203 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 204 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 205 | 206 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's') 207 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') 208 | self.R = tf.placeholder(tf.float32, [None, 1], 'r') 209 | self.ISWeights = tf.placeholder(tf.float32, 
[None, 1], name='IS_weights') 210 | 211 | with tf.variable_scope('Actor'): 212 | self.a = self._build_a(self.S, scope='eval', trainable=True) 213 | a_ = self._build_a(self.S_, scope='target', trainable=False) 214 | with tf.variable_scope('Critic'): 215 | # assign self.a = a in memory when calculating q for td_error, 216 | # otherwise the self.a is from Actor when updating Actor 217 | q = self._build_c(self.S, self.a, scope='eval', trainable=True) 218 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False) 219 | 220 | # networks parameters 221 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') 222 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') 223 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval') 224 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target') 225 | 226 | # hard_replace 227 | self.hard_replace = [tf.assign(t, e) 228 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 229 | 230 | # target net replacement 231 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 232 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 233 | 234 | q_target = self.R + GAMMA * q_ 235 | # in the feed_dic for the td_error, the self.a should change to actions in memory 236 | # td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 237 | self.abs_errors = tf.reduce_sum(tf.abs(q_target - q), axis=1) # for updating Sumtree 238 | self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(q_target, q)) 239 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(self.loss, var_list=self.ce_params) 240 | 241 | a_loss = - tf.reduce_mean(q) # maximize the q 242 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(a_loss, var_list=self.ae_params) 243 | 244 | self.sess.run(tf.global_variables_initializer()) 245 | 246 | def choose_action(self, s): 247 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 248 | 249 | def learn(self, actor_lr_input, critic_lr_input, per_flag=True): 250 | # soft target replacement 251 | self.sess.run(self.soft_replace) 252 | 253 | if per_flag: 254 | tree_idx, batch_memory, ISWeights = self.per_memory.sample(self.per_batch_size) 255 | batch_states, batch_actions, batch_rewards, batch_states_ = [], [], [], [] 256 | for i in range(self.per_batch_size): 257 | batch_states.append(batch_memory[i][0]) 258 | batch_actions.append(batch_memory[i][1]) 259 | batch_rewards.append(batch_memory[i][2]) 260 | batch_states_.append(batch_memory[i][3]) 261 | 262 | bs = np.array(batch_states) 263 | ba = np.array(batch_actions) 264 | batch_rewards = np.array(batch_rewards) 265 | bs_ = np.array(batch_states_) 266 | br = batch_rewards[:, np.newaxis] 267 | else: 268 | bs, ba, br, bs_ = self.sample_memory() 269 | 270 | # print("br:", br) 271 | 272 | self.sess.run(self.atrain, {self.S: bs, self.actor_lr: actor_lr_input}) 273 | _, abs_errors, cost = self.sess.run([self.ctrain, self.abs_errors, self.loss], 274 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input, 275 | self.ISWeights: ISWeights}) 276 | 277 | self.per_memory.batch_update(tree_idx, abs_errors) # update priority 278 | # print("lr:", self.sess.run(self.actor_lr, {self.actor_lr: actor_lr_input})) 279 | 280 | self.learn_step += 1 281 | 282 | def store_transition(self, s, a, r, s_): 283 | self.per_memory.store(transition=[s, a, r, s_]) 284 
| self.per_pointer = self.per_memory.tree.data_pointer 285 | if len(self.memory) >= self.memory_size: 286 | del self.memory[0] 287 | self.memory.append([s, a, r, s_]) 288 | self.pointer = len(self.memory) 289 | 290 | def sample_memory(self): 291 | if len(self.memory) < self.memory_size: 292 | indices = np.random.choice(len(self.memory), size=self.batch_size) 293 | else: 294 | indices = np.random.choice(self.memory_size, self.batch_size) 295 | batch_states, batch_actions, batch_rewards, batch_states_ = [], [], [], [] 296 | for i in indices: 297 | batch_states.append(self.memory[i][0]) 298 | batch_actions.append(self.memory[i][1]) 299 | batch_rewards.append(self.memory[i][2]) 300 | batch_states_.append(self.memory[i][3]) 301 | 302 | batch_states = np.array(batch_states) 303 | batch_actions = np.array(batch_actions) 304 | batch_rewards = np.array(batch_rewards) 305 | batch_states_ = np.array(batch_states_) 306 | batch_rewards = batch_rewards[:, np.newaxis] 307 | return batch_states, batch_actions, batch_rewards, batch_states_ 308 | 309 | def _build_a(self, s, scope, trainable): 310 | with tf.variable_scope(scope): 311 | net = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) 312 | new_actor_layer = tf.layers.dense(net, 200, activation=tf.nn.relu, name='new_actor_layer', trainable=trainable) 313 | a = tf.layers.dense(new_actor_layer, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) 314 | return tf.multiply(a, self.a_bound, name='scaled_a') 315 | 316 | def _build_c(self, s, a, scope, trainable): 317 | with tf.variable_scope(scope): 318 | n_l1 = 400 319 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) 320 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) 321 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) 322 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 323 | new_critic_layer = tf.layers.dense(net, 300, activation=tf.nn.relu, name='new_critic_layer', 324 | trainable=trainable) 325 | return tf.layers.dense(new_critic_layer, 1, trainable=trainable) # Q(s,a) 326 | 327 | def load_network(self, saver, load_path): 328 | checkpoint = tf.train.get_checkpoint_state(load_path) 329 | if checkpoint and checkpoint.model_checkpoint_path: 330 | # self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 331 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 332 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 333 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 334 | 335 | else: 336 | print("Could not find old network weights") 337 | 338 | def save_network(self, time_step, saver, save_path): 339 | saver.save(self.sess, save_path + 'network', global_step=time_step, 340 | write_meta_graph=False) 341 | 342 | 343 | 344 | 345 | ############################### training #################################### 346 | 347 | -------------------------------------------------------------------------------- /ddpg-movan/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 从DDPG_per中抽取出sumtree类,以及per_memory类 3 | 然后将普通Memeory换成per_memory类。 4 | 5 | """ 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import sys 10 | 11 | 12 | ##################### hyper parameters #################### 13 | GAMMA = 0.9 # reward discount 14 | TAU = 0.01 # soft replacement 15 | 16 | 17 | ############################### DDPG #################################### 18 | 19 | class DDPG(object): 20 | def 
__init__(self, a_dim, s_dim, a_bound, transition_num=4, batch_size=32, memory_size=100000, per_flag=False): 21 | self.transition_num = transition_num 22 | self.memory_size = memory_size 23 | self.per_flag = per_flag 24 | if per_flag: 25 | from memory.per_memory import Memory 26 | else: 27 | from memory.simple_memory import Memory 28 | 29 | self.memory = Memory(memory_size=memory_size, 30 | batch_size=batch_size, 31 | transition_num=transition_num, 32 | ) 33 | self.batch_size = batch_size 34 | 35 | self.learn_step = 0 36 | self.per_pointer = 0 37 | 38 | self.sess = tf.Session() 39 | 40 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 41 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 42 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 43 | 44 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's') 45 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') 46 | self.R = tf.placeholder(tf.float32, [None, 1], 'r') 47 | self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') 48 | 49 | with tf.variable_scope('Actor'): 50 | self.a = self._build_a(self.S, scope='eval', trainable=True) 51 | a_ = self._build_a(self.S_, scope='target', trainable=False) 52 | with tf.variable_scope('Critic'): 53 | # assign self.a = a in memory when calculating q for td_error, 54 | # otherwise the self.a is from Actor when updating Actor 55 | q = self._build_c(self.S, self.a, scope='eval', trainable=True) 56 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False) 57 | 58 | # networks parameters 59 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') 60 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') 61 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval') 62 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target') 63 | 64 | # hard_replace 65 | self.hard_replace = [tf.assign(t, e) 66 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 67 | 68 | # target net replacement 69 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 70 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 71 | 72 | q_target = self.R + GAMMA * q_ 73 | 74 | if self.per_flag: 75 | self.abs_errors = tf.reduce_sum(tf.abs(q_target - q), axis=1) # for updating Sumtree 76 | self.c_loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(q_target, q)) 77 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(self.c_loss, var_list=self.ce_params) 78 | 79 | self.a_loss = - tf.reduce_mean(q) # maximize the q 80 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(self.a_loss, var_list=self.ae_params) 81 | else: 82 | # in the feed_dic for the td_error, the self.a should change to actions in memory 83 | td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 84 | self.c_loss = td_error 85 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(td_error, var_list=self.ce_params) 86 | 87 | a_loss = - tf.reduce_mean(q) # maximize the q 88 | self.a_loss = a_loss 89 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(a_loss, var_list=self.ae_params) 90 | 91 | self.sess.run(tf.global_variables_initializer()) 92 | 93 | def choose_action(self, s): 94 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 95 | 96 | def store_transition(self, transition): 97 | self.memory.store(transition) 98 | 
self.per_pointer = self.memory.tree.data_pointer 99 | 100 | def learn(self, actor_lr_input, critic_lr_input, output_loss_flag=False): 101 | # soft target replacement 102 | self.sess.run(self.soft_replace) 103 | self.learn_step += 1 104 | if self.per_flag: 105 | tree_idx, batch_memory, ISWeights = self.memory.sample() 106 | 107 | batch_states, batch_actions, batch_rewards, batch_states_ = [], [], [], [] 108 | for i in range(self.batch_size): 109 | batch_states.append(batch_memory[i][0]) 110 | batch_actions.append(batch_memory[i][1]) 111 | batch_rewards.append(batch_memory[i][2]) 112 | batch_states_.append(batch_memory[i][3]) 113 | 114 | bs = np.array(batch_states) 115 | ba = np.array(batch_actions) 116 | batch_rewards = np.array(batch_rewards) 117 | bs_ = np.array(batch_states_) 118 | br = batch_rewards[:, np.newaxis] 119 | # 增加一个延时更新. 120 | policy_delay = 2 121 | a_loss = 0.0 122 | if self.learn_step % policy_delay == 0: 123 | _, a_loss = self.sess.run([self.atrain, self.a_loss], {self.S: bs, self.actor_lr: actor_lr_input}) 124 | 125 | _, abs_errors, cost = self.sess.run([self.ctrain, self.abs_errors, self.c_loss], 126 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, 127 | self.critic_lr: critic_lr_input, 128 | self.ISWeights: ISWeights}) 129 | 130 | self.memory.batch_update(tree_idx, abs_errors) # update priority 131 | return a_loss, cost 132 | 133 | else: 134 | # 加上terminal信息 135 | if self.transition_num == 5: 136 | bs, ba, br, bs_, bt = self.memory.sample() 137 | if self.transition_num == 4: 138 | bs, ba, br, bs_ = self.memory.sample() 139 | 140 | if output_loss_flag: 141 | _, a_loss = self.sess.run([self.atrain, self.a_loss], {self.S: bs, self.actor_lr: actor_lr_input}) 142 | _, c_loss = self.sess.run([self.ctrain, self.c_loss], 143 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 144 | return a_loss, c_loss 145 | else: 146 | self.sess.run(self.atrain, {self.S: bs, self.actor_lr: actor_lr_input}) 147 | self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 148 | 149 | def _build_a(self, s, scope, trainable): 150 | with tf.variable_scope(scope): 151 | net = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) 152 | new_actor_layer = tf.layers.dense(net, 200, activation=tf.nn.relu, name='new_actor_layer', trainable=trainable) 153 | a = tf.layers.dense(new_actor_layer, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) 154 | return tf.multiply(a, self.a_bound, name='scaled_a') 155 | 156 | def _build_c(self, s, a, scope, trainable): 157 | with tf.variable_scope(scope): 158 | n_l1 = 400 159 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) 160 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) 161 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) 162 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 163 | new_critic_layer = tf.layers.dense(net, 300, activation=tf.nn.relu, name='new_critic_layer', 164 | trainable=trainable) 165 | return tf.layers.dense(new_critic_layer, 1, trainable=trainable) # Q(s,a) 166 | 167 | def load_step_network(self, saver, load_path): 168 | checkpoint = tf.train.get_checkpoint_state(load_path) 169 | if checkpoint and checkpoint.model_checkpoint_path: 170 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 171 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 172 | self.learn_step = 
int(checkpoint.model_checkpoint_path.split('-')[-1]) 173 | else: 174 | print("Could not find old network weights") 175 | 176 | def save_step_network(self, time_step, saver, save_path): 177 | saver.save(self.sess, save_path + 'network', global_step=time_step, 178 | write_meta_graph=False) 179 | 180 | def load_simple_network(self, path): 181 | saver = tf.train.Saver() 182 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 183 | print("restore model successful") 184 | 185 | def save_simple_network(self, save_path): 186 | saver = tf.train.Saver() 187 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 188 | -------------------------------------------------------------------------------- /ddpg_sp/DDPG_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from ddpg_sp import core 8 | from ddpg_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for TD3 agents. 14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | class DDPG: 43 | def __init__(self, 44 | a_dim, obs_dim, a_bound, 45 | mlp_actor_critic=core.mlp_actor_critic, 46 | ac_kwargs=dict(), seed=0, 47 | 48 | replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 50 | batch_size=100, 51 | # start_steps=10000, 52 | act_noise=0.1, target_noise=0.2, 53 | noise_clip=0.5, policy_delay=2, 54 | # max_ep_len=1000, 55 | # logger_kwargs=dict(), save_freq=1 56 | ): 57 | 58 | self.learn_step = 0 59 | 60 | self.obs_dim = obs_dim 61 | self.act_dim = a_dim 62 | self.act_limit = a_bound 63 | self.policy_delay = policy_delay 64 | self.action_noise = act_noise 65 | 66 | # Share information about action space with policy architecture 67 | ac_kwargs['action_space'] = a_bound 68 | 69 | # Inputs to computation graph 70 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 71 | 72 | # Main outputs from computation graph 73 | with tf.variable_scope('main'): 74 | self.pi, self.q, q_pi = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 75 | 76 | # Target networks 77 | with tf.variable_scope('target'): 78 | # Note that the action placeholder going to actor_critic here is 79 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 
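# In other words, q_pi_targ only feeds the Bellman backup formed a few lines below,
#     y = r + gamma * (1 - d) * Q_targ(s', pi_targ(s')),
# which is wrapped in tf.stop_gradient so the critic regresses toward a fixed target,
# while the target weights track the main weights via Polyak averaging:
#     theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main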
80 | pi_targ, _, q_pi_targ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 81 | 82 | # Experience buffer 83 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 84 | 85 | # Count variables 86 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 87 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) 88 | 89 | # Bellman backup for Q function 90 | backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * q_pi_targ) 91 | 92 | # DDPG losses 93 | self.pi_loss = -tf.reduce_mean(q_pi) 94 | self.q_loss = tf.reduce_mean((self.q - backup) ** 2) 95 | 96 | # Separate train ops for pi, q 97 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 98 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 99 | self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi')) 100 | self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q')) 101 | 102 | # Polyak averaging for target variables 103 | self.target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 104 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 105 | 106 | # Initializing targets to match main variables 107 | target_init = tf.group([tf.assign(v_targ, v_main) 108 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 109 | 110 | self.sess = tf.Session() 111 | self.sess.run(tf.global_variables_initializer()) 112 | self.sess.run(target_init) 113 | 114 | def get_action(self, s, noise_scale=0): 115 | if not noise_scale: 116 | noise_scale = self.action_noise 117 | a = self.sess.run(self.pi, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 118 | a += noise_scale * np.random.randn(self.act_dim) 119 | return np.clip(a, -self.act_limit, self.act_limit) 120 | 121 | def store_transition(self, transition): 122 | (s, a, r, s_, done) = transition 123 | self.replay_buffer.store(s, a, r, s_, done) 124 | 125 | def test_agent(self, env, max_ep_len=1000, n=5): 126 | ep_reward_list = [] 127 | for j in range(n): 128 | s = env.reset() 129 | ep_reward = 0 130 | for i in range(max_ep_len): 131 | # Take deterministic actions at test time (noise_scale=0) 132 | s, r, d, _ = env.step(self.get_action(s)) 133 | ep_reward += r 134 | ep_reward_list.append(ep_reward) 135 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 136 | return mean_ep_reward 137 | 138 | def learn(self, batch_size=100): 139 | 140 | batch = self.replay_buffer.sample_batch(batch_size) 141 | feed_dict = {self.x_ph: batch['obs1'], 142 | self.x2_ph: batch['obs2'], 143 | self.a_ph: batch['acts'], 144 | self.r_ph: batch['rews'], 145 | self.d_ph: batch['done'] 146 | } 147 | q_step_ops = [self.train_q_op] 148 | 149 | # Q-learning update 150 | outs = self.sess.run([self.q_loss, self.q, self.train_q_op], feed_dict) 151 | # Policy update 152 | outs = self.sess.run([self.pi_loss, self.train_pi_op, self.target_update], 153 | feed_dict) 154 | 155 | self.learn_step += 1 156 | 157 | def load_step_network(self, saver, load_path): 158 | checkpoint = tf.train.get_checkpoint_state(load_path) 159 | if checkpoint and checkpoint.model_checkpoint_path: 160 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 161 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 162 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 163 | else: 164 | print("Could not find old network weights") 165 | 166 | def save_step_network(self, time_step, saver, save_path): 167 | 
saver.save(self.sess, save_path + 'network', global_step=time_step, 168 | write_meta_graph=False) 169 | 170 | def load_simple_network(self, path): 171 | saver = tf.train.Saver() 172 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 173 | print("restore model successful") 174 | 175 | def save_simple_network(self, save_path): 176 | saver = tf.train.Saver() 177 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 178 | 179 | 180 | if __name__ == '__main__': 181 | import argparse 182 | 183 | random_seed = int(time.time() * 1000 % 1000) 184 | parser = argparse.ArgumentParser() 185 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 186 | parser.add_argument('--hid', type=int, default=300) 187 | parser.add_argument('--l', type=int, default=1) 188 | parser.add_argument('--gamma', type=float, default=0.99) 189 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 190 | parser.add_argument('--epochs', type=int, default=200) 191 | parser.add_argument('--max_steps', type=int, default=1000) 192 | parser.add_argument('--exp_name', type=str, default='ddpg_class') 193 | args = parser.parse_args() 194 | 195 | env = gym.make(args.env) 196 | env = env.unwrapped 197 | env.seed(args.seed) 198 | 199 | s_dim = env.observation_space.shape[0] 200 | a_dim = env.action_space.shape[0] 201 | a_bound = env.action_space.high[0] 202 | 203 | net = DDPG(a_dim, s_dim, a_bound, 204 | batch_size=100, 205 | ) 206 | ep_reward_list = [] 207 | test_ep_reward_list = [] 208 | 209 | for i in range(args.epochs): 210 | s = env.reset() 211 | ep_reward = 0 212 | for j in range(args.max_steps): 213 | 214 | # Add exploration noise 215 | if i < 10: 216 | a = np.random.rand(a_dim) * a_bound 217 | else: 218 | # a = net.choose_action(s) 219 | a = net.get_action(s, 0.1) 220 | # a = noise.add_noise(a) 221 | 222 | a = np.clip(a, -a_bound, a_bound) 223 | 224 | s_, r, done, info = env.step(a) 225 | done = False if j == args.max_steps - 1 else done 226 | 227 | net.store_transition((s, a, r, s_, done)) 228 | 229 | s = s_ 230 | ep_reward += r 231 | if j == args.max_steps - 1: 232 | 233 | for _ in range(args.max_steps): 234 | net.learn() 235 | 236 | ep_reward_list.append(ep_reward) 237 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 238 | # 'Explore: %.2f' % var, 239 | "learn step:", net.learn_step) 240 | # if ep_reward > -300:RENDER = True 241 | 242 | # 增加测试部分! 
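# Note on the warm-up exploration earlier in this loop: np.random.rand() draws from
# [0, 1), so `np.random.rand(a_dim) * a_bound` only covers the positive half of the
# action range. A minimal sketch that explores the full (assumed symmetric) range:
#     a = np.random.uniform(-a_bound, a_bound, size=a_dim)
# (equivalently, `env.action_space.sample()` for a gym Box space).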
243 | if i % 20 == 0: 244 | test_ep_reward = net.test_agent(env=env, n=5) 245 | test_ep_reward_list.append(test_ep_reward) 246 | print("-" * 20) 247 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 248 | 'Test Reward: %i' % int(test_ep_reward), 249 | ) 250 | print("-" * 20) 251 | 252 | break 253 | 254 | import matplotlib.pyplot as plt 255 | 256 | plt.plot(ep_reward_list) 257 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 258 | str(args.epochs) + 259 | "_seed" + str(args.seed)) 260 | plt.title(img_name+"_train") 261 | plt.savefig(img_name+".png") 262 | plt.show() 263 | plt.close() 264 | 265 | plt.plot(test_ep_reward_list) 266 | plt.title(img_name + "_test") 267 | plt.savefig(img_name + ".png") 268 | plt.show() 269 | -------------------------------------------------------------------------------- /ddpg_sp/DDPG_per_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | try: 8 | from rl_algorithms.ddpg_sp import core 9 | from rl_algorithms.ddpg_sp.core import get_vars 10 | except: 11 | from ddpg_sp import core 12 | from ddpg_sp.core import get_vars 13 | 14 | 15 | class ReplayBuffer: 16 | """ 17 | A simple FIFO experience replay buffer for TD3 agents. 18 | """ 19 | 20 | def __init__(self, obs_dim, act_dim, size): 21 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 22 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 24 | self.rews_buf = np.zeros(size, dtype=np.float32) 25 | self.done_buf = np.zeros(size, dtype=np.float32) 26 | self.ptr, self.size, self.max_size = 0, 0, size 27 | 28 | def store(self, obs, act, rew, next_obs, done): 29 | self.obs1_buf[self.ptr] = obs 30 | self.obs2_buf[self.ptr] = next_obs 31 | self.acts_buf[self.ptr] = act 32 | self.rews_buf[self.ptr] = rew 33 | self.done_buf[self.ptr] = done 34 | self.ptr = (self.ptr + 1) % self.max_size 35 | self.size = min(self.size + 1, self.max_size) 36 | 37 | def sample_batch(self, batch_size=32): 38 | idxs = np.random.randint(0, self.size, size=batch_size) 39 | return dict(obs1=self.obs1_buf[idxs], 40 | obs2=self.obs2_buf[idxs], 41 | acts=self.acts_buf[idxs], 42 | rews=self.rews_buf[idxs], 43 | done=self.done_buf[idxs]) 44 | 45 | 46 | class DDPG: 47 | def __init__(self, 48 | a_dim, obs_dim, a_bound, 49 | mlp_actor_critic=core.mlp_actor_critic, 50 | ac_kwargs=dict(), seed=0, 51 | 52 | replay_size=int(1e6), gamma=0.99, 53 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 54 | batch_size=100, 55 | act_noise=0.1, target_noise=0.2, 56 | noise_clip=0.5, policy_delay=2, 57 | sess_opt=None, 58 | per_flag=True, 59 | ): 60 | self.per_flag = per_flag 61 | self.learn_step = 0 62 | 63 | self.obs_dim = obs_dim 64 | self.act_dim = a_dim 65 | self.act_limit = a_bound 66 | self.policy_delay = policy_delay 67 | self.action_noise = act_noise 68 | 69 | # Share information about action space with policy architecture 70 | ac_kwargs['action_space'] = a_bound 71 | 72 | # Inputs to computation graph 73 | self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') 74 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 75 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 76 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 77 | 78 | # Main outputs from computation graph 79 | with 
tf.variable_scope('main'): 80 | self.pi, self.q, q_pi = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 81 | 82 | # Target networks 83 | with tf.variable_scope('target'): 84 | # Note that the action placeholder going to actor_critic here is 85 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 86 | pi_targ, _, q_pi_targ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 87 | 88 | # Experience buffer 89 | if self.per_flag: 90 | from memory.sp_per_memory import ReplayBuffer 91 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 92 | 93 | # Count variables 94 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 95 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) 96 | 97 | # Bellman backup for Q function 98 | backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * q_pi_targ) 99 | 100 | # DDPG losses 101 | self.pi_loss = -tf.reduce_mean(q_pi) 102 | 103 | if self.per_flag: 104 | # q_target - q 105 | self.abs_errors = tf.abs(backup - self.q) 106 | self.q_loss = self.ISWeights * tf.reduce_mean((self.q - backup) ** 2) 107 | else: 108 | # 正常的! 109 | self.q_loss = tf.reduce_mean((self.q - backup) ** 2) 110 | 111 | # Separate train ops for pi, q 112 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr) 113 | q_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr) 114 | self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi')) 115 | self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q')) 116 | 117 | # Polyak averaging for target variables 118 | self.target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 119 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 120 | 121 | # Initializing targets to match main variables 122 | target_init = tf.group([tf.assign(v_targ, v_main) 123 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 124 | 125 | if sess_opt: 126 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=sess_opt) 127 | self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 128 | else: 129 | self.sess = tf.Session() 130 | self.sess.run(tf.global_variables_initializer()) 131 | self.sess.run(target_init) 132 | 133 | def get_action(self, s, noise_scale=0): 134 | if not noise_scale: 135 | noise_scale = self.action_noise 136 | a = self.sess.run(self.pi, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 137 | a += noise_scale * np.random.randn(self.act_dim) 138 | return np.clip(a, -self.act_limit, self.act_limit) 139 | 140 | def store_transition(self, transition): 141 | if self.per_flag: 142 | self.replay_buffer.store(transition) 143 | else: 144 | (s, a, r, s_, done) = transition 145 | self.replay_buffer.store(s, a, r, s_, done) 146 | 147 | def test_agent(self, env, max_ep_len=1000, n=5): 148 | ep_reward_list = [] 149 | for j in range(n): 150 | s = env.reset() 151 | ep_reward = 0 152 | for i in range(max_ep_len): 153 | # Take deterministic actions at test time (noise_scale=0) 154 | s, r, d, _ = env.step(self.get_action(s)) 155 | ep_reward += r 156 | ep_reward_list.append(ep_reward) 157 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 158 | return mean_ep_reward 159 | 160 | def learn(self, batch_size=100, actor_lr_input=0.001, 161 | critic_lr_input=0.001,): 162 | if self.per_flag: 163 | tree_idx, batch_memory, ISWeights = self.replay_buffer.sample(batch_size=batch_size) 164 | batch_states, batch_actions, 
batch_rewards, batch_states_, batch_dones = [], [], [], [], [] 165 | for i in range(batch_size): 166 | batch_states.append(batch_memory[i][0]) 167 | batch_actions.append(batch_memory[i][1]) 168 | batch_rewards.append(batch_memory[i][2]) 169 | batch_states_.append(batch_memory[i][3]) 170 | batch_dones.append(batch_memory[i][4]) 171 | 172 | feed_dict = {self.x_ph: np.array(batch_states), 173 | self.x2_ph: np.array(batch_states_), 174 | self.a_ph: np.array(batch_actions), 175 | self.r_ph: np.array(batch_rewards), 176 | self.d_ph: np.array(batch_dones), 177 | self.actor_lr: actor_lr_input, 178 | self.critic_lr: critic_lr_input, 179 | self.ISWeights: ISWeights 180 | } 181 | q_step_ops = [self.q_loss, self.q, 182 | self.train_q_op, 183 | self.abs_errors, 184 | ] 185 | outs = self.sess.run(q_step_ops, feed_dict) 186 | q_loss, q, train_q_op, abs_errors = outs 187 | if self.learn_step % self.policy_delay == 0: 188 | # Delayed policy update 189 | outs = self.sess.run([self.pi_loss, 190 | self.train_pi_op, 191 | self.target_update], 192 | feed_dict) 193 | 194 | self.replay_buffer.batch_update(tree_idx, 195 | abs_errors) # update priority 196 | self.learn_step += 1 197 | return outs 198 | else: 199 | batch = self.replay_buffer.sample_batch(batch_size) 200 | feed_dict = {self.x_ph: batch['obs1'], 201 | self.x2_ph: batch['obs2'], 202 | self.a_ph: batch['acts'], 203 | self.r_ph: batch['rews'], 204 | self.d_ph: batch['done'], 205 | self.actor_lr: actor_lr_input, 206 | self.critic_lr: critic_lr_input, 207 | } 208 | q_step_ops = [self.train_q_op] 209 | 210 | # Q-learning update 211 | outs = self.sess.run([self.q_loss, self.q, self.train_q_op], feed_dict) 212 | # Policy update 213 | outs = self.sess.run([self.pi_loss, self.train_pi_op, self.target_update], 214 | feed_dict) 215 | 216 | self.learn_step += 1 217 | 218 | def load_step_network(self, saver, load_path): 219 | checkpoint = tf.train.get_checkpoint_state(load_path) 220 | if checkpoint and checkpoint.model_checkpoint_path: 221 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 222 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 223 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 224 | else: 225 | print("Could not find old network weights") 226 | 227 | def save_step_network(self, time_step, saver, save_path): 228 | saver.save(self.sess, save_path + 'network', global_step=time_step, 229 | write_meta_graph=False) 230 | 231 | def load_simple_network(self, path): 232 | saver = tf.train.Saver() 233 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 234 | print("restore model successful") 235 | 236 | def save_simple_network(self, save_path): 237 | saver = tf.train.Saver() 238 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 239 | 240 | 241 | if __name__ == '__main__': 242 | import argparse 243 | 244 | random_seed = int(time.time() * 1000 % 1000) 245 | random_seed = 184 246 | parser = argparse.ArgumentParser() 247 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 248 | parser.add_argument('--hid', type=int, default=300) 249 | parser.add_argument('--l', type=int, default=1) 250 | parser.add_argument('--gamma', type=float, default=0.99) 251 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 252 | parser.add_argument('--epochs', type=int, default=3000) 253 | parser.add_argument('--max_steps', type=int, default=1000) 254 | parser.add_argument('--exp_name', type=str, default='ddpg_per_class') 255 | args = parser.parse_args() 256 | 
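# Recap of the prioritized replay used by DDPG.learn() above, following the Memory/SumTree
# code shown earlier in ddpg-movan/DDPG_per.py (memory/sp_per_memory.py is assumed to use
# the same scheme): new transitions enter the sum-tree with the current maximum priority;
# after each update, the priority of a sampled transition is reset from its TD error,
#     p_i = (|delta_i| + eps)^alpha,        P(i) = p_i / sum_k p_k,
# transitions are drawn proportionally to P(i), and reweighted by importance weights
#     w_i = (P(i) / min_j P(j))^(-beta),    with beta annealed toward 1,
# so that, in the standard PER formulation, the critic loss becomes the weighted mean
#     L_Q = mean_i[ w_i * (Q(s_i, a_i) - y_i)^2 ],
# and the fresh |delta_i| values are written back via batch_update(tree_idx, abs_errors).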
257 | env = gym.make(args.env) 258 | env = env.unwrapped 259 | env.seed(args.seed) 260 | 261 | s_dim = env.observation_space.shape[0] 262 | a_dim = env.action_space.shape[0] 263 | a_bound = env.action_space.high[0] 264 | 265 | net = DDPG(a_dim, s_dim, a_bound, 266 | batch_size=100, 267 | sess_opt=0.1 268 | ) 269 | ep_reward_list = [] 270 | test_ep_reward_list = [] 271 | 272 | for i in range(args.epochs): 273 | s = env.reset() 274 | ep_reward = 0 275 | st = time.time() 276 | for j in range(args.max_steps): 277 | 278 | # Add exploration noise 279 | if i < 10: 280 | a = np.random.rand(a_dim) * a_bound 281 | else: 282 | # a = net.choose_action(s) 283 | a = net.get_action(s, 0.1) 284 | # a = noise.add_noise(a) 285 | 286 | a = np.clip(a, -a_bound, a_bound) 287 | 288 | s_, r, done, info = env.step(a) 289 | done = False if j == args.max_steps - 1 else done 290 | 291 | net.store_transition((s, a, r, s_, done)) 292 | 293 | s = s_ 294 | ep_reward += r 295 | if j == args.max_steps - 1: 296 | ep_update_time = time.time() 297 | for _ in range(args.max_steps): 298 | net.learn() 299 | ep_update_time = time.time() - ep_update_time 300 | ep_reward_list.append(ep_reward) 301 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 302 | # 'Explore: %.2f' % var, 303 | "learn step:", net.learn_step, 304 | "ep_time:", np.round(time.time()-st, 3), 305 | "up_time:", np.round(ep_update_time, 3), 306 | ) 307 | # if ep_reward > -300:RENDER = True 308 | 309 | # 增加测试部分! 310 | if i % 20 == 0: 311 | test_ep_reward = net.test_agent(env=env, n=5) 312 | test_ep_reward_list.append(test_ep_reward) 313 | print("-" * 20) 314 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 315 | 'Test Reward: %i' % int(test_ep_reward), 316 | ) 317 | print("-" * 20) 318 | 319 | break 320 | 321 | import matplotlib.pyplot as plt 322 | 323 | plt.plot(ep_reward_list) 324 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 325 | str(args.epochs) + 326 | "_seed" + str(args.seed)) 327 | plt.title(img_name + "_train") 328 | plt.savefig(img_name + ".png") 329 | plt.show() 330 | plt.close() 331 | 332 | plt.plot(test_ep_reward_list) 333 | plt.title(img_name + "_test") 334 | plt.savefig(img_name + ".png") 335 | plt.show() -------------------------------------------------------------------------------- /ddpg_sp/DDPG_sp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from ddpg_sp import core 8 | from ddpg_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for DDPG agents. 
14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | def ddpg(env_fn, actor_critic=core.mlp_actor_critic, 43 | ac_kwargs=dict(), seed=0, 44 | steps_per_epoch=5000, epochs=100, 45 | replay_size=int(1e6), gamma=0.99, 46 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 47 | batch_size=100, start_steps=10000, 48 | act_noise=0.1, max_ep_len=1000, 49 | logger_kwargs=dict(), save_freq=1): 50 | 51 | env, test_env = env_fn(), env_fn() 52 | obs_dim = env.observation_space.shape[0] 53 | act_dim = env.action_space.shape[0] 54 | 55 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 56 | act_limit = env.action_space.high[0] 57 | 58 | # Share information about action space with policy architecture 59 | ac_kwargs['action_space'] = act_limit 60 | 61 | # Inputs to computation graph 62 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 63 | 64 | # Main outputs from computation graph 65 | with tf.variable_scope('main'): 66 | pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) 67 | 68 | # Target networks 69 | with tf.variable_scope('target'): 70 | # Note that the action placeholder going to actor_critic here is 71 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 
72 | pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 73 | 74 | # Experience buffer 75 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 76 | 77 | # Count variables 78 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 79 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) 80 | 81 | # Bellman backup for Q function 82 | backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) 83 | 84 | # DDPG losses 85 | pi_loss = -tf.reduce_mean(q_pi) 86 | q_loss = tf.reduce_mean((q - backup) ** 2) 87 | 88 | # Separate train ops for pi, q 89 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 90 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 91 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 92 | train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) 93 | 94 | # Polyak averaging for target variables 95 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 96 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 97 | 98 | # Initializing targets to match main variables 99 | target_init = tf.group([tf.assign(v_targ, v_main) 100 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 101 | 102 | sess = tf.Session() 103 | sess.run(tf.global_variables_initializer()) 104 | sess.run(target_init) 105 | 106 | def get_action(o, noise_scale): 107 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] 108 | a += noise_scale * np.random.randn(act_dim) 109 | return np.clip(a, -act_limit, act_limit) 110 | 111 | def test_agent(n=10): 112 | for j in range(n): 113 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 114 | while not (d or (ep_len == max_ep_len)): 115 | # Take deterministic actions at test time (noise_scale=0) 116 | o, r, d, _ = test_env.step(get_action(o, 0)) 117 | ep_ret += r 118 | ep_len += 1 119 | 120 | start_time = time.time() 121 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 122 | total_steps = steps_per_epoch * epochs 123 | 124 | ep_ret_list = [] 125 | episode = 0 126 | 127 | # Main loop: collect experience in env and update/log each epoch 128 | for t in range(total_steps): 129 | 130 | """ 131 | Until start_steps have elapsed, randomly sample actions 132 | from a uniform distribution for better exploration. Afterwards, 133 | use the learned policy (with some noise, via act_noise). 134 | """ 135 | if t > start_steps: 136 | a = get_action(o, act_noise) 137 | else: 138 | a = env.action_space.sample() 139 | 140 | # Step the env 141 | o2, r, d, _ = env.step(a) 142 | ep_ret += r 143 | ep_len += 1 144 | 145 | # Ignore the "done" signal if it comes from hitting the time 146 | # horizon (that is, when it's an artificial terminal signal 147 | # that isn't based on the agent's state) 148 | d = False if ep_len == max_ep_len else d 149 | 150 | # Store experience to replay buffer 151 | replay_buffer.store(o, a, r, o2, d) 152 | 153 | # Super critical, easy to overlook step: make sure to update 154 | # most recent observation! 155 | o = o2 156 | 157 | if d or (ep_len == max_ep_len): 158 | """ 159 | Perform all DDPG updates at the end of the trajectory, 160 | in accordance with tuning done by TD3 paper authors. 
161 | """ 162 | episode += 1 163 | ep_ret_list.append(ep_ret) 164 | epoch = t // steps_per_epoch 165 | print("Epoch:", epoch) 166 | print("Episode:", episode) 167 | print("Training Step:", t) 168 | print("Episode Reward:", ep_ret) 169 | 170 | for _ in range(ep_len): 171 | batch = replay_buffer.sample_batch(batch_size) 172 | feed_dict = {x_ph: batch['obs1'], 173 | x2_ph: batch['obs2'], 174 | a_ph: batch['acts'], 175 | r_ph: batch['rews'], 176 | d_ph: batch['done'] 177 | } 178 | 179 | # Q-learning update 180 | outs = sess.run([q_loss, q, train_q_op], feed_dict) 181 | 182 | # Policy update 183 | outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) 184 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 185 | 186 | # End of epoch wrap-up 187 | if t > 0 and t % steps_per_epoch == 0: 188 | test_agent() 189 | 190 | import matplotlib.pyplot as plt 191 | plt.plot(ep_ret_list) 192 | plt.show() 193 | 194 | 195 | if __name__ == '__main__': 196 | import argparse 197 | 198 | parser = argparse.ArgumentParser() 199 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 200 | parser.add_argument('--hid', type=int, default=300) 201 | parser.add_argument('--l', type=int, default=1) 202 | parser.add_argument('--gamma', type=float, default=0.99) 203 | parser.add_argument('--seed', '-s', type=int, default=0) 204 | parser.add_argument('--epochs', type=int, default=600) 205 | parser.add_argument('--exp_name', type=str, default='ddpg') 206 | args = parser.parse_args() 207 | 208 | ddpg(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 209 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 210 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 211 | ) 212 | -------------------------------------------------------------------------------- /ddpg_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/ddpg_sp/__init__.py -------------------------------------------------------------------------------- /ddpg_sp/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | 13 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 14 | for h in hidden_sizes[:-1]: 15 | x = tf.layers.dense(x, units=h, activation=activation) 16 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 17 | 18 | 19 | def get_vars(scope): 20 | return [x for x in tf.global_variables() if scope in x.name] 21 | 22 | 23 | def count_vars(scope): 24 | v = get_vars(scope) 25 | return sum([np.prod(var.shape.as_list()) for var in v]) 26 | 27 | 28 | """ 29 | Actor-Critics 30 | """ 31 | 32 | 33 | def mlp_actor_critic(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu, 34 | output_activation=tf.tanh, action_space=None): 35 | act_dim = a.shape.as_list()[-1] 36 | act_limit = action_space 37 | with tf.variable_scope('pi'): 38 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 39 | with tf.variable_scope('q'): 40 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 41 | with tf.variable_scope('q', reuse=True): 42 | q_pi = tf.squeeze(mlp(tf.concat([x, pi], 
axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 43 | return pi, q, q_pi 44 | -------------------------------------------------------------------------------- /ddpg_sp/ddpg_class_HalfCheetah-v2_epochs200_seed553.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/ddpg_sp/ddpg_class_HalfCheetah-v2_epochs200_seed553.png -------------------------------------------------------------------------------- /ddpg_sp/ddpg_class_HalfCheetah-v2_epochs3000_seed485.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/ddpg_sp/ddpg_class_HalfCheetah-v2_epochs3000_seed485.png -------------------------------------------------------------------------------- /memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/memory/__init__.py -------------------------------------------------------------------------------- /memory/per_memory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/memory/per_memory.py -------------------------------------------------------------------------------- /memory/simple_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Memory: 5 | def __init__(self, memory_size, batch_size, transition_num): 6 | self.memory_list = [] 7 | self.memory_size = memory_size 8 | self.batch_size = batch_size 9 | self.transition_num = transition_num 10 | 11 | def store(self, transition): 12 | if self.memory_num >= self.memory_size: 13 | del self.memory_list[0] 14 | if len(transition) == 5: 15 | s, a, r, s_, t = transition 16 | self.memory_list.append([s, a, r, s_, t]) 17 | if len(transition) == 4: 18 | s, a, r, s_ = transition 19 | self.memory_list.append([s, a, r, s_]) 20 | 21 | def sample(self): 22 | assert self.memory_num >= self.batch_size 23 | if self.memory_num < self.memory_size: 24 | indices = np.random.choice(self.memory_num, size=self.batch_size) 25 | else: 26 | indices = np.random.choice(self.memory_size, self.batch_size) 27 | batch_states, batch_actions, batch_rewards, batch_states_, batch_terminal = [], [], [], [], [] 28 | for i in indices: 29 | batch_states.append(self.memory_list[i][0]) 30 | batch_actions.append(self.memory_list[i][1]) 31 | batch_rewards.append(self.memory_list[i][2]) 32 | batch_states_.append(self.memory_list[i][3]) 33 | if self.transition_num == 5: 34 | batch_terminal.append(self.memory_list[i][4]) 35 | 36 | batch_states = np.array(batch_states) 37 | batch_actions = np.array(batch_actions) 38 | batch_rewards = np.array(batch_rewards) 39 | batch_states_ = np.array(batch_states_) 40 | batch_rewards = batch_rewards[:, np.newaxis] 41 | if self.transition_num==5: 42 | batch_terminal = np.array(batch_terminal) 43 | batch_terminal = batch_terminal[:, np.newaxis] 44 | return batch_states, batch_actions, batch_rewards, batch_states_, batch_terminal 45 | if self.transition_num == 4: 46 | return batch_states, batch_actions, batch_rewards, batch_states_ 47 | 48 | @property 49 | def memory_num(self): 50 | return len(self.memory_list) 51 | 52 | 53 | 
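To make the list-based buffer above easier to picture, here is a minimal usage sketch of the Memory class just defined. The state/action sizes, the number of random transitions, and the assumption that the repository root is on sys.path are illustrative only, not part of the repository:

import numpy as np
from memory.simple_memory import Memory   # assumes the repo root is importable

s_dim, a_dim = 3, 1                        # hypothetical state/action sizes
buf = Memory(memory_size=1000, batch_size=32, transition_num=5)

for _ in range(100):                       # fill with random 5-tuples (s, a, r, s_, done)
    s, s_ = np.random.randn(s_dim), np.random.randn(s_dim)
    a = np.random.randn(a_dim)
    buf.store((s, a, np.random.randn(), s_, False))

bs, ba, br, bs_, bt = buf.sample()         # rewards and terminals come back as (batch, 1) columns
print(bs.shape, ba.shape, br.shape, bs_.shape, bt.shape)   # (32, 3) (32, 1) (32, 1) (32, 3) (32, 1)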
-------------------------------------------------------------------------------- /memory/sp_per_memory.py: -------------------------------------------------------------------------------- 1 | """ 2 | 和per_memory.py相比,per_memory.py是配合莫烦的DDPG算法, 3 | 而sp_per_memory.py是配合spinningup系列打包的强化类写的。 4 | 目前只单独适配了TD3_per_class.py,SAC的还没有适配,sp的DDPG也没适配。 5 | """ 6 | 7 | import numpy as np 8 | 9 | 10 | class SumTree(object): 11 | """ 12 | This SumTree code is a modified version and the original code is from: 13 | https://github.com/jaara/AI-blog/blob/master/SumTree.py 14 | Story data with its priority in the tree. 15 | """ 16 | data_pointer = 0 17 | 18 | def __init__(self, capacity): 19 | self.capacity = capacity # for all priority values 20 | self.tree = np.zeros(2 * capacity - 1) 21 | # [--------------Parent nodes-------------][-------leaves to recode priority-------] 22 | # size: capacity - 1 size: capacity 23 | self.data = list(np.zeros(capacity, dtype=object)) # for all transitions 24 | # [--------------data frame-------------] 25 | # size: capacity 26 | 27 | def add(self, p, transition): 28 | tree_idx = self.data_pointer + self.capacity - 1 29 | self.data[self.data_pointer] = transition # update data_frame 30 | self.update(tree_idx, p) # update tree_frame 31 | 32 | self.data_pointer += 1 33 | if self.data_pointer >= self.capacity: # replace when exceed the capacity 34 | self.data_pointer = 0 35 | 36 | def update(self, tree_idx, p): 37 | change = p - self.tree[tree_idx] 38 | self.tree[tree_idx] = p 39 | # then propagate the change through tree 40 | while tree_idx != 0: # this method is faster than the recursive loop in the reference code 41 | tree_idx = (tree_idx - 1) // 2 42 | self.tree[tree_idx] += change 43 | 44 | def get_leaf(self, v): 45 | """ 46 | Tree structure and array storage: 47 | 48 | Tree index: 49 | 0 -> storing priority sum 50 | / \ 51 | 1 2 52 | / \ / \ 53 | 3 4 5 6 -> storing priority for transitions 54 | 55 | Array type for storing: 56 | [0,1,2,3,4,5,6] 57 | """ 58 | parent_idx = 0 59 | while True: # the while loop is faster than the method in the reference code 60 | cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 61 | cr_idx = cl_idx + 1 62 | if cl_idx >= len(self.tree): # reach bottom, end search 63 | leaf_idx = parent_idx 64 | break 65 | else: # downward search, always search for a higher priority node 66 | if v <= self.tree[cl_idx]: 67 | parent_idx = cl_idx 68 | else: 69 | v -= self.tree[cl_idx] 70 | parent_idx = cr_idx 71 | 72 | data_idx = leaf_idx - self.capacity + 1 73 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 74 | 75 | @property 76 | def total_p(self): 77 | return self.tree[0] # the root 78 | 79 | 80 | class ReplayBuffer(object): # stored as ( s, a, r, s_ ) in SumTree 81 | """ 82 | This Memory class is modified based on the original code from: 83 | https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 84 | 这些可以改的,但是目前我也没时间调参了,就凑活用吧 85 | """ 86 | epsilon = 0.01 # small amount to avoid zero priority 87 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 88 | beta = 0.4 # importance-sampling, from initial value increasing to 1 89 | beta_increment_per_sampling = 0.001 90 | abs_err_upper = 1. 
# clipped abs error 91 | 92 | def __init__(self, 93 | obs_dim=32, 94 | act_dim=3, 95 | size=int(1e6) 96 | ): 97 | self.tree = SumTree(size) 98 | self.full_flag = False 99 | self.memory_num = 0 100 | self.memory_size = size 101 | 102 | def store(self, transition): 103 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 104 | if max_p == 0: 105 | max_p = self.abs_err_upper 106 | self.tree.add(max_p, transition) # set the max p for new p 107 | if self.memory_num < self.memory_size: 108 | self.memory_num += 1 109 | 110 | def sample(self, batch_size=32): 111 | n = batch_size 112 | # n就是batch size! 113 | # np.empty()这是一个随机初始化的一个矩阵! 114 | b_idx, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1)) 115 | b_memory = [] 116 | pri_seg = self.tree.total_p / n # priority segment 117 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1 118 | 119 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight 120 | if min_prob == 0: 121 | min_prob = 0.00001 122 | for i in range(n): 123 | a, b = pri_seg * i, pri_seg * (i + 1) 124 | v = np.random.uniform(a, b) 125 | idx, p, data = self.tree.get_leaf(v) 126 | prob = p / self.tree.total_p 127 | ISWeights[i, 0] = np.power(prob/min_prob, -self.beta) 128 | b_idx[i] = idx 129 | b_memory.append(data) 130 | return b_idx, b_memory, ISWeights 131 | 132 | def batch_update(self, tree_idx, abs_errors): 133 | abs_errors += self.epsilon # convert to abs and avoid 0 134 | clipped_errors = np.minimum(abs_errors, self.abs_err_upper) 135 | ps = np.power(clipped_errors, self.alpha) 136 | for ti, p in zip(tree_idx, ps): 137 | self.tree.update(ti, p) 138 | -------------------------------------------------------------------------------- /noise/__init__.py: -------------------------------------------------------------------------------- 1 | from .ou_noise import OU_noise 2 | from .simple_noise import Simple_noise -------------------------------------------------------------------------------- /noise/ou_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class OU_noise(object): 5 | def __init__(self, num_actions, action_low_bound, action_high_bound, dt, 6 | mu=0.0, theta=0.15, max_sigma=2.0, min_sigma=0.1): 7 | self.mu = mu # 0.0 8 | self.theta = theta # 0.15 9 | self.sigma = max_sigma # 0.3 10 | self.max_sigma = max_sigma # 0.3 11 | self.min_sigma = min_sigma # 0.1 12 | self.dt = dt # 0.001 13 | self.num_actions = num_actions # 1 14 | self.action_low = action_low_bound # -2 15 | self.action_high = action_high_bound # 2 16 | self.reset() 17 | 18 | def reset(self): 19 | self.state = np.zeros(self.num_actions) 20 | 21 | # self.state = np.zeros(self.num_actions) 22 | def state_update(self): 23 | x = self.state 24 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.num_actions) # np.random.randn()生成0,1的随机数 25 | self.state = x + dx 26 | 27 | def add_noise(self, action): 28 | self.state_update() 29 | state = self.state 30 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, self.dt) 31 | return np.clip(action + state, self.action_low, self.action_high) 32 | -------------------------------------------------------------------------------- /noise/simple_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Simple_noise(object): 5 | def __init__(self, num_actions, action_low_bound, action_high_bound, 6 | dt=0.0001, 
7 | mu=0.0, theta=0.15, max_sigma=2.0, min_sigma=0.1):
8 | self.mu = mu # 0.0
9 | self.theta = theta # 0.15
10 | self.sigma = max_sigma # 0.3
11 | self.max_sigma = max_sigma # 0.3
12 | self.min_sigma = min_sigma # 0.1
13 | self.dt = dt # 0.001
14 | self.num_actions = num_actions # 1
15 | self.action_low = action_low_bound # -2
16 | self.action_high = action_high_bound # 2
17 |
18 | def add_noise(self, action):
19 | action += self.max_sigma * np.random.randn(self.num_actions)
20 | return np.clip(action, self.action_low, self.action_high)
21 |
-------------------------------------------------------------------------------- /readme.md: --------------------------------------------------------------------------------
1 | This repository is basically no longer maintained; please move to the newer library:
2 | https://github.com/kaixindelele/DRLib
3 |
4 |
5 | File naming examples:
6 |
7 | sac_sp.py: files with the _sp suffix follow the spinup packaging style, i.e. the RL algorithm is wrapped as a plain function;
8 |
9 | sac_class.py: files with class in the name wrap the algorithm into a class, so it can be called directly;
10 |
11 | sac_auto_per_class: files with per are classes in which prioritized experience replay can be switched on or off; PER does not always help, so use it with care.
12 |
13 | As for sac_auto, also known as sac2 or adaptive SAC: the alpha hyperparameter is learned inside the network, which generally makes it easier to use than plain SAC.
14 |
15 |
16 | --
17 |
18 | 2020-12-09
19 |
20 | It turns out this is the project of mine that has collected the most stars.
21 |
22 | I just went through all the files and found that prioritized experience replay (PER) has not been factored out on its own; it is still bundled with DDPG, which makes it incompatible with TD3 and SAC.
23 |
24 | sac-auto has not been committed either.
25 |
26 | Hindsight experience replay (HER) is not implemented (I still have not found good hyperparameters for it, which is frustrating; I assumed HER would be a silver bullet, but it turned out to be underwhelming).
27 |
28 | Also, still working on tf1 at this point feels like backing the losing side at the very last moment.
29 |
30 | Hard to take.
31 |
32 | --
33 |
34 |
35 |
36 | # DRL-tensorflow
37 | My DRL library with tensorflow1.14
38 | core codes based on https://github.com/openai/spinningup
39 |
40 | My job is to wrap the algorithm functions into classes so that they are easy to call,
41 | while keeping the performance of the original code in gym environments.
42 |
43 | The library keeps growing: the three main mainstream off-policy deep RL algorithms have all been packaged successfully.
44 | **Everything currently runs in the simplest mode: just open the corresponding algo_class.py file and run it.**
45 |
46 | Result display and performance comparison are still lacking, because I have not yet fully digested spinning-up's logger class and cannot embed that functionality more conveniently.
47 | As for plotting, only a bare-bones matplotlib figure is available for now.
48 |
49 | I will add more features when I have time.
50 |
51 | ----
52 | The logger and plot features have now been added; the implementation lives in the sp_utils folder, lifted from the spinup code with minor modifications.
53 | You can try them out directly in the run_in_gym folder, which is very convenient.
54 | These two spinup utilities can also be pulled into your own packages; that saves a lot of effort compared with writing your own.
55 |
56 |
57 | Also, the three algorithms I wrapped may not be perfect: they seem fine when tested in gym, but they fail to converge in my robot environments.
58 | If anyone finds a bug, please let me know.
59 |
60 |
61 | ----
62 |
63 |
64 |
65 |
66 | If you run into any bugs while using the library, feel free to open an issue.
67 | If it helps you, a star would be much appreciated.
68 |
69 | I will see whether I can add an LSTM version later; I have already seen someone else's implementation and plan to integrate it into this package.
-------------------------------------------------------------------------------- /run_in_gym/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/run_in_gym/__init__.py
-------------------------------------------------------------------------------- /run_in_gym/launch_with_gym.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import gym
4 | import time
5 | import numpy as np
6 | import tensorflow as tf
7 | import gym
8 | import os
9 | import time
10 | import sys
11 |
12 | sys.path.append("../")
13 |
14 |
15 | def run(seed=184,
16 | algo='td3',
17 | per_flag=True,
18 | epochs=3000,
19 | gamma=0.99,
20 | RlNet=None,
21 | noise_size=0.1
22 | ):
23 | import argparse
24 | random_seed = seed
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument('--env', type=str, default='HalfCheetah-v2')
27 | parser.add_argument('--hid', type=int, default=300)
28 | parser.add_argument('--l', type=int, default=1)
29 | parser.add_argument('--gamma', type=float, default=gamma)
30
| parser.add_argument('--seed', '-s', type=int, default=random_seed) 31 | parser.add_argument('--epochs', type=int, default=epochs) 32 | parser.add_argument('--max_steps', type=int, default=1000) 33 | if per_flag: 34 | exp_name = algo+"_per" 35 | else: 36 | exp_name = algo 37 | parser.add_argument('--exp_name', type=str, default=exp_name) 38 | args = parser.parse_args() 39 | 40 | env = gym.make(args.env) 41 | env = env.unwrapped 42 | env.seed(args.seed) 43 | 44 | s_dim = env.observation_space.shape[0] 45 | a_dim = env.action_space.shape[0] 46 | a_bound = env.action_space.high[0] 47 | 48 | 49 | net = RlNet(a_dim, s_dim, a_bound, 50 | gamma=gamma, 51 | sess_opt=0.1, 52 | per_flag=per_flag 53 | ) 54 | ep_reward_list = [] 55 | test_ep_reward_list = [] 56 | 57 | for i in range(args.epochs): 58 | s = env.reset() 59 | ep_reward = 0 60 | st = time.time() 61 | for j in range(args.max_steps): 62 | 63 | # Add exploration noise 64 | if i < 10: 65 | a = np.random.rand(a_dim) * a_bound 66 | else: 67 | a = net.get_action(s, noise_size) 68 | 69 | a = np.clip(a, -a_bound, a_bound) 70 | 71 | s_, r, done, info = env.step(a) 72 | done = False if j == args.max_steps - 1 else done 73 | 74 | net.store_transition((s, a, r, s_, done)) 75 | 76 | s = s_ 77 | ep_reward += r 78 | if j == args.max_steps - 1: 79 | up_st = time.time() 80 | for _ in range(args.max_steps): 81 | net.learn() 82 | 83 | ep_update_time = time.time() - up_st 84 | 85 | ep_reward_list.append(ep_reward) 86 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 87 | # 'Explore: %.2f' % var, 88 | "learn step:", net.learn_step, 89 | "ep_time:", np.round(time.time()-st, 3), 90 | "up_time:", np.round(ep_update_time, 3), 91 | ) 92 | # if ep_reward > -300:RENDER = True 93 | 94 | # 增加测试部分! 95 | if i % 20 == 0: 96 | test_ep_reward = net.test_agent(env=env, n=5) 97 | test_ep_reward_list.append(test_ep_reward) 98 | print("-" * 20) 99 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 100 | 'Test Reward: %i' % int(test_ep_reward), 101 | ) 102 | print("-" * 20) 103 | 104 | break 105 | 106 | import matplotlib.pyplot as plt 107 | 108 | plt.plot(ep_reward_list) 109 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 110 | str(args.epochs) + 111 | "_seed" + str(args.seed)) 112 | plt.title(img_name + "_train") 113 | plt.savefig(img_name + ".png") 114 | plt.show() 115 | plt.close() 116 | 117 | plt.plot(test_ep_reward_list) 118 | plt.title(img_name + "_test") 119 | plt.savefig(img_name + ".png") 120 | plt.show() 121 | 122 | 123 | if __name__ == '__main__': 124 | algo_index = 1 125 | seed = 184 126 | per_flag = True 127 | 128 | rl_algo_list = ["DDPG", "SAC_AUTO", "TD3", "SAC"] 129 | import rl_algorithms 130 | try: 131 | net = eval("rl_algorithms."+rl_algo_list[algo_index]) 132 | except: 133 | pass 134 | 135 | run(seed=seed, 136 | algo=rl_algo_list[algo_index], 137 | per_flag=per_flag, 138 | epochs=3000, 139 | gamma=0.99, 140 | RlNet=net, 141 | noise_size=0.1) 142 | -------------------------------------------------------------------------------- /run_in_gym/run_gym_sac_class.py: -------------------------------------------------------------------------------- 1 | # 导入一些其他的必要包 2 | import numpy as np 3 | import time 4 | import argparse 5 | import os 6 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 8 | import tensorflow as tf 9 | import sys 10 | 11 | sys.path.append("../") 12 | # 选择强化算法 13 | from sac_sp.SAC_class import SAC 14 | 15 | # 导入log包! 
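# The EpochLogger imported next is used in three steps further down this script:
# logger.store(EpRet=...) / logger.store(TestEpRet=...) accumulate values during an
# epoch, logger.log_tabular(...) declares which statistics to report (with min/max
# where requested), and logger.dump_tabular() writes one row of results to the
# output_dir configured through setup_logger_kwargs().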
16 | from sp_utils.logx import EpochLogger 17 | from sp_utils.logx import setup_logger_kwargs 18 | 19 | # 选择环境 20 | import gym 21 | 22 | 23 | def test_agent(args, net, env, n=5, logger=None): 24 | ep_reward_list = [] 25 | for j in range(n): 26 | obs = env.reset() 27 | ep_reward = 0 28 | for i in range(args.max_steps): 29 | # Take deterministic actions at test time (noise_scale=0) 30 | s = obs 31 | 32 | a = net.get_action(s) 33 | obs, r, d, _ = env.step(a) 34 | 35 | ep_reward += r 36 | if logger: 37 | logger.store(TestEpRet=ep_reward) 38 | 39 | ep_reward_list.append(ep_reward) 40 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 41 | if logger: 42 | return mean_ep_reward, logger 43 | else: 44 | return mean_ep_reward 45 | 46 | 47 | def main(): 48 | 49 | # 确定随机种子 50 | random_seed = int(time.time() * 10000 % 10000) 51 | # 设置传参和默认值 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 54 | parser.add_argument('--batch_size', type=int, default=128) 55 | parser.add_argument('--noise_scale', type=float, default=0.1) 56 | parser.add_argument('--alpha', type=float, default=0.1) 57 | parser.add_argument('--gamma', type=float, default=0.99) 58 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 59 | # 默认的epochs=5000! 60 | parser.add_argument('--epochs', type=int, default=3000) 61 | parser.add_argument('--max_steps', type=int, default=200) 62 | # 实验名字需要改对应起来 63 | parser.add_argument('--exp_name', type=str, default='sac_') 64 | 65 | args = parser.parse_args() 66 | 67 | tf.reset_default_graph() 68 | 69 | # 实例化log函数! 70 | exp_name = 'sac_{}_alpha_{}_noise_{}'.format( 71 | args.env, 72 | args.alpha, 73 | args.noise, 74 | ) 75 | 76 | logger_kwargs = setup_logger_kwargs(exp_name=exp_name, 77 | seed=args.seed, 78 | output_dir="../sp_data_logs/") 79 | # 将字典传进去 80 | logger = EpochLogger(**logger_kwargs) 81 | 82 | print("locals():", locals()) 83 | logger.save_config(locals()) 84 | 85 | # 创建虚拟环境 86 | env = gym.make(args.env) 87 | # 设置环境的随机种子:robosuite可能没有 88 | # env.seed(args.seed) 89 | tf.set_random_seed(args.seed) 90 | np.random.seed(args.seed) 91 | 92 | obs = env.reset() 93 | perception_dim = env.observation_space.shape[0] 94 | 95 | # 确定state和action维度和动作上限 96 | s_dim = perception_dim 97 | a_dim = env.action_space.shape[0] 98 | a_bound = env.action_space.high[0] 99 | 100 | # 创建强化算法类,里面还有一些参数,需要看里面的代码 101 | # SAC主要调整alpha,从0.1到0.25,找到最佳的一组 102 | net = SAC(a_dim, s_dim, a_bound, 103 | alpha=args.alpha, 104 | batch_size=args.batch_size, 105 | ) 106 | 107 | # 设定保存的一些参数. 108 | ep_reward_list = [] 109 | test_ep_reward_list = [] 110 | start_time = time.time() 111 | # 主循环 112 | for i in range(args.epochs): 113 | # 环境的重置和一些变量的归零 114 | obs = env.reset() 115 | s = obs 116 | ep_reward = 0 117 | episode_time = time.time() 118 | for j in range(args.max_steps): 119 | # 选择动作 120 | # Add exploration noise 121 | a = net.get_action(s, args.noise_scale) 122 | 123 | a = np.clip(a, -a_bound, a_bound) 124 | 125 | obs, r, done, info = env.step(a) 126 | 127 | s_ = obs 128 | net.store_transition((s, a, r, s_, done)) 129 | 130 | s = s_ 131 | ep_reward += r 132 | if j == args.max_steps - 1: 133 | # 存episode reward.这里安心的存进去就好,到时候它会自己计算均值 134 | logger.store(EpRet=ep_reward) 135 | for _ in range(args.max_steps): 136 | net.learn() 137 | 138 | ep_reward_list.append(ep_reward) 139 | print('Episode:', i, ' Reward: %0.4f' % float(ep_reward), 140 | "learn step:", net.learn_step) 141 | 142 | # 增加测试部分! 
143 | if i % 20 == 0: 144 | test_ep_reward, logger = test_agent(args=args, 145 | net=net, 146 | env=env, 147 | n=5, 148 | logger=logger 149 | ) 150 | test_ep_reward_list.append(test_ep_reward) 151 | 152 | logger.log_tabular('Epoch', i) 153 | # 不用with_min_and_max的时候,就不会有AverageEpRet这个值~画图的时候会找不到~ 154 | # 每个test都打印一次,如果已经存过的就不用管了,没存过的,赋值就行 155 | logger.log_tabular('EpRet', with_min_and_max=True) 156 | logger.log_tabular('TestEpRet', with_min_and_max=True) 157 | logger.log_tabular('TotalEnvInteracts', i*args.max_steps+j) 158 | logger.log_tabular('TotalTime', time.time() - start_time) 159 | # logger.log_tabular('EpisopeTime', time.time() - episode_time) 160 | logger.dump_tabular() 161 | 162 | break 163 | 164 | 165 | if __name__ == '__main__': 166 | 167 | main() 168 | -------------------------------------------------------------------------------- /sac_auto/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sac_auto/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | def gaussian_likelihood(x, mu, log_std): 25 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 26 | return tf.reduce_sum(pre_sum, axis=1) 27 | 28 | def clip_but_pass_gradient(x, l=-1., u=1.): 29 | clip_up = tf.cast(x > u, tf.float32) 30 | clip_low = tf.cast(x < l, tf.float32) 31 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 32 | 33 | 34 | """ 35 | Policies 36 | """ 37 | 38 | LOG_STD_MAX = 2 39 | LOG_STD_MIN = -20 40 | 41 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 42 | act_dim = a.shape.as_list()[-1] 43 | net = mlp(x, list(hidden_sizes), activation, activation) 44 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 45 | 46 | """ 47 | Because algorithm maximizes trade-off of reward and entropy, 48 | entropy must be unique to state---and therefore log_stds need 49 | to be a neural network output instead of a shared-across-states 50 | learnable parameter vector. But for deep Relu and other nets, 51 | simply sticking an activationless dense layer at the end would 52 | be quite bad---at the beginning of training, a randomly initialized 53 | net could produce extremely large values for the log_stds, which 54 | would result in some actions being either entirely deterministic 55 | or too random to come back to earth. Either of these introduces 56 | numerical instability which could break the algorithm. To 57 | protect against that, we'll constrain the output range of the 58 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. 
This is 59 | slightly different from the trick used by the original authors of 60 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 61 | I prefer this approach because it allows gradient propagation 62 | through log_std where clipping wouldn't, but I don't know if 63 | it makes much of a difference. 64 | """ 65 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 66 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 67 | 68 | std = tf.exp(log_std) 69 | pi = mu + tf.random_normal(tf.shape(mu)) * std 70 | logp_pi = gaussian_likelihood(pi, mu, log_std) 71 | return mu, pi, logp_pi 72 | 73 | def apply_squashing_func(mu, pi, logp_pi): 74 | mu = tf.tanh(mu) 75 | pi = tf.tanh(pi) 76 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 77 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 78 | return mu, pi, logp_pi 79 | 80 | 81 | """ 82 | Actor-Critics 83 | """ 84 | def mlp_actor_critic(x, x2, a, 85 | hidden_sizes=(400,300), 86 | activation=tf.nn.relu, 87 | output_activation=None, 88 | policy=mlp_gaussian_policy, 89 | action_space=None): 90 | # policy 91 | with tf.variable_scope('pi'): 92 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 93 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 94 | with tf.variable_scope('pi', reuse=True): 95 | mu2, pi2, logp_pi2 = policy(x2, a, hidden_sizes, activation, output_activation) 96 | mu2, pi2, logp_pi2 = apply_squashing_func(mu2, pi2, logp_pi2) 97 | 98 | # make sure actions are in correct range 99 | action_scale = action_space 100 | mu *= action_scale 101 | pi *= action_scale 102 | 103 | # vfs 104 | # tf.squeeze( shape(?,1), axis=1 ) = shape(?,) 105 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 106 | with tf.variable_scope('q1'): 107 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 108 | with tf.variable_scope('q1', reuse=True): 109 | q1_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 110 | with tf.variable_scope('q2'): 111 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 112 | with tf.variable_scope('q2', reuse=True): 113 | q2_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 114 | 115 | return mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi -------------------------------------------------------------------------------- /sac_auto/sac_auto_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import os 5 | import time 6 | import sys 7 | 8 | sys.path.append("../") 9 | try: 10 | from rl_algorithms.sac_auto import core 11 | from rl_algorithms.sac_auto.core import get_vars 12 | except: 13 | from sac_auto import core 14 | from sac_auto.core import get_vars 15 | 16 | 17 | class ReplayBuffer: 18 | """ 19 | A simple FIFO experience replay buffer for TD3 agents. 
20 | """ 21 | 22 | def __init__(self, obs_dim, act_dim, size): 23 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 24 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 25 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 26 | self.rews_buf = np.zeros(size, dtype=np.float32) 27 | self.done_buf = np.zeros(size, dtype=np.float32) 28 | self.ptr, self.size, self.max_size = 0, 0, size 29 | 30 | def store(self, obs, act, rew, next_obs, done): 31 | self.obs1_buf[self.ptr] = obs 32 | self.obs2_buf[self.ptr] = next_obs 33 | self.acts_buf[self.ptr] = act 34 | self.rews_buf[self.ptr] = rew 35 | self.done_buf[self.ptr] = done 36 | self.ptr = (self.ptr + 1) % self.max_size 37 | self.size = min(self.size + 1, self.max_size) 38 | 39 | def sample_batch(self, batch_size=32): 40 | idxs = np.random.randint(0, self.size, size=batch_size) 41 | return dict(obs1=self.obs1_buf[idxs], 42 | obs2=self.obs2_buf[idxs], 43 | acts=self.acts_buf[idxs], 44 | rews=self.rews_buf[idxs], 45 | done=self.done_buf[idxs]) 46 | 47 | 48 | class SAC: 49 | def __init__(self, 50 | a_dim, obs_dim, a_bound, 51 | mlp_actor_critic=core.mlp_actor_critic, 52 | ac_kwargs=dict(), seed=0, 53 | replay_size=int(1e6), gamma=0.99, 54 | polyak=0.995, alpha="auto", 55 | # pi_lr=1e-4, q_lr=1e-4, 56 | # batch_size=100, 57 | # act_noise=0.1, target_noise=0.2, noise_clip=0.5, 58 | # policy_delay=2, 59 | sess_opt=0.1, 60 | ): 61 | 62 | self.learn_step = 0 63 | 64 | self.obs_dim = obs_dim 65 | self.act_dim = a_dim 66 | self.act_limit = a_bound 67 | self.policy_delay = policy_delay 68 | # self.action_noise = act_noise 69 | 70 | # Share information about action space with policy architecture 71 | ac_kwargs['action_space'] = a_bound 72 | 73 | # Inputs to computation graph 74 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 75 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 76 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 77 | 78 | # Main outputs from computation graph 79 | with tf.variable_scope('main'): 80 | self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, = mlp_actor_critic(self.x_ph, 81 | self.x2_ph, 82 | self.a_ph, 83 | **ac_kwargs) 84 | 85 | # Target value network 86 | with tf.variable_scope('target'): 87 | _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = mlp_actor_critic(self.x2_ph, 88 | self.x2_ph, 89 | self.a_ph, 90 | **ac_kwargs) 91 | 92 | # Experience buffer 93 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, 94 | act_dim=self.act_dim, 95 | size=replay_size) 96 | 97 | # Count variables 98 | var_counts = tuple(core.count_vars(scope) for scope in 99 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 100 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 101 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 102 | # 重新修改下面这段! 
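# The block that follows implements SAC's automatic temperature tuning:
# target_entropy is set to -act_dim, a single trainable scalar log_alpha gives
# alpha = exp(log_alpha), and minimizing
#     alpha_loss = E[ -log_alpha * stop_gradient(logp_pi + target_entropy) ]
# raises alpha whenever the policy's entropy drops below the target and lowers it
# when the policy is more random than required, so the entropy bonus is adjusted
# on the fly instead of being fixed by a hand-tuned alpha.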
103 | target_entropy = (-np.prod(a_dim)) 104 | 105 | log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) 106 | alpha = tf.exp(log_alpha) 107 | 108 | alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) 109 | 110 | alpha_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, 111 | name='alpha_optimizer') 112 | train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) 113 | 114 | # Min Double-Q: 115 | min_q_pi = tf.minimum(q1_pi_, q2_pi_) 116 | 117 | # Targets for Q and V regression 118 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2) 119 | q_backup = self.r_ph + gamma * (1 - self.d_ph) * v_backup 120 | 121 | # Soft actor-critic losses 122 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 123 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 124 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 125 | value_loss = q1_loss + q2_loss 126 | 127 | # Policy train op 128 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 129 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr) 130 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 131 | 132 | # Value train op 133 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 134 | value_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr) 135 | value_params = get_vars('main/q') 136 | with tf.control_dependencies([train_pi_op]): 137 | train_value_op = value_optimizer.minimize(value_loss, 138 | var_list=value_params) 139 | 140 | # Polyak averaging for target variables 141 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 142 | with tf.control_dependencies([train_value_op]): 143 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 144 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 145 | 146 | # All ops to call during one training step 147 | self.step_ops = [pi_loss, 148 | q1_loss, q2_loss, 149 | q1, q2, 150 | logp_pi, alpha, 151 | train_pi_op, 152 | train_value_op, 153 | target_update, 154 | train_alpha_op] 155 | 156 | # Initializing targets to match main variables 157 | target_init = tf.group([tf.assign(v_targ, v_main) 158 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 159 | 160 | if sess_opt: 161 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=sess_opt) 162 | self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 163 | else: 164 | self.sess = tf.Session() 165 | self.sess.run(tf.global_variables_initializer()) 166 | self.sess.run(target_init) 167 | 168 | def get_action(self, s, noise_scale=0): 169 | if not noise_scale: 170 | act_op = self.mu 171 | else: 172 | act_op = self.pi 173 | a = self.sess.run(act_op, 174 | feed_dict={self.x_ph: s.reshape(1, -1)})[0] 175 | return np.clip(a, -self.act_limit, self.act_limit) 176 | 177 | def store_transition(self, transition): 178 | (s, a, r, s_, done) = transition 179 | self.replay_buffer.store(s, a, r, s_, done) 180 | 181 | def test_agent(self, env, max_ep_len=200, n=5, logger=None): 182 | ep_reward_list = [] 183 | for j in range(n): 184 | s = env.reset() 185 | ep_reward = 0 186 | for i in range(max_ep_len): 187 | # Take deterministic actions at test time (noise_scale=0) 188 | a = self.get_action(s) 189 | s, r, d, _ = env.step(a) 190 | ep_reward += r 191 | ep_reward_list.append(ep_reward) 192 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 193 | if logger: 194 
| logger.store(TestEpRet=mean_ep_reward) 195 | if logger: 196 | return mean_ep_reward, logger 197 | else: 198 | return mean_ep_reward 199 | 200 | def learn(self, batch_size=100, 201 | actor_lr_input=0.001, 202 | critic_lr_input=0.001, 203 | ): 204 | 205 | batch = self.replay_buffer.sample_batch(batch_size) 206 | feed_dict = {self.x_ph: batch['obs1'], 207 | self.x2_ph: batch['obs2'], 208 | self.a_ph: batch['acts'], 209 | self.r_ph: batch['rews'], 210 | self.d_ph: batch['done'], 211 | self.actor_lr: actor_lr_input, 212 | self.critic_lr: critic_lr_input, 213 | } 214 | outs = self.sess.run(self.step_ops, 215 | feed_dict) 216 | self.learn_step += 1 217 | return outs 218 | 219 | def load_step_network(self, saver, load_path): 220 | checkpoint = tf.train.get_checkpoint_state(load_path) 221 | if checkpoint and checkpoint.model_checkpoint_path: 222 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 223 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 224 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 225 | else: 226 | print("Could not find old network weights") 227 | 228 | def save_step_network(self, time_step, saver, save_path): 229 | saver.save(self.sess, save_path + 'network', global_step=time_step, 230 | write_meta_graph=False) 231 | 232 | def load_simple_network(self, path): 233 | saver = tf.train.Saver() 234 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 235 | print("restore model successful") 236 | 237 | def save_simple_network(self, save_path): 238 | saver = tf.train.Saver() 239 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 240 | 241 | 242 | if __name__ == '__main__': 243 | import argparse 244 | 245 | random_seed = int(time.time() * 1000 % 1000) 246 | random_seed = 184 247 | parser = argparse.ArgumentParser() 248 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 249 | parser.add_argument('--hid', type=int, default=300) 250 | parser.add_argument('--l', type=int, default=1) 251 | parser.add_argument('--gamma', type=float, default=0.99) 252 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 253 | parser.add_argument('--epochs', type=int, default=3000) 254 | parser.add_argument('--max_steps', type=int, default=1000) 255 | parser.add_argument('--exp_name', type=str, default='sac_auto_class') 256 | args = parser.parse_args() 257 | 258 | env = gym.make(args.env) 259 | env = env.unwrapped 260 | env.seed(args.seed) 261 | 262 | s_dim = env.observation_space.shape[0] 263 | a_dim = env.action_space.shape[0] 264 | a_bound = env.action_space.high[0] 265 | 266 | net = SAC(a_dim, s_dim, a_bound, 267 | # batch_size=100, 268 | sess_opt=0.1 269 | ) 270 | ep_reward_list = [] 271 | test_ep_reward_list = [] 272 | 273 | for i in range(args.epochs): 274 | s = env.reset() 275 | ep_reward = 0 276 | st = time.time() 277 | for j in range(args.max_steps): 278 | 279 | # Add exploration noise 280 | if i < 10: 281 | a = np.random.rand(a_dim) * a_bound 282 | else: 283 | a = net.get_action(s, 0.1) 284 | 285 | a = np.clip(a, -a_bound, a_bound) 286 | 287 | s_, r, done, info = env.step(a) 288 | done = False if j == args.max_steps - 1 else done 289 | 290 | net.store_transition((s, a, r, s_, done)) 291 | 292 | s = s_ 293 | ep_reward += r 294 | if j == args.max_steps - 1: 295 | ep_update_time = time.time() 296 | for _ in range(args.max_steps): 297 | net.learn() 298 | ep_update_time = time.time() - ep_update_time 299 | ep_reward_list.append(ep_reward) 300 | print('Episode:', i, ' Reward: %i' 
% int(ep_reward), 301 | # 'Explore: %.2f' % var, 302 | "learn step:", net.learn_step, 303 | "ep_time:", np.round(time.time()-st, 3), 304 | "up_time:", np.round(ep_update_time, 3), 305 | ) 306 | # if ep_reward > -300:RENDER = True 307 | 308 | # 增加测试部分! 309 | if i % 20 == 0: 310 | test_ep_reward = net.test_agent(env=env, n=5) 311 | test_ep_reward_list.append(test_ep_reward) 312 | print("-" * 20) 313 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 314 | 'Test Reward: %i' % int(test_ep_reward), 315 | ) 316 | print("-" * 20) 317 | 318 | break 319 | 320 | import matplotlib.pyplot as plt 321 | 322 | plt.plot(ep_reward_list) 323 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 324 | str(args.epochs) + 325 | "_seed" + str(args.seed)) 326 | plt.title(img_name + "_train") 327 | plt.savefig(img_name + ".png") 328 | plt.show() 329 | plt.close() 330 | 331 | plt.plot(test_ep_reward_list) 332 | plt.title(img_name + "_test") 333 | plt.savefig(img_name + ".png") 334 | plt.show() -------------------------------------------------------------------------------- /sac_sp/SAC_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from sac_sp import core 8 | from sac_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for SAC agents. 14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | class SAC: 43 | def __init__(self, 44 | a_dim, obs_dim, a_bound, 45 | mlp_actor_critic=core.mlp_actor_critic, 46 | ac_kwargs=dict(), seed=0, 47 | 48 | replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, alpha=0.2, 50 | pi_lr=1e-3, q_lr=1e-3, 51 | batch_size=100, 52 | # start_steps=10000, 53 | act_noise=0.1, target_noise=0.2, 54 | noise_clip=0.5, policy_delay=2, 55 | # max_ep_len=1000, 56 | # logger_kwargs=dict(), save_freq=1 57 | ): 58 | 59 | self.learn_step = 0 60 | 61 | self.obs_dim = obs_dim 62 | self.act_dim = a_dim 63 | self.act_limit = a_bound 64 | self.policy_delay = policy_delay 65 | self.action_noise = act_noise 66 | 67 | # Share information about action space with policy architecture 68 | ac_kwargs['action_space'] = a_bound 69 | 70 | # Inputs to computation graph 71 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 72 | 73 | # Main outputs from computation graph 74 | with tf.variable_scope('main'): 75 | self.mu, self.pi, 
logp_pi, q1, q2, q1_pi, q2_pi, v = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 76 | 77 | # Target value network 78 | with tf.variable_scope('target'): 79 | _, _, _, _, _, _, _, v_targ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 80 | 81 | # Experience buffer 82 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 83 | 84 | # Count variables 85 | var_counts = tuple(core.count_vars(scope) for scope in 86 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 87 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 88 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 89 | 90 | # Min Double-Q: 91 | min_q_pi = tf.minimum(q1_pi, q2_pi) 92 | 93 | # Targets for Q and V regression 94 | q_backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * v_targ) 95 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 96 | 97 | # Soft actor-critic losses 98 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 99 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 100 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 101 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 102 | value_loss = q1_loss + q2_loss + v_loss 103 | 104 | # Policy train op 105 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 106 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 107 | self.train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 108 | 109 | # Value train op 110 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 111 | value_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 112 | value_params = get_vars('main/q') + get_vars('main/v') 113 | with tf.control_dependencies([self.train_pi_op]): 114 | self.train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 115 | 116 | # Polyak averaging for target variables 117 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 118 | with tf.control_dependencies([self.train_value_op]): 119 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 120 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 121 | 122 | # All ops to call during one training step 123 | self.step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 124 | self.train_pi_op, self.train_value_op, target_update] 125 | 126 | # Initializing targets to match main variables 127 | target_init = tf.group([tf.assign(v_targ, v_main) 128 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 129 | 130 | self.sess = tf.Session() 131 | self.sess.run(tf.global_variables_initializer()) 132 | self.sess.run(target_init) 133 | 134 | def get_action(self, s, noise_scale=0): 135 | if not noise_scale: 136 | act_op = self.mu 137 | else: 138 | act_op = self.pi 139 | a = self.sess.run(act_op, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 140 | 141 | return np.clip(a, -self.act_limit, self.act_limit) 142 | 143 | def store_transition(self, transition): 144 | (s, a, r, s_, done) = transition 145 | self.replay_buffer.store(s, a, r, s_, done) 146 | 147 | def test_agent(self, env, max_ep_len=1000, n=5): 148 | ep_reward_list = [] 149 | for j in range(n): 150 | s = env.reset() 151 | ep_reward = 0 152 | for i in range(max_ep_len): 153 | # Take deterministic actions at test time (noise_scale=0) 154 | s, r, d, _ = env.step(self.get_action(s)) 155 | ep_reward += r 156 | ep_reward_list.append(ep_reward) 157 | mean_ep_reward = 
np.mean(np.array(ep_reward_list)) 158 | return mean_ep_reward 159 | 160 | def learn(self, batch_size=100): 161 | 162 | batch = self.replay_buffer.sample_batch(batch_size) 163 | feed_dict = {self.x_ph: batch['obs1'], 164 | self.x2_ph: batch['obs2'], 165 | self.a_ph: batch['acts'], 166 | self.r_ph: batch['rews'], 167 | self.d_ph: batch['done'] 168 | } 169 | outs = self.sess.run(self.step_ops,feed_dict) 170 | self.learn_step += 1 171 | 172 | def load_step_network(self, saver, load_path): 173 | checkpoint = tf.train.get_checkpoint_state(load_path) 174 | if checkpoint and checkpoint.model_checkpoint_path: 175 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 176 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 177 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 178 | else: 179 | print("Could not find old network weights") 180 | 181 | def save_step_network(self, time_step, saver, save_path): 182 | saver.save(self.sess, save_path + 'network', global_step=time_step, 183 | write_meta_graph=False) 184 | 185 | def load_simple_network(self, path): 186 | saver = tf.train.Saver() 187 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 188 | print("restore model successful") 189 | 190 | def save_simple_network(self, save_path): 191 | saver = tf.train.Saver() 192 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 193 | 194 | 195 | if __name__ == '__main__': 196 | import argparse 197 | 198 | random_seed = int(time.time() * 1000 % 1000) 199 | parser = argparse.ArgumentParser() 200 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 201 | parser.add_argument('--hid', type=int, default=300) 202 | parser.add_argument('--l', type=int, default=1) 203 | parser.add_argument('--gamma', type=float, default=0.99) 204 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 205 | parser.add_argument('--epochs', type=int, default=3000) 206 | parser.add_argument('--max_steps', type=int, default=1000) 207 | parser.add_argument('--exp_name', type=str, default='sac_class') 208 | args = parser.parse_args() 209 | 210 | env = gym.make(args.env) 211 | env = env.unwrapped 212 | env.seed(args.seed) 213 | 214 | s_dim = env.observation_space.shape[0] 215 | a_dim = env.action_space.shape[0] 216 | a_bound = env.action_space.high[0] 217 | 218 | net = SAC(a_dim, s_dim, a_bound, 219 | batch_size=100, 220 | ) 221 | ep_reward_list = [] 222 | test_ep_reward_list = [] 223 | 224 | for i in range(args.epochs): 225 | s = env.reset() 226 | ep_reward = 0 227 | for j in range(args.max_steps): 228 | 229 | # Add exploration noise 230 | if i < 10: 231 | a = np.random.rand(a_dim) * a_bound 232 | else: 233 | # a = net.choose_action(s) 234 | a = net.get_action(s, 0.1) 235 | # a = noise.add_noise(a) 236 | 237 | a = np.clip(a, -a_bound, a_bound) 238 | 239 | s_, r, done, info = env.step(a) 240 | done = False if j == args.max_steps - 1 else done 241 | 242 | net.store_transition((s, a, r, s_, done)) 243 | 244 | s = s_ 245 | ep_reward += r 246 | if j == args.max_steps - 1: 247 | 248 | for _ in range(args.max_steps): 249 | net.learn() 250 | 251 | ep_reward_list.append(ep_reward) 252 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 253 | # 'Explore: %.2f' % var, 254 | "learn step:", net.learn_step) 255 | # if ep_reward > -300:RENDER = True 256 | 257 | # 增加测试部分! 
258 | if i % 20 == 0: 259 | test_ep_reward = net.test_agent(env=env, n=5) 260 | test_ep_reward_list.append(test_ep_reward) 261 | print("-" * 20) 262 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 263 | 'Test Reward: %i' % int(test_ep_reward), 264 | ) 265 | print("-" * 20) 266 | 267 | break 268 | 269 | import matplotlib.pyplot as plt 270 | 271 | plt.plot(ep_reward_list) 272 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 273 | str(args.epochs) + 274 | "_seed" + str(args.seed)) 275 | plt.title(img_name + "_train") 276 | plt.savefig(img_name + ".png") 277 | plt.show() 278 | plt.close() 279 | 280 | plt.plot(test_ep_reward_list) 281 | plt.title(img_name + "_test") 282 | plt.savefig(img_name + ".png") 283 | plt.show() 284 | -------------------------------------------------------------------------------- /sac_sp/SAC_sp.py: -------------------------------------------------------------------------------- 1 | """ 2 | 去掉了log的信息,直接简单的画了一个图. 3 | 待会儿在这个基础上,封装一个类 4 | """ 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | import time 10 | import sys 11 | sys.path.append("../") 12 | from sac_sp import core 13 | from sac_sp.core import get_vars, mlp_actor_critic 14 | 15 | 16 | class ReplayBuffer: 17 | """ 18 | A simple FIFO experience replay buffer for SAC agents. 19 | """ 20 | 21 | def __init__(self, obs_dim, act_dim, size): 22 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 24 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 25 | self.rews_buf = np.zeros(size, dtype=np.float32) 26 | self.done_buf = np.zeros(size, dtype=np.float32) 27 | self.ptr, self.size, self.max_size = 0, 0, size 28 | 29 | def store(self, obs, act, rew, next_obs, done): 30 | self.obs1_buf[self.ptr] = obs 31 | self.obs2_buf[self.ptr] = next_obs 32 | self.acts_buf[self.ptr] = act 33 | self.rews_buf[self.ptr] = rew 34 | self.done_buf[self.ptr] = done 35 | self.ptr = (self.ptr + 1) % self.max_size 36 | self.size = min(self.size + 1, self.max_size) 37 | 38 | def sample_batch(self, batch_size=32): 39 | idxs = np.random.randint(0, self.size, size=batch_size) 40 | return dict(obs1=self.obs1_buf[idxs], 41 | obs2=self.obs2_buf[idxs], 42 | acts=self.acts_buf[idxs], 43 | rews=self.rews_buf[idxs], 44 | done=self.done_buf[idxs]) 45 | 46 | 47 | def sac(env_fn, actor_critic=core.mlp_actor_critic, 48 | ac_kwargs=dict(), seed=0, 49 | steps_per_epoch=5000, epochs=600, 50 | replay_size=int(1e6), gamma=0.99, 51 | polyak=0.995, lr=1e-3, alpha=0.2, 52 | batch_size=100, start_steps=10000, 53 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 54 | 55 | tf.set_random_seed(seed) 56 | np.random.seed(seed) 57 | 58 | env, test_env = env_fn(), env_fn() 59 | obs_dim = env.observation_space.shape[0] 60 | act_dim = env.action_space.shape[0] 61 | 62 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 
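# (The single-scalar act_limit taken on the next line is fine for the MuJoCo tasks
# used in this script, whose action bounds are symmetric and identical across
# dimensions; an environment with per-dimension bounds would need
# env.action_space.high used as a full vector instead of high[0].)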
63 | act_limit = env.action_space.high[0] 64 | 65 | # Share information about action space with policy architecture 66 | ac_kwargs['action_space'] = env.action_space.high[0] 67 | 68 | # Inputs to computation graph 69 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 70 | 71 | # Main outputs from computation graph 72 | with tf.variable_scope('main'): 73 | mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) 74 | 75 | # Target value network 76 | with tf.variable_scope('target'): 77 | _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 78 | 79 | # Experience buffer 80 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 81 | 82 | # Count variables 83 | var_counts = tuple(core.count_vars(scope) for scope in 84 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 85 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 86 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 87 | 88 | # Min Double-Q: 89 | min_q_pi = tf.minimum(q1_pi, q2_pi) 90 | 91 | # Targets for Q and V regression 92 | q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) 93 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 94 | 95 | # Soft actor-critic losses 96 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 97 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 98 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 99 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 100 | value_loss = q1_loss + q2_loss + v_loss 101 | 102 | # Policy train op 103 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 104 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 105 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 106 | 107 | # Value train op 108 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 109 | value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 110 | value_params = get_vars('main/q') + get_vars('main/v') 111 | with tf.control_dependencies([train_pi_op]): 112 | train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 113 | 114 | # Polyak averaging for target variables 115 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 116 | with tf.control_dependencies([train_value_op]): 117 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 118 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 119 | 120 | # All ops to call during one training step 121 | step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 122 | train_pi_op, train_value_op, target_update] 123 | 124 | # Initializing targets to match main variables 125 | target_init = tf.group([tf.assign(v_targ, v_main) 126 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 127 | 128 | sess = tf.Session() 129 | sess.run(tf.global_variables_initializer()) 130 | sess.run(target_init) 131 | 132 | def get_action(o, deterministic=False): 133 | act_op = mu if deterministic else pi 134 | return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] 135 | 136 | def test_agent(n=10): 137 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 138 | for j in range(n): 139 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 140 | while not (d or (ep_len == max_ep_len)): 141 | # Take deterministic actions at test time 142 | o, r, d, _ = test_env.step(get_action(o, True)) 143 | ep_ret += r 144 | ep_len += 1 
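# Note: with the logging stripped out of this standalone script (see the module
# docstring), test_agent() only rolls the deterministic policy forward and discards
# ep_ret and ep_len; it is kept so the end-of-epoch evaluation hook further down
# still runs.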
145 | 146 | start_time = time.time() 147 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 148 | total_steps = steps_per_epoch * epochs 149 | 150 | ep_ret_list = [] 151 | episode = 0 152 | 153 | # Main loop: collect experience in env and update/log each epoch 154 | for t in range(total_steps): 155 | 156 | """ 157 | Until start_steps have elapsed, randomly sample actions 158 | from a uniform distribution for better exploration. Afterwards, 159 | use the learned policy. 160 | """ 161 | if t > start_steps: 162 | a = get_action(o) 163 | else: 164 | a = env.action_space.sample() 165 | 166 | # Step the env 167 | o2, r, d, _ = env.step(a) 168 | ep_ret += r 169 | ep_len += 1 170 | 171 | # Ignore the "done" signal if it comes from hitting the time 172 | # horizon (that is, when it's an artificial terminal signal 173 | # that isn't based on the agent's state) 174 | d = False if ep_len == max_ep_len else d 175 | 176 | # Store experience to replay buffer 177 | replay_buffer.store(o, a, r, o2, d) 178 | 179 | # Super critical, easy to overlook step: make sure to update 180 | # most recent observation! 181 | o = o2 182 | 183 | if d or (ep_len == max_ep_len): 184 | """ 185 | Perform all SAC updates at the end of the trajectory. 186 | This is a slight difference from the SAC specified in the 187 | original paper. 188 | """ 189 | episode += 1 190 | ep_ret_list.append(ep_ret) 191 | epoch = t // steps_per_epoch 192 | print("Epoch:", epoch) 193 | print("Episode:", episode) 194 | print("Training Step:", t) 195 | print("Episode Reward:", ep_ret) 196 | 197 | for j in range(ep_len): 198 | batch = replay_buffer.sample_batch(batch_size) 199 | feed_dict = {x_ph: batch['obs1'], 200 | x2_ph: batch['obs2'], 201 | a_ph: batch['acts'], 202 | r_ph: batch['rews'], 203 | d_ph: batch['done'], 204 | } 205 | outs = sess.run(step_ops, feed_dict) 206 | 207 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 208 | 209 | # End of epoch wrap-up 210 | if t > 0 and t % steps_per_epoch == 0: 211 | test_agent() 212 | 213 | import matplotlib.pyplot as plt 214 | plt.plot(ep_ret_list) 215 | plt.show() 216 | 217 | 218 | if __name__ == '__main__': 219 | import argparse 220 | 221 | parser = argparse.ArgumentParser() 222 | parser.add_argument('--env', type=str, default='Hopper-v2') 223 | parser.add_argument('--hid', type=int, default=300) 224 | parser.add_argument('--l', type=int, default=1) 225 | parser.add_argument('--gamma', type=float, default=0.99) 226 | parser.add_argument('--seed', '-s', type=int, default=7) 227 | parser.add_argument('--epochs', type=int, default=600) 228 | parser.add_argument('--exp_name', type=str, default='sac') 229 | args = parser.parse_args() 230 | 231 | sac(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 232 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 233 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 234 | ) 235 | 236 | -------------------------------------------------------------------------------- /sac_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/__init__.py -------------------------------------------------------------------------------- /sac_sp/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else 
(None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | 25 | def gaussian_likelihood(x, mu, log_std): 26 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 27 | return tf.reduce_sum(pre_sum, axis=1) 28 | 29 | 30 | def clip_but_pass_gradient(x, l=-1., u=1.): 31 | clip_up = tf.cast(x > u, tf.float32) 32 | clip_low = tf.cast(x < l, tf.float32) 33 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 34 | 35 | 36 | """ 37 | Policies 38 | """ 39 | 40 | LOG_STD_MAX = 2 41 | LOG_STD_MIN = -20 42 | 43 | 44 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 45 | act_dim = a.shape.as_list()[-1] 46 | net = mlp(x, list(hidden_sizes), activation, activation) 47 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 48 | 49 | """ 50 | Because algorithm maximizes trade-off of reward and entropy, 51 | entropy must be unique to state---and therefore log_stds need 52 | to be a neural network output instead of a shared-across-states 53 | learnable parameter vector. But for deep Relu and other nets, 54 | simply sticking an activationless dense layer at the end would 55 | be quite bad---at the beginning of training, a randomly initialized 56 | net could produce extremely large values for the log_stds, which 57 | would result in some actions being either entirely deterministic 58 | or too random to come back to earth. Either of these introduces 59 | numerical instability which could break the algorithm. To 60 | protect against that, we'll constrain the output range of the 61 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is 62 | slightly different from the trick used by the original authors of 63 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 64 | I prefer this approach because it allows gradient propagation 65 | through log_std where clipping wouldn't, but I don't know if 66 | it makes much of a difference. 67 | """ 68 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 69 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 70 | 71 | std = tf.exp(log_std) 72 | pi = mu + tf.random_normal(tf.shape(mu)) * std 73 | logp_pi = gaussian_likelihood(pi, mu, log_std) 74 | return mu, pi, logp_pi 75 | 76 | 77 | def apply_squashing_func(mu, pi, logp_pi): 78 | mu = tf.tanh(mu) 79 | pi = tf.tanh(pi) 80 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
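# Change of variables for the tanh squash: if a = tanh(u) with u sampled from the Gaussian policy, then log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2); the clip below keeps 1 - pi**2 inside [0, 1] and the 1e-6 guards against log(0).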
81 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 82 | return mu, pi, logp_pi 83 | 84 | 85 | """ 86 | Actor-Critics 87 | """ 88 | 89 | 90 | def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 91 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 92 | # policy 93 | with tf.variable_scope('pi'): 94 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 95 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 96 | 97 | # make sure actions are in correct range 98 | # action_scale = action_space.high[0] 99 | action_scale = action_space 100 | mu *= action_scale 101 | pi *= action_scale 102 | 103 | # vfs 104 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 105 | with tf.variable_scope('q1'): 106 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 107 | with tf.variable_scope('q1', reuse=True): 108 | q1_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 109 | with tf.variable_scope('q2'): 110 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 111 | with tf.variable_scope('q2', reuse=True): 112 | q2_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 113 | with tf.variable_scope('v'): 114 | v = vf_mlp(x) 115 | return mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v -------------------------------------------------------------------------------- /sac_sp/exp_images/HalfCheetah-v2-sac-class-300k-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/HalfCheetah-v2-sac-class-300k-test.png -------------------------------------------------------------------------------- /sac_sp/exp_images/HalfCheetah-v2-sac-class-300k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/HalfCheetah-v2-sac-class-300k.png -------------------------------------------------------------------------------- /sac_sp/exp_images/Hopper-v2-sac-class-3000k-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/Hopper-v2-sac-class-3000k-test.png -------------------------------------------------------------------------------- /sac_sp/exp_images/Hopper-v2-sac-class-3000k-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/Hopper-v2-sac-class-3000k-train.png -------------------------------------------------------------------------------- /sac_sp/exp_images/Hopper-v2-sac-sp-5000k-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/Hopper-v2-sac-sp-5000k-train.png -------------------------------------------------------------------------------- /sac_sp/test_gym_sac_sp_class.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import gym 4 | import time 5 | import sys 6 | 7 | sys.path.append("../") 8 | from sac_sp.SAC_class import SAC 9 | 10 | MAX_EPISODES = 250 11 
| MAX_EP_STEPS = 1000 12 | 13 | RENDER = False 14 | ENV_NAME = 'Hopper-v2' 15 | 16 | 17 | def test_agent(net, env, n=10): 18 | ep_reward_list = [] 19 | for j in range(n): 20 | s = env.reset() 21 | ep_reward = 0 22 | for i in range(MAX_EP_STEPS): 23 | # Take deterministic actions at test time (noise_scale=0) 24 | s, r, d, _ = env.step(net.get_action(s)) 25 | ep_reward += r 26 | 27 | ep_reward_list.append(ep_reward) 28 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 29 | return mean_ep_reward 30 | 31 | 32 | def main(): 33 | 34 | env = gym.make(ENV_NAME) 35 | env = env.unwrapped 36 | env.seed(4) 37 | 38 | s_dim = env.observation_space.shape[0] 39 | a_dim = env.action_space.shape[0] 40 | a_bound = env.action_space.high[0] 41 | 42 | net = SAC(a_dim, s_dim, a_bound, 43 | batch_size=100, 44 | ) 45 | ep_reward_list = [] 46 | test_ep_reward_list = [] 47 | for i in range(MAX_EPISODES): 48 | s = env.reset() 49 | ep_reward = 0 50 | for j in range(MAX_EP_STEPS): 51 | if RENDER: 52 | env.render() 53 | 54 | # Add exploration noise 55 | if i < 10: 56 | a = np.random.rand(a_dim) * a_bound 57 | else: 58 | # a = net.choose_action(s) 59 | a = net.get_action(s, 0.1) 60 | # a = noise.add_noise(a) 61 | 62 | a = np.clip(a, -a_bound, a_bound) 63 | 64 | s_, r, done, info = env.step(a) 65 | done = False if j == MAX_EP_STEPS-1 else done 66 | 67 | net.store_transition((s, a, r, s_, done)) 68 | 69 | s = s_ 70 | ep_reward += r 71 | if j == MAX_EP_STEPS - 1: 72 | 73 | for _ in range(MAX_EP_STEPS): 74 | net.learn() 75 | 76 | ep_reward_list.append(ep_reward) 77 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 78 | # 'Explore: %.2f' % var, 79 | "learn step:", net.learn_step) 80 | # if ep_reward > -300:RENDER = True 81 | 82 | # 增加测试部分! 83 | if i % 20 == 0: 84 | test_ep_reward = test_agent(net=net, env=env, n=5) 85 | test_ep_reward_list.append(test_ep_reward) 86 | print("-"*20) 87 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 88 | 'Test Reward: %i' % int(test_ep_reward), 89 | ) 90 | print("-" * 20) 91 | 92 | break 93 | 94 | plt.plot(ep_reward_list) 95 | plt.show() 96 | plt.plot(test_ep_reward_list) 97 | plt.show() 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | 103 | -------------------------------------------------------------------------------- /sp_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sp_utils/__init__.py -------------------------------------------------------------------------------- /sp_utils/logx.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | 5 | Logs to a tab-separated-values file (path/to/output_directory/progress.txt) 6 | 7 | """ 8 | import json 9 | import joblib 10 | import shutil 11 | import numpy as np 12 | import tensorflow as tf 13 | import os.path as osp, time, atexit, os 14 | import sys 15 | sys.path.append("../") 16 | 17 | from sp_utils.mpi_tools import proc_id, mpi_statistics_scalar 18 | from sp_utils.serialization_utils import convert_json 19 | 20 | color2num = dict( 21 | gray=30, 22 | red=31, 23 | green=32, 24 | yellow=33, 25 | blue=34, 26 | magenta=35, 27 | cyan=36, 28 | white=37, 29 | crimson=38 30 | ) 31 | 32 | 33 | def colorize(string, color, bold=False, highlight=False): 34 | """ 35 | Colorize a string. 36 | 37 | This function was originally written by John Schulman. 
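For example, ``colorize("hello", 'green', bold=True)`` returns ``'\x1b[32;1mhello\x1b[0m'``, i.e. the input wrapped in ANSI escape codes.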
38 | """ 39 | attr = [] 40 | num = color2num[color] 41 | if highlight: num += 10 42 | attr.append(str(num)) 43 | if bold: attr.append('1') 44 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 45 | 46 | 47 | def restore_tf_graph(sess, fpath): 48 | """ 49 | Loads graphs saved by Logger. 50 | 51 | Will output a dictionary whose keys and values are from the 'inputs' 52 | and 'outputs' dict you specified with logger.setup_tf_saver(). 53 | 54 | Args: 55 | sess: A Tensorflow session. 56 | fpath: Filepath to save directory. 57 | 58 | Returns: 59 | A dictionary mapping from keys to tensors in the computation graph 60 | loaded from ``fpath``. 61 | """ 62 | tf.saved_model.loader.load( 63 | sess, 64 | [tf.saved_model.tag_constants.SERVING], 65 | fpath 66 | ) 67 | model_info = joblib.load(osp.join(fpath, 'model_info.pkl')) 68 | graph = tf.get_default_graph() 69 | model = dict() 70 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['inputs'].items()}) 71 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['outputs'].items()}) 72 | return model 73 | 74 | 75 | def setup_logger_kwargs(exp_name, seed=None, output_dir=None, datestamp=False): 76 | """ 77 | 从run.py文件里调过来的,output_dir名字修改了一下 78 | Sets up the output_dir for a logger and returns a dict for logger kwargs. 79 | 80 | If no seed is given and datestamp is false, 81 | 82 | :: 83 | 84 | output_dir = data_dir/exp_name 85 | 86 | If a seed is given and datestamp is false, 87 | 88 | :: 89 | 90 | output_dir = data_dir/exp_name/exp_name_s[seed] 91 | 92 | If datestamp is true, amend to 93 | 94 | :: 95 | 96 | output_dir = data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS_exp_name_s[seed] 97 | 98 | You can force datestamp=True by setting ``FORCE_DATESTAMP=True`` in 99 | ``spinup/user_config.py``. 100 | 101 | Args: 102 | 103 | exp_name (string): Name for experiment. 104 | 105 | seed (int): Seed for random number generators used by experiment. 106 | 107 | data_dir (string): Path to folder where results should be saved. 108 | Default is the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``. 109 | 110 | datestamp (bool): Whether to include a date and timestamp in the 111 | name of the save directory. 112 | 113 | Returns: 114 | 115 | logger_kwargs, a dict containing output_dir and exp_name. 116 | """ 117 | 118 | # Datestamp forcing 119 | datestamp = datestamp or True 120 | 121 | # Make base path 122 | ymd_time = time.strftime("%Y-%m-%d_") if datestamp else '' 123 | relpath = ''.join([ymd_time, exp_name]) 124 | 125 | if seed is not None: 126 | # Make a seed-specific subfolder in the experiment directory. 127 | if datestamp: 128 | hms_time = time.strftime("%Y-%m-%d_%H-%M-%S") 129 | subfolder = ''.join([hms_time, '-', exp_name, '_s', str(seed)]) 130 | else: 131 | subfolder = ''.join([exp_name, '_s', str(seed)]) 132 | relpath = osp.join(relpath, subfolder) 133 | 134 | data_dir = output_dir or True 135 | logger_kwargs = dict(output_dir=osp.join(data_dir, relpath), 136 | exp_name=exp_name) 137 | return logger_kwargs 138 | 139 | 140 | class Logger: 141 | """ 142 | A general-purpose logger. 143 | 144 | Makes it easy to save diagnostics, hyperparameter configurations, the 145 | state of a training run, and the trained model. 146 | """ 147 | 148 | def __init__(self, output_dir=None, 149 | output_fname='progress.txt', exp_name=None): 150 | """ 151 | Initialize a Logger. 152 | 153 | Args: 154 | output_dir (string): A directory for saving results to. 
If 155 | ``None``, defaults to a temp directory of the form 156 | ``/tmp/experiments/somerandomnumber``. 157 | 158 | output_fname (string): Name for the tab-separated-value file 159 | containing metrics logged throughout a training run. 160 | Defaults to ``progress.txt``. 161 | 162 | exp_name (string): Experiment name. If you run multiple training 163 | runs and give them all the same ``exp_name``, the plotter 164 | will know to group them. (Use case: if you run the same 165 | hyperparameter configuration with multiple random seeds, you 166 | should give them all the same ``exp_name``.) 167 | """ 168 | if proc_id()==0: 169 | self.output_dir = output_dir or "/tmp/experiments/%i"%int(time.time()) 170 | if osp.exists(self.output_dir): 171 | print("Warning: Log dir %s already exists! Storing info there anyway."%self.output_dir) 172 | else: 173 | os.makedirs(self.output_dir) 174 | self.output_file = open(osp.join(self.output_dir, output_fname), 'w') 175 | atexit.register(self.output_file.close) 176 | print(colorize("Logging data to %s"%self.output_file.name, 'green', bold=True)) 177 | else: 178 | self.output_dir = None 179 | self.output_file = None 180 | self.first_row=True 181 | self.log_headers = [] 182 | self.log_current_row = {} 183 | self.exp_name = exp_name 184 | 185 | def log(self, msg, color='green'): 186 | """Print a colorized message to stdout.""" 187 | if proc_id()==0: 188 | print(colorize(msg, color, bold=True)) 189 | 190 | def log_tabular(self, key, val): 191 | """ 192 | Log a value of some diagnostic. 193 | 194 | Call this only once for each diagnostic quantity, each iteration. 195 | After using ``log_tabular`` to store values for each diagnostic, 196 | make sure to call ``dump_tabular`` to write them out to file and 197 | stdout (otherwise they will not get saved anywhere). 198 | """ 199 | if self.first_row: 200 | self.log_headers.append(key) 201 | else: 202 | assert key in self.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 203 | assert key not in self.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 204 | self.log_current_row[key] = val 205 | 206 | def save_config(self, config): 207 | """ 208 | Log an experiment configuration. 209 | 210 | Call this once at the top of your experiment, passing in all important 211 | config vars as a dict. This will serialize the config to JSON, while 212 | handling anything which can't be serialized in a graceful way (writing 213 | as informative a string as possible). 214 | 215 | Example use: 216 | 217 | .. code-block:: python 218 | 219 | logger = EpochLogger(**logger_kwargs) 220 | logger.save_config(locals()) 221 | """ 222 | 223 | config_json = convert_json(config) 224 | if self.exp_name is not None: 225 | config_json['exp_name'] = self.exp_name 226 | if proc_id()==0: 227 | output = json.dumps(config_json, separators=(',',':\t'), 228 | indent=4, sort_keys=True) 229 | print(colorize('Saving config:\n', color='cyan', bold=True)) 230 | print(output) 231 | with open(osp.join(self.output_dir, "config.json"), 'w') as out: 232 | out.write(output) 233 | 234 | def save_state(self, state_dict, itr=None): 235 | """ 236 | Saves the state of an experiment. 237 | 238 | To be clear: this is about saving *state*, not logging diagnostics. 239 | All diagnostic logging is separate from this function. 
This function 240 | will save whatever is in ``state_dict``---usually just a copy of the 241 | environment---and the most recent parameters for the model you 242 | previously set up saving for with ``setup_tf_saver``. 243 | 244 | Call with any frequency you prefer. If you only want to maintain a 245 | single state and overwrite it at each call with the most recent 246 | version, leave ``itr=None``. If you want to keep all of the states you 247 | save, provide unique (increasing) values for 'itr'. 248 | 249 | Args: 250 | state_dict (dict): Dictionary containing essential elements to 251 | describe the current state of training. 252 | 253 | itr: An int, or None. Current iteration of training. 254 | """ 255 | if proc_id()==0: 256 | fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr 257 | try: 258 | joblib.dump(state_dict, osp.join(self.output_dir, fname)) 259 | except: 260 | self.log('Warning: could not pickle state_dict.', color='red') 261 | if hasattr(self, 'tf_saver_elements'): 262 | self._tf_simple_save(itr) 263 | 264 | def setup_tf_saver(self, sess, inputs, outputs): 265 | """ 266 | Set up easy model saving for tensorflow. 267 | 268 | Call once, after defining your computation graph but before training. 269 | 270 | Args: 271 | sess: The Tensorflow session in which you train your computation 272 | graph. 273 | 274 | inputs (dict): A dictionary that maps from keys of your choice 275 | to the tensorflow placeholders that serve as inputs to the 276 | computation graph. Make sure that *all* of the placeholders 277 | needed for your outputs are included! 278 | 279 | outputs (dict): A dictionary that maps from keys of your choice 280 | to the outputs from your computation graph. 281 | """ 282 | self.tf_saver_elements = dict(session=sess, inputs=inputs, 283 | outputs=outputs) 284 | self.tf_saver_info = {'inputs': {k:v.name for k,v in inputs.items()}, 285 | 'outputs': {k:v.name for k,v in outputs.items()}} 286 | 287 | def _tf_simple_save(self, itr=None): 288 | """ 289 | Uses simple_save to save a trained model, plus info to make it easy 290 | to associated tensors to variables after restore. 291 | """ 292 | if proc_id()==0: 293 | assert hasattr(self, 'tf_saver_elements'), \ 294 | "First have to setup saving with self.setup_tf_saver" 295 | fpath = 'simple_save' + ('%d'%itr if itr is not None else '') 296 | fpath = osp.join(self.output_dir, fpath) 297 | if osp.exists(fpath): 298 | # simple_save refuses to be useful if fpath already exists, 299 | # so just delete fpath if it's there. 300 | shutil.rmtree(fpath) 301 | tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements) 302 | joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl')) 303 | 304 | def dump_tabular(self): 305 | """ 306 | Write all of the diagnostics from the current iteration. 307 | 308 | Writes both to stdout, and to the output file. 
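A typical pattern is to call it once at the end of each iteration, after the ``log_tabular`` calls for that iteration (variable names here are only illustrative):

        .. code-block:: python

            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('AverageEpRet', avg_ep_ret)
            logger.dump_tabular()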
309 | """ 310 | if proc_id()==0: 311 | vals = [] 312 | key_lens = [len(key) for key in self.log_headers] 313 | max_key_len = max(15,max(key_lens)) 314 | keystr = '%'+'%d'%max_key_len 315 | fmt = "| " + keystr + "s | %15s |" 316 | n_slashes = 22 + max_key_len 317 | print("-"*n_slashes) 318 | for key in self.log_headers: 319 | val = self.log_current_row.get(key, "") 320 | valstr = "%8.3g"%val if hasattr(val, "__float__") else val 321 | print(fmt%(key, valstr)) 322 | vals.append(val) 323 | print("-"*n_slashes) 324 | if self.output_file is not None: 325 | if self.first_row: 326 | self.output_file.write("\t".join(self.log_headers)+"\n") 327 | self.output_file.write("\t".join(map(str,vals))+"\n") 328 | self.output_file.flush() 329 | self.log_current_row.clear() 330 | self.first_row=False 331 | 332 | 333 | class EpochLogger(Logger): 334 | """ 335 | A variant of Logger tailored for tracking average values over epochs. 336 | 337 | Typical use case: there is some quantity which is calculated many times 338 | throughout an epoch, and at the end of the epoch, you would like to 339 | report the average / std / min / max value of that quantity. 340 | 341 | With an EpochLogger, each time the quantity is calculated, you would 342 | use 343 | 344 | .. code-block:: python 345 | 346 | epoch_logger.store(NameOfQuantity=quantity_value) 347 | 348 | to load it into the EpochLogger's state. Then at the end of the epoch, you 349 | would use 350 | 351 | .. code-block:: python 352 | 353 | epoch_logger.log_tabular(NameOfQuantity, **options) 354 | 355 | to record the desired values. 356 | """ 357 | 358 | def __init__(self, *args, **kwargs): 359 | super().__init__(*args, **kwargs) 360 | self.epoch_dict = dict() 361 | 362 | def store(self, **kwargs): 363 | """ 364 | Save something into the epoch_logger's current state. 365 | 366 | Provide an arbitrary number of keyword arguments with numerical 367 | values. 368 | """ 369 | for k,v in kwargs.items(): 370 | if not(k in self.epoch_dict.keys()): 371 | self.epoch_dict[k] = [] 372 | self.epoch_dict[k].append(v) 373 | 374 | def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False): 375 | """ 376 | Log a value or possibly the mean/std/min/max values of a diagnostic. 377 | 378 | Args: 379 | key (string): The name of the diagnostic. If you are logging a 380 | diagnostic whose state has previously been saved with 381 | ``store``, the key here has to match the key you used there. 382 | 383 | val: A value for the diagnostic. If you have previously saved 384 | values for this key via ``store``, do *not* provide a ``val`` 385 | here. 386 | 387 | with_min_and_max (bool): If true, log min and max values of the 388 | diagnostic over the epoch. 389 | 390 | average_only (bool): If true, do not log the standard deviation 391 | of the diagnostic over the epoch. 392 | """ 393 | if val is not None: 394 | super().log_tabular(key,val) 395 | else: 396 | v = self.epoch_dict[key] 397 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 398 | stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max) 399 | super().log_tabular(key if average_only else 'Average' + key, stats[0]) 400 | if not(average_only): 401 | super().log_tabular('Std'+key, stats[1]) 402 | if with_min_and_max: 403 | super().log_tabular('Max'+key, stats[3]) 404 | super().log_tabular('Min'+key, stats[2]) 405 | self.epoch_dict[key] = [] 406 | 407 | def get_stats(self, key): 408 | """ 409 | Lets an algorithm ask the logger for mean/std/min/max of a diagnostic. 
410 | """ 411 | v = self.epoch_dict[key] 412 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 413 | return mpi_statistics_scalar(vals) 414 | -------------------------------------------------------------------------------- /sp_utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 
80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /sp_utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | 9 | DIV_LINE_WIDTH = 50 10 | 11 | # Global vars for tracking and labeling data at load time. 12 | exp_idx = 0 13 | units = dict() 14 | 15 | 16 | def plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs): 17 | if smooth > 1: 18 | """ 19 | smooth data with moving window average. 20 | that is, 21 | smoothed_y[t] = average(y[t-k], y[t-k+1], ..., y[t+k-1], y[t+k]) 22 | where the "smooth" param is width of that window (2k+1) 23 | """ 24 | y = np.ones(smooth) 25 | for datum in data: 26 | x = np.asarray(datum[value]) 27 | z = np.ones(len(x)) 28 | smoothed_x = np.convolve(x,y,'same') / np.convolve(z,y,'same') 29 | datum[value] = smoothed_x 30 | 31 | if isinstance(data, list): 32 | data = pd.concat(data, ignore_index=True) 33 | sns.set(style="darkgrid", font_scale=1.5) 34 | sns.tsplot(data=data, time=xaxis, value=value, unit="Unit", condition=condition, ci='sd', **kwargs) 35 | """ 36 | If you upgrade to any version of Seaborn greater than 0.8.1, switch from 37 | tsplot to lineplot replacing L29 with: 38 | 39 | sns.lineplot(data=data, x=xaxis, y=value, hue=condition, ci='sd', **kwargs) 40 | 41 | Changes the colorscheme and the default legend style, though. 42 | """ 43 | # plt.legend(loc='best').set_draggable(True) 44 | # 上面的会报错~ 45 | plt.legend(loc='best') 46 | 47 | """ 48 | For the version of the legend used in the Spinning Up benchmarking page, 49 | swap L38 with: 50 | 51 | plt.legend(loc='upper center', ncol=6, handlelength=1, 52 | mode="expand", borderaxespad=0., prop={'size': 13}) 53 | """ 54 | 55 | xscale = np.max(np.asarray(data[xaxis])) > 5e3 56 | if xscale: 57 | # Just some formatting niceness: x-axis scale in scientific notation if max x is large 58 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 59 | 60 | plt.tight_layout(pad=0.5) 61 | 62 | 63 | def get_datasets(logdir, condition=None): 64 | """ 65 | Recursively look through logdir for output files produced by 66 | spinup.logx.Logger. 67 | 68 | Assumes that any file "progress.txt" is a valid hit. 
69 | """ 70 | global exp_idx 71 | global units 72 | datasets = [] 73 | for root, _, files in os.walk(logdir): 74 | if 'progress.txt' in files: 75 | exp_name = None 76 | try: 77 | config_path = open(os.path.join(root,'config.json')) 78 | config = json.load(config_path) 79 | if 'exp_name' in config: 80 | exp_name = config['exp_name'] 81 | except: 82 | print('No file named config.json') 83 | condition1 = condition or exp_name or 'exp' 84 | condition2 = condition1 + '-' + str(exp_idx) 85 | exp_idx += 1 86 | if condition1 not in units: 87 | units[condition1] = 0 88 | unit = units[condition1] 89 | units[condition1] += 1 90 | 91 | try: 92 | exp_data = pd.read_table(os.path.join(root,'progress.txt')) 93 | except: 94 | print('Could not read from %s'%os.path.join(root,'progress.txt')) 95 | continue 96 | performance = 'AverageTestEpRet' if 'AverageTestEpRet' in exp_data else 'AverageEpRet' 97 | exp_data.insert(len(exp_data.columns),'Unit',unit) 98 | exp_data.insert(len(exp_data.columns),'Condition1',condition1) 99 | exp_data.insert(len(exp_data.columns),'Condition2',condition2) 100 | exp_data.insert(len(exp_data.columns),'Performance',exp_data[performance]) 101 | datasets.append(exp_data) 102 | return datasets 103 | 104 | 105 | def get_all_datasets(all_logdirs, legend=None, select=None, exclude=None): 106 | """ 107 | For every entry in all_logdirs, 108 | 1) check if the entry is a real directory and if it is, 109 | pull data from it; 110 | 111 | 2) if not, check to see if the entry is a prefix for a 112 | real directory, and pull data from that. 113 | """ 114 | logdirs = [] 115 | for logdir in all_logdirs: 116 | if osp.isdir(logdir) and logdir[-1]==os.sep: 117 | logdirs += [logdir] 118 | else: 119 | basedir = osp.dirname(logdir) 120 | fulldir = lambda x : osp.join(basedir, x) 121 | prefix = logdir.split(os.sep)[-1] 122 | listdir= os.listdir(basedir) 123 | logdirs += sorted([fulldir(x) for x in listdir if prefix in x]) 124 | 125 | """ 126 | Enforce selection rules, which check logdirs for certain substrings. 127 | Makes it easier to look at graphs from particular ablations, if you 128 | launch many jobs at once with similar names. 129 | """ 130 | if select is not None: 131 | logdirs = [log for log in logdirs if all(x in log for x in select)] 132 | if exclude is not None: 133 | logdirs = [log for log in logdirs if all(not(x in log) for x in exclude)] 134 | 135 | # Verify logdirs 136 | print('Plotting from...\n' + '='*DIV_LINE_WIDTH + '\n') 137 | for logdir in logdirs: 138 | print(logdir) 139 | print('\n' + '='*DIV_LINE_WIDTH) 140 | 141 | # Make sure the legend is compatible with the logdirs 142 | assert not(legend) or (len(legend) == len(logdirs)), \ 143 | "Must give a legend title for each set of experiments." 144 | 145 | # Load data from logdirs 146 | data = [] 147 | if legend: 148 | for log, leg in zip(logdirs, legend): 149 | data += get_datasets(log, leg) 150 | else: 151 | for log in logdirs: 152 | data += get_datasets(log) 153 | return data 154 | 155 | 156 | def make_plots(all_logdirs, legend=None, xaxis=None, values=None, count=False, 157 | font_scale=1.5, smooth=1, select=None, exclude=None, estimator='mean'): 158 | data = get_all_datasets(all_logdirs, legend, select, exclude) 159 | values = values if isinstance(values, list) else [values] 160 | condition = 'Condition2' if count else 'Condition1' 161 | estimator = getattr(np, estimator) # choose what to show on main curve: mean? max? min? 
162 | for value in values: 163 | plt.figure() 164 | plot_data(data, xaxis=xaxis, value=value, condition=condition, smooth=smooth, estimator=estimator) 165 | plt.show() 166 | 167 | 168 | def main(): 169 | import argparse 170 | parser = argparse.ArgumentParser() 171 | parser.add_argument('logdir', nargs='*') 172 | parser.add_argument('--legend', '-l', nargs='*') 173 | parser.add_argument('--xaxis', '-x', default='TotalEnvInteracts') 174 | parser.add_argument('--value', '-y', default='Performance', nargs='*') 175 | parser.add_argument('--count', action='store_true') 176 | parser.add_argument('--smooth', '-s', type=int, default=1) 177 | parser.add_argument('--select', nargs='*') 178 | parser.add_argument('--exclude', nargs='*') 179 | parser.add_argument('--est', default='mean') 180 | args = parser.parse_args() 181 | """ 182 | 183 | Args: 184 | logdir (strings): As many log directories (or prefixes to log 185 | directories, which the plotter will autocomplete internally) as 186 | you'd like to plot from. 187 | 188 | legend (strings): Optional way to specify legend for the plot. The 189 | plotter legend will automatically use the ``exp_name`` from the 190 | config.json file, unless you tell it otherwise through this flag. 191 | This only works if you provide a name for each directory that 192 | will get plotted. (Note: this may not be the same as the number 193 | of logdir args you provide! Recall that the plotter looks for 194 | autocompletes of the logdir args: there may be more than one 195 | match for a given logdir prefix, and you will need to provide a 196 | legend string for each one of those matches---unless you have 197 | removed some of them as candidates via selection or exclusion 198 | rules (below).) 199 | 200 | xaxis (string): Pick what column from data is used for the x-axis. 201 | Defaults to ``TotalEnvInteracts``. 202 | 203 | value (strings): Pick what columns from data to graph on the y-axis. 204 | Submitting multiple values will produce multiple graphs. Defaults 205 | to ``Performance``, which is not an actual output of any algorithm. 206 | Instead, ``Performance`` refers to either ``AverageEpRet``, the 207 | correct performance measure for the on-policy algorithms, or 208 | ``AverageTestEpRet``, the correct performance measure for the 209 | off-policy algorithms. The plotter will automatically figure out 210 | which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for 211 | each separate logdir. 212 | 213 | count: Optional flag. By default, the plotter shows y-values which 214 | are averaged across all results that share an ``exp_name``, 215 | which is typically a set of identical experiments that only vary 216 | in random seed. But if you'd like to see all of those curves 217 | separately, use the ``--count`` flag. 218 | 219 | smooth (int): Smooth data by averaging it over a fixed window. This 220 | parameter says how wide the averaging window will be. 221 | 222 | select (strings): Optional selection rule: the plotter will only show 223 | curves from logdirs that contain all of these substrings. 224 | 225 | exclude (strings): Optional exclusion rule: plotter will only show 226 | curves from logdirs that do not contain these substrings. 
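A typical invocation (illustrative; substitute your own log directories) would be:

        python sp_utils/plot.py data/sac_exp data/td3_exp --legend sac td3 --value Performance --smooth 10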
227 | 228 | """ 229 | 230 | make_plots(args.logdir, args.legend, args.xaxis, args.value, args.count, 231 | smooth=args.smooth, select=args.select, exclude=args.exclude, 232 | estimator=args.est) 233 | 234 | if __name__ == "__main__": 235 | main() -------------------------------------------------------------------------------- /sp_utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def convert_json(obj): 5 | """ Convert obj to a version which can be serialized with JSON. 6 | 垃圾递归!删掉了造成无限递归的数据类型~ 7 | """ 8 | 9 | if is_json_serializable(obj): 10 | return obj 11 | else: 12 | if isinstance(obj, dict): 13 | return {convert_json(k): convert_json(v) 14 | for k,v in obj.items()} 15 | 16 | elif isinstance(obj, tuple): 17 | return (convert_json(x) for x in obj) 18 | 19 | elif isinstance(obj, list): 20 | return [convert_json(x) for x in obj] 21 | 22 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 23 | # return 24 | return convert_json(obj.__name__) 25 | 26 | elif hasattr(obj,'__dict__') and obj.__dict__: 27 | return 28 | obj_dict = {convert_json(k): convert_json(v) 29 | for k,v in obj.__dict__.items()} 30 | return {str(obj): obj_dict} 31 | 32 | return str(obj) 33 | 34 | def is_json_serializable(v): 35 | try: 36 | json.dumps(v) 37 | return True 38 | except: 39 | return False -------------------------------------------------------------------------------- /td3_sp/TD3_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from td3_sp import core 8 | from td3_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for TD3 agents. 
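Once ``size`` transitions have been stored, new entries overwrite the oldest ones (the write pointer wraps around modulo ``size``).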
14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | class TD3: 43 | def __init__(self, 44 | a_dim, obs_dim, a_bound, 45 | mlp_actor_critic=core.mlp_actor_critic, 46 | ac_kwargs=dict(), seed=0, 47 | 48 | replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 50 | batch_size=100, 51 | # start_steps=10000, 52 | act_noise=0.1, target_noise=0.2, 53 | noise_clip=0.5, policy_delay=2, 54 | # max_ep_len=1000, 55 | # logger_kwargs=dict(), save_freq=1 56 | ): 57 | 58 | self.learn_step = 0 59 | 60 | self.obs_dim = obs_dim 61 | self.act_dim = a_dim 62 | self.act_limit = a_bound 63 | self.policy_delay = policy_delay 64 | self.action_noise = act_noise 65 | 66 | # Share information about action space with policy architecture 67 | ac_kwargs['action_space'] = a_bound 68 | 69 | # Inputs to computation graph 70 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 71 | 72 | # Main outputs from computation graph 73 | with tf.variable_scope('main'): 74 | self.pi, self.q1, self.q2, self.q1_pi = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 75 | 76 | # Target policy network 77 | with tf.variable_scope('target'): 78 | pi_targ, _, _, _ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 79 | 80 | # Target Q networks 81 | with tf.variable_scope('target', reuse=True): 82 | 83 | # Target policy smoothing, by adding clipped noise to target actions 84 | epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) 85 | epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) 86 | a2 = pi_targ + epsilon 87 | a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit) 88 | 89 | # Target Q-values, using action from target policy 90 | _, q1_targ, q2_targ, _ = mlp_actor_critic(self.x2_ph, a2, **ac_kwargs) 91 | 92 | # Experience buffer 93 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 94 | 95 | # Count variables 96 | var_counts = tuple(core.count_vars(scope) 97 | for scope in ['main/pi', 98 | 'main/q1', 99 | 'main/q2', 100 | 'main']) 101 | print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) 102 | 103 | # Bellman backup for Q functions, using Clipped Double-Q targets 104 | min_q_targ = tf.minimum(q1_targ, q2_targ) 105 | backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * min_q_targ) 106 | 107 | # TD3 losses 108 | self.pi_loss = -tf.reduce_mean(self.q1_pi) 109 | q1_loss = tf.reduce_mean((self.q1 - backup) ** 2) 110 | q2_loss = tf.reduce_mean((self.q2 - 
backup) ** 2) 111 | # 为啥这里的loss是加起来的? 112 | self.q_loss = q1_loss + q2_loss 113 | 114 | # Separate train ops for pi, q 115 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 116 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 117 | self.train_pi_op = pi_optimizer.minimize(self.pi_loss, 118 | var_list=get_vars('main/pi')) 119 | # 这里的参数,怎么是总的q? 120 | # 难道这里的字符串只需要匹配就好了? 121 | self.train_q_op = q_optimizer.minimize(self.q_loss, 122 | var_list=get_vars('main/q')) 123 | 124 | # Polyak averaging for target variables 125 | self.target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 126 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 127 | 128 | # Initializing targets to match main variables 129 | target_init = tf.group([tf.assign(v_targ, v_main) 130 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 131 | 132 | self.sess = tf.Session() 133 | self.sess.run(tf.global_variables_initializer()) 134 | self.sess.run(target_init) 135 | 136 | def get_action(self, s, noise_scale=0): 137 | if not noise_scale: 138 | noise_scale = self.action_noise 139 | a = self.sess.run(self.pi, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 140 | a += noise_scale * np.random.randn(self.act_dim) 141 | return np.clip(a, -self.act_limit, self.act_limit) 142 | 143 | def store_transition(self, transition): 144 | (s, a, r, s_, done) = transition 145 | self.replay_buffer.store(s, a, r, s_, done) 146 | 147 | def test_agent(self, env, max_ep_len=1000, n=5): 148 | ep_reward_list = [] 149 | for j in range(n): 150 | s = env.reset() 151 | ep_reward = 0 152 | for i in range(max_ep_len): 153 | # Take deterministic actions at test time (noise_scale=0) 154 | s, r, d, _ = env.step(self.get_action(s)) 155 | ep_reward += r 156 | ep_reward_list.append(ep_reward) 157 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 158 | return mean_ep_reward 159 | 160 | def learn(self, batch_size=100): 161 | 162 | batch = self.replay_buffer.sample_batch(batch_size) 163 | feed_dict = {self.x_ph: batch['obs1'], 164 | self.x2_ph: batch['obs2'], 165 | self.a_ph: batch['acts'], 166 | self.r_ph: batch['rews'], 167 | self.d_ph: batch['done'] 168 | } 169 | q_step_ops = [self.q_loss, self.q1, self.q2, self.train_q_op] 170 | outs = self.sess.run(q_step_ops, feed_dict) 171 | 172 | if self.learn_step % self.policy_delay == 0: 173 | # Delayed policy update 174 | outs = self.sess.run([self.pi_loss, self.train_pi_op, self.target_update], 175 | feed_dict) 176 | self.learn_step += 1 177 | 178 | def load_step_network(self, saver, load_path): 179 | checkpoint = tf.train.get_checkpoint_state(load_path) 180 | if checkpoint and checkpoint.model_checkpoint_path: 181 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 182 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 183 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 184 | else: 185 | print("Could not find old network weights") 186 | 187 | def save_step_network(self, time_step, saver, save_path): 188 | saver.save(self.sess, save_path + 'network', global_step=time_step, 189 | write_meta_graph=False) 190 | 191 | def load_simple_network(self, path): 192 | saver = tf.train.Saver() 193 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 194 | print("restore model successful") 195 | 196 | def save_simple_network(self, save_path): 197 | saver = tf.train.Saver() 198 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 199 | 200 | 201 | if __name__ == 
'__main__': 202 | import argparse 203 | 204 | random_seed = int(time.time() * 1000 % 1000) 205 | parser = argparse.ArgumentParser() 206 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 207 | parser.add_argument('--hid', type=int, default=300) 208 | parser.add_argument('--l', type=int, default=1) 209 | parser.add_argument('--gamma', type=float, default=0.99) 210 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 211 | parser.add_argument('--epochs', type=int, default=3000) 212 | parser.add_argument('--max_steps', type=int, default=1000) 213 | parser.add_argument('--exp_name', type=str, default='td3_class') 214 | args = parser.parse_args() 215 | 216 | env = gym.make(args.env) 217 | env = env.unwrapped 218 | env.seed(args.seed) 219 | 220 | s_dim = env.observation_space.shape[0] 221 | a_dim = env.action_space.shape[0] 222 | a_bound = env.action_space.high[0] 223 | 224 | net = TD3(a_dim, s_dim, a_bound, 225 | batch_size=100, 226 | ) 227 | ep_reward_list = [] 228 | test_ep_reward_list = [] 229 | 230 | for i in range(args.epochs): 231 | s = env.reset() 232 | ep_reward = 0 233 | for j in range(args.max_steps): 234 | 235 | # Add exploration noise 236 | if i < 10: 237 | a = np.random.rand(a_dim) * a_bound 238 | else: 239 | # a = net.choose_action(s) 240 | a = net.get_action(s, 0.1) 241 | # a = noise.add_noise(a) 242 | 243 | a = np.clip(a, -a_bound, a_bound) 244 | 245 | s_, r, done, info = env.step(a) 246 | done = False if j == args.max_steps - 1 else done 247 | 248 | net.store_transition((s, a, r, s_, done)) 249 | 250 | s = s_ 251 | ep_reward += r 252 | if j == args.max_steps - 1: 253 | 254 | for _ in range(args.max_steps): 255 | net.learn() 256 | 257 | ep_reward_list.append(ep_reward) 258 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 259 | # 'Explore: %.2f' % var, 260 | "learn step:", net.learn_step) 261 | # if ep_reward > -300:RENDER = True 262 | 263 | # 增加测试部分! 264 | if i % 20 == 0: 265 | test_ep_reward = net.test_agent(env=env, n=5) 266 | test_ep_reward_list.append(test_ep_reward) 267 | print("-" * 20) 268 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 269 | 'Test Reward: %i' % int(test_ep_reward), 270 | ) 271 | print("-" * 20) 272 | 273 | break 274 | 275 | import matplotlib.pyplot as plt 276 | 277 | plt.plot(ep_reward_list) 278 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 279 | str(args.epochs) + 280 | "_seed" + str(args.seed)) 281 | plt.title(img_name + "_train") 282 | plt.savefig(img_name + ".png") 283 | plt.show() 284 | plt.close() 285 | 286 | plt.plot(test_ep_reward_list) 287 | plt.title(img_name + "_test") 288 | plt.savefig(img_name + ".png") 289 | plt.show() 290 | 291 | -------------------------------------------------------------------------------- /td3_sp/TD3_sp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from td3_sp import core 8 | from td3_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for TD3 agents. 
14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | """ 43 | 44 | TD3 (Twin Delayed DDPG) 45 | 46 | """ 47 | 48 | 49 | def td3(env_fn, mlp_actor_critic=core.mlp_actor_critic, 50 | ac_kwargs=dict(), seed=0, 51 | steps_per_epoch=5000, epochs=250, 52 | replay_size=int(1e6), gamma=0.99, 53 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 54 | batch_size=100, start_steps=10000, 55 | act_noise=0.1, target_noise=0.2, 56 | noise_clip=0.5, policy_delay=2, 57 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 58 | 59 | tf.set_random_seed(seed) 60 | np.random.seed(seed) 61 | 62 | env, test_env = env_fn(), env_fn() 63 | obs_dim = env.observation_space.shape[0] 64 | act_dim = env.action_space.shape[0] 65 | 66 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 
67 | act_limit = env.action_space.high[0] 68 | 69 | # Share information about action space with policy architecture 70 | ac_kwargs['action_space'] = env.action_space.high[0] 71 | 72 | # Inputs to computation graph 73 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 74 | 75 | # Main outputs from computation graph 76 | with tf.variable_scope('main'): 77 | pi, q1, q2, q1_pi = mlp_actor_critic(x_ph, a_ph, **ac_kwargs) 78 | 79 | # Target policy network 80 | with tf.variable_scope('target'): 81 | pi_targ, _, _, _ = mlp_actor_critic(x2_ph, a_ph, **ac_kwargs) 82 | 83 | # Target Q networks 84 | with tf.variable_scope('target', reuse=True): 85 | 86 | # Target policy smoothing, by adding clipped noise to target actions 87 | epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) 88 | epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) 89 | a2 = pi_targ + epsilon 90 | a2 = tf.clip_by_value(a2, -act_limit, act_limit) 91 | 92 | # Target Q-values, using action from target policy 93 | _, q1_targ, q2_targ, _ = mlp_actor_critic(x2_ph, a2, **ac_kwargs) 94 | 95 | # Experience buffer 96 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 97 | 98 | # Count variables 99 | var_counts = tuple(core.count_vars(scope) 100 | for scope in ['main/pi', 101 | 'main/q1', 102 | 'main/q2', 103 | 'main']) 104 | print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) 105 | 106 | # Bellman backup for Q functions, using Clipped Double-Q targets 107 | min_q_targ = tf.minimum(q1_targ, q2_targ) 108 | backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) 109 | 110 | # TD3 losses 111 | pi_loss = -tf.reduce_mean(q1_pi) 112 | q1_loss = tf.reduce_mean((q1 - backup) ** 2) 113 | q2_loss = tf.reduce_mean((q2 - backup) ** 2) 114 | # The two critic losses are summed so that one optimizer step updates both Q-networks. 115 | q_loss = q1_loss + q2_loss 116 | 117 | # Separate train ops for pi, q 118 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 119 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 120 | train_pi_op = pi_optimizer.minimize(pi_loss, 121 | var_list=get_vars('main/pi')) 122 | # get_vars matches variables by substring, so 'main/q' picks up both 'main/q1' and 'main/q2'; 123 | # the single train op below therefore updates both critics with the summed q_loss. 
124 | train_q_op = q_optimizer.minimize(q_loss, 125 | var_list=get_vars('main/q')) 126 | 127 | # Polyak averaging for target variables 128 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 129 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 130 | 131 | # Initializing targets to match main variables 132 | target_init = tf.group([tf.assign(v_targ, v_main) 133 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 134 | 135 | sess = tf.Session() 136 | sess.run(tf.global_variables_initializer()) 137 | sess.run(target_init) 138 | 139 | def get_action(o, noise_scale): 140 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] 141 | a += noise_scale * np.random.randn(act_dim) 142 | return np.clip(a, -act_limit, act_limit) 143 | 144 | def test_agent(n=10): 145 | for j in range(n): 146 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 147 | while not (d or (ep_len == max_ep_len)): 148 | # Take deterministic actions at test time (noise_scale=0) 149 | o, r, d, _ = test_env.step(get_action(o, 0)) 150 | ep_ret += r 151 | ep_len += 1 152 | 153 | start_time = time.time() 154 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 155 | total_steps = steps_per_epoch * epochs 156 | 157 | ep_ret_list = [] 158 | episode = 0 159 | 160 | for t in range(total_steps): 161 | 162 | """ 163 | Until start_steps have elapsed, randomly sample actions 164 | from a uniform distribution for better exploration. Afterwards, 165 | use the learned policy (with some noise, via act_noise). 166 | """ 167 | if t > start_steps: 168 | a = get_action(o, act_noise) 169 | else: 170 | a = env.action_space.sample() 171 | 172 | # Step the env 173 | o2, r, d, _ = env.step(a) 174 | ep_ret += r 175 | ep_len += 1 176 | 177 | # Ignore the "done" signal if it comes from hitting the time 178 | # horizon (that is, when it's an artificial terminal signal 179 | # that isn't based on the agent's state) 180 | d = False if ep_len == max_ep_len else d 181 | 182 | # Store experience to replay buffer 183 | replay_buffer.store(o, a, r, o2, d) 184 | 185 | # Super critical, easy to overlook step: make sure to update 186 | # most recent observation! 187 | o = o2 188 | 189 | if d or (ep_len == max_ep_len): 190 | """ 191 | Perform all TD3 updates at the end of the trajectory 192 | (in accordance with source code of TD3 published by 193 | original authors). 194 | """ 195 | episode += 1 196 | ep_ret_list.append(ep_ret) 197 | epoch = t // steps_per_epoch 198 | print("Epoch:", epoch) 199 | print("Episode:", episode) 200 | print("Training Step:", t) 201 | print("Episode Reward:", ep_ret) 202 | for j in range(ep_len): 203 | batch = replay_buffer.sample_batch(batch_size) 204 | feed_dict = {x_ph: batch['obs1'], 205 | x2_ph: batch['obs2'], 206 | a_ph: batch['acts'], 207 | r_ph: batch['rews'], 208 | d_ph: batch['done'] 209 | } 210 | q_step_ops = [q_loss, q1, q2, train_q_op] 211 | outs = sess.run(q_step_ops, feed_dict) 212 | 213 | if j % policy_delay == 0: 214 | # Delayed policy update 215 | outs = sess.run([pi_loss, train_pi_op, target_update], 216 | feed_dict) 217 | 218 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 219 | 220 | # End of epoch wrap-up 221 | if t > 0 and t % steps_per_epoch == 0: 222 | 223 | # Test the performance of the deterministic version of the agent. 
224 | test_agent() 225 | 226 | import matplotlib.pyplot as plt 227 | plt.plot(ep_ret_list) 228 | plt.show() 229 | 230 | 231 | if __name__ == '__main__': 232 | import argparse 233 | 234 | parser = argparse.ArgumentParser() 235 | parser.add_argument('--env', type=str, default='Hopper-v2') 236 | parser.add_argument('--hid', type=int, default=300) 237 | parser.add_argument('--l', type=int, default=1) 238 | parser.add_argument('--gamma', type=float, default=0.99) 239 | parser.add_argument('--seed', '-s', type=int, default=3) 240 | parser.add_argument('--epochs', type=int, default=250) 241 | parser.add_argument('--exp_name', type=str, default='td3') 242 | args = parser.parse_args() 243 | 244 | td3(lambda: gym.make(args.env), mlp_actor_critic=core.mlp_actor_critic, 245 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 246 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 247 | ) 248 | 249 | -------------------------------------------------------------------------------- /td3_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/td3_sp/__init__.py -------------------------------------------------------------------------------- /td3_sp/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, 7 | shape=(None,dim) if dim else (None,)) 8 | 9 | 10 | def placeholders(*args): 11 | return [placeholder(dim) for dim in args] 12 | 13 | 14 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 15 | for h in hidden_sizes[:-1]: 16 | x = tf.layers.dense(x, units=h, activation=activation) 17 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 18 | 19 | 20 | def get_vars(scope): 21 | return [x for x in tf.global_variables() if scope in x.name] 22 | 23 | 24 | def count_vars(scope): 25 | v = get_vars(scope) 26 | return sum([np.prod(var.shape.as_list()) for var in v]) 27 | 28 | 29 | """ 30 | Actor-Critics 31 | """ 32 | 33 | 34 | def mlp_actor_critic(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu, 35 | output_activation=tf.tanh, action_space=None): 36 | act_dim = a.shape.as_list()[-1] 37 | act_limit = action_space 38 | with tf.variable_scope('pi'): 39 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], 40 | activation, output_activation) 41 | with tf.variable_scope('q1'): 42 | q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), 43 | list(hidden_sizes)+[1], 44 | activation, None), axis=1) 45 | with tf.variable_scope('q2'): 46 | q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), 47 | list(hidden_sizes)+[1], 48 | activation, None), axis=1) 49 | with tf.variable_scope('q1', reuse=True): 50 | q1_pi = tf.squeeze(mlp(tf.concat([x, pi], axis=-1), 51 | list(hidden_sizes)+[1], 52 | activation, None), axis=1) 53 | return pi, q1, q2, q1_pi 54 | -------------------------------------------------------------------------------- /td3_sp/td3_origin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from td3_sp import core 8 | from td3_sp.core import get_vars, mlp_actor_critic 9 | from sp_utils.logx import EpochLogger, setup_logger_kwargs 10 | 11 | 12 | class ReplayBuffer: 13 | """ 14 | A simple FIFO 
experience replay buffer for TD3 agents. 15 | """ 16 | 17 | def __init__(self, obs_dim, act_dim, size): 18 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 20 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 21 | self.rews_buf = np.zeros(size, dtype=np.float32) 22 | self.done_buf = np.zeros(size, dtype=np.float32) 23 | self.ptr, self.size, self.max_size = 0, 0, size 24 | 25 | def store(self, obs, act, rew, next_obs, done): 26 | self.obs1_buf[self.ptr] = obs 27 | self.obs2_buf[self.ptr] = next_obs 28 | self.acts_buf[self.ptr] = act 29 | self.rews_buf[self.ptr] = rew 30 | self.done_buf[self.ptr] = done 31 | self.ptr = (self.ptr + 1) % self.max_size 32 | self.size = min(self.size + 1, self.max_size) 33 | 34 | def sample_batch(self, batch_size=32): 35 | idxs = np.random.randint(0, self.size, size=batch_size) 36 | return dict(obs1=self.obs1_buf[idxs], 37 | obs2=self.obs2_buf[idxs], 38 | acts=self.acts_buf[idxs], 39 | rews=self.rews_buf[idxs], 40 | done=self.done_buf[idxs]) 41 | 42 | 43 | """ 44 | 45 | TD3 (Twin Delayed DDPG) 46 | 47 | """ 48 | 49 | 50 | def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 51 | steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, 52 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 53 | act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, 54 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 55 | """ 56 | 57 | Args: 58 | env_fn : A function which creates a copy of the environment. 59 | The environment must satisfy the OpenAI Gym API. 60 | 61 | actor_critic: A function which takes in placeholder symbols 62 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 63 | outputs from the agent's Tensorflow computation graph: 64 | 65 | =========== ================ ====================================== 66 | Symbol Shape Description 67 | =========== ================ ====================================== 68 | ``pi`` (batch, act_dim) | Deterministically computes actions 69 | | from policy given states. 70 | ``q1`` (batch,) | Gives one estimate of Q* for 71 | | states in ``x_ph`` and actions in 72 | | ``a_ph``. 73 | ``q2`` (batch,) | Gives another estimate of Q* for 74 | | states in ``x_ph`` and actions in 75 | | ``a_ph``. 76 | ``q1_pi`` (batch,) | Gives the composition of ``q1`` and 77 | | ``pi`` for states in ``x_ph``: 78 | | q1(x, pi(x)). 79 | =========== ================ ====================================== 80 | 81 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 82 | function you provided to TD3. 83 | 84 | seed (int): Seed for random number generators. 85 | 86 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 87 | for the agent and the environment in each epoch. 88 | 89 | epochs (int): Number of epochs to run and train agent. 90 | 91 | replay_size (int): Maximum length of replay buffer. 92 | 93 | gamma (float): Discount factor. (Always between 0 and 1.) 94 | 95 | polyak (float): Interpolation factor in polyak averaging for target 96 | networks. Target networks are updated towards main networks 97 | according to: 98 | 99 | .. math:: \\theta_{\\text{targ}} \\leftarrow 100 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 101 | 102 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 103 | close to 1.) 104 | 105 | pi_lr (float): Learning rate for policy. 106 | 107 | q_lr (float): Learning rate for Q-networks. 
108 | 
109 |         batch_size (int): Minibatch size for SGD.
110 | 
111 |         start_steps (int): Number of steps for uniform-random action selection,
112 |             before running real policy. Helps exploration.
113 | 
114 |         act_noise (float): Stddev for Gaussian exploration noise added to
115 |             policy at training time. (At test time, no noise is added.)
116 | 
117 |         target_noise (float): Stddev for smoothing noise added to target
118 |             policy.
119 | 
120 |         noise_clip (float): Limit for absolute value of target policy
121 |             smoothing noise.
122 | 
123 |         policy_delay (int): Policy will only be updated once every
124 |             policy_delay times for each update of the Q-networks.
125 | 
126 |         max_ep_len (int): Maximum length of trajectory / episode / rollout.
127 | 
128 |         logger_kwargs (dict): Keyword args for EpochLogger.
129 | 
130 |         save_freq (int): How often (in terms of gap between epochs) to save
131 |             the current policy and value function.
132 | 
133 |     """
134 | 
135 |     logger = EpochLogger(**logger_kwargs)
136 |     print("locals()", locals())
137 |     logger.save_config(locals())
138 | 
139 |     tf.set_random_seed(seed)
140 |     np.random.seed(seed)
141 | 
142 |     env, test_env = env_fn(), env_fn()
143 |     obs_dim = env.observation_space.shape[0]
144 |     act_dim = env.action_space.shape[0]
145 | 
146 |     # Action limit for clamping: critically, assumes all dimensions share the same bound!
147 |     act_limit = env.action_space.high[0]
148 | 
149 |     # Pass the scalar action bound to the policy architecture (td3_sp/core.mlp_actor_critic expects the limit, not the Box object)
150 |     ac_kwargs['action_space'] = env.action_space.high[0]
151 | 
152 |     # Inputs to computation graph
153 |     x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
154 | 
155 |     # Main outputs from computation graph
156 |     with tf.variable_scope('main'):
157 |         pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
158 | 
159 |     # Target policy network
160 |     with tf.variable_scope('target'):
161 |         pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)
162 | 
163 |     # Target Q networks
164 |     with tf.variable_scope('target', reuse=True):
165 | 
166 |         # Target policy smoothing, by adding clipped noise to target actions
167 |         epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
168 |         epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
169 |         a2 = pi_targ + epsilon
170 |         a2 = tf.clip_by_value(a2, -act_limit, act_limit)
171 | 
172 |         # Target Q-values, using action from target policy
173 |         _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)
174 | 
175 |     # Experience buffer
176 |     replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
177 | 
178 |     # Count variables
179 |     var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
180 |     print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)
181 | 
182 |     # Bellman backup for Q functions, using Clipped Double-Q targets
183 |     min_q_targ = tf.minimum(q1_targ, q2_targ)
184 |     backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)
185 | 
186 |     # TD3 losses
187 |     pi_loss = -tf.reduce_mean(q1_pi)
188 |     q1_loss = tf.reduce_mean((q1 - backup) ** 2)
189 |     q2_loss = tf.reduce_mean((q2 - backup) ** 2)
190 |     q_loss = q1_loss + q2_loss
191 | 
192 |     # Separate train ops for pi, q
193 |     pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
194 |     q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
195 |     train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
196 |     train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
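    # A sense of scale for the Polyak update defined just below: with the default
    # polyak=0.995, each call moves a target weight only 0.5% of the way toward its
    # main counterpart, e.g. v_targ = 0.995 * 1.0 + 0.005 * 2.0 = 1.005, so the
    # Bellman targets used above change slowly and stay stable.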
197 | 198 | # Polyak averaging for target variables 199 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 200 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 201 | 202 | # Initializing targets to match main variables 203 | target_init = tf.group([tf.assign(v_targ, v_main) 204 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 205 | 206 | sess = tf.Session() 207 | sess.run(tf.global_variables_initializer()) 208 | sess.run(target_init) 209 | 210 | # Setup model saving 211 | logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2}) 212 | 213 | def get_action(o, noise_scale): 214 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] 215 | a += noise_scale * np.random.randn(act_dim) 216 | return np.clip(a, -act_limit, act_limit) 217 | 218 | def test_agent(n=10): 219 | for j in range(n): 220 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 221 | while not (d or (ep_len == max_ep_len)): 222 | # Take deterministic actions at test time (noise_scale=0) 223 | o, r, d, _ = test_env.step(get_action(o, 0)) 224 | ep_ret += r 225 | ep_len += 1 226 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 227 | 228 | start_time = time.time() 229 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 230 | total_steps = steps_per_epoch * epochs 231 | 232 | # Main loop: collect experience in env and update/log each epoch 233 | for t in range(total_steps): 234 | 235 | """ 236 | Until start_steps have elapsed, randomly sample actions 237 | from a uniform distribution for better exploration. Afterwards, 238 | use the learned policy (with some noise, via act_noise). 239 | """ 240 | if t > start_steps: 241 | a = get_action(o, act_noise) 242 | else: 243 | a = env.action_space.sample() 244 | 245 | # Step the env 246 | o2, r, d, _ = env.step(a) 247 | ep_ret += r 248 | ep_len += 1 249 | 250 | # Ignore the "done" signal if it comes from hitting the time 251 | # horizon (that is, when it's an artificial terminal signal 252 | # that isn't based on the agent's state) 253 | d = False if ep_len == max_ep_len else d 254 | 255 | # Store experience to replay buffer 256 | replay_buffer.store(o, a, r, o2, d) 257 | 258 | # Super critical, easy to overlook step: make sure to update 259 | # most recent observation! 260 | o = o2 261 | 262 | if d or (ep_len == max_ep_len): 263 | """ 264 | Perform all TD3 updates at the end of the trajectory 265 | (in accordance with source code of TD3 published by 266 | original authors). 
267 | """ 268 | for j in range(ep_len): 269 | batch = replay_buffer.sample_batch(batch_size) 270 | feed_dict = {x_ph: batch['obs1'], 271 | x2_ph: batch['obs2'], 272 | a_ph: batch['acts'], 273 | r_ph: batch['rews'], 274 | d_ph: batch['done'] 275 | } 276 | q_step_ops = [q_loss, q1, q2, train_q_op] 277 | outs = sess.run(q_step_ops, feed_dict) 278 | logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) 279 | 280 | if j % policy_delay == 0: 281 | # Delayed policy update 282 | outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) 283 | logger.store(LossPi=outs[0]) 284 | 285 | logger.store(EpRet=ep_ret, EpLen=ep_len) 286 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 287 | 288 | # End of epoch wrap-up 289 | if t > 0 and t % steps_per_epoch == 0: 290 | epoch = t // steps_per_epoch 291 | 292 | # Save model 293 | if (epoch % save_freq == 0) or (epoch == epochs - 1): 294 | logger.save_state({'env': env}, None) 295 | 296 | # Test the performance of the deterministic version of the agent. 297 | test_agent() 298 | 299 | # Log info about epoch 300 | logger.log_tabular('Epoch', epoch) 301 | logger.log_tabular('EpRet', with_min_and_max=True) 302 | logger.log_tabular('TestEpRet', with_min_and_max=True) 303 | logger.log_tabular('EpLen', average_only=True) 304 | logger.log_tabular('TestEpLen', average_only=True) 305 | logger.log_tabular('TotalEnvInteracts', t) 306 | logger.log_tabular('Q1Vals', with_min_and_max=True) 307 | logger.log_tabular('Q2Vals', with_min_and_max=True) 308 | logger.log_tabular('LossPi', average_only=True) 309 | logger.log_tabular('LossQ', average_only=True) 310 | logger.log_tabular('Time', time.time() - start_time) 311 | logger.dump_tabular() 312 | 313 | 314 | if __name__ == '__main__': 315 | import argparse 316 | 317 | parser = argparse.ArgumentParser() 318 | parser.add_argument('--env', type=str, default='Hopper-v2') 319 | parser.add_argument('--hid', type=int, default=300) 320 | parser.add_argument('--l', type=int, default=1) 321 | parser.add_argument('--gamma', type=float, default=0.99) 322 | parser.add_argument('--seed', '-s', type=int, default=0) 323 | parser.add_argument('--epochs', type=int, default=50) 324 | parser.add_argument('--exp_name', type=str, default='td3') 325 | args = parser.parse_args() 326 | 327 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 328 | 329 | td3(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 330 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 331 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 332 | logger_kwargs=logger_kwargs) 333 | -------------------------------------------------------------------------------- /td3_sp/test_gym_td3_sp_class.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import gym 4 | import time 5 | import sys 6 | 7 | sys.path.append("../") 8 | from td3_sp.TD3_class import TD3 9 | 10 | MAX_EPISODES = 250 11 | MAX_EP_STEPS = 1000 12 | 13 | RENDER = False 14 | ENV_NAME = 'Hopper-v2' 15 | 16 | 17 | def test_agent(net, env, n=10): 18 | ep_reward_list = [] 19 | for j in range(n): 20 | s = env.reset() 21 | ep_reward = 0 22 | for i in range(MAX_EP_STEPS): 23 | # Take deterministic actions at test time (noise_scale=0) 24 | s, r, d, _ = env.step(net.get_action(s)) 25 | ep_reward += r 26 | 27 | ep_reward_list.append(ep_reward) 28 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 29 | return mean_ep_reward 30 | 31 | 32 | def main(): 33 | 34 | env = gym.make(ENV_NAME) 35 | 
env = env.unwrapped
36 |     env.seed(4)
37 | 
38 |     s_dim = env.observation_space.shape[0]
39 |     a_dim = env.action_space.shape[0]
40 |     a_bound = env.action_space.high[0]
41 | 
42 |     net = TD3(a_dim, s_dim, a_bound,
43 |               batch_size=100,
44 |               )
45 |     ep_reward_list = []
46 |     test_ep_reward_list = []
47 |     for i in range(MAX_EPISODES):
48 |         s = env.reset()
49 |         ep_reward = 0
50 |         for j in range(MAX_EP_STEPS):
51 |             if RENDER:
52 |                 env.render()
53 | 
54 |             # Add exploration noise
55 |             if i < 10:
56 |                 a = np.random.uniform(-a_bound, a_bound, size=a_dim)  # warm-up: cover the full action range (np.random.rand only gave [0, a_bound))
57 |             else:
58 |                 # a = net.choose_action(s)
59 |                 a = net.get_action(s, 0.1)
60 |                 # a = noise.add_noise(a)
61 | 
62 |             a = np.clip(a, -a_bound, a_bound)
63 | 
64 |             s_, r, done, info = env.step(a)
65 |             done = False if j == MAX_EP_STEPS-1 else done
66 | 
67 |             net.store_transition((s, a, r, s_, done))
68 | 
69 |             s = s_
70 |             ep_reward += r
71 |             if j == MAX_EP_STEPS - 1:
72 | 
73 |                 for _ in range(MAX_EP_STEPS):
74 |                     net.learn()
75 | 
76 |                 ep_reward_list.append(ep_reward)
77 |                 print('Episode:', i, ' Reward: %i' % int(ep_reward),
78 |                       # 'Explore: %.2f' % var,
79 |                       "learn step:", net.learn_step)
80 |                 # if ep_reward > -300:RENDER = True
81 | 
82 |                 # Evaluation: periodically test the current policy
83 |                 if i % 20 == 0:
84 |                     test_ep_reward = test_agent(net=net, env=env, n=5)
85 |                     test_ep_reward_list.append(test_ep_reward)
86 |                     print("-"*20)
87 |                     print('Episode:', i, ' Reward: %i' % int(ep_reward),
88 |                           'Test Reward: %i' % int(test_ep_reward),
89 |                           )
90 |                     print("-" * 20)
91 | 
92 |                 break
93 | 
94 |     plt.plot(ep_reward_list)
95 |     plt.show()
96 |     plt.plot(test_ep_reward_list)
97 |     plt.show()
98 | 
99 | 
100 | if __name__ == '__main__':
101 |     main()
102 | 
103 | 
--------------------------------------------------------------------------------