├── __init__.py
├── ddpg-movan
│   ├── DDPG.py
│   ├── DDPG_per.py
│   └── __init__.py
├── ddpg_sp
│   ├── DDPG_class.py
│   ├── DDPG_per_class.py
│   ├── DDPG_sp.py
│   ├── __init__.py
│   ├── core.py
│   ├── ddpg_class_HalfCheetah-v2_epochs200_seed553.png
│   └── ddpg_class_HalfCheetah-v2_epochs3000_seed485.png
├── memory
│   ├── __init__.py
│   ├── per_memory.py
│   ├── simple_memory.py
│   └── sp_per_memory.py
├── noise
│   ├── __init__.py
│   ├── ou_noise.py
│   └── simple_noise.py
├── readme.md
├── run_in_gym
│   ├── __init__.py
│   ├── launch_with_gym.py
│   └── run_gym_sac_class.py
├── sac_auto
│   ├── __init__.py
│   ├── core.py
│   ├── sac_auto_class.py
│   └── sac_auto_per_class.py
├── sac_sp
│   ├── SAC_class.py
│   ├── SAC_sp.py
│   ├── __init__.py
│   ├── core.py
│   ├── exp_images
│   │   ├── HalfCheetah-v2-sac-class-300k-test.png
│   │   ├── HalfCheetah-v2-sac-class-300k.png
│   │   ├── Hopper-v2-sac-class-3000k-test.png
│   │   ├── Hopper-v2-sac-class-3000k-train.png
│   │   └── Hopper-v2-sac-sp-5000k-train.png
│   └── test_gym_sac_sp_class.py
├── sp_utils
│   ├── __init__.py
│   ├── logx.py
│   ├── mpi_tools.py
│   ├── plot.py
│   └── serialization_utils.py
└── td3_sp
    ├── TD3_class.py
    ├── TD3_per_class.py
    ├── TD3_sp.py
    ├── __init__.py
    ├── core.py
    ├── td3_origin.py
    └── test_gym_td3_sp_class.py

/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from ddpg_sp.DDPG_per_class import DDPG as DDPG
3 |     from sac_auto.sac_auto_per_class import SAC as SAC_AUTO
4 |     from td3_sp.TD3_per_class import TD3 as TD3
5 |     from sac_sp.SAC_class import SAC as SAC
6 | except ImportError:
7 |     from rl_algorithms.ddpg_sp.DDPG_per_class import DDPG as DDPG
8 |     from rl_algorithms.sac_auto.sac_auto_per_class import SAC as SAC_AUTO
9 |     from rl_algorithms.td3_sp.TD3_per_class import TD3 as TD3
10 |     from rl_algorithms.sac_sp.SAC_class import SAC as SAC
11 | 
12 | 
--------------------------------------------------------------------------------
/ddpg-movan/DDPG.py:
--------------------------------------------------------------------------------
1 | """
2 | Extracted from the original, working DDPG_OU_noise_memory_list.py.
3 | The noise part has been removed.
4 | The memory part has been stripped out.
5 | This makes it easier to add PER later.
6 | """ 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import sys 11 | from memory.simple_memory import Memory 12 | 13 | 14 | ##################### hyper parameters #################### 15 | GAMMA = 0.9 # reward discount 16 | TAU = 0.01 # soft replacement 17 | 18 | 19 | ############################### DDPG #################################### 20 | 21 | class DDPG(object): 22 | def __init__(self, a_dim, s_dim, a_bound, transition_num=4, restore_flag=False, batch_size=32, memory_size=100000): 23 | self.transition_num = transition_num 24 | self.memory = Memory(memory_size=memory_size, 25 | batch_size=batch_size, 26 | transition_num=transition_num, 27 | ) 28 | self.batch_size = batch_size 29 | 30 | self.pointer = 0 31 | self.learn_step = 0 32 | self.restore_flag = restore_flag 33 | 34 | self.sess = tf.Session() 35 | 36 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 37 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 38 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 39 | 40 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's') 41 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') 42 | self.R = tf.placeholder(tf.float32, [None, 1], 'r') 43 | 44 | with tf.variable_scope('Actor'): 45 | self.a = self._build_a(self.S, scope='eval', trainable=True) 46 | a_ = self._build_a(self.S_, scope='target', trainable=False) 47 | with tf.variable_scope('Critic'): 48 | # assign self.a = a in memory when calculating q for td_error, 49 | # otherwise the self.a is from Actor when updating Actor 50 | q = self._build_c(self.S, self.a, scope='eval', trainable=True) 51 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False) 52 | 53 | # networks parameters 54 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') 55 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') 56 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval') 57 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target') 58 | 59 | # hard_replace 60 | self.hard_replace = [tf.assign(t, e) 61 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 62 | 63 | # target net replacement 64 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 65 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 66 | 67 | q_target = self.R + GAMMA * q_ 68 | # in the feed_dic for the td_error, the self.a should change to actions in memory 69 | td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 70 | self.c_loss = td_error 71 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(td_error, var_list=self.ce_params) 72 | 73 | a_loss = - tf.reduce_mean(q) # maximize the q 74 | self.a_loss = a_loss 75 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(a_loss, var_list=self.ae_params) 76 | 77 | self.sess.run(tf.global_variables_initializer()) 78 | 79 | def choose_action(self, s): 80 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 81 | 82 | def store_transition(self, transition): 83 | self.memory.store(transition) 84 | 85 | def learn(self, actor_lr_input, critic_lr_input, output_loss_flag=False): 86 | # soft target replacement 87 | self.sess.run(self.soft_replace) 88 | # 加上terminal信息 89 | if self.transition_num==5: 90 | bs, ba, br, bs_, bt = self.memory.sample() 91 | if self.transition_num==4: 92 | bs, ba, br, bs_ = 
self.memory.sample() 93 | 94 | self.learn_step += 1 95 | 96 | if output_loss_flag: 97 | _, a_loss = self.sess.run([self.atrain, self.a_loss], {self.S: bs, self.actor_lr: actor_lr_input}) 98 | _, c_loss = self.sess.run([self.ctrain, self.c_loss], 99 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 100 | return a_loss, c_loss 101 | else: 102 | self.sess.run(self.atrain, {self.S: bs, self.actor_lr: actor_lr_input}) 103 | self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 104 | 105 | 106 | 107 | def _build_a(self, s, scope, trainable): 108 | with tf.variable_scope(scope): 109 | net = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) 110 | new_actor_layer = tf.layers.dense(net, 200, activation=tf.nn.relu, name='new_actor_layer', trainable=trainable) 111 | a = tf.layers.dense(new_actor_layer, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) 112 | return tf.multiply(a, self.a_bound, name='scaled_a') 113 | 114 | def _build_c(self, s, a, scope, trainable): 115 | with tf.variable_scope(scope): 116 | n_l1 = 400 117 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) 118 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) 119 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) 120 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 121 | new_critic_layer = tf.layers.dense(net, 300, activation=tf.nn.relu, name='new_critic_layer', 122 | trainable=trainable) 123 | return tf.layers.dense(new_critic_layer, 1, trainable=trainable) # Q(s,a) 124 | 125 | def load_step_network(self, saver, load_path): 126 | checkpoint = tf.train.get_checkpoint_state(load_path) 127 | if checkpoint and checkpoint.model_checkpoint_path: 128 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 129 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 130 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 131 | 132 | else: 133 | print("Could not find old network weights") 134 | 135 | def save_step_network(self, time_step, saver, save_path): 136 | saver.save(self.sess, save_path + 'network', global_step=time_step, 137 | write_meta_graph=False) 138 | 139 | def load_simple_network(self, path): 140 | saver = tf.train.Saver() 141 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 142 | print("restore model successful") 143 | 144 | def save_simple_network(self, save_path): 145 | saver = tf.train.Saver() 146 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 147 | -------------------------------------------------------------------------------- /ddpg-movan/DDPG_per.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning. 3 | DDPG is Actor Critic based algorithm. 4 | Pendulum example. 
5 | View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 6 | Using: 7 | tensorflow 1.0 8 | gym 0.8.0 9 | """ 10 | 11 | import tensorflow as tf 12 | import numpy as np 13 | 14 | 15 | ##################### hyper parameters #################### 16 | 17 | 18 | LR_A = 0.001 # learning rate for actor 19 | LR_C = 0.002 # learning rate for critic 20 | GAMMA = 0.9 # reward discount 21 | TAU = 0.01 # soft replacement 22 | 23 | 24 | class OU_noise(object): 25 | def __init__(self, num_actions, action_low_bound, action_high_bound, dt, 26 | mu=0.0, theta=0.15, max_sigma=2.0, min_sigma=0.1): 27 | self.mu = mu # 0.0 28 | self.theta = theta # 0.15 29 | self.sigma = max_sigma # 0.3 30 | self.max_sigma = max_sigma # 0.3 31 | self.min_sigma = min_sigma # 0.1 32 | self.dt = dt # 0.001 33 | self.num_actions = num_actions # 1 34 | self.action_low = action_low_bound # -2 35 | self.action_high = action_high_bound # 2 36 | self.reset() 37 | 38 | def reset(self): 39 | self.state = np.zeros(self.num_actions) 40 | 41 | # self.state = np.zeros(self.num_actions) 42 | def state_update(self): 43 | x = self.state 44 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.num_actions) # np.random.randn()生成0,1的随机数 45 | self.state = x + dx 46 | 47 | def add_noise(self, action): 48 | self.state_update() 49 | state = self.state 50 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, self.dt) 51 | return np.clip(action + state, self.action_low, self.action_high) 52 | 53 | 54 | class SumTree(object): 55 | """ 56 | This SumTree code is a modified version and the original code is from: 57 | https://github.com/jaara/AI-blog/blob/master/SumTree.py 58 | 59 | Story data with its priority in the tree. 60 | """ 61 | data_pointer = 0 62 | 63 | def __init__(self, capacity): 64 | self.capacity = capacity # for all priority values 65 | self.tree = np.zeros(2 * capacity - 1) 66 | # [--------------Parent nodes-------------][-------leaves to recode priority-------] 67 | # size: capacity - 1 size: capacity 68 | self.data = list(np.zeros(capacity, dtype=object)) # for all transitions 69 | # [--------------data frame-------------] 70 | # size: capacity 71 | 72 | def add(self, p, transition): 73 | tree_idx = self.data_pointer + self.capacity - 1 74 | self.data[self.data_pointer] = transition # update data_frame 75 | self.update(tree_idx, p) # update tree_frame 76 | 77 | self.data_pointer += 1 78 | if self.data_pointer >= self.capacity: # replace when exceed the capacity 79 | self.data_pointer = 0 80 | 81 | def update(self, tree_idx, p): 82 | change = p - self.tree[tree_idx] 83 | self.tree[tree_idx] = p 84 | # then propagate the change through tree 85 | while tree_idx != 0: # this method is faster than the recursive loop in the reference code 86 | tree_idx = (tree_idx - 1) // 2 87 | self.tree[tree_idx] += change 88 | 89 | def get_leaf(self, v): 90 | """ 91 | Tree structure and array storage: 92 | 93 | Tree index: 94 | 0 -> storing priority sum 95 | / \ 96 | 1 2 97 | / \ / \ 98 | 3 4 5 6 -> storing priority for transitions 99 | 100 | Array type for storing: 101 | [0,1,2,3,4,5,6] 102 | """ 103 | parent_idx = 0 104 | while True: # the while loop is faster than the method in the reference code 105 | cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 106 | cr_idx = cl_idx + 1 107 | if cl_idx >= len(self.tree): # reach bottom, end search 108 | leaf_idx = parent_idx 109 | break 110 | else: # downward search, always search for a higher priority node 111 | if v <= self.tree[cl_idx]: 112 | 
parent_idx = cl_idx 113 | else: 114 | v -= self.tree[cl_idx] 115 | parent_idx = cr_idx 116 | 117 | data_idx = leaf_idx - self.capacity + 1 118 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 119 | 120 | @property 121 | def total_p(self): 122 | return self.tree[0] # the root 123 | 124 | 125 | class Memory(object): # stored as ( s, a, r, s_ ) in SumTree 126 | """ 127 | This Memory class is modified based on the original code from: 128 | https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 129 | """ 130 | epsilon = 0.01 # small amount to avoid zero priority 131 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 132 | beta = 0.4 # importance-sampling, from initial value increasing to 1 133 | beta_increment_per_sampling = 0.001 134 | abs_err_upper = 1. # clipped abs error 135 | 136 | def __init__(self, capacity): 137 | self.tree = SumTree(capacity) 138 | self.full_flag = False 139 | 140 | def store(self, transition): 141 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 142 | if max_p == 0: 143 | max_p = self.abs_err_upper 144 | self.tree.add(max_p, transition) # set the max p for new p 145 | 146 | def sample(self, n): 147 | # n就是batch size! 148 | # np.empty()这是一个随机初始化的一个矩阵! 149 | b_idx, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1)) 150 | b_memory = [] 151 | pri_seg = self.tree.total_p / n # priority segment 152 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1 153 | 154 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight 155 | if min_prob == 0: 156 | min_prob = 0.00001 157 | for i in range(n): 158 | a, b = pri_seg * i, pri_seg * (i + 1) 159 | v = np.random.uniform(a, b) 160 | idx, p, data = self.tree.get_leaf(v) 161 | prob = p / self.tree.total_p 162 | ISWeights[i, 0] = np.power(prob/min_prob, -self.beta) 163 | b_idx[i] = idx 164 | b_memory.append(data) 165 | 166 | return b_idx, b_memory, ISWeights 167 | 168 | def batch_update(self, tree_idx, abs_errors): 169 | abs_errors += self.epsilon # convert to abs and avoid 0 170 | clipped_errors = np.minimum(abs_errors, self.abs_err_upper) 171 | ps = np.power(clipped_errors, self.alpha) 172 | for ti, p in zip(tree_idx, ps): 173 | self.tree.update(ti, p) 174 | 175 | ###############################DDPG#################################### 176 | 177 | 178 | class DDPG(object): 179 | def __init__(self, a_dim, s_dim, a_bound, exp_path, 180 | restore_flag=False, 181 | batch_size=512, 182 | per_batch_size=32, 183 | memory_size=100000, 184 | per_memory_size=20000): 185 | self.memory_size = memory_size 186 | self.memory = [] 187 | self.per_memory = Memory(capacity=per_memory_size) 188 | self.per_memory_size = self.per_memory.tree.capacity 189 | self.pointer = 0 190 | self.per_pointer = 0 191 | 192 | self.batch_size = batch_size 193 | self.per_batch_size = per_batch_size 194 | self.exp_path = exp_path 195 | print("self.exp_path", self.exp_path) 196 | 197 | self.learn_step = 0 198 | self.restore_flag = restore_flag 199 | 200 | self.sess = tf.Session() 201 | 202 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 203 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 204 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 205 | 206 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's') 207 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') 208 | self.R = tf.placeholder(tf.float32, [None, 1], 'r') 209 | self.ISWeights = tf.placeholder(tf.float32, 
[None, 1], name='IS_weights') 210 | 211 | with tf.variable_scope('Actor'): 212 | self.a = self._build_a(self.S, scope='eval', trainable=True) 213 | a_ = self._build_a(self.S_, scope='target', trainable=False) 214 | with tf.variable_scope('Critic'): 215 | # assign self.a = a in memory when calculating q for td_error, 216 | # otherwise the self.a is from Actor when updating Actor 217 | q = self._build_c(self.S, self.a, scope='eval', trainable=True) 218 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False) 219 | 220 | # networks parameters 221 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') 222 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') 223 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval') 224 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target') 225 | 226 | # hard_replace 227 | self.hard_replace = [tf.assign(t, e) 228 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 229 | 230 | # target net replacement 231 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 232 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 233 | 234 | q_target = self.R + GAMMA * q_ 235 | # in the feed_dic for the td_error, the self.a should change to actions in memory 236 | # td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 237 | self.abs_errors = tf.reduce_sum(tf.abs(q_target - q), axis=1) # for updating Sumtree 238 | self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(q_target, q)) 239 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(self.loss, var_list=self.ce_params) 240 | 241 | a_loss = - tf.reduce_mean(q) # maximize the q 242 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(a_loss, var_list=self.ae_params) 243 | 244 | self.sess.run(tf.global_variables_initializer()) 245 | 246 | def choose_action(self, s): 247 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 248 | 249 | def learn(self, actor_lr_input, critic_lr_input, per_flag=True): 250 | # soft target replacement 251 | self.sess.run(self.soft_replace) 252 | 253 | if per_flag: 254 | tree_idx, batch_memory, ISWeights = self.per_memory.sample(self.per_batch_size) 255 | batch_states, batch_actions, batch_rewards, batch_states_ = [], [], [], [] 256 | for i in range(self.per_batch_size): 257 | batch_states.append(batch_memory[i][0]) 258 | batch_actions.append(batch_memory[i][1]) 259 | batch_rewards.append(batch_memory[i][2]) 260 | batch_states_.append(batch_memory[i][3]) 261 | 262 | bs = np.array(batch_states) 263 | ba = np.array(batch_actions) 264 | batch_rewards = np.array(batch_rewards) 265 | bs_ = np.array(batch_states_) 266 | br = batch_rewards[:, np.newaxis] 267 | else: 268 | bs, ba, br, bs_ = self.sample_memory() 269 | 270 | # print("br:", br) 271 | 272 | self.sess.run(self.atrain, {self.S: bs, self.actor_lr: actor_lr_input}) 273 | _, abs_errors, cost = self.sess.run([self.ctrain, self.abs_errors, self.loss], 274 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input, 275 | self.ISWeights: ISWeights}) 276 | 277 | self.per_memory.batch_update(tree_idx, abs_errors) # update priority 278 | # print("lr:", self.sess.run(self.actor_lr, {self.actor_lr: actor_lr_input})) 279 | 280 | self.learn_step += 1 281 | 282 | def store_transition(self, s, a, r, s_): 283 | self.per_memory.store(transition=[s, a, r, s_]) 284 
| self.per_pointer = self.per_memory.tree.data_pointer 285 | if len(self.memory) >= self.memory_size: 286 | del self.memory[0] 287 | self.memory.append([s, a, r, s_]) 288 | self.pointer = len(self.memory) 289 | 290 | def sample_memory(self): 291 | if len(self.memory) < self.memory_size: 292 | indices = np.random.choice(len(self.memory), size=self.batch_size) 293 | else: 294 | indices = np.random.choice(self.memory_size, self.batch_size) 295 | batch_states, batch_actions, batch_rewards, batch_states_ = [], [], [], [] 296 | for i in indices: 297 | batch_states.append(self.memory[i][0]) 298 | batch_actions.append(self.memory[i][1]) 299 | batch_rewards.append(self.memory[i][2]) 300 | batch_states_.append(self.memory[i][3]) 301 | 302 | batch_states = np.array(batch_states) 303 | batch_actions = np.array(batch_actions) 304 | batch_rewards = np.array(batch_rewards) 305 | batch_states_ = np.array(batch_states_) 306 | batch_rewards = batch_rewards[:, np.newaxis] 307 | return batch_states, batch_actions, batch_rewards, batch_states_ 308 | 309 | def _build_a(self, s, scope, trainable): 310 | with tf.variable_scope(scope): 311 | net = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) 312 | new_actor_layer = tf.layers.dense(net, 200, activation=tf.nn.relu, name='new_actor_layer', trainable=trainable) 313 | a = tf.layers.dense(new_actor_layer, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) 314 | return tf.multiply(a, self.a_bound, name='scaled_a') 315 | 316 | def _build_c(self, s, a, scope, trainable): 317 | with tf.variable_scope(scope): 318 | n_l1 = 400 319 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) 320 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) 321 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) 322 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 323 | new_critic_layer = tf.layers.dense(net, 300, activation=tf.nn.relu, name='new_critic_layer', 324 | trainable=trainable) 325 | return tf.layers.dense(new_critic_layer, 1, trainable=trainable) # Q(s,a) 326 | 327 | def load_network(self, saver, load_path): 328 | checkpoint = tf.train.get_checkpoint_state(load_path) 329 | if checkpoint and checkpoint.model_checkpoint_path: 330 | # self.saver.restore(self.sess, checkpoint.model_checkpoint_path) 331 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 332 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 333 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 334 | 335 | else: 336 | print("Could not find old network weights") 337 | 338 | def save_network(self, time_step, saver, save_path): 339 | saver.save(self.sess, save_path + 'network', global_step=time_step, 340 | write_meta_graph=False) 341 | 342 | 343 | 344 | 345 | ############################### training #################################### 346 | 347 | -------------------------------------------------------------------------------- /ddpg-movan/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 从DDPG_per中抽取出sumtree类,以及per_memory类 3 | 然后将普通Memeory换成per_memory类。 4 | 5 | """ 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import sys 10 | 11 | 12 | ##################### hyper parameters #################### 13 | GAMMA = 0.9 # reward discount 14 | TAU = 0.01 # soft replacement 15 | 16 | 17 | ############################### DDPG #################################### 18 | 19 | class DDPG(object): 20 | def 
__init__(self, a_dim, s_dim, a_bound, transition_num=4, batch_size=32, memory_size=100000, per_flag=False): 21 | self.transition_num = transition_num 22 | self.memory_size = memory_size 23 | self.per_flag = per_flag 24 | if per_flag: 25 | from memory.per_memory import Memory 26 | else: 27 | from memory.simple_memory import Memory 28 | 29 | self.memory = Memory(memory_size=memory_size, 30 | batch_size=batch_size, 31 | transition_num=transition_num, 32 | ) 33 | self.batch_size = batch_size 34 | 35 | self.learn_step = 0 36 | self.per_pointer = 0 37 | 38 | self.sess = tf.Session() 39 | 40 | self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound, 41 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 42 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 43 | 44 | self.S = tf.placeholder(tf.float32, [None, s_dim], 's') 45 | self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_') 46 | self.R = tf.placeholder(tf.float32, [None, 1], 'r') 47 | self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') 48 | 49 | with tf.variable_scope('Actor'): 50 | self.a = self._build_a(self.S, scope='eval', trainable=True) 51 | a_ = self._build_a(self.S_, scope='target', trainable=False) 52 | with tf.variable_scope('Critic'): 53 | # assign self.a = a in memory when calculating q for td_error, 54 | # otherwise the self.a is from Actor when updating Actor 55 | q = self._build_c(self.S, self.a, scope='eval', trainable=True) 56 | q_ = self._build_c(self.S_, a_, scope='target', trainable=False) 57 | 58 | # networks parameters 59 | self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') 60 | self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') 61 | self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval') 62 | self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target') 63 | 64 | # hard_replace 65 | self.hard_replace = [tf.assign(t, e) 66 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 67 | 68 | # target net replacement 69 | self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e) 70 | for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)] 71 | 72 | q_target = self.R + GAMMA * q_ 73 | 74 | if self.per_flag: 75 | self.abs_errors = tf.reduce_sum(tf.abs(q_target - q), axis=1) # for updating Sumtree 76 | self.c_loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(q_target, q)) 77 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(self.c_loss, var_list=self.ce_params) 78 | 79 | self.a_loss = - tf.reduce_mean(q) # maximize the q 80 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(self.a_loss, var_list=self.ae_params) 81 | else: 82 | # in the feed_dic for the td_error, the self.a should change to actions in memory 83 | td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q) 84 | self.c_loss = td_error 85 | self.ctrain = tf.train.AdamOptimizer(self.critic_lr).minimize(td_error, var_list=self.ce_params) 86 | 87 | a_loss = - tf.reduce_mean(q) # maximize the q 88 | self.a_loss = a_loss 89 | self.atrain = tf.train.AdamOptimizer(self.actor_lr).minimize(a_loss, var_list=self.ae_params) 90 | 91 | self.sess.run(tf.global_variables_initializer()) 92 | 93 | def choose_action(self, s): 94 | return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0] 95 | 96 | def store_transition(self, transition): 97 | self.memory.store(transition) 98 | 
self.per_pointer = self.memory.tree.data_pointer 99 | 100 | def learn(self, actor_lr_input, critic_lr_input, output_loss_flag=False): 101 | # soft target replacement 102 | self.sess.run(self.soft_replace) 103 | self.learn_step += 1 104 | if self.per_flag: 105 | tree_idx, batch_memory, ISWeights = self.memory.sample() 106 | 107 | batch_states, batch_actions, batch_rewards, batch_states_ = [], [], [], [] 108 | for i in range(self.batch_size): 109 | batch_states.append(batch_memory[i][0]) 110 | batch_actions.append(batch_memory[i][1]) 111 | batch_rewards.append(batch_memory[i][2]) 112 | batch_states_.append(batch_memory[i][3]) 113 | 114 | bs = np.array(batch_states) 115 | ba = np.array(batch_actions) 116 | batch_rewards = np.array(batch_rewards) 117 | bs_ = np.array(batch_states_) 118 | br = batch_rewards[:, np.newaxis] 119 | # 增加一个延时更新. 120 | policy_delay = 2 121 | a_loss = 0.0 122 | if self.learn_step % policy_delay == 0: 123 | _, a_loss = self.sess.run([self.atrain, self.a_loss], {self.S: bs, self.actor_lr: actor_lr_input}) 124 | 125 | _, abs_errors, cost = self.sess.run([self.ctrain, self.abs_errors, self.c_loss], 126 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, 127 | self.critic_lr: critic_lr_input, 128 | self.ISWeights: ISWeights}) 129 | 130 | self.memory.batch_update(tree_idx, abs_errors) # update priority 131 | return a_loss, cost 132 | 133 | else: 134 | # 加上terminal信息 135 | if self.transition_num == 5: 136 | bs, ba, br, bs_, bt = self.memory.sample() 137 | if self.transition_num == 4: 138 | bs, ba, br, bs_ = self.memory.sample() 139 | 140 | if output_loss_flag: 141 | _, a_loss = self.sess.run([self.atrain, self.a_loss], {self.S: bs, self.actor_lr: actor_lr_input}) 142 | _, c_loss = self.sess.run([self.ctrain, self.c_loss], 143 | {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 144 | return a_loss, c_loss 145 | else: 146 | self.sess.run(self.atrain, {self.S: bs, self.actor_lr: actor_lr_input}) 147 | self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_, self.critic_lr: critic_lr_input}) 148 | 149 | def _build_a(self, s, scope, trainable): 150 | with tf.variable_scope(scope): 151 | net = tf.layers.dense(s, 300, activation=tf.nn.relu, name='l1', trainable=trainable) 152 | new_actor_layer = tf.layers.dense(net, 200, activation=tf.nn.relu, name='new_actor_layer', trainable=trainable) 153 | a = tf.layers.dense(new_actor_layer, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable) 154 | return tf.multiply(a, self.a_bound, name='scaled_a') 155 | 156 | def _build_c(self, s, a, scope, trainable): 157 | with tf.variable_scope(scope): 158 | n_l1 = 400 159 | w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable) 160 | w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable) 161 | b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable) 162 | net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1) 163 | new_critic_layer = tf.layers.dense(net, 300, activation=tf.nn.relu, name='new_critic_layer', 164 | trainable=trainable) 165 | return tf.layers.dense(new_critic_layer, 1, trainable=trainable) # Q(s,a) 166 | 167 | def load_step_network(self, saver, load_path): 168 | checkpoint = tf.train.get_checkpoint_state(load_path) 169 | if checkpoint and checkpoint.model_checkpoint_path: 170 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 171 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 172 | self.learn_step = 
int(checkpoint.model_checkpoint_path.split('-')[-1]) 173 | else: 174 | print("Could not find old network weights") 175 | 176 | def save_step_network(self, time_step, saver, save_path): 177 | saver.save(self.sess, save_path + 'network', global_step=time_step, 178 | write_meta_graph=False) 179 | 180 | def load_simple_network(self, path): 181 | saver = tf.train.Saver() 182 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 183 | print("restore model successful") 184 | 185 | def save_simple_network(self, save_path): 186 | saver = tf.train.Saver() 187 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 188 | -------------------------------------------------------------------------------- /ddpg_sp/DDPG_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from ddpg_sp import core 8 | from ddpg_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for TD3 agents. 14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | class DDPG: 43 | def __init__(self, 44 | a_dim, obs_dim, a_bound, 45 | mlp_actor_critic=core.mlp_actor_critic, 46 | ac_kwargs=dict(), seed=0, 47 | 48 | replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 50 | batch_size=100, 51 | # start_steps=10000, 52 | act_noise=0.1, target_noise=0.2, 53 | noise_clip=0.5, policy_delay=2, 54 | # max_ep_len=1000, 55 | # logger_kwargs=dict(), save_freq=1 56 | ): 57 | 58 | self.learn_step = 0 59 | 60 | self.obs_dim = obs_dim 61 | self.act_dim = a_dim 62 | self.act_limit = a_bound 63 | self.policy_delay = policy_delay 64 | self.action_noise = act_noise 65 | 66 | # Share information about action space with policy architecture 67 | ac_kwargs['action_space'] = a_bound 68 | 69 | # Inputs to computation graph 70 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 71 | 72 | # Main outputs from computation graph 73 | with tf.variable_scope('main'): 74 | self.pi, self.q, q_pi = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 75 | 76 | # Target networks 77 | with tf.variable_scope('target'): 78 | # Note that the action placeholder going to actor_critic here is 79 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 
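# In other words, q_pi_targ only feeds the Bellman backup formed a few lines below,
#     y = r + gamma * (1 - d) * Q_targ(s', pi_targ(s')),
# which is wrapped in tf.stop_gradient so the critic regresses toward a fixed target,
# while the target weights track the main weights via Polyak averaging:
#     theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main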
80 | pi_targ, _, q_pi_targ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 81 | 82 | # Experience buffer 83 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 84 | 85 | # Count variables 86 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 87 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) 88 | 89 | # Bellman backup for Q function 90 | backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * q_pi_targ) 91 | 92 | # DDPG losses 93 | self.pi_loss = -tf.reduce_mean(q_pi) 94 | self.q_loss = tf.reduce_mean((self.q - backup) ** 2) 95 | 96 | # Separate train ops for pi, q 97 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 98 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 99 | self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi')) 100 | self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q')) 101 | 102 | # Polyak averaging for target variables 103 | self.target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 104 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 105 | 106 | # Initializing targets to match main variables 107 | target_init = tf.group([tf.assign(v_targ, v_main) 108 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 109 | 110 | self.sess = tf.Session() 111 | self.sess.run(tf.global_variables_initializer()) 112 | self.sess.run(target_init) 113 | 114 | def get_action(self, s, noise_scale=0): 115 | if not noise_scale: 116 | noise_scale = self.action_noise 117 | a = self.sess.run(self.pi, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 118 | a += noise_scale * np.random.randn(self.act_dim) 119 | return np.clip(a, -self.act_limit, self.act_limit) 120 | 121 | def store_transition(self, transition): 122 | (s, a, r, s_, done) = transition 123 | self.replay_buffer.store(s, a, r, s_, done) 124 | 125 | def test_agent(self, env, max_ep_len=1000, n=5): 126 | ep_reward_list = [] 127 | for j in range(n): 128 | s = env.reset() 129 | ep_reward = 0 130 | for i in range(max_ep_len): 131 | # Take deterministic actions at test time (noise_scale=0) 132 | s, r, d, _ = env.step(self.get_action(s)) 133 | ep_reward += r 134 | ep_reward_list.append(ep_reward) 135 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 136 | return mean_ep_reward 137 | 138 | def learn(self, batch_size=100): 139 | 140 | batch = self.replay_buffer.sample_batch(batch_size) 141 | feed_dict = {self.x_ph: batch['obs1'], 142 | self.x2_ph: batch['obs2'], 143 | self.a_ph: batch['acts'], 144 | self.r_ph: batch['rews'], 145 | self.d_ph: batch['done'] 146 | } 147 | q_step_ops = [self.train_q_op] 148 | 149 | # Q-learning update 150 | outs = self.sess.run([self.q_loss, self.q, self.train_q_op], feed_dict) 151 | # Policy update 152 | outs = self.sess.run([self.pi_loss, self.train_pi_op, self.target_update], 153 | feed_dict) 154 | 155 | self.learn_step += 1 156 | 157 | def load_step_network(self, saver, load_path): 158 | checkpoint = tf.train.get_checkpoint_state(load_path) 159 | if checkpoint and checkpoint.model_checkpoint_path: 160 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 161 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 162 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 163 | else: 164 | print("Could not find old network weights") 165 | 166 | def save_step_network(self, time_step, saver, save_path): 167 | 
saver.save(self.sess, save_path + 'network', global_step=time_step, 168 | write_meta_graph=False) 169 | 170 | def load_simple_network(self, path): 171 | saver = tf.train.Saver() 172 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 173 | print("restore model successful") 174 | 175 | def save_simple_network(self, save_path): 176 | saver = tf.train.Saver() 177 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 178 | 179 | 180 | if __name__ == '__main__': 181 | import argparse 182 | 183 | random_seed = int(time.time() * 1000 % 1000) 184 | parser = argparse.ArgumentParser() 185 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 186 | parser.add_argument('--hid', type=int, default=300) 187 | parser.add_argument('--l', type=int, default=1) 188 | parser.add_argument('--gamma', type=float, default=0.99) 189 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 190 | parser.add_argument('--epochs', type=int, default=200) 191 | parser.add_argument('--max_steps', type=int, default=1000) 192 | parser.add_argument('--exp_name', type=str, default='ddpg_class') 193 | args = parser.parse_args() 194 | 195 | env = gym.make(args.env) 196 | env = env.unwrapped 197 | env.seed(args.seed) 198 | 199 | s_dim = env.observation_space.shape[0] 200 | a_dim = env.action_space.shape[0] 201 | a_bound = env.action_space.high[0] 202 | 203 | net = DDPG(a_dim, s_dim, a_bound, 204 | batch_size=100, 205 | ) 206 | ep_reward_list = [] 207 | test_ep_reward_list = [] 208 | 209 | for i in range(args.epochs): 210 | s = env.reset() 211 | ep_reward = 0 212 | for j in range(args.max_steps): 213 | 214 | # Add exploration noise 215 | if i < 10: 216 | a = np.random.rand(a_dim) * a_bound 217 | else: 218 | # a = net.choose_action(s) 219 | a = net.get_action(s, 0.1) 220 | # a = noise.add_noise(a) 221 | 222 | a = np.clip(a, -a_bound, a_bound) 223 | 224 | s_, r, done, info = env.step(a) 225 | done = False if j == args.max_steps - 1 else done 226 | 227 | net.store_transition((s, a, r, s_, done)) 228 | 229 | s = s_ 230 | ep_reward += r 231 | if j == args.max_steps - 1: 232 | 233 | for _ in range(args.max_steps): 234 | net.learn() 235 | 236 | ep_reward_list.append(ep_reward) 237 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 238 | # 'Explore: %.2f' % var, 239 | "learn step:", net.learn_step) 240 | # if ep_reward > -300:RENDER = True 241 | 242 | # 增加测试部分! 
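# Note on the warm-up exploration earlier in this loop: np.random.rand() draws from
# [0, 1), so `np.random.rand(a_dim) * a_bound` only covers the positive half of the
# action range. A minimal sketch that explores the full (assumed symmetric) range:
#     a = np.random.uniform(-a_bound, a_bound, size=a_dim)
# (equivalently, `env.action_space.sample()` for a gym Box space).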
243 | if i % 20 == 0: 244 | test_ep_reward = net.test_agent(env=env, n=5) 245 | test_ep_reward_list.append(test_ep_reward) 246 | print("-" * 20) 247 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 248 | 'Test Reward: %i' % int(test_ep_reward), 249 | ) 250 | print("-" * 20) 251 | 252 | break 253 | 254 | import matplotlib.pyplot as plt 255 | 256 | plt.plot(ep_reward_list) 257 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 258 | str(args.epochs) + 259 | "_seed" + str(args.seed)) 260 | plt.title(img_name+"_train") 261 | plt.savefig(img_name+".png") 262 | plt.show() 263 | plt.close() 264 | 265 | plt.plot(test_ep_reward_list) 266 | plt.title(img_name + "_test") 267 | plt.savefig(img_name + ".png") 268 | plt.show() 269 | -------------------------------------------------------------------------------- /ddpg_sp/DDPG_per_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | try: 8 | from rl_algorithms.ddpg_sp import core 9 | from rl_algorithms.ddpg_sp.core import get_vars 10 | except: 11 | from ddpg_sp import core 12 | from ddpg_sp.core import get_vars 13 | 14 | 15 | class ReplayBuffer: 16 | """ 17 | A simple FIFO experience replay buffer for TD3 agents. 18 | """ 19 | 20 | def __init__(self, obs_dim, act_dim, size): 21 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 22 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 24 | self.rews_buf = np.zeros(size, dtype=np.float32) 25 | self.done_buf = np.zeros(size, dtype=np.float32) 26 | self.ptr, self.size, self.max_size = 0, 0, size 27 | 28 | def store(self, obs, act, rew, next_obs, done): 29 | self.obs1_buf[self.ptr] = obs 30 | self.obs2_buf[self.ptr] = next_obs 31 | self.acts_buf[self.ptr] = act 32 | self.rews_buf[self.ptr] = rew 33 | self.done_buf[self.ptr] = done 34 | self.ptr = (self.ptr + 1) % self.max_size 35 | self.size = min(self.size + 1, self.max_size) 36 | 37 | def sample_batch(self, batch_size=32): 38 | idxs = np.random.randint(0, self.size, size=batch_size) 39 | return dict(obs1=self.obs1_buf[idxs], 40 | obs2=self.obs2_buf[idxs], 41 | acts=self.acts_buf[idxs], 42 | rews=self.rews_buf[idxs], 43 | done=self.done_buf[idxs]) 44 | 45 | 46 | class DDPG: 47 | def __init__(self, 48 | a_dim, obs_dim, a_bound, 49 | mlp_actor_critic=core.mlp_actor_critic, 50 | ac_kwargs=dict(), seed=0, 51 | 52 | replay_size=int(1e6), gamma=0.99, 53 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 54 | batch_size=100, 55 | act_noise=0.1, target_noise=0.2, 56 | noise_clip=0.5, policy_delay=2, 57 | sess_opt=None, 58 | per_flag=True, 59 | ): 60 | self.per_flag = per_flag 61 | self.learn_step = 0 62 | 63 | self.obs_dim = obs_dim 64 | self.act_dim = a_dim 65 | self.act_limit = a_bound 66 | self.policy_delay = policy_delay 67 | self.action_noise = act_noise 68 | 69 | # Share information about action space with policy architecture 70 | ac_kwargs['action_space'] = a_bound 71 | 72 | # Inputs to computation graph 73 | self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') 74 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 75 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 76 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 77 | 78 | # Main outputs from computation graph 79 | with 
tf.variable_scope('main'): 80 | self.pi, self.q, q_pi = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 81 | 82 | # Target networks 83 | with tf.variable_scope('target'): 84 | # Note that the action placeholder going to actor_critic here is 85 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 86 | pi_targ, _, q_pi_targ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 87 | 88 | # Experience buffer 89 | if self.per_flag: 90 | from memory.sp_per_memory import ReplayBuffer 91 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 92 | 93 | # Count variables 94 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 95 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) 96 | 97 | # Bellman backup for Q function 98 | backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * q_pi_targ) 99 | 100 | # DDPG losses 101 | self.pi_loss = -tf.reduce_mean(q_pi) 102 | 103 | if self.per_flag: 104 | # q_target - q 105 | self.abs_errors = tf.abs(backup - self.q) 106 | self.q_loss = self.ISWeights * tf.reduce_mean((self.q - backup) ** 2) 107 | else: 108 | # 正常的! 109 | self.q_loss = tf.reduce_mean((self.q - backup) ** 2) 110 | 111 | # Separate train ops for pi, q 112 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr) 113 | q_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr) 114 | self.train_pi_op = pi_optimizer.minimize(self.pi_loss, var_list=get_vars('main/pi')) 115 | self.train_q_op = q_optimizer.minimize(self.q_loss, var_list=get_vars('main/q')) 116 | 117 | # Polyak averaging for target variables 118 | self.target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 119 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 120 | 121 | # Initializing targets to match main variables 122 | target_init = tf.group([tf.assign(v_targ, v_main) 123 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 124 | 125 | if sess_opt: 126 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=sess_opt) 127 | self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 128 | else: 129 | self.sess = tf.Session() 130 | self.sess.run(tf.global_variables_initializer()) 131 | self.sess.run(target_init) 132 | 133 | def get_action(self, s, noise_scale=0): 134 | if not noise_scale: 135 | noise_scale = self.action_noise 136 | a = self.sess.run(self.pi, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 137 | a += noise_scale * np.random.randn(self.act_dim) 138 | return np.clip(a, -self.act_limit, self.act_limit) 139 | 140 | def store_transition(self, transition): 141 | if self.per_flag: 142 | self.replay_buffer.store(transition) 143 | else: 144 | (s, a, r, s_, done) = transition 145 | self.replay_buffer.store(s, a, r, s_, done) 146 | 147 | def test_agent(self, env, max_ep_len=1000, n=5): 148 | ep_reward_list = [] 149 | for j in range(n): 150 | s = env.reset() 151 | ep_reward = 0 152 | for i in range(max_ep_len): 153 | # Take deterministic actions at test time (noise_scale=0) 154 | s, r, d, _ = env.step(self.get_action(s)) 155 | ep_reward += r 156 | ep_reward_list.append(ep_reward) 157 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 158 | return mean_ep_reward 159 | 160 | def learn(self, batch_size=100, actor_lr_input=0.001, 161 | critic_lr_input=0.001,): 162 | if self.per_flag: 163 | tree_idx, batch_memory, ISWeights = self.replay_buffer.sample(batch_size=batch_size) 164 | batch_states, batch_actions, 
batch_rewards, batch_states_, batch_dones = [], [], [], [], [] 165 | for i in range(batch_size): 166 | batch_states.append(batch_memory[i][0]) 167 | batch_actions.append(batch_memory[i][1]) 168 | batch_rewards.append(batch_memory[i][2]) 169 | batch_states_.append(batch_memory[i][3]) 170 | batch_dones.append(batch_memory[i][4]) 171 | 172 | feed_dict = {self.x_ph: np.array(batch_states), 173 | self.x2_ph: np.array(batch_states_), 174 | self.a_ph: np.array(batch_actions), 175 | self.r_ph: np.array(batch_rewards), 176 | self.d_ph: np.array(batch_dones), 177 | self.actor_lr: actor_lr_input, 178 | self.critic_lr: critic_lr_input, 179 | self.ISWeights: ISWeights 180 | } 181 | q_step_ops = [self.q_loss, self.q, 182 | self.train_q_op, 183 | self.abs_errors, 184 | ] 185 | outs = self.sess.run(q_step_ops, feed_dict) 186 | q_loss, q, train_q_op, abs_errors = outs 187 | if self.learn_step % self.policy_delay == 0: 188 | # Delayed policy update 189 | outs = self.sess.run([self.pi_loss, 190 | self.train_pi_op, 191 | self.target_update], 192 | feed_dict) 193 | 194 | self.replay_buffer.batch_update(tree_idx, 195 | abs_errors) # update priority 196 | self.learn_step += 1 197 | return outs 198 | else: 199 | batch = self.replay_buffer.sample_batch(batch_size) 200 | feed_dict = {self.x_ph: batch['obs1'], 201 | self.x2_ph: batch['obs2'], 202 | self.a_ph: batch['acts'], 203 | self.r_ph: batch['rews'], 204 | self.d_ph: batch['done'], 205 | self.actor_lr: actor_lr_input, 206 | self.critic_lr: critic_lr_input, 207 | } 208 | q_step_ops = [self.train_q_op] 209 | 210 | # Q-learning update 211 | outs = self.sess.run([self.q_loss, self.q, self.train_q_op], feed_dict) 212 | # Policy update 213 | outs = self.sess.run([self.pi_loss, self.train_pi_op, self.target_update], 214 | feed_dict) 215 | 216 | self.learn_step += 1 217 | 218 | def load_step_network(self, saver, load_path): 219 | checkpoint = tf.train.get_checkpoint_state(load_path) 220 | if checkpoint and checkpoint.model_checkpoint_path: 221 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 222 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 223 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 224 | else: 225 | print("Could not find old network weights") 226 | 227 | def save_step_network(self, time_step, saver, save_path): 228 | saver.save(self.sess, save_path + 'network', global_step=time_step, 229 | write_meta_graph=False) 230 | 231 | def load_simple_network(self, path): 232 | saver = tf.train.Saver() 233 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 234 | print("restore model successful") 235 | 236 | def save_simple_network(self, save_path): 237 | saver = tf.train.Saver() 238 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 239 | 240 | 241 | if __name__ == '__main__': 242 | import argparse 243 | 244 | random_seed = int(time.time() * 1000 % 1000) 245 | random_seed = 184 246 | parser = argparse.ArgumentParser() 247 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 248 | parser.add_argument('--hid', type=int, default=300) 249 | parser.add_argument('--l', type=int, default=1) 250 | parser.add_argument('--gamma', type=float, default=0.99) 251 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 252 | parser.add_argument('--epochs', type=int, default=3000) 253 | parser.add_argument('--max_steps', type=int, default=1000) 254 | parser.add_argument('--exp_name', type=str, default='ddpg_per_class') 255 | args = parser.parse_args() 256 | 
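# Recap of the prioritized replay used by DDPG.learn() above, following the Memory/SumTree
# code shown earlier in ddpg-movan/DDPG_per.py (memory/sp_per_memory.py is assumed to use
# the same scheme): new transitions enter the sum-tree with the current maximum priority;
# after each update, the priority of a sampled transition is reset from its TD error,
#     p_i = (|delta_i| + eps)^alpha,        P(i) = p_i / sum_k p_k,
# transitions are drawn proportionally to P(i), and reweighted by importance weights
#     w_i = (P(i) / min_j P(j))^(-beta),    with beta annealed toward 1,
# so that, in the standard PER formulation, the critic loss becomes the weighted mean
#     L_Q = mean_i[ w_i * (Q(s_i, a_i) - y_i)^2 ],
# and the fresh |delta_i| values are written back via batch_update(tree_idx, abs_errors).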
257 | env = gym.make(args.env) 258 | env = env.unwrapped 259 | env.seed(args.seed) 260 | 261 | s_dim = env.observation_space.shape[0] 262 | a_dim = env.action_space.shape[0] 263 | a_bound = env.action_space.high[0] 264 | 265 | net = DDPG(a_dim, s_dim, a_bound, 266 | batch_size=100, 267 | sess_opt=0.1 268 | ) 269 | ep_reward_list = [] 270 | test_ep_reward_list = [] 271 | 272 | for i in range(args.epochs): 273 | s = env.reset() 274 | ep_reward = 0 275 | st = time.time() 276 | for j in range(args.max_steps): 277 | 278 | # Add exploration noise 279 | if i < 10: 280 | a = np.random.rand(a_dim) * a_bound 281 | else: 282 | # a = net.choose_action(s) 283 | a = net.get_action(s, 0.1) 284 | # a = noise.add_noise(a) 285 | 286 | a = np.clip(a, -a_bound, a_bound) 287 | 288 | s_, r, done, info = env.step(a) 289 | done = False if j == args.max_steps - 1 else done 290 | 291 | net.store_transition((s, a, r, s_, done)) 292 | 293 | s = s_ 294 | ep_reward += r 295 | if j == args.max_steps - 1: 296 | ep_update_time = time.time() 297 | for _ in range(args.max_steps): 298 | net.learn() 299 | ep_update_time = time.time() - ep_update_time 300 | ep_reward_list.append(ep_reward) 301 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 302 | # 'Explore: %.2f' % var, 303 | "learn step:", net.learn_step, 304 | "ep_time:", np.round(time.time()-st, 3), 305 | "up_time:", np.round(ep_update_time, 3), 306 | ) 307 | # if ep_reward > -300:RENDER = True 308 | 309 | # 增加测试部分! 310 | if i % 20 == 0: 311 | test_ep_reward = net.test_agent(env=env, n=5) 312 | test_ep_reward_list.append(test_ep_reward) 313 | print("-" * 20) 314 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 315 | 'Test Reward: %i' % int(test_ep_reward), 316 | ) 317 | print("-" * 20) 318 | 319 | break 320 | 321 | import matplotlib.pyplot as plt 322 | 323 | plt.plot(ep_reward_list) 324 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 325 | str(args.epochs) + 326 | "_seed" + str(args.seed)) 327 | plt.title(img_name + "_train") 328 | plt.savefig(img_name + ".png") 329 | plt.show() 330 | plt.close() 331 | 332 | plt.plot(test_ep_reward_list) 333 | plt.title(img_name + "_test") 334 | plt.savefig(img_name + ".png") 335 | plt.show() -------------------------------------------------------------------------------- /ddpg_sp/DDPG_sp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from ddpg_sp import core 8 | from ddpg_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for DDPG agents. 
14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | def ddpg(env_fn, actor_critic=core.mlp_actor_critic, 43 | ac_kwargs=dict(), seed=0, 44 | steps_per_epoch=5000, epochs=100, 45 | replay_size=int(1e6), gamma=0.99, 46 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 47 | batch_size=100, start_steps=10000, 48 | act_noise=0.1, max_ep_len=1000, 49 | logger_kwargs=dict(), save_freq=1): 50 | 51 | env, test_env = env_fn(), env_fn() 52 | obs_dim = env.observation_space.shape[0] 53 | act_dim = env.action_space.shape[0] 54 | 55 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 56 | act_limit = env.action_space.high[0] 57 | 58 | # Share information about action space with policy architecture 59 | ac_kwargs['action_space'] = act_limit 60 | 61 | # Inputs to computation graph 62 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 63 | 64 | # Main outputs from computation graph 65 | with tf.variable_scope('main'): 66 | pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) 67 | 68 | # Target networks 69 | with tf.variable_scope('target'): 70 | # Note that the action placeholder going to actor_critic here is 71 | # irrelevant, because we only need q_targ(s, pi_targ(s)). 
72 | pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 73 | 74 | # Experience buffer 75 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 76 | 77 | # Count variables 78 | var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) 79 | print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) 80 | 81 | # Bellman backup for Q function 82 | backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) 83 | 84 | # DDPG losses 85 | pi_loss = -tf.reduce_mean(q_pi) 86 | q_loss = tf.reduce_mean((q - backup) ** 2) 87 | 88 | # Separate train ops for pi, q 89 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 90 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 91 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 92 | train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) 93 | 94 | # Polyak averaging for target variables 95 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 96 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 97 | 98 | # Initializing targets to match main variables 99 | target_init = tf.group([tf.assign(v_targ, v_main) 100 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 101 | 102 | sess = tf.Session() 103 | sess.run(tf.global_variables_initializer()) 104 | sess.run(target_init) 105 | 106 | def get_action(o, noise_scale): 107 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] 108 | a += noise_scale * np.random.randn(act_dim) 109 | return np.clip(a, -act_limit, act_limit) 110 | 111 | def test_agent(n=10): 112 | for j in range(n): 113 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 114 | while not (d or (ep_len == max_ep_len)): 115 | # Take deterministic actions at test time (noise_scale=0) 116 | o, r, d, _ = test_env.step(get_action(o, 0)) 117 | ep_ret += r 118 | ep_len += 1 119 | 120 | start_time = time.time() 121 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 122 | total_steps = steps_per_epoch * epochs 123 | 124 | ep_ret_list = [] 125 | episode = 0 126 | 127 | # Main loop: collect experience in env and update/log each epoch 128 | for t in range(total_steps): 129 | 130 | """ 131 | Until start_steps have elapsed, randomly sample actions 132 | from a uniform distribution for better exploration. Afterwards, 133 | use the learned policy (with some noise, via act_noise). 134 | """ 135 | if t > start_steps: 136 | a = get_action(o, act_noise) 137 | else: 138 | a = env.action_space.sample() 139 | 140 | # Step the env 141 | o2, r, d, _ = env.step(a) 142 | ep_ret += r 143 | ep_len += 1 144 | 145 | # Ignore the "done" signal if it comes from hitting the time 146 | # horizon (that is, when it's an artificial terminal signal 147 | # that isn't based on the agent's state) 148 | d = False if ep_len == max_ep_len else d 149 | 150 | # Store experience to replay buffer 151 | replay_buffer.store(o, a, r, o2, d) 152 | 153 | # Super critical, easy to overlook step: make sure to update 154 | # most recent observation! 155 | o = o2 156 | 157 | if d or (ep_len == max_ep_len): 158 | """ 159 | Perform all DDPG updates at the end of the trajectory, 160 | in accordance with tuning done by TD3 paper authors. 
161 | """ 162 | episode += 1 163 | ep_ret_list.append(ep_ret) 164 | epoch = t // steps_per_epoch 165 | print("Epoch:", epoch) 166 | print("Episode:", episode) 167 | print("Training Step:", t) 168 | print("Episode Reward:", ep_ret) 169 | 170 | for _ in range(ep_len): 171 | batch = replay_buffer.sample_batch(batch_size) 172 | feed_dict = {x_ph: batch['obs1'], 173 | x2_ph: batch['obs2'], 174 | a_ph: batch['acts'], 175 | r_ph: batch['rews'], 176 | d_ph: batch['done'] 177 | } 178 | 179 | # Q-learning update 180 | outs = sess.run([q_loss, q, train_q_op], feed_dict) 181 | 182 | # Policy update 183 | outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) 184 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 185 | 186 | # End of epoch wrap-up 187 | if t > 0 and t % steps_per_epoch == 0: 188 | test_agent() 189 | 190 | import matplotlib.pyplot as plt 191 | plt.plot(ep_ret_list) 192 | plt.show() 193 | 194 | 195 | if __name__ == '__main__': 196 | import argparse 197 | 198 | parser = argparse.ArgumentParser() 199 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 200 | parser.add_argument('--hid', type=int, default=300) 201 | parser.add_argument('--l', type=int, default=1) 202 | parser.add_argument('--gamma', type=float, default=0.99) 203 | parser.add_argument('--seed', '-s', type=int, default=0) 204 | parser.add_argument('--epochs', type=int, default=600) 205 | parser.add_argument('--exp_name', type=str, default='ddpg') 206 | args = parser.parse_args() 207 | 208 | ddpg(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 209 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 210 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 211 | ) 212 | -------------------------------------------------------------------------------- /ddpg_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/ddpg_sp/__init__.py -------------------------------------------------------------------------------- /ddpg_sp/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | 13 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 14 | for h in hidden_sizes[:-1]: 15 | x = tf.layers.dense(x, units=h, activation=activation) 16 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 17 | 18 | 19 | def get_vars(scope): 20 | return [x for x in tf.global_variables() if scope in x.name] 21 | 22 | 23 | def count_vars(scope): 24 | v = get_vars(scope) 25 | return sum([np.prod(var.shape.as_list()) for var in v]) 26 | 27 | 28 | """ 29 | Actor-Critics 30 | """ 31 | 32 | 33 | def mlp_actor_critic(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu, 34 | output_activation=tf.tanh, action_space=None): 35 | act_dim = a.shape.as_list()[-1] 36 | act_limit = action_space 37 | with tf.variable_scope('pi'): 38 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 39 | with tf.variable_scope('q'): 40 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 41 | with tf.variable_scope('q', reuse=True): 42 | q_pi = tf.squeeze(mlp(tf.concat([x, pi], 
axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 43 | return pi, q, q_pi 44 | -------------------------------------------------------------------------------- /ddpg_sp/ddpg_class_HalfCheetah-v2_epochs200_seed553.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/ddpg_sp/ddpg_class_HalfCheetah-v2_epochs200_seed553.png -------------------------------------------------------------------------------- /ddpg_sp/ddpg_class_HalfCheetah-v2_epochs3000_seed485.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/ddpg_sp/ddpg_class_HalfCheetah-v2_epochs3000_seed485.png -------------------------------------------------------------------------------- /memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/memory/__init__.py -------------------------------------------------------------------------------- /memory/per_memory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/memory/per_memory.py -------------------------------------------------------------------------------- /memory/simple_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Memory: 5 | def __init__(self, memory_size, batch_size, transition_num): 6 | self.memory_list = [] 7 | self.memory_size = memory_size 8 | self.batch_size = batch_size 9 | self.transition_num = transition_num 10 | 11 | def store(self, transition): 12 | if self.memory_num >= self.memory_size: 13 | del self.memory_list[0] 14 | if len(transition) == 5: 15 | s, a, r, s_, t = transition 16 | self.memory_list.append([s, a, r, s_, t]) 17 | if len(transition) == 4: 18 | s, a, r, s_ = transition 19 | self.memory_list.append([s, a, r, s_]) 20 | 21 | def sample(self): 22 | assert self.memory_num >= self.batch_size 23 | if self.memory_num < self.memory_size: 24 | indices = np.random.choice(self.memory_num, size=self.batch_size) 25 | else: 26 | indices = np.random.choice(self.memory_size, self.batch_size) 27 | batch_states, batch_actions, batch_rewards, batch_states_, batch_terminal = [], [], [], [], [] 28 | for i in indices: 29 | batch_states.append(self.memory_list[i][0]) 30 | batch_actions.append(self.memory_list[i][1]) 31 | batch_rewards.append(self.memory_list[i][2]) 32 | batch_states_.append(self.memory_list[i][3]) 33 | if self.transition_num == 5: 34 | batch_terminal.append(self.memory_list[i][4]) 35 | 36 | batch_states = np.array(batch_states) 37 | batch_actions = np.array(batch_actions) 38 | batch_rewards = np.array(batch_rewards) 39 | batch_states_ = np.array(batch_states_) 40 | batch_rewards = batch_rewards[:, np.newaxis] 41 | if self.transition_num==5: 42 | batch_terminal = np.array(batch_terminal) 43 | batch_terminal = batch_terminal[:, np.newaxis] 44 | return batch_states, batch_actions, batch_rewards, batch_states_, batch_terminal 45 | if self.transition_num == 4: 46 | return batch_states, batch_actions, batch_rewards, batch_states_ 47 | 48 | @property 49 | def memory_num(self): 50 | return len(self.memory_list) 51 | 52 | 53 | 
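To make the list-based buffer above easier to picture, here is a minimal usage sketch of the Memory class just defined. The state/action sizes, the number of random transitions, and the assumption that the repository root is on sys.path are illustrative only, not part of the repository:

import numpy as np
from memory.simple_memory import Memory   # assumes the repo root is importable

s_dim, a_dim = 3, 1                        # hypothetical state/action sizes
buf = Memory(memory_size=1000, batch_size=32, transition_num=5)

for _ in range(100):                       # fill with random 5-tuples (s, a, r, s_, done)
    s, s_ = np.random.randn(s_dim), np.random.randn(s_dim)
    a = np.random.randn(a_dim)
    buf.store((s, a, np.random.randn(), s_, False))

bs, ba, br, bs_, bt = buf.sample()         # rewards and terminals come back as (batch, 1) columns
print(bs.shape, ba.shape, br.shape, bs_.shape, bt.shape)   # (32, 3) (32, 1) (32, 1) (32, 3) (32, 1)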
-------------------------------------------------------------------------------- /memory/sp_per_memory.py: -------------------------------------------------------------------------------- 1 | """ 2 | 和per_memory.py相比,per_memory.py是配合莫烦的DDPG算法, 3 | 而sp_per_memory.py是配合spinningup系列打包的强化类写的。 4 | 目前只单独适配了TD3_per_class.py,SAC的还没有适配,sp的DDPG也没适配。 5 | """ 6 | 7 | import numpy as np 8 | 9 | 10 | class SumTree(object): 11 | """ 12 | This SumTree code is a modified version and the original code is from: 13 | https://github.com/jaara/AI-blog/blob/master/SumTree.py 14 | Story data with its priority in the tree. 15 | """ 16 | data_pointer = 0 17 | 18 | def __init__(self, capacity): 19 | self.capacity = capacity # for all priority values 20 | self.tree = np.zeros(2 * capacity - 1) 21 | # [--------------Parent nodes-------------][-------leaves to recode priority-------] 22 | # size: capacity - 1 size: capacity 23 | self.data = list(np.zeros(capacity, dtype=object)) # for all transitions 24 | # [--------------data frame-------------] 25 | # size: capacity 26 | 27 | def add(self, p, transition): 28 | tree_idx = self.data_pointer + self.capacity - 1 29 | self.data[self.data_pointer] = transition # update data_frame 30 | self.update(tree_idx, p) # update tree_frame 31 | 32 | self.data_pointer += 1 33 | if self.data_pointer >= self.capacity: # replace when exceed the capacity 34 | self.data_pointer = 0 35 | 36 | def update(self, tree_idx, p): 37 | change = p - self.tree[tree_idx] 38 | self.tree[tree_idx] = p 39 | # then propagate the change through tree 40 | while tree_idx != 0: # this method is faster than the recursive loop in the reference code 41 | tree_idx = (tree_idx - 1) // 2 42 | self.tree[tree_idx] += change 43 | 44 | def get_leaf(self, v): 45 | """ 46 | Tree structure and array storage: 47 | 48 | Tree index: 49 | 0 -> storing priority sum 50 | / \ 51 | 1 2 52 | / \ / \ 53 | 3 4 5 6 -> storing priority for transitions 54 | 55 | Array type for storing: 56 | [0,1,2,3,4,5,6] 57 | """ 58 | parent_idx = 0 59 | while True: # the while loop is faster than the method in the reference code 60 | cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 61 | cr_idx = cl_idx + 1 62 | if cl_idx >= len(self.tree): # reach bottom, end search 63 | leaf_idx = parent_idx 64 | break 65 | else: # downward search, always search for a higher priority node 66 | if v <= self.tree[cl_idx]: 67 | parent_idx = cl_idx 68 | else: 69 | v -= self.tree[cl_idx] 70 | parent_idx = cr_idx 71 | 72 | data_idx = leaf_idx - self.capacity + 1 73 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 74 | 75 | @property 76 | def total_p(self): 77 | return self.tree[0] # the root 78 | 79 | 80 | class ReplayBuffer(object): # stored as ( s, a, r, s_ ) in SumTree 81 | """ 82 | This Memory class is modified based on the original code from: 83 | https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 84 | 这些可以改的,但是目前我也没时间调参了,就凑活用吧 85 | """ 86 | epsilon = 0.01 # small amount to avoid zero priority 87 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 88 | beta = 0.4 # importance-sampling, from initial value increasing to 1 89 | beta_increment_per_sampling = 0.001 90 | abs_err_upper = 1. 
# clipped abs error 91 | 92 | def __init__(self, 93 | obs_dim=32, 94 | act_dim=3, 95 | size=int(1e6) 96 | ): 97 | self.tree = SumTree(size) 98 | self.full_flag = False 99 | self.memory_num = 0 100 | self.memory_size = size 101 | 102 | def store(self, transition): 103 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 104 | if max_p == 0: 105 | max_p = self.abs_err_upper 106 | self.tree.add(max_p, transition) # set the max p for new p 107 | if self.memory_num < self.memory_size: 108 | self.memory_num += 1 109 | 110 | def sample(self, batch_size=32): 111 | n = batch_size 112 | # n就是batch size! 113 | # np.empty()这是一个随机初始化的一个矩阵! 114 | b_idx, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1)) 115 | b_memory = [] 116 | pri_seg = self.tree.total_p / n # priority segment 117 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1 118 | 119 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight 120 | if min_prob == 0: 121 | min_prob = 0.00001 122 | for i in range(n): 123 | a, b = pri_seg * i, pri_seg * (i + 1) 124 | v = np.random.uniform(a, b) 125 | idx, p, data = self.tree.get_leaf(v) 126 | prob = p / self.tree.total_p 127 | ISWeights[i, 0] = np.power(prob/min_prob, -self.beta) 128 | b_idx[i] = idx 129 | b_memory.append(data) 130 | return b_idx, b_memory, ISWeights 131 | 132 | def batch_update(self, tree_idx, abs_errors): 133 | abs_errors += self.epsilon # convert to abs and avoid 0 134 | clipped_errors = np.minimum(abs_errors, self.abs_err_upper) 135 | ps = np.power(clipped_errors, self.alpha) 136 | for ti, p in zip(tree_idx, ps): 137 | self.tree.update(ti, p) 138 | -------------------------------------------------------------------------------- /noise/__init__.py: -------------------------------------------------------------------------------- 1 | from .ou_noise import OU_noise 2 | from .simple_noise import Simple_noise -------------------------------------------------------------------------------- /noise/ou_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class OU_noise(object): 5 | def __init__(self, num_actions, action_low_bound, action_high_bound, dt, 6 | mu=0.0, theta=0.15, max_sigma=2.0, min_sigma=0.1): 7 | self.mu = mu # 0.0 8 | self.theta = theta # 0.15 9 | self.sigma = max_sigma # 0.3 10 | self.max_sigma = max_sigma # 0.3 11 | self.min_sigma = min_sigma # 0.1 12 | self.dt = dt # 0.001 13 | self.num_actions = num_actions # 1 14 | self.action_low = action_low_bound # -2 15 | self.action_high = action_high_bound # 2 16 | self.reset() 17 | 18 | def reset(self): 19 | self.state = np.zeros(self.num_actions) 20 | 21 | # self.state = np.zeros(self.num_actions) 22 | def state_update(self): 23 | x = self.state 24 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.num_actions) # np.random.randn()生成0,1的随机数 25 | self.state = x + dx 26 | 27 | def add_noise(self, action): 28 | self.state_update() 29 | state = self.state 30 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, self.dt) 31 | return np.clip(action + state, self.action_low, self.action_high) 32 | -------------------------------------------------------------------------------- /noise/simple_noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Simple_noise(object): 5 | def __init__(self, num_actions, action_low_bound, action_high_bound, 6 | dt=0.0001, 
7 | mu=0.0, theta=0.15, max_sigma=2.0, min_sigma=0.1):
8 | self.mu = mu # 0.0
9 | self.theta = theta # 0.15
10 | self.sigma = max_sigma # 0.3
11 | self.max_sigma = max_sigma # 0.3
12 | self.min_sigma = min_sigma # 0.1
13 | self.dt = dt # 0.001
14 | self.num_actions = num_actions # 1
15 | self.action_low = action_low_bound # -2
16 | self.action_high = action_high_bound # 2
17 |
18 | def add_noise(self, action):
19 | action += self.max_sigma * np.random.randn(self.num_actions)
20 | return np.clip(action, self.action_low, self.action_high)
21 |
-------------------------------------------------------------------------------- /readme.md: --------------------------------------------------------------------------------
1 | This repository is basically no longer maintained; please move to the newer library:
2 | https://github.com/kaixindelele/DRLib
3 |
4 |
5 | File naming examples:
6 |
7 | sac_sp.py: files with the _sp suffix follow the spinup packaging style, i.e. the RL algorithm is wrapped as a plain function;
8 |
9 | sac_class.py: files with class in the name wrap the algorithm into a class, so it can be called directly;
10 |
11 | sac_auto_per_class: files with per are classes in which prioritized experience replay can be switched on or off; PER does not always help, so use it with care.
12 |
13 | As for sac_auto, also known as sac2 or adaptive SAC: the alpha hyperparameter is learned inside the network, which generally makes it easier to use than plain SAC.
14 |
15 |
16 | --
17 |
18 | 2020-12-09
19 |
20 | It turns out this is the project of mine that has collected the most stars.
21 |
22 | I just went through all the files and found that prioritized experience replay (PER) has not been factored out on its own; it is still bundled with DDPG, which makes it incompatible with TD3 and SAC.
23 |
24 | sac-auto has not been committed either.
25 |
26 | Hindsight experience replay (HER) is not implemented (I still have not found good hyperparameters for it, which is frustrating; I assumed HER would be a silver bullet, but it turned out to be underwhelming).
27 |
28 | Also, still working on tf1 at this point feels like backing the losing side at the very last moment.
29 |
30 | Hard to take.
31 |
32 | --
33 |
34 |
35 |
36 | # DRL-tensorflow
37 | My DRL library with tensorflow1.14
38 | core codes based on https://github.com/openai/spinningup
39 |
40 | My job is to wrap the algorithm functions into classes so that they are easy to call,
41 | while keeping the performance of the original code in gym environments.
42 |
43 | The library keeps growing: the three main mainstream off-policy deep RL algorithms have all been packaged successfully.
44 | **Everything currently runs in the simplest mode: just open the corresponding algo_class.py file and run it.**
45 |
46 | Result display and performance comparison are still lacking, because I have not yet fully digested spinning-up's logger class and cannot embed that functionality more conveniently.
47 | As for plotting, only a bare-bones matplotlib figure is available for now.
48 |
49 | I will add more features when I have time.
50 |
51 | ----
52 | The logger and plot features have now been added; the implementation lives in the sp_utils folder, lifted from the spinup code with minor modifications.
53 | You can try them out directly in the run_in_gym folder, which is very convenient.
54 | These two spinup utilities can also be pulled into your own packages; that saves a lot of effort compared with writing your own.
55 |
56 |
57 | Also, the three algorithms I wrapped may not be perfect: they seem fine when tested in gym, but they fail to converge in my robot environments.
58 | If anyone finds a bug, please let me know.
59 |
60 |
61 | ----
62 |
63 |
64 |
65 |
66 | If you run into any bugs while using the library, feel free to open an issue.
67 | If it helps you, a star would be much appreciated.
68 |
69 | I will see whether I can add an LSTM version later; I have already seen someone else's implementation and plan to integrate it into this package.
-------------------------------------------------------------------------------- /run_in_gym/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/run_in_gym/__init__.py
-------------------------------------------------------------------------------- /run_in_gym/launch_with_gym.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import gym
4 | import time
5 | import numpy as np
6 | import tensorflow as tf
7 | import gym
8 | import os
9 | import time
10 | import sys
11 |
12 | sys.path.append("../")
13 |
14 |
15 | def run(seed=184,
16 | algo='td3',
17 | per_flag=True,
18 | epochs=3000,
19 | gamma=0.99,
20 | RlNet=None,
21 | noise_size=0.1
22 | ):
23 | import argparse
24 | random_seed = seed
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument('--env', type=str, default='HalfCheetah-v2')
27 | parser.add_argument('--hid', type=int, default=300)
28 | parser.add_argument('--l', type=int, default=1)
29 | parser.add_argument('--gamma', type=float, default=gamma)
30
| parser.add_argument('--seed', '-s', type=int, default=random_seed) 31 | parser.add_argument('--epochs', type=int, default=epochs) 32 | parser.add_argument('--max_steps', type=int, default=1000) 33 | if per_flag: 34 | exp_name = algo+"_per" 35 | else: 36 | exp_name = algo 37 | parser.add_argument('--exp_name', type=str, default=exp_name) 38 | args = parser.parse_args() 39 | 40 | env = gym.make(args.env) 41 | env = env.unwrapped 42 | env.seed(args.seed) 43 | 44 | s_dim = env.observation_space.shape[0] 45 | a_dim = env.action_space.shape[0] 46 | a_bound = env.action_space.high[0] 47 | 48 | 49 | net = RlNet(a_dim, s_dim, a_bound, 50 | gamma=gamma, 51 | sess_opt=0.1, 52 | per_flag=per_flag 53 | ) 54 | ep_reward_list = [] 55 | test_ep_reward_list = [] 56 | 57 | for i in range(args.epochs): 58 | s = env.reset() 59 | ep_reward = 0 60 | st = time.time() 61 | for j in range(args.max_steps): 62 | 63 | # Add exploration noise 64 | if i < 10: 65 | a = np.random.rand(a_dim) * a_bound 66 | else: 67 | a = net.get_action(s, noise_size) 68 | 69 | a = np.clip(a, -a_bound, a_bound) 70 | 71 | s_, r, done, info = env.step(a) 72 | done = False if j == args.max_steps - 1 else done 73 | 74 | net.store_transition((s, a, r, s_, done)) 75 | 76 | s = s_ 77 | ep_reward += r 78 | if j == args.max_steps - 1: 79 | up_st = time.time() 80 | for _ in range(args.max_steps): 81 | net.learn() 82 | 83 | ep_update_time = time.time() - up_st 84 | 85 | ep_reward_list.append(ep_reward) 86 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 87 | # 'Explore: %.2f' % var, 88 | "learn step:", net.learn_step, 89 | "ep_time:", np.round(time.time()-st, 3), 90 | "up_time:", np.round(ep_update_time, 3), 91 | ) 92 | # if ep_reward > -300:RENDER = True 93 | 94 | # 增加测试部分! 95 | if i % 20 == 0: 96 | test_ep_reward = net.test_agent(env=env, n=5) 97 | test_ep_reward_list.append(test_ep_reward) 98 | print("-" * 20) 99 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 100 | 'Test Reward: %i' % int(test_ep_reward), 101 | ) 102 | print("-" * 20) 103 | 104 | break 105 | 106 | import matplotlib.pyplot as plt 107 | 108 | plt.plot(ep_reward_list) 109 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 110 | str(args.epochs) + 111 | "_seed" + str(args.seed)) 112 | plt.title(img_name + "_train") 113 | plt.savefig(img_name + ".png") 114 | plt.show() 115 | plt.close() 116 | 117 | plt.plot(test_ep_reward_list) 118 | plt.title(img_name + "_test") 119 | plt.savefig(img_name + ".png") 120 | plt.show() 121 | 122 | 123 | if __name__ == '__main__': 124 | algo_index = 1 125 | seed = 184 126 | per_flag = True 127 | 128 | rl_algo_list = ["DDPG", "SAC_AUTO", "TD3", "SAC"] 129 | import rl_algorithms 130 | try: 131 | net = eval("rl_algorithms."+rl_algo_list[algo_index]) 132 | except: 133 | pass 134 | 135 | run(seed=seed, 136 | algo=rl_algo_list[algo_index], 137 | per_flag=per_flag, 138 | epochs=3000, 139 | gamma=0.99, 140 | RlNet=net, 141 | noise_size=0.1) 142 | -------------------------------------------------------------------------------- /run_in_gym/run_gym_sac_class.py: -------------------------------------------------------------------------------- 1 | # 导入一些其他的必要包 2 | import numpy as np 3 | import time 4 | import argparse 5 | import os 6 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 8 | import tensorflow as tf 9 | import sys 10 | 11 | sys.path.append("../") 12 | # 选择强化算法 13 | from sac_sp.SAC_class import SAC 14 | 15 | # 导入log包! 
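# The EpochLogger imported next is used in three steps further down this script:
# logger.store(EpRet=...) / logger.store(TestEpRet=...) accumulate values during an
# epoch, logger.log_tabular(...) declares which statistics to report (with min/max
# where requested), and logger.dump_tabular() writes one row of results to the
# output_dir configured through setup_logger_kwargs().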
16 | from sp_utils.logx import EpochLogger 17 | from sp_utils.logx import setup_logger_kwargs 18 | 19 | # 选择环境 20 | import gym 21 | 22 | 23 | def test_agent(args, net, env, n=5, logger=None): 24 | ep_reward_list = [] 25 | for j in range(n): 26 | obs = env.reset() 27 | ep_reward = 0 28 | for i in range(args.max_steps): 29 | # Take deterministic actions at test time (noise_scale=0) 30 | s = obs 31 | 32 | a = net.get_action(s) 33 | obs, r, d, _ = env.step(a) 34 | 35 | ep_reward += r 36 | if logger: 37 | logger.store(TestEpRet=ep_reward) 38 | 39 | ep_reward_list.append(ep_reward) 40 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 41 | if logger: 42 | return mean_ep_reward, logger 43 | else: 44 | return mean_ep_reward 45 | 46 | 47 | def main(): 48 | 49 | # 确定随机种子 50 | random_seed = int(time.time() * 10000 % 10000) 51 | # 设置传参和默认值 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 54 | parser.add_argument('--batch_size', type=int, default=128) 55 | parser.add_argument('--noise_scale', type=float, default=0.1) 56 | parser.add_argument('--alpha', type=float, default=0.1) 57 | parser.add_argument('--gamma', type=float, default=0.99) 58 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 59 | # 默认的epochs=5000! 60 | parser.add_argument('--epochs', type=int, default=3000) 61 | parser.add_argument('--max_steps', type=int, default=200) 62 | # 实验名字需要改对应起来 63 | parser.add_argument('--exp_name', type=str, default='sac_') 64 | 65 | args = parser.parse_args() 66 | 67 | tf.reset_default_graph() 68 | 69 | # 实例化log函数! 70 | exp_name = 'sac_{}_alpha_{}_noise_{}'.format( 71 | args.env, 72 | args.alpha, 73 | args.noise, 74 | ) 75 | 76 | logger_kwargs = setup_logger_kwargs(exp_name=exp_name, 77 | seed=args.seed, 78 | output_dir="../sp_data_logs/") 79 | # 将字典传进去 80 | logger = EpochLogger(**logger_kwargs) 81 | 82 | print("locals():", locals()) 83 | logger.save_config(locals()) 84 | 85 | # 创建虚拟环境 86 | env = gym.make(args.env) 87 | # 设置环境的随机种子:robosuite可能没有 88 | # env.seed(args.seed) 89 | tf.set_random_seed(args.seed) 90 | np.random.seed(args.seed) 91 | 92 | obs = env.reset() 93 | perception_dim = env.observation_space.shape[0] 94 | 95 | # 确定state和action维度和动作上限 96 | s_dim = perception_dim 97 | a_dim = env.action_space.shape[0] 98 | a_bound = env.action_space.high[0] 99 | 100 | # 创建强化算法类,里面还有一些参数,需要看里面的代码 101 | # SAC主要调整alpha,从0.1到0.25,找到最佳的一组 102 | net = SAC(a_dim, s_dim, a_bound, 103 | alpha=args.alpha, 104 | batch_size=args.batch_size, 105 | ) 106 | 107 | # 设定保存的一些参数. 108 | ep_reward_list = [] 109 | test_ep_reward_list = [] 110 | start_time = time.time() 111 | # 主循环 112 | for i in range(args.epochs): 113 | # 环境的重置和一些变量的归零 114 | obs = env.reset() 115 | s = obs 116 | ep_reward = 0 117 | episode_time = time.time() 118 | for j in range(args.max_steps): 119 | # 选择动作 120 | # Add exploration noise 121 | a = net.get_action(s, args.noise_scale) 122 | 123 | a = np.clip(a, -a_bound, a_bound) 124 | 125 | obs, r, done, info = env.step(a) 126 | 127 | s_ = obs 128 | net.store_transition((s, a, r, s_, done)) 129 | 130 | s = s_ 131 | ep_reward += r 132 | if j == args.max_steps - 1: 133 | # 存episode reward.这里安心的存进去就好,到时候它会自己计算均值 134 | logger.store(EpRet=ep_reward) 135 | for _ in range(args.max_steps): 136 | net.learn() 137 | 138 | ep_reward_list.append(ep_reward) 139 | print('Episode:', i, ' Reward: %0.4f' % float(ep_reward), 140 | "learn step:", net.learn_step) 141 | 142 | # 增加测试部分! 
143 | if i % 20 == 0: 144 | test_ep_reward, logger = test_agent(args=args, 145 | net=net, 146 | env=env, 147 | n=5, 148 | logger=logger 149 | ) 150 | test_ep_reward_list.append(test_ep_reward) 151 | 152 | logger.log_tabular('Epoch', i) 153 | # 不用with_min_and_max的时候,就不会有AverageEpRet这个值~画图的时候会找不到~ 154 | # 每个test都打印一次,如果已经存过的就不用管了,没存过的,赋值就行 155 | logger.log_tabular('EpRet', with_min_and_max=True) 156 | logger.log_tabular('TestEpRet', with_min_and_max=True) 157 | logger.log_tabular('TotalEnvInteracts', i*args.max_steps+j) 158 | logger.log_tabular('TotalTime', time.time() - start_time) 159 | # logger.log_tabular('EpisopeTime', time.time() - episode_time) 160 | logger.dump_tabular() 161 | 162 | break 163 | 164 | 165 | if __name__ == '__main__': 166 | 167 | main() 168 | -------------------------------------------------------------------------------- /sac_auto/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sac_auto/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | def gaussian_likelihood(x, mu, log_std): 25 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 26 | return tf.reduce_sum(pre_sum, axis=1) 27 | 28 | def clip_but_pass_gradient(x, l=-1., u=1.): 29 | clip_up = tf.cast(x > u, tf.float32) 30 | clip_low = tf.cast(x < l, tf.float32) 31 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 32 | 33 | 34 | """ 35 | Policies 36 | """ 37 | 38 | LOG_STD_MAX = 2 39 | LOG_STD_MIN = -20 40 | 41 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 42 | act_dim = a.shape.as_list()[-1] 43 | net = mlp(x, list(hidden_sizes), activation, activation) 44 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 45 | 46 | """ 47 | Because algorithm maximizes trade-off of reward and entropy, 48 | entropy must be unique to state---and therefore log_stds need 49 | to be a neural network output instead of a shared-across-states 50 | learnable parameter vector. But for deep Relu and other nets, 51 | simply sticking an activationless dense layer at the end would 52 | be quite bad---at the beginning of training, a randomly initialized 53 | net could produce extremely large values for the log_stds, which 54 | would result in some actions being either entirely deterministic 55 | or too random to come back to earth. Either of these introduces 56 | numerical instability which could break the algorithm. To 57 | protect against that, we'll constrain the output range of the 58 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. 
This is 59 | slightly different from the trick used by the original authors of 60 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 61 | I prefer this approach because it allows gradient propagation 62 | through log_std where clipping wouldn't, but I don't know if 63 | it makes much of a difference. 64 | """ 65 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 66 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 67 | 68 | std = tf.exp(log_std) 69 | pi = mu + tf.random_normal(tf.shape(mu)) * std 70 | logp_pi = gaussian_likelihood(pi, mu, log_std) 71 | return mu, pi, logp_pi 72 | 73 | def apply_squashing_func(mu, pi, logp_pi): 74 | mu = tf.tanh(mu) 75 | pi = tf.tanh(pi) 76 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 77 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 78 | return mu, pi, logp_pi 79 | 80 | 81 | """ 82 | Actor-Critics 83 | """ 84 | def mlp_actor_critic(x, x2, a, 85 | hidden_sizes=(400,300), 86 | activation=tf.nn.relu, 87 | output_activation=None, 88 | policy=mlp_gaussian_policy, 89 | action_space=None): 90 | # policy 91 | with tf.variable_scope('pi'): 92 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 93 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 94 | with tf.variable_scope('pi', reuse=True): 95 | mu2, pi2, logp_pi2 = policy(x2, a, hidden_sizes, activation, output_activation) 96 | mu2, pi2, logp_pi2 = apply_squashing_func(mu2, pi2, logp_pi2) 97 | 98 | # make sure actions are in correct range 99 | action_scale = action_space 100 | mu *= action_scale 101 | pi *= action_scale 102 | 103 | # vfs 104 | # tf.squeeze( shape(?,1), axis=1 ) = shape(?,) 105 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 106 | with tf.variable_scope('q1'): 107 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 108 | with tf.variable_scope('q1', reuse=True): 109 | q1_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 110 | with tf.variable_scope('q2'): 111 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 112 | with tf.variable_scope('q2', reuse=True): 113 | q2_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 114 | 115 | return mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi -------------------------------------------------------------------------------- /sac_auto/sac_auto_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import os 5 | import time 6 | import sys 7 | 8 | sys.path.append("../") 9 | try: 10 | from rl_algorithms.sac_auto import core 11 | from rl_algorithms.sac_auto.core import get_vars 12 | except: 13 | from sac_auto import core 14 | from sac_auto.core import get_vars 15 | 16 | 17 | class ReplayBuffer: 18 | """ 19 | A simple FIFO experience replay buffer for TD3 agents. 
20 | """ 21 | 22 | def __init__(self, obs_dim, act_dim, size): 23 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 24 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 25 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 26 | self.rews_buf = np.zeros(size, dtype=np.float32) 27 | self.done_buf = np.zeros(size, dtype=np.float32) 28 | self.ptr, self.size, self.max_size = 0, 0, size 29 | 30 | def store(self, obs, act, rew, next_obs, done): 31 | self.obs1_buf[self.ptr] = obs 32 | self.obs2_buf[self.ptr] = next_obs 33 | self.acts_buf[self.ptr] = act 34 | self.rews_buf[self.ptr] = rew 35 | self.done_buf[self.ptr] = done 36 | self.ptr = (self.ptr + 1) % self.max_size 37 | self.size = min(self.size + 1, self.max_size) 38 | 39 | def sample_batch(self, batch_size=32): 40 | idxs = np.random.randint(0, self.size, size=batch_size) 41 | return dict(obs1=self.obs1_buf[idxs], 42 | obs2=self.obs2_buf[idxs], 43 | acts=self.acts_buf[idxs], 44 | rews=self.rews_buf[idxs], 45 | done=self.done_buf[idxs]) 46 | 47 | 48 | class SAC: 49 | def __init__(self, 50 | a_dim, obs_dim, a_bound, 51 | mlp_actor_critic=core.mlp_actor_critic, 52 | ac_kwargs=dict(), seed=0, 53 | replay_size=int(1e6), gamma=0.99, 54 | polyak=0.995, alpha="auto", 55 | # pi_lr=1e-4, q_lr=1e-4, 56 | # batch_size=100, 57 | # act_noise=0.1, target_noise=0.2, noise_clip=0.5, 58 | # policy_delay=2, 59 | sess_opt=0.1, 60 | ): 61 | 62 | self.learn_step = 0 63 | 64 | self.obs_dim = obs_dim 65 | self.act_dim = a_dim 66 | self.act_limit = a_bound 67 | self.policy_delay = policy_delay 68 | # self.action_noise = act_noise 69 | 70 | # Share information about action space with policy architecture 71 | ac_kwargs['action_space'] = a_bound 72 | 73 | # Inputs to computation graph 74 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 75 | self.actor_lr = tf.placeholder(tf.float32, shape=[], name='actor_lr') 76 | self.critic_lr = tf.placeholder(tf.float32, shape=[], name='critic_lr') 77 | 78 | # Main outputs from computation graph 79 | with tf.variable_scope('main'): 80 | self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, = mlp_actor_critic(self.x_ph, 81 | self.x2_ph, 82 | self.a_ph, 83 | **ac_kwargs) 84 | 85 | # Target value network 86 | with tf.variable_scope('target'): 87 | _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = mlp_actor_critic(self.x2_ph, 88 | self.x2_ph, 89 | self.a_ph, 90 | **ac_kwargs) 91 | 92 | # Experience buffer 93 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, 94 | act_dim=self.act_dim, 95 | size=replay_size) 96 | 97 | # Count variables 98 | var_counts = tuple(core.count_vars(scope) for scope in 99 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 100 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 101 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 102 | # 重新修改下面这段! 
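# The block that follows implements SAC's automatic temperature tuning:
# target_entropy is set to -act_dim, a single trainable scalar log_alpha gives
# alpha = exp(log_alpha), and minimizing
#     alpha_loss = E[ -log_alpha * stop_gradient(logp_pi + target_entropy) ]
# raises alpha whenever the policy's entropy drops below the target and lowers it
# when the policy is more random than required, so the entropy bonus is adjusted
# on the fly instead of being fixed by a hand-tuned alpha.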
103 | target_entropy = (-np.prod(a_dim)) 104 | 105 | log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) 106 | alpha = tf.exp(log_alpha) 107 | 108 | alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy)) 109 | 110 | alpha_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4, 111 | name='alpha_optimizer') 112 | train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) 113 | 114 | # Min Double-Q: 115 | min_q_pi = tf.minimum(q1_pi_, q2_pi_) 116 | 117 | # Targets for Q and V regression 118 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2) 119 | q_backup = self.r_ph + gamma * (1 - self.d_ph) * v_backup 120 | 121 | # Soft actor-critic losses 122 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 123 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 124 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 125 | value_loss = q1_loss + q2_loss 126 | 127 | # Policy train op 128 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 129 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=self.actor_lr) 130 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 131 | 132 | # Value train op 133 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 134 | value_optimizer = tf.train.AdamOptimizer(learning_rate=self.critic_lr) 135 | value_params = get_vars('main/q') 136 | with tf.control_dependencies([train_pi_op]): 137 | train_value_op = value_optimizer.minimize(value_loss, 138 | var_list=value_params) 139 | 140 | # Polyak averaging for target variables 141 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 142 | with tf.control_dependencies([train_value_op]): 143 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 144 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 145 | 146 | # All ops to call during one training step 147 | self.step_ops = [pi_loss, 148 | q1_loss, q2_loss, 149 | q1, q2, 150 | logp_pi, alpha, 151 | train_pi_op, 152 | train_value_op, 153 | target_update, 154 | train_alpha_op] 155 | 156 | # Initializing targets to match main variables 157 | target_init = tf.group([tf.assign(v_targ, v_main) 158 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 159 | 160 | if sess_opt: 161 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=sess_opt) 162 | self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 163 | else: 164 | self.sess = tf.Session() 165 | self.sess.run(tf.global_variables_initializer()) 166 | self.sess.run(target_init) 167 | 168 | def get_action(self, s, noise_scale=0): 169 | if not noise_scale: 170 | act_op = self.mu 171 | else: 172 | act_op = self.pi 173 | a = self.sess.run(act_op, 174 | feed_dict={self.x_ph: s.reshape(1, -1)})[0] 175 | return np.clip(a, -self.act_limit, self.act_limit) 176 | 177 | def store_transition(self, transition): 178 | (s, a, r, s_, done) = transition 179 | self.replay_buffer.store(s, a, r, s_, done) 180 | 181 | def test_agent(self, env, max_ep_len=200, n=5, logger=None): 182 | ep_reward_list = [] 183 | for j in range(n): 184 | s = env.reset() 185 | ep_reward = 0 186 | for i in range(max_ep_len): 187 | # Take deterministic actions at test time (noise_scale=0) 188 | a = self.get_action(s) 189 | s, r, d, _ = env.step(a) 190 | ep_reward += r 191 | ep_reward_list.append(ep_reward) 192 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 193 | if logger: 194 
| logger.store(TestEpRet=mean_ep_reward) 195 | if logger: 196 | return mean_ep_reward, logger 197 | else: 198 | return mean_ep_reward 199 | 200 | def learn(self, batch_size=100, 201 | actor_lr_input=0.001, 202 | critic_lr_input=0.001, 203 | ): 204 | 205 | batch = self.replay_buffer.sample_batch(batch_size) 206 | feed_dict = {self.x_ph: batch['obs1'], 207 | self.x2_ph: batch['obs2'], 208 | self.a_ph: batch['acts'], 209 | self.r_ph: batch['rews'], 210 | self.d_ph: batch['done'], 211 | self.actor_lr: actor_lr_input, 212 | self.critic_lr: critic_lr_input, 213 | } 214 | outs = self.sess.run(self.step_ops, 215 | feed_dict) 216 | self.learn_step += 1 217 | return outs 218 | 219 | def load_step_network(self, saver, load_path): 220 | checkpoint = tf.train.get_checkpoint_state(load_path) 221 | if checkpoint and checkpoint.model_checkpoint_path: 222 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 223 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 224 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 225 | else: 226 | print("Could not find old network weights") 227 | 228 | def save_step_network(self, time_step, saver, save_path): 229 | saver.save(self.sess, save_path + 'network', global_step=time_step, 230 | write_meta_graph=False) 231 | 232 | def load_simple_network(self, path): 233 | saver = tf.train.Saver() 234 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 235 | print("restore model successful") 236 | 237 | def save_simple_network(self, save_path): 238 | saver = tf.train.Saver() 239 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 240 | 241 | 242 | if __name__ == '__main__': 243 | import argparse 244 | 245 | random_seed = int(time.time() * 1000 % 1000) 246 | random_seed = 184 247 | parser = argparse.ArgumentParser() 248 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 249 | parser.add_argument('--hid', type=int, default=300) 250 | parser.add_argument('--l', type=int, default=1) 251 | parser.add_argument('--gamma', type=float, default=0.99) 252 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 253 | parser.add_argument('--epochs', type=int, default=3000) 254 | parser.add_argument('--max_steps', type=int, default=1000) 255 | parser.add_argument('--exp_name', type=str, default='sac_auto_class') 256 | args = parser.parse_args() 257 | 258 | env = gym.make(args.env) 259 | env = env.unwrapped 260 | env.seed(args.seed) 261 | 262 | s_dim = env.observation_space.shape[0] 263 | a_dim = env.action_space.shape[0] 264 | a_bound = env.action_space.high[0] 265 | 266 | net = SAC(a_dim, s_dim, a_bound, 267 | # batch_size=100, 268 | sess_opt=0.1 269 | ) 270 | ep_reward_list = [] 271 | test_ep_reward_list = [] 272 | 273 | for i in range(args.epochs): 274 | s = env.reset() 275 | ep_reward = 0 276 | st = time.time() 277 | for j in range(args.max_steps): 278 | 279 | # Add exploration noise 280 | if i < 10: 281 | a = np.random.rand(a_dim) * a_bound 282 | else: 283 | a = net.get_action(s, 0.1) 284 | 285 | a = np.clip(a, -a_bound, a_bound) 286 | 287 | s_, r, done, info = env.step(a) 288 | done = False if j == args.max_steps - 1 else done 289 | 290 | net.store_transition((s, a, r, s_, done)) 291 | 292 | s = s_ 293 | ep_reward += r 294 | if j == args.max_steps - 1: 295 | ep_update_time = time.time() 296 | for _ in range(args.max_steps): 297 | net.learn() 298 | ep_update_time = time.time() - ep_update_time 299 | ep_reward_list.append(ep_reward) 300 | print('Episode:', i, ' Reward: %i' 
% int(ep_reward), 301 | # 'Explore: %.2f' % var, 302 | "learn step:", net.learn_step, 303 | "ep_time:", np.round(time.time()-st, 3), 304 | "up_time:", np.round(ep_update_time, 3), 305 | ) 306 | # if ep_reward > -300:RENDER = True 307 | 308 | # 增加测试部分! 309 | if i % 20 == 0: 310 | test_ep_reward = net.test_agent(env=env, n=5) 311 | test_ep_reward_list.append(test_ep_reward) 312 | print("-" * 20) 313 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 314 | 'Test Reward: %i' % int(test_ep_reward), 315 | ) 316 | print("-" * 20) 317 | 318 | break 319 | 320 | import matplotlib.pyplot as plt 321 | 322 | plt.plot(ep_reward_list) 323 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 324 | str(args.epochs) + 325 | "_seed" + str(args.seed)) 326 | plt.title(img_name + "_train") 327 | plt.savefig(img_name + ".png") 328 | plt.show() 329 | plt.close() 330 | 331 | plt.plot(test_ep_reward_list) 332 | plt.title(img_name + "_test") 333 | plt.savefig(img_name + ".png") 334 | plt.show() -------------------------------------------------------------------------------- /sac_sp/SAC_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from sac_sp import core 8 | from sac_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for SAC agents. 14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | class SAC: 43 | def __init__(self, 44 | a_dim, obs_dim, a_bound, 45 | mlp_actor_critic=core.mlp_actor_critic, 46 | ac_kwargs=dict(), seed=0, 47 | 48 | replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, alpha=0.2, 50 | pi_lr=1e-3, q_lr=1e-3, 51 | batch_size=100, 52 | # start_steps=10000, 53 | act_noise=0.1, target_noise=0.2, 54 | noise_clip=0.5, policy_delay=2, 55 | # max_ep_len=1000, 56 | # logger_kwargs=dict(), save_freq=1 57 | ): 58 | 59 | self.learn_step = 0 60 | 61 | self.obs_dim = obs_dim 62 | self.act_dim = a_dim 63 | self.act_limit = a_bound 64 | self.policy_delay = policy_delay 65 | self.action_noise = act_noise 66 | 67 | # Share information about action space with policy architecture 68 | ac_kwargs['action_space'] = a_bound 69 | 70 | # Inputs to computation graph 71 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 72 | 73 | # Main outputs from computation graph 74 | with tf.variable_scope('main'): 75 | self.mu, self.pi, 
logp_pi, q1, q2, q1_pi, q2_pi, v = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 76 | 77 | # Target value network 78 | with tf.variable_scope('target'): 79 | _, _, _, _, _, _, _, v_targ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 80 | 81 | # Experience buffer 82 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 83 | 84 | # Count variables 85 | var_counts = tuple(core.count_vars(scope) for scope in 86 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 87 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 88 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 89 | 90 | # Min Double-Q: 91 | min_q_pi = tf.minimum(q1_pi, q2_pi) 92 | 93 | # Targets for Q and V regression 94 | q_backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * v_targ) 95 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 96 | 97 | # Soft actor-critic losses 98 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 99 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 100 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 101 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 102 | value_loss = q1_loss + q2_loss + v_loss 103 | 104 | # Policy train op 105 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 106 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 107 | self.train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 108 | 109 | # Value train op 110 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 111 | value_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 112 | value_params = get_vars('main/q') + get_vars('main/v') 113 | with tf.control_dependencies([self.train_pi_op]): 114 | self.train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 115 | 116 | # Polyak averaging for target variables 117 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 118 | with tf.control_dependencies([self.train_value_op]): 119 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 120 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 121 | 122 | # All ops to call during one training step 123 | self.step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 124 | self.train_pi_op, self.train_value_op, target_update] 125 | 126 | # Initializing targets to match main variables 127 | target_init = tf.group([tf.assign(v_targ, v_main) 128 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 129 | 130 | self.sess = tf.Session() 131 | self.sess.run(tf.global_variables_initializer()) 132 | self.sess.run(target_init) 133 | 134 | def get_action(self, s, noise_scale=0): 135 | if not noise_scale: 136 | act_op = self.mu 137 | else: 138 | act_op = self.pi 139 | a = self.sess.run(act_op, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 140 | 141 | return np.clip(a, -self.act_limit, self.act_limit) 142 | 143 | def store_transition(self, transition): 144 | (s, a, r, s_, done) = transition 145 | self.replay_buffer.store(s, a, r, s_, done) 146 | 147 | def test_agent(self, env, max_ep_len=1000, n=5): 148 | ep_reward_list = [] 149 | for j in range(n): 150 | s = env.reset() 151 | ep_reward = 0 152 | for i in range(max_ep_len): 153 | # Take deterministic actions at test time (noise_scale=0) 154 | s, r, d, _ = env.step(self.get_action(s)) 155 | ep_reward += r 156 | ep_reward_list.append(ep_reward) 157 | mean_ep_reward = 
np.mean(np.array(ep_reward_list)) 158 | return mean_ep_reward 159 | 160 | def learn(self, batch_size=100): 161 | 162 | batch = self.replay_buffer.sample_batch(batch_size) 163 | feed_dict = {self.x_ph: batch['obs1'], 164 | self.x2_ph: batch['obs2'], 165 | self.a_ph: batch['acts'], 166 | self.r_ph: batch['rews'], 167 | self.d_ph: batch['done'] 168 | } 169 | outs = self.sess.run(self.step_ops,feed_dict) 170 | self.learn_step += 1 171 | 172 | def load_step_network(self, saver, load_path): 173 | checkpoint = tf.train.get_checkpoint_state(load_path) 174 | if checkpoint and checkpoint.model_checkpoint_path: 175 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 176 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 177 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 178 | else: 179 | print("Could not find old network weights") 180 | 181 | def save_step_network(self, time_step, saver, save_path): 182 | saver.save(self.sess, save_path + 'network', global_step=time_step, 183 | write_meta_graph=False) 184 | 185 | def load_simple_network(self, path): 186 | saver = tf.train.Saver() 187 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 188 | print("restore model successful") 189 | 190 | def save_simple_network(self, save_path): 191 | saver = tf.train.Saver() 192 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 193 | 194 | 195 | if __name__ == '__main__': 196 | import argparse 197 | 198 | random_seed = int(time.time() * 1000 % 1000) 199 | parser = argparse.ArgumentParser() 200 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 201 | parser.add_argument('--hid', type=int, default=300) 202 | parser.add_argument('--l', type=int, default=1) 203 | parser.add_argument('--gamma', type=float, default=0.99) 204 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 205 | parser.add_argument('--epochs', type=int, default=3000) 206 | parser.add_argument('--max_steps', type=int, default=1000) 207 | parser.add_argument('--exp_name', type=str, default='sac_class') 208 | args = parser.parse_args() 209 | 210 | env = gym.make(args.env) 211 | env = env.unwrapped 212 | env.seed(args.seed) 213 | 214 | s_dim = env.observation_space.shape[0] 215 | a_dim = env.action_space.shape[0] 216 | a_bound = env.action_space.high[0] 217 | 218 | net = SAC(a_dim, s_dim, a_bound, 219 | batch_size=100, 220 | ) 221 | ep_reward_list = [] 222 | test_ep_reward_list = [] 223 | 224 | for i in range(args.epochs): 225 | s = env.reset() 226 | ep_reward = 0 227 | for j in range(args.max_steps): 228 | 229 | # Add exploration noise 230 | if i < 10: 231 | a = np.random.rand(a_dim) * a_bound 232 | else: 233 | # a = net.choose_action(s) 234 | a = net.get_action(s, 0.1) 235 | # a = noise.add_noise(a) 236 | 237 | a = np.clip(a, -a_bound, a_bound) 238 | 239 | s_, r, done, info = env.step(a) 240 | done = False if j == args.max_steps - 1 else done 241 | 242 | net.store_transition((s, a, r, s_, done)) 243 | 244 | s = s_ 245 | ep_reward += r 246 | if j == args.max_steps - 1: 247 | 248 | for _ in range(args.max_steps): 249 | net.learn() 250 | 251 | ep_reward_list.append(ep_reward) 252 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 253 | # 'Explore: %.2f' % var, 254 | "learn step:", net.learn_step) 255 | # if ep_reward > -300:RENDER = True 256 | 257 | # 增加测试部分! 
258 | if i % 20 == 0: 259 | test_ep_reward = net.test_agent(env=env, n=5) 260 | test_ep_reward_list.append(test_ep_reward) 261 | print("-" * 20) 262 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 263 | 'Test Reward: %i' % int(test_ep_reward), 264 | ) 265 | print("-" * 20) 266 | 267 | break 268 | 269 | import matplotlib.pyplot as plt 270 | 271 | plt.plot(ep_reward_list) 272 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 273 | str(args.epochs) + 274 | "_seed" + str(args.seed)) 275 | plt.title(img_name + "_train") 276 | plt.savefig(img_name + ".png") 277 | plt.show() 278 | plt.close() 279 | 280 | plt.plot(test_ep_reward_list) 281 | plt.title(img_name + "_test") 282 | plt.savefig(img_name + ".png") 283 | plt.show() 284 | -------------------------------------------------------------------------------- /sac_sp/SAC_sp.py: -------------------------------------------------------------------------------- 1 | """ 2 | 去掉了log的信息,直接简单的画了一个图. 3 | 待会儿在这个基础上,封装一个类 4 | """ 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | import time 10 | import sys 11 | sys.path.append("../") 12 | from sac_sp import core 13 | from sac_sp.core import get_vars, mlp_actor_critic 14 | 15 | 16 | class ReplayBuffer: 17 | """ 18 | A simple FIFO experience replay buffer for SAC agents. 19 | """ 20 | 21 | def __init__(self, obs_dim, act_dim, size): 22 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 24 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 25 | self.rews_buf = np.zeros(size, dtype=np.float32) 26 | self.done_buf = np.zeros(size, dtype=np.float32) 27 | self.ptr, self.size, self.max_size = 0, 0, size 28 | 29 | def store(self, obs, act, rew, next_obs, done): 30 | self.obs1_buf[self.ptr] = obs 31 | self.obs2_buf[self.ptr] = next_obs 32 | self.acts_buf[self.ptr] = act 33 | self.rews_buf[self.ptr] = rew 34 | self.done_buf[self.ptr] = done 35 | self.ptr = (self.ptr + 1) % self.max_size 36 | self.size = min(self.size + 1, self.max_size) 37 | 38 | def sample_batch(self, batch_size=32): 39 | idxs = np.random.randint(0, self.size, size=batch_size) 40 | return dict(obs1=self.obs1_buf[idxs], 41 | obs2=self.obs2_buf[idxs], 42 | acts=self.acts_buf[idxs], 43 | rews=self.rews_buf[idxs], 44 | done=self.done_buf[idxs]) 45 | 46 | 47 | def sac(env_fn, actor_critic=core.mlp_actor_critic, 48 | ac_kwargs=dict(), seed=0, 49 | steps_per_epoch=5000, epochs=600, 50 | replay_size=int(1e6), gamma=0.99, 51 | polyak=0.995, lr=1e-3, alpha=0.2, 52 | batch_size=100, start_steps=10000, 53 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 54 | 55 | tf.set_random_seed(seed) 56 | np.random.seed(seed) 57 | 58 | env, test_env = env_fn(), env_fn() 59 | obs_dim = env.observation_space.shape[0] 60 | act_dim = env.action_space.shape[0] 61 | 62 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 
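# (The single-scalar act_limit taken on the next line is fine for the MuJoCo tasks
# used in this script, whose action bounds are symmetric and identical across
# dimensions; an environment with per-dimension bounds would need
# env.action_space.high used as a full vector instead of high[0].)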
63 | act_limit = env.action_space.high[0] 64 | 65 | # Share information about action space with policy architecture 66 | ac_kwargs['action_space'] = env.action_space.high[0] 67 | 68 | # Inputs to computation graph 69 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 70 | 71 | # Main outputs from computation graph 72 | with tf.variable_scope('main'): 73 | mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) 74 | 75 | # Target value network 76 | with tf.variable_scope('target'): 77 | _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) 78 | 79 | # Experience buffer 80 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 81 | 82 | # Count variables 83 | var_counts = tuple(core.count_vars(scope) for scope in 84 | ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) 85 | print(('\nNumber of parameters: \t pi: %d, \t' + \ 86 | 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) 87 | 88 | # Min Double-Q: 89 | min_q_pi = tf.minimum(q1_pi, q2_pi) 90 | 91 | # Targets for Q and V regression 92 | q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) 93 | v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) 94 | 95 | # Soft actor-critic losses 96 | pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) 97 | q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) 98 | q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) 99 | v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2) 100 | value_loss = q1_loss + q2_loss + v_loss 101 | 102 | # Policy train op 103 | # (has to be separate from value train op, because q1_pi appears in pi_loss) 104 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 105 | train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) 106 | 107 | # Value train op 108 | # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) 109 | value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) 110 | value_params = get_vars('main/q') + get_vars('main/v') 111 | with tf.control_dependencies([train_pi_op]): 112 | train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) 113 | 114 | # Polyak averaging for target variables 115 | # (control flow because sess.run otherwise evaluates in nondeterministic order) 116 | with tf.control_dependencies([train_value_op]): 117 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 118 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 119 | 120 | # All ops to call during one training step 121 | step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 122 | train_pi_op, train_value_op, target_update] 123 | 124 | # Initializing targets to match main variables 125 | target_init = tf.group([tf.assign(v_targ, v_main) 126 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 127 | 128 | sess = tf.Session() 129 | sess.run(tf.global_variables_initializer()) 130 | sess.run(target_init) 131 | 132 | def get_action(o, deterministic=False): 133 | act_op = mu if deterministic else pi 134 | return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0] 135 | 136 | def test_agent(n=10): 137 | global sess, mu, pi, q1, q2, q1_pi, q2_pi 138 | for j in range(n): 139 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 140 | while not (d or (ep_len == max_ep_len)): 141 | # Take deterministic actions at test time 142 | o, r, d, _ = test_env.step(get_action(o, True)) 143 | ep_ret += r 144 | ep_len += 1 
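# Note: with the logging stripped out of this standalone script (see the module
# docstring), test_agent() only rolls the deterministic policy forward and discards
# ep_ret and ep_len; it is kept so the end-of-epoch evaluation hook further down
# still runs.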
145 | 146 | start_time = time.time() 147 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 148 | total_steps = steps_per_epoch * epochs 149 | 150 | ep_ret_list = [] 151 | episode = 0 152 | 153 | # Main loop: collect experience in env and update/log each epoch 154 | for t in range(total_steps): 155 | 156 | """ 157 | Until start_steps have elapsed, randomly sample actions 158 | from a uniform distribution for better exploration. Afterwards, 159 | use the learned policy. 160 | """ 161 | if t > start_steps: 162 | a = get_action(o) 163 | else: 164 | a = env.action_space.sample() 165 | 166 | # Step the env 167 | o2, r, d, _ = env.step(a) 168 | ep_ret += r 169 | ep_len += 1 170 | 171 | # Ignore the "done" signal if it comes from hitting the time 172 | # horizon (that is, when it's an artificial terminal signal 173 | # that isn't based on the agent's state) 174 | d = False if ep_len == max_ep_len else d 175 | 176 | # Store experience to replay buffer 177 | replay_buffer.store(o, a, r, o2, d) 178 | 179 | # Super critical, easy to overlook step: make sure to update 180 | # most recent observation! 181 | o = o2 182 | 183 | if d or (ep_len == max_ep_len): 184 | """ 185 | Perform all SAC updates at the end of the trajectory. 186 | This is a slight difference from the SAC specified in the 187 | original paper. 188 | """ 189 | episode += 1 190 | ep_ret_list.append(ep_ret) 191 | epoch = t // steps_per_epoch 192 | print("Epoch:", epoch) 193 | print("Episode:", episode) 194 | print("Training Step:", t) 195 | print("Episode Reward:", ep_ret) 196 | 197 | for j in range(ep_len): 198 | batch = replay_buffer.sample_batch(batch_size) 199 | feed_dict = {x_ph: batch['obs1'], 200 | x2_ph: batch['obs2'], 201 | a_ph: batch['acts'], 202 | r_ph: batch['rews'], 203 | d_ph: batch['done'], 204 | } 205 | outs = sess.run(step_ops, feed_dict) 206 | 207 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 208 | 209 | # End of epoch wrap-up 210 | if t > 0 and t % steps_per_epoch == 0: 211 | test_agent() 212 | 213 | import matplotlib.pyplot as plt 214 | plt.plot(ep_ret_list) 215 | plt.show() 216 | 217 | 218 | if __name__ == '__main__': 219 | import argparse 220 | 221 | parser = argparse.ArgumentParser() 222 | parser.add_argument('--env', type=str, default='Hopper-v2') 223 | parser.add_argument('--hid', type=int, default=300) 224 | parser.add_argument('--l', type=int, default=1) 225 | parser.add_argument('--gamma', type=float, default=0.99) 226 | parser.add_argument('--seed', '-s', type=int, default=7) 227 | parser.add_argument('--epochs', type=int, default=600) 228 | parser.add_argument('--exp_name', type=str, default='sac') 229 | args = parser.parse_args() 230 | 231 | sac(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 232 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 233 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 234 | ) 235 | 236 | -------------------------------------------------------------------------------- /sac_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/__init__.py -------------------------------------------------------------------------------- /sac_sp/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else 
(None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | 25 | def gaussian_likelihood(x, mu, log_std): 26 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 27 | return tf.reduce_sum(pre_sum, axis=1) 28 | 29 | 30 | def clip_but_pass_gradient(x, l=-1., u=1.): 31 | clip_up = tf.cast(x > u, tf.float32) 32 | clip_low = tf.cast(x < l, tf.float32) 33 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 34 | 35 | 36 | """ 37 | Policies 38 | """ 39 | 40 | LOG_STD_MAX = 2 41 | LOG_STD_MIN = -20 42 | 43 | 44 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 45 | act_dim = a.shape.as_list()[-1] 46 | net = mlp(x, list(hidden_sizes), activation, activation) 47 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 48 | 49 | """ 50 | Because algorithm maximizes trade-off of reward and entropy, 51 | entropy must be unique to state---and therefore log_stds need 52 | to be a neural network output instead of a shared-across-states 53 | learnable parameter vector. But for deep Relu and other nets, 54 | simply sticking an activationless dense layer at the end would 55 | be quite bad---at the beginning of training, a randomly initialized 56 | net could produce extremely large values for the log_stds, which 57 | would result in some actions being either entirely deterministic 58 | or too random to come back to earth. Either of these introduces 59 | numerical instability which could break the algorithm. To 60 | protect against that, we'll constrain the output range of the 61 | log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is 62 | slightly different from the trick used by the original authors of 63 | SAC---they used tf.clip_by_value instead of squashing and rescaling. 64 | I prefer this approach because it allows gradient propagation 65 | through log_std where clipping wouldn't, but I don't know if 66 | it makes much of a difference. 67 | """ 68 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 69 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 70 | 71 | std = tf.exp(log_std) 72 | pi = mu + tf.random_normal(tf.shape(mu)) * std 73 | logp_pi = gaussian_likelihood(pi, mu, log_std) 74 | return mu, pi, logp_pi 75 | 76 | 77 | def apply_squashing_func(mu, pi, logp_pi): 78 | mu = tf.tanh(mu) 79 | pi = tf.tanh(pi) 80 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
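# Change of variables for the tanh squash: if a = tanh(u) with u sampled from the Gaussian policy, then log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2); the clip below keeps 1 - pi**2 inside [0, 1] and the 1e-6 guards against log(0).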
81 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 82 | return mu, pi, logp_pi 83 | 84 | 85 | """ 86 | Actor-Critics 87 | """ 88 | 89 | 90 | def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 91 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 92 | # policy 93 | with tf.variable_scope('pi'): 94 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 95 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 96 | 97 | # make sure actions are in correct range 98 | # action_scale = action_space.high[0] 99 | action_scale = action_space 100 | mu *= action_scale 101 | pi *= action_scale 102 | 103 | # vfs 104 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 105 | with tf.variable_scope('q1'): 106 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 107 | with tf.variable_scope('q1', reuse=True): 108 | q1_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 109 | with tf.variable_scope('q2'): 110 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 111 | with tf.variable_scope('q2', reuse=True): 112 | q2_pi = vf_mlp(tf.concat([x,pi], axis=-1)) 113 | with tf.variable_scope('v'): 114 | v = vf_mlp(x) 115 | return mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v -------------------------------------------------------------------------------- /sac_sp/exp_images/HalfCheetah-v2-sac-class-300k-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/HalfCheetah-v2-sac-class-300k-test.png -------------------------------------------------------------------------------- /sac_sp/exp_images/HalfCheetah-v2-sac-class-300k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/HalfCheetah-v2-sac-class-300k.png -------------------------------------------------------------------------------- /sac_sp/exp_images/Hopper-v2-sac-class-3000k-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/Hopper-v2-sac-class-3000k-test.png -------------------------------------------------------------------------------- /sac_sp/exp_images/Hopper-v2-sac-class-3000k-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/Hopper-v2-sac-class-3000k-train.png -------------------------------------------------------------------------------- /sac_sp/exp_images/Hopper-v2-sac-sp-5000k-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sac_sp/exp_images/Hopper-v2-sac-sp-5000k-train.png -------------------------------------------------------------------------------- /sac_sp/test_gym_sac_sp_class.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import gym 4 | import time 5 | import sys 6 | 7 | sys.path.append("../") 8 | from sac_sp.SAC_class import SAC 9 | 10 | MAX_EPISODES = 250 11 
| MAX_EP_STEPS = 1000 12 | 13 | RENDER = False 14 | ENV_NAME = 'Hopper-v2' 15 | 16 | 17 | def test_agent(net, env, n=10): 18 | ep_reward_list = [] 19 | for j in range(n): 20 | s = env.reset() 21 | ep_reward = 0 22 | for i in range(MAX_EP_STEPS): 23 | # Take deterministic actions at test time (noise_scale=0) 24 | s, r, d, _ = env.step(net.get_action(s)) 25 | ep_reward += r 26 | 27 | ep_reward_list.append(ep_reward) 28 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 29 | return mean_ep_reward 30 | 31 | 32 | def main(): 33 | 34 | env = gym.make(ENV_NAME) 35 | env = env.unwrapped 36 | env.seed(4) 37 | 38 | s_dim = env.observation_space.shape[0] 39 | a_dim = env.action_space.shape[0] 40 | a_bound = env.action_space.high[0] 41 | 42 | net = SAC(a_dim, s_dim, a_bound, 43 | batch_size=100, 44 | ) 45 | ep_reward_list = [] 46 | test_ep_reward_list = [] 47 | for i in range(MAX_EPISODES): 48 | s = env.reset() 49 | ep_reward = 0 50 | for j in range(MAX_EP_STEPS): 51 | if RENDER: 52 | env.render() 53 | 54 | # Add exploration noise 55 | if i < 10: 56 | a = np.random.rand(a_dim) * a_bound 57 | else: 58 | # a = net.choose_action(s) 59 | a = net.get_action(s, 0.1) 60 | # a = noise.add_noise(a) 61 | 62 | a = np.clip(a, -a_bound, a_bound) 63 | 64 | s_, r, done, info = env.step(a) 65 | done = False if j == MAX_EP_STEPS-1 else done 66 | 67 | net.store_transition((s, a, r, s_, done)) 68 | 69 | s = s_ 70 | ep_reward += r 71 | if j == MAX_EP_STEPS - 1: 72 | 73 | for _ in range(MAX_EP_STEPS): 74 | net.learn() 75 | 76 | ep_reward_list.append(ep_reward) 77 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 78 | # 'Explore: %.2f' % var, 79 | "learn step:", net.learn_step) 80 | # if ep_reward > -300:RENDER = True 81 | 82 | # 增加测试部分! 83 | if i % 20 == 0: 84 | test_ep_reward = test_agent(net=net, env=env, n=5) 85 | test_ep_reward_list.append(test_ep_reward) 86 | print("-"*20) 87 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 88 | 'Test Reward: %i' % int(test_ep_reward), 89 | ) 90 | print("-" * 20) 91 | 92 | break 93 | 94 | plt.plot(ep_reward_list) 95 | plt.show() 96 | plt.plot(test_ep_reward_list) 97 | plt.show() 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | 103 | -------------------------------------------------------------------------------- /sp_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/sp_utils/__init__.py -------------------------------------------------------------------------------- /sp_utils/logx.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | 5 | Logs to a tab-separated-values file (path/to/output_directory/progress.txt) 6 | 7 | """ 8 | import json 9 | import joblib 10 | import shutil 11 | import numpy as np 12 | import tensorflow as tf 13 | import os.path as osp, time, atexit, os 14 | import sys 15 | sys.path.append("../") 16 | 17 | from sp_utils.mpi_tools import proc_id, mpi_statistics_scalar 18 | from sp_utils.serialization_utils import convert_json 19 | 20 | color2num = dict( 21 | gray=30, 22 | red=31, 23 | green=32, 24 | yellow=33, 25 | blue=34, 26 | magenta=35, 27 | cyan=36, 28 | white=37, 29 | crimson=38 30 | ) 31 | 32 | 33 | def colorize(string, color, bold=False, highlight=False): 34 | """ 35 | Colorize a string. 36 | 37 | This function was originally written by John Schulman. 
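For example, ``colorize("hello", 'green', bold=True)`` returns ``'\x1b[32;1mhello\x1b[0m'``, i.e. the input wrapped in ANSI escape codes.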
38 | """ 39 | attr = [] 40 | num = color2num[color] 41 | if highlight: num += 10 42 | attr.append(str(num)) 43 | if bold: attr.append('1') 44 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 45 | 46 | 47 | def restore_tf_graph(sess, fpath): 48 | """ 49 | Loads graphs saved by Logger. 50 | 51 | Will output a dictionary whose keys and values are from the 'inputs' 52 | and 'outputs' dict you specified with logger.setup_tf_saver(). 53 | 54 | Args: 55 | sess: A Tensorflow session. 56 | fpath: Filepath to save directory. 57 | 58 | Returns: 59 | A dictionary mapping from keys to tensors in the computation graph 60 | loaded from ``fpath``. 61 | """ 62 | tf.saved_model.loader.load( 63 | sess, 64 | [tf.saved_model.tag_constants.SERVING], 65 | fpath 66 | ) 67 | model_info = joblib.load(osp.join(fpath, 'model_info.pkl')) 68 | graph = tf.get_default_graph() 69 | model = dict() 70 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['inputs'].items()}) 71 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['outputs'].items()}) 72 | return model 73 | 74 | 75 | def setup_logger_kwargs(exp_name, seed=None, output_dir=None, datestamp=False): 76 | """ 77 | 从run.py文件里调过来的,output_dir名字修改了一下 78 | Sets up the output_dir for a logger and returns a dict for logger kwargs. 79 | 80 | If no seed is given and datestamp is false, 81 | 82 | :: 83 | 84 | output_dir = data_dir/exp_name 85 | 86 | If a seed is given and datestamp is false, 87 | 88 | :: 89 | 90 | output_dir = data_dir/exp_name/exp_name_s[seed] 91 | 92 | If datestamp is true, amend to 93 | 94 | :: 95 | 96 | output_dir = data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS_exp_name_s[seed] 97 | 98 | You can force datestamp=True by setting ``FORCE_DATESTAMP=True`` in 99 | ``spinup/user_config.py``. 100 | 101 | Args: 102 | 103 | exp_name (string): Name for experiment. 104 | 105 | seed (int): Seed for random number generators used by experiment. 106 | 107 | data_dir (string): Path to folder where results should be saved. 108 | Default is the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``. 109 | 110 | datestamp (bool): Whether to include a date and timestamp in the 111 | name of the save directory. 112 | 113 | Returns: 114 | 115 | logger_kwargs, a dict containing output_dir and exp_name. 116 | """ 117 | 118 | # Datestamp forcing 119 | datestamp = datestamp or True 120 | 121 | # Make base path 122 | ymd_time = time.strftime("%Y-%m-%d_") if datestamp else '' 123 | relpath = ''.join([ymd_time, exp_name]) 124 | 125 | if seed is not None: 126 | # Make a seed-specific subfolder in the experiment directory. 127 | if datestamp: 128 | hms_time = time.strftime("%Y-%m-%d_%H-%M-%S") 129 | subfolder = ''.join([hms_time, '-', exp_name, '_s', str(seed)]) 130 | else: 131 | subfolder = ''.join([exp_name, '_s', str(seed)]) 132 | relpath = osp.join(relpath, subfolder) 133 | 134 | data_dir = output_dir or True 135 | logger_kwargs = dict(output_dir=osp.join(data_dir, relpath), 136 | exp_name=exp_name) 137 | return logger_kwargs 138 | 139 | 140 | class Logger: 141 | """ 142 | A general-purpose logger. 143 | 144 | Makes it easy to save diagnostics, hyperparameter configurations, the 145 | state of a training run, and the trained model. 146 | """ 147 | 148 | def __init__(self, output_dir=None, 149 | output_fname='progress.txt', exp_name=None): 150 | """ 151 | Initialize a Logger. 152 | 153 | Args: 154 | output_dir (string): A directory for saving results to. 
If 155 | ``None``, defaults to a temp directory of the form 156 | ``/tmp/experiments/somerandomnumber``. 157 | 158 | output_fname (string): Name for the tab-separated-value file 159 | containing metrics logged throughout a training run. 160 | Defaults to ``progress.txt``. 161 | 162 | exp_name (string): Experiment name. If you run multiple training 163 | runs and give them all the same ``exp_name``, the plotter 164 | will know to group them. (Use case: if you run the same 165 | hyperparameter configuration with multiple random seeds, you 166 | should give them all the same ``exp_name``.) 167 | """ 168 | if proc_id()==0: 169 | self.output_dir = output_dir or "/tmp/experiments/%i"%int(time.time()) 170 | if osp.exists(self.output_dir): 171 | print("Warning: Log dir %s already exists! Storing info there anyway."%self.output_dir) 172 | else: 173 | os.makedirs(self.output_dir) 174 | self.output_file = open(osp.join(self.output_dir, output_fname), 'w') 175 | atexit.register(self.output_file.close) 176 | print(colorize("Logging data to %s"%self.output_file.name, 'green', bold=True)) 177 | else: 178 | self.output_dir = None 179 | self.output_file = None 180 | self.first_row=True 181 | self.log_headers = [] 182 | self.log_current_row = {} 183 | self.exp_name = exp_name 184 | 185 | def log(self, msg, color='green'): 186 | """Print a colorized message to stdout.""" 187 | if proc_id()==0: 188 | print(colorize(msg, color, bold=True)) 189 | 190 | def log_tabular(self, key, val): 191 | """ 192 | Log a value of some diagnostic. 193 | 194 | Call this only once for each diagnostic quantity, each iteration. 195 | After using ``log_tabular`` to store values for each diagnostic, 196 | make sure to call ``dump_tabular`` to write them out to file and 197 | stdout (otherwise they will not get saved anywhere). 198 | """ 199 | if self.first_row: 200 | self.log_headers.append(key) 201 | else: 202 | assert key in self.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 203 | assert key not in self.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 204 | self.log_current_row[key] = val 205 | 206 | def save_config(self, config): 207 | """ 208 | Log an experiment configuration. 209 | 210 | Call this once at the top of your experiment, passing in all important 211 | config vars as a dict. This will serialize the config to JSON, while 212 | handling anything which can't be serialized in a graceful way (writing 213 | as informative a string as possible). 214 | 215 | Example use: 216 | 217 | .. code-block:: python 218 | 219 | logger = EpochLogger(**logger_kwargs) 220 | logger.save_config(locals()) 221 | """ 222 | 223 | config_json = convert_json(config) 224 | if self.exp_name is not None: 225 | config_json['exp_name'] = self.exp_name 226 | if proc_id()==0: 227 | output = json.dumps(config_json, separators=(',',':\t'), 228 | indent=4, sort_keys=True) 229 | print(colorize('Saving config:\n', color='cyan', bold=True)) 230 | print(output) 231 | with open(osp.join(self.output_dir, "config.json"), 'w') as out: 232 | out.write(output) 233 | 234 | def save_state(self, state_dict, itr=None): 235 | """ 236 | Saves the state of an experiment. 237 | 238 | To be clear: this is about saving *state*, not logging diagnostics. 239 | All diagnostic logging is separate from this function. 
This function 240 | will save whatever is in ``state_dict``---usually just a copy of the 241 | environment---and the most recent parameters for the model you 242 | previously set up saving for with ``setup_tf_saver``. 243 | 244 | Call with any frequency you prefer. If you only want to maintain a 245 | single state and overwrite it at each call with the most recent 246 | version, leave ``itr=None``. If you want to keep all of the states you 247 | save, provide unique (increasing) values for 'itr'. 248 | 249 | Args: 250 | state_dict (dict): Dictionary containing essential elements to 251 | describe the current state of training. 252 | 253 | itr: An int, or None. Current iteration of training. 254 | """ 255 | if proc_id()==0: 256 | fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr 257 | try: 258 | joblib.dump(state_dict, osp.join(self.output_dir, fname)) 259 | except: 260 | self.log('Warning: could not pickle state_dict.', color='red') 261 | if hasattr(self, 'tf_saver_elements'): 262 | self._tf_simple_save(itr) 263 | 264 | def setup_tf_saver(self, sess, inputs, outputs): 265 | """ 266 | Set up easy model saving for tensorflow. 267 | 268 | Call once, after defining your computation graph but before training. 269 | 270 | Args: 271 | sess: The Tensorflow session in which you train your computation 272 | graph. 273 | 274 | inputs (dict): A dictionary that maps from keys of your choice 275 | to the tensorflow placeholders that serve as inputs to the 276 | computation graph. Make sure that *all* of the placeholders 277 | needed for your outputs are included! 278 | 279 | outputs (dict): A dictionary that maps from keys of your choice 280 | to the outputs from your computation graph. 281 | """ 282 | self.tf_saver_elements = dict(session=sess, inputs=inputs, 283 | outputs=outputs) 284 | self.tf_saver_info = {'inputs': {k:v.name for k,v in inputs.items()}, 285 | 'outputs': {k:v.name for k,v in outputs.items()}} 286 | 287 | def _tf_simple_save(self, itr=None): 288 | """ 289 | Uses simple_save to save a trained model, plus info to make it easy 290 | to associated tensors to variables after restore. 291 | """ 292 | if proc_id()==0: 293 | assert hasattr(self, 'tf_saver_elements'), \ 294 | "First have to setup saving with self.setup_tf_saver" 295 | fpath = 'simple_save' + ('%d'%itr if itr is not None else '') 296 | fpath = osp.join(self.output_dir, fpath) 297 | if osp.exists(fpath): 298 | # simple_save refuses to be useful if fpath already exists, 299 | # so just delete fpath if it's there. 300 | shutil.rmtree(fpath) 301 | tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements) 302 | joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl')) 303 | 304 | def dump_tabular(self): 305 | """ 306 | Write all of the diagnostics from the current iteration. 307 | 308 | Writes both to stdout, and to the output file. 
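A typical pattern is to call it once at the end of each iteration, after the ``log_tabular`` calls for that iteration (variable names here are only illustrative):

        .. code-block:: python

            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('AverageEpRet', avg_ep_ret)
            logger.dump_tabular()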
309 | """ 310 | if proc_id()==0: 311 | vals = [] 312 | key_lens = [len(key) for key in self.log_headers] 313 | max_key_len = max(15,max(key_lens)) 314 | keystr = '%'+'%d'%max_key_len 315 | fmt = "| " + keystr + "s | %15s |" 316 | n_slashes = 22 + max_key_len 317 | print("-"*n_slashes) 318 | for key in self.log_headers: 319 | val = self.log_current_row.get(key, "") 320 | valstr = "%8.3g"%val if hasattr(val, "__float__") else val 321 | print(fmt%(key, valstr)) 322 | vals.append(val) 323 | print("-"*n_slashes) 324 | if self.output_file is not None: 325 | if self.first_row: 326 | self.output_file.write("\t".join(self.log_headers)+"\n") 327 | self.output_file.write("\t".join(map(str,vals))+"\n") 328 | self.output_file.flush() 329 | self.log_current_row.clear() 330 | self.first_row=False 331 | 332 | 333 | class EpochLogger(Logger): 334 | """ 335 | A variant of Logger tailored for tracking average values over epochs. 336 | 337 | Typical use case: there is some quantity which is calculated many times 338 | throughout an epoch, and at the end of the epoch, you would like to 339 | report the average / std / min / max value of that quantity. 340 | 341 | With an EpochLogger, each time the quantity is calculated, you would 342 | use 343 | 344 | .. code-block:: python 345 | 346 | epoch_logger.store(NameOfQuantity=quantity_value) 347 | 348 | to load it into the EpochLogger's state. Then at the end of the epoch, you 349 | would use 350 | 351 | .. code-block:: python 352 | 353 | epoch_logger.log_tabular(NameOfQuantity, **options) 354 | 355 | to record the desired values. 356 | """ 357 | 358 | def __init__(self, *args, **kwargs): 359 | super().__init__(*args, **kwargs) 360 | self.epoch_dict = dict() 361 | 362 | def store(self, **kwargs): 363 | """ 364 | Save something into the epoch_logger's current state. 365 | 366 | Provide an arbitrary number of keyword arguments with numerical 367 | values. 368 | """ 369 | for k,v in kwargs.items(): 370 | if not(k in self.epoch_dict.keys()): 371 | self.epoch_dict[k] = [] 372 | self.epoch_dict[k].append(v) 373 | 374 | def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False): 375 | """ 376 | Log a value or possibly the mean/std/min/max values of a diagnostic. 377 | 378 | Args: 379 | key (string): The name of the diagnostic. If you are logging a 380 | diagnostic whose state has previously been saved with 381 | ``store``, the key here has to match the key you used there. 382 | 383 | val: A value for the diagnostic. If you have previously saved 384 | values for this key via ``store``, do *not* provide a ``val`` 385 | here. 386 | 387 | with_min_and_max (bool): If true, log min and max values of the 388 | diagnostic over the epoch. 389 | 390 | average_only (bool): If true, do not log the standard deviation 391 | of the diagnostic over the epoch. 392 | """ 393 | if val is not None: 394 | super().log_tabular(key,val) 395 | else: 396 | v = self.epoch_dict[key] 397 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 398 | stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max) 399 | super().log_tabular(key if average_only else 'Average' + key, stats[0]) 400 | if not(average_only): 401 | super().log_tabular('Std'+key, stats[1]) 402 | if with_min_and_max: 403 | super().log_tabular('Max'+key, stats[3]) 404 | super().log_tabular('Min'+key, stats[2]) 405 | self.epoch_dict[key] = [] 406 | 407 | def get_stats(self, key): 408 | """ 409 | Lets an algorithm ask the logger for mean/std/min/max of a diagnostic. 
410 | """ 411 | v = self.epoch_dict[key] 412 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 413 | return mpi_statistics_scalar(vals) 414 | -------------------------------------------------------------------------------- /sp_utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 
80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /sp_utils/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | 9 | DIV_LINE_WIDTH = 50 10 | 11 | # Global vars for tracking and labeling data at load time. 12 | exp_idx = 0 13 | units = dict() 14 | 15 | 16 | def plot_data(data, xaxis='Epoch', value="AverageEpRet", condition="Condition1", smooth=1, **kwargs): 17 | if smooth > 1: 18 | """ 19 | smooth data with moving window average. 20 | that is, 21 | smoothed_y[t] = average(y[t-k], y[t-k+1], ..., y[t+k-1], y[t+k]) 22 | where the "smooth" param is width of that window (2k+1) 23 | """ 24 | y = np.ones(smooth) 25 | for datum in data: 26 | x = np.asarray(datum[value]) 27 | z = np.ones(len(x)) 28 | smoothed_x = np.convolve(x,y,'same') / np.convolve(z,y,'same') 29 | datum[value] = smoothed_x 30 | 31 | if isinstance(data, list): 32 | data = pd.concat(data, ignore_index=True) 33 | sns.set(style="darkgrid", font_scale=1.5) 34 | sns.tsplot(data=data, time=xaxis, value=value, unit="Unit", condition=condition, ci='sd', **kwargs) 35 | """ 36 | If you upgrade to any version of Seaborn greater than 0.8.1, switch from 37 | tsplot to lineplot replacing L29 with: 38 | 39 | sns.lineplot(data=data, x=xaxis, y=value, hue=condition, ci='sd', **kwargs) 40 | 41 | Changes the colorscheme and the default legend style, though. 42 | """ 43 | # plt.legend(loc='best').set_draggable(True) 44 | # 上面的会报错~ 45 | plt.legend(loc='best') 46 | 47 | """ 48 | For the version of the legend used in the Spinning Up benchmarking page, 49 | swap L38 with: 50 | 51 | plt.legend(loc='upper center', ncol=6, handlelength=1, 52 | mode="expand", borderaxespad=0., prop={'size': 13}) 53 | """ 54 | 55 | xscale = np.max(np.asarray(data[xaxis])) > 5e3 56 | if xscale: 57 | # Just some formatting niceness: x-axis scale in scientific notation if max x is large 58 | plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0)) 59 | 60 | plt.tight_layout(pad=0.5) 61 | 62 | 63 | def get_datasets(logdir, condition=None): 64 | """ 65 | Recursively look through logdir for output files produced by 66 | spinup.logx.Logger. 67 | 68 | Assumes that any file "progress.txt" is a valid hit. 
69 | """ 70 | global exp_idx 71 | global units 72 | datasets = [] 73 | for root, _, files in os.walk(logdir): 74 | if 'progress.txt' in files: 75 | exp_name = None 76 | try: 77 | config_path = open(os.path.join(root,'config.json')) 78 | config = json.load(config_path) 79 | if 'exp_name' in config: 80 | exp_name = config['exp_name'] 81 | except: 82 | print('No file named config.json') 83 | condition1 = condition or exp_name or 'exp' 84 | condition2 = condition1 + '-' + str(exp_idx) 85 | exp_idx += 1 86 | if condition1 not in units: 87 | units[condition1] = 0 88 | unit = units[condition1] 89 | units[condition1] += 1 90 | 91 | try: 92 | exp_data = pd.read_table(os.path.join(root,'progress.txt')) 93 | except: 94 | print('Could not read from %s'%os.path.join(root,'progress.txt')) 95 | continue 96 | performance = 'AverageTestEpRet' if 'AverageTestEpRet' in exp_data else 'AverageEpRet' 97 | exp_data.insert(len(exp_data.columns),'Unit',unit) 98 | exp_data.insert(len(exp_data.columns),'Condition1',condition1) 99 | exp_data.insert(len(exp_data.columns),'Condition2',condition2) 100 | exp_data.insert(len(exp_data.columns),'Performance',exp_data[performance]) 101 | datasets.append(exp_data) 102 | return datasets 103 | 104 | 105 | def get_all_datasets(all_logdirs, legend=None, select=None, exclude=None): 106 | """ 107 | For every entry in all_logdirs, 108 | 1) check if the entry is a real directory and if it is, 109 | pull data from it; 110 | 111 | 2) if not, check to see if the entry is a prefix for a 112 | real directory, and pull data from that. 113 | """ 114 | logdirs = [] 115 | for logdir in all_logdirs: 116 | if osp.isdir(logdir) and logdir[-1]==os.sep: 117 | logdirs += [logdir] 118 | else: 119 | basedir = osp.dirname(logdir) 120 | fulldir = lambda x : osp.join(basedir, x) 121 | prefix = logdir.split(os.sep)[-1] 122 | listdir= os.listdir(basedir) 123 | logdirs += sorted([fulldir(x) for x in listdir if prefix in x]) 124 | 125 | """ 126 | Enforce selection rules, which check logdirs for certain substrings. 127 | Makes it easier to look at graphs from particular ablations, if you 128 | launch many jobs at once with similar names. 129 | """ 130 | if select is not None: 131 | logdirs = [log for log in logdirs if all(x in log for x in select)] 132 | if exclude is not None: 133 | logdirs = [log for log in logdirs if all(not(x in log) for x in exclude)] 134 | 135 | # Verify logdirs 136 | print('Plotting from...\n' + '='*DIV_LINE_WIDTH + '\n') 137 | for logdir in logdirs: 138 | print(logdir) 139 | print('\n' + '='*DIV_LINE_WIDTH) 140 | 141 | # Make sure the legend is compatible with the logdirs 142 | assert not(legend) or (len(legend) == len(logdirs)), \ 143 | "Must give a legend title for each set of experiments." 144 | 145 | # Load data from logdirs 146 | data = [] 147 | if legend: 148 | for log, leg in zip(logdirs, legend): 149 | data += get_datasets(log, leg) 150 | else: 151 | for log in logdirs: 152 | data += get_datasets(log) 153 | return data 154 | 155 | 156 | def make_plots(all_logdirs, legend=None, xaxis=None, values=None, count=False, 157 | font_scale=1.5, smooth=1, select=None, exclude=None, estimator='mean'): 158 | data = get_all_datasets(all_logdirs, legend, select, exclude) 159 | values = values if isinstance(values, list) else [values] 160 | condition = 'Condition2' if count else 'Condition1' 161 | estimator = getattr(np, estimator) # choose what to show on main curve: mean? max? min? 
162 | for value in values: 163 | plt.figure() 164 | plot_data(data, xaxis=xaxis, value=value, condition=condition, smooth=smooth, estimator=estimator) 165 | plt.show() 166 | 167 | 168 | def main(): 169 | import argparse 170 | parser = argparse.ArgumentParser() 171 | parser.add_argument('logdir', nargs='*') 172 | parser.add_argument('--legend', '-l', nargs='*') 173 | parser.add_argument('--xaxis', '-x', default='TotalEnvInteracts') 174 | parser.add_argument('--value', '-y', default='Performance', nargs='*') 175 | parser.add_argument('--count', action='store_true') 176 | parser.add_argument('--smooth', '-s', type=int, default=1) 177 | parser.add_argument('--select', nargs='*') 178 | parser.add_argument('--exclude', nargs='*') 179 | parser.add_argument('--est', default='mean') 180 | args = parser.parse_args() 181 | """ 182 | 183 | Args: 184 | logdir (strings): As many log directories (or prefixes to log 185 | directories, which the plotter will autocomplete internally) as 186 | you'd like to plot from. 187 | 188 | legend (strings): Optional way to specify legend for the plot. The 189 | plotter legend will automatically use the ``exp_name`` from the 190 | config.json file, unless you tell it otherwise through this flag. 191 | This only works if you provide a name for each directory that 192 | will get plotted. (Note: this may not be the same as the number 193 | of logdir args you provide! Recall that the plotter looks for 194 | autocompletes of the logdir args: there may be more than one 195 | match for a given logdir prefix, and you will need to provide a 196 | legend string for each one of those matches---unless you have 197 | removed some of them as candidates via selection or exclusion 198 | rules (below).) 199 | 200 | xaxis (string): Pick what column from data is used for the x-axis. 201 | Defaults to ``TotalEnvInteracts``. 202 | 203 | value (strings): Pick what columns from data to graph on the y-axis. 204 | Submitting multiple values will produce multiple graphs. Defaults 205 | to ``Performance``, which is not an actual output of any algorithm. 206 | Instead, ``Performance`` refers to either ``AverageEpRet``, the 207 | correct performance measure for the on-policy algorithms, or 208 | ``AverageTestEpRet``, the correct performance measure for the 209 | off-policy algorithms. The plotter will automatically figure out 210 | which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for 211 | each separate logdir. 212 | 213 | count: Optional flag. By default, the plotter shows y-values which 214 | are averaged across all results that share an ``exp_name``, 215 | which is typically a set of identical experiments that only vary 216 | in random seed. But if you'd like to see all of those curves 217 | separately, use the ``--count`` flag. 218 | 219 | smooth (int): Smooth data by averaging it over a fixed window. This 220 | parameter says how wide the averaging window will be. 221 | 222 | select (strings): Optional selection rule: the plotter will only show 223 | curves from logdirs that contain all of these substrings. 224 | 225 | exclude (strings): Optional exclusion rule: plotter will only show 226 | curves from logdirs that do not contain these substrings. 
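A typical invocation (illustrative; substitute your own log directories) would be:

        python sp_utils/plot.py data/sac_exp data/td3_exp --legend sac td3 --value Performance --smooth 10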
227 | 228 | """ 229 | 230 | make_plots(args.logdir, args.legend, args.xaxis, args.value, args.count, 231 | smooth=args.smooth, select=args.select, exclude=args.exclude, 232 | estimator=args.est) 233 | 234 | if __name__ == "__main__": 235 | main() -------------------------------------------------------------------------------- /sp_utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def convert_json(obj): 5 | """ Convert obj to a version which can be serialized with JSON. 6 | 垃圾递归!删掉了造成无限递归的数据类型~ 7 | """ 8 | 9 | if is_json_serializable(obj): 10 | return obj 11 | else: 12 | if isinstance(obj, dict): 13 | return {convert_json(k): convert_json(v) 14 | for k,v in obj.items()} 15 | 16 | elif isinstance(obj, tuple): 17 | return (convert_json(x) for x in obj) 18 | 19 | elif isinstance(obj, list): 20 | return [convert_json(x) for x in obj] 21 | 22 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 23 | # return 24 | return convert_json(obj.__name__) 25 | 26 | elif hasattr(obj,'__dict__') and obj.__dict__: 27 | return 28 | obj_dict = {convert_json(k): convert_json(v) 29 | for k,v in obj.__dict__.items()} 30 | return {str(obj): obj_dict} 31 | 32 | return str(obj) 33 | 34 | def is_json_serializable(v): 35 | try: 36 | json.dumps(v) 37 | return True 38 | except: 39 | return False -------------------------------------------------------------------------------- /td3_sp/TD3_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from td3_sp import core 8 | from td3_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for TD3 agents. 
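Once ``size`` transitions have been stored, new entries overwrite the oldest ones (the write pointer wraps around modulo ``size``).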
14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | class TD3: 43 | def __init__(self, 44 | a_dim, obs_dim, a_bound, 45 | mlp_actor_critic=core.mlp_actor_critic, 46 | ac_kwargs=dict(), seed=0, 47 | 48 | replay_size=int(1e6), gamma=0.99, 49 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 50 | batch_size=100, 51 | # start_steps=10000, 52 | act_noise=0.1, target_noise=0.2, 53 | noise_clip=0.5, policy_delay=2, 54 | # max_ep_len=1000, 55 | # logger_kwargs=dict(), save_freq=1 56 | ): 57 | 58 | self.learn_step = 0 59 | 60 | self.obs_dim = obs_dim 61 | self.act_dim = a_dim 62 | self.act_limit = a_bound 63 | self.policy_delay = policy_delay 64 | self.action_noise = act_noise 65 | 66 | # Share information about action space with policy architecture 67 | ac_kwargs['action_space'] = a_bound 68 | 69 | # Inputs to computation graph 70 | self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(obs_dim, a_dim, obs_dim, None, None) 71 | 72 | # Main outputs from computation graph 73 | with tf.variable_scope('main'): 74 | self.pi, self.q1, self.q2, self.q1_pi = mlp_actor_critic(self.x_ph, self.a_ph, **ac_kwargs) 75 | 76 | # Target policy network 77 | with tf.variable_scope('target'): 78 | pi_targ, _, _, _ = mlp_actor_critic(self.x2_ph, self.a_ph, **ac_kwargs) 79 | 80 | # Target Q networks 81 | with tf.variable_scope('target', reuse=True): 82 | 83 | # Target policy smoothing, by adding clipped noise to target actions 84 | epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) 85 | epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) 86 | a2 = pi_targ + epsilon 87 | a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit) 88 | 89 | # Target Q-values, using action from target policy 90 | _, q1_targ, q2_targ, _ = mlp_actor_critic(self.x2_ph, a2, **ac_kwargs) 91 | 92 | # Experience buffer 93 | self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=self.act_dim, size=replay_size) 94 | 95 | # Count variables 96 | var_counts = tuple(core.count_vars(scope) 97 | for scope in ['main/pi', 98 | 'main/q1', 99 | 'main/q2', 100 | 'main']) 101 | print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) 102 | 103 | # Bellman backup for Q functions, using Clipped Double-Q targets 104 | min_q_targ = tf.minimum(q1_targ, q2_targ) 105 | backup = tf.stop_gradient(self.r_ph + gamma * (1 - self.d_ph) * min_q_targ) 106 | 107 | # TD3 losses 108 | self.pi_loss = -tf.reduce_mean(self.q1_pi) 109 | q1_loss = tf.reduce_mean((self.q1 - backup) ** 2) 110 | q2_loss = tf.reduce_mean((self.q2 - 
backup) ** 2) 111 | # 为啥这里的loss是加起来的? 112 | self.q_loss = q1_loss + q2_loss 113 | 114 | # Separate train ops for pi, q 115 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 116 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 117 | self.train_pi_op = pi_optimizer.minimize(self.pi_loss, 118 | var_list=get_vars('main/pi')) 119 | # 这里的参数,怎么是总的q? 120 | # 难道这里的字符串只需要匹配就好了? 121 | self.train_q_op = q_optimizer.minimize(self.q_loss, 122 | var_list=get_vars('main/q')) 123 | 124 | # Polyak averaging for target variables 125 | self.target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 126 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 127 | 128 | # Initializing targets to match main variables 129 | target_init = tf.group([tf.assign(v_targ, v_main) 130 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 131 | 132 | self.sess = tf.Session() 133 | self.sess.run(tf.global_variables_initializer()) 134 | self.sess.run(target_init) 135 | 136 | def get_action(self, s, noise_scale=0): 137 | if not noise_scale: 138 | noise_scale = self.action_noise 139 | a = self.sess.run(self.pi, feed_dict={self.x_ph: s.reshape(1, -1)})[0] 140 | a += noise_scale * np.random.randn(self.act_dim) 141 | return np.clip(a, -self.act_limit, self.act_limit) 142 | 143 | def store_transition(self, transition): 144 | (s, a, r, s_, done) = transition 145 | self.replay_buffer.store(s, a, r, s_, done) 146 | 147 | def test_agent(self, env, max_ep_len=1000, n=5): 148 | ep_reward_list = [] 149 | for j in range(n): 150 | s = env.reset() 151 | ep_reward = 0 152 | for i in range(max_ep_len): 153 | # Take deterministic actions at test time (noise_scale=0) 154 | s, r, d, _ = env.step(self.get_action(s)) 155 | ep_reward += r 156 | ep_reward_list.append(ep_reward) 157 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 158 | return mean_ep_reward 159 | 160 | def learn(self, batch_size=100): 161 | 162 | batch = self.replay_buffer.sample_batch(batch_size) 163 | feed_dict = {self.x_ph: batch['obs1'], 164 | self.x2_ph: batch['obs2'], 165 | self.a_ph: batch['acts'], 166 | self.r_ph: batch['rews'], 167 | self.d_ph: batch['done'] 168 | } 169 | q_step_ops = [self.q_loss, self.q1, self.q2, self.train_q_op] 170 | outs = self.sess.run(q_step_ops, feed_dict) 171 | 172 | if self.learn_step % self.policy_delay == 0: 173 | # Delayed policy update 174 | outs = self.sess.run([self.pi_loss, self.train_pi_op, self.target_update], 175 | feed_dict) 176 | self.learn_step += 1 177 | 178 | def load_step_network(self, saver, load_path): 179 | checkpoint = tf.train.get_checkpoint_state(load_path) 180 | if checkpoint and checkpoint.model_checkpoint_path: 181 | saver.restore(self.sess, tf.train.latest_checkpoint(load_path)) 182 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 183 | self.learn_step = int(checkpoint.model_checkpoint_path.split('-')[-1]) 184 | else: 185 | print("Could not find old network weights") 186 | 187 | def save_step_network(self, time_step, saver, save_path): 188 | saver.save(self.sess, save_path + 'network', global_step=time_step, 189 | write_meta_graph=False) 190 | 191 | def load_simple_network(self, path): 192 | saver = tf.train.Saver() 193 | saver.restore(self.sess, tf.train.latest_checkpoint(path)) 194 | print("restore model successful") 195 | 196 | def save_simple_network(self, save_path): 197 | saver = tf.train.Saver() 198 | saver.save(self.sess, save_path=save_path + "/params", write_meta_graph=False) 199 | 200 | 201 | if __name__ == 
'__main__': 202 | import argparse 203 | 204 | random_seed = int(time.time() * 1000 % 1000) 205 | parser = argparse.ArgumentParser() 206 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 207 | parser.add_argument('--hid', type=int, default=300) 208 | parser.add_argument('--l', type=int, default=1) 209 | parser.add_argument('--gamma', type=float, default=0.99) 210 | parser.add_argument('--seed', '-s', type=int, default=random_seed) 211 | parser.add_argument('--epochs', type=int, default=3000) 212 | parser.add_argument('--max_steps', type=int, default=1000) 213 | parser.add_argument('--exp_name', type=str, default='td3_class') 214 | args = parser.parse_args() 215 | 216 | env = gym.make(args.env) 217 | env = env.unwrapped 218 | env.seed(args.seed) 219 | 220 | s_dim = env.observation_space.shape[0] 221 | a_dim = env.action_space.shape[0] 222 | a_bound = env.action_space.high[0] 223 | 224 | net = TD3(a_dim, s_dim, a_bound, 225 | batch_size=100, 226 | ) 227 | ep_reward_list = [] 228 | test_ep_reward_list = [] 229 | 230 | for i in range(args.epochs): 231 | s = env.reset() 232 | ep_reward = 0 233 | for j in range(args.max_steps): 234 | 235 | # Add exploration noise 236 | if i < 10: 237 | a = np.random.rand(a_dim) * a_bound 238 | else: 239 | # a = net.choose_action(s) 240 | a = net.get_action(s, 0.1) 241 | # a = noise.add_noise(a) 242 | 243 | a = np.clip(a, -a_bound, a_bound) 244 | 245 | s_, r, done, info = env.step(a) 246 | done = False if j == args.max_steps - 1 else done 247 | 248 | net.store_transition((s, a, r, s_, done)) 249 | 250 | s = s_ 251 | ep_reward += r 252 | if j == args.max_steps - 1: 253 | 254 | for _ in range(args.max_steps): 255 | net.learn() 256 | 257 | ep_reward_list.append(ep_reward) 258 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 259 | # 'Explore: %.2f' % var, 260 | "learn step:", net.learn_step) 261 | # if ep_reward > -300:RENDER = True 262 | 263 | # 增加测试部分! 264 | if i % 20 == 0: 265 | test_ep_reward = net.test_agent(env=env, n=5) 266 | test_ep_reward_list.append(test_ep_reward) 267 | print("-" * 20) 268 | print('Episode:', i, ' Reward: %i' % int(ep_reward), 269 | 'Test Reward: %i' % int(test_ep_reward), 270 | ) 271 | print("-" * 20) 272 | 273 | break 274 | 275 | import matplotlib.pyplot as plt 276 | 277 | plt.plot(ep_reward_list) 278 | img_name = str(args.exp_name + "_" + args.env + "_epochs" + 279 | str(args.epochs) + 280 | "_seed" + str(args.seed)) 281 | plt.title(img_name + "_train") 282 | plt.savefig(img_name + ".png") 283 | plt.show() 284 | plt.close() 285 | 286 | plt.plot(test_ep_reward_list) 287 | plt.title(img_name + "_test") 288 | plt.savefig(img_name + ".png") 289 | plt.show() 290 | 291 | -------------------------------------------------------------------------------- /td3_sp/TD3_sp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from td3_sp import core 8 | from td3_sp.core import get_vars, mlp_actor_critic 9 | 10 | 11 | class ReplayBuffer: 12 | """ 13 | A simple FIFO experience replay buffer for TD3 agents. 
14 | """ 15 | 16 | def __init__(self, obs_dim, act_dim, size): 17 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 18 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 20 | self.rews_buf = np.zeros(size, dtype=np.float32) 21 | self.done_buf = np.zeros(size, dtype=np.float32) 22 | self.ptr, self.size, self.max_size = 0, 0, size 23 | 24 | def store(self, obs, act, rew, next_obs, done): 25 | self.obs1_buf[self.ptr] = obs 26 | self.obs2_buf[self.ptr] = next_obs 27 | self.acts_buf[self.ptr] = act 28 | self.rews_buf[self.ptr] = rew 29 | self.done_buf[self.ptr] = done 30 | self.ptr = (self.ptr + 1) % self.max_size 31 | self.size = min(self.size + 1, self.max_size) 32 | 33 | def sample_batch(self, batch_size=32): 34 | idxs = np.random.randint(0, self.size, size=batch_size) 35 | return dict(obs1=self.obs1_buf[idxs], 36 | obs2=self.obs2_buf[idxs], 37 | acts=self.acts_buf[idxs], 38 | rews=self.rews_buf[idxs], 39 | done=self.done_buf[idxs]) 40 | 41 | 42 | """ 43 | 44 | TD3 (Twin Delayed DDPG) 45 | 46 | """ 47 | 48 | 49 | def td3(env_fn, mlp_actor_critic=core.mlp_actor_critic, 50 | ac_kwargs=dict(), seed=0, 51 | steps_per_epoch=5000, epochs=250, 52 | replay_size=int(1e6), gamma=0.99, 53 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 54 | batch_size=100, start_steps=10000, 55 | act_noise=0.1, target_noise=0.2, 56 | noise_clip=0.5, policy_delay=2, 57 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 58 | 59 | tf.set_random_seed(seed) 60 | np.random.seed(seed) 61 | 62 | env, test_env = env_fn(), env_fn() 63 | obs_dim = env.observation_space.shape[0] 64 | act_dim = env.action_space.shape[0] 65 | 66 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 
67 | act_limit = env.action_space.high[0] 68 | 69 | # Share information about action space with policy architecture 70 | ac_kwargs['action_space'] = env.action_space.high[0] 71 | 72 | # Inputs to computation graph 73 | x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) 74 | 75 | # Main outputs from computation graph 76 | with tf.variable_scope('main'): 77 | pi, q1, q2, q1_pi = mlp_actor_critic(x_ph, a_ph, **ac_kwargs) 78 | 79 | # Target policy network 80 | with tf.variable_scope('target'): 81 | pi_targ, _, _, _ = mlp_actor_critic(x2_ph, a_ph, **ac_kwargs) 82 | 83 | # Target Q networks 84 | with tf.variable_scope('target', reuse=True): 85 | 86 | # Target policy smoothing, by adding clipped noise to target actions 87 | epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) 88 | epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) 89 | a2 = pi_targ + epsilon 90 | a2 = tf.clip_by_value(a2, -act_limit, act_limit) 91 | 92 | # Target Q-values, using action from target policy 93 | _, q1_targ, q2_targ, _ = mlp_actor_critic(x2_ph, a2, **ac_kwargs) 94 | 95 | # Experience buffer 96 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 97 | 98 | # Count variables 99 | var_counts = tuple(core.count_vars(scope) 100 | for scope in ['main/pi', 101 | 'main/q1', 102 | 'main/q2', 103 | 'main']) 104 | print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) 105 | 106 | # Bellman backup for Q functions, using Clipped Double-Q targets 107 | min_q_targ = tf.minimum(q1_targ, q2_targ) 108 | backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) 109 | 110 | # TD3 losses 111 | pi_loss = -tf.reduce_mean(q1_pi) 112 | q1_loss = tf.reduce_mean((q1 - backup) ** 2) 113 | q2_loss = tf.reduce_mean((q2 - backup) ** 2) 114 | # The two critic losses are summed so that one optimizer step updates both Q-networks. 115 | q_loss = q1_loss + q2_loss 116 | 117 | # Separate train ops for pi, q 118 | pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) 119 | q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) 120 | train_pi_op = pi_optimizer.minimize(pi_loss, 121 | var_list=get_vars('main/pi')) 122 | # get_vars matches variables by substring, so 'main/q' picks up both 'main/q1' and 'main/q2'; 123 | # the single train op below therefore updates both critics with the summed q_loss. 
124 | train_q_op = q_optimizer.minimize(q_loss, 125 | var_list=get_vars('main/q')) 126 | 127 | # Polyak averaging for target variables 128 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 129 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 130 | 131 | # Initializing targets to match main variables 132 | target_init = tf.group([tf.assign(v_targ, v_main) 133 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 134 | 135 | sess = tf.Session() 136 | sess.run(tf.global_variables_initializer()) 137 | sess.run(target_init) 138 | 139 | def get_action(o, noise_scale): 140 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] 141 | a += noise_scale * np.random.randn(act_dim) 142 | return np.clip(a, -act_limit, act_limit) 143 | 144 | def test_agent(n=10): 145 | for j in range(n): 146 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 147 | while not (d or (ep_len == max_ep_len)): 148 | # Take deterministic actions at test time (noise_scale=0) 149 | o, r, d, _ = test_env.step(get_action(o, 0)) 150 | ep_ret += r 151 | ep_len += 1 152 | 153 | start_time = time.time() 154 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 155 | total_steps = steps_per_epoch * epochs 156 | 157 | ep_ret_list = [] 158 | episode = 0 159 | 160 | for t in range(total_steps): 161 | 162 | """ 163 | Until start_steps have elapsed, randomly sample actions 164 | from a uniform distribution for better exploration. Afterwards, 165 | use the learned policy (with some noise, via act_noise). 166 | """ 167 | if t > start_steps: 168 | a = get_action(o, act_noise) 169 | else: 170 | a = env.action_space.sample() 171 | 172 | # Step the env 173 | o2, r, d, _ = env.step(a) 174 | ep_ret += r 175 | ep_len += 1 176 | 177 | # Ignore the "done" signal if it comes from hitting the time 178 | # horizon (that is, when it's an artificial terminal signal 179 | # that isn't based on the agent's state) 180 | d = False if ep_len == max_ep_len else d 181 | 182 | # Store experience to replay buffer 183 | replay_buffer.store(o, a, r, o2, d) 184 | 185 | # Super critical, easy to overlook step: make sure to update 186 | # most recent observation! 187 | o = o2 188 | 189 | if d or (ep_len == max_ep_len): 190 | """ 191 | Perform all TD3 updates at the end of the trajectory 192 | (in accordance with source code of TD3 published by 193 | original authors). 194 | """ 195 | episode += 1 196 | ep_ret_list.append(ep_ret) 197 | epoch = t // steps_per_epoch 198 | print("Epoch:", epoch) 199 | print("Episode:", episode) 200 | print("Training Step:", t) 201 | print("Episode Reward:", ep_ret) 202 | for j in range(ep_len): 203 | batch = replay_buffer.sample_batch(batch_size) 204 | feed_dict = {x_ph: batch['obs1'], 205 | x2_ph: batch['obs2'], 206 | a_ph: batch['acts'], 207 | r_ph: batch['rews'], 208 | d_ph: batch['done'] 209 | } 210 | q_step_ops = [q_loss, q1, q2, train_q_op] 211 | outs = sess.run(q_step_ops, feed_dict) 212 | 213 | if j % policy_delay == 0: 214 | # Delayed policy update 215 | outs = sess.run([pi_loss, train_pi_op, target_update], 216 | feed_dict) 217 | 218 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 219 | 220 | # End of epoch wrap-up 221 | if t > 0 and t % steps_per_epoch == 0: 222 | 223 | # Test the performance of the deterministic version of the agent. 
224 | test_agent() 225 | 226 | import matplotlib.pyplot as plt 227 | plt.plot(ep_ret_list) 228 | plt.show() 229 | 230 | 231 | if __name__ == '__main__': 232 | import argparse 233 | 234 | parser = argparse.ArgumentParser() 235 | parser.add_argument('--env', type=str, default='Hopper-v2') 236 | parser.add_argument('--hid', type=int, default=300) 237 | parser.add_argument('--l', type=int, default=1) 238 | parser.add_argument('--gamma', type=float, default=0.99) 239 | parser.add_argument('--seed', '-s', type=int, default=3) 240 | parser.add_argument('--epochs', type=int, default=250) 241 | parser.add_argument('--exp_name', type=str, default='td3') 242 | args = parser.parse_args() 243 | 244 | td3(lambda: gym.make(args.env), mlp_actor_critic=core.mlp_actor_critic, 245 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 246 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 247 | ) 248 | 249 | -------------------------------------------------------------------------------- /td3_sp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaixindelele/DRL-tensorflow/a754d0a6ea7c35f191e8d9e2a1e59c6213b935ba/td3_sp/__init__.py -------------------------------------------------------------------------------- /td3_sp/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, 7 | shape=(None,dim) if dim else (None,)) 8 | 9 | 10 | def placeholders(*args): 11 | return [placeholder(dim) for dim in args] 12 | 13 | 14 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 15 | for h in hidden_sizes[:-1]: 16 | x = tf.layers.dense(x, units=h, activation=activation) 17 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 18 | 19 | 20 | def get_vars(scope): 21 | return [x for x in tf.global_variables() if scope in x.name] 22 | 23 | 24 | def count_vars(scope): 25 | v = get_vars(scope) 26 | return sum([np.prod(var.shape.as_list()) for var in v]) 27 | 28 | 29 | """ 30 | Actor-Critics 31 | """ 32 | 33 | 34 | def mlp_actor_critic(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu, 35 | output_activation=tf.tanh, action_space=None): 36 | act_dim = a.shape.as_list()[-1] 37 | act_limit = action_space 38 | with tf.variable_scope('pi'): 39 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], 40 | activation, output_activation) 41 | with tf.variable_scope('q1'): 42 | q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), 43 | list(hidden_sizes)+[1], 44 | activation, None), axis=1) 45 | with tf.variable_scope('q2'): 46 | q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), 47 | list(hidden_sizes)+[1], 48 | activation, None), axis=1) 49 | with tf.variable_scope('q1', reuse=True): 50 | q1_pi = tf.squeeze(mlp(tf.concat([x, pi], axis=-1), 51 | list(hidden_sizes)+[1], 52 | activation, None), axis=1) 53 | return pi, q1, q2, q1_pi 54 | -------------------------------------------------------------------------------- /td3_sp/td3_origin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import time 5 | import sys 6 | sys.path.append("../") 7 | from td3_sp import core 8 | from td3_sp.core import get_vars, mlp_actor_critic 9 | from sp_utils.logx import EpochLogger, setup_logger_kwargs 10 | 11 | 12 | class ReplayBuffer: 13 | """ 14 | A simple FIFO 
experience replay buffer for TD3 agents. 15 | """ 16 | 17 | def __init__(self, obs_dim, act_dim, size): 18 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 19 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 20 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 21 | self.rews_buf = np.zeros(size, dtype=np.float32) 22 | self.done_buf = np.zeros(size, dtype=np.float32) 23 | self.ptr, self.size, self.max_size = 0, 0, size 24 | 25 | def store(self, obs, act, rew, next_obs, done): 26 | self.obs1_buf[self.ptr] = obs 27 | self.obs2_buf[self.ptr] = next_obs 28 | self.acts_buf[self.ptr] = act 29 | self.rews_buf[self.ptr] = rew 30 | self.done_buf[self.ptr] = done 31 | self.ptr = (self.ptr + 1) % self.max_size 32 | self.size = min(self.size + 1, self.max_size) 33 | 34 | def sample_batch(self, batch_size=32): 35 | idxs = np.random.randint(0, self.size, size=batch_size) 36 | return dict(obs1=self.obs1_buf[idxs], 37 | obs2=self.obs2_buf[idxs], 38 | acts=self.acts_buf[idxs], 39 | rews=self.rews_buf[idxs], 40 | done=self.done_buf[idxs]) 41 | 42 | 43 | """ 44 | 45 | TD3 (Twin Delayed DDPG) 46 | 47 | """ 48 | 49 | 50 | def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 51 | steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, 52 | polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 53 | act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, 54 | max_ep_len=1000, logger_kwargs=dict(), save_freq=1): 55 | """ 56 | 57 | Args: 58 | env_fn : A function which creates a copy of the environment. 59 | The environment must satisfy the OpenAI Gym API. 60 | 61 | actor_critic: A function which takes in placeholder symbols 62 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 63 | outputs from the agent's Tensorflow computation graph: 64 | 65 | =========== ================ ====================================== 66 | Symbol Shape Description 67 | =========== ================ ====================================== 68 | ``pi`` (batch, act_dim) | Deterministically computes actions 69 | | from policy given states. 70 | ``q1`` (batch,) | Gives one estimate of Q* for 71 | | states in ``x_ph`` and actions in 72 | | ``a_ph``. 73 | ``q2`` (batch,) | Gives another estimate of Q* for 74 | | states in ``x_ph`` and actions in 75 | | ``a_ph``. 76 | ``q1_pi`` (batch,) | Gives the composition of ``q1`` and 77 | | ``pi`` for states in ``x_ph``: 78 | | q1(x, pi(x)). 79 | =========== ================ ====================================== 80 | 81 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 82 | function you provided to TD3. 83 | 84 | seed (int): Seed for random number generators. 85 | 86 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 87 | for the agent and the environment in each epoch. 88 | 89 | epochs (int): Number of epochs to run and train agent. 90 | 91 | replay_size (int): Maximum length of replay buffer. 92 | 93 | gamma (float): Discount factor. (Always between 0 and 1.) 94 | 95 | polyak (float): Interpolation factor in polyak averaging for target 96 | networks. Target networks are updated towards main networks 97 | according to: 98 | 99 | .. math:: \\theta_{\\text{targ}} \\leftarrow 100 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 101 | 102 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 103 | close to 1.) 104 | 105 | pi_lr (float): Learning rate for policy. 106 | 107 | q_lr (float): Learning rate for Q-networks. 
108 | 
109 |         batch_size (int): Minibatch size for SGD.
110 | 
111 |         start_steps (int): Number of steps for uniform-random action selection,
112 |             before running real policy. Helps exploration.
113 | 
114 |         act_noise (float): Stddev for Gaussian exploration noise added to
115 |             policy at training time. (At test time, no noise is added.)
116 | 
117 |         target_noise (float): Stddev for smoothing noise added to target
118 |             policy.
119 | 
120 |         noise_clip (float): Limit for absolute value of target policy
121 |             smoothing noise.
122 | 
123 |         policy_delay (int): Policy will only be updated once every
124 |             policy_delay times for each update of the Q-networks.
125 | 
126 |         max_ep_len (int): Maximum length of trajectory / episode / rollout.
127 | 
128 |         logger_kwargs (dict): Keyword args for EpochLogger.
129 | 
130 |         save_freq (int): How often (in terms of gap between epochs) to save
131 |             the current policy and value function.
132 | 
133 |     """
134 | 
135 |     logger = EpochLogger(**logger_kwargs)
136 |     print("locals()", locals())
137 |     logger.save_config(locals())
138 | 
139 |     tf.set_random_seed(seed)
140 |     np.random.seed(seed)
141 | 
142 |     env, test_env = env_fn(), env_fn()
143 |     obs_dim = env.observation_space.shape[0]
144 |     act_dim = env.action_space.shape[0]
145 | 
146 |     # Action limit for clamping: critically, assumes all dimensions share the same bound!
147 |     act_limit = env.action_space.high[0]
148 | 
149 |     # Pass the scalar action bound to the policy architecture (td3_sp/core.mlp_actor_critic expects the limit, not the Box object)
150 |     ac_kwargs['action_space'] = env.action_space.high[0]
151 | 
152 |     # Inputs to computation graph
153 |     x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
154 | 
155 |     # Main outputs from computation graph
156 |     with tf.variable_scope('main'):
157 |         pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
158 | 
159 |     # Target policy network
160 |     with tf.variable_scope('target'):
161 |         pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)
162 | 
163 |     # Target Q networks
164 |     with tf.variable_scope('target', reuse=True):
165 | 
166 |         # Target policy smoothing, by adding clipped noise to target actions
167 |         epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
168 |         epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
169 |         a2 = pi_targ + epsilon
170 |         a2 = tf.clip_by_value(a2, -act_limit, act_limit)
171 | 
172 |         # Target Q-values, using action from target policy
173 |         _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)
174 | 
175 |     # Experience buffer
176 |     replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
177 | 
178 |     # Count variables
179 |     var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
180 |     print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)
181 | 
182 |     # Bellman backup for Q functions, using Clipped Double-Q targets
183 |     min_q_targ = tf.minimum(q1_targ, q2_targ)
184 |     backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)
185 | 
186 |     # TD3 losses
187 |     pi_loss = -tf.reduce_mean(q1_pi)
188 |     q1_loss = tf.reduce_mean((q1 - backup) ** 2)
189 |     q2_loss = tf.reduce_mean((q2 - backup) ** 2)
190 |     q_loss = q1_loss + q2_loss
191 | 
192 |     # Separate train ops for pi, q
193 |     pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
194 |     q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
195 |     train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
196 |     train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
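    # A sense of scale for the Polyak update defined just below: with the default
    # polyak=0.995, each call moves a target weight only 0.5% of the way toward its
    # main counterpart, e.g. v_targ = 0.995 * 1.0 + 0.005 * 2.0 = 1.005, so the
    # Bellman targets used above change slowly and stay stable.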
197 | 198 | # Polyak averaging for target variables 199 | target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 200 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 201 | 202 | # Initializing targets to match main variables 203 | target_init = tf.group([tf.assign(v_targ, v_main) 204 | for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) 205 | 206 | sess = tf.Session() 207 | sess.run(tf.global_variables_initializer()) 208 | sess.run(target_init) 209 | 210 | # Setup model saving 211 | logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2}) 212 | 213 | def get_action(o, noise_scale): 214 | a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] 215 | a += noise_scale * np.random.randn(act_dim) 216 | return np.clip(a, -act_limit, act_limit) 217 | 218 | def test_agent(n=10): 219 | for j in range(n): 220 | o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 221 | while not (d or (ep_len == max_ep_len)): 222 | # Take deterministic actions at test time (noise_scale=0) 223 | o, r, d, _ = test_env.step(get_action(o, 0)) 224 | ep_ret += r 225 | ep_len += 1 226 | logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) 227 | 228 | start_time = time.time() 229 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 230 | total_steps = steps_per_epoch * epochs 231 | 232 | # Main loop: collect experience in env and update/log each epoch 233 | for t in range(total_steps): 234 | 235 | """ 236 | Until start_steps have elapsed, randomly sample actions 237 | from a uniform distribution for better exploration. Afterwards, 238 | use the learned policy (with some noise, via act_noise). 239 | """ 240 | if t > start_steps: 241 | a = get_action(o, act_noise) 242 | else: 243 | a = env.action_space.sample() 244 | 245 | # Step the env 246 | o2, r, d, _ = env.step(a) 247 | ep_ret += r 248 | ep_len += 1 249 | 250 | # Ignore the "done" signal if it comes from hitting the time 251 | # horizon (that is, when it's an artificial terminal signal 252 | # that isn't based on the agent's state) 253 | d = False if ep_len == max_ep_len else d 254 | 255 | # Store experience to replay buffer 256 | replay_buffer.store(o, a, r, o2, d) 257 | 258 | # Super critical, easy to overlook step: make sure to update 259 | # most recent observation! 260 | o = o2 261 | 262 | if d or (ep_len == max_ep_len): 263 | """ 264 | Perform all TD3 updates at the end of the trajectory 265 | (in accordance with source code of TD3 published by 266 | original authors). 
267 | """ 268 | for j in range(ep_len): 269 | batch = replay_buffer.sample_batch(batch_size) 270 | feed_dict = {x_ph: batch['obs1'], 271 | x2_ph: batch['obs2'], 272 | a_ph: batch['acts'], 273 | r_ph: batch['rews'], 274 | d_ph: batch['done'] 275 | } 276 | q_step_ops = [q_loss, q1, q2, train_q_op] 277 | outs = sess.run(q_step_ops, feed_dict) 278 | logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) 279 | 280 | if j % policy_delay == 0: 281 | # Delayed policy update 282 | outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) 283 | logger.store(LossPi=outs[0]) 284 | 285 | logger.store(EpRet=ep_ret, EpLen=ep_len) 286 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 287 | 288 | # End of epoch wrap-up 289 | if t > 0 and t % steps_per_epoch == 0: 290 | epoch = t // steps_per_epoch 291 | 292 | # Save model 293 | if (epoch % save_freq == 0) or (epoch == epochs - 1): 294 | logger.save_state({'env': env}, None) 295 | 296 | # Test the performance of the deterministic version of the agent. 297 | test_agent() 298 | 299 | # Log info about epoch 300 | logger.log_tabular('Epoch', epoch) 301 | logger.log_tabular('EpRet', with_min_and_max=True) 302 | logger.log_tabular('TestEpRet', with_min_and_max=True) 303 | logger.log_tabular('EpLen', average_only=True) 304 | logger.log_tabular('TestEpLen', average_only=True) 305 | logger.log_tabular('TotalEnvInteracts', t) 306 | logger.log_tabular('Q1Vals', with_min_and_max=True) 307 | logger.log_tabular('Q2Vals', with_min_and_max=True) 308 | logger.log_tabular('LossPi', average_only=True) 309 | logger.log_tabular('LossQ', average_only=True) 310 | logger.log_tabular('Time', time.time() - start_time) 311 | logger.dump_tabular() 312 | 313 | 314 | if __name__ == '__main__': 315 | import argparse 316 | 317 | parser = argparse.ArgumentParser() 318 | parser.add_argument('--env', type=str, default='Hopper-v2') 319 | parser.add_argument('--hid', type=int, default=300) 320 | parser.add_argument('--l', type=int, default=1) 321 | parser.add_argument('--gamma', type=float, default=0.99) 322 | parser.add_argument('--seed', '-s', type=int, default=0) 323 | parser.add_argument('--epochs', type=int, default=50) 324 | parser.add_argument('--exp_name', type=str, default='td3') 325 | args = parser.parse_args() 326 | 327 | logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) 328 | 329 | td3(lambda: gym.make(args.env), actor_critic=core.mlp_actor_critic, 330 | ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), 331 | gamma=args.gamma, seed=args.seed, epochs=args.epochs, 332 | logger_kwargs=logger_kwargs) 333 | -------------------------------------------------------------------------------- /td3_sp/test_gym_td3_sp_class.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import gym 4 | import time 5 | import sys 6 | 7 | sys.path.append("../") 8 | from td3_sp.TD3_class import TD3 9 | 10 | MAX_EPISODES = 250 11 | MAX_EP_STEPS = 1000 12 | 13 | RENDER = False 14 | ENV_NAME = 'Hopper-v2' 15 | 16 | 17 | def test_agent(net, env, n=10): 18 | ep_reward_list = [] 19 | for j in range(n): 20 | s = env.reset() 21 | ep_reward = 0 22 | for i in range(MAX_EP_STEPS): 23 | # Take deterministic actions at test time (noise_scale=0) 24 | s, r, d, _ = env.step(net.get_action(s)) 25 | ep_reward += r 26 | 27 | ep_reward_list.append(ep_reward) 28 | mean_ep_reward = np.mean(np.array(ep_reward_list)) 29 | return mean_ep_reward 30 | 31 | 32 | def main(): 33 | 34 | env = gym.make(ENV_NAME) 35 | 
env = env.unwrapped
36 |     env.seed(4)
37 | 
38 |     s_dim = env.observation_space.shape[0]
39 |     a_dim = env.action_space.shape[0]
40 |     a_bound = env.action_space.high[0]
41 | 
42 |     net = TD3(a_dim, s_dim, a_bound,
43 |               batch_size=100,
44 |               )
45 |     ep_reward_list = []
46 |     test_ep_reward_list = []
47 |     for i in range(MAX_EPISODES):
48 |         s = env.reset()
49 |         ep_reward = 0
50 |         for j in range(MAX_EP_STEPS):
51 |             if RENDER:
52 |                 env.render()
53 | 
54 |             # Add exploration noise
55 |             if i < 10:
56 |                 a = np.random.uniform(-a_bound, a_bound, size=a_dim)  # warm-up: cover the full action range (np.random.rand only gave [0, a_bound))
57 |             else:
58 |                 # a = net.choose_action(s)
59 |                 a = net.get_action(s, 0.1)
60 |                 # a = noise.add_noise(a)
61 | 
62 |             a = np.clip(a, -a_bound, a_bound)
63 | 
64 |             s_, r, done, info = env.step(a)
65 |             done = False if j == MAX_EP_STEPS-1 else done
66 | 
67 |             net.store_transition((s, a, r, s_, done))
68 | 
69 |             s = s_
70 |             ep_reward += r
71 |             if j == MAX_EP_STEPS - 1:
72 | 
73 |                 for _ in range(MAX_EP_STEPS):
74 |                     net.learn()
75 | 
76 |                 ep_reward_list.append(ep_reward)
77 |                 print('Episode:', i, ' Reward: %i' % int(ep_reward),
78 |                       # 'Explore: %.2f' % var,
79 |                       "learn step:", net.learn_step)
80 |                 # if ep_reward > -300:RENDER = True
81 | 
82 |                 # Evaluation: periodically test the current policy
83 |                 if i % 20 == 0:
84 |                     test_ep_reward = test_agent(net=net, env=env, n=5)
85 |                     test_ep_reward_list.append(test_ep_reward)
86 |                     print("-"*20)
87 |                     print('Episode:', i, ' Reward: %i' % int(ep_reward),
88 |                           'Test Reward: %i' % int(test_ep_reward),
89 |                           )
90 |                     print("-" * 20)
91 | 
92 |                 break
93 | 
94 |     plt.plot(ep_reward_list)
95 |     plt.show()
96 |     plt.plot(test_ep_reward_list)
97 |     plt.show()
98 | 
99 | 
100 | if __name__ == '__main__':
101 |     main()
102 | 
103 | 
--------------------------------------------------------------------------------