├── Atari ├── DQN_Boxing.py ├── DQN_MsPacman.py ├── DQN_Pong.py ├── cnn-100-random-episode │ ├── delete │ ├── part1_boxing_a_score.npy │ ├── part1_boxing_b_score.npy │ ├── part1_boxing_length.npy │ ├── part1_pong_a_score.npy │ ├── part1_pong_b_score.npy │ ├── part1_pong_length.npy │ ├── part_b_part1_MsPacman.py │ ├── part_b_part1_boxing.py │ └── part_b_part1_pong.py ├── cnn-untrained-Q-network │ ├── Mspacman │ │ └── part2_MsPacman_length.npy │ ├── boxing │ │ ├── 41.png │ │ ├── 42.png │ │ ├── 43.png │ │ ├── 44.png │ │ ├── check_data.py │ │ ├── cnn_for_boxing.py │ │ ├── delete │ │ ├── part2_boxing_a_score.npy │ │ ├── part2_boxing_b_score.npy │ │ ├── part2_boxing_difference_score.npy │ │ └── part2_boxing_length.npy │ └── pong │ │ ├── 21.png │ │ ├── 22.png │ │ ├── 23.png │ │ ├── check_data.py │ │ ├── cnn_for_pong.py │ │ ├── delete │ │ ├── part2_pong_a_score.npy │ │ ├── part2_pong_b_score.npy │ │ ├── part2_pong_difference_score.npy │ │ └── part2_pong_length.npy ├── pong │ ├── 22.png │ ├── 23.png │ ├── check_data.py │ ├── cnn_for_pong.py │ ├── delete │ ├── part2_pong_a_score.npy │ ├── part2_pong_b_score.npy │ ├── part2_pong_difference_score.npy │ └── part2_pong_length.npy └── readme.md ├── CartPole ├── different-neural-size-Q-learning │ ├── cartpole_5_neural_1000_load.py │ ├── cartpole_5_neural_1000_saved.py │ ├── cartpole_5_neural_30_load.py │ ├── cartpole_5_neural_30_saved.py │ └── delete ├── double-q-learning │ ├── cartpole_8_load.py │ ├── cartpole_8_saved.py │ └── delete ├── experience_replay │ ├── cartpole_6_buffer_replay_load.py │ ├── cartpole_6_buffer_replay_saved.py │ └── delete ├── hundred-random-episode │ ├── 100_random_episodes.py │ └── delete ├── offline-batch-Q-learning │ ├── batch_Q_learning_linear_0.001_length.png │ ├── batch_Q_learning_linear_0.001_reward.png │ ├── batch_Q_learning_neural_0.0001_length.png │ ├── batch_Q_learning_neural_0.0001_reward.png │ ├── cartpole_3_collect_data.py │ ├── cartpole_3_linear_4_load.py │ ├── cartpole_3_linear_4_saved.py │ ├── cartpole_3_neural_5_load.py │ ├── cartpole_3_neural_5_saved.py │ ├── check_data.py │ ├── delete │ ├── figure_1-3.png │ ├── length_data_part3_4_300.npy │ ├── loss_data_part3_4_300.npy │ └── reward_data_part3_4_300.npy ├── online-Q-learning │ ├── cartpole_4_neural_load.py │ ├── cartpole_4_neural_saved.py │ └── delete ├── readme ├── target-parameter │ ├── cartpole_7_target_load.py │ ├── cartpole_7_target_saved.py │ └── delete ├── three-random-episode │ ├── 3_random_episode.py │ └── delete └── train_data_2.npy ├── learning_curve ├── Capture_1.JPG ├── DQN_PICTURE.JPG ├── MsPacman0.png ├── MsPacman301.png ├── Pong19.png ├── Pong256.png ├── batch_Q_learning_linear_0.001_length.png ├── batch_Q_learning_linear_0.001_reward.png ├── batch_Q_learning_neural_0.0001_length.png ├── batch_Q_learning_neural_0.0001_reward.png ├── boxing0.png ├── boxing313.png ├── boxing_128_128.png ├── boxing_28_28.png ├── double_Q_learning_length.png ├── double_Q_learning_reward.png ├── experience_replay_length.png ├── experience_replay_reward.png ├── mapacman_28_28.png ├── mspacman_128_128.png ├── mspacman_28_28.png ├── online_Q_learning_neural_0.001_length.png ├── online_Q_learning_neural_0.001_reward.png ├── pong_128_128.png ├── pong_28_28.png ├── readme ├── target_parameter_length.png └── target_parameter_reward.png └── readme.md /Atari/DQN_MsPacman.py: -------------------------------------------------------------------------------- 1 | ### the environment MsPacman ## 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import 
scipy.misc 6 | import random 7 | import tensorflow as tf 8 | import time 9 | import os 10 | ### the environment MsPacman ## 11 | env = gym.make('MsPacman-v0') 12 | ### saved trained model ### 13 | def save_final_model(model): 14 | if not os.path.exists('./MsPacman_a_model/'): 15 | os.mkdir('./MsPacman_a_model/') 16 | saver = tf.train.Saver() 17 | saver.save(model, './MsPacman_a_model/model.checkpoint1') 18 | 19 | ### the function to change the image from RGB to greyscale ### 20 | def rgb2gray(rgb): 21 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 22 | 23 | ### resize the grey-scale frame to (x_size * y_size) ### 24 | def tran_size(x, x_size, y_size): 25 | output = rgb2gray(x) 26 | output = scipy.misc.imresize(output, size=[x_size, y_size]) 27 | return output 28 | 29 | ### stack the last four frames and flatten them to (x_size * y_size * 4) ### 30 | def stack(x, index, x_size, y_size): 31 | output = np.reshape([x[index - 4], x[index - 3], x[index - 2], x[index - 1]], [x_size, y_size, 4]) 32 | output = np.reshape(output, [-1, 4 * x_size * y_size]) 33 | return output 34 | 35 | ### define the weights in the CNN neural network ### 36 | def weight_variable(shape): 37 | output = tf.truncated_normal(shape, stddev=0.1) 38 | return tf.Variable(output) 39 | 40 | def bias_variable(shape): 41 | output = tf.constant(0.1, shape=shape) 42 | return tf.Variable(output) 43 | 44 | ### the convolution function #### 45 | def conv2d(input, Weight, strides): 46 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 47 | 48 | ### the cnn function ### 49 | def cnn_approximator(x, weight_convol_1, bias_convol_1, weight_convol_2, bias_convol_2, 50 | weight_flat, bias_flat, weight_out, bias_out,flat_width,flat_length,keep_prob): 51 | 52 | output_convol_1 = tf.nn.relu(conv2d(input=x, Weight=weight_convol_1, strides=[1, 2, 2, 1]) + bias_convol_1) 53 | output_convol_2 = tf.nn.relu( 54 | conv2d(input=output_convol_1, Weight=weight_convol_2, strides=[1, 2, 2, 1]) + bias_convol_2) 55 | 56 | output_reshape = tf.reshape(output_convol_2, [-1, flat_width* flat_length * 32]) 57 | output_flat = tf.matmul(output_reshape, weight_flat) + bias_flat 58 | 59 | out_drop=tf.nn.dropout(output_flat,keep_prob=keep_prob) 60 | ## apply the dropout output and the output weights passed into the function: 61 | ## re-creating weight_out/bias_out here would ignore the target-network weights fed in 62 | y = tf.matmul(out_drop, weight_out) + bias_out 63 | print('====the cnn approximator is running====') 64 | return y 65 | 66 | ### set hyperparameter and variables for the CNN approximator ### 67 | discount = 0.99 68 | learn_rate = 0.001 69 | eplison = 0.1 70 | action_space = 9 71 | width_size = 64 72 | length_size = 64 73 | flat_layer_width=int(width_size/4) 74 | flat_layer_length=int(length_size/4) 75 | 76 | ### the Q_value weights ### 77 | ## first layer ## 78 | weight_convol_1 = weight_variable([6, 6, 4, 16]) 79 | bias_convol_1 = bias_variable([16]) 80 | ## second layer ## 81 | weight_convol_2 = weight_variable([4, 4, 16, 32]) 82 | bias_convol_2 = bias_variable([32]) 83 | ## flat layer ## 84 | weight_flat = weight_variable([flat_layer_width * flat_layer_length * 32, 256]) 85 | bias_flat = bias_variable([256]) 86 | ## linear layer ## 87 | weight_out = weight_variable([256, action_space]) 88 | bias_out = bias_variable([action_space]) 89 | 90 | ### the target Q value weights placeholder ### 91 | 92 | Weight_convol_1_target= tf.placeholder(tf.float32,shape=[6, 6, 4, 16]) 93 | Bias_convol_1_target = tf.placeholder(tf.float32,shape=[16]) 94 | Weight_convol_2_target = tf.placeholder(tf.float32,shape=[4, 4, 16, 32]) 95 | Bias_convol_2_target = 
tf.placeholder(tf.float32,shape=[32]) 96 | Weight_flat_target = tf.placeholder(tf.float32,[flat_layer_width * flat_layer_length * 32, 256]) 97 | Bias_flat_target = tf.placeholder(tf.float32,shape=[256]) 98 | Weight_out_target = tf.placeholder(tf.float32,shape=[256, action_space]) 99 | Bias_out_target = tf.placeholder(tf.float32,shape=[action_space]) 100 | 101 | 102 | ### the Input Placeholder ### 103 | x1 = tf.placeholder(tf.float32, shape=[None, width_size * length_size * 4]) 104 | x2 = tf.placeholder(tf.float32, shape=[None, width_size * length_size * 4]) 105 | x3 = tf.placeholder(tf.float32, shape=[None, 1]) 106 | x4 = tf.placeholder(tf.int32, shape=[None, 1]) 107 | x5 = tf.placeholder(tf.int32, shape=[None, 1]) 108 | ## dropout ratio ## 109 | keep_prob = tf.placeholder(tf.float32) 110 | 111 | ### reshape the stacked pictures before into cnn model ### 112 | x_1_image = tf.reshape(x1, [-1, width_size, length_size,4]) 113 | x_2_image = tf.reshape(x2, [-1, width_size, length_size,4]) 114 | 115 | ### caucalate the q avlue and max _next value 116 | prediction_now = cnn_approximator(x_1_image, weight_convol_1, bias_convol_1, weight_convol_2, bias_convol_2, 117 | weight_flat, bias_flat, weight_out, 118 | bias_out,flat_layer_width,flat_layer_length,keep_prob) 119 | prediction_next = cnn_approximator(x_2_image, Weight_convol_1_target, Bias_convol_1_target, 120 | Weight_convol_2_target, Bias_convol_2_target, 121 | Weight_flat_target, Bias_flat_target, Weight_out_target, 122 | Bias_out_target,flat_layer_width,flat_layer_length,keep_prob) 123 | 124 | ### test action when test agent performance ### 125 | test_action = tf.cast(tf.argmax(prediction_now, 1), tf.int32) 126 | 127 | ### take Q value underlying the actual action ### 128 | True_action = tf.cast(x4, tf.int32) 129 | True_action = tf.reshape(True_action, shape=[-1, 1]) 130 | action_repeat = tf.reshape(tf.cast(x5, tf.int32), shape=[-1, 1]) 131 | action_double = tf.concat([action_repeat, True_action], 1) 132 | 133 | ### calcaulate the loss and training ### 134 | Q_value = tf.gather_nd(params=prediction_now, indices=action_double) 135 | Max_Q_value_next = tf.reduce_max(prediction_next, axis=1) 136 | print('......the reward is clipped .....') 137 | ## when the game break, just use reward as the Q target approximation ## 138 | delta=tf.add(x3 + discount * tf.stop_gradient((1+x3)*Max_Q_value_next), (-1 * Q_value)) 139 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 140 | train_optimizer_MsPacman = tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 141 | 142 | with tf.device('/cpu:0'): 143 | with tf.Session() as sess: 144 | start_time=time.time() 145 | print('======== build the experience replay ==========') 146 | ### set the variable and empty set ### 147 | length = [] 148 | total_score = [] 149 | experience_size = 100000 * 3 150 | number_episode_buffer = 700 151 | ### the buffer experience replay ### 152 | tran_size_buffer = [] 153 | start_time = time.time() 154 | experience_buffer = [] 155 | 156 | for i_buffer in range(number_episode_buffer): 157 | observation_0 = env.reset() 158 | if (i_buffer+1)%20 ==0: 159 | print('......the {} th episodes and information of ' 160 | 'observation_initial {}.....'.format(i_buffer+1,np.shape(observation_0))) 161 | 162 | Score = [] 163 | Action = [] 164 | for i_step in range(experience_size): 165 | ### collect data ### 166 | action = env.action_space.sample() 167 | #print('the action',action) 168 | observation_0, score, done, _ = env.step(action) 169 | observation_1 = tran_size(observation_0, width_size, 
length_size) 170 | #print('info score done _',score,done,_) 171 | Score.append(int(score)) 172 | Action.append(action) 173 | tran_size_buffer.append(observation_1) 174 | 175 | if (i_step + 1) % 4 == 0 and i_step >= 7: 176 | 177 | #print('......the step is {} the size of tran_size_buffer is {}......'.format(i_step + 1,np.shape(tran_size_buffer))) 178 | sub_example = [] 179 | sub_example.append(stack(tran_size_buffer, i_step + 1 - 4, width_size, length_size)) 180 | sub_example.append(stack(tran_size_buffer, i_step + 1, width_size, length_size)) 181 | ## clip the reward betweewn [-1 0 1] 182 | 183 | if sum(Score[i_step + 1 - 8:i_step + 1 - 4])==0: 184 | sub_example.append(0) 185 | else: 186 | sub_example.append(1) 187 | sub_example.append(Action[i_step - 4]) 188 | experience_buffer.append(sub_example) 189 | #print('.........the sub_example.....',sub_example[2:]) 190 | 191 | if done is True: 192 | final_example = [] 193 | #print('......the step is {} the size of tran_size_buffer is {}......'.format(i_step + 1, np.shape( 194 | #tran_size_buffer))) 195 | final_example.append(stack(tran_size_buffer, i_step + 1 - 4, width_size, length_size)) 196 | final_example.append(stack(tran_size_buffer, i_step + 1, width_size, length_size)) 197 | final_example.append(-1) 198 | final_example.append(Action[i_step - 4]) 199 | #print('the final_example.....', final_example) 200 | total_score.append(np.sum(Score, axis=0)) 201 | length.append(i_step + 1) 202 | experience_buffer.append(final_example) 203 | break 204 | tran_size_buffer=[] 205 | print('the information of generated experience buffer ', 206 | np.shape(experience_buffer), type(experience_buffer)) 207 | print('length of each episode',length) 208 | print('total score of each episode',total_score) 209 | print('==================the experience buffer process done ==============') 210 | print('==================the total generated time is {}===========' 211 | '======== (分)'.format((time.time()-start_time)/60)) 212 | 213 | print('######################################## starting training the DQN algorithm ' 214 | '###################################') 215 | 216 | #### training the DQN algorithm #### 217 | sess.run(tf.global_variables_initializer()) 218 | episode_number_training = 1000 219 | training_step=100000 220 | batch_size =32 221 | total_time=0 222 | total_training_step=0 223 | total_number_data=100000 224 | total_test_score=[] 225 | total_train_loss=[] 226 | print('....... the number of data points in experience buffer ........',np.shape(experience_buffer)) 227 | print('....... 
the number of data points in experience buffer ........', 228 | np.shape(experience_buffer[-1*total_number_data:])) 229 | for i_episode in range(1,episode_number_training): 230 | ### hold the old weights for target calculation in each 5 training episodes ### 231 | if ((i_episode - 1) % 5 == 0): 232 | weight_convol_1_target,bias_convol_1_target, weight_convol_2_target,bias_convol_2_target,\ 233 | weight_flat_target,bias_flat_target,weight_out_target,bias_out_target= \ 234 | sess.run([weight_convol_1, bias_convol_1, weight_convol_2, bias_convol_2, 235 | weight_flat,bias_flat,weight_out,bias_out]) 236 | print('-----------the target parameters updated----------') 237 | print('=========' 238 | '================================================' 239 | '{} th episode training is starting ================' 240 | '=========================================='.format(i_episode)) 241 | start_time = time.time() 242 | ## reset the environment at the beginning of each episode ## 243 | env.reset() 244 | ## the list to store the updating experience ## 245 | update_Score = [] 246 | update_Action=[] 247 | update_transition=[] 248 | action = env.action_space.sample() 249 | each_episode_loss=0 250 | for i_training in range(training_step): 251 | #env.render() 252 | observation_1,reward,done,info=env.step(action) 253 | update_Action.append(action) 254 | update_Score.append(reward) 255 | update_transition.append(tran_size(observation_1,width_size,length_size)) 256 | 257 | if i_training>=7 and (i_training+1)%4==0: 258 | ## update the experience ## 259 | update_Action,update_Score,update_transition=update_Action[-8:],\ 260 | update_Score[-8:],update_transition[-8:] 261 | experience_buffer=experience_buffer.tolist() 262 | 263 | if sum(update_Score[4:])==0: 264 | added_sum_score=0 265 | else: 266 | added_sum_score=1 267 | experience_buffer.append([stack(update_transition,4,width_size,length_size), 268 | stack(update_transition,8,width_size,length_size), 269 | added_sum_score,update_Action[4]]) 270 | ## keep the fixed size of experience replay ## 271 | experience_buffer=experience_buffer[-1*total_number_data:] 272 | experience_buffer=np.array(experience_buffer) 273 | ## take the randm action or greedy action ## 274 | if np.random.random() <= eplison: 275 | action = np.random.randint(0, 9) 276 | else: 277 | action= int(sess.run(test_action, feed_dict={x1: stack(update_transition,8,width_size,length_size), 278 | keep_prob:0.8, 279 | Weight_convol_1_target:weight_convol_1_target, 280 | Bias_convol_1_target:bias_convol_1_target, 281 | Weight_convol_2_target:weight_convol_2_target, 282 | Bias_convol_2_target:bias_convol_2_target, 283 | Weight_flat_target:weight_flat_target, 284 | Bias_flat_target:bias_flat_target, 285 | Weight_out_target:weight_out_target, 286 | Bias_out_target:bias_out_target})[0]) 287 | 288 | 289 | batch_sample = np.reshape(random.sample(range(0, len(experience_buffer)), batch_size), [-1, 1]) 290 | #print('=====the batch-sample====', batch_sample) 291 | 292 | experience_buffer = np.array(experience_buffer) 293 | mini_sample = np.reshape(experience_buffer[batch_sample], [batch_size, -1]) 294 | 295 | Input_1 = np.concatenate(mini_sample[:, 0], axis=0) 296 | Input_2 = np.concatenate(mini_sample[:, 1], axis=0) 297 | Input_3 = np.reshape(mini_sample[:, 2], [-1, 1]) 298 | Input_4 = np.reshape(mini_sample[:, 3], [-1, 1]) 299 | Input_5 = np.reshape(np.arange(batch_size), [-1, 1]) 300 | 301 | ## running the training step ## 302 | _,loss = sess.run([train_optimizer_MsPacman,q_loss],feed_dict={x1: 
Input_1,x2:Input_2,x3:Input_3,x4: Input_4, 303 | x5: Input_5,keep_prob:0.8, 304 | Weight_convol_1_target: weight_convol_1_target, 305 | Bias_convol_1_target: bias_convol_1_target, 306 | Weight_convol_2_target: weight_convol_2_target, 307 | Bias_convol_2_target: bias_convol_2_target, 308 | Weight_flat_target: weight_flat_target, 309 | Bias_flat_target: bias_flat_target, 310 | Weight_out_target: weight_out_target, 311 | Bias_out_target: bias_out_target}) 312 | each_episode_loss+=loss 313 | if done is True: 314 | ### record score for agent and computer each eposide ### 315 | ### always set -1(reward) when episode is done ### 316 | total_train_loss.append(each_episode_loss/(i_training+1)) 317 | experience_buffer=experience_buffer.tolist() 318 | 319 | experience_buffer.append([stack(update_transition,4,width_size,length_size), 320 | stack(update_transition,8,width_size,length_size), 321 | -1,update_Action[4]]) 322 | experience_buffer=np.array(experience_buffer) 323 | total_time+=(time.time()-start_time) 324 | total_training_step+=i_training 325 | 326 | if (i_episode - 1) % 5 == 0: 327 | print('*************** the {} th step trainning' 328 | ' loss is {} ***************'.format(i_training + 1, loss)) 329 | print('*************** the {} th episode trainning' 330 | ' time is {} 分 ***************'.format(i_episode,(time.time()-start_time)/60)) 331 | 332 | print('************* the total steps trainning' 333 | ' until now is {} **************'.format(total_training_step+1)) 334 | print('************* the total trainning ' 335 | 'time is {} 分 **************'.format(total_time/60)) 336 | print('=========' 337 | '================================================' 338 | '{} th episode is finished ================' 339 | '=========================================='.format(i_episode)) 340 | break 341 | ### test the agent performance until now ### 342 | if (i_episode-1)%10 == 0: 343 | print('=========' 344 | '================================================' 345 | 'After {} th training episode, the agent testing is starting' 346 | '=========================================='.format(i_episode)) 347 | test_episode_number=5 348 | test_score=0 349 | test_step=100000 350 | for i_test_number in range(test_episode_number): 351 | test_observation_0=env.reset() 352 | test_observation_0 = tran_size(test_observation_0, width_size, length_size) 353 | #test_update_Score = [] 354 | test_update_Action = [] 355 | test_update_transition = [] 356 | test_episode_action = env.action_space.sample() 357 | for i_test_step in range(test_step): 358 | #env.render() 359 | test_observation_1, test_reward, test_done, test_info = env.step(test_episode_action) 360 | test_update_transition.append(tran_size(test_observation_1,width_size,length_size)) 361 | test_score+=test_reward 362 | if (i_test_step+1)%4==0: 363 | test_update_transition=test_update_transition[-4:] 364 | test_episode_action = int(sess.run(test_action, feed_dict={ 365 | x1: stack(test_update_transition, 4, width_size, length_size),keep_prob:0.8})) 366 | if test_done: 367 | print('+++++++++++test {} th episode is done score is' 368 | ' {} until now +++++++++'.format(i_test_number,test_score)) 369 | break 370 | print('++++++++++++the test average score is++++++++++++++ ' 371 | '{}'.format(test_score/test_episode_number)) 372 | 373 | total_test_score.append(test_score / test_episode_number) 374 | ### save the model each 10 turn training ### 375 | save_final_model(sess) 376 | print('//////saved the model///////',i_episode) 377 | 378 | print('agent_score',total_test_score) 379 | 
print('train_loss',total_train_loss) 380 | np.save('agent_score',total_test_score) 381 | np.save('train_loss',total_train_loss) 382 | 383 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_boxing_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_boxing_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_boxing_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_boxing_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_boxing_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_boxing_length.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_pong_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_pong_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_pong_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_pong_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_pong_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_pong_length.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part_b_part1_MsPacman.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('MsPacman-v0') 6 | ### set the variable and empty set ### 7 | length = [] 8 | total_score=[] 9 | ### the 100 episodes ### 10 | eposide_number=100 11 | ### the x axis value ### 12 | x=np.arange(eposide_number) 13 | x=x+1 14 | 15 | for i_eposide in range(eposide_number): 16 | env.reset() 17 | Score=[] 18 | for i_step in range(100000): 19 | ### record the agent score ### 20 | 21 | action=np.random.randint(9) 22 | _, score, done,_ = env.step(action) 23 | 24 | Score.append(score) 25 | 26 | if done is True: 27 | total_score.append(np.sum(Score,axis=0)) 28 | 
length.append(i_step + 1) 29 | 30 | break 31 | std_length=np.std(length,axis=0) 32 | std_score=np.std(total_score,axis=0) 33 | 34 | 35 | 36 | 37 | print('the length...',length) 38 | print('the score of agent...',total_score) 39 | print(std_score) 40 | print(std_length) 41 | print(np.mean(total_score,axis=0)) 42 | print(np.mean(length,axis=0)) 43 | 44 | 45 | 46 | plt.plot(x,total_score) 47 | plt.xlabel('ith Num of episode') 48 | plt.ylabel('agent scores') 49 | plt.show() 50 | 51 | 52 | plt.plot(x,length) 53 | plt.xlabel('ith Num of episode') 54 | plt.ylabel('agent frames count') 55 | plt.show() 56 | 57 | np.save('part1_boxing_a_score',total_score) 58 | 59 | np.save('part1_boxing_length',length) 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part_b_part1_boxing.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('Boxing-v0') 6 | ### set the variable and empty set ### 7 | length = [] 8 | total_score_a=[] 9 | total_score_b=[] 10 | ### the 100 eposides ### 11 | eposide_number=100 12 | ### the x axis value ### 13 | x=np.arange(eposide_number) 14 | x=x+1 15 | 16 | for i_eposide in range(eposide_number): 17 | env.reset() 18 | ### recorc computer and agent scors ### 19 | Score_a=[] 20 | Score_b=[] 21 | for i_step in range(100000): 22 | ### random select action ### 23 | action=np.random.randint(18) 24 | _, score, done,_ = env.step(action) 25 | 26 | if score <0: 27 | Score_b.append(score) 28 | if score>0: 29 | Score_a.append(score) 30 | 31 | if done is True: 32 | total_score_a.append(np.sum(Score_a,axis=0)) 33 | total_score_b.append(-1*np.sum(Score_b,axis=0)) 34 | 35 | length.append(i_step + 1) 36 | break 37 | ### calculate the standard of score and frame counts ### 38 | std_length=np.std(length,axis=0) 39 | std_score=np.std(total_score_a,axis=0) 40 | 41 | print('the length...',length) 42 | print('the score of agent...',total_score_a) 43 | print('the score of computer..',total_score_b) 44 | print(std_score) 45 | print(std_length) 46 | print(np.mean(total_score_a,axis=0)) 47 | print(np.mean(length,axis=0)) 48 | 49 | 50 | ### plot the mean the score and length ### 51 | plt.plot(x,total_score_a) 52 | plt.xlabel('ith Num of episode') 53 | plt.ylabel('agent scores') 54 | plt.show() 55 | 56 | plt.plot(x,total_score_b) 57 | plt.xlabel('ith Num of episode') 58 | plt.ylabel('computer scores') 59 | plt.show() 60 | 61 | plt.plot(x,length) 62 | plt.xlabel('ith Num of episode') 63 | plt.ylabel('agent frames count') 64 | plt.show() 65 | 66 | 67 | np.save('part1_boxing_a_score',total_score_a) 68 | np.save('part1_boxing_b_score',total_score_b) 69 | np.save('part1_boxing_length',length) 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part_b_part1_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('Pong-v0') 6 | 7 | ### set the variable and empty set ### 8 | length = [] 9 | total_score_a=[] 10 | total_score_b=[] 11 | ### the 100 eposides ### 12 | eposide_number=100 13 | 14 | ### the x axis value ### 15 | x=np.arange(eposide_number) 16 | x=x+1 17 | 18 | for i_eposide in range(eposide_number): 19 | env.reset() 20 | ### record 
computer and agent scors ### 21 | Score_a=[] 22 | Score_b=[] 23 | for i_step in range(100000): 24 | #env.render() 25 | ### random select action ### 26 | action=np.random.randint(6) 27 | _, score, done,_ = env.step(action) 28 | 29 | if score <0: 30 | Score_b.append(score) 31 | if score>0: 32 | Score_a.append(score) 33 | 34 | if done is True: 35 | total_score_a.append(np.sum(Score_a,axis=0)) 36 | total_score_b.append(-1*np.sum(Score_b,axis=0)) 37 | length.append(i_step + 1) 38 | break 39 | 40 | 41 | ### calculate the standard of score and frame counts ### 42 | std_length=np.std(length,axis=0) 43 | std_score=np.std(total_score_a,axis=0) 44 | 45 | print('the length...',length) 46 | print('the score of agent...',total_score_a) 47 | print('the score of computer..',total_score_b) 48 | print(std_score) 49 | print(std_length) 50 | print(np.mean(total_score_a,axis=0)) 51 | print(np.mean(length,axis=0)) 52 | 53 | 54 | ### plot the mean the score and length ### 55 | plt.plot(x,total_score_a) 56 | plt.xlabel('ith Num of episode') 57 | plt.ylabel('agent scores') 58 | plt.show() 59 | 60 | plt.plot(x,total_score_b) 61 | plt.xlabel('ith Num of episode') 62 | plt.ylabel('computer scores') 63 | plt.show() 64 | 65 | plt.plot(x,length) 66 | plt.xlabel('ith Num of episode') 67 | plt.ylabel('agent frames count') 68 | plt.show() 69 | 70 | 71 | np.save('part1_pong_a_score',total_score_a) 72 | np.save('part1_pong_b_score',total_score_b) 73 | np.save('part1_pong_length',length) 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/Mspacman/part2_MsPacman_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/Mspacman/part2_MsPacman_length.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/41.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/42.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/43.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/44.png -------------------------------------------------------------------------------- 
/Atari/cnn-untrained-Q-network/boxing/check_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | a_score=np.load('part2_boxing_a_score.npy') 7 | b_score=np.load( 'part2_boxing_b_score.npy') 8 | length=np.load('part2_boxing_length.npy') 9 | differenct=np.load('part2_boxing_difference_score.npy') 10 | 11 | eposide_number = 100 12 | 13 | ### the x axis value ### 14 | x = np.arange(eposide_number) 15 | x = x + 1 16 | 17 | plt.plot(x,a_score ) 18 | plt.xlabel('ith Num of episode') 19 | plt.ylabel('agent scores') 20 | plt.show() 21 | 22 | plt.plot(x, b_score) 23 | plt.xlabel('ith Num of episode') 24 | plt.ylabel('computer scores') 25 | plt.show() 26 | 27 | plt.plot(x,differenct) 28 | plt.xlabel('ith Num of episode') 29 | plt.ylabel('difference between agent and computer') 30 | plt.show() 31 | 32 | 33 | plt.plot(x, length) 34 | plt.xlabel('ith Num of episode') 35 | plt.ylabel('agent frames count') 36 | plt.show() 37 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/cnn_for_boxing.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import scipy.misc 5 | 6 | import tensorflow as tf 7 | 8 | env = gym.make('Boxing-v3') 9 | 10 | 11 | ### the function to chnage image from RGB to greyscale ### 12 | def rgb2gray(rgb): 13 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 14 | 15 | ### transfer input to size(28*28*1) ### 16 | def tran_size(x): 17 | output = scipy.misc.imresize(x, size=[28, 28]) 18 | output = rgb2gray(output) 19 | return output 20 | 21 | ### stack four frames to size (28*28*4) ### 22 | def stack(x, index): 23 | output = np.reshape([x[index], x[index - 1], x[index - 2], x[index - 3]], [28,28,4]) 24 | return output 25 | 26 | ### define the weights in the CNN neural network### 27 | def weight_variable(shape): 28 | output = tf.truncated_normal(shape, stddev=0.1) 29 | return tf.Variable(output) 30 | 31 | 32 | def bias_variable(shape): 33 | output = tf.constant(0.1, shape=shape) 34 | return tf.Variable(output) 35 | 36 | ### the convolution function #### 37 | def conv2d(input, Weight,strides): 38 | 39 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 40 | 41 | ### the cnn function ### 42 | def cnn_pong(x): 43 | ### first layer ### 44 | weight_convol_1=weight_variable([6,6,4,16]) 45 | bias_convol_1=bias_variable([16]) 46 | 47 | output_convol_1=tf.nn.relu(conv2d(input=x,Weight=weight_convol_1,strides=[1,2,2,1])+bias_convol_1) 48 | ### second layer ### 49 | weight_convol_2=weight_variable([4,4,16,32]) 50 | bias_convol_2=bias_variable([32]) 51 | output_convol_2=tf.nn.relu(conv2d(input=output_convol_1,Weight=weight_convol_2,strides=[1,2,2,1])+bias_convol_2) 52 | ### flat layer ### 53 | weight_flat=weight_variable([7*7*32,256]) 54 | bias_flat=bias_variable([256]) 55 | output_reshape=tf.reshape(output_convol_2,[-1,7*7*32]) 56 | output_flat=tf.matmul(output_reshape,weight_flat)+bias_flat 57 | 58 | ### linear layer ### 59 | out_drop=tf.nn.dropout(output_flat,0.8) 60 | weight_out=weight_variable([256,action_space]) 61 | 62 | bias_out=bias_variable([action_space]) 63 | 64 | y=tf.matmul(out_drop,weight_out)+bias_out 65 | 66 | return y 67 | 68 | ### set hyperparameter and variables ### 69 | discount=0.99 70 | learn_rate=0.001 71 | eplison=0.1 72 | action_space=18 73 | 74 | 75 | 
keep_drop=tf.placeholder(tf.float32) 76 | x1=tf.placeholder(tf.float32,shape=[None,28,28,4]) 77 | x2=tf.placeholder(tf.float32,shape=[None,28,28,4]) 78 | x3=tf.placeholder(tf.float32,shape=[None,1]) 79 | x4=tf.placeholder(tf.int32,shape=[None,2]) 80 | 81 | ### calculate the Q value and the max next-state Q value ### 82 | prediction_now=cnn_pong(x1) 83 | prediction_next=cnn_pong(x2) 84 | 85 | ### test action when test agent performance ### 86 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 87 | 88 | 89 | ### calculate the loss and the training op ### 90 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 91 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 92 | 93 | delta=tf.add(x3+discount*tf.stop_gradient(Max_Q_value_next),(-1*Q_value)) 94 | q_loss=tf.reduce_sum(tf.square(delta)/2) 95 | 96 | train_optimizer=tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 97 | 98 | #### save the model #### 99 | saver=tf.train.Saver() 100 | 101 | with tf.device('/cpu:0'): 102 | with tf.Session() as sess: 103 | 104 | for i_run in range(1, 1 + 1): 105 | sess.run(tf.global_variables_initializer()) 106 | 107 | print('......start collecting data......') 108 | 109 | ### set the variable and empty set ### 110 | length = [] 111 | total_score_a = [] 112 | total_score_b = [] 113 | total_absolute=[] 114 | ### the 100 episodes ### 115 | eposide_number = 100 116 | 117 | ### the x axis value ### 118 | x = np.arange(eposide_number) 119 | x = x + 1 120 | ### the buffer experience replay ### 121 | initial_buffer = [] 122 | buffer_replay = [] 123 | for i_eposide in range(eposide_number): 124 | env.reset() 125 | ### record score for computer and agent ### 126 | Score_a = [] 127 | Score_b = [] 128 | 129 | for i_step in range(100000): 130 | 131 | if len(initial_buffer) < 4: 132 | ### collect data ### 133 | 134 | action = env.action_space.sample() 135 | obser_1, score, done, _ = env.step(action) 136 | obser_initial = tran_size(obser_1) 137 | if score < 0: 138 | Score_b.append(score) 139 | if score > 0: 140 | Score_a.append(score) 141 | #print(score, done) 142 | 143 | initial_buffer.append(obser_initial) 144 | 145 | else: 146 | 147 | state_i = stack(initial_buffer, i_step - 1) 148 | 149 | buffer_replay.append(state_i) 150 | ### select action by the epsilon-greedy policy ### 151 | if np.random.random() <= eplison: 152 | action_select=np.random.randint(action_space) 153 | ## Boxing has 18 discrete actions, so sample over the full action space ## 154 | else: 155 | 156 | action_select = sess.run(test_action, feed_dict={x1: [state_i]}) 157 | 158 | action_select = int(action_select) 159 | obser_1, score, done, _ = env.step(action_select) 160 | if score < 0: 161 | Score_b.append(score) 162 | if score > 0: 163 | Score_a.append(score) 164 | #print(score, done) 165 | 166 | obser_initial = tran_size(obser_1) 167 | 168 | initial_buffer.append(obser_initial) 169 | 170 | if done is True: 171 | ### record score for agent and computer each episode ### 172 | total_score_a.append(np.sum(Score_a, axis=0)) 173 | total_score_b.append(-1 * np.sum(Score_b, axis=0)) 174 | length.append(i_step + 1) 175 | total_absolute.append((np.sum(Score_a, axis=0)+np.sum(Score_b, axis=0))) 176 | 177 | break 178 | 179 | ### calculate the standard deviation of scores and frame counts ### 180 | std_length = np.std(length, axis=0) 181 | std_score = np.std(total_score_a, axis=0) 182 | std_score_abso=np.std(total_absolute,axis=0) 183 | 184 | print('the length...',length) 185 | print('the agent score...',total_score_a) 186 | print('the absolute value...',total_absolute) 187 | print('the std of agent score..',std_score) 188 | print('the 
std_score_abso..',std_score_abso) 189 | print('the std_length..',std_length) 190 | # print(std_length) 191 | print('the mean of total_score_a...',np.mean(total_score_a, axis=0)) 192 | print('the mean of length...',np.mean(length, axis=0)) 193 | print('the mean of total_absolute...',np.mean(total_absolute, axis=0)) 194 | 195 | ### plot the mean the score and length ### 196 | plt.plot(x, total_score_a) 197 | plt.xlabel('ith Num of episode') 198 | plt.ylabel('agent scores') 199 | plt.show() 200 | 201 | plt.plot(x, total_score_b) 202 | plt.xlabel('ith Num of episode') 203 | plt.ylabel('computer scores') 204 | plt.show() 205 | 206 | plt.plot(x, total_absolute) 207 | plt.xlabel('ith Num of episode') 208 | plt.ylabel('difference between agent and computer') 209 | plt.show() 210 | 211 | 212 | plt.plot(x, length) 213 | plt.xlabel('ith Num of episode') 214 | plt.ylabel('agent frames count') 215 | plt.show() 216 | 217 | np.save('part2_boxing_a_score', total_score_a) 218 | np.save('part2_boxing_b_score', total_score_b) 219 | np.save('part2_boxing_length', length) 220 | np.save('part2_boxing_difference_score',total_absolute) 221 | 222 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_difference_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_difference_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_length.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/21.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/22.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/22.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/23.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/check_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | a_score=np.load('part2_pong_a_score.npy') 7 | b_score=np.load( 'part2_pong_b_score.npy') 8 | length=np.load('part2_pong_length.npy') 9 | differenct=np.load('part2_pong_difference_score.npy') 10 | 11 | eposide_number = 100 12 | 13 | ### the x axis value ### 14 | x = np.arange(eposide_number) 15 | x = x + 1 16 | 17 | plt.plot(x,a_score ) 18 | plt.xlabel('ith Num of episode') 19 | plt.ylabel('agent scores') 20 | plt.show() 21 | 22 | plt.plot(x, b_score) 23 | plt.xlabel('ith Num of episode') 24 | plt.ylabel('computer scores') 25 | plt.show() 26 | 27 | plt.plot(x,differenct) 28 | plt.xlabel('ith Num of episode') 29 | plt.ylabel('difference between agent and computer') 30 | plt.show() 31 | 32 | 33 | plt.plot(x, length) 34 | plt.xlabel('ith Num of episode') 35 | plt.ylabel('agent frames count') 36 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/cnn_for_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import scipy.misc 5 | 6 | import tensorflow as tf 7 | 8 | env = gym.make('Pong-v0') 9 | 10 | 11 | ### the function to chnage image from RGB to greyscale ### 12 | def rgb2gray(rgb): 13 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 14 | 15 | ### transfer input to size(28*28*1) ### 16 | def tran_size(x): 17 | output = scipy.misc.imresize(x, size=[28, 28]) 18 | output = rgb2gray(output) 19 | return output 20 | 21 | ### stack four frames to size (28*28*4) ### 22 | def stack(x, index): 23 | output = np.reshape([x[index], x[index - 1], x[index - 2], x[index - 3]], [28,28,4]) 24 | return output 25 | 26 | ### define the weights in the CNN neural network### 27 | def weight_variable(shape): 28 | output = tf.truncated_normal(shape, stddev=0.1) 29 | return tf.Variable(output) 30 | 31 | 32 | def bias_variable(shape): 33 | output = tf.constant(0.1, shape=shape) 34 | return tf.Variable(output) 35 | 36 | ### the convolution function #### 37 | def conv2d(input, Weight,strides): 38 | 39 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 40 | 41 | ### the cnn function ### 42 | def cnn_pong(x): 43 | ### first layer ### 44 | weight_convol_1=weight_variable([6,6,4,16]) 45 | bias_convol_1=bias_variable([16]) 46 | 47 | output_convol_1=tf.nn.relu(conv2d(input=x,Weight=weight_convol_1,strides=[1,2,2,1])+bias_convol_1) 48 | ### second layer ### 49 | weight_convol_2=weight_variable([4,4,16,32]) 50 | bias_convol_2=bias_variable([32]) 51 | 
output_convol_2=tf.nn.relu(conv2d(input=output_convol_1,Weight=weight_convol_2,strides=[1,2,2,1])+bias_convol_2) 52 | ### flat layer ### 53 | weight_flat=weight_variable([7*7*32,256]) 54 | bias_flat=bias_variable([256]) 55 | output_reshape=tf.reshape(output_convol_2,[-1,7*7*32]) 56 | output_flat=tf.matmul(output_reshape,weight_flat)+bias_flat 57 | 58 | ### linear layer ### 59 | out_drop=tf.nn.dropout(output_flat,0.8) 60 | weight_out=weight_variable([256,action_space]) 61 | 62 | bias_out=bias_variable([action_space]) 63 | 64 | y=tf.matmul(out_drop,weight_out)+bias_out 65 | 66 | return y 67 | 68 | ### set hyperparameter and variables ### 69 | discount=0.99 70 | learn_rate=0.001 71 | eplison=0.1 72 | action_space=6 73 | 74 | 75 | keep_drop=tf.placeholder(tf.float32) 76 | x1=tf.placeholder(tf.float32,shape=[None,28,28,4]) 77 | x2=tf.placeholder(tf.float32,shape=[None,28,28,4]) 78 | x3=tf.placeholder(tf.float32,shape=[None,1]) 79 | x4=tf.placeholder(tf.int32,shape=[None,2]) 80 | 81 | ### caucalate the q avlue and max _next value 82 | prediction_now=cnn_pong(x1) 83 | prediction_next=cnn_pong(x2) 84 | 85 | ### test action when test agent performance ### 86 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 87 | 88 | 89 | ### calcaulate the loss and training ### 90 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 91 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 92 | 93 | delta=tf.add(x3+discount*tf.stop_gradient(Max_Q_value_next),(-1*Q_value)) 94 | q_loss=tf.reduce_sum(tf.square(delta)/2) 95 | 96 | train_optimizer=tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 97 | 98 | #### save the model #### 99 | saver=tf.train.Saver() 100 | 101 | with tf.device('/cpu:0'): 102 | with tf.Session() as sess: 103 | 104 | for i_run in range(1, 1 + 1): 105 | sess.run(tf.global_variables_initializer()) 106 | 107 | print('......start training data......') 108 | 109 | ### set the variable and empty set ### 110 | length = [] 111 | total_score_a = [] 112 | total_score_b = [] 113 | total_absolute=[] 114 | ### the 100 eposides ### 115 | eposide_number = 100 116 | 117 | ### the x axis value ### 118 | x = np.arange(eposide_number) 119 | x = x + 1 120 | ### the buffer experience replay ### 121 | initial_buffer = [] 122 | buffer_replay = [] 123 | for i_eposide in range(eposide_number): 124 | env.reset() 125 | ### record score for computer and agent ### 126 | Score_a = [] 127 | Score_b = [] 128 | 129 | for i_step in range(100000): 130 | 131 | if len(initial_buffer) < 4: 132 | ### collect data ### 133 | action = env.action_space.sample() 134 | obser_1, score, done, _ = env.step(action) 135 | obser_initial = tran_size(obser_1) 136 | if score < 0: 137 | Score_b.append(score) 138 | if score > 0: 139 | Score_a.append(score) 140 | #print(score, done) 141 | 142 | initial_buffer.append(obser_initial) 143 | 144 | else: 145 | 146 | state_i = stack(initial_buffer, i_step - 1) 147 | 148 | buffer_replay.append(state_i) 149 | ### select action by eplison policy ### 150 | if np.random.random() <= eplison: 151 | action_select=np.random.randint(6) 152 | #print('ewqrwqr......') 153 | else: 154 | 155 | action_select = sess.run(test_action, feed_dict={x1: [state_i]}) 156 | 157 | action_select = int(action_select) 158 | obser_1, score, done, _ = env.step(action_select) 159 | if score < 0: 160 | Score_b.append(score) 161 | if score > 0: 162 | Score_a.append(score) 163 | #print(score, done) 164 | 165 | obser_initial = tran_size(obser_1) 166 | 167 | initial_buffer.append(obser_initial) 168 | 169 | if done is True: 
170 | ### record score for agent and computer each eposide ### 171 | total_score_a.append(np.sum(Score_a, axis=0)) 172 | total_score_b.append(-1 * np.sum(Score_b, axis=0)) 173 | length.append(i_step + 1) 174 | total_absolute.append((np.sum(Score_a, axis=0)+np.sum(Score_b, axis=0))) 175 | 176 | break 177 | 178 | ### calculate the standard of score and frame counts ### 179 | std_length = np.std(length, axis=0) 180 | std_score = np.std(total_score_a, axis=0) 181 | std_score_abso=np.std(total_absolute,axis=0) 182 | 183 | print('the length...',length) 184 | print('the agent score...',total_score_a) 185 | print('the absolute value...',total_absolute) 186 | print('the std of agent score..',std_score) 187 | print('the std_score_abso..',std_score_abso) 188 | print('the std_length..',std_length) 189 | # print(std_length) 190 | print('the mean of total_score_a...',np.mean(total_score_a, axis=0)) 191 | print('the mean of length...',np.mean(length, axis=0)) 192 | print('the mean of total_absolute...',np.mean(total_absolute, axis=0)) 193 | 194 | ### plot the mean the score and length ### 195 | plt.plot(x, total_score_a) 196 | plt.xlabel('ith Num of episode') 197 | plt.ylabel('agent scores') 198 | plt.show() 199 | 200 | plt.plot(x, total_score_b) 201 | plt.xlabel('ith Num of episode') 202 | plt.ylabel('computer scores') 203 | plt.show() 204 | 205 | plt.plot(x, total_absolute) 206 | plt.xlabel('ith Num of episode') 207 | plt.ylabel('difference between agent and computer') 208 | plt.show() 209 | 210 | 211 | plt.plot(x, length) 212 | plt.xlabel('ith Num of episode') 213 | plt.ylabel('agent frames count') 214 | plt.show() 215 | 216 | 217 | np.save('part2_pong_a_score', total_score_a) 218 | np.save('part2_pong_b_score', total_score_b) 219 | np.save('part2_pong_length', length) 220 | np.save('part2_pong_difference_score',total_absolute) 221 | 222 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_difference_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_difference_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_length.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_length.npy -------------------------------------------------------------------------------- /Atari/pong/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/22.png -------------------------------------------------------------------------------- /Atari/pong/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/23.png -------------------------------------------------------------------------------- /Atari/pong/check_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | a_score=np.load('part2_pong_a_score.npy') 7 | b_score=np.load( 'part2_pong_b_score.npy') 8 | length=np.load('part2_pong_length.npy') 9 | differenct=np.load('part2_pong_difference_score.npy') 10 | 11 | eposide_number = 100 12 | 13 | ### the x axis value ### 14 | x = np.arange(eposide_number) 15 | x = x + 1 16 | 17 | plt.plot(x,a_score ) 18 | plt.xlabel('ith Num of episode') 19 | plt.ylabel('agent scores') 20 | plt.show() 21 | 22 | plt.plot(x, b_score) 23 | plt.xlabel('ith Num of episode') 24 | plt.ylabel('computer scores') 25 | plt.show() 26 | 27 | plt.plot(x,differenct) 28 | plt.xlabel('ith Num of episode') 29 | plt.ylabel('difference between agent and computer') 30 | plt.show() 31 | 32 | 33 | plt.plot(x, length) 34 | plt.xlabel('ith Num of episode') 35 | plt.ylabel('agent frames count') 36 | -------------------------------------------------------------------------------- /Atari/pong/cnn_for_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import scipy.misc 5 | 6 | import tensorflow as tf 7 | 8 | env = gym.make('Pong-v0') 9 | 10 | 11 | ### the function to chnage image from RGB to greyscale ### 12 | def rgb2gray(rgb): 13 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 14 | 15 | ### transfer input to size(28*28*1) ### 16 | def tran_size(x): 17 | output = scipy.misc.imresize(x, size=[28, 28]) 18 | output = rgb2gray(output) 19 | return output 20 | 21 | ### stack four frames to size (28*28*4) ### 22 | def stack(x, index): 23 | output = np.reshape([x[index], x[index - 1], x[index - 2], x[index - 3]], [28,28,4]) 24 | return output 25 | 26 | ### define the weights in the CNN neural network### 27 | def weight_variable(shape): 28 | output = tf.truncated_normal(shape, stddev=0.1) 29 | return tf.Variable(output) 30 | 31 | 32 | def bias_variable(shape): 33 | output = tf.constant(0.1, shape=shape) 34 | return tf.Variable(output) 35 | 36 | ### the convolution function #### 37 | def conv2d(input, Weight,strides): 38 | 39 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 40 | 41 | ### the cnn function ### 42 | def cnn_pong(x): 43 | ### first layer ### 44 | weight_convol_1=weight_variable([6,6,4,16]) 45 | bias_convol_1=bias_variable([16]) 46 | 47 | output_convol_1=tf.nn.relu(conv2d(input=x,Weight=weight_convol_1,strides=[1,2,2,1])+bias_convol_1) 48 | ### second layer ### 49 | weight_convol_2=weight_variable([4,4,16,32]) 
50 | bias_convol_2=bias_variable([32]) 51 | output_convol_2=tf.nn.relu(conv2d(input=output_convol_1,Weight=weight_convol_2,strides=[1,2,2,1])+bias_convol_2) 52 | ### flat layer ### 53 | weight_flat=weight_variable([7*7*32,256]) 54 | bias_flat=bias_variable([256]) 55 | output_reshape=tf.reshape(output_convol_2,[-1,7*7*32]) 56 | output_flat=tf.matmul(output_reshape,weight_flat)+bias_flat 57 | 58 | ### linear layer ### 59 | out_drop=tf.nn.dropout(output_flat,0.8) 60 | weight_out=weight_variable([256,action_space]) 61 | 62 | bias_out=bias_variable([action_space]) 63 | 64 | y=tf.matmul(out_drop,weight_out)+bias_out 65 | 66 | return y 67 | 68 | ### set hyperparameter and variables ### 69 | discount=0.99 70 | learn_rate=0.001 71 | eplison=0.1 72 | action_space=6 73 | 74 | 75 | keep_drop=tf.placeholder(tf.float32) 76 | x1=tf.placeholder(tf.float32,shape=[None,28,28,4]) 77 | x2=tf.placeholder(tf.float32,shape=[None,28,28,4]) 78 | x3=tf.placeholder(tf.float32,shape=[None,1]) 79 | x4=tf.placeholder(tf.int32,shape=[None,2]) 80 | 81 | ### caucalate the q avlue and max _next value 82 | prediction_now=cnn_pong(x1) 83 | prediction_next=cnn_pong(x2) 84 | 85 | ### test action when test agent performance ### 86 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 87 | 88 | 89 | ### calcaulate the loss and training ### 90 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 91 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 92 | 93 | delta=tf.add(x3+discount*tf.stop_gradient(Max_Q_value_next),(-1*Q_value)) 94 | q_loss=tf.reduce_sum(tf.square(delta)/2) 95 | 96 | train_optimizer=tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 97 | 98 | #### save the model #### 99 | saver=tf.train.Saver() 100 | 101 | with tf.device('/cpu:0'): 102 | with tf.Session() as sess: 103 | 104 | for i_run in range(1, 1 + 1): 105 | sess.run(tf.global_variables_initializer()) 106 | 107 | print('......start training data......') 108 | 109 | ### set the variable and empty set ### 110 | length = [] 111 | total_score_a = [] 112 | total_score_b = [] 113 | total_absolute=[] 114 | ### the 100 eposides ### 115 | eposide_number = 100 116 | 117 | ### the x axis value ### 118 | x = np.arange(eposide_number) 119 | x = x + 1 120 | ### the buffer experience replay ### 121 | initial_buffer = [] 122 | buffer_replay = [] 123 | for i_eposide in range(eposide_number): 124 | env.reset() 125 | ### record score for computer and agent ### 126 | Score_a = [] 127 | Score_b = [] 128 | 129 | for i_step in range(100000): 130 | 131 | if len(initial_buffer) < 4: 132 | ### collect data ### 133 | action = env.action_space.sample() 134 | obser_1, score, done, _ = env.step(action) 135 | obser_initial = tran_size(obser_1) 136 | if score < 0: 137 | Score_b.append(score) 138 | if score > 0: 139 | Score_a.append(score) 140 | #print(score, done) 141 | 142 | initial_buffer.append(obser_initial) 143 | 144 | else: 145 | 146 | state_i = stack(initial_buffer, i_step - 1) 147 | 148 | buffer_replay.append(state_i) 149 | ### select action by eplison policy ### 150 | if np.random.random() <= eplison: 151 | action_select=np.random.randint(6) 152 | #print('ewqrwqr......') 153 | else: 154 | 155 | action_select = sess.run(test_action, feed_dict={x1: [state_i]}) 156 | 157 | action_select = int(action_select) 158 | obser_1, score, done, _ = env.step(action_select) 159 | if score < 0: 160 | Score_b.append(score) 161 | if score > 0: 162 | Score_a.append(score) 163 | #print(score, done) 164 | 165 | obser_initial = tran_size(obser_1) 166 | 167 | 
initial_buffer.append(obser_initial) 168 | 169 | if done is True: 170 | ### record score for agent and computer each eposide ### 171 | total_score_a.append(np.sum(Score_a, axis=0)) 172 | total_score_b.append(-1 * np.sum(Score_b, axis=0)) 173 | length.append(i_step + 1) 174 | total_absolute.append((np.sum(Score_a, axis=0)+np.sum(Score_b, axis=0))) 175 | 176 | break 177 | 178 | ### calculate the standard of score and frame counts ### 179 | std_length = np.std(length, axis=0) 180 | std_score = np.std(total_score_a, axis=0) 181 | std_score_abso=np.std(total_absolute,axis=0) 182 | 183 | print('the length...',length) 184 | print('the agent score...',total_score_a) 185 | print('the absolute value...',total_absolute) 186 | print('the std of agent score..',std_score) 187 | print('the std_score_abso..',std_score_abso) 188 | print('the std_length..',std_length) 189 | # print(std_length) 190 | print('the mean of total_score_a...',np.mean(total_score_a, axis=0)) 191 | print('the mean of length...',np.mean(length, axis=0)) 192 | print('the mean of total_absolute...',np.mean(total_absolute, axis=0)) 193 | 194 | ### plot the mean the score and length ### 195 | plt.plot(x, total_score_a) 196 | plt.xlabel('ith Num of episode') 197 | plt.ylabel('agent scores') 198 | plt.show() 199 | 200 | plt.plot(x, total_score_b) 201 | plt.xlabel('ith Num of episode') 202 | plt.ylabel('computer scores') 203 | plt.show() 204 | 205 | plt.plot(x, total_absolute) 206 | plt.xlabel('ith Num of episode') 207 | plt.ylabel('difference between agent and computer') 208 | plt.show() 209 | 210 | 211 | plt.plot(x, length) 212 | plt.xlabel('ith Num of episode') 213 | plt.ylabel('agent frames count') 214 | plt.show() 215 | 216 | 217 | np.save('part2_pong_a_score', total_score_a) 218 | np.save('part2_pong_b_score', total_score_b) 219 | np.save('part2_pong_length', length) 220 | np.save('part2_pong_difference_score',total_absolute) 221 | 222 | -------------------------------------------------------------------------------- /Atari/pong/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/pong/part2_pong_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_a_score.npy -------------------------------------------------------------------------------- /Atari/pong/part2_pong_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_b_score.npy -------------------------------------------------------------------------------- /Atari/pong/part2_pong_difference_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_difference_score.npy -------------------------------------------------------------------------------- /Atari/pong/part2_pong_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_length.npy 
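Note: the Atari scripts above preprocess each frame with scipy.misc.imresize, which was deprecated in SciPy 1.0 and removed in SciPy 1.3. A minimal stand-in for the same greyscale-resize-stack pipeline, assuming Pillow and NumPy are available (the helper names preprocess_frame and stack_frames are illustrative and not part of this repository):

import numpy as np
from PIL import Image

def preprocess_frame(frame, size=(28, 28)):
    # RGB Atari frame (e.g. 210x160x3 uint8) -> greyscale image resized to `size`
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
    return np.asarray(Image.fromarray(gray).resize(size))

def stack_frames(frames):
    # stack the four most recent preprocessed frames into a (28, 28, 4) network input
    return np.stack(frames[-4:], axis=-1)

Applying stack_frames to a running list of preprocessed observations reproduces the (28, 28, 4) inputs the cnn_pong network above expects.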
-------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_1000_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=1000 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | with tf.Session() as sess: 69 | ## reload the weights ### 70 | saver.restore(sess, './part5_neural_1000_300/') 71 | eposide_length = [] 72 | expected_value = [] 73 | all_eposide_length = np.zeros((1, 10)) 74 | all_reward = np.zeros((1, 100)) 75 | #### run 10 times test eposide ### 76 | for i_episode in range(10): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | 87 | observation_curr, reward_curr, done, info = env.step(Action[0]) 88 | 89 | observation_next = [observation_curr] 90 | else: 91 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 92 | 93 | observation_curr, reward_curr, done, info = env.step(Action[0]) 94 | observation_next = [observation_curr] 95 | 96 | if done is True: 97 | 98 | eposide_length.append(t + 1) 99 | reward = -1 100 | reward_return = reward * (discount ** (t)) 101 | expected_value.append(reward_return) 102 | 103 | break 104 | all_eposide_length[0, i_episode] = t + 1 105 | all_reward[0, i_episode] = reward_return 106 | 107 | all_eposide_length = np.mean(all_eposide_length, axis=0) 108 | all_reward = np.mean(all_reward, axis=0) 109 | 110 | 111 | 112 | print('the mean of episode length', 
np.mean(eposide_length)) 113 | print('the mean of reward ',np.mean(expected_value)) 114 | 115 | print('the standard deviation of episode length', np.std(eposide_length)) 116 | plt.plot(all_eposide_length) 117 | plt.xlabel('Num of episode') 118 | plt.ylabel('length of eposide') 119 | plt.show() 120 | 121 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_1000_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | 9 | env._max_episode_steps = 300 10 | 11 | 12 | 13 | #### set variable and parameters #### 14 | x1=tf.placeholder(tf.float32, shape=[None,4]) 15 | x2=tf.placeholder(tf.float32, shape=[None,4]) 16 | x3=tf.placeholder(tf.float32, shape=[None,1]) 17 | x4=tf.placeholder(tf.int32, shape=[None,2]) 18 | 19 | 20 | discount=0.99 21 | learn_rate=0.0001 22 | input_size=4 23 | hidden_size=1000 24 | output_size=2 25 | eplison=0.05 26 | max_eposide_length=300 27 | 28 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 29 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 30 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 31 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 32 | 33 | 34 | 35 | ### one hiddle layer neural network as function approximation ### 36 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 37 | prediction_No=tf.nn.relu(middle_now) 38 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 39 | 40 | 41 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 42 | prediction_Ne=tf.nn.relu(middle_next) 43 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 44 | 45 | ### the best action based on observation_now ### 46 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 47 | 48 | 49 | 50 | ### calcaulate the loss and training ### 51 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 52 | 53 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 54 | 55 | 56 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 57 | 58 | q_loss=tf.reduce_sum(tf.square(delta)/2) 59 | 60 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 61 | 62 | 63 | #### save the model #### 64 | saver=tf.train.Saver() 65 | 66 | 67 | 68 | with tf.device('/cpu:0'): 69 | 70 | #### set the set to save data #### 71 | run_size = 1 72 | all_episode_length = np.zeros((run_size, 2000)) 73 | all_total_reward = np.zeros((run_size, 2000)) 74 | all_test_episode_length = np.zeros((run_size, 100)) 75 | all_test_reward = np.zeros((run_size, 100)) 76 | all_train_loss = np.zeros((run_size, 100)) 77 | 78 | 79 | with tf.Session() as sess: 80 | for i_run in range(1,1+run_size): 81 | sess.run(tf.global_variables_initializer()) 82 | 83 | print('......start training data......') 84 | 85 | for i_eposide in range(1,2000+1): 86 | 87 | ### begin a new eposide ### 88 | observation_00 = env.reset() 89 | total_reward=0 90 | total_QQ_loss=0 91 | 92 | for i_step in range(max_eposide_length): 93 | 94 | ### greedy policy to select action ### 95 | if np.random.random() <= eplison: 96 | action_select_now=np.random.randint(2) 97 | 98 | else: 99 | ### use Q function to select action ### 100 | action_select_now=sess.run(test_action,feed_dict={x1:np.reshape(observation_00, [1, 4])}) 101 | action_select_now=int(action_select_now) 102 | 103 | 
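# (note added for clarity) these CartPole scripts ignore the environment's own reward:
# every non-terminal step gets reward 0 and termination gets reward -1, so the TD
# target x3 + discount * (1 + x3) * max_a Q(s', a) is the usual bootstrap target when
# x3 = 0 and collapses to -1 at terminal transitions, because the (1 + x3) factor
# zeroes the bootstrap term when x3 = -1.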
observation_11,_,done_0,info=env.step(action_select_now) 104 | 105 | if done_0 is False: 106 | reward=0 107 | else: 108 | reward=-1 109 | ### trainning step ### 110 | _,train_loss=sess.run([train_optimizer,q_loss], feed_dict={x1:np.reshape( observation_00,[1,4]), x2: np.reshape( observation_11,[1,4]), x3:np.reshape(reward,[1,1]),x4:np.reshape([0,action_select_now],[1,2])}) 111 | 112 | total_QQ_loss +=train_loss 113 | 114 | observation_00 = observation_11 115 | if (i_eposide-1)%20==0: 116 | 117 | if done_0 is True: 118 | reward=-1 119 | 120 | final_reward =reward* discount**(i_step) 121 | 122 | all_episode_length[i_run-1, i_eposide-1] = i_step + 1 123 | all_total_reward[i_run-1, i_eposide-1] = final_reward 124 | 125 | 126 | 127 | ### record average test performance ### 128 | test_size=10 129 | Small_test_eposide_length = np.zeros((1, test_size)) 130 | Small_test_reward = np.zeros((1, test_size)) 131 | 132 | for i_test_run in range(1,test_size+1): 133 | observation_test_0 = env.reset() 134 | 135 | for i_test_length in range(max_eposide_length): 136 | action_test_now = test_action.eval(feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 137 | action_test_now=int(action_test_now) 138 | observation_test_1, _, test_done, test_info = env.step(int(action_test_now)) 139 | 140 | observation_test_0=observation_test_1 141 | 142 | if test_done is False: 143 | reward_test = 0, 144 | else: 145 | reward_test = -1 146 | 147 | if test_done is True: 148 | Small_test_eposide_length[0,i_test_run-1]=i_test_length+1 149 | Small_test_reward[0,i_test_run-1]=reward_test*(discount**(i_test_length)) 150 | #print(i_test_length+1) 151 | 152 | break 153 | 154 | 155 | small_mean_test_length=np.mean(np.mean(Small_test_eposide_length,axis=0),axis=0) 156 | small_mean_test_reward=np.mean(np.mean(Small_test_reward,axis=0),axis=0) 157 | print('ith_run', i_run-1, 'the ith eposide', i_eposide-1, 'the train_length_eposide', i_step + 1, 158 | 'the test average length',small_mean_test_length , '..loss..', 159 | train_loss) 160 | all_test_episode_length[i_run-1, int((i_eposide-1)/20)]=small_mean_test_length 161 | all_test_reward[i_run-1, int((i_eposide-1)/20)]=small_mean_test_reward 162 | all_train_loss[i_run-1, int((i_eposide-1)/20)] = total_QQ_loss/(i_step+1) 163 | 164 | if all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] == np.amax( 165 | all_test_episode_length): 166 | 167 | print('.....', all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)]) 168 | print(np.amax(all_test_episode_length)) 169 | 170 | if not os.path.exists('./part5_neural_1000_300/'): 171 | os.mkdir('./part5_neural_1000_300/') 172 | saver.save(sess, "./part5_neural_1000_300/") 173 | print('saved') 174 | 175 | break 176 | 177 | 178 | else: 179 | if done_0 is True: 180 | reward = -1 181 | 182 | final_reward = reward * discount ** (i_step) 183 | 184 | all_episode_length[i_run - 1, i_eposide-1] = i_step + 1 185 | all_total_reward[i_run - 1, i_eposide-1] = final_reward 186 | break 187 | 188 | 189 | 190 | 191 | ### save and plot performance during training and tes #### 192 | outfile1=all_total_reward 193 | outfile2=all_episode_length 194 | outfile3=all_train_loss 195 | outfile4=all_test_reward 196 | outfile5=all_test_episode_length 197 | 198 | 199 | np.save('part_5_train_reward_1000_300', outfile1) 200 | np.save('part5_train_eposide_length_1000_300',outfile2) 201 | 202 | np.save('part5_train_loss_1000_300', outfile3) 203 | np.save('part5_test_reward_1000_300', outfile4) 204 | np.save('part5_test_length_1000_300', outfile5) 205 | 206 | 207 | 208 
| 209 | 210 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_30_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=30 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | with tf.Session() as sess: 69 | ## reload the weights ### 70 | saver.restore(sess, './part5_neural_30_300/') 71 | eposide_length = [] 72 | expected_value = [] 73 | all_eposide_length = np.zeros((1, 10)) 74 | all_reward = np.zeros((1, 100)) 75 | ### run 10 times test eposide ### 76 | for i_episode in range(10): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | 87 | 88 | observation_curr, reward_curr, done, info = env.step(Action[0]) 89 | 90 | observation_next = [observation_curr] 91 | else: 92 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 93 | 94 | observation_curr, reward_curr, done, info = env.step(Action[0]) 95 | observation_next = [observation_curr] 96 | 97 | if done is True: 98 | 99 | eposide_length.append(t + 1) 100 | reward = -1 101 | reward_return = reward * (discount ** (t)) 102 | expected_value.append(reward_return) 103 | 104 | break 105 | all_eposide_length[0, i_episode] = t + 1 106 | all_reward[0, i_episode] = reward_return 107 | 108 | all_eposide_length = np.mean(all_eposide_length, axis=0) 109 | all_reward = np.mean(all_reward, axis=0) 110 | 111 | 112 | 113 | print('the mean of episode length', 
np.mean(eposide_length)) 114 | print('the mean of reward ',np.mean(expected_value)) 115 | 116 | print('the standard deviation of episode length', np.std(eposide_length)) 117 | plt.plot(all_eposide_length) 118 | plt.xlabel('Num of episode') 119 | plt.ylabel('length of eposide') 120 | plt.show() 121 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_30_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=30 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | 69 | #### set the set to save data #### 70 | run_size = 1 71 | all_episode_length = np.zeros((run_size, 2000)) 72 | all_total_reward = np.zeros((run_size, 2000)) 73 | all_test_episode_length = np.zeros((run_size, 100)) 74 | all_test_reward = np.zeros((run_size, 100)) 75 | all_train_loss = np.zeros((run_size, 100)) 76 | 77 | 78 | with tf.Session() as sess: 79 | for i_run in range(1,1+run_size): 80 | sess.run(tf.global_variables_initializer()) 81 | 82 | print('......start training data......') 83 | 84 | for i_eposide in range(1,2000+1): 85 | 86 | ### begin a new eposide ### 87 | observation_00 = env.reset() 88 | total_reward=0 89 | total_QQ_loss=0 90 | 91 | for i_step in range(max_eposide_length): 92 | 93 | 94 | ### greedy policy to select action ### 95 | if np.random.random() <= eplison: 96 | action_select_now=np.random.randint(2) 97 | 98 | else: 99 | ### use Q function to select action ### 100 | action_select_now=sess.run(test_action,feed_dict={x1:np.reshape(observation_00, [1, 4])}) 101 | action_select_now=int(action_select_now) 102 | 103 | 
observation_11,_,done_0,info=env.step(action_select_now) 104 | 105 | if done_0 is False: 106 | reward=0 107 | else: 108 | reward=-1 109 | ### training step ### 110 | _,train_loss=sess.run([train_optimizer,q_loss], feed_dict={x1:np.reshape( observation_00,[1,4]), x2: np.reshape( observation_11,[1,4]), x3:np.reshape(reward,[1,1]),x4:np.reshape([0,action_select_now],[1,2])}) 111 | 112 | total_QQ_loss +=train_loss 113 | 114 | observation_00 = observation_11 115 | if (i_eposide-1)%20==0: 116 | 117 | if done_0 is True: 118 | reward=-1 119 | 120 | final_reward =reward* discount**(i_step) 121 | 122 | all_episode_length[i_run-1, i_eposide-1] = i_step + 1 123 | all_total_reward[i_run-1, i_eposide-1] = final_reward 124 | 125 | 126 | 127 | ### record average test performance ### 128 | test_size=10 129 | Small_test_eposide_length = np.zeros((1, test_size)) 130 | Small_test_reward = np.zeros((1, test_size)) 131 | 132 | for i_test_run in range(1,test_size+1): 133 | observation_test_0 = env.reset() 134 | 135 | 136 | for i_test_length in range(max_eposide_length): 137 | #env.render() 138 | action_test_now = test_action.eval(feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 139 | action_test_now=int(action_test_now) 140 | observation_test_1, _, test_done, test_info = env.step(int(action_test_now)) 141 | 142 | observation_test_0=observation_test_1 143 | 144 | if test_done is False: 145 | reward_test = 0, 146 | else: 147 | reward_test = -1 148 | 149 | if test_done is True: 150 | Small_test_eposide_length[0,i_test_run-1]=i_test_length+1 151 | Small_test_reward[0,i_test_run-1]=reward_test*(discount**(i_test_length)) 152 | #print(i_test_length+1) 153 | 154 | break 155 | 156 | 157 | small_mean_test_length=np.mean(np.mean(Small_test_eposide_length,axis=0),axis=0) 158 | small_mean_test_reward=np.mean(np.mean(Small_test_reward,axis=0),axis=0) 159 | print('ith_run', i_run-1, 'the ith eposide', i_eposide-1, 160 | 'the test average length', small_mean_test_length , '..loss..', 161 | train_loss) 162 | all_test_episode_length[i_run-1, int((i_eposide-1)/20)]=small_mean_test_length 163 | #print((i_eposide-1)/20) 164 | #print(int((i_eposide-1)/20)) 165 | all_test_reward[i_run-1, int((i_eposide-1)/20)]=small_mean_test_reward 166 | all_train_loss[i_run-1, int((i_eposide-1)/20)] = total_QQ_loss/(i_step+1) 167 | 168 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 169 | all_test_episode_length): 170 | 171 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 172 | print(np.amax(all_test_episode_length)) 173 | 174 | 175 | 176 | if not os.path.exists('./part5_neural_30_300/'): 177 | os.mkdir('./part5_neural_30_300/') 178 | saver.save(sess, "./part5_neural_30_300/") 179 | print('saved') 180 | 181 | 182 | break 183 | else: 184 | if done_0 is True: 185 | reward = -1 186 | 187 | final_reward = reward * discount ** (i_step) 188 | 189 | all_episode_length[i_run - 1, i_eposide-1] = i_step + 1 190 | all_total_reward[i_run - 1, i_eposide-1] = final_reward 191 | break 192 | 193 | 194 | 195 | 196 | ### save and plot performance during training and tes #### 197 | outfile1=all_total_reward 198 | outfile2=all_episode_length 199 | outfile3=all_train_loss 200 | outfile4=all_test_reward 201 | outfile5=all_test_episode_length 202 | 203 | 204 | np.save('part_5_train_reward_30_300', outfile1) 205 | np.save('part5_train_eposide_length_30_300',outfile2) 206 | 207 | np.save('part5_train_loss_30_300', outfile3) 208 | np.save('part5_test_reward_30_300', outfile4) 209 | 
np.save('part5_test_length_30_300', outfile5) 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/double-q-learning/cartpole_8_load.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | import matplotlib.pyplot as plt 7 | 8 | env = gym.make('CartPole-v0') 9 | env._max_episode_steps = 300 10 | 11 | print("......Loading train_data......") 12 | 13 | 14 | train_data=np.load('train_data_2.npy') 15 | 16 | #### set variable and parameters #### 17 | 18 | x1=tf.placeholder(tf.float32, shape=[None,4]) 19 | x2=tf.placeholder(tf.float32, shape=[None,4]) 20 | x3=tf.placeholder(tf.float32, shape=[None,2]) 21 | x4=tf.placeholder(tf.float32, shape=[None]) 22 | x5=tf.placeholder(tf.float32,shape=[None]) 23 | 24 | 25 | 26 | batch_size=128 27 | discount=0.99 28 | learn_rate=0.0001 29 | input_size=4 30 | hidden_size=100 31 | output_size=2 32 | max_eposide_length=300 33 | eplison=0.05 34 | 35 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 36 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 37 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 38 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 39 | 40 | Weight_double_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 41 | Weight_double_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 42 | Bias_double_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 43 | Bias_double_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 44 | 45 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 46 | prediction_No=tf.nn.relu(middle_now) 47 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 48 | 49 | 50 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 51 | prediction_Ne=tf.nn.relu(middle_next) 52 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 53 | 54 | 55 | middle_now_double=tf.matmul(x1,Weight_double_1)+Bias_double_1 56 | prediction_No_double=tf.nn.relu(middle_now_double) 57 | prediction_now_double=tf.matmul(prediction_No_double,Weight_double_2)+Bias_double_2 58 | 59 | middle_next_double=tf.matmul(x2,Weight_double_1)+Bias_double_1 60 | prediction_Ne_double=tf.nn.relu(middle_next_double) 61 | prediction_next_double=tf.matmul(prediction_Ne_double,Weight_double_2)+Bias_double_2 62 | # 63 | 64 | 65 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 66 | test_action_double=tf.cast(tf.argmax(prediction_now_double,1),tf.int32) 67 | 68 | True_action=tf.cast(x3,tf.int32) 69 | 70 | Q_value=tf.gather_nd(params=prediction_now,indices=True_action) 71 | Q_value_double=tf.gather_nd(params=prediction_now_double,indices=True_action) 72 | 73 | 74 | 75 | next_action_b=tf.cast(tf.argmax(prediction_next_double,1),tf.int32) 76 | next_action_b=tf.reshape(next_action_b,[-1,1]) 77 | action_repeat_b=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 78 | action_b_next=tf.concat([action_repeat_b,next_action_b],1) 79 | 80 | next_action_a=tf.cast(tf.argmax(prediction_next,1),tf.int32) 81 | next_action_a=tf.reshape(next_action_a,[-1,1]) 82 | action_repeat_a=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 83 | action_a_next=tf.concat([action_repeat_a,next_action_a],1) 84 | 85 | 86 | 
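# (note added for clarity) this is the double Q-learning estimator: the greedy next
# action is selected with one network and evaluated with the other, i.e. network A's
# argmax over prediction_next is scored by prediction_next_double below, and vice
# versa for network B.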
Max_Q_value_next=tf.gather_nd(params=prediction_next_double,indices=action_a_next) 87 | Max_Q_value_next_double=tf.gather_nd(params=prediction_next,indices=action_b_next) 88 | 89 | delta_a=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next),(-1*Q_value)) 90 | delta_b=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next_double),(-1*Q_value_double)) 91 | 92 | 93 | q_loss_a=tf.reduce_mean((tf.square(delta_a))/2) 94 | 95 | q_loss_b=tf.reduce_mean((tf.square(delta_b))/2) 96 | 97 | train_optimizer_a=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_a) 98 | 99 | train_optimizer_b=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_b) 100 | 101 | 102 | 103 | saver = tf.train.Saver() 104 | 105 | 106 | with tf.device('/cpu:0'): 107 | with tf.Session() as sess: 108 | ## reload the weights ### 109 | saver.restore(sess, './part8_double_dqn/') 110 | eposide_length = [] 111 | expected_value = [] 112 | all_eposide_length = np.zeros((1, 10)) 113 | all_reward = np.zeros((1, 100)) 114 | 115 | for i_episode in range(10): 116 | 117 | observation_init = env.reset() 118 | observation_init = [observation_init] 119 | 120 | for t in range(300): 121 | 122 | if t == 0: 123 | 124 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 125 | 126 | 127 | observation_curr, reward_curr, done, info = env.step(Action[0]) 128 | 129 | observation_next = [observation_curr] 130 | else: 131 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 132 | 133 | observation_curr, reward_curr, done, info = env.step(Action[0]) 134 | observation_next = [observation_curr] 135 | 136 | if done is True: 137 | 138 | eposide_length.append(t + 1) 139 | reward = -1 140 | reward_return = reward * (discount ** (t)) 141 | expected_value.append(reward_return) 142 | 143 | break 144 | all_eposide_length[0, i_episode] = t + 1 145 | all_reward[0, i_episode] = reward_return 146 | 147 | all_eposide_length = np.mean(all_eposide_length, axis=0) 148 | all_reward = np.mean(all_reward, axis=0) 149 | 150 | 151 | 152 | print('the mean of episode length', np.mean(eposide_length)) 153 | print('the mean of reward ',np.mean(expected_value)) 154 | 155 | print('the standard deviation of episode length', np.std(eposide_length)) 156 | plt.plot(all_eposide_length) 157 | plt.xlabel('Num of episode') 158 | plt.ylabel('length of eposide') 159 | plt.show() 160 | 161 | 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /CartPole/double-q-learning/cartpole_8_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | print("......Loading train_data......") 11 | 12 | 13 | train_data=np.load('train_data_2.npy') 14 | 15 | #### set variable and parameters #### 16 | 17 | x1=tf.placeholder(tf.float32, shape=[None,4]) 18 | x2=tf.placeholder(tf.float32, shape=[None,4]) 19 | x3=tf.placeholder(tf.float32, shape=[None,2]) 20 | x4=tf.placeholder(tf.float32, shape=[None]) 21 | x5=tf.placeholder(tf.float32,shape=[None]) 22 | 23 | 24 | batch_size=128 25 | discount=0.99 26 | learn_rate=0.0001 27 | input_size=4 28 | hidden_size=100 29 | output_size=2 30 | max_eposide_length=300 31 | eplison=0.05 32 | 33 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 34 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 
35 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 36 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 37 | 38 | ### the second neural network ### 39 | Weight_double_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 40 | Weight_double_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 41 | Bias_double_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 42 | Bias_double_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 43 | 44 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 45 | prediction_No=tf.nn.relu(middle_now) 46 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 47 | 48 | 49 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 50 | prediction_Ne=tf.nn.relu(middle_next) 51 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 52 | 53 | 54 | middle_now_double=tf.matmul(x1,Weight_double_1)+Bias_double_1 55 | prediction_No_double=tf.nn.relu(middle_now_double) 56 | prediction_now_double=tf.matmul(prediction_No_double,Weight_double_2)+Bias_double_2 57 | 58 | middle_next_double=tf.matmul(x2,Weight_double_1)+Bias_double_1 59 | prediction_Ne_double=tf.nn.relu(middle_next_double) 60 | prediction_next_double=tf.matmul(prediction_Ne_double,Weight_double_2)+Bias_double_2 61 | 62 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 63 | test_action_double=tf.cast(tf.argmax(prediction_now_double,1),tf.int32) 64 | 65 | True_action=tf.cast(x3,tf.int32) 66 | 67 | Q_value=tf.gather_nd(params=prediction_now,indices=True_action) 68 | Q_value_double=tf.gather_nd(params=prediction_now_double,indices=True_action) 69 | 70 | ### calculate the target by the actual action which calculate by current network ### 71 | next_action_b=tf.cast(tf.argmax(prediction_next_double,1),tf.int32) 72 | next_action_b=tf.reshape(next_action_b,[-1,1]) 73 | action_repeat_b=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 74 | action_b_next=tf.concat([action_repeat_b,next_action_b],1) 75 | 76 | next_action_a=tf.cast(tf.argmax(prediction_next,1),tf.int32) 77 | next_action_a=tf.reshape(next_action_a,[-1,1]) 78 | action_repeat_a=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 79 | action_a_next=tf.concat([action_repeat_a,next_action_a],1) 80 | 81 | 82 | Max_Q_value_next=tf.gather_nd(params=prediction_next_double,indices=action_a_next) 83 | Max_Q_value_next_double=tf.gather_nd(params=prediction_next,indices=action_b_next) 84 | 85 | 86 | ### calculate the loss by update network a or b ### 87 | delta_a=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next),(-1*Q_value)) 88 | delta_b=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next_double),(-1*Q_value_double)) 89 | 90 | 91 | q_loss_a=tf.reduce_mean((tf.square(delta_a))/2) 92 | q_loss_b=tf.reduce_mean((tf.square(delta_b))/2) 93 | 94 | train_optimizer_a=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_a) 95 | train_optimizer_b=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_b) 96 | 97 | 98 | saver = tf.train.Saver() 99 | 100 | with tf.device('/cpu:0'): 101 | 102 | eposide_size = 2000 103 | run_size = 1 104 | all_episode_length = np.zeros((run_size, int(eposide_size))) 105 | all_total_reward = np.zeros((run_size, int(eposide_size))) 106 | all_test_episode_length = np.zeros((run_size, int(eposide_size))) 107 | all_test_reward = np.zeros((run_size, int(eposide_size / 20))) 108 | all_train_loss = np.zeros((run_size, int(eposide_size / 20))) 109 | 110 | length_of_train = len(train_data) 111 | for i_run in range(1, run_size + 1): 112 | ### build the experience replay ### 113 | 114 | buffer_size = 1024 115 | 
mini_batch_size = 64 116 | 117 | length_of_train=len(train_data) 118 | 119 | buffer_sample=random.sample(range(0, length_of_train), buffer_size) 120 | buffer_replay=train_data[buffer_sample] 121 | 122 | buffer_observation_now = [] 123 | buffer_observation_next=[] 124 | buffer_action=[] 125 | buffer_reward=[] 126 | 127 | for i_sele in range(buffer_size): 128 | buffer_observation_now.append( buffer_replay[i_sele][0]) 129 | buffer_observation_next.append( buffer_replay[i_sele][1]) 130 | buffer_reward.append( buffer_replay[i_sele][2]) 131 | buffer_action.append( buffer_replay[i_sele][3]) 132 | 133 | 134 | with tf.Session() as sess: 135 | 136 | sess.run(tf.global_variables_initializer()) 137 | 138 | for i_eposide in range(1,1+eposide_size): 139 | 140 | observation_0 = env.reset() 141 | 142 | total_QQ_loss = 0 143 | 144 | for i_step in range(max_eposide_length): 145 | 146 | if np.random.random() <= eplison: 147 | action_select_now = np.random.randint(2) 148 | else: 149 | Q = sess.run(test_action, feed_dict={x1: np.reshape(observation_0, [1, 4])}) 150 | action_select_now=int(Q) 151 | 152 | observation_1, _, done_0, _ = env.step(action_select_now) 153 | 154 | if done_0: 155 | reward = -1 156 | else: 157 | reward = 0 158 | 159 | # ##add new data to replay memory 160 | buffer_observation_now = np.append(buffer_observation_now, np.reshape(observation_0, [1, 4]), axis=0) 161 | buffer_observation_next = np.append(buffer_observation_next, np.reshape(observation_1, [1, 4]), axis=0) 162 | buffer_action = np.append(buffer_action, [action_select_now], axis=0) 163 | buffer_reward = np.append(buffer_reward, [reward], axis=0) 164 | 165 | ### update the first neural network ### 166 | if np.random.randint(2) == 0: 167 | select_order = np.arange(mini_batch_size) 168 | 169 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 170 | 171 | 172 | 173 | _, loss_train = sess.run([train_optimizer_a, q_loss_a], feed_dict={x1: buffer_observation_now[this_batch, :], 174 | x2: buffer_observation_next[this_batch, :], 175 | x3: np.concatenate((np.reshape( 176 | np.arange(mini_batch_size), 177 | [mini_batch_size, 1]), np.reshape( 178 | buffer_action[this_batch], 179 | [mini_batch_size, 1])), axis=1) 180 | , x4: buffer_reward[this_batch], 181 | x5:select_order}) 182 | else: 183 | ### update the second neural network ### 184 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 185 | select_order = np.arange(mini_batch_size) 186 | 187 | _, loss_train = sess.run([train_optimizer_b, q_loss_b], 188 | feed_dict={x1: buffer_observation_now[this_batch, :], 189 | x2: buffer_observation_next[this_batch, :], 190 | x3: np.concatenate((np.reshape( 191 | np.arange(mini_batch_size), 192 | [mini_batch_size, 1]), np.reshape( 193 | buffer_action[this_batch], 194 | [mini_batch_size, 1])), axis=1) 195 | , x4: buffer_reward[this_batch], 196 | x5:select_order}) 197 | 198 | total_QQ_loss +=loss_train 199 | 200 | observation_0 = observation_1 201 | 202 | if (i_eposide - 1) % 20 == 0: 203 | 204 | if done_0 is True: 205 | if i_step + 1 == 300: 206 | report_reward = 0 207 | else: 208 | report_reward = -1 * discount ** (i_step) 209 | 210 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 211 | all_total_reward[i_run - 1, i_eposide - 1] = report_reward 212 | 213 | ### record average test performance ### 214 | test_size = 10 215 | Small_test_eposide_length = np.zeros((1, test_size)) 216 | Small_test_reward = np.zeros((1, test_size)) 217 | 218 | for i_test_run in range(1, test_size + 1): 219 | 
observation_test_0 = env.reset() 220 | 221 | for i_test_length in range(max_eposide_length): 222 | action_test_now = test_action.eval( 223 | feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 224 | action_test_now = int(action_test_now) 225 | observation_test_1, _, test_done, test_info = env.step(action_test_now) 226 | 227 | observation_test_0 = observation_test_1 228 | 229 | if test_done is True: 230 | if i_test_length+1==300: 231 | reward_test=0 232 | else: 233 | reward_test=-1 234 | Small_test_eposide_length[0, i_test_run - 1] = i_test_length + 1 235 | Small_test_reward[0, i_test_run - 1] = reward_test * ( 236 | discount ** (i_test_length)) 237 | 238 | 239 | break 240 | 241 | small_mean_test_length = np.mean(np.mean(Small_test_eposide_length, axis=0), axis=0) 242 | small_mean_test_reward = np.mean(np.mean(Small_test_reward, axis=0), axis=0) 243 | print('the ith running',i_run,'the ith eposide', i_eposide - 1, 'the test_average_length', 244 | small_mean_test_length, 245 | 'the total_test_length ', Small_test_eposide_length, '..loss..', 246 | total_QQ_loss / (i_step + 1)) 247 | all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_length 248 | 249 | all_test_reward[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_reward 250 | all_train_loss[i_run - 1, int((i_eposide - 1) / 20)] = total_QQ_loss / (i_step + 1) 251 | 252 | 253 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 254 | all_test_episode_length): 255 | 256 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 257 | print(np.amax(all_test_episode_length)) 258 | 259 | 260 | 261 | if not os.path.exists('./part8_double_dqn/'): 262 | os.mkdir('./part8_double_dqn/') 263 | saver.save(sess, "./part8_double_dqn/") 264 | print('saved') 265 | 266 | break 267 | else: 268 | if done_0 is True: 269 | reward = -1 270 | 271 | final_reward = reward * discount ** (i_step) 272 | 273 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 274 | all_total_reward[i_run - 1, i_eposide - 1] = final_reward 275 | 276 | break 277 | 278 | outfile1=all_total_reward 279 | outfile2=all_episode_length 280 | outfile3=all_train_loss 281 | outfile4=all_test_reward 282 | outfile5=all_test_episode_length 283 | 284 | np.save('reward_data_train_part8', outfile1) 285 | np.save('length_data_train_part8',outfile2) 286 | 287 | np.save('loss_data_train_part8', outfile3) 288 | np.save('length_data_test_part8', outfile4) 289 | np.save('reward_data_test_part8', outfile5) 290 | 291 | 292 | mean_episode_len = np.mean(all_episode_length, axis=0) 293 | mean_total_reward = np.mean(all_total_reward, axis=0) 294 | mean_loss_train=np.mean(all_train_loss,axis=0) 295 | mean_test_eposide_length=np.mean(all_test_episode_length,axis=0) 296 | mean_test_reward=np.mean(all_test_reward,axis=0) 297 | 298 | std_episode_len = np.std(all_episode_length, axis=0) 299 | std_total_reward = np.std(all_total_reward, axis=0) 300 | 301 | -------------------------------------------------------------------------------- /CartPole/double-q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/experience_replay/cartpole_6_buffer_replay_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import matplotlib.pyplot as plt 5 | 6 | env = gym.make('CartPole-v0') 7 | 
env._max_episode_steps = 300 8 | 9 | print("......Loading train_data......") 10 | 11 | 12 | train_data=np.load('train_data_2.npy') 13 | 14 | #### set variable and parameters #### 15 | 16 | x1=tf.placeholder(tf.float32, shape=[None,4]) 17 | x2=tf.placeholder(tf.float32, shape=[None,4]) 18 | x3=tf.placeholder(tf.float32, shape=[None,2]) 19 | x4=tf.placeholder(tf.float32, shape=[None]) 20 | 21 | 22 | batch_size=128 23 | discount=0.99 24 | learn_rate=0.0001 25 | input_size=4 26 | hidden_size=100 27 | output_size=2 28 | max_eposide_length=300 29 | eplison=0.05 30 | 31 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 32 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 33 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 34 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 35 | 36 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 37 | prediction_No=tf.nn.relu(middle_now) 38 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 39 | 40 | 41 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 42 | prediction_Ne=tf.nn.relu(middle_next) 43 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 44 | # 45 | 46 | True_action=tf.cast(x3,tf.int32) 47 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 48 | Q_value = tf.gather_nd(prediction_now, True_action) 49 | 50 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 51 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 52 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 53 | 54 | 55 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 56 | 57 | saver = tf.train.Saver() 58 | 59 | 60 | with tf.device('/cpu:0'): 61 | with tf.Session() as sess: 62 | ## reload the weights ### 63 | saver.restore(sess, './part6_neural_buffer/') 64 | eposide_length = [] 65 | expected_value = [] 66 | all_eposide_length = np.zeros((1, 10)) 67 | all_reward = np.zeros((1, 100)) 68 | 69 | ### test the final model performance ### 70 | for i_episode in range(10): 71 | 72 | observation_init = env.reset() 73 | observation_init = [observation_init] 74 | 75 | for t in range(300): 76 | 77 | if t == 0: 78 | 79 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 80 | 81 | observation_curr, reward_curr, done, info = env.step(Action[0]) 82 | observation_next = [observation_curr] 83 | else: 84 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 85 | 86 | observation_curr, reward_curr, done, info = env.step(Action[0]) 87 | observation_next = [observation_curr] 88 | 89 | if done is True: 90 | 91 | eposide_length.append(t + 1) 92 | reward = -1 93 | reward_return = reward * (discount ** (t)) 94 | expected_value.append(reward_return) 95 | 96 | break 97 | all_eposide_length[0, i_episode] = t + 1 98 | all_reward[0, i_episode] = reward_return 99 | 100 | all_eposide_length = np.mean(all_eposide_length, axis=0) 101 | all_reward = np.mean(all_reward, axis=0) 102 | 103 | 104 | 105 | print('the mean of episode length', np.mean(eposide_length)) 106 | print('the mean of reward ',np.mean(expected_value)) 107 | 108 | print('the standard deviation of episode length', np.std(eposide_length)) 109 | plt.plot(all_eposide_length) 110 | plt.xlabel('Num of episode') 111 | plt.ylabel('length of eposide') 112 | plt.show() 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /CartPole/experience_replay/cartpole_6_buffer_replay_saved.py: 
-------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import tensorflow as tf 5 | import random 6 | import os 7 | 8 | env = gym.make('CartPole-v0') 9 | env._max_episode_steps = 300 10 | 11 | print("......Loading train_data......") 12 | 13 | 14 | train_data=np.load('train_data_2.npy') 15 | 16 | #### set variable and parameters #### 17 | 18 | x1=tf.placeholder(tf.float32, shape=[None,4]) 19 | x2=tf.placeholder(tf.float32, shape=[None,4]) 20 | x3=tf.placeholder(tf.float32, shape=[None,2]) 21 | x4=tf.placeholder(tf.float32, shape=[None]) 22 | 23 | 24 | batch_size=128 25 | discount=0.99 26 | learn_rate=0.0001 27 | input_size=4 28 | hidden_size=100 29 | output_size=2 30 | max_eposide_length=300 31 | eplison=0.05 32 | 33 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 34 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 35 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 36 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 37 | 38 | ### one hiddle layer neural network as function approximation ### 39 | 40 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 41 | prediction_No=tf.nn.relu(middle_now) 42 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 43 | 44 | 45 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 46 | prediction_Ne=tf.nn.relu(middle_next) 47 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 48 | 49 | 50 | True_action=tf.cast(x3,tf.int32) 51 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 52 | 53 | 54 | ### calcaulate the loss and training ### 55 | Q_value = tf.gather_nd(prediction_now, True_action) 56 | 57 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 58 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 59 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 60 | 61 | 62 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 63 | 64 | #### save the model #### 65 | saver = tf.train.Saver() 66 | 67 | 68 | with tf.device('/cpu:0'): 69 | #### set the set to save data #### 70 | 71 | eposide_size = 200000 72 | run_size = 1 73 | all_episode_length = np.zeros((run_size, int(eposide_size))) 74 | all_total_reward = np.zeros((run_size, int(eposide_size))) 75 | all_test_episode_length = np.zeros((run_size, int(eposide_size/20))) 76 | all_test_reward = np.zeros((run_size, int(eposide_size / 20))) 77 | all_train_loss = np.zeros((run_size, int(eposide_size / 20))) 78 | 79 | length_of_train = len(train_data) 80 | for i_run in range(1, run_size + 1): 81 | 82 | ### set the experience buffer replay ### 83 | buffer_size = 1024 84 | mini_batch_size = 64 85 | length_of_train=len(train_data) 86 | buffer_sample=random.sample(range(0, length_of_train), buffer_size) 87 | buffer_replay=train_data[buffer_sample] 88 | buffer_observation_now = [] 89 | buffer_observation_next=[] 90 | buffer_action=[] 91 | buffer_reward=[] 92 | 93 | for i_sele in range(buffer_size): 94 | buffer_observation_now.append( buffer_replay[i_sele][0]) 95 | buffer_observation_next.append( buffer_replay[i_sele][1]) 96 | buffer_reward.append( buffer_replay[i_sele][2]) 97 | buffer_action.append( buffer_replay[i_sele][3]) 98 | 99 | 100 | 101 | with tf.Session() as sess: 102 | 103 | sess.run(tf.global_variables_initializer()) 104 | 105 | for i_eposide in range(1,1+eposide_size): 106 | 107 | observation_0 = env.reset() 108 | 109 | total_QQ_loss = 0 110 | 111 | for i_step in range(max_eposide_length): 112 | 113 | if np.random.random() <= 
eplison: 114 | action_train = np.random.randint(2) 115 | else: 116 | Q = sess.run(test_action, feed_dict={x1: np.reshape(observation_0, [1, 4])}) 117 | action_select_now=int(Q) 118 | 119 | # the retured parameters of the action 120 | observation_1, _, done_0, _ = env.step(action_select_now) 121 | 122 | # set reward 123 | if done_0: 124 | reward = -1 125 | else: 126 | reward = 0 127 | 128 | ### add new data to replay memory ### 129 | buffer_observation_now = np.append(buffer_observation_now, np.reshape(observation_0, [1, 4]), axis=0) 130 | buffer_observation_next = np.append(buffer_observation_next, np.reshape(observation_1, [1, 4]), axis=0) 131 | buffer_action = np.append(buffer_action, [action_select_now], axis=0) 132 | buffer_reward = np.append(buffer_reward, [reward], axis=0) 133 | 134 | 135 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 136 | 137 | _, loss_train = sess.run([train_optimizer, q_loss], feed_dict={x1: buffer_observation_now[this_batch, :], 138 | x2: buffer_observation_next[this_batch, :], 139 | x3: np.concatenate((np.reshape( 140 | np.arange(mini_batch_size), 141 | [mini_batch_size, 1]), np.reshape( 142 | buffer_action[this_batch], 143 | [mini_batch_size, 1])), axis=1) 144 | , x4: buffer_reward[this_batch]}) 145 | total_QQ_loss +=loss_train 146 | 147 | observation_0 = observation_1 148 | 149 | if (i_eposide - 1) % 20 == 0: 150 | ### test the agent performance ### 151 | env.render() 152 | 153 | if done_0 is True: 154 | if i_step+1==300: 155 | report_reward = 0 156 | else: 157 | report_reward=-1*discount ** (i_step) 158 | 159 | 160 | 161 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 162 | all_total_reward[i_run - 1, i_eposide - 1] = report_reward 163 | 164 | ### record average test performance ### 165 | test_size = 10 166 | Small_test_eposide_length = np.zeros((1, test_size)) 167 | Small_test_reward = np.zeros((1, test_size)) 168 | 169 | for i_test_run in range(1, test_size + 1): 170 | observation_test_0 = env.reset() 171 | 172 | for i_test_length in range(max_eposide_length): 173 | action_test_now = test_action.eval( 174 | feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 175 | action_test_now = int(action_test_now) 176 | observation_test_1, _, test_done, test_info = env.step(action_test_now) 177 | 178 | observation_test_0 = observation_test_1 179 | 180 | if test_done is True: 181 | if i_test_length+1==300: 182 | reward_test=0 183 | else: 184 | reward_test=-1 185 | Small_test_eposide_length[0, i_test_run - 1] = i_test_length + 1 186 | Small_test_reward[0, i_test_run - 1] = reward_test * ( 187 | discount ** (i_test_length)) 188 | 189 | break 190 | 191 | small_mean_test_length = np.mean(np.mean(Small_test_eposide_length, axis=0), axis=0) 192 | small_mean_test_reward = np.mean(np.mean(Small_test_reward, axis=0), axis=0) 193 | print('the ith running',i_run,'the ith eposide', i_eposide - 1, 'the test_average_length', 194 | small_mean_test_length, 195 | 'the total_test_length ', Small_test_eposide_length, '..loss..', 196 | total_QQ_loss / (i_step + 1)) 197 | all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_length 198 | 199 | all_test_reward[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_reward 200 | all_train_loss[i_run - 1, int((i_eposide - 1) / 20)] = total_QQ_loss / (i_step + 1) 201 | 202 | 203 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 204 | all_test_episode_length): 205 | 206 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 207 
| print(np.amax(all_test_episode_length)) 208 | 209 | 210 | 211 | if not os.path.exists('./part6_neural_buffer/'): 212 | os.mkdir('./part6_neural_buffer/') 213 | saver.save(sess, "./part6_neural_buffer/") 214 | print('saved') 215 | 216 | break 217 | else: 218 | if done_0 is True: 219 | reward = -1 220 | 221 | final_reward = reward * discount ** (i_step) 222 | 223 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 224 | all_total_reward[i_run - 1, i_eposide - 1] = final_reward 225 | 226 | break 227 | 228 | outfile1=all_total_reward 229 | outfile2=all_episode_length 230 | outfile3=all_train_loss 231 | outfile4=all_test_reward 232 | outfile5=all_test_episode_length 233 | 234 | np.save('reward_data_train_part6', outfile1) 235 | np.save('length_data_train_part6',outfile2) 236 | 237 | np.save('loss_data_train_part6', outfile3) 238 | np.save('length_data_test_part6', outfile4) 239 | np.save('reward_data_test_part6', outfile5) 240 | 241 | 242 | mean_episode_len = np.mean(all_episode_length, axis=0) 243 | mean_total_reward = np.mean(all_total_reward, axis=0) 244 | mean_loss_train=np.mean(all_train_loss,axis=0) 245 | mean_test_eposide_length=np.mean(all_test_episode_length,axis=0) 246 | mean_test_reward=np.mean(all_test_reward,axis=0) 247 | 248 | std_episode_len = np.std(all_episode_length, axis=0) 249 | std_total_reward = np.std(all_total_reward, axis=0) 250 | -------------------------------------------------------------------------------- /CartPole/experience_replay/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/hundred-random-episode/100_random_episodes.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from numpy import float32, uint32 4 | env = gym.make('CartPole-v0') 5 | ### set parameters and set ### 6 | discount_factor=0.99 7 | eposide_length=[] 8 | expected_value=[] 9 | for i_episode in range(100): 10 | 11 | observation_init = env.reset() 12 | 13 | for t in range(300): 14 | ### select action by uniform distribution ### 15 | action= np.random.uniform(0,1,1) 16 | 17 | action=np.round(action) 18 | action=int(action) 19 | 20 | observation, reward, done, info = env.step(action) 21 | 22 | #print(reward) 23 | if done: 24 | ### when each eposide ended record the return and eposide's length 25 | print("Episode length is {} ".format(t+1)) 26 | eposide_length.append(t+1) 27 | reward=-1 28 | reward_return=reward*(discount_factor**(t)) 29 | expected_value.append(reward_return) 30 | break 31 | 32 | 33 | print("the episode's length", eposide_length) 34 | print('the mean of episode length',np.mean(eposide_length)) 35 | 36 | print('the standard deviation of episode length',np.std(eposide_length)) 37 | 38 | print('....expected return from the initial state.....') 39 | print(expected_value) 40 | print('the mean of initial return',np.mean(expected_value,axis=0)) 41 | print('the standard deviation of initial return', np.std(expected_value,axis=0)) 42 | -------------------------------------------------------------------------------- /CartPole/hundred-random-episode/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_length.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_length.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_reward.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_length.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_reward.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_collect_data.py: -------------------------------------------------------------------------------- 1 | #### collect data based on random policy with 2000 eposides #### 2 | import gym 3 | import numpy as np 4 | from numpy import float32, uint32 5 | 6 | env = gym.make('CartPole-v0') 7 | 8 | discount_factor=0.99 9 | eposide_length=[] 10 | expected_value=[] 11 | transition=[] 12 | for i_episode in range(2000): 13 | print(i_episode) 14 | observation_init = env.reset() 15 | 16 | for t in range(300): 17 | 18 | env.render() 19 | 20 | action= np.random.uniform(0,1,1) 21 | action=np.round(action) 22 | action=int(action) 23 | observation, reward, done, info = env.step(action) 24 | 25 | action=np.array(action) 26 | 27 | 28 | if done is False: 29 | reward = 0 30 | reward = np.array(reward) 31 | print(observation, reward, done, info) 32 | 33 | 34 | if t==0: 35 | this_observation=observation_init 36 | next_observation=observation 37 | transition.append((this_observation,next_observation,reward,action)) 38 | 39 | else: 40 | this_observation=next_observation 41 | next_observation=observation 42 | transition.append((this_observation,next_observation,reward,action)) 43 | 44 | if done is True: 45 | print("Episode length is {} ".format(t+1)) 46 | eposide_length.append(t+1) 47 | reward=-1 48 | reward=np.array(reward) 49 | this_observation = next_observation 50 | next_observation = observation 51 | transition.append((this_observation, next_observation, reward, action)) 52 | 53 | reward_return=reward*(discount_factor**(t)) 54 | expected_value.append(reward_return) 55 | 56 | break 57 | 58 | print("the episode's length", eposide_length) 59 | print('the mean of episode length',np.mean(eposide_length)) 60 | 61 | 62 | print('the standard deviation of episode length',np.std(eposide_length)) 63 | 64 | 65 | print('....expected return from the initial 
state.....') 66 | print('the expected value of return',expected_value) 67 | 68 | outfile1 =np.array(transition) 69 | np.save('train_data_2',outfile1) 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_linear_4_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | print("......Loading train_data......") 11 | 12 | ### load the stored data ### 13 | train_data=np.load('train_data_2.npy') 14 | 15 | ##### set the variable####### 16 | batch_size=5000 17 | discount=0.99 18 | learn_rate=0.001 19 | input_size=4 20 | output_size=2 21 | eplison=0.05 22 | 23 | x1=tf.placeholder(tf.float32, shape=[None,4]) 24 | x2=tf.placeholder(tf.float32, shape=[None,4]) 25 | x3=tf.placeholder(tf.float32, shape=[None]) 26 | x4=tf.placeholder(tf.float32, shape=[None]) 27 | x5=tf.placeholder(tf.float32, shape=[None]) 28 | 29 | 30 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,output_size])) 31 | Bias_1=tf.Variable(tf.constant(0.1,shape=[output_size])) 32 | 33 | ### the prediction for each action ### 34 | prediction_now=tf.add(tf.matmul(x1,Weight_1),Bias_1) 35 | 36 | prediction_next=tf.add(tf.matmul(x2,Weight_1),Bias_1) 37 | 38 | ### take q value by actual action ### 39 | True_action=tf.cast(x4,tf.int32) 40 | True_action=tf.reshape(True_action,shape=[-1,1]) 41 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 42 | action_double=tf.concat([action_repeat,True_action],1) 43 | 44 | qa=tf.gather_nd(params=prediction_now,indices=action_double) 45 | 46 | ### select the action during test #### 47 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 48 | 49 | ### loss function ### 50 | less=tf.add(x3+discount*tf.stop_gradient((1+x3)*tf.reduce_max(prediction_next,axis=1)),-1*qa) 51 | 52 | delta=less 53 | q_loss=tf.reduce_sum((tf.square(delta)))/2 54 | 55 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 56 | 57 | #### save the model #### 58 | saver=tf.train.Saver() 59 | 60 | ### use gpu us training data ### 61 | with tf.device('/cpu:0'): 62 | with tf.Session() as sess: 63 | ### reload the model ### 64 | saver.restore(sess, './part3_linear_4/') 65 | 66 | eposide_length = [] 67 | expected_value = [] 68 | 69 | test_size=50 70 | all_eposide_length = np.zeros((1, test_size)) 71 | all_reward = np.zeros((1, test_size)) 72 | 73 | ### test the performance for final model ### 74 | ### reset 50 times to test the performance ### 75 | 76 | for i_episode in range(test_size): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | observation_curr, reward_curr, done, info = env.step(Action[0]) 87 | 88 | observation_next = [observation_curr] 89 | else: 90 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 91 | 92 | observation_curr, reward_curr, done, info = env.step(Action[0]) 93 | observation_next = [observation_curr] 94 | 95 | if done is True: 96 | 97 | eposide_length.append(t + 1) 98 | reward = -1 99 | reward_return = reward * (discount ** (t)) 100 | 
expected_value.append(reward_return) 101 | 102 | break 103 | all_eposide_length[0, i_episode] = t + 1 104 | all_reward[0, i_episode] = reward_return 105 | 106 | 107 | all_eposide_length = np.sum(all_eposide_length, axis=0) 108 | all_reward = np.sum(all_reward, axis=0) 109 | 110 | 111 | 112 | print('the mean of episode length', np.mean(eposide_length)) 113 | print('the mean of episode length', np.mean(all_reward)) 114 | 115 | print('the standard deviation of episode length', np.std(eposide_length)) 116 | ### print the eposide length and all reward during test ### 117 | plt.plot(all_eposide_length) 118 | plt.xlabel('Num of episode') 119 | plt.ylabel('length of eposide') 120 | plt.show() 121 | plt.plot(all_reward) 122 | plt.xlabel('Num of episode') 123 | plt.ylabel('reward') 124 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_linear_4_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | print("......Loading train_data......") 11 | 12 | ### load the stored data ### 13 | train_data=np.load('train_data_2.npy') 14 | 15 | ##### set the variable####### 16 | batch_size=5000 17 | discount=0.99 18 | learn_rate=0.001 19 | input_size=4 20 | output_size=2 21 | eplison=0.05 22 | 23 | x1=tf.placeholder(tf.float32, shape=[None,4]) 24 | x2=tf.placeholder(tf.float32, shape=[None,4]) 25 | x3=tf.placeholder(tf.float32, shape=[None]) 26 | x4=tf.placeholder(tf.float32, shape=[None]) 27 | x5=tf.placeholder(tf.float32, shape=[None]) 28 | 29 | 30 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,output_size])) 31 | Bias_1=tf.Variable(tf.constant(0.1,shape=[output_size])) 32 | 33 | ### the prediction for each action ### 34 | prediction_now=tf.add(tf.matmul(x1,Weight_1),Bias_1) 35 | 36 | prediction_next=tf.add(tf.matmul(x2,Weight_1),Bias_1) 37 | 38 | ### take q value by actual action ### 39 | True_action=tf.cast(x4,tf.int32) 40 | True_action=tf.reshape(True_action,shape=[-1,1]) 41 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 42 | action_double=tf.concat([action_repeat,True_action],1) 43 | 44 | qa=tf.gather_nd(params=prediction_now,indices=action_double) 45 | 46 | ### select the action during test #### 47 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 48 | 49 | ### loss function ### 50 | less=tf.add(x3+discount*tf.stop_gradient((1+x3)*tf.reduce_max(prediction_next,axis=1)),-1*qa) 51 | 52 | delta=less 53 | q_loss=tf.reduce_sum((tf.square(delta)))/2 54 | 55 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 56 | 57 | #### save the model #### 58 | saver=tf.train.Saver() 59 | 60 | with tf.device('/cpu:0'): 61 | with tf.Session() as sess: 62 | 63 | sess.run(tf.global_variables_initializer()) 64 | 65 | print('......start training data......') 66 | 67 | length_total_data=len(train_data) 68 | 69 | ### consist indices for tf.gather_nd function ### 70 | select_order=np.arange(batch_size) 71 | 72 | ### the training and test size ### 73 | batch_number=5000 74 | test_size=20 75 | 76 | ### set to store output ### 77 | all_eposide_length=np.zeros((1,batch_number)) 78 | all_reward=np.zeros((1,batch_number)) 79 | all_loss=np.zeros((1,batch_number)) 80 | 81 | 82 | for i_batch in range(batch_number): 83 | batch_sample = random.sample(range(0, 
length_total_data), batch_size) 84 | 85 | ### obtain sample data ### 86 | insert_data=train_data[batch_sample] 87 | obser_now = [] 88 | obser_next = [] 89 | reward_now = [] 90 | action_now = [] 91 | 92 | for i_select in range(batch_size): 93 | obser_now.append(insert_data[i_select][0]) 94 | obser_next.append(insert_data[i_select][1]) 95 | 96 | reward_now.append(insert_data[i_select][2]) 97 | action_now.append(insert_data[i_select][3]) 98 | 99 | ### training network ### 100 | _, train_loss = sess.run([train_optimizer, q_loss],feed_dict={x1: obser_now, x2: obser_next, x3: reward_now, x4: action_now,x5:select_order 101 | }) 102 | ### test the agent after each training 103 | if i_batch % 1 == 0: 104 | print('...ith training....:', i_batch, 'average training loss:', train_loss/batch_size) 105 | 106 | eposide_length = np.zeros((1,test_size)) 107 | expected_value = np.zeros((1,test_size)) 108 | 109 | for i_episode in range(test_size): 110 | # print(i_episode) 111 | observation_init = env.reset() 112 | observation_init = [observation_init] 113 | observation_next=observation_init 114 | for t in range(300): 115 | 116 | ### greedy policy to select action ### 117 | if np.random.random() <= eplison: 118 | Action = np.random.randint(2) 119 | else: 120 | Action = test_action.eval(feed_dict={x1: observation_next}) 121 | 122 | observation_curr, reward_curr, done, info = env.step(int(Action)) 123 | 124 | observation_next = [observation_curr] 125 | 126 | if done is True: 127 | 128 | eposide_length[0,i_episode]=t + 1 129 | reward = -1 130 | reward_return = reward * (discount ** (t)) 131 | expected_value[0,i_episode]=reward_return 132 | break 133 | 134 | all_eposide_length[0,i_batch]=np.mean(np.mean(eposide_length,axis=0),axis=0) 135 | all_reward[0,i_batch]=np.mean(np.mean(expected_value,axis=0),axis=0) 136 | all_loss[0, i_batch] = train_loss/batch_size 137 | 138 | ### saved model weights #### 139 | if i_batch >= 2: 140 | if i_batch == np.argmax(all_eposide_length): 141 | print(i_batch) 142 | print(np.argmax(all_eposide_length)) 143 | 144 | if not os.path.exists('./part3_linear_4/'): 145 | os.mkdir('./part3_linear_4/') 146 | saver.save(sess, "./part3_linear_4/") 147 | print('saved') 148 | 149 | 150 | print('....the averagelength of test eposide....',np.mean(np.mean(eposide_length,axis=0),axis=0)) 151 | 152 | 153 | 154 | outfile1 = all_reward 155 | outfile2 = all_eposide_length 156 | outfile3=all_loss 157 | 158 | ### save the output ### 159 | np.save('reward_data_part3_4_300', outfile1) 160 | np.save('length_data_part3_4_300', outfile2) 161 | np.save('loss_data_part3_4_300', outfile3) 162 | 163 | mean_episode_len = np.mean(all_eposide_length,axis=0) 164 | mean_total_reward = np.mean(all_reward,axis=0) 165 | mean_total_loss =np.mean(all_loss,0) 166 | 167 | 168 | std_episode_len = np.std(all_eposide_length, axis=0) 169 | std_total_reward = np.std(all_reward, axis=0) 170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_neural_5_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | env = gym.make('CartPole-v0') 10 | env._max_episode_steps = 300 11 | 12 | 13 | 14 | print("......Loading train_data......") 15 | ### load the stored data ### 16 | ### load the stored data ### 17 | train_data=np.load('train_data_2.npy') 18 | 19 | 
20 | #### set variable and parameters #### 21 | x1=tf.placeholder(tf.float32, shape=[None,4]) 22 | x2=tf.placeholder(tf.float32, shape=[None,4]) 23 | x3=tf.placeholder(tf.float32, shape=[None]) 24 | x4=tf.placeholder(tf.float32, shape=[None]) 25 | x5=tf.placeholder(tf.float32, shape=[None]) 26 | 27 | ##### set the variable####### 28 | batch_size=1000 29 | discount=0.99 30 | learn_rate=0.0001 31 | input_size=4 32 | hidden_size=100 33 | output_size=2 34 | eplison=0.05 35 | 36 | 37 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 38 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 39 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 40 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 41 | 42 | 43 | 44 | ### one hiddle layer neural network as function approximation ### 45 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 46 | prediction_No=tf.nn.relu(middle_now) 47 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 48 | 49 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 50 | prediction_Ne=tf.nn.relu(middle_next) 51 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 52 | 53 | 54 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 55 | 56 | True_action=tf.cast(x4,tf.int32) 57 | True_action=tf.reshape(True_action,shape=[-1,1]) 58 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 59 | action_double=tf.concat([action_repeat,True_action],1) 60 | 61 | ### calcaulate the loss and training ### 62 | Q_value=tf.gather_nd(params=prediction_now,indices=action_double) 63 | 64 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 65 | 66 | 67 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 68 | 69 | q_loss=tf.reduce_sum(tf.square(delta))/2 70 | 71 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 72 | 73 | 74 | 75 | 76 | #### save the model #### 77 | saver=tf.train.Saver() 78 | 79 | ### use gpu us training data ### 80 | with tf.device('/cpu:0'): 81 | with tf.Session() as sess: 82 | ### reload the model ### 83 | saver.restore(sess, './part3_neural_5_300/') 84 | 85 | eposide_length = [] 86 | expected_value = [] 87 | 88 | test_size=50 89 | all_eposide_length = np.zeros((1, test_size)) 90 | all_reward = np.zeros((1, test_size)) 91 | 92 | ### test the performance for final model ### 93 | ### reset 50 times to test the performance ### 94 | 95 | for i_episode in range(test_size): 96 | 97 | observation_init = env.reset() 98 | observation_init = [observation_init] 99 | 100 | for t in range(300): 101 | 102 | if t == 0: 103 | 104 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 105 | observation_curr, reward_curr, done, info = env.step(Action[0]) 106 | 107 | observation_next = [observation_curr] 108 | else: 109 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 110 | 111 | observation_curr, reward_curr, done, info = env.step(Action[0]) 112 | observation_next = [observation_curr] 113 | 114 | if done is True: 115 | 116 | eposide_length.append(t + 1) 117 | reward = -1 118 | reward_return = reward * (discount ** (t)) 119 | expected_value.append(reward_return) 120 | 121 | break 122 | all_eposide_length[0, i_episode] = t + 1 123 | all_reward[0, i_episode] = reward_return 124 | 125 | 126 | all_eposide_length = np.sum(all_eposide_length, axis=0) 127 | all_reward = np.sum(all_reward, axis=0) 128 | 129 | 130 | 131 | print('the mean of episode length', np.mean(eposide_length)) 132 | print('the mean of episode 
length', np.mean(all_reward)) 133 | 134 | print('the standard deviation of episode length', np.std(eposide_length)) 135 | ### print the eposide length and all reward during test ### 136 | plt.plot(all_eposide_length) 137 | plt.xlabel('Num of episode') 138 | plt.ylabel('length of eposide') 139 | plt.show() 140 | plt.plot(all_reward) 141 | plt.xlabel('Num of episode') 142 | plt.ylabel('reward') 143 | 144 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_neural_5_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | #import matplotlib.pyplot as plt 8 | 9 | env = gym.make('CartPole-v0') 10 | env._max_episode_steps = 300 11 | 12 | 13 | 14 | print("......Loading train_data......") 15 | 16 | train_data=np.load('train_data_2.npy') 17 | 18 | #### set variable and parameters #### 19 | x1=tf.placeholder(tf.float32, shape=[None,4]) 20 | x2=tf.placeholder(tf.float32, shape=[None,4]) 21 | x3=tf.placeholder(tf.float32, shape=[None]) 22 | x4=tf.placeholder(tf.float32, shape=[None]) 23 | x5=tf.placeholder(tf.float32, shape=[None]) 24 | 25 | ##### set the variable####### 26 | batch_size=1000 27 | discount=0.99 28 | learn_rate=0.0001 29 | input_size=4 30 | hidden_size=100 31 | output_size=2 32 | eplison=0.05 33 | 34 | 35 | 36 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 37 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 38 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 39 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 40 | 41 | 42 | 43 | ### one hiddle layer neural network as function approximation ### 44 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 45 | prediction_No=tf.nn.relu(middle_now) 46 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 47 | 48 | 49 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 50 | prediction_Ne=tf.nn.relu(middle_next) 51 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 52 | 53 | 54 | 55 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 56 | 57 | True_action=tf.cast(x4,tf.int32) 58 | True_action=tf.reshape(True_action,shape=[-1,1]) 59 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 60 | action_double=tf.concat([action_repeat,True_action],1) 61 | 62 | ### calcaulate the loss and training ### 63 | Q_value=tf.gather_nd(params=prediction_now,indices=action_double) 64 | 65 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 66 | 67 | 68 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 69 | 70 | q_loss=tf.reduce_sum(tf.square(delta))/2 71 | 72 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 73 | 74 | 75 | 76 | 77 | #### save the model #### 78 | saver=tf.train.Saver() 79 | 80 | ### use gpu us training data ### 81 | with tf.device('/cpu:0'): 82 | with tf.Session() as sess: 83 | sess.run(tf.global_variables_initializer()) 84 | 85 | print('......start training data......') 86 | length_total_data=len(train_data) 87 | print(train_data[1]) 88 | 89 | #print(length_total_data) 90 | 91 | select_order=np.arange(batch_size) 92 | #print(select_order) 93 | batch_number=5000 94 | test_size=20 95 | 96 | #epoch=5000 97 | 98 | #eposide_number=100 99 | all_eposide_length=np.zeros((1,batch_number)) 100 | all_reward=np.zeros((1,batch_number)) 101 | all_loss=np.zeros((1,batch_number)) 102 | 103 | for i_batch in 
range(batch_number): 104 | #for i_eposide in range(1000000): 105 | batch_sample = random.sample(range(0, length_total_data), batch_size) 106 | insert_data=train_data[batch_sample] 107 | 108 | obser_now = [] 109 | obser_next = [] 110 | reward_now = [] 111 | action_now = [] 112 | 113 | for i_select in range(batch_size): 114 | obser_now.append(insert_data[i_select][0]) 115 | obser_next.append(insert_data[i_select][1]) 116 | 117 | reward_now.append(insert_data[i_select][2]) 118 | action_now.append(insert_data[i_select][3]) 119 | 120 | _, train_loss = sess.run([train_optimizer, q_loss],feed_dict={x1: obser_now, x2: obser_next, x3: reward_now, x4: action_now,x5:select_order 121 | }) 122 | if i_batch % 1 == 0: 123 | 124 | print('...ith training....:', i_batch, 'average training loss:', train_loss/batch_size) 125 | 126 | eposide_length = np.zeros((1,test_size)) 127 | expected_value = np.zeros((1,test_size)) 128 | 129 | for i_episode in range(test_size): 130 | 131 | observation_init = env.reset() 132 | observation_next = [observation_init] 133 | 134 | for t in range(300): 135 | #env.render() 136 | 137 | ### greedy policy to select action ### 138 | if np.random.random() <= eplison: 139 | Action = np.random.randint(2) 140 | 141 | else: 142 | 143 | Action = test_action.eval(feed_dict={x1: observation_next}) 144 | 145 | observation_curr, reward_curr, done, info = env.step(int(Action)) 146 | 147 | observation_next = [observation_curr] 148 | 149 | 150 | if done is True: 151 | 152 | eposide_length[0,i_episode]=t + 1 153 | reward = -1 154 | reward_return = reward * (discount ** (t)) 155 | expected_value[0,i_episode]=reward_return 156 | 157 | break 158 | 159 | 160 | all_eposide_length[0,i_batch]=np.mean(np.mean(eposide_length,axis=0),axis=0) 161 | all_reward[0,i_batch]=np.mean(np.mean(expected_value,axis=0),axis=0) 162 | all_loss[0, i_batch] = train_loss/batch_size 163 | if i_batch>=2: 164 | if i_batch==np.argmax(all_eposide_length): 165 | print(i_batch) 166 | print(np.argmax(all_eposide_length)) 167 | 168 | if not os.path.exists('./part3_neural_5_300/'): 169 | os.mkdir('./part3_neural_5_300/') 170 | saver.save(sess, "./part3_neural_5_300/") 171 | print('saved') 172 | 173 | 174 | print('....the averagelength of test eposide....',np.mean(np.mean(eposide_length,axis=0),axis=0)) 175 | 176 | 177 | 178 | outfile1 = all_reward 179 | outfile2 = all_eposide_length 180 | outfile3=all_loss 181 | #print(outfile2) 182 | #print(outfile1) 183 | #print(outfile3) 184 | #print('....the all eposide_length....',all_eposide_length) 185 | 186 | np.save('reward_data_part3_5_neural_300', outfile1) 187 | np.save('length_data_part3_5_neural_300', outfile2) 188 | np.save('loss_data_part3_5_neural_300', outfile3) 189 | 190 | 191 | mean_episode_len = np.mean(all_eposide_length,axis=0) 192 | mean_total_reward = np.mean(all_reward,axis=0) 193 | mean_total_loss =np.mean(all_loss,0) 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/check_data.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | train_reward=np.load('reward_data_part3_4_300.npy') 9 | train_length=np.load('length_data_part3_4_300.npy') 10 | train_loss=np.load('loss_data_part3_4_300.npy') 11 | 12 | 13 | plt.plot(np.mean(train_loss,axis=0)) 14 | 
plt.xlabel('ith Num of training episode') 15 | plt.ylabel('train_Mean loss') 16 | plt.show() 17 | plt.plot(np.mean(train_reward,axis=0)) 18 | plt.xlabel('ith Num of training episode') 19 | plt.ylabel('train_Mean reward') 20 | plt.show() 21 | 22 | plt.plot(np.mean(train_length,axis=0)) 23 | plt.xlabel('ith Num of training episode') 24 | plt.ylabel('train_Mean length') 25 | plt.show() 26 | 27 | 28 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/figure_1-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/figure_1-3.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/length_data_part3_4_300.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/length_data_part3_4_300.npy -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/loss_data_part3_4_300.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/loss_data_part3_4_300.npy -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/reward_data_part3_4_300.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/reward_data_part3_4_300.npy -------------------------------------------------------------------------------- /CartPole/online-Q-learning/cartpole_4_neural_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=100 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | 
prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | with tf.Session() as sess: 69 | ## reload the weights ### 70 | saver.restore(sess, './part4_neural_300/') 71 | eposide_length = [] 72 | expected_value = [] 73 | all_eposide_length = np.zeros((1, 10)) 74 | all_reward = np.zeros((1, 100)) 75 | 76 | for i_episode in range(10): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | 87 | 88 | observation_curr, reward_curr, done, info = env.step(Action[0]) 89 | 90 | observation_next = [observation_curr] 91 | else: 92 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 93 | 94 | observation_curr, reward_curr, done, info = env.step(Action[0]) 95 | observation_next = [observation_curr] 96 | 97 | if done is True: 98 | 99 | eposide_length.append(t + 1) 100 | reward = -1 101 | reward_return = reward * (discount ** (t)) 102 | expected_value.append(reward_return) 103 | 104 | break 105 | all_eposide_length[0, i_episode] = t + 1 106 | all_reward[0, i_episode] = reward_return 107 | 108 | all_eposide_length = np.mean(all_eposide_length, axis=0) 109 | all_reward = np.mean(all_reward, axis=0) 110 | 111 | 112 | 113 | print('the mean of episode length', np.mean(eposide_length)) 114 | print('the mean of reward ',np.mean(expected_value)) 115 | 116 | print('the standard deviation of episode length', np.std(eposide_length)) 117 | plt.plot(all_eposide_length) 118 | plt.xlabel('Num of episode') 119 | plt.ylabel('length of eposide') 120 | plt.show() 121 | plt.plot(all_reward) 122 | plt.xlabel('Num of episode') 123 | plt.ylabel('reward') 124 | plt.show() 125 | -------------------------------------------------------------------------------- /CartPole/online-Q-learning/cartpole_4_neural_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | #import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=100 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | 
Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/gpu:0'): 68 | 69 | #### set the set to save data #### 70 | run_size = 100 71 | all_episode_length = np.zeros((run_size, 2000)) 72 | all_total_reward = np.zeros((run_size, 2000)) 73 | all_test_episode_length = np.zeros((run_size, 2000)) 74 | all_test_reward = np.zeros((run_size, 2000)) 75 | all_train_loss = np.zeros((run_size, 2000)) 76 | 77 | 78 | with tf.Session() as sess: 79 | for i_run in range(1,run_size+1): 80 | sess.run(tf.global_variables_initializer()) 81 | 82 | print('......start training data......') 83 | 84 | for i_eposide in range(1,1+2000): 85 | 86 | ### begin a new eposide ### 87 | observation_00 = env.reset() 88 | total_reward=0 89 | total_QQ_loss=0 90 | 91 | for i_step in range(max_eposide_length): 92 | 93 | ### greedy policy to select action ### 94 | if np.random.random() <= eplison: 95 | action_select_now=np.random.randint(2) 96 | 97 | else: 98 | ### use Q function to select action ### 99 | action_select_now=sess.run(test_action,feed_dict={x1:np.reshape(observation_00, [1, 4])}) 100 | action_select_now=int(action_select_now) 101 | 102 | observation_11,_,done_0,info=env.step(action_select_now) 103 | 104 | if done_0 is False: 105 | reward=0 106 | else: 107 | reward=-1 108 | ### training step ### 109 | _,train_loss=sess.run([train_optimizer,q_loss], feed_dict={x1:np.reshape( observation_00,[1,4]), x2: np.reshape( observation_11,[1,4]), x3:np.reshape(reward,[1,1]),x4:np.reshape([0,action_select_now],[1,2])}) 110 | 111 | total_QQ_loss +=train_loss 112 | 113 | observation_00 = observation_11 114 | 115 | 116 | if done_0 is True: 117 | reward=-1 118 | 119 | final_reward =reward* discount**(i_step) 120 | 121 | all_episode_length[i_run-1, i_eposide-1] = i_step + 1 122 | all_total_reward[i_run-1, i_eposide-1] = final_reward 123 | 124 | 125 | 126 | ### record average test performance ### 127 | test_size=10 128 | Small_test_eposide_length = np.zeros((1, test_size)) 129 | Small_test_reward = np.zeros((1, test_size)) 130 | 131 | for i_test_run in range(test_size): 132 | #env.render() 133 | observation_test_0 = env.reset() 134 | 135 | for i_test_length in range(max_eposide_length): 136 | action_test_now = test_action.eval(feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 137 | action_test_now=int(action_test_now) 138 | observation_test_1, _, test_done, test_info = 
env.step(int(action_test_now)) 139 | 140 | observation_test_0=observation_test_1 141 | 142 | if test_done is False: 143 | reward_test = 0, 144 | else: 145 | reward_test = -1 146 | 147 | if test_done is True: 148 | Small_test_eposide_length[0,i_test_run]=i_test_length+1 149 | Small_test_reward[0,i_test_run]=reward_test*(discount**(i_test_length)) 150 | 151 | 152 | break 153 | 154 | 155 | small_mean_test_length=np.mean(np.mean(Small_test_eposide_length,axis=0),axis=0) 156 | small_mean_test_reward=np.mean(np.mean(Small_test_reward,axis=0),axis=0) 157 | print('ith_run', i_run-1, 'the ith eposide', i_eposide, 'the train_length_eposide', i_step + 1, 158 | 'the test average length', small_mean_test_length , '..loss..', 159 | train_loss) 160 | all_test_episode_length[i_run-1, i_eposide-1]=small_mean_test_length 161 | all_test_reward[i_run-1, i_eposide-1]=small_mean_test_reward 162 | all_train_loss[i_run-1, i_eposide-1] = total_QQ_loss/(i_step+1) 163 | 164 | break 165 | ##### saved the model for best ...#### 166 | if i_eposide >= 2: 167 | if all_test_episode_length[i_run-1, i_eposide-1] == np.amax(all_test_episode_length): 168 | print('.....',all_test_episode_length[i_run-1, i_eposide-1]) 169 | print(np.amax(all_test_episode_length)) 170 | 171 | if not os.path.exists('./part4_neural_300/'): 172 | 173 | os.mkdir('./part4_neural_300/') 174 | saver.save(sess, "./part4_neural_300/") 175 | print('saved') 176 | 177 | 178 | ### save and plot performance during training and tes #### 179 | outfile1=all_total_reward 180 | outfile2=all_episode_length 181 | outfile3=all_train_loss 182 | outfile4=all_test_reward 183 | outfile5=all_test_episode_length 184 | 185 | 186 | np.save('part4_train_reward_300', outfile1) 187 | np.save('part4_train_eposide_length_300',outfile2) 188 | 189 | np.save('part4_train_loss_300', outfile3) 190 | np.save('part4_test_reward_300', outfile4) 191 | np.save('part4_test_length_300', outfile5) 192 | -------------------------------------------------------------------------------- /CartPole/online-Q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/readme: -------------------------------------------------------------------------------- 1 | 运行 py文件是 请将train_data_2 放到相应的文件夹, 或者自己改成相应的路径 2 | 3 | train data 2 由 offline Q Learning文件夹中的 cartpole_3_collect_data 生成 4 | 5 | ......saved.py 文件用于训练 和存储 模型 6 | ...... 
load.py 文件用于 加载存储好的模型参数, 用于测试训练好的模型效果 7 | -------------------------------------------------------------------------------- /CartPole/target-parameter/cartpole_7_target_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | env = gym.make('CartPole-v0') 10 | env._max_episode_steps = 300 11 | 12 | print("......Loading train_data......") 13 | 14 | 15 | train_data=np.load('train_data_2.npy') 16 | 17 | #### set variable and parameters #### 18 | x1 = tf.placeholder(tf.float32, shape=[None, 4]) 19 | x2 = tf.placeholder(tf.float32, shape=[None, 4]) 20 | x3 = tf.placeholder(tf.float32, shape=[None, 2]) 21 | x4 = tf.placeholder(tf.float32, shape=[None]) 22 | 23 | batch_size = 128 24 | discount = 0.99 25 | learn_rate = 0.0001 26 | input_size = 4 27 | hidden_size = 100 28 | output_size = 2 29 | max_eposide_length = 300 30 | eplison = 0.05 31 | 32 | Weight_1 = tf.Variable(tf.truncated_normal(shape=[input_size, hidden_size])) 33 | Weight_2 = tf.Variable(tf.truncated_normal(shape=[hidden_size, output_size])) 34 | Bias_1 = tf.Variable(tf.constant(0.1, shape=[hidden_size])) 35 | Bias_2 = tf.Variable(tf.constant(0.1, shape=[output_size])) 36 | 37 | Weight_old_1=tf.placeholder(tf.float32,shape=[input_size,hidden_size]) 38 | Weight_old_2=tf.placeholder(tf.float32,shape=[hidden_size,output_size]) 39 | Bias_old_1=tf.placeholder(tf.float32,shape=[hidden_size]) 40 | Bias_old_2=tf.placeholder(tf.float32,shape=[output_size]) 41 | 42 | 43 | middle_now = tf.matmul(x1, Weight_1) + Bias_1 44 | prediction_No = tf.nn.relu(middle_now) 45 | prediction_now = tf.matmul(prediction_No, Weight_2) + Bias_2 46 | 47 | middle_next = tf.matmul(x2, Weight_old_1) + Bias_old_1 48 | prediction_Ne = tf.nn.relu(middle_next) 49 | prediction_next = tf.matmul(prediction_Ne, Weight_old_2) + Bias_old_2 50 | # 51 | 52 | True_action = tf.cast(x3, tf.int32) 53 | test_action = tf.cast(tf.argmax(prediction_now, 1), tf.int32) 54 | Q_value = tf.gather_nd(prediction_now, True_action) 55 | 56 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 57 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 58 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 59 | 60 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 61 | 62 | saver = tf.train.Saver() 63 | 64 | 65 | with tf.device('/cpu:0'): 66 | with tf.Session() as sess: 67 | ## reload the weights ### 68 | saver.restore(sess, './part7_target/') 69 | eposide_length = [] 70 | expected_value = [] 71 | all_eposide_length = np.zeros((1, 10)) 72 | all_reward = np.zeros((1, 100)) 73 | 74 | for i_episode in range(10): 75 | 76 | observation_init = env.reset() 77 | observation_init = [observation_init] 78 | 79 | for t in range(300): 80 | 81 | if t == 0: 82 | 83 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 84 | 85 | 86 | observation_curr, reward_curr, done, info = env.step(Action[0]) 87 | 88 | observation_next = [observation_curr] 89 | else: 90 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 91 | 92 | observation_curr, reward_curr, done, info = env.step(Action[0]) 93 | observation_next = [observation_curr] 94 | 95 | if done is True: 96 | 97 | eposide_length.append(t + 1) 98 | reward = -1 99 | reward_return = reward * (discount ** (t)) 100 | expected_value.append(reward_return) 101 | 102 | break 103 | all_eposide_length[0, 
i_episode] = t + 1 104 | all_reward[0, i_episode] = reward_return 105 | 106 | all_eposide_length = np.mean(all_eposide_length, axis=0) 107 | all_reward = np.mean(all_reward, axis=0) 108 | 109 | 110 | 111 | print('the mean of episode length', np.mean(eposide_length)) 112 | print('the mean of reward ',np.mean(expected_value)) 113 | 114 | print('the standard deviation of episode length', np.std(eposide_length)) 115 | plt.plot(all_eposide_length) 116 | plt.xlabel('Num of episode') 117 | plt.ylabel('length of eposide') 118 | plt.show() 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /CartPole/target-parameter/cartpole_7_target_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | print("......Loading train_data......") 10 | 11 | train_data=np.load('train_data_2.npy') 12 | 13 | #### set variable and parameters #### 14 | x1 = tf.placeholder(tf.float32, shape=[None, 4]) 15 | x2 = tf.placeholder(tf.float32, shape=[None, 4]) 16 | x3 = tf.placeholder(tf.float32, shape=[None, 2]) 17 | x4 = tf.placeholder(tf.float32, shape=[None]) 18 | 19 | batch_size = 128 20 | discount = 0.99 21 | learn_rate = 0.0001 22 | input_size = 4 23 | hidden_size = 100 24 | output_size = 2 25 | max_eposide_length = 300 26 | eplison = 0.05 27 | 28 | Weight_1 = tf.Variable(tf.truncated_normal(shape=[input_size, hidden_size])) 29 | Weight_2 = tf.Variable(tf.truncated_normal(shape=[hidden_size, output_size])) 30 | Bias_1 = tf.Variable(tf.constant(0.1, shape=[hidden_size])) 31 | Bias_2 = tf.Variable(tf.constant(0.1, shape=[output_size])) 32 | 33 | ### the old weights ### 34 | Weight_old_1=tf.placeholder(tf.float32,shape=[input_size,hidden_size]) 35 | Weight_old_2=tf.placeholder(tf.float32,shape=[hidden_size,output_size]) 36 | Bias_old_1=tf.placeholder(tf.float32,shape=[hidden_size]) 37 | Bias_old_2=tf.placeholder(tf.float32,shape=[output_size]) 38 | 39 | 40 | middle_now = tf.matmul(x1, Weight_1) + Bias_1 41 | prediction_No = tf.nn.relu(middle_now) 42 | prediction_now = tf.matmul(prediction_No, Weight_2) + Bias_2 43 | 44 | ### calculate the target value by old weights ### 45 | middle_next = tf.matmul(x2, Weight_old_1) + Bias_old_1 46 | prediction_Ne = tf.nn.relu(middle_next) 47 | prediction_next = tf.matmul(prediction_Ne, Weight_old_2) + Bias_old_2 48 | 49 | 50 | True_action = tf.cast(x3, tf.int32) 51 | test_action = tf.cast(tf.argmax(prediction_now, 1), tf.int32) 52 | Q_value = tf.gather_nd(prediction_now, True_action) 53 | 54 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 55 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 56 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 57 | 58 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 59 | 60 | saver = tf.train.Saver() 61 | 62 | with tf.device('/cpu:0'): 63 | eposide_size = 2000 64 | run_size = 1 65 | all_episode_length = np.zeros((run_size, int(eposide_size))) 66 | all_total_reward = np.zeros((run_size, int(eposide_size))) 67 | all_test_episode_length = np.zeros((run_size, int(eposide_size))) 68 | all_test_reward = np.zeros((run_size, int(eposide_size / 20))) 69 | all_train_loss = np.zeros((run_size, int(eposide_size / 20))) 70 | 71 | length_of_train = len(train_data) 72 | for i_run in range(1, run_size + 1): 73 | ### build the experience replay ### 74 | 75 | 
buffer_size = 1024 76 | mini_batch_size = 64 77 | 78 | length_of_train = len(train_data) 79 | buffer_sample = random.sample(range(0, length_of_train), buffer_size) 80 | buffer_replay = train_data[buffer_sample] 81 | 82 | buffer_observation_now = [] 83 | buffer_observation_next = [] 84 | buffer_action = [] 85 | buffer_reward = [] 86 | 87 | for i_sele in range(buffer_size): 88 | buffer_observation_now.append(buffer_replay[i_sele][0]) 89 | buffer_observation_next.append(buffer_replay[i_sele][1]) 90 | buffer_reward.append(buffer_replay[i_sele][2]) 91 | buffer_action.append(buffer_replay[i_sele][3]) 92 | 93 | with tf.Session() as sess: 94 | 95 | sess.run(tf.global_variables_initializer()) 96 | 97 | for i_eposide in range(1, 1 + eposide_size): 98 | 99 | observation_0 = env.reset() 100 | 101 | total_QQ_loss = 0 102 | 103 | for i_step in range(max_eposide_length): 104 | ### hold the old weights for target calculation ### 105 | if ((i_eposide - 1) % 5 == 0): 106 | old_weight_1, old_weight_2, old_bias_1, old_bias_2 = sess.run([Weight_1, Weight_2, Bias_1, Bias_2]) 107 | 108 | if np.random.random() <= eplison: 109 | action_train = np.random.randint(2) 110 | else: 111 | Q = sess.run(test_action, feed_dict={x1: np.reshape(observation_0, [1, 4])}) 112 | action_select_now = int(Q) 113 | 114 | 115 | observation_1, _, done_0, _ = env.step(action_select_now) 116 | 117 | if done_0: 118 | reward = -1 119 | else: 120 | reward = 0 121 | 122 | ### add new data to replay memory## 123 | buffer_observation_now = np.append(buffer_observation_now, np.reshape(observation_0, [1, 4]), 124 | axis=0) 125 | buffer_observation_next = np.append(buffer_observation_next, np.reshape(observation_1, [1, 4]), 126 | axis=0) 127 | buffer_action = np.append(buffer_action, [action_select_now], axis=0) 128 | buffer_reward = np.append(buffer_reward, [reward], axis=0) 129 | 130 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 131 | 132 | _, loss_train = sess.run([train_optimizer, q_loss], 133 | feed_dict={x1: buffer_observation_now[this_batch, :], 134 | x2: buffer_observation_next[this_batch, :], 135 | x3: np.concatenate((np.reshape( 136 | np.arange(mini_batch_size), 137 | [mini_batch_size, 1]), np.reshape( 138 | buffer_action[this_batch], 139 | [mini_batch_size, 1])), axis=1) 140 | , x4: buffer_reward[this_batch], 141 | Weight_old_1:old_weight_1, 142 | Weight_old_2:old_weight_2, 143 | Bias_old_1:old_bias_1, 144 | Bias_old_2:old_bias_2, 145 | }) 146 | total_QQ_loss += loss_train 147 | 148 | observation_0 = observation_1 149 | 150 | if (i_eposide - 1) % 20 == 0: 151 | if done_0 is True: 152 | if i_step + 1 == 300: 153 | report_reward = 0 154 | else: 155 | report_reward = -1 * discount ** (i_step) 156 | 157 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 158 | all_total_reward[i_run - 1, i_eposide - 1] = report_reward 159 | 160 | ### record average test performance ### 161 | test_size = 10 162 | Small_test_eposide_length = np.zeros((1, test_size)) 163 | Small_test_reward = np.zeros((1, test_size)) 164 | 165 | for i_test_run in range(1, test_size + 1): 166 | observation_test_0 = env.reset() 167 | 168 | for i_test_length in range(max_eposide_length): 169 | action_test_now = test_action.eval( 170 | feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 171 | action_test_now = int(action_test_now) 172 | observation_test_1, _, test_done, test_info = env.step(action_test_now) 173 | 174 | observation_test_0 = observation_test_1 175 | 176 | if test_done is True: 177 | if i_test_length + 1 == 300: 178 | 
reward_test = 0 179 | else: 180 | reward_test = -1 181 | Small_test_eposide_length[0, i_test_run - 1] = i_test_length + 1 182 | Small_test_reward[0, i_test_run - 1] = reward_test * ( 183 | discount ** (i_test_length)) 184 | 185 | 186 | break 187 | 188 | small_mean_test_length = np.mean(np.mean(Small_test_eposide_length, axis=0), axis=0) 189 | small_mean_test_reward = np.mean(np.mean(Small_test_reward, axis=0), axis=0) 190 | print('the ith running',i_run,'the ith eposide', i_eposide - 1, 'the test_average_length', 191 | small_mean_test_length, 192 | 'the total_test_length ', Small_test_eposide_length, '..loss..', 193 | total_QQ_loss / (i_step + 1)) 194 | all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_length 195 | # print((i_eposide-1)/20) 196 | # print(int((i_eposide-1)/20)) 197 | all_test_reward[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_reward 198 | all_train_loss[i_run - 1, int((i_eposide - 1) / 20)] = total_QQ_loss / (i_step + 1) 199 | 200 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 201 | all_test_episode_length): 202 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 203 | print(np.amax(all_test_episode_length)) 204 | if not os.path.exists('./part7_target/'): 205 | os.mkdir('./part7_target/') 206 | saver.save(sess, "./part7_target/") 207 | print('saved') 208 | 209 | break 210 | else: 211 | if done_0 is True: 212 | reward = -1 213 | 214 | final_reward = reward * discount ** (i_step) 215 | 216 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 217 | all_total_reward[i_run - 1, i_eposide - 1] = final_reward 218 | 219 | break 220 | 221 | 222 | outfile1 = all_total_reward 223 | outfile2 = all_episode_length 224 | outfile3 = all_train_loss 225 | outfile4 = all_test_reward 226 | outfile5 = all_test_episode_length 227 | 228 | np.save('reward_data_train_part7', outfile1) 229 | np.save('length_data_train_part7', outfile2) 230 | 231 | np.save('loss_data_train_part7', outfile3) 232 | np.save('length_data_test_part7', outfile4) 233 | np.save('reward_data_test_part7', outfile5) 234 | 235 | mean_episode_len = np.mean(all_episode_length, axis=0) 236 | mean_total_reward = np.mean(all_total_reward, axis=0) 237 | mean_loss_train = np.mean(all_train_loss, axis=0) 238 | mean_test_eposide_length = np.mean(all_test_episode_length, axis=0) 239 | mean_test_reward = np.mean(all_test_reward, axis=0) 240 | 241 | std_episode_len = np.std(all_episode_length, axis=0) 242 | std_total_reward = np.std(all_total_reward, axis=0) 243 | -------------------------------------------------------------------------------- /CartPole/target-parameter/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/three-random-episode/3_random_episode.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | env = gym.make('CartPole-v0') 5 | 6 | ### set parameters and set ### 7 | discount_factor=0.99 8 | eposide_length=[] 9 | expect_value=[] 10 | 11 | for i_episode in range(3): 12 | 13 | observation_init = env.reset() 14 | for t in range(300): 15 | env.render() 16 | ### select action by uniform distribution ### 17 | action= np.random.uniform(0,1,1) 18 | action=np.round(action) 19 | action=int(action) 20 | 21 | observation, reward, done, info = env.step(action) 22 | 23 | if done: 24 | ### when each eposide ended record the 
return and eposide's length 25 | reward=-1 26 | reward_return=reward*(discount_factor**(t)) 27 | expect_value.append(reward_return) 28 | 29 | print("Episode length is {} ".format(t+1)) 30 | eposide_length.append(t+1) 31 | 32 | break 33 | 34 | print("the trajectories'length :",eposide_length) 35 | print("the return from the starting state:",expect_value) 36 | -------------------------------------------------------------------------------- /CartPole/three-random-episode/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/train_data_2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/train_data_2.npy -------------------------------------------------------------------------------- /learning_curve/Capture_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/Capture_1.JPG -------------------------------------------------------------------------------- /learning_curve/DQN_PICTURE.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/DQN_PICTURE.JPG -------------------------------------------------------------------------------- /learning_curve/MsPacman0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/MsPacman0.png -------------------------------------------------------------------------------- /learning_curve/MsPacman301.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/MsPacman301.png -------------------------------------------------------------------------------- /learning_curve/Pong19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/Pong19.png -------------------------------------------------------------------------------- /learning_curve/Pong256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/Pong256.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_linear_0.001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_linear_0.001_length.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_linear_0.001_reward.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_linear_0.001_reward.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_neural_0.0001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_neural_0.0001_length.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_neural_0.0001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_neural_0.0001_reward.png -------------------------------------------------------------------------------- /learning_curve/boxing0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing0.png -------------------------------------------------------------------------------- /learning_curve/boxing313.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing313.png -------------------------------------------------------------------------------- /learning_curve/boxing_128_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing_128_128.png -------------------------------------------------------------------------------- /learning_curve/boxing_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing_28_28.png -------------------------------------------------------------------------------- /learning_curve/double_Q_learning_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/double_Q_learning_length.png -------------------------------------------------------------------------------- /learning_curve/double_Q_learning_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/double_Q_learning_reward.png -------------------------------------------------------------------------------- /learning_curve/experience_replay_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/experience_replay_length.png 
-------------------------------------------------------------------------------- /learning_curve/experience_replay_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/experience_replay_reward.png -------------------------------------------------------------------------------- /learning_curve/mapacman_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/mapacman_28_28.png -------------------------------------------------------------------------------- /learning_curve/mspacman_128_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/mspacman_128_128.png -------------------------------------------------------------------------------- /learning_curve/mspacman_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/mspacman_28_28.png -------------------------------------------------------------------------------- /learning_curve/online_Q_learning_neural_0.001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/online_Q_learning_neural_0.001_length.png -------------------------------------------------------------------------------- /learning_curve/online_Q_learning_neural_0.001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/online_Q_learning_neural_0.001_reward.png -------------------------------------------------------------------------------- /learning_curve/pong_128_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/pong_128_128.png -------------------------------------------------------------------------------- /learning_curve/pong_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/pong_28_28.png -------------------------------------------------------------------------------- /learning_curve/readme: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /learning_curve/target_parameter_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/target_parameter_length.png -------------------------------------------------------------------------------- 
/learning_curve/target_parameter_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/target_parameter_reward.png -------------------------------------------------------------------------------- /readme.md: --------------------------------------------------------------------------------

# Classic reinforcement-learning algorithms on CartPole and several Atari games

# Installing the OpenAI Gym environment

------

##### **Assuming Python 3.5+ is already installed, gym can be installed in either of the following two ways**

```
pip install gym
```

Or:

```
git clone https://github.com/openai/gym
cd gym
pip install -e .
```

This basic installation is already enough to run some simple games such as CartPole.

Run the following snippet to verify that gym was installed successfully:

```
import gym
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample())  # take a random action
```

You can also install additional environment packages by appending the environment name in brackets to the last command of the second installation method:

```
pip install -e .[names]
```

In particular, setting `names` to `all` installs every environment, which requires extra dependencies such as cmake and a recent version of pip. Since we want the Atari environments here, and atari_py and cmake are often missing on personal machines,

**we can configure the Atari environment with the following steps (Windows):**

```
# upgrade pip
python -m pip install --upgrade pip
# install atari_py
pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py

git clone https://github.com/openai/gym
cd gym

# install cmake
pip install cmake

pip install -e .[atari]
```

> More documentation on the gym environments: http://gym.openai.com/docs/
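As a quick sanity check that the Atari installation works, a short random-action rollout along the lines below should open a Pong window. This is only a sketch: it assumes the `Pong-v0` environment used later in this project and a machine with a display, so drop `env.render()` on a headless server.

```
import gym

# Smoke test: load an Atari environment and play random actions for a few steps.
env = gym.make('Pong-v0')
obs = env.reset()
for _ in range(200):
    env.render()                                     # requires a display
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```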
# Training Deep-Q-Network models with a Google Colab GPU

Unlike CartPole, whose observations and outputs are small numeric vectors, the Atari environments return RGB frames (typically of shape (210, 160, 3)) and have larger discrete action spaces, so effective exploration requires far more computation. A large experience replay buffer also puts heavy demands on memory, which makes experiments on an ordinary CPU very time-consuming. Since Google released a free GPU service in January this year, attached to Google Drive and therefore convenient for file management, we use it to train our models.

### Obtaining Google Colab authorization:

You need to follow the displayed URL twice, log in to your Google account to authorize access, and paste the resulting keys back into the notebook to connect.

In Colab, a command prefixed with `!` is executed as if it were typed in a terminal, as shown below:

![](learning_curve/Capture_1.JPG)

```
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

from google.colab import auth
auth.authenticate_user()

from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()

import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
```

```
!mkdir -p drive
!google-drive-ocamlfuse drive
```

### Installing the classic Gym and Atari environments

Install the basic gym environment directly:

```
!pip install gym
```

Install the Atari dependencies swig and cmake:

```
!apt install git
!git clone https://github.com/openai/gym.git
cd gym
!apt-get install cmake
!apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
# then install the Atari environments #
!pip install -e '.[atari]'
```

You can now test the gym Atari environments directly in a Colab notebook.

[More Colab setup notes](https://www.234du.com/1154.html)

# Project implementation steps

------

## The Cart-Pole game:

We first implement and debug several classic reinforcement-learning algorithms, including batch (offline) Q-learning, online Q-learning, Deep Q-Network and Double Q-learning, and test each of them on the classic Cart-Pole balancing game.

Because the Cart-Pole environment returns a four-dimensional observation at every step, a small feed-forward neural network is sufficient as the action-value function approximator.

In the Atari games, however, the environment responds to an action with an image of the current game screen, so we replace the feed-forward network with a convolutional neural network as the action-value approximator and plug it into the algorithms listed above.

### Random Policy

For convenience we fix the discount factor to 0.99, set the reward to -1 on the last step of an episode and 0 otherwise, and cap the episode length at 300 steps with `env._max_episode_steps = 300`. (These settings apply to Cart-Pole only.)

CartPole has only two actions, 0 and 1. We first run a purely random policy to get familiar with the environment and to measure the average episode length and return.

The scripts are in the folders three-random-episode and hundred-random-episode.

You can call `env.render()` to watch the game as it runs. The average episode length and return are roughly 22 and -0.81 respectively.

### batch (offline) Q-learning

We first collect 2000 episodes of data under the random policy and then learn to control the pole purely from this fixed dataset by training the action-value function directly. The action-value function is represented either by a linear transformation or by a feed-forward network with a single hidden layer of 100 units. We run 5000 training updates in total, each on a batch of 1000 samples, with the Adam optimizer; the learning rates are 0.001 for the linear model and 0.0001 for the neural network (see the curves below).

We find that, compared with the feed-forward network, the linear action-value function learns to keep the pole up for 300 steps more quickly during training, but it overfits very easily; the feed-forward network learns more steadily and reaches a better final result.

***learning rate = 0.001, linear transformation***

![](learning_curve/batch_Q_learning_linear_0.001_length.png)

![](learning_curve/batch_Q_learning_linear_0.001_reward.png)

***learning rate = 0.0001, hidden layer (100) { linear transformation + ReLU }***

![](learning_curve/batch_Q_learning_neural_0.0001_length.png)

![](learning_curve/batch_Q_learning_neural_0.0001_reward.png)

***Remarks:*** Because Cart-Pole is a fairly simple task, the linearly approximated action-value function reaches the target in fewer training steps and is less sensitive to parameter initialization, but it overfits very easily and the amount of training is hard to tune. The single-hidden-layer feed-forward network, by contrast, needs more training steps to reach the target but learns more stably than the linear approximation and converges to a better final result.
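For reference, the batch update described above can be sketched roughly as follows. This is not the repo's code: it assumes the 2000 random episodes have already been flattened into NumPy arrays `states`, `actions`, `rewards`, `next_states` and `dones` (1.0 at terminal steps), and all variable names are illustrative.

```
import numpy as np
import tensorflow as tf

# Single-hidden-layer (100 units) Q-network for CartPole's 4-d state and 2 actions.
state_ph  = tf.placeholder(tf.float32, [None, 4])
action_ph = tf.placeholder(tf.int32,   [None])
target_ph = tf.placeholder(tf.float32, [None])

hidden   = tf.layers.dense(state_ph, 100, activation=tf.nn.relu)
q_values = tf.layers.dense(hidden, 2)                 # Q(s, .) for both actions
q_taken  = tf.reduce_sum(q_values * tf.one_hot(action_ph, 2), axis=1)

loss     = tf.reduce_mean(tf.square(target_ph - q_taken))
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(5000):                          # 5000 updates, 1000 samples each
        idx = np.random.randint(len(states), size=1000)
        # Bootstrapped target r + gamma * max_a Q(s', a), with no bootstrap at terminal steps.
        q_next  = sess.run(q_values, {state_ph: next_states[idx]})
        targets = rewards[idx] + 0.99 * np.max(q_next, axis=1) * (1.0 - dones[idx])
        sess.run(train_op, {state_ph: states[idx],
                            action_ph: actions[idx],
                            target_ph: targets})
```

Because the target is computed outside the graph and fed in through a placeholder, no gradient flows into the bootstrap term; when the target is built inside the graph instead, `tf.stop_gradient` plays the same role (as noted in the online Q-learning section below).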
### online Q-learning

From here on we use only neural networks to approximate the action-value function.

In offline Q-learning the feed-forward network is trained on a fixed, pre-collected dataset and the amount of data fed in per update can be chosen freely; too many updates or too large a batch easily leads to overfitting. Here, instead, each update uses only the most recent transition of the episode currently being played, i.e. the parameters of the action-value function are updated from a single sample at a time.

To help the model learn better and faster, during training we use epsilon-greedy Q-learning with epsilon = 0.05, i.e. a random action is taken with probability 0.05; during testing the action given by the value function is always used.

Based on the earlier experiments, the learning rate and optimizer are 0.001 and Adam; the other settings are the same as for the single-hidden-layer feed-forward network above. In addition, to reduce the bias introduced by parameter initialization, we train one hundred models and report the average episode length and return.

***Learning curves for online Q-learning***

![](learning_curve/online_Q_learning_neural_0.001_length.png)

![](learning_curve/online_Q_learning_neural_0.001_reward.png)

***Remarks***: Although the average episode length of about 120 steps is still far from the target of 300, the learning curves show that the model keeps learning to control the pole better and would gradually approach the target with more training time. Compared with offline Q-learning, online Q-learning learns more stably, although it takes more training time.

PS: Note that with the automatic gradient computation in TensorFlow, you must apply a stop-gradient operation to avoid adapting the learning target.

### Different Neural Size

Here we test the performance of online Q-learning with different hidden-layer sizes:

neural size = 30

or

neural size = 1000

### Experience Replay and Target Parameter

Deep Q-Network is a reinforcement-learning model proposed in recent years. Compared with traditional Q-learning it adds two important mechanisms: experience replay and a fixed set of target-network parameters.

The NIPS DQN adds an experience replay pool on top of basic deep Q-learning: transitions generated during training are stored and later sampled at random, which reduces the correlation between training samples and improves performance. The Nature DQN then makes a further improvement by adding a target Q network: the target Q value is computed by a dedicated target network rather than by the Q network that is currently being updated.

The purpose is to reduce the correlation between the target computation and the current values.

In the Nature DQN loss, L(θ) = E[(r + γ max_a' Q(s', a'; θ⁻) - Q(s, a; θ))²], the target Q value is computed with a separate set of parameters θ⁻. In the NIPS version, by contrast, the target network tracks the Q network in real time, so the target Q value is strongly correlated with the current Q value, which easily leads to over-estimation.

Hence a separate target Q network is used. Where do its parameters come from? Still from the Q network, but with a delayed update: every so often the current Q-network parameters are copied into the target Q network.

**Adding only Experience Replay to Q-learning gives the following results:**

![](learning_curve/experience_replay_length.png)

![](learning_curve/experience_replay_reward.png)

***Adding only the Target Parameter mechanism to Q-learning***

![](learning_curve/target_parameter_length.png)

![](learning_curve/target_parameter_reward.png)

***Remarks***: The learning curves show that each mechanism on its own already improves learning performance considerably, because they respectively reduce the correlation between training samples and the correlation between the Q values and the target Q values, lowering the risk of overfitting and over-estimation. It is therefore reasonable to expect an even larger improvement when both mechanisms are combined.

### Double Q-learning

In the target-parameter setup above, the target Q value and the current Q value come from the same Q network, just with parameters updated at different frequencies.

In double Q-learning, to reduce the bias introduced by the max operation in the target Q value, also known as the over-estimation problem, we train two networks: the current Q network selects the action and the target Q network evaluates the target Q value. The algorithm is shown below:

![](learning_curve/DQN_PICTURE.JPG)

***Learning curves for Double Q-learning***

![](learning_curve/double_Q_learning_length.png)

![](learning_curve/double_Q_learning_reward.png)

***Remarks***: Reducing the over-estimation in the target Q computation also appears to have a positive effect on the algorithm's performance.
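The selection/evaluation split that defines double Q-learning can be summarised in a few lines of NumPy. This is only a sketch with illustrative names: `q_online_next` and `q_target_next` stand for the two networks' Q-value outputs on the next states.

```
import numpy as np

def double_q_targets(rewards, dones, q_online_next, q_target_next, discount=0.99):
    """Double Q-learning targets for a batch of transitions.

    q_online_next, q_target_next: arrays of shape [batch, n_actions] with
    Q(s', .) under the online network and the target network respectively.
    """
    # The online network selects the greedy action ...
    best_actions = np.argmax(q_online_next, axis=1)
    # ... but the target network evaluates it.
    q_eval = q_target_next[np.arange(len(best_actions)), best_actions]
    # No bootstrapping on terminal transitions.
    return rewards + discount * q_eval * (1.0 - dones)
```

Compared with the single-network target r + discount * max_a Q_target(s', a), decoupling action selection from action evaluation is what reduces the over-estimation bias discussed above.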
## Atari games (Pong, Boxing, MsPacman):

Here we look at how the classic reinforcement-learning algorithms perform on the Atari games Pong, MsPacman and Boxing. Accordingly, we replace the feed-forward network and linear transformation above with a CNN as the action-value approximator.

***Network setup and data preprocessing***

Each frame returned by the environment is converted to a 64×64×1 grayscale image, and every four processed frames are stacked together (64×64×4) and stored as tf.uint8.

(In the Atari games above each action is repeated K times, with K sampled randomly from [2, 3, 4], to keep the setting realistic.)

The environment names are "Pong-v0", "MsPacman-v0" and "Boxing-v0" (without -ram).

The Q-value function is approximated with a convolutional neural network:

the first layer uses 6×6 filters with stride 2 and 16 channels, followed by a ReLU activation;

the second layer uses 4×4 filters with stride 2 and 32 channels, followed by a ReLU activation;

the third layer is a fully connected layer with 256 units, followed by a ReLU;

finally a linear transformation predicts the state-action values, with one output per action.

epsilon rate = 0.1, discount factor = 0.99, environment rewards clipped to -1, 0 or 1, minibatch size = 32.

The optimizer is RMSProp with learning rate 0.001.

Experience replay (at least 100000 transitions) and a target network (updated every 5 training episodes) are added to Q-learning.

### Random Policy

We first run a random policy and record the average return and episode length for each game.

### CNN + DQN for the three games

##### MsPacman:

In MsPacman a character moves around a fixed maze, avoiding enemies while collecting points. Each time it is caught by an enemy it loses one of its three lives; there is no time limit, and the goal is to score as many points as possible.

MsPacman's action space has 9 discrete actions (0 to 8), and the rewards given by gym take values in [0, 10, 50].

For simplicity, our code clips the reward to 0 or 1: whenever the score increases, the reward is set to 1 regardless of whether the raw reward was 10 or 50, and otherwise the reward is 0.

However, the game also lets us detect when a life is lost by reading the `info` dict returned at every step, e.g. {'ale.lives': 3}, whose value is the number of remaining lives. We can therefore set the reward to -1 whenever a life is lost, which makes the reward signal more informative (see the sketch in the results section below).

The game looks as follows:

![](learning_curve/MsPacman0.png)

![](learning_curve/MsPacman301.png)

##### Pong:

In Pong two differently coloured paddles play a table-tennis-like game: the red paddle is controlled by the computer and the green one by our agent. The ball starts moving between the two sides after an initial impulse, while the paddles can only move up and down, changing the ball's direction on contact. A side loses the point when it fails to return the ball past its baseline, and the other side then scores one point, shown above its half of the screen. The first side to reach 21 points wins; there is no time limit.

The agent's action space has 6 discrete actions (0 to 5), and the rewards given by gym take values in [-1, 0, 1], which already matches our setting.

The game looks as follows:

![](learning_curve/Pong19.png)

![](learning_curve/Pong256.png)

##### Boxing:

In Boxing two boxers fight on a stage: the black boxer is controlled by the computer and the white boxer is our agent. The goal is to win the match (score more points than the opponent) within 2 minutes. The boxers can move freely around the stage and throw punches. The action space has 18 discrete actions (0 to 17). The rewards are more varied than in the previous two games, taking values in [-1, -2, 0, 1, 2], which better reflects a boxing match (e.g. heavy punches score more), but for simplicity we again clip them to [-1, 0, 1].

The match looks as follows:

![](learning_curve/boxing0.png)

![](learning_curve/boxing313.png)

### Experimental results and suggestions

In the code for the three games we clip the reward to 1, 0 or -1, and we use tf.stop_gradient when computing the target Q value:

```
tf.add(x3 + discount * tf.stop_gradient((1+x3)*Max_Q_value_next), (-1 * Q_value))
```
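The life-loss penalty described in the MsPacman section can be implemented with a small helper along these lines (a sketch only; `shape_reward` and its arguments are illustrative names, and only `info['ale.lives']` comes from gym itself):

```
def shape_reward(raw_reward, info, lives_before):
    """Clip the reward to -1, 0 or 1: -1 when a life is lost (MsPacman),
    +1 for any score increase, 0 otherwise."""
    lives_now = info.get('ale.lives', lives_before)
    if lives_now < lives_before:
        reward = -1.0              # caught by an enemy: penalise the lost life
    elif raw_reward > 0:
        reward = 1.0               # any positive score change (10 or 50) counts as +1
    else:
        reward = 0.0
    return reward, lives_now

# Typical use inside the environment loop:
# obs, raw_reward, done, info = env.step(action)
# reward, lives = shape_reward(raw_reward, info, lives)
```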
Compared with training CartPole with DQN, the Atari games need far more training time:

CartPole: on a CPU, about 20 minutes of training already gives a fairly stable and effective agent.

Atari: on a Colab GPU, one episode takes a bit over a minute, and roughly 1000 episodes of training are needed before the agent shows reasonably good performance. During training the loss drops fairly quickly, but the test score (reward) only starts to show a gradual upward trend after several hundred thousand steps.

Because of the large amount of training required, we suggest evaluating the model roughly every 50000 steps. It is also worth trying different reshape sizes; for example, (28, 28) and (128, 128) differ enormously in image clarity, as shown below:

Boxing:

![Boxing_28_28](learning_curve/boxing_28_28.png)

![Boxing_128_128](learning_curve/boxing_128_128.png)

Pong:

![Pong_28_28](learning_curve/pong_28_28.png)

![Pong_128_128](learning_curve/pong_128_128.png)

MsPacman:

![MsPacman_28_28](learning_curve/mspacman_28_28.png)

![MsPacman_128_128](learning_curve/mspacman_128_128.png)

If your machine allows it, you can also build a larger experience replay buffer.

------

# Related links:

[Atari + Deep RL](https://arxiv.org/abs/1312.5602)

[Double Q-learning](http://papers.nips.cc/paper/3964-double-q-learning)

[Deep RL with Double Q-Learning](https://arxiv.org/abs/1509.06461)

[Human-level control through deep reinforcement learning](https://www.nature.com/articles/nature14236)

--------------------------------------------------------------------------------