├── Atari ├── DQN_Boxing.py ├── DQN_MsPacman.py ├── DQN_Pong.py ├── cnn-100-random-episode │ ├── delete │ ├── part1_boxing_a_score.npy │ ├── part1_boxing_b_score.npy │ ├── part1_boxing_length.npy │ ├── part1_pong_a_score.npy │ ├── part1_pong_b_score.npy │ ├── part1_pong_length.npy │ ├── part_b_part1_MsPacman.py │ ├── part_b_part1_boxing.py │ └── part_b_part1_pong.py ├── cnn-untrained-Q-network │ ├── Mspacman │ │ └── part2_MsPacman_length.npy │ ├── boxing │ │ ├── 41.png │ │ ├── 42.png │ │ ├── 43.png │ │ ├── 44.png │ │ ├── check_data.py │ │ ├── cnn_for_boxing.py │ │ ├── delete │ │ ├── part2_boxing_a_score.npy │ │ ├── part2_boxing_b_score.npy │ │ ├── part2_boxing_difference_score.npy │ │ └── part2_boxing_length.npy │ └── pong │ │ ├── 21.png │ │ ├── 22.png │ │ ├── 23.png │ │ ├── check_data.py │ │ ├── cnn_for_pong.py │ │ ├── delete │ │ ├── part2_pong_a_score.npy │ │ ├── part2_pong_b_score.npy │ │ ├── part2_pong_difference_score.npy │ │ └── part2_pong_length.npy ├── pong │ ├── 22.png │ ├── 23.png │ ├── check_data.py │ ├── cnn_for_pong.py │ ├── delete │ ├── part2_pong_a_score.npy │ ├── part2_pong_b_score.npy │ ├── part2_pong_difference_score.npy │ └── part2_pong_length.npy └── readme.md ├── CartPole ├── different-neural-size-Q-learning │ ├── cartpole_5_neural_1000_load.py │ ├── cartpole_5_neural_1000_saved.py │ ├── cartpole_5_neural_30_load.py │ ├── cartpole_5_neural_30_saved.py │ └── delete ├── double-q-learning │ ├── cartpole_8_load.py │ ├── cartpole_8_saved.py │ └── delete ├── experience_replay │ ├── cartpole_6_buffer_replay_load.py │ ├── cartpole_6_buffer_replay_saved.py │ └── delete ├── hundred-random-episode │ ├── 100_random_episodes.py │ └── delete ├── offline-batch-Q-learning │ ├── batch_Q_learning_linear_0.001_length.png │ ├── batch_Q_learning_linear_0.001_reward.png │ ├── batch_Q_learning_neural_0.0001_length.png │ ├── batch_Q_learning_neural_0.0001_reward.png │ ├── cartpole_3_collect_data.py │ ├── cartpole_3_linear_4_load.py │ ├── cartpole_3_linear_4_saved.py │ ├── cartpole_3_neural_5_load.py │ ├── cartpole_3_neural_5_saved.py │ ├── check_data.py │ ├── delete │ ├── figure_1-3.png │ ├── length_data_part3_4_300.npy │ ├── loss_data_part3_4_300.npy │ └── reward_data_part3_4_300.npy ├── online-Q-learning │ ├── cartpole_4_neural_load.py │ ├── cartpole_4_neural_saved.py │ └── delete ├── readme ├── target-parameter │ ├── cartpole_7_target_load.py │ ├── cartpole_7_target_saved.py │ └── delete ├── three-random-episode │ ├── 3_random_episode.py │ └── delete └── train_data_2.npy ├── learning_curve ├── Capture_1.JPG ├── DQN_PICTURE.JPG ├── MsPacman0.png ├── MsPacman301.png ├── Pong19.png ├── Pong256.png ├── batch_Q_learning_linear_0.001_length.png ├── batch_Q_learning_linear_0.001_reward.png ├── batch_Q_learning_neural_0.0001_length.png ├── batch_Q_learning_neural_0.0001_reward.png ├── boxing0.png ├── boxing313.png ├── boxing_128_128.png ├── boxing_28_28.png ├── double_Q_learning_length.png ├── double_Q_learning_reward.png ├── experience_replay_length.png ├── experience_replay_reward.png ├── mapacman_28_28.png ├── mspacman_128_128.png ├── mspacman_28_28.png ├── online_Q_learning_neural_0.001_length.png ├── online_Q_learning_neural_0.001_reward.png ├── pong_128_128.png ├── pong_28_28.png ├── readme ├── target_parameter_length.png └── target_parameter_reward.png └── readme.md /Atari/DQN_MsPacman.py: -------------------------------------------------------------------------------- 1 | ### the environment MsPacman ## 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import 
scipy.misc 6 | import random 7 | import tensorflow as tf 8 | import time 9 | import os 10 | ### the environment MsPacman ## 11 | env = gym.make('MsPacman-v0') 12 | ### saved trained model ### 13 | def save_final_model(model): 14 | if not os.path.exists('./MsPacman_a_model/'): 15 | os.mkdir('./MsPacman_a_model/') 16 | saver = tf.train.Saver() 17 | saver.save(model, './MsPacman_a_model/model.checkpoint1') 18 | 19 | ### the function to change the image from RGB to greyscale ### 20 | def rgb2gray(rgb): 21 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 22 | 23 | ### resize the grey-scale frame to (x_size * y_size) ### 24 | def tran_size(x, x_size, y_size): 25 | output = rgb2gray(x) 26 | output = scipy.misc.imresize(output, size=[x_size, y_size]) 27 | return output 28 | 29 | ### stack the last four frames and flatten them to (x_size * y_size * 4) ### 30 | def stack(x, index, x_size, y_size): 31 | output = np.reshape([x[index - 4], x[index - 3], x[index - 2], x[index - 1]], [x_size, y_size, 4]) 32 | output = np.reshape(output, [-1, 4 * x_size * y_size]) 33 | return output 34 | 35 | ### define the weights in the CNN neural network ### 36 | def weight_variable(shape): 37 | output = tf.truncated_normal(shape, stddev=0.1) 38 | return tf.Variable(output) 39 | 40 | def bias_variable(shape): 41 | output = tf.constant(0.1, shape=shape) 42 | return tf.Variable(output) 43 | 44 | ### the convolution function #### 45 | def conv2d(input, Weight, strides): 46 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 47 | 48 | ### the cnn function ### 49 | def cnn_approximator(x, weight_convol_1, bias_convol_1, weight_convol_2, bias_convol_2, 50 | weight_flat, bias_flat, weight_out, bias_out,flat_width,flat_length,keep_prob): 51 | 52 | output_convol_1 = tf.nn.relu(conv2d(input=x, Weight=weight_convol_1, strides=[1, 2, 2, 1]) + bias_convol_1) 53 | output_convol_2 = tf.nn.relu( 54 | conv2d(input=output_convol_1, Weight=weight_convol_2, strides=[1, 2, 2, 1]) + bias_convol_2) 55 | 56 | output_reshape = tf.reshape(output_convol_2, [-1, flat_width* flat_length * 32]) 57 | output_flat = tf.matmul(output_reshape, weight_flat) + bias_flat 58 | 59 | out_drop=tf.nn.dropout(output_flat,keep_prob=keep_prob) 60 | ## apply the dropout output and the output weights passed into the function: 61 | ## re-creating weight_out/bias_out here would ignore the target-network weights fed in 62 | y = tf.matmul(out_drop, weight_out) + bias_out 63 | print('====the cnn approximator is running====') 64 | return y 65 | 66 | ### set hyperparameter and variables for the CNN approximator ### 67 | discount = 0.99 68 | learn_rate = 0.001 69 | eplison = 0.1 70 | action_space = 9 71 | width_size = 64 72 | length_size = 64 73 | flat_layer_width=int(width_size/4) 74 | flat_layer_length=int(length_size/4) 75 | 76 | ### the Q_value weights ### 77 | ## first layer ## 78 | weight_convol_1 = weight_variable([6, 6, 4, 16]) 79 | bias_convol_1 = bias_variable([16]) 80 | ## second layer ## 81 | weight_convol_2 = weight_variable([4, 4, 16, 32]) 82 | bias_convol_2 = bias_variable([32]) 83 | ## flat layer ## 84 | weight_flat = weight_variable([flat_layer_width * flat_layer_length * 32, 256]) 85 | bias_flat = bias_variable([256]) 86 | ## linear layer ## 87 | weight_out = weight_variable([256, action_space]) 88 | bias_out = bias_variable([action_space]) 89 | 90 | ### the target Q value weights placeholder ### 91 | 92 | Weight_convol_1_target= tf.placeholder(tf.float32,shape=[6, 6, 4, 16]) 93 | Bias_convol_1_target = tf.placeholder(tf.float32,shape=[16]) 94 | Weight_convol_2_target = tf.placeholder(tf.float32,shape=[4, 4, 16, 32]) 95 | Bias_convol_2_target = 
tf.placeholder(tf.float32,shape=[32]) 96 | Weight_flat_target = tf.placeholder(tf.float32,[flat_layer_width * flat_layer_length * 32, 256]) 97 | Bias_flat_target = tf.placeholder(tf.float32,shape=[256]) 98 | Weight_out_target = tf.placeholder(tf.float32,shape=[256, action_space]) 99 | Bias_out_target = tf.placeholder(tf.float32,shape=[action_space]) 100 | 101 | 102 | ### the Input Placeholder ### 103 | x1 = tf.placeholder(tf.float32, shape=[None, width_size * length_size * 4]) 104 | x2 = tf.placeholder(tf.float32, shape=[None, width_size * length_size * 4]) 105 | x3 = tf.placeholder(tf.float32, shape=[None, 1]) 106 | x4 = tf.placeholder(tf.int32, shape=[None, 1]) 107 | x5 = tf.placeholder(tf.int32, shape=[None, 1]) 108 | ## dropout ratio ## 109 | keep_prob = tf.placeholder(tf.float32) 110 | 111 | ### reshape the stacked pictures before into cnn model ### 112 | x_1_image = tf.reshape(x1, [-1, width_size, length_size,4]) 113 | x_2_image = tf.reshape(x2, [-1, width_size, length_size,4]) 114 | 115 | ### caucalate the q avlue and max _next value 116 | prediction_now = cnn_approximator(x_1_image, weight_convol_1, bias_convol_1, weight_convol_2, bias_convol_2, 117 | weight_flat, bias_flat, weight_out, 118 | bias_out,flat_layer_width,flat_layer_length,keep_prob) 119 | prediction_next = cnn_approximator(x_2_image, Weight_convol_1_target, Bias_convol_1_target, 120 | Weight_convol_2_target, Bias_convol_2_target, 121 | Weight_flat_target, Bias_flat_target, Weight_out_target, 122 | Bias_out_target,flat_layer_width,flat_layer_length,keep_prob) 123 | 124 | ### test action when test agent performance ### 125 | test_action = tf.cast(tf.argmax(prediction_now, 1), tf.int32) 126 | 127 | ### take Q value underlying the actual action ### 128 | True_action = tf.cast(x4, tf.int32) 129 | True_action = tf.reshape(True_action, shape=[-1, 1]) 130 | action_repeat = tf.reshape(tf.cast(x5, tf.int32), shape=[-1, 1]) 131 | action_double = tf.concat([action_repeat, True_action], 1) 132 | 133 | ### calcaulate the loss and training ### 134 | Q_value = tf.gather_nd(params=prediction_now, indices=action_double) 135 | Max_Q_value_next = tf.reduce_max(prediction_next, axis=1) 136 | print('......the reward is clipped .....') 137 | ## when the game break, just use reward as the Q target approximation ## 138 | delta=tf.add(x3 + discount * tf.stop_gradient((1+x3)*Max_Q_value_next), (-1 * Q_value)) 139 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 140 | train_optimizer_MsPacman = tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 141 | 142 | with tf.device('/cpu:0'): 143 | with tf.Session() as sess: 144 | start_time=time.time() 145 | print('======== build the experience replay ==========') 146 | ### set the variable and empty set ### 147 | length = [] 148 | total_score = [] 149 | experience_size = 100000 * 3 150 | number_episode_buffer = 700 151 | ### the buffer experience replay ### 152 | tran_size_buffer = [] 153 | start_time = time.time() 154 | experience_buffer = [] 155 | 156 | for i_buffer in range(number_episode_buffer): 157 | observation_0 = env.reset() 158 | if (i_buffer+1)%20 ==0: 159 | print('......the {} th episodes and information of ' 160 | 'observation_initial {}.....'.format(i_buffer+1,np.shape(observation_0))) 161 | 162 | Score = [] 163 | Action = [] 164 | for i_step in range(experience_size): 165 | ### collect data ### 166 | action = env.action_space.sample() 167 | #print('the action',action) 168 | observation_0, score, done, _ = env.step(action) 169 | observation_1 = tran_size(observation_0, width_size, 
length_size) 170 | #print('info score done _',score,done,_) 171 | Score.append(int(score)) 172 | Action.append(action) 173 | tran_size_buffer.append(observation_1) 174 | 175 | if (i_step + 1) % 4 == 0 and i_step >= 7: 176 | 177 | #print('......the step is {} the size of tran_size_buffer is {}......'.format(i_step + 1,np.shape(tran_size_buffer))) 178 | sub_example = [] 179 | sub_example.append(stack(tran_size_buffer, i_step + 1 - 4, width_size, length_size)) 180 | sub_example.append(stack(tran_size_buffer, i_step + 1, width_size, length_size)) 181 | ## clip the reward betweewn [-1 0 1] 182 | 183 | if sum(Score[i_step + 1 - 8:i_step + 1 - 4])==0: 184 | sub_example.append(0) 185 | else: 186 | sub_example.append(1) 187 | sub_example.append(Action[i_step - 4]) 188 | experience_buffer.append(sub_example) 189 | #print('.........the sub_example.....',sub_example[2:]) 190 | 191 | if done is True: 192 | final_example = [] 193 | #print('......the step is {} the size of tran_size_buffer is {}......'.format(i_step + 1, np.shape( 194 | #tran_size_buffer))) 195 | final_example.append(stack(tran_size_buffer, i_step + 1 - 4, width_size, length_size)) 196 | final_example.append(stack(tran_size_buffer, i_step + 1, width_size, length_size)) 197 | final_example.append(-1) 198 | final_example.append(Action[i_step - 4]) 199 | #print('the final_example.....', final_example) 200 | total_score.append(np.sum(Score, axis=0)) 201 | length.append(i_step + 1) 202 | experience_buffer.append(final_example) 203 | break 204 | tran_size_buffer=[] 205 | print('the information of generated experience buffer ', 206 | np.shape(experience_buffer), type(experience_buffer)) 207 | print('length of each episode',length) 208 | print('total score of each episode',total_score) 209 | print('==================the experience buffer process done ==============') 210 | print('==================the total generated time is {}===========' 211 | '======== (分)'.format((time.time()-start_time)/60)) 212 | 213 | print('######################################## starting training the DQN algorithm ' 214 | '###################################') 215 | 216 | #### training the DQN algorithm #### 217 | sess.run(tf.global_variables_initializer()) 218 | episode_number_training = 1000 219 | training_step=100000 220 | batch_size =32 221 | total_time=0 222 | total_training_step=0 223 | total_number_data=100000 224 | total_test_score=[] 225 | total_train_loss=[] 226 | print('....... the number of data points in experience buffer ........',np.shape(experience_buffer)) 227 | print('....... 
the number of data points in experience buffer ........', 228 | np.shape(experience_buffer[-1*total_number_data:])) 229 | for i_episode in range(1,episode_number_training): 230 | ### hold the old weights for target calculation in each 5 training episodes ### 231 | if ((i_episode - 1) % 5 == 0): 232 | weight_convol_1_target,bias_convol_1_target, weight_convol_2_target,bias_convol_2_target,\ 233 | weight_flat_target,bias_flat_target,weight_out_target,bias_out_target= \ 234 | sess.run([weight_convol_1, bias_convol_1, weight_convol_2, bias_convol_2, 235 | weight_flat,bias_flat,weight_out,bias_out]) 236 | print('-----------the target parameters updated----------') 237 | print('=========' 238 | '================================================' 239 | '{} th episode training is starting ================' 240 | '=========================================='.format(i_episode)) 241 | start_time = time.time() 242 | ## reset the environment at the beginning of each episode ## 243 | env.reset() 244 | ## the list to store the updating experience ## 245 | update_Score = [] 246 | update_Action=[] 247 | update_transition=[] 248 | action = env.action_space.sample() 249 | each_episode_loss=0 250 | for i_training in range(training_step): 251 | #env.render() 252 | observation_1,reward,done,info=env.step(action) 253 | update_Action.append(action) 254 | update_Score.append(reward) 255 | update_transition.append(tran_size(observation_1,width_size,length_size)) 256 | 257 | if i_training>=7 and (i_training+1)%4==0: 258 | ## update the experience ## 259 | update_Action,update_Score,update_transition=update_Action[-8:],\ 260 | update_Score[-8:],update_transition[-8:] 261 | experience_buffer=experience_buffer.tolist() 262 | 263 | if sum(update_Score[4:])==0: 264 | added_sum_score=0 265 | else: 266 | added_sum_score=1 267 | experience_buffer.append([stack(update_transition,4,width_size,length_size), 268 | stack(update_transition,8,width_size,length_size), 269 | added_sum_score,update_Action[4]]) 270 | ## keep the fixed size of experience replay ## 271 | experience_buffer=experience_buffer[-1*total_number_data:] 272 | experience_buffer=np.array(experience_buffer) 273 | ## take the randm action or greedy action ## 274 | if np.random.random() <= eplison: 275 | action = np.random.randint(0, 9) 276 | else: 277 | action= int(sess.run(test_action, feed_dict={x1: stack(update_transition,8,width_size,length_size), 278 | keep_prob:0.8, 279 | Weight_convol_1_target:weight_convol_1_target, 280 | Bias_convol_1_target:bias_convol_1_target, 281 | Weight_convol_2_target:weight_convol_2_target, 282 | Bias_convol_2_target:bias_convol_2_target, 283 | Weight_flat_target:weight_flat_target, 284 | Bias_flat_target:bias_flat_target, 285 | Weight_out_target:weight_out_target, 286 | Bias_out_target:bias_out_target})[0]) 287 | 288 | 289 | batch_sample = np.reshape(random.sample(range(0, len(experience_buffer)), batch_size), [-1, 1]) 290 | #print('=====the batch-sample====', batch_sample) 291 | 292 | experience_buffer = np.array(experience_buffer) 293 | mini_sample = np.reshape(experience_buffer[batch_sample], [batch_size, -1]) 294 | 295 | Input_1 = np.concatenate(mini_sample[:, 0], axis=0) 296 | Input_2 = np.concatenate(mini_sample[:, 1], axis=0) 297 | Input_3 = np.reshape(mini_sample[:, 2], [-1, 1]) 298 | Input_4 = np.reshape(mini_sample[:, 3], [-1, 1]) 299 | Input_5 = np.reshape(np.arange(batch_size), [-1, 1]) 300 | 301 | ## running the training step ## 302 | _,loss = sess.run([train_optimizer_MsPacman,q_loss],feed_dict={x1: 
Input_1,x2:Input_2,x3:Input_3,x4: Input_4, 303 | x5: Input_5,keep_prob:0.8, 304 | Weight_convol_1_target: weight_convol_1_target, 305 | Bias_convol_1_target: bias_convol_1_target, 306 | Weight_convol_2_target: weight_convol_2_target, 307 | Bias_convol_2_target: bias_convol_2_target, 308 | Weight_flat_target: weight_flat_target, 309 | Bias_flat_target: bias_flat_target, 310 | Weight_out_target: weight_out_target, 311 | Bias_out_target: bias_out_target}) 312 | each_episode_loss+=loss 313 | if done is True: 314 | ### record score for agent and computer each eposide ### 315 | ### always set -1(reward) when episode is done ### 316 | total_train_loss.append(each_episode_loss/(i_training+1)) 317 | experience_buffer=experience_buffer.tolist() 318 | 319 | experience_buffer.append([stack(update_transition,4,width_size,length_size), 320 | stack(update_transition,8,width_size,length_size), 321 | -1,update_Action[4]]) 322 | experience_buffer=np.array(experience_buffer) 323 | total_time+=(time.time()-start_time) 324 | total_training_step+=i_training 325 | 326 | if (i_episode - 1) % 5 == 0: 327 | print('*************** the {} th step trainning' 328 | ' loss is {} ***************'.format(i_training + 1, loss)) 329 | print('*************** the {} th episode trainning' 330 | ' time is {} 分 ***************'.format(i_episode,(time.time()-start_time)/60)) 331 | 332 | print('************* the total steps trainning' 333 | ' until now is {} **************'.format(total_training_step+1)) 334 | print('************* the total trainning ' 335 | 'time is {} 分 **************'.format(total_time/60)) 336 | print('=========' 337 | '================================================' 338 | '{} th episode is finished ================' 339 | '=========================================='.format(i_episode)) 340 | break 341 | ### test the agent performance until now ### 342 | if (i_episode-1)%10 == 0: 343 | print('=========' 344 | '================================================' 345 | 'After {} th training episode, the agent testing is starting' 346 | '=========================================='.format(i_episode)) 347 | test_episode_number=5 348 | test_score=0 349 | test_step=100000 350 | for i_test_number in range(test_episode_number): 351 | test_observation_0=env.reset() 352 | test_observation_0 = tran_size(test_observation_0, width_size, length_size) 353 | #test_update_Score = [] 354 | test_update_Action = [] 355 | test_update_transition = [] 356 | test_episode_action = env.action_space.sample() 357 | for i_test_step in range(test_step): 358 | #env.render() 359 | test_observation_1, test_reward, test_done, test_info = env.step(test_episode_action) 360 | test_update_transition.append(tran_size(test_observation_1,width_size,length_size)) 361 | test_score+=test_reward 362 | if (i_test_step+1)%4==0: 363 | test_update_transition=test_update_transition[-4:] 364 | test_episode_action = int(sess.run(test_action, feed_dict={ 365 | x1: stack(test_update_transition, 4, width_size, length_size),keep_prob:0.8})) 366 | if test_done: 367 | print('+++++++++++test {} th episode is done score is' 368 | ' {} until now +++++++++'.format(i_test_number,test_score)) 369 | break 370 | print('++++++++++++the test average score is++++++++++++++ ' 371 | '{}'.format(test_score/test_episode_number)) 372 | 373 | total_test_score.append(test_score / test_episode_number) 374 | ### save the model each 10 turn training ### 375 | save_final_model(sess) 376 | print('//////saved the model///////',i_episode) 377 | 378 | print('agent_score',total_test_score) 379 | 
print('train_loss',total_train_loss) 380 | np.save('agent_score',total_test_score) 381 | np.save('train_loss',total_train_loss) 382 | 383 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_boxing_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_boxing_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_boxing_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_boxing_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_boxing_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_boxing_length.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_pong_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_pong_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_pong_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_pong_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part1_pong_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-100-random-episode/part1_pong_length.npy -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part_b_part1_MsPacman.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('MsPacman-v0') 6 | ### set the variable and empty set ### 7 | length = [] 8 | total_score=[] 9 | ### the 100 episodes ### 10 | eposide_number=100 11 | ### the x axis value ### 12 | x=np.arange(eposide_number) 13 | x=x+1 14 | 15 | for i_eposide in range(eposide_number): 16 | env.reset() 17 | Score=[] 18 | for i_step in range(100000): 19 | ### record the agent score ### 20 | 21 | action=np.random.randint(9) 22 | _, score, done,_ = env.step(action) 23 | 24 | Score.append(score) 25 | 26 | if done is True: 27 | total_score.append(np.sum(Score,axis=0)) 28 | 
length.append(i_step + 1) 29 | 30 | break 31 | std_length=np.std(length,axis=0) 32 | std_score=np.std(total_score,axis=0) 33 | 34 | 35 | 36 | 37 | print('the length...',length) 38 | print('the score of agent...',total_score) 39 | print(std_score) 40 | print(std_length) 41 | print(np.mean(total_score,axis=0)) 42 | print(np.mean(length,axis=0)) 43 | 44 | 45 | 46 | plt.plot(x,total_score) 47 | plt.xlabel('ith Num of episode') 48 | plt.ylabel('agent scores') 49 | plt.show() 50 | 51 | 52 | plt.plot(x,length) 53 | plt.xlabel('ith Num of episode') 54 | plt.ylabel('agent frames count') 55 | plt.show() 56 | 57 | np.save('part1_boxing_a_score',total_score) 58 | 59 | np.save('part1_boxing_length',length) 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part_b_part1_boxing.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('Boxing-v0') 6 | ### set the variable and empty set ### 7 | length = [] 8 | total_score_a=[] 9 | total_score_b=[] 10 | ### the 100 eposides ### 11 | eposide_number=100 12 | ### the x axis value ### 13 | x=np.arange(eposide_number) 14 | x=x+1 15 | 16 | for i_eposide in range(eposide_number): 17 | env.reset() 18 | ### recorc computer and agent scors ### 19 | Score_a=[] 20 | Score_b=[] 21 | for i_step in range(100000): 22 | ### random select action ### 23 | action=np.random.randint(18) 24 | _, score, done,_ = env.step(action) 25 | 26 | if score <0: 27 | Score_b.append(score) 28 | if score>0: 29 | Score_a.append(score) 30 | 31 | if done is True: 32 | total_score_a.append(np.sum(Score_a,axis=0)) 33 | total_score_b.append(-1*np.sum(Score_b,axis=0)) 34 | 35 | length.append(i_step + 1) 36 | break 37 | ### calculate the standard of score and frame counts ### 38 | std_length=np.std(length,axis=0) 39 | std_score=np.std(total_score_a,axis=0) 40 | 41 | print('the length...',length) 42 | print('the score of agent...',total_score_a) 43 | print('the score of computer..',total_score_b) 44 | print(std_score) 45 | print(std_length) 46 | print(np.mean(total_score_a,axis=0)) 47 | print(np.mean(length,axis=0)) 48 | 49 | 50 | ### plot the mean the score and length ### 51 | plt.plot(x,total_score_a) 52 | plt.xlabel('ith Num of episode') 53 | plt.ylabel('agent scores') 54 | plt.show() 55 | 56 | plt.plot(x,total_score_b) 57 | plt.xlabel('ith Num of episode') 58 | plt.ylabel('computer scores') 59 | plt.show() 60 | 61 | plt.plot(x,length) 62 | plt.xlabel('ith Num of episode') 63 | plt.ylabel('agent frames count') 64 | plt.show() 65 | 66 | 67 | np.save('part1_boxing_a_score',total_score_a) 68 | np.save('part1_boxing_b_score',total_score_b) 69 | np.save('part1_boxing_length',length) 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /Atari/cnn-100-random-episode/part_b_part1_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('Pong-v0') 6 | 7 | ### set the variable and empty set ### 8 | length = [] 9 | total_score_a=[] 10 | total_score_b=[] 11 | ### the 100 eposides ### 12 | eposide_number=100 13 | 14 | ### the x axis value ### 15 | x=np.arange(eposide_number) 16 | x=x+1 17 | 18 | for i_eposide in range(eposide_number): 19 | env.reset() 20 | ### record 
computer and agent scors ### 21 | Score_a=[] 22 | Score_b=[] 23 | for i_step in range(100000): 24 | #env.render() 25 | ### random select action ### 26 | action=np.random.randint(6) 27 | _, score, done,_ = env.step(action) 28 | 29 | if score <0: 30 | Score_b.append(score) 31 | if score>0: 32 | Score_a.append(score) 33 | 34 | if done is True: 35 | total_score_a.append(np.sum(Score_a,axis=0)) 36 | total_score_b.append(-1*np.sum(Score_b,axis=0)) 37 | length.append(i_step + 1) 38 | break 39 | 40 | 41 | ### calculate the standard of score and frame counts ### 42 | std_length=np.std(length,axis=0) 43 | std_score=np.std(total_score_a,axis=0) 44 | 45 | print('the length...',length) 46 | print('the score of agent...',total_score_a) 47 | print('the score of computer..',total_score_b) 48 | print(std_score) 49 | print(std_length) 50 | print(np.mean(total_score_a,axis=0)) 51 | print(np.mean(length,axis=0)) 52 | 53 | 54 | ### plot the mean the score and length ### 55 | plt.plot(x,total_score_a) 56 | plt.xlabel('ith Num of episode') 57 | plt.ylabel('agent scores') 58 | plt.show() 59 | 60 | plt.plot(x,total_score_b) 61 | plt.xlabel('ith Num of episode') 62 | plt.ylabel('computer scores') 63 | plt.show() 64 | 65 | plt.plot(x,length) 66 | plt.xlabel('ith Num of episode') 67 | plt.ylabel('agent frames count') 68 | plt.show() 69 | 70 | 71 | np.save('part1_pong_a_score',total_score_a) 72 | np.save('part1_pong_b_score',total_score_b) 73 | np.save('part1_pong_length',length) 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/Mspacman/part2_MsPacman_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/Mspacman/part2_MsPacman_length.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/41.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/42.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/43.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/44.png -------------------------------------------------------------------------------- 
/Atari/cnn-untrained-Q-network/boxing/check_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | a_score=np.load('part2_boxing_a_score.npy') 7 | b_score=np.load( 'part2_boxing_b_score.npy') 8 | length=np.load('part2_boxing_length.npy') 9 | differenct=np.load('part2_boxing_difference_score.npy') 10 | 11 | eposide_number = 100 12 | 13 | ### the x axis value ### 14 | x = np.arange(eposide_number) 15 | x = x + 1 16 | 17 | plt.plot(x,a_score ) 18 | plt.xlabel('ith Num of episode') 19 | plt.ylabel('agent scores') 20 | plt.show() 21 | 22 | plt.plot(x, b_score) 23 | plt.xlabel('ith Num of episode') 24 | plt.ylabel('computer scores') 25 | plt.show() 26 | 27 | plt.plot(x,differenct) 28 | plt.xlabel('ith Num of episode') 29 | plt.ylabel('difference between agent and computer') 30 | plt.show() 31 | 32 | 33 | plt.plot(x, length) 34 | plt.xlabel('ith Num of episode') 35 | plt.ylabel('agent frames count') 36 | plt.show() 37 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/cnn_for_boxing.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import scipy.misc 5 | 6 | import tensorflow as tf 7 | 8 | env = gym.make('Boxing-v3') 9 | 10 | 11 | ### the function to chnage image from RGB to greyscale ### 12 | def rgb2gray(rgb): 13 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 14 | 15 | ### transfer input to size(28*28*1) ### 16 | def tran_size(x): 17 | output = scipy.misc.imresize(x, size=[28, 28]) 18 | output = rgb2gray(output) 19 | return output 20 | 21 | ### stack four frames to size (28*28*4) ### 22 | def stack(x, index): 23 | output = np.reshape([x[index], x[index - 1], x[index - 2], x[index - 3]], [28,28,4]) 24 | return output 25 | 26 | ### define the weights in the CNN neural network### 27 | def weight_variable(shape): 28 | output = tf.truncated_normal(shape, stddev=0.1) 29 | return tf.Variable(output) 30 | 31 | 32 | def bias_variable(shape): 33 | output = tf.constant(0.1, shape=shape) 34 | return tf.Variable(output) 35 | 36 | ### the convolution function #### 37 | def conv2d(input, Weight,strides): 38 | 39 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 40 | 41 | ### the cnn function ### 42 | def cnn_pong(x): 43 | ### first layer ### 44 | weight_convol_1=weight_variable([6,6,4,16]) 45 | bias_convol_1=bias_variable([16]) 46 | 47 | output_convol_1=tf.nn.relu(conv2d(input=x,Weight=weight_convol_1,strides=[1,2,2,1])+bias_convol_1) 48 | ### second layer ### 49 | weight_convol_2=weight_variable([4,4,16,32]) 50 | bias_convol_2=bias_variable([32]) 51 | output_convol_2=tf.nn.relu(conv2d(input=output_convol_1,Weight=weight_convol_2,strides=[1,2,2,1])+bias_convol_2) 52 | ### flat layer ### 53 | weight_flat=weight_variable([7*7*32,256]) 54 | bias_flat=bias_variable([256]) 55 | output_reshape=tf.reshape(output_convol_2,[-1,7*7*32]) 56 | output_flat=tf.matmul(output_reshape,weight_flat)+bias_flat 57 | 58 | ### linear layer ### 59 | out_drop=tf.nn.dropout(output_flat,0.8) 60 | weight_out=weight_variable([256,action_space]) 61 | 62 | bias_out=bias_variable([action_space]) 63 | 64 | y=tf.matmul(out_drop,weight_out)+bias_out 65 | 66 | return y 67 | 68 | ### set hyperparameter and variables ### 69 | discount=0.99 70 | learn_rate=0.001 71 | eplison=0.1 72 | action_space=18 73 | 74 | 75 | 
keep_drop=tf.placeholder(tf.float32) 76 | x1=tf.placeholder(tf.float32,shape=[None,28,28,4]) 77 | x2=tf.placeholder(tf.float32,shape=[None,28,28,4]) 78 | x3=tf.placeholder(tf.float32,shape=[None,1]) 79 | x4=tf.placeholder(tf.int32,shape=[None,2]) 80 | 81 | ### calculate the Q value and the max next-state Q value ### 82 | prediction_now=cnn_pong(x1) 83 | prediction_next=cnn_pong(x2) 84 | 85 | ### test action when test agent performance ### 86 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 87 | 88 | 89 | ### calculate the loss and the training op ### 90 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 91 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 92 | 93 | delta=tf.add(x3+discount*tf.stop_gradient(Max_Q_value_next),(-1*Q_value)) 94 | q_loss=tf.reduce_sum(tf.square(delta)/2) 95 | 96 | train_optimizer=tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 97 | 98 | #### save the model #### 99 | saver=tf.train.Saver() 100 | 101 | with tf.device('/cpu:0'): 102 | with tf.Session() as sess: 103 | 104 | for i_run in range(1, 1 + 1): 105 | sess.run(tf.global_variables_initializer()) 106 | 107 | print('......start collecting data......') 108 | 109 | ### set the variable and empty set ### 110 | length = [] 111 | total_score_a = [] 112 | total_score_b = [] 113 | total_absolute=[] 114 | ### the 100 episodes ### 115 | eposide_number = 100 116 | 117 | ### the x axis value ### 118 | x = np.arange(eposide_number) 119 | x = x + 1 120 | ### the buffer experience replay ### 121 | initial_buffer = [] 122 | buffer_replay = [] 123 | for i_eposide in range(eposide_number): 124 | env.reset() 125 | ### record score for computer and agent ### 126 | Score_a = [] 127 | Score_b = [] 128 | 129 | for i_step in range(100000): 130 | 131 | if len(initial_buffer) < 4: 132 | ### collect data ### 133 | 134 | action = env.action_space.sample() 135 | obser_1, score, done, _ = env.step(action) 136 | obser_initial = tran_size(obser_1) 137 | if score < 0: 138 | Score_b.append(score) 139 | if score > 0: 140 | Score_a.append(score) 141 | #print(score, done) 142 | 143 | initial_buffer.append(obser_initial) 144 | 145 | else: 146 | 147 | state_i = stack(initial_buffer, i_step - 1) 148 | 149 | buffer_replay.append(state_i) 150 | ### select action by the epsilon-greedy policy ### 151 | if np.random.random() <= eplison: 152 | action_select=np.random.randint(action_space) 153 | ## Boxing has 18 discrete actions, so sample over the full action space ## 154 | else: 155 | 156 | action_select = sess.run(test_action, feed_dict={x1: [state_i]}) 157 | 158 | action_select = int(action_select) 159 | obser_1, score, done, _ = env.step(action_select) 160 | if score < 0: 161 | Score_b.append(score) 162 | if score > 0: 163 | Score_a.append(score) 164 | #print(score, done) 165 | 166 | obser_initial = tran_size(obser_1) 167 | 168 | initial_buffer.append(obser_initial) 169 | 170 | if done is True: 171 | ### record score for agent and computer each episode ### 172 | total_score_a.append(np.sum(Score_a, axis=0)) 173 | total_score_b.append(-1 * np.sum(Score_b, axis=0)) 174 | length.append(i_step + 1) 175 | total_absolute.append((np.sum(Score_a, axis=0)+np.sum(Score_b, axis=0))) 176 | 177 | break 178 | 179 | ### calculate the standard deviation of scores and frame counts ### 180 | std_length = np.std(length, axis=0) 181 | std_score = np.std(total_score_a, axis=0) 182 | std_score_abso=np.std(total_absolute,axis=0) 183 | 184 | print('the length...',length) 185 | print('the agent score...',total_score_a) 186 | print('the absolute value...',total_absolute) 187 | print('the std of agent score..',std_score) 188 | print('the 
std_score_abso..',std_score_abso) 189 | print('the std_length..',std_length) 190 | # print(std_length) 191 | print('the mean of total_score_a...',np.mean(total_score_a, axis=0)) 192 | print('the mean of length...',np.mean(length, axis=0)) 193 | print('the mean of total_absolute...',np.mean(total_absolute, axis=0)) 194 | 195 | ### plot the mean the score and length ### 196 | plt.plot(x, total_score_a) 197 | plt.xlabel('ith Num of episode') 198 | plt.ylabel('agent scores') 199 | plt.show() 200 | 201 | plt.plot(x, total_score_b) 202 | plt.xlabel('ith Num of episode') 203 | plt.ylabel('computer scores') 204 | plt.show() 205 | 206 | plt.plot(x, total_absolute) 207 | plt.xlabel('ith Num of episode') 208 | plt.ylabel('difference between agent and computer') 209 | plt.show() 210 | 211 | 212 | plt.plot(x, length) 213 | plt.xlabel('ith Num of episode') 214 | plt.ylabel('agent frames count') 215 | plt.show() 216 | 217 | np.save('part2_boxing_a_score', total_score_a) 218 | np.save('part2_boxing_b_score', total_score_b) 219 | np.save('part2_boxing_length', length) 220 | np.save('part2_boxing_difference_score',total_absolute) 221 | 222 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_difference_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_difference_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/boxing/part2_boxing_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/boxing/part2_boxing_length.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/21.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/22.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/22.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/23.png -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/check_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | a_score=np.load('part2_pong_a_score.npy') 7 | b_score=np.load( 'part2_pong_b_score.npy') 8 | length=np.load('part2_pong_length.npy') 9 | differenct=np.load('part2_pong_difference_score.npy') 10 | 11 | eposide_number = 100 12 | 13 | ### the x axis value ### 14 | x = np.arange(eposide_number) 15 | x = x + 1 16 | 17 | plt.plot(x,a_score ) 18 | plt.xlabel('ith Num of episode') 19 | plt.ylabel('agent scores') 20 | plt.show() 21 | 22 | plt.plot(x, b_score) 23 | plt.xlabel('ith Num of episode') 24 | plt.ylabel('computer scores') 25 | plt.show() 26 | 27 | plt.plot(x,differenct) 28 | plt.xlabel('ith Num of episode') 29 | plt.ylabel('difference between agent and computer') 30 | plt.show() 31 | 32 | 33 | plt.plot(x, length) 34 | plt.xlabel('ith Num of episode') 35 | plt.ylabel('agent frames count') 36 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/cnn_for_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import scipy.misc 5 | 6 | import tensorflow as tf 7 | 8 | env = gym.make('Pong-v0') 9 | 10 | 11 | ### the function to chnage image from RGB to greyscale ### 12 | def rgb2gray(rgb): 13 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 14 | 15 | ### transfer input to size(28*28*1) ### 16 | def tran_size(x): 17 | output = scipy.misc.imresize(x, size=[28, 28]) 18 | output = rgb2gray(output) 19 | return output 20 | 21 | ### stack four frames to size (28*28*4) ### 22 | def stack(x, index): 23 | output = np.reshape([x[index], x[index - 1], x[index - 2], x[index - 3]], [28,28,4]) 24 | return output 25 | 26 | ### define the weights in the CNN neural network### 27 | def weight_variable(shape): 28 | output = tf.truncated_normal(shape, stddev=0.1) 29 | return tf.Variable(output) 30 | 31 | 32 | def bias_variable(shape): 33 | output = tf.constant(0.1, shape=shape) 34 | return tf.Variable(output) 35 | 36 | ### the convolution function #### 37 | def conv2d(input, Weight,strides): 38 | 39 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 40 | 41 | ### the cnn function ### 42 | def cnn_pong(x): 43 | ### first layer ### 44 | weight_convol_1=weight_variable([6,6,4,16]) 45 | bias_convol_1=bias_variable([16]) 46 | 47 | output_convol_1=tf.nn.relu(conv2d(input=x,Weight=weight_convol_1,strides=[1,2,2,1])+bias_convol_1) 48 | ### second layer ### 49 | weight_convol_2=weight_variable([4,4,16,32]) 50 | bias_convol_2=bias_variable([32]) 51 | 
output_convol_2=tf.nn.relu(conv2d(input=output_convol_1,Weight=weight_convol_2,strides=[1,2,2,1])+bias_convol_2) 52 | ### flat layer ### 53 | weight_flat=weight_variable([7*7*32,256]) 54 | bias_flat=bias_variable([256]) 55 | output_reshape=tf.reshape(output_convol_2,[-1,7*7*32]) 56 | output_flat=tf.matmul(output_reshape,weight_flat)+bias_flat 57 | 58 | ### linear layer ### 59 | out_drop=tf.nn.dropout(output_flat,0.8) 60 | weight_out=weight_variable([256,action_space]) 61 | 62 | bias_out=bias_variable([action_space]) 63 | 64 | y=tf.matmul(out_drop,weight_out)+bias_out 65 | 66 | return y 67 | 68 | ### set hyperparameter and variables ### 69 | discount=0.99 70 | learn_rate=0.001 71 | eplison=0.1 72 | action_space=6 73 | 74 | 75 | keep_drop=tf.placeholder(tf.float32) 76 | x1=tf.placeholder(tf.float32,shape=[None,28,28,4]) 77 | x2=tf.placeholder(tf.float32,shape=[None,28,28,4]) 78 | x3=tf.placeholder(tf.float32,shape=[None,1]) 79 | x4=tf.placeholder(tf.int32,shape=[None,2]) 80 | 81 | ### caucalate the q avlue and max _next value 82 | prediction_now=cnn_pong(x1) 83 | prediction_next=cnn_pong(x2) 84 | 85 | ### test action when test agent performance ### 86 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 87 | 88 | 89 | ### calcaulate the loss and training ### 90 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 91 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 92 | 93 | delta=tf.add(x3+discount*tf.stop_gradient(Max_Q_value_next),(-1*Q_value)) 94 | q_loss=tf.reduce_sum(tf.square(delta)/2) 95 | 96 | train_optimizer=tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 97 | 98 | #### save the model #### 99 | saver=tf.train.Saver() 100 | 101 | with tf.device('/cpu:0'): 102 | with tf.Session() as sess: 103 | 104 | for i_run in range(1, 1 + 1): 105 | sess.run(tf.global_variables_initializer()) 106 | 107 | print('......start training data......') 108 | 109 | ### set the variable and empty set ### 110 | length = [] 111 | total_score_a = [] 112 | total_score_b = [] 113 | total_absolute=[] 114 | ### the 100 eposides ### 115 | eposide_number = 100 116 | 117 | ### the x axis value ### 118 | x = np.arange(eposide_number) 119 | x = x + 1 120 | ### the buffer experience replay ### 121 | initial_buffer = [] 122 | buffer_replay = [] 123 | for i_eposide in range(eposide_number): 124 | env.reset() 125 | ### record score for computer and agent ### 126 | Score_a = [] 127 | Score_b = [] 128 | 129 | for i_step in range(100000): 130 | 131 | if len(initial_buffer) < 4: 132 | ### collect data ### 133 | action = env.action_space.sample() 134 | obser_1, score, done, _ = env.step(action) 135 | obser_initial = tran_size(obser_1) 136 | if score < 0: 137 | Score_b.append(score) 138 | if score > 0: 139 | Score_a.append(score) 140 | #print(score, done) 141 | 142 | initial_buffer.append(obser_initial) 143 | 144 | else: 145 | 146 | state_i = stack(initial_buffer, i_step - 1) 147 | 148 | buffer_replay.append(state_i) 149 | ### select action by eplison policy ### 150 | if np.random.random() <= eplison: 151 | action_select=np.random.randint(6) 152 | #print('ewqrwqr......') 153 | else: 154 | 155 | action_select = sess.run(test_action, feed_dict={x1: [state_i]}) 156 | 157 | action_select = int(action_select) 158 | obser_1, score, done, _ = env.step(action_select) 159 | if score < 0: 160 | Score_b.append(score) 161 | if score > 0: 162 | Score_a.append(score) 163 | #print(score, done) 164 | 165 | obser_initial = tran_size(obser_1) 166 | 167 | initial_buffer.append(obser_initial) 168 | 169 | if done is True: 
170 | ### record score for agent and computer each eposide ### 171 | total_score_a.append(np.sum(Score_a, axis=0)) 172 | total_score_b.append(-1 * np.sum(Score_b, axis=0)) 173 | length.append(i_step + 1) 174 | total_absolute.append((np.sum(Score_a, axis=0)+np.sum(Score_b, axis=0))) 175 | 176 | break 177 | 178 | ### calculate the standard of score and frame counts ### 179 | std_length = np.std(length, axis=0) 180 | std_score = np.std(total_score_a, axis=0) 181 | std_score_abso=np.std(total_absolute,axis=0) 182 | 183 | print('the length...',length) 184 | print('the agent score...',total_score_a) 185 | print('the absolute value...',total_absolute) 186 | print('the std of agent score..',std_score) 187 | print('the std_score_abso..',std_score_abso) 188 | print('the std_length..',std_length) 189 | # print(std_length) 190 | print('the mean of total_score_a...',np.mean(total_score_a, axis=0)) 191 | print('the mean of length...',np.mean(length, axis=0)) 192 | print('the mean of total_absolute...',np.mean(total_absolute, axis=0)) 193 | 194 | ### plot the mean the score and length ### 195 | plt.plot(x, total_score_a) 196 | plt.xlabel('ith Num of episode') 197 | plt.ylabel('agent scores') 198 | plt.show() 199 | 200 | plt.plot(x, total_score_b) 201 | plt.xlabel('ith Num of episode') 202 | plt.ylabel('computer scores') 203 | plt.show() 204 | 205 | plt.plot(x, total_absolute) 206 | plt.xlabel('ith Num of episode') 207 | plt.ylabel('difference between agent and computer') 208 | plt.show() 209 | 210 | 211 | plt.plot(x, length) 212 | plt.xlabel('ith Num of episode') 213 | plt.ylabel('agent frames count') 214 | plt.show() 215 | 216 | 217 | np.save('part2_pong_a_score', total_score_a) 218 | np.save('part2_pong_b_score', total_score_b) 219 | np.save('part2_pong_length', length) 220 | np.save('part2_pong_difference_score',total_absolute) 221 | 222 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_a_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_b_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_difference_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_difference_score.npy -------------------------------------------------------------------------------- /Atari/cnn-untrained-Q-network/pong/part2_pong_length.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/cnn-untrained-Q-network/pong/part2_pong_length.npy -------------------------------------------------------------------------------- /Atari/pong/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/22.png -------------------------------------------------------------------------------- /Atari/pong/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/23.png -------------------------------------------------------------------------------- /Atari/pong/check_data.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | a_score=np.load('part2_pong_a_score.npy') 7 | b_score=np.load( 'part2_pong_b_score.npy') 8 | length=np.load('part2_pong_length.npy') 9 | differenct=np.load('part2_pong_difference_score.npy') 10 | 11 | eposide_number = 100 12 | 13 | ### the x axis value ### 14 | x = np.arange(eposide_number) 15 | x = x + 1 16 | 17 | plt.plot(x,a_score ) 18 | plt.xlabel('ith Num of episode') 19 | plt.ylabel('agent scores') 20 | plt.show() 21 | 22 | plt.plot(x, b_score) 23 | plt.xlabel('ith Num of episode') 24 | plt.ylabel('computer scores') 25 | plt.show() 26 | 27 | plt.plot(x,differenct) 28 | plt.xlabel('ith Num of episode') 29 | plt.ylabel('difference between agent and computer') 30 | plt.show() 31 | 32 | 33 | plt.plot(x, length) 34 | plt.xlabel('ith Num of episode') 35 | plt.ylabel('agent frames count') 36 | -------------------------------------------------------------------------------- /Atari/pong/cnn_for_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import scipy.misc 5 | 6 | import tensorflow as tf 7 | 8 | env = gym.make('Pong-v0') 9 | 10 | 11 | ### the function to chnage image from RGB to greyscale ### 12 | def rgb2gray(rgb): 13 | return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) 14 | 15 | ### transfer input to size(28*28*1) ### 16 | def tran_size(x): 17 | output = scipy.misc.imresize(x, size=[28, 28]) 18 | output = rgb2gray(output) 19 | return output 20 | 21 | ### stack four frames to size (28*28*4) ### 22 | def stack(x, index): 23 | output = np.reshape([x[index], x[index - 1], x[index - 2], x[index - 3]], [28,28,4]) 24 | return output 25 | 26 | ### define the weights in the CNN neural network### 27 | def weight_variable(shape): 28 | output = tf.truncated_normal(shape, stddev=0.1) 29 | return tf.Variable(output) 30 | 31 | 32 | def bias_variable(shape): 33 | output = tf.constant(0.1, shape=shape) 34 | return tf.Variable(output) 35 | 36 | ### the convolution function #### 37 | def conv2d(input, Weight,strides): 38 | 39 | return tf.nn.conv2d(input, Weight, strides, padding='SAME') 40 | 41 | ### the cnn function ### 42 | def cnn_pong(x): 43 | ### first layer ### 44 | weight_convol_1=weight_variable([6,6,4,16]) 45 | bias_convol_1=bias_variable([16]) 46 | 47 | output_convol_1=tf.nn.relu(conv2d(input=x,Weight=weight_convol_1,strides=[1,2,2,1])+bias_convol_1) 48 | ### second layer ### 49 | weight_convol_2=weight_variable([4,4,16,32]) 
50 | bias_convol_2=bias_variable([32]) 51 | output_convol_2=tf.nn.relu(conv2d(input=output_convol_1,Weight=weight_convol_2,strides=[1,2,2,1])+bias_convol_2) 52 | ### flat layer ### 53 | weight_flat=weight_variable([7*7*32,256]) 54 | bias_flat=bias_variable([256]) 55 | output_reshape=tf.reshape(output_convol_2,[-1,7*7*32]) 56 | output_flat=tf.matmul(output_reshape,weight_flat)+bias_flat 57 | 58 | ### linear layer ### 59 | out_drop=tf.nn.dropout(output_flat,0.8) 60 | weight_out=weight_variable([256,action_space]) 61 | 62 | bias_out=bias_variable([action_space]) 63 | 64 | y=tf.matmul(out_drop,weight_out)+bias_out 65 | 66 | return y 67 | 68 | ### set hyperparameter and variables ### 69 | discount=0.99 70 | learn_rate=0.001 71 | eplison=0.1 72 | action_space=6 73 | 74 | 75 | keep_drop=tf.placeholder(tf.float32) 76 | x1=tf.placeholder(tf.float32,shape=[None,28,28,4]) 77 | x2=tf.placeholder(tf.float32,shape=[None,28,28,4]) 78 | x3=tf.placeholder(tf.float32,shape=[None,1]) 79 | x4=tf.placeholder(tf.int32,shape=[None,2]) 80 | 81 | ### caucalate the q avlue and max _next value 82 | prediction_now=cnn_pong(x1) 83 | prediction_next=cnn_pong(x2) 84 | 85 | ### test action when test agent performance ### 86 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 87 | 88 | 89 | ### calcaulate the loss and training ### 90 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 91 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 92 | 93 | delta=tf.add(x3+discount*tf.stop_gradient(Max_Q_value_next),(-1*Q_value)) 94 | q_loss=tf.reduce_sum(tf.square(delta)/2) 95 | 96 | train_optimizer=tf.train.RMSPropOptimizer(learn_rate).minimize((q_loss)) 97 | 98 | #### save the model #### 99 | saver=tf.train.Saver() 100 | 101 | with tf.device('/cpu:0'): 102 | with tf.Session() as sess: 103 | 104 | for i_run in range(1, 1 + 1): 105 | sess.run(tf.global_variables_initializer()) 106 | 107 | print('......start training data......') 108 | 109 | ### set the variable and empty set ### 110 | length = [] 111 | total_score_a = [] 112 | total_score_b = [] 113 | total_absolute=[] 114 | ### the 100 eposides ### 115 | eposide_number = 100 116 | 117 | ### the x axis value ### 118 | x = np.arange(eposide_number) 119 | x = x + 1 120 | ### the buffer experience replay ### 121 | initial_buffer = [] 122 | buffer_replay = [] 123 | for i_eposide in range(eposide_number): 124 | env.reset() 125 | ### record score for computer and agent ### 126 | Score_a = [] 127 | Score_b = [] 128 | 129 | for i_step in range(100000): 130 | 131 | if len(initial_buffer) < 4: 132 | ### collect data ### 133 | action = env.action_space.sample() 134 | obser_1, score, done, _ = env.step(action) 135 | obser_initial = tran_size(obser_1) 136 | if score < 0: 137 | Score_b.append(score) 138 | if score > 0: 139 | Score_a.append(score) 140 | #print(score, done) 141 | 142 | initial_buffer.append(obser_initial) 143 | 144 | else: 145 | 146 | state_i = stack(initial_buffer, i_step - 1) 147 | 148 | buffer_replay.append(state_i) 149 | ### select action by eplison policy ### 150 | if np.random.random() <= eplison: 151 | action_select=np.random.randint(6) 152 | #print('ewqrwqr......') 153 | else: 154 | 155 | action_select = sess.run(test_action, feed_dict={x1: [state_i]}) 156 | 157 | action_select = int(action_select) 158 | obser_1, score, done, _ = env.step(action_select) 159 | if score < 0: 160 | Score_b.append(score) 161 | if score > 0: 162 | Score_a.append(score) 163 | #print(score, done) 164 | 165 | obser_initial = tran_size(obser_1) 166 | 167 | 
initial_buffer.append(obser_initial) 168 | 169 | if done is True: 170 | ### record score for agent and computer each eposide ### 171 | total_score_a.append(np.sum(Score_a, axis=0)) 172 | total_score_b.append(-1 * np.sum(Score_b, axis=0)) 173 | length.append(i_step + 1) 174 | total_absolute.append((np.sum(Score_a, axis=0)+np.sum(Score_b, axis=0))) 175 | 176 | break 177 | 178 | ### calculate the standard of score and frame counts ### 179 | std_length = np.std(length, axis=0) 180 | std_score = np.std(total_score_a, axis=0) 181 | std_score_abso=np.std(total_absolute,axis=0) 182 | 183 | print('the length...',length) 184 | print('the agent score...',total_score_a) 185 | print('the absolute value...',total_absolute) 186 | print('the std of agent score..',std_score) 187 | print('the std_score_abso..',std_score_abso) 188 | print('the std_length..',std_length) 189 | # print(std_length) 190 | print('the mean of total_score_a...',np.mean(total_score_a, axis=0)) 191 | print('the mean of length...',np.mean(length, axis=0)) 192 | print('the mean of total_absolute...',np.mean(total_absolute, axis=0)) 193 | 194 | ### plot the mean the score and length ### 195 | plt.plot(x, total_score_a) 196 | plt.xlabel('ith Num of episode') 197 | plt.ylabel('agent scores') 198 | plt.show() 199 | 200 | plt.plot(x, total_score_b) 201 | plt.xlabel('ith Num of episode') 202 | plt.ylabel('computer scores') 203 | plt.show() 204 | 205 | plt.plot(x, total_absolute) 206 | plt.xlabel('ith Num of episode') 207 | plt.ylabel('difference between agent and computer') 208 | plt.show() 209 | 210 | 211 | plt.plot(x, length) 212 | plt.xlabel('ith Num of episode') 213 | plt.ylabel('agent frames count') 214 | plt.show() 215 | 216 | 217 | np.save('part2_pong_a_score', total_score_a) 218 | np.save('part2_pong_b_score', total_score_b) 219 | np.save('part2_pong_length', length) 220 | np.save('part2_pong_difference_score',total_absolute) 221 | 222 | -------------------------------------------------------------------------------- /Atari/pong/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Atari/pong/part2_pong_a_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_a_score.npy -------------------------------------------------------------------------------- /Atari/pong/part2_pong_b_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_b_score.npy -------------------------------------------------------------------------------- /Atari/pong/part2_pong_difference_score.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_difference_score.npy -------------------------------------------------------------------------------- /Atari/pong/part2_pong_length.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/Atari/pong/part2_pong_length.npy 
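Note: the Atari scripts above preprocess each frame with scipy.misc.imresize, which was deprecated in SciPy 1.0 and removed in SciPy 1.3. A minimal stand-in for the same greyscale-resize-stack pipeline, assuming Pillow and NumPy are available (the helper names preprocess_frame and stack_frames are illustrative and not part of this repository):

import numpy as np
from PIL import Image

def preprocess_frame(frame, size=(28, 28)):
    # RGB Atari frame (e.g. 210x160x3 uint8) -> greyscale image resized to `size`
    gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
    return np.asarray(Image.fromarray(gray).resize(size))

def stack_frames(frames):
    # stack the four most recent preprocessed frames into a (28, 28, 4) network input
    return np.stack(frames[-4:], axis=-1)

Applying stack_frames to a running list of preprocessed observations reproduces the (28, 28, 4) inputs the cnn_pong network above expects.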
-------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_1000_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=1000 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | with tf.Session() as sess: 69 | ## reload the weights ### 70 | saver.restore(sess, './part5_neural_1000_300/') 71 | eposide_length = [] 72 | expected_value = [] 73 | all_eposide_length = np.zeros((1, 10)) 74 | all_reward = np.zeros((1, 100)) 75 | #### run 10 times test eposide ### 76 | for i_episode in range(10): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | 87 | observation_curr, reward_curr, done, info = env.step(Action[0]) 88 | 89 | observation_next = [observation_curr] 90 | else: 91 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 92 | 93 | observation_curr, reward_curr, done, info = env.step(Action[0]) 94 | observation_next = [observation_curr] 95 | 96 | if done is True: 97 | 98 | eposide_length.append(t + 1) 99 | reward = -1 100 | reward_return = reward * (discount ** (t)) 101 | expected_value.append(reward_return) 102 | 103 | break 104 | all_eposide_length[0, i_episode] = t + 1 105 | all_reward[0, i_episode] = reward_return 106 | 107 | all_eposide_length = np.mean(all_eposide_length, axis=0) 108 | all_reward = np.mean(all_reward, axis=0) 109 | 110 | 111 | 112 | print('the mean of episode length', 
np.mean(eposide_length)) 113 | print('the mean of reward ',np.mean(expected_value)) 114 | 115 | print('the standard deviation of episode length', np.std(eposide_length)) 116 | plt.plot(all_eposide_length) 117 | plt.xlabel('Num of episode') 118 | plt.ylabel('length of eposide') 119 | plt.show() 120 | 121 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_1000_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | 9 | env._max_episode_steps = 300 10 | 11 | 12 | 13 | #### set variable and parameters #### 14 | x1=tf.placeholder(tf.float32, shape=[None,4]) 15 | x2=tf.placeholder(tf.float32, shape=[None,4]) 16 | x3=tf.placeholder(tf.float32, shape=[None,1]) 17 | x4=tf.placeholder(tf.int32, shape=[None,2]) 18 | 19 | 20 | discount=0.99 21 | learn_rate=0.0001 22 | input_size=4 23 | hidden_size=1000 24 | output_size=2 25 | eplison=0.05 26 | max_eposide_length=300 27 | 28 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 29 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 30 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 31 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 32 | 33 | 34 | 35 | ### one hiddle layer neural network as function approximation ### 36 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 37 | prediction_No=tf.nn.relu(middle_now) 38 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 39 | 40 | 41 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 42 | prediction_Ne=tf.nn.relu(middle_next) 43 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 44 | 45 | ### the best action based on observation_now ### 46 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 47 | 48 | 49 | 50 | ### calcaulate the loss and training ### 51 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 52 | 53 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 54 | 55 | 56 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 57 | 58 | q_loss=tf.reduce_sum(tf.square(delta)/2) 59 | 60 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 61 | 62 | 63 | #### save the model #### 64 | saver=tf.train.Saver() 65 | 66 | 67 | 68 | with tf.device('/cpu:0'): 69 | 70 | #### set the set to save data #### 71 | run_size = 1 72 | all_episode_length = np.zeros((run_size, 2000)) 73 | all_total_reward = np.zeros((run_size, 2000)) 74 | all_test_episode_length = np.zeros((run_size, 100)) 75 | all_test_reward = np.zeros((run_size, 100)) 76 | all_train_loss = np.zeros((run_size, 100)) 77 | 78 | 79 | with tf.Session() as sess: 80 | for i_run in range(1,1+run_size): 81 | sess.run(tf.global_variables_initializer()) 82 | 83 | print('......start training data......') 84 | 85 | for i_eposide in range(1,2000+1): 86 | 87 | ### begin a new eposide ### 88 | observation_00 = env.reset() 89 | total_reward=0 90 | total_QQ_loss=0 91 | 92 | for i_step in range(max_eposide_length): 93 | 94 | ### greedy policy to select action ### 95 | if np.random.random() <= eplison: 96 | action_select_now=np.random.randint(2) 97 | 98 | else: 99 | ### use Q function to select action ### 100 | action_select_now=sess.run(test_action,feed_dict={x1:np.reshape(observation_00, [1, 4])}) 101 | action_select_now=int(action_select_now) 102 | 103 | 
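# (note added for clarity) these CartPole scripts ignore the environment's own reward:
# every non-terminal step gets reward 0 and termination gets reward -1, so the TD
# target x3 + discount * (1 + x3) * max_a Q(s', a) is the usual bootstrap target when
# x3 = 0 and collapses to -1 at terminal transitions, because the (1 + x3) factor
# zeroes the bootstrap term when x3 = -1.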
observation_11,_,done_0,info=env.step(action_select_now) 104 | 105 | if done_0 is False: 106 | reward=0 107 | else: 108 | reward=-1 109 | ### trainning step ### 110 | _,train_loss=sess.run([train_optimizer,q_loss], feed_dict={x1:np.reshape( observation_00,[1,4]), x2: np.reshape( observation_11,[1,4]), x3:np.reshape(reward,[1,1]),x4:np.reshape([0,action_select_now],[1,2])}) 111 | 112 | total_QQ_loss +=train_loss 113 | 114 | observation_00 = observation_11 115 | if (i_eposide-1)%20==0: 116 | 117 | if done_0 is True: 118 | reward=-1 119 | 120 | final_reward =reward* discount**(i_step) 121 | 122 | all_episode_length[i_run-1, i_eposide-1] = i_step + 1 123 | all_total_reward[i_run-1, i_eposide-1] = final_reward 124 | 125 | 126 | 127 | ### record average test performance ### 128 | test_size=10 129 | Small_test_eposide_length = np.zeros((1, test_size)) 130 | Small_test_reward = np.zeros((1, test_size)) 131 | 132 | for i_test_run in range(1,test_size+1): 133 | observation_test_0 = env.reset() 134 | 135 | for i_test_length in range(max_eposide_length): 136 | action_test_now = test_action.eval(feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 137 | action_test_now=int(action_test_now) 138 | observation_test_1, _, test_done, test_info = env.step(int(action_test_now)) 139 | 140 | observation_test_0=observation_test_1 141 | 142 | if test_done is False: 143 | reward_test = 0, 144 | else: 145 | reward_test = -1 146 | 147 | if test_done is True: 148 | Small_test_eposide_length[0,i_test_run-1]=i_test_length+1 149 | Small_test_reward[0,i_test_run-1]=reward_test*(discount**(i_test_length)) 150 | #print(i_test_length+1) 151 | 152 | break 153 | 154 | 155 | small_mean_test_length=np.mean(np.mean(Small_test_eposide_length,axis=0),axis=0) 156 | small_mean_test_reward=np.mean(np.mean(Small_test_reward,axis=0),axis=0) 157 | print('ith_run', i_run-1, 'the ith eposide', i_eposide-1, 'the train_length_eposide', i_step + 1, 158 | 'the test average length',small_mean_test_length , '..loss..', 159 | train_loss) 160 | all_test_episode_length[i_run-1, int((i_eposide-1)/20)]=small_mean_test_length 161 | all_test_reward[i_run-1, int((i_eposide-1)/20)]=small_mean_test_reward 162 | all_train_loss[i_run-1, int((i_eposide-1)/20)] = total_QQ_loss/(i_step+1) 163 | 164 | if all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] == np.amax( 165 | all_test_episode_length): 166 | 167 | print('.....', all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)]) 168 | print(np.amax(all_test_episode_length)) 169 | 170 | if not os.path.exists('./part5_neural_1000_300/'): 171 | os.mkdir('./part5_neural_1000_300/') 172 | saver.save(sess, "./part5_neural_1000_300/") 173 | print('saved') 174 | 175 | break 176 | 177 | 178 | else: 179 | if done_0 is True: 180 | reward = -1 181 | 182 | final_reward = reward * discount ** (i_step) 183 | 184 | all_episode_length[i_run - 1, i_eposide-1] = i_step + 1 185 | all_total_reward[i_run - 1, i_eposide-1] = final_reward 186 | break 187 | 188 | 189 | 190 | 191 | ### save and plot performance during training and tes #### 192 | outfile1=all_total_reward 193 | outfile2=all_episode_length 194 | outfile3=all_train_loss 195 | outfile4=all_test_reward 196 | outfile5=all_test_episode_length 197 | 198 | 199 | np.save('part_5_train_reward_1000_300', outfile1) 200 | np.save('part5_train_eposide_length_1000_300',outfile2) 201 | 202 | np.save('part5_train_loss_1000_300', outfile3) 203 | np.save('part5_test_reward_1000_300', outfile4) 204 | np.save('part5_test_length_1000_300', outfile5) 205 | 206 | 207 | 208 
| 209 | 210 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_30_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=30 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | with tf.Session() as sess: 69 | ## reload the weights ### 70 | saver.restore(sess, './part5_neural_30_300/') 71 | eposide_length = [] 72 | expected_value = [] 73 | all_eposide_length = np.zeros((1, 10)) 74 | all_reward = np.zeros((1, 100)) 75 | ### run 10 times test eposide ### 76 | for i_episode in range(10): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | 87 | 88 | observation_curr, reward_curr, done, info = env.step(Action[0]) 89 | 90 | observation_next = [observation_curr] 91 | else: 92 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 93 | 94 | observation_curr, reward_curr, done, info = env.step(Action[0]) 95 | observation_next = [observation_curr] 96 | 97 | if done is True: 98 | 99 | eposide_length.append(t + 1) 100 | reward = -1 101 | reward_return = reward * (discount ** (t)) 102 | expected_value.append(reward_return) 103 | 104 | break 105 | all_eposide_length[0, i_episode] = t + 1 106 | all_reward[0, i_episode] = reward_return 107 | 108 | all_eposide_length = np.mean(all_eposide_length, axis=0) 109 | all_reward = np.mean(all_reward, axis=0) 110 | 111 | 112 | 113 | print('the mean of episode length', 
np.mean(eposide_length)) 114 | print('the mean of reward ',np.mean(expected_value)) 115 | 116 | print('the standard deviation of episode length', np.std(eposide_length)) 117 | plt.plot(all_eposide_length) 118 | plt.xlabel('Num of episode') 119 | plt.ylabel('length of eposide') 120 | plt.show() 121 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/cartpole_5_neural_30_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=30 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | 69 | #### set the set to save data #### 70 | run_size = 1 71 | all_episode_length = np.zeros((run_size, 2000)) 72 | all_total_reward = np.zeros((run_size, 2000)) 73 | all_test_episode_length = np.zeros((run_size, 100)) 74 | all_test_reward = np.zeros((run_size, 100)) 75 | all_train_loss = np.zeros((run_size, 100)) 76 | 77 | 78 | with tf.Session() as sess: 79 | for i_run in range(1,1+run_size): 80 | sess.run(tf.global_variables_initializer()) 81 | 82 | print('......start training data......') 83 | 84 | for i_eposide in range(1,2000+1): 85 | 86 | ### begin a new eposide ### 87 | observation_00 = env.reset() 88 | total_reward=0 89 | total_QQ_loss=0 90 | 91 | for i_step in range(max_eposide_length): 92 | 93 | 94 | ### greedy policy to select action ### 95 | if np.random.random() <= eplison: 96 | action_select_now=np.random.randint(2) 97 | 98 | else: 99 | ### use Q function to select action ### 100 | action_select_now=sess.run(test_action,feed_dict={x1:np.reshape(observation_00, [1, 4])}) 101 | action_select_now=int(action_select_now) 102 | 103 | 
observation_11,_,done_0,info=env.step(action_select_now) 104 | 105 | if done_0 is False: 106 | reward=0 107 | else: 108 | reward=-1 109 | ### training step ### 110 | _,train_loss=sess.run([train_optimizer,q_loss], feed_dict={x1:np.reshape( observation_00,[1,4]), x2: np.reshape( observation_11,[1,4]), x3:np.reshape(reward,[1,1]),x4:np.reshape([0,action_select_now],[1,2])}) 111 | 112 | total_QQ_loss +=train_loss 113 | 114 | observation_00 = observation_11 115 | if (i_eposide-1)%20==0: 116 | 117 | if done_0 is True: 118 | reward=-1 119 | 120 | final_reward =reward* discount**(i_step) 121 | 122 | all_episode_length[i_run-1, i_eposide-1] = i_step + 1 123 | all_total_reward[i_run-1, i_eposide-1] = final_reward 124 | 125 | 126 | 127 | ### record average test performance ### 128 | test_size=10 129 | Small_test_eposide_length = np.zeros((1, test_size)) 130 | Small_test_reward = np.zeros((1, test_size)) 131 | 132 | for i_test_run in range(1,test_size+1): 133 | observation_test_0 = env.reset() 134 | 135 | 136 | for i_test_length in range(max_eposide_length): 137 | #env.render() 138 | action_test_now = test_action.eval(feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 139 | action_test_now=int(action_test_now) 140 | observation_test_1, _, test_done, test_info = env.step(int(action_test_now)) 141 | 142 | observation_test_0=observation_test_1 143 | 144 | if test_done is False: 145 | reward_test = 0, 146 | else: 147 | reward_test = -1 148 | 149 | if test_done is True: 150 | Small_test_eposide_length[0,i_test_run-1]=i_test_length+1 151 | Small_test_reward[0,i_test_run-1]=reward_test*(discount**(i_test_length)) 152 | #print(i_test_length+1) 153 | 154 | break 155 | 156 | 157 | small_mean_test_length=np.mean(np.mean(Small_test_eposide_length,axis=0),axis=0) 158 | small_mean_test_reward=np.mean(np.mean(Small_test_reward,axis=0),axis=0) 159 | print('ith_run', i_run-1, 'the ith eposide', i_eposide-1, 160 | 'the test average length', small_mean_test_length , '..loss..', 161 | train_loss) 162 | all_test_episode_length[i_run-1, int((i_eposide-1)/20)]=small_mean_test_length 163 | #print((i_eposide-1)/20) 164 | #print(int((i_eposide-1)/20)) 165 | all_test_reward[i_run-1, int((i_eposide-1)/20)]=small_mean_test_reward 166 | all_train_loss[i_run-1, int((i_eposide-1)/20)] = total_QQ_loss/(i_step+1) 167 | 168 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 169 | all_test_episode_length): 170 | 171 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 172 | print(np.amax(all_test_episode_length)) 173 | 174 | 175 | 176 | if not os.path.exists('./part5_neural_30_300/'): 177 | os.mkdir('./part5_neural_30_300/') 178 | saver.save(sess, "./part5_neural_30_300/") 179 | print('saved') 180 | 181 | 182 | break 183 | else: 184 | if done_0 is True: 185 | reward = -1 186 | 187 | final_reward = reward * discount ** (i_step) 188 | 189 | all_episode_length[i_run - 1, i_eposide-1] = i_step + 1 190 | all_total_reward[i_run - 1, i_eposide-1] = final_reward 191 | break 192 | 193 | 194 | 195 | 196 | ### save and plot performance during training and tes #### 197 | outfile1=all_total_reward 198 | outfile2=all_episode_length 199 | outfile3=all_train_loss 200 | outfile4=all_test_reward 201 | outfile5=all_test_episode_length 202 | 203 | 204 | np.save('part_5_train_reward_30_300', outfile1) 205 | np.save('part5_train_eposide_length_30_300',outfile2) 206 | 207 | np.save('part5_train_loss_30_300', outfile3) 208 | np.save('part5_test_reward_30_300', outfile4) 209 | 
np.save('part5_test_length_30_300', outfile5) 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /CartPole/different-neural-size-Q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/double-q-learning/cartpole_8_load.py: -------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | import matplotlib.pyplot as plt 7 | 8 | env = gym.make('CartPole-v0') 9 | env._max_episode_steps = 300 10 | 11 | print("......Loading train_data......") 12 | 13 | 14 | train_data=np.load('train_data_2.npy') 15 | 16 | #### set variable and parameters #### 17 | 18 | x1=tf.placeholder(tf.float32, shape=[None,4]) 19 | x2=tf.placeholder(tf.float32, shape=[None,4]) 20 | x3=tf.placeholder(tf.float32, shape=[None,2]) 21 | x4=tf.placeholder(tf.float32, shape=[None]) 22 | x5=tf.placeholder(tf.float32,shape=[None]) 23 | 24 | 25 | 26 | batch_size=128 27 | discount=0.99 28 | learn_rate=0.0001 29 | input_size=4 30 | hidden_size=100 31 | output_size=2 32 | max_eposide_length=300 33 | eplison=0.05 34 | 35 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 36 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 37 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 38 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 39 | 40 | Weight_double_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 41 | Weight_double_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 42 | Bias_double_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 43 | Bias_double_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 44 | 45 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 46 | prediction_No=tf.nn.relu(middle_now) 47 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 48 | 49 | 50 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 51 | prediction_Ne=tf.nn.relu(middle_next) 52 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 53 | 54 | 55 | middle_now_double=tf.matmul(x1,Weight_double_1)+Bias_double_1 56 | prediction_No_double=tf.nn.relu(middle_now_double) 57 | prediction_now_double=tf.matmul(prediction_No_double,Weight_double_2)+Bias_double_2 58 | 59 | middle_next_double=tf.matmul(x2,Weight_double_1)+Bias_double_1 60 | prediction_Ne_double=tf.nn.relu(middle_next_double) 61 | prediction_next_double=tf.matmul(prediction_Ne_double,Weight_double_2)+Bias_double_2 62 | # 63 | 64 | 65 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 66 | test_action_double=tf.cast(tf.argmax(prediction_now_double,1),tf.int32) 67 | 68 | True_action=tf.cast(x3,tf.int32) 69 | 70 | Q_value=tf.gather_nd(params=prediction_now,indices=True_action) 71 | Q_value_double=tf.gather_nd(params=prediction_now_double,indices=True_action) 72 | 73 | 74 | 75 | next_action_b=tf.cast(tf.argmax(prediction_next_double,1),tf.int32) 76 | next_action_b=tf.reshape(next_action_b,[-1,1]) 77 | action_repeat_b=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 78 | action_b_next=tf.concat([action_repeat_b,next_action_b],1) 79 | 80 | next_action_a=tf.cast(tf.argmax(prediction_next,1),tf.int32) 81 | next_action_a=tf.reshape(next_action_a,[-1,1]) 82 | action_repeat_a=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 83 | action_a_next=tf.concat([action_repeat_a,next_action_a],1) 84 | 85 | 86 | 
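# (note added for clarity) this is the double Q-learning estimator: the greedy next
# action is selected with one network and evaluated with the other, i.e. network A's
# argmax over prediction_next is scored by prediction_next_double below, and vice
# versa for network B.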
Max_Q_value_next=tf.gather_nd(params=prediction_next_double,indices=action_a_next) 87 | Max_Q_value_next_double=tf.gather_nd(params=prediction_next,indices=action_b_next) 88 | 89 | delta_a=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next),(-1*Q_value)) 90 | delta_b=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next_double),(-1*Q_value_double)) 91 | 92 | 93 | q_loss_a=tf.reduce_mean((tf.square(delta_a))/2) 94 | 95 | q_loss_b=tf.reduce_mean((tf.square(delta_b))/2) 96 | 97 | train_optimizer_a=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_a) 98 | 99 | train_optimizer_b=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_b) 100 | 101 | 102 | 103 | saver = tf.train.Saver() 104 | 105 | 106 | with tf.device('/cpu:0'): 107 | with tf.Session() as sess: 108 | ## reload the weights ### 109 | saver.restore(sess, './part8_double_dqn/') 110 | eposide_length = [] 111 | expected_value = [] 112 | all_eposide_length = np.zeros((1, 10)) 113 | all_reward = np.zeros((1, 100)) 114 | 115 | for i_episode in range(10): 116 | 117 | observation_init = env.reset() 118 | observation_init = [observation_init] 119 | 120 | for t in range(300): 121 | 122 | if t == 0: 123 | 124 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 125 | 126 | 127 | observation_curr, reward_curr, done, info = env.step(Action[0]) 128 | 129 | observation_next = [observation_curr] 130 | else: 131 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 132 | 133 | observation_curr, reward_curr, done, info = env.step(Action[0]) 134 | observation_next = [observation_curr] 135 | 136 | if done is True: 137 | 138 | eposide_length.append(t + 1) 139 | reward = -1 140 | reward_return = reward * (discount ** (t)) 141 | expected_value.append(reward_return) 142 | 143 | break 144 | all_eposide_length[0, i_episode] = t + 1 145 | all_reward[0, i_episode] = reward_return 146 | 147 | all_eposide_length = np.mean(all_eposide_length, axis=0) 148 | all_reward = np.mean(all_reward, axis=0) 149 | 150 | 151 | 152 | print('the mean of episode length', np.mean(eposide_length)) 153 | print('the mean of reward ',np.mean(expected_value)) 154 | 155 | print('the standard deviation of episode length', np.std(eposide_length)) 156 | plt.plot(all_eposide_length) 157 | plt.xlabel('Num of episode') 158 | plt.ylabel('length of eposide') 159 | plt.show() 160 | 161 | 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /CartPole/double-q-learning/cartpole_8_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | print("......Loading train_data......") 11 | 12 | 13 | train_data=np.load('train_data_2.npy') 14 | 15 | #### set variable and parameters #### 16 | 17 | x1=tf.placeholder(tf.float32, shape=[None,4]) 18 | x2=tf.placeholder(tf.float32, shape=[None,4]) 19 | x3=tf.placeholder(tf.float32, shape=[None,2]) 20 | x4=tf.placeholder(tf.float32, shape=[None]) 21 | x5=tf.placeholder(tf.float32,shape=[None]) 22 | 23 | 24 | batch_size=128 25 | discount=0.99 26 | learn_rate=0.0001 27 | input_size=4 28 | hidden_size=100 29 | output_size=2 30 | max_eposide_length=300 31 | eplison=0.05 32 | 33 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 34 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 
35 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 36 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 37 | 38 | ### the second neural network ### 39 | Weight_double_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 40 | Weight_double_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 41 | Bias_double_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 42 | Bias_double_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 43 | 44 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 45 | prediction_No=tf.nn.relu(middle_now) 46 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 47 | 48 | 49 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 50 | prediction_Ne=tf.nn.relu(middle_next) 51 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 52 | 53 | 54 | middle_now_double=tf.matmul(x1,Weight_double_1)+Bias_double_1 55 | prediction_No_double=tf.nn.relu(middle_now_double) 56 | prediction_now_double=tf.matmul(prediction_No_double,Weight_double_2)+Bias_double_2 57 | 58 | middle_next_double=tf.matmul(x2,Weight_double_1)+Bias_double_1 59 | prediction_Ne_double=tf.nn.relu(middle_next_double) 60 | prediction_next_double=tf.matmul(prediction_Ne_double,Weight_double_2)+Bias_double_2 61 | 62 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 63 | test_action_double=tf.cast(tf.argmax(prediction_now_double,1),tf.int32) 64 | 65 | True_action=tf.cast(x3,tf.int32) 66 | 67 | Q_value=tf.gather_nd(params=prediction_now,indices=True_action) 68 | Q_value_double=tf.gather_nd(params=prediction_now_double,indices=True_action) 69 | 70 | ### calculate the target by the actual action which calculate by current network ### 71 | next_action_b=tf.cast(tf.argmax(prediction_next_double,1),tf.int32) 72 | next_action_b=tf.reshape(next_action_b,[-1,1]) 73 | action_repeat_b=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 74 | action_b_next=tf.concat([action_repeat_b,next_action_b],1) 75 | 76 | next_action_a=tf.cast(tf.argmax(prediction_next,1),tf.int32) 77 | next_action_a=tf.reshape(next_action_a,[-1,1]) 78 | action_repeat_a=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1])# 79 | action_a_next=tf.concat([action_repeat_a,next_action_a],1) 80 | 81 | 82 | Max_Q_value_next=tf.gather_nd(params=prediction_next_double,indices=action_a_next) 83 | Max_Q_value_next_double=tf.gather_nd(params=prediction_next,indices=action_b_next) 84 | 85 | 86 | ### calculate the loss by update network a or b ### 87 | delta_a=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next),(-1*Q_value)) 88 | delta_b=tf.add(x4+discount*tf.stop_gradient((1+x4)*Max_Q_value_next_double),(-1*Q_value_double)) 89 | 90 | 91 | q_loss_a=tf.reduce_mean((tf.square(delta_a))/2) 92 | q_loss_b=tf.reduce_mean((tf.square(delta_b))/2) 93 | 94 | train_optimizer_a=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_a) 95 | train_optimizer_b=tf.train.AdamOptimizer(learn_rate).minimize(q_loss_b) 96 | 97 | 98 | saver = tf.train.Saver() 99 | 100 | with tf.device('/cpu:0'): 101 | 102 | eposide_size = 2000 103 | run_size = 1 104 | all_episode_length = np.zeros((run_size, int(eposide_size))) 105 | all_total_reward = np.zeros((run_size, int(eposide_size))) 106 | all_test_episode_length = np.zeros((run_size, int(eposide_size))) 107 | all_test_reward = np.zeros((run_size, int(eposide_size / 20))) 108 | all_train_loss = np.zeros((run_size, int(eposide_size / 20))) 109 | 110 | length_of_train = len(train_data) 111 | for i_run in range(1, run_size + 1): 112 | ### build the experience replay ### 113 | 114 | buffer_size = 1024 115 | 
mini_batch_size = 64 116 | 117 | length_of_train=len(train_data) 118 | 119 | buffer_sample=random.sample(range(0, length_of_train), buffer_size) 120 | buffer_replay=train_data[buffer_sample] 121 | 122 | buffer_observation_now = [] 123 | buffer_observation_next=[] 124 | buffer_action=[] 125 | buffer_reward=[] 126 | 127 | for i_sele in range(buffer_size): 128 | buffer_observation_now.append( buffer_replay[i_sele][0]) 129 | buffer_observation_next.append( buffer_replay[i_sele][1]) 130 | buffer_reward.append( buffer_replay[i_sele][2]) 131 | buffer_action.append( buffer_replay[i_sele][3]) 132 | 133 | 134 | with tf.Session() as sess: 135 | 136 | sess.run(tf.global_variables_initializer()) 137 | 138 | for i_eposide in range(1,1+eposide_size): 139 | 140 | observation_0 = env.reset() 141 | 142 | total_QQ_loss = 0 143 | 144 | for i_step in range(max_eposide_length): 145 | 146 | if np.random.random() <= eplison: 147 | action_select_now = np.random.randint(2) 148 | else: 149 | Q = sess.run(test_action, feed_dict={x1: np.reshape(observation_0, [1, 4])}) 150 | action_select_now=int(Q) 151 | 152 | observation_1, _, done_0, _ = env.step(action_select_now) 153 | 154 | if done_0: 155 | reward = -1 156 | else: 157 | reward = 0 158 | 159 | # ##add new data to replay memory 160 | buffer_observation_now = np.append(buffer_observation_now, np.reshape(observation_0, [1, 4]), axis=0) 161 | buffer_observation_next = np.append(buffer_observation_next, np.reshape(observation_1, [1, 4]), axis=0) 162 | buffer_action = np.append(buffer_action, [action_select_now], axis=0) 163 | buffer_reward = np.append(buffer_reward, [reward], axis=0) 164 | 165 | ### update the first neural network ### 166 | if np.random.randint(2) == 0: 167 | select_order = np.arange(mini_batch_size) 168 | 169 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 170 | 171 | 172 | 173 | _, loss_train = sess.run([train_optimizer_a, q_loss_a], feed_dict={x1: buffer_observation_now[this_batch, :], 174 | x2: buffer_observation_next[this_batch, :], 175 | x3: np.concatenate((np.reshape( 176 | np.arange(mini_batch_size), 177 | [mini_batch_size, 1]), np.reshape( 178 | buffer_action[this_batch], 179 | [mini_batch_size, 1])), axis=1) 180 | , x4: buffer_reward[this_batch], 181 | x5:select_order}) 182 | else: 183 | ### update the second neural network ### 184 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 185 | select_order = np.arange(mini_batch_size) 186 | 187 | _, loss_train = sess.run([train_optimizer_b, q_loss_b], 188 | feed_dict={x1: buffer_observation_now[this_batch, :], 189 | x2: buffer_observation_next[this_batch, :], 190 | x3: np.concatenate((np.reshape( 191 | np.arange(mini_batch_size), 192 | [mini_batch_size, 1]), np.reshape( 193 | buffer_action[this_batch], 194 | [mini_batch_size, 1])), axis=1) 195 | , x4: buffer_reward[this_batch], 196 | x5:select_order}) 197 | 198 | total_QQ_loss +=loss_train 199 | 200 | observation_0 = observation_1 201 | 202 | if (i_eposide - 1) % 20 == 0: 203 | 204 | if done_0 is True: 205 | if i_step + 1 == 300: 206 | report_reward = 0 207 | else: 208 | report_reward = -1 * discount ** (i_step) 209 | 210 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 211 | all_total_reward[i_run - 1, i_eposide - 1] = report_reward 212 | 213 | ### record average test performance ### 214 | test_size = 10 215 | Small_test_eposide_length = np.zeros((1, test_size)) 216 | Small_test_reward = np.zeros((1, test_size)) 217 | 218 | for i_test_run in range(1, test_size + 1): 219 | 
observation_test_0 = env.reset() 220 | 221 | for i_test_length in range(max_eposide_length): 222 | action_test_now = test_action.eval( 223 | feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 224 | action_test_now = int(action_test_now) 225 | observation_test_1, _, test_done, test_info = env.step(action_test_now) 226 | 227 | observation_test_0 = observation_test_1 228 | 229 | if test_done is True: 230 | if i_test_length+1==300: 231 | reward_test=0 232 | else: 233 | reward_test=-1 234 | Small_test_eposide_length[0, i_test_run - 1] = i_test_length + 1 235 | Small_test_reward[0, i_test_run - 1] = reward_test * ( 236 | discount ** (i_test_length)) 237 | 238 | 239 | break 240 | 241 | small_mean_test_length = np.mean(np.mean(Small_test_eposide_length, axis=0), axis=0) 242 | small_mean_test_reward = np.mean(np.mean(Small_test_reward, axis=0), axis=0) 243 | print('the ith running',i_run,'the ith eposide', i_eposide - 1, 'the test_average_length', 244 | small_mean_test_length, 245 | 'the total_test_length ', Small_test_eposide_length, '..loss..', 246 | total_QQ_loss / (i_step + 1)) 247 | all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_length 248 | 249 | all_test_reward[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_reward 250 | all_train_loss[i_run - 1, int((i_eposide - 1) / 20)] = total_QQ_loss / (i_step + 1) 251 | 252 | 253 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 254 | all_test_episode_length): 255 | 256 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 257 | print(np.amax(all_test_episode_length)) 258 | 259 | 260 | 261 | if not os.path.exists('./part8_double_dqn/'): 262 | os.mkdir('./part8_double_dqn/') 263 | saver.save(sess, "./part8_double_dqn/") 264 | print('saved') 265 | 266 | break 267 | else: 268 | if done_0 is True: 269 | reward = -1 270 | 271 | final_reward = reward * discount ** (i_step) 272 | 273 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 274 | all_total_reward[i_run - 1, i_eposide - 1] = final_reward 275 | 276 | break 277 | 278 | outfile1=all_total_reward 279 | outfile2=all_episode_length 280 | outfile3=all_train_loss 281 | outfile4=all_test_reward 282 | outfile5=all_test_episode_length 283 | 284 | np.save('reward_data_train_part8', outfile1) 285 | np.save('length_data_train_part8',outfile2) 286 | 287 | np.save('loss_data_train_part8', outfile3) 288 | np.save('length_data_test_part8', outfile4) 289 | np.save('reward_data_test_part8', outfile5) 290 | 291 | 292 | mean_episode_len = np.mean(all_episode_length, axis=0) 293 | mean_total_reward = np.mean(all_total_reward, axis=0) 294 | mean_loss_train=np.mean(all_train_loss,axis=0) 295 | mean_test_eposide_length=np.mean(all_test_episode_length,axis=0) 296 | mean_test_reward=np.mean(all_test_reward,axis=0) 297 | 298 | std_episode_len = np.std(all_episode_length, axis=0) 299 | std_total_reward = np.std(all_total_reward, axis=0) 300 | 301 | -------------------------------------------------------------------------------- /CartPole/double-q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/experience_replay/cartpole_6_buffer_replay_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import matplotlib.pyplot as plt 5 | 6 | env = gym.make('CartPole-v0') 7 | 
env._max_episode_steps = 300 8 | 9 | print("......Loading train_data......") 10 | 11 | 12 | train_data=np.load('train_data_2.npy') 13 | 14 | #### set variable and parameters #### 15 | 16 | x1=tf.placeholder(tf.float32, shape=[None,4]) 17 | x2=tf.placeholder(tf.float32, shape=[None,4]) 18 | x3=tf.placeholder(tf.float32, shape=[None,2]) 19 | x4=tf.placeholder(tf.float32, shape=[None]) 20 | 21 | 22 | batch_size=128 23 | discount=0.99 24 | learn_rate=0.0001 25 | input_size=4 26 | hidden_size=100 27 | output_size=2 28 | max_eposide_length=300 29 | eplison=0.05 30 | 31 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 32 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 33 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 34 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 35 | 36 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 37 | prediction_No=tf.nn.relu(middle_now) 38 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 39 | 40 | 41 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 42 | prediction_Ne=tf.nn.relu(middle_next) 43 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 44 | # 45 | 46 | True_action=tf.cast(x3,tf.int32) 47 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 48 | Q_value = tf.gather_nd(prediction_now, True_action) 49 | 50 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 51 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 52 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 53 | 54 | 55 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 56 | 57 | saver = tf.train.Saver() 58 | 59 | 60 | with tf.device('/cpu:0'): 61 | with tf.Session() as sess: 62 | ## reload the weights ### 63 | saver.restore(sess, './part6_neural_buffer/') 64 | eposide_length = [] 65 | expected_value = [] 66 | all_eposide_length = np.zeros((1, 10)) 67 | all_reward = np.zeros((1, 100)) 68 | 69 | ### test the final model performance ### 70 | for i_episode in range(10): 71 | 72 | observation_init = env.reset() 73 | observation_init = [observation_init] 74 | 75 | for t in range(300): 76 | 77 | if t == 0: 78 | 79 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 80 | 81 | observation_curr, reward_curr, done, info = env.step(Action[0]) 82 | observation_next = [observation_curr] 83 | else: 84 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 85 | 86 | observation_curr, reward_curr, done, info = env.step(Action[0]) 87 | observation_next = [observation_curr] 88 | 89 | if done is True: 90 | 91 | eposide_length.append(t + 1) 92 | reward = -1 93 | reward_return = reward * (discount ** (t)) 94 | expected_value.append(reward_return) 95 | 96 | break 97 | all_eposide_length[0, i_episode] = t + 1 98 | all_reward[0, i_episode] = reward_return 99 | 100 | all_eposide_length = np.mean(all_eposide_length, axis=0) 101 | all_reward = np.mean(all_reward, axis=0) 102 | 103 | 104 | 105 | print('the mean of episode length', np.mean(eposide_length)) 106 | print('the mean of reward ',np.mean(expected_value)) 107 | 108 | print('the standard deviation of episode length', np.std(eposide_length)) 109 | plt.plot(all_eposide_length) 110 | plt.xlabel('Num of episode') 111 | plt.ylabel('length of eposide') 112 | plt.show() 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /CartPole/experience_replay/cartpole_6_buffer_replay_saved.py: 
-------------------------------------------------------------------------------- 1 | 2 | import gym 3 | import numpy as np 4 | import tensorflow as tf 5 | import random 6 | import os 7 | 8 | env = gym.make('CartPole-v0') 9 | env._max_episode_steps = 300 10 | 11 | print("......Loading train_data......") 12 | 13 | 14 | train_data=np.load('train_data_2.npy') 15 | 16 | #### set variable and parameters #### 17 | 18 | x1=tf.placeholder(tf.float32, shape=[None,4]) 19 | x2=tf.placeholder(tf.float32, shape=[None,4]) 20 | x3=tf.placeholder(tf.float32, shape=[None,2]) 21 | x4=tf.placeholder(tf.float32, shape=[None]) 22 | 23 | 24 | batch_size=128 25 | discount=0.99 26 | learn_rate=0.0001 27 | input_size=4 28 | hidden_size=100 29 | output_size=2 30 | max_eposide_length=300 31 | eplison=0.05 32 | 33 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 34 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 35 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 36 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 37 | 38 | ### one hiddle layer neural network as function approximation ### 39 | 40 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 41 | prediction_No=tf.nn.relu(middle_now) 42 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 43 | 44 | 45 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 46 | prediction_Ne=tf.nn.relu(middle_next) 47 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 48 | 49 | 50 | True_action=tf.cast(x3,tf.int32) 51 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 52 | 53 | 54 | ### calcaulate the loss and training ### 55 | Q_value = tf.gather_nd(prediction_now, True_action) 56 | 57 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 58 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 59 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 60 | 61 | 62 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 63 | 64 | #### save the model #### 65 | saver = tf.train.Saver() 66 | 67 | 68 | with tf.device('/cpu:0'): 69 | #### set the set to save data #### 70 | 71 | eposide_size = 200000 72 | run_size = 1 73 | all_episode_length = np.zeros((run_size, int(eposide_size))) 74 | all_total_reward = np.zeros((run_size, int(eposide_size))) 75 | all_test_episode_length = np.zeros((run_size, int(eposide_size/20))) 76 | all_test_reward = np.zeros((run_size, int(eposide_size / 20))) 77 | all_train_loss = np.zeros((run_size, int(eposide_size / 20))) 78 | 79 | length_of_train = len(train_data) 80 | for i_run in range(1, run_size + 1): 81 | 82 | ### set the experience buffer replay ### 83 | buffer_size = 1024 84 | mini_batch_size = 64 85 | length_of_train=len(train_data) 86 | buffer_sample=random.sample(range(0, length_of_train), buffer_size) 87 | buffer_replay=train_data[buffer_sample] 88 | buffer_observation_now = [] 89 | buffer_observation_next=[] 90 | buffer_action=[] 91 | buffer_reward=[] 92 | 93 | for i_sele in range(buffer_size): 94 | buffer_observation_now.append( buffer_replay[i_sele][0]) 95 | buffer_observation_next.append( buffer_replay[i_sele][1]) 96 | buffer_reward.append( buffer_replay[i_sele][2]) 97 | buffer_action.append( buffer_replay[i_sele][3]) 98 | 99 | 100 | 101 | with tf.Session() as sess: 102 | 103 | sess.run(tf.global_variables_initializer()) 104 | 105 | for i_eposide in range(1,1+eposide_size): 106 | 107 | observation_0 = env.reset() 108 | 109 | total_QQ_loss = 0 110 | 111 | for i_step in range(max_eposide_length): 112 | 113 | if np.random.random() <= 
eplison: 114 | action_train = np.random.randint(2) 115 | else: 116 | Q = sess.run(test_action, feed_dict={x1: np.reshape(observation_0, [1, 4])}) 117 | action_select_now=int(Q) 118 | 119 | # the retured parameters of the action 120 | observation_1, _, done_0, _ = env.step(action_select_now) 121 | 122 | # set reward 123 | if done_0: 124 | reward = -1 125 | else: 126 | reward = 0 127 | 128 | ### add new data to replay memory ### 129 | buffer_observation_now = np.append(buffer_observation_now, np.reshape(observation_0, [1, 4]), axis=0) 130 | buffer_observation_next = np.append(buffer_observation_next, np.reshape(observation_1, [1, 4]), axis=0) 131 | buffer_action = np.append(buffer_action, [action_select_now], axis=0) 132 | buffer_reward = np.append(buffer_reward, [reward], axis=0) 133 | 134 | 135 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 136 | 137 | _, loss_train = sess.run([train_optimizer, q_loss], feed_dict={x1: buffer_observation_now[this_batch, :], 138 | x2: buffer_observation_next[this_batch, :], 139 | x3: np.concatenate((np.reshape( 140 | np.arange(mini_batch_size), 141 | [mini_batch_size, 1]), np.reshape( 142 | buffer_action[this_batch], 143 | [mini_batch_size, 1])), axis=1) 144 | , x4: buffer_reward[this_batch]}) 145 | total_QQ_loss +=loss_train 146 | 147 | observation_0 = observation_1 148 | 149 | if (i_eposide - 1) % 20 == 0: 150 | ### test the agent performance ### 151 | env.render() 152 | 153 | if done_0 is True: 154 | if i_step+1==300: 155 | report_reward = 0 156 | else: 157 | report_reward=-1*discount ** (i_step) 158 | 159 | 160 | 161 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 162 | all_total_reward[i_run - 1, i_eposide - 1] = report_reward 163 | 164 | ### record average test performance ### 165 | test_size = 10 166 | Small_test_eposide_length = np.zeros((1, test_size)) 167 | Small_test_reward = np.zeros((1, test_size)) 168 | 169 | for i_test_run in range(1, test_size + 1): 170 | observation_test_0 = env.reset() 171 | 172 | for i_test_length in range(max_eposide_length): 173 | action_test_now = test_action.eval( 174 | feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 175 | action_test_now = int(action_test_now) 176 | observation_test_1, _, test_done, test_info = env.step(action_test_now) 177 | 178 | observation_test_0 = observation_test_1 179 | 180 | if test_done is True: 181 | if i_test_length+1==300: 182 | reward_test=0 183 | else: 184 | reward_test=-1 185 | Small_test_eposide_length[0, i_test_run - 1] = i_test_length + 1 186 | Small_test_reward[0, i_test_run - 1] = reward_test * ( 187 | discount ** (i_test_length)) 188 | 189 | break 190 | 191 | small_mean_test_length = np.mean(np.mean(Small_test_eposide_length, axis=0), axis=0) 192 | small_mean_test_reward = np.mean(np.mean(Small_test_reward, axis=0), axis=0) 193 | print('the ith running',i_run,'the ith eposide', i_eposide - 1, 'the test_average_length', 194 | small_mean_test_length, 195 | 'the total_test_length ', Small_test_eposide_length, '..loss..', 196 | total_QQ_loss / (i_step + 1)) 197 | all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_length 198 | 199 | all_test_reward[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_reward 200 | all_train_loss[i_run - 1, int((i_eposide - 1) / 20)] = total_QQ_loss / (i_step + 1) 201 | 202 | 203 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 204 | all_test_episode_length): 205 | 206 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 207 
| print(np.amax(all_test_episode_length)) 208 | 209 | 210 | 211 | if not os.path.exists('./part6_neural_buffer/'): 212 | os.mkdir('./part6_neural_buffer/') 213 | saver.save(sess, "./part6_neural_buffer/") 214 | print('saved') 215 | 216 | break 217 | else: 218 | if done_0 is True: 219 | reward = -1 220 | 221 | final_reward = reward * discount ** (i_step) 222 | 223 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 224 | all_total_reward[i_run - 1, i_eposide - 1] = final_reward 225 | 226 | break 227 | 228 | outfile1=all_total_reward 229 | outfile2=all_episode_length 230 | outfile3=all_train_loss 231 | outfile4=all_test_reward 232 | outfile5=all_test_episode_length 233 | 234 | np.save('reward_data_train_part6', outfile1) 235 | np.save('length_data_train_part6',outfile2) 236 | 237 | np.save('loss_data_train_part6', outfile3) 238 | np.save('length_data_test_part6', outfile4) 239 | np.save('reward_data_test_part6', outfile5) 240 | 241 | 242 | mean_episode_len = np.mean(all_episode_length, axis=0) 243 | mean_total_reward = np.mean(all_total_reward, axis=0) 244 | mean_loss_train=np.mean(all_train_loss,axis=0) 245 | mean_test_eposide_length=np.mean(all_test_episode_length,axis=0) 246 | mean_test_reward=np.mean(all_test_reward,axis=0) 247 | 248 | std_episode_len = np.std(all_episode_length, axis=0) 249 | std_total_reward = np.std(all_total_reward, axis=0) 250 | -------------------------------------------------------------------------------- /CartPole/experience_replay/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/hundred-random-episode/100_random_episodes.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from numpy import float32, uint32 4 | env = gym.make('CartPole-v0') 5 | ### set parameters and set ### 6 | discount_factor=0.99 7 | eposide_length=[] 8 | expected_value=[] 9 | for i_episode in range(100): 10 | 11 | observation_init = env.reset() 12 | 13 | for t in range(300): 14 | ### select action by uniform distribution ### 15 | action= np.random.uniform(0,1,1) 16 | 17 | action=np.round(action) 18 | action=int(action) 19 | 20 | observation, reward, done, info = env.step(action) 21 | 22 | #print(reward) 23 | if done: 24 | ### when each eposide ended record the return and eposide's length 25 | print("Episode length is {} ".format(t+1)) 26 | eposide_length.append(t+1) 27 | reward=-1 28 | reward_return=reward*(discount_factor**(t)) 29 | expected_value.append(reward_return) 30 | break 31 | 32 | 33 | print("the episode's length", eposide_length) 34 | print('the mean of episode length',np.mean(eposide_length)) 35 | 36 | print('the standard deviation of episode length',np.std(eposide_length)) 37 | 38 | print('....expected return from the initial state.....') 39 | print(expected_value) 40 | print('the mean of initial return',np.mean(expected_value,axis=0)) 41 | print('the standard deviation of initial return', np.std(expected_value,axis=0)) 42 | -------------------------------------------------------------------------------- /CartPole/hundred-random-episode/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_length.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_length.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_linear_0.001_reward.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_length.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/batch_Q_learning_neural_0.0001_reward.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_collect_data.py: -------------------------------------------------------------------------------- 1 | #### collect data based on random policy with 2000 eposides #### 2 | import gym 3 | import numpy as np 4 | from numpy import float32, uint32 5 | 6 | env = gym.make('CartPole-v0') 7 | 8 | discount_factor=0.99 9 | eposide_length=[] 10 | expected_value=[] 11 | transition=[] 12 | for i_episode in range(2000): 13 | print(i_episode) 14 | observation_init = env.reset() 15 | 16 | for t in range(300): 17 | 18 | env.render() 19 | 20 | action= np.random.uniform(0,1,1) 21 | action=np.round(action) 22 | action=int(action) 23 | observation, reward, done, info = env.step(action) 24 | 25 | action=np.array(action) 26 | 27 | 28 | if done is False: 29 | reward = 0 30 | reward = np.array(reward) 31 | print(observation, reward, done, info) 32 | 33 | 34 | if t==0: 35 | this_observation=observation_init 36 | next_observation=observation 37 | transition.append((this_observation,next_observation,reward,action)) 38 | 39 | else: 40 | this_observation=next_observation 41 | next_observation=observation 42 | transition.append((this_observation,next_observation,reward,action)) 43 | 44 | if done is True: 45 | print("Episode length is {} ".format(t+1)) 46 | eposide_length.append(t+1) 47 | reward=-1 48 | reward=np.array(reward) 49 | this_observation = next_observation 50 | next_observation = observation 51 | transition.append((this_observation, next_observation, reward, action)) 52 | 53 | reward_return=reward*(discount_factor**(t)) 54 | expected_value.append(reward_return) 55 | 56 | break 57 | 58 | print("the episode's length", eposide_length) 59 | print('the mean of episode length',np.mean(eposide_length)) 60 | 61 | 62 | print('the standard deviation of episode length',np.std(eposide_length)) 63 | 64 | 65 | print('....expected return from the initial 
state.....') 66 | print('the expected value of return',expected_value) 67 | 68 | outfile1 =np.array(transition) 69 | np.save('train_data_2',outfile1) 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_linear_4_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | print("......Loading train_data......") 11 | 12 | ### load the stored data ### 13 | train_data=np.load('train_data_2.npy') 14 | 15 | ##### set the variable####### 16 | batch_size=5000 17 | discount=0.99 18 | learn_rate=0.001 19 | input_size=4 20 | output_size=2 21 | eplison=0.05 22 | 23 | x1=tf.placeholder(tf.float32, shape=[None,4]) 24 | x2=tf.placeholder(tf.float32, shape=[None,4]) 25 | x3=tf.placeholder(tf.float32, shape=[None]) 26 | x4=tf.placeholder(tf.float32, shape=[None]) 27 | x5=tf.placeholder(tf.float32, shape=[None]) 28 | 29 | 30 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,output_size])) 31 | Bias_1=tf.Variable(tf.constant(0.1,shape=[output_size])) 32 | 33 | ### the prediction for each action ### 34 | prediction_now=tf.add(tf.matmul(x1,Weight_1),Bias_1) 35 | 36 | prediction_next=tf.add(tf.matmul(x2,Weight_1),Bias_1) 37 | 38 | ### take q value by actual action ### 39 | True_action=tf.cast(x4,tf.int32) 40 | True_action=tf.reshape(True_action,shape=[-1,1]) 41 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 42 | action_double=tf.concat([action_repeat,True_action],1) 43 | 44 | qa=tf.gather_nd(params=prediction_now,indices=action_double) 45 | 46 | ### select the action during test #### 47 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 48 | 49 | ### loss function ### 50 | less=tf.add(x3+discount*tf.stop_gradient((1+x3)*tf.reduce_max(prediction_next,axis=1)),-1*qa) 51 | 52 | delta=less 53 | q_loss=tf.reduce_sum((tf.square(delta)))/2 54 | 55 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 56 | 57 | #### save the model #### 58 | saver=tf.train.Saver() 59 | 60 | ### use gpu us training data ### 61 | with tf.device('/cpu:0'): 62 | with tf.Session() as sess: 63 | ### reload the model ### 64 | saver.restore(sess, './part3_linear_4/') 65 | 66 | eposide_length = [] 67 | expected_value = [] 68 | 69 | test_size=50 70 | all_eposide_length = np.zeros((1, test_size)) 71 | all_reward = np.zeros((1, test_size)) 72 | 73 | ### test the performance for final model ### 74 | ### reset 50 times to test the performance ### 75 | 76 | for i_episode in range(test_size): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | observation_curr, reward_curr, done, info = env.step(Action[0]) 87 | 88 | observation_next = [observation_curr] 89 | else: 90 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 91 | 92 | observation_curr, reward_curr, done, info = env.step(Action[0]) 93 | observation_next = [observation_curr] 94 | 95 | if done is True: 96 | 97 | eposide_length.append(t + 1) 98 | reward = -1 99 | reward_return = reward * (discount ** (t)) 100 | 
expected_value.append(reward_return) 101 | 102 | break 103 | all_eposide_length[0, i_episode] = t + 1 104 | all_reward[0, i_episode] = reward_return 105 | 106 | 107 | all_eposide_length = np.sum(all_eposide_length, axis=0) 108 | all_reward = np.sum(all_reward, axis=0) 109 | 110 | 111 | 112 | print('the mean of episode length', np.mean(eposide_length)) 113 | print('the mean of episode length', np.mean(all_reward)) 114 | 115 | print('the standard deviation of episode length', np.std(eposide_length)) 116 | ### print the eposide length and all reward during test ### 117 | plt.plot(all_eposide_length) 118 | plt.xlabel('Num of episode') 119 | plt.ylabel('length of eposide') 120 | plt.show() 121 | plt.plot(all_reward) 122 | plt.xlabel('Num of episode') 123 | plt.ylabel('reward') 124 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_linear_4_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | print("......Loading train_data......") 11 | 12 | ### load the stored data ### 13 | train_data=np.load('train_data_2.npy') 14 | 15 | ##### set the variable####### 16 | batch_size=5000 17 | discount=0.99 18 | learn_rate=0.001 19 | input_size=4 20 | output_size=2 21 | eplison=0.05 22 | 23 | x1=tf.placeholder(tf.float32, shape=[None,4]) 24 | x2=tf.placeholder(tf.float32, shape=[None,4]) 25 | x3=tf.placeholder(tf.float32, shape=[None]) 26 | x4=tf.placeholder(tf.float32, shape=[None]) 27 | x5=tf.placeholder(tf.float32, shape=[None]) 28 | 29 | 30 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,output_size])) 31 | Bias_1=tf.Variable(tf.constant(0.1,shape=[output_size])) 32 | 33 | ### the prediction for each action ### 34 | prediction_now=tf.add(tf.matmul(x1,Weight_1),Bias_1) 35 | 36 | prediction_next=tf.add(tf.matmul(x2,Weight_1),Bias_1) 37 | 38 | ### take q value by actual action ### 39 | True_action=tf.cast(x4,tf.int32) 40 | True_action=tf.reshape(True_action,shape=[-1,1]) 41 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 42 | action_double=tf.concat([action_repeat,True_action],1) 43 | 44 | qa=tf.gather_nd(params=prediction_now,indices=action_double) 45 | 46 | ### select the action during test #### 47 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 48 | 49 | ### loss function ### 50 | less=tf.add(x3+discount*tf.stop_gradient((1+x3)*tf.reduce_max(prediction_next,axis=1)),-1*qa) 51 | 52 | delta=less 53 | q_loss=tf.reduce_sum((tf.square(delta)))/2 54 | 55 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 56 | 57 | #### save the model #### 58 | saver=tf.train.Saver() 59 | 60 | with tf.device('/cpu:0'): 61 | with tf.Session() as sess: 62 | 63 | sess.run(tf.global_variables_initializer()) 64 | 65 | print('......start training data......') 66 | 67 | length_total_data=len(train_data) 68 | 69 | ### consist indices for tf.gather_nd function ### 70 | select_order=np.arange(batch_size) 71 | 72 | ### the training and test size ### 73 | batch_number=5000 74 | test_size=20 75 | 76 | ### set to store output ### 77 | all_eposide_length=np.zeros((1,batch_number)) 78 | all_reward=np.zeros((1,batch_number)) 79 | all_loss=np.zeros((1,batch_number)) 80 | 81 | 82 | for i_batch in range(batch_number): 83 | batch_sample = random.sample(range(0, 
length_total_data), batch_size) 84 | 85 | ### obtain sample data ### 86 | insert_data=train_data[batch_sample] 87 | obser_now = [] 88 | obser_next = [] 89 | reward_now = [] 90 | action_now = [] 91 | 92 | for i_select in range(batch_size): 93 | obser_now.append(insert_data[i_select][0]) 94 | obser_next.append(insert_data[i_select][1]) 95 | 96 | reward_now.append(insert_data[i_select][2]) 97 | action_now.append(insert_data[i_select][3]) 98 | 99 | ### training network ### 100 | _, train_loss = sess.run([train_optimizer, q_loss],feed_dict={x1: obser_now, x2: obser_next, x3: reward_now, x4: action_now,x5:select_order 101 | }) 102 | ### test the agent after each training 103 | if i_batch % 1 == 0: 104 | print('...ith training....:', i_batch, 'average training loss:', train_loss/batch_size) 105 | 106 | eposide_length = np.zeros((1,test_size)) 107 | expected_value = np.zeros((1,test_size)) 108 | 109 | for i_episode in range(test_size): 110 | # print(i_episode) 111 | observation_init = env.reset() 112 | observation_init = [observation_init] 113 | observation_next=observation_init 114 | for t in range(300): 115 | 116 | ### greedy policy to select action ### 117 | if np.random.random() <= eplison: 118 | Action = np.random.randint(2) 119 | else: 120 | Action = test_action.eval(feed_dict={x1: observation_next}) 121 | 122 | observation_curr, reward_curr, done, info = env.step(int(Action)) 123 | 124 | observation_next = [observation_curr] 125 | 126 | if done is True: 127 | 128 | eposide_length[0,i_episode]=t + 1 129 | reward = -1 130 | reward_return = reward * (discount ** (t)) 131 | expected_value[0,i_episode]=reward_return 132 | break 133 | 134 | all_eposide_length[0,i_batch]=np.mean(np.mean(eposide_length,axis=0),axis=0) 135 | all_reward[0,i_batch]=np.mean(np.mean(expected_value,axis=0),axis=0) 136 | all_loss[0, i_batch] = train_loss/batch_size 137 | 138 | ### saved model weights #### 139 | if i_batch >= 2: 140 | if i_batch == np.argmax(all_eposide_length): 141 | print(i_batch) 142 | print(np.argmax(all_eposide_length)) 143 | 144 | if not os.path.exists('./part3_linear_4/'): 145 | os.mkdir('./part3_linear_4/') 146 | saver.save(sess, "./part3_linear_4/") 147 | print('saved') 148 | 149 | 150 | print('....the averagelength of test eposide....',np.mean(np.mean(eposide_length,axis=0),axis=0)) 151 | 152 | 153 | 154 | outfile1 = all_reward 155 | outfile2 = all_eposide_length 156 | outfile3=all_loss 157 | 158 | ### save the output ### 159 | np.save('reward_data_part3_4_300', outfile1) 160 | np.save('length_data_part3_4_300', outfile2) 161 | np.save('loss_data_part3_4_300', outfile3) 162 | 163 | mean_episode_len = np.mean(all_eposide_length,axis=0) 164 | mean_total_reward = np.mean(all_reward,axis=0) 165 | mean_total_loss =np.mean(all_loss,0) 166 | 167 | 168 | std_episode_len = np.std(all_eposide_length, axis=0) 169 | std_total_reward = np.std(all_reward, axis=0) 170 | 171 | 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_neural_5_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | env = gym.make('CartPole-v0') 10 | env._max_episode_steps = 300 11 | 12 | 13 | 14 | print("......Loading train_data......") 15 | ### load the stored data ### 16 | ### load the stored data ### 17 | train_data=np.load('train_data_2.npy') 18 | 19 | 
20 | #### set variable and parameters #### 21 | x1=tf.placeholder(tf.float32, shape=[None,4]) 22 | x2=tf.placeholder(tf.float32, shape=[None,4]) 23 | x3=tf.placeholder(tf.float32, shape=[None]) 24 | x4=tf.placeholder(tf.float32, shape=[None]) 25 | x5=tf.placeholder(tf.float32, shape=[None]) 26 | 27 | ##### set the variable####### 28 | batch_size=1000 29 | discount=0.99 30 | learn_rate=0.0001 31 | input_size=4 32 | hidden_size=100 33 | output_size=2 34 | eplison=0.05 35 | 36 | 37 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 38 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 39 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 40 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 41 | 42 | 43 | 44 | ### one hiddle layer neural network as function approximation ### 45 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 46 | prediction_No=tf.nn.relu(middle_now) 47 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 48 | 49 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 50 | prediction_Ne=tf.nn.relu(middle_next) 51 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 52 | 53 | 54 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 55 | 56 | True_action=tf.cast(x4,tf.int32) 57 | True_action=tf.reshape(True_action,shape=[-1,1]) 58 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 59 | action_double=tf.concat([action_repeat,True_action],1) 60 | 61 | ### calcaulate the loss and training ### 62 | Q_value=tf.gather_nd(params=prediction_now,indices=action_double) 63 | 64 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 65 | 66 | 67 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 68 | 69 | q_loss=tf.reduce_sum(tf.square(delta))/2 70 | 71 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 72 | 73 | 74 | 75 | 76 | #### save the model #### 77 | saver=tf.train.Saver() 78 | 79 | ### use gpu us training data ### 80 | with tf.device('/cpu:0'): 81 | with tf.Session() as sess: 82 | ### reload the model ### 83 | saver.restore(sess, './part3_neural_5_300/') 84 | 85 | eposide_length = [] 86 | expected_value = [] 87 | 88 | test_size=50 89 | all_eposide_length = np.zeros((1, test_size)) 90 | all_reward = np.zeros((1, test_size)) 91 | 92 | ### test the performance for final model ### 93 | ### reset 50 times to test the performance ### 94 | 95 | for i_episode in range(test_size): 96 | 97 | observation_init = env.reset() 98 | observation_init = [observation_init] 99 | 100 | for t in range(300): 101 | 102 | if t == 0: 103 | 104 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 105 | observation_curr, reward_curr, done, info = env.step(Action[0]) 106 | 107 | observation_next = [observation_curr] 108 | else: 109 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 110 | 111 | observation_curr, reward_curr, done, info = env.step(Action[0]) 112 | observation_next = [observation_curr] 113 | 114 | if done is True: 115 | 116 | eposide_length.append(t + 1) 117 | reward = -1 118 | reward_return = reward * (discount ** (t)) 119 | expected_value.append(reward_return) 120 | 121 | break 122 | all_eposide_length[0, i_episode] = t + 1 123 | all_reward[0, i_episode] = reward_return 124 | 125 | 126 | all_eposide_length = np.sum(all_eposide_length, axis=0) 127 | all_reward = np.sum(all_reward, axis=0) 128 | 129 | 130 | 131 | print('the mean of episode length', np.mean(eposide_length)) 132 | print('the mean of episode 
length', np.mean(all_reward)) 133 | 134 | print('the standard deviation of episode length', np.std(eposide_length)) 135 | ### print the eposide length and all reward during test ### 136 | plt.plot(all_eposide_length) 137 | plt.xlabel('Num of episode') 138 | plt.ylabel('length of eposide') 139 | plt.show() 140 | plt.plot(all_reward) 141 | plt.xlabel('Num of episode') 142 | plt.ylabel('reward') 143 | 144 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/cartpole_3_neural_5_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | #import matplotlib.pyplot as plt 8 | 9 | env = gym.make('CartPole-v0') 10 | env._max_episode_steps = 300 11 | 12 | 13 | 14 | print("......Loading train_data......") 15 | 16 | train_data=np.load('train_data_2.npy') 17 | 18 | #### set variable and parameters #### 19 | x1=tf.placeholder(tf.float32, shape=[None,4]) 20 | x2=tf.placeholder(tf.float32, shape=[None,4]) 21 | x3=tf.placeholder(tf.float32, shape=[None]) 22 | x4=tf.placeholder(tf.float32, shape=[None]) 23 | x5=tf.placeholder(tf.float32, shape=[None]) 24 | 25 | ##### set the variable####### 26 | batch_size=1000 27 | discount=0.99 28 | learn_rate=0.0001 29 | input_size=4 30 | hidden_size=100 31 | output_size=2 32 | eplison=0.05 33 | 34 | 35 | 36 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 37 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 38 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 39 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 40 | 41 | 42 | 43 | ### one hiddle layer neural network as function approximation ### 44 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 45 | prediction_No=tf.nn.relu(middle_now) 46 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 47 | 48 | 49 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 50 | prediction_Ne=tf.nn.relu(middle_next) 51 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 52 | 53 | 54 | 55 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 56 | 57 | True_action=tf.cast(x4,tf.int32) 58 | True_action=tf.reshape(True_action,shape=[-1,1]) 59 | action_repeat=tf.reshape(tf.cast(x5,tf.int32),shape=[-1,1]) 60 | action_double=tf.concat([action_repeat,True_action],1) 61 | 62 | ### calcaulate the loss and training ### 63 | Q_value=tf.gather_nd(params=prediction_now,indices=action_double) 64 | 65 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 66 | 67 | 68 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 69 | 70 | q_loss=tf.reduce_sum(tf.square(delta))/2 71 | 72 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 73 | 74 | 75 | 76 | 77 | #### save the model #### 78 | saver=tf.train.Saver() 79 | 80 | ### use gpu us training data ### 81 | with tf.device('/cpu:0'): 82 | with tf.Session() as sess: 83 | sess.run(tf.global_variables_initializer()) 84 | 85 | print('......start training data......') 86 | length_total_data=len(train_data) 87 | print(train_data[1]) 88 | 89 | #print(length_total_data) 90 | 91 | select_order=np.arange(batch_size) 92 | #print(select_order) 93 | batch_number=5000 94 | test_size=20 95 | 96 | #epoch=5000 97 | 98 | #eposide_number=100 99 | all_eposide_length=np.zeros((1,batch_number)) 100 | all_reward=np.zeros((1,batch_number)) 101 | all_loss=np.zeros((1,batch_number)) 102 | 103 | for i_batch in 
range(batch_number): 104 | #for i_eposide in range(1000000): 105 | batch_sample = random.sample(range(0, length_total_data), batch_size) 106 | insert_data=train_data[batch_sample] 107 | 108 | obser_now = [] 109 | obser_next = [] 110 | reward_now = [] 111 | action_now = [] 112 | 113 | for i_select in range(batch_size): 114 | obser_now.append(insert_data[i_select][0]) 115 | obser_next.append(insert_data[i_select][1]) 116 | 117 | reward_now.append(insert_data[i_select][2]) 118 | action_now.append(insert_data[i_select][3]) 119 | 120 | _, train_loss = sess.run([train_optimizer, q_loss],feed_dict={x1: obser_now, x2: obser_next, x3: reward_now, x4: action_now,x5:select_order 121 | }) 122 | if i_batch % 1 == 0: 123 | 124 | print('...ith training....:', i_batch, 'average training loss:', train_loss/batch_size) 125 | 126 | eposide_length = np.zeros((1,test_size)) 127 | expected_value = np.zeros((1,test_size)) 128 | 129 | for i_episode in range(test_size): 130 | 131 | observation_init = env.reset() 132 | observation_next = [observation_init] 133 | 134 | for t in range(300): 135 | #env.render() 136 | 137 | ### greedy policy to select action ### 138 | if np.random.random() <= eplison: 139 | Action = np.random.randint(2) 140 | 141 | else: 142 | 143 | Action = test_action.eval(feed_dict={x1: observation_next}) 144 | 145 | observation_curr, reward_curr, done, info = env.step(int(Action)) 146 | 147 | observation_next = [observation_curr] 148 | 149 | 150 | if done is True: 151 | 152 | eposide_length[0,i_episode]=t + 1 153 | reward = -1 154 | reward_return = reward * (discount ** (t)) 155 | expected_value[0,i_episode]=reward_return 156 | 157 | break 158 | 159 | 160 | all_eposide_length[0,i_batch]=np.mean(np.mean(eposide_length,axis=0),axis=0) 161 | all_reward[0,i_batch]=np.mean(np.mean(expected_value,axis=0),axis=0) 162 | all_loss[0, i_batch] = train_loss/batch_size 163 | if i_batch>=2: 164 | if i_batch==np.argmax(all_eposide_length): 165 | print(i_batch) 166 | print(np.argmax(all_eposide_length)) 167 | 168 | if not os.path.exists('./part3_neural_5_300/'): 169 | os.mkdir('./part3_neural_5_300/') 170 | saver.save(sess, "./part3_neural_5_300/") 171 | print('saved') 172 | 173 | 174 | print('....the averagelength of test eposide....',np.mean(np.mean(eposide_length,axis=0),axis=0)) 175 | 176 | 177 | 178 | outfile1 = all_reward 179 | outfile2 = all_eposide_length 180 | outfile3=all_loss 181 | #print(outfile2) 182 | #print(outfile1) 183 | #print(outfile3) 184 | #print('....the all eposide_length....',all_eposide_length) 185 | 186 | np.save('reward_data_part3_5_neural_300', outfile1) 187 | np.save('length_data_part3_5_neural_300', outfile2) 188 | np.save('loss_data_part3_5_neural_300', outfile3) 189 | 190 | 191 | mean_episode_len = np.mean(all_eposide_length,axis=0) 192 | mean_total_reward = np.mean(all_reward,axis=0) 193 | mean_total_loss =np.mean(all_loss,0) 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/check_data.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | train_reward=np.load('reward_data_part3_4_300.npy') 9 | train_length=np.load('length_data_part3_4_300.npy') 10 | train_loss=np.load('loss_data_part3_4_300.npy') 11 | 12 | 13 | plt.plot(np.mean(train_loss,axis=0)) 14 | 
plt.xlabel('ith Num of training episode') 15 | plt.ylabel('train_Mean loss') 16 | plt.show() 17 | plt.plot(np.mean(train_reward,axis=0)) 18 | plt.xlabel('ith Num of training episode') 19 | plt.ylabel('train_Mean reward') 20 | plt.show() 21 | 22 | plt.plot(np.mean(train_length,axis=0)) 23 | plt.xlabel('ith Num of training episode') 24 | plt.ylabel('train_Mean length') 25 | plt.show() 26 | 27 | 28 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/figure_1-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/figure_1-3.png -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/length_data_part3_4_300.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/length_data_part3_4_300.npy -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/loss_data_part3_4_300.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/loss_data_part3_4_300.npy -------------------------------------------------------------------------------- /CartPole/offline-batch-Q-learning/reward_data_part3_4_300.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/offline-batch-Q-learning/reward_data_part3_4_300.npy -------------------------------------------------------------------------------- /CartPole/online-Q-learning/cartpole_4_neural_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=100 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | 
prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/cpu:0'): 68 | with tf.Session() as sess: 69 | ## reload the weights ### 70 | saver.restore(sess, './part4_neural_300/') 71 | eposide_length = [] 72 | expected_value = [] 73 | all_eposide_length = np.zeros((1, 10)) 74 | all_reward = np.zeros((1, 100)) 75 | 76 | for i_episode in range(10): 77 | 78 | observation_init = env.reset() 79 | observation_init = [observation_init] 80 | 81 | for t in range(300): 82 | 83 | if t == 0: 84 | 85 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 86 | 87 | 88 | observation_curr, reward_curr, done, info = env.step(Action[0]) 89 | 90 | observation_next = [observation_curr] 91 | else: 92 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 93 | 94 | observation_curr, reward_curr, done, info = env.step(Action[0]) 95 | observation_next = [observation_curr] 96 | 97 | if done is True: 98 | 99 | eposide_length.append(t + 1) 100 | reward = -1 101 | reward_return = reward * (discount ** (t)) 102 | expected_value.append(reward_return) 103 | 104 | break 105 | all_eposide_length[0, i_episode] = t + 1 106 | all_reward[0, i_episode] = reward_return 107 | 108 | all_eposide_length = np.mean(all_eposide_length, axis=0) 109 | all_reward = np.mean(all_reward, axis=0) 110 | 111 | 112 | 113 | print('the mean of episode length', np.mean(eposide_length)) 114 | print('the mean of reward ',np.mean(expected_value)) 115 | 116 | print('the standard deviation of episode length', np.std(eposide_length)) 117 | plt.plot(all_eposide_length) 118 | plt.xlabel('Num of episode') 119 | plt.ylabel('length of eposide') 120 | plt.show() 121 | plt.plot(all_reward) 122 | plt.xlabel('Num of episode') 123 | plt.ylabel('reward') 124 | plt.show() 125 | -------------------------------------------------------------------------------- /CartPole/online-Q-learning/cartpole_4_neural_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | #import matplotlib.pyplot as plt 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | 10 | 11 | 12 | #### set variable and parameters #### 13 | x1=tf.placeholder(tf.float32, shape=[None,4]) 14 | x2=tf.placeholder(tf.float32, shape=[None,4]) 15 | x3=tf.placeholder(tf.float32, shape=[None,1]) 16 | x4=tf.placeholder(tf.int32, shape=[None,2]) 17 | 18 | 19 | discount=0.99 20 | learn_rate=0.0001 21 | input_size=4 22 | hidden_size=100 23 | output_size=2 24 | eplison=0.05 25 | max_eposide_length=300 26 | 27 | Weight_1=tf.Variable(tf.truncated_normal(shape=[input_size,hidden_size])) 28 | 
Weight_2=tf.Variable(tf.truncated_normal(shape=[hidden_size,output_size])) 29 | Bias_1=tf.Variable(tf.constant(0.1,shape=[hidden_size])) 30 | Bias_2=tf.Variable(tf.constant(0.1,shape=[output_size])) 31 | 32 | 33 | 34 | ### one hiddle layer neural network as function approximation ### 35 | middle_now=tf.matmul(x1,Weight_1)+Bias_1 36 | prediction_No=tf.nn.relu(middle_now) 37 | prediction_now=tf.matmul(prediction_No,Weight_2)+Bias_2 38 | 39 | 40 | middle_next=tf.matmul(x2,Weight_1)+Bias_1 41 | prediction_Ne=tf.nn.relu(middle_next) 42 | prediction_next=tf.matmul(prediction_Ne,Weight_2)+Bias_2 43 | 44 | ### the best action based on observation_now ### 45 | test_action=tf.cast(tf.argmax(prediction_now,1),tf.int32) 46 | 47 | 48 | 49 | ### calcaulate the loss and training ### 50 | Q_value=tf.gather_nd(params=prediction_now,indices=x4) 51 | 52 | Max_Q_value_next=tf.reduce_max(prediction_next,axis=1) 53 | 54 | 55 | delta=tf.add(x3+discount*tf.stop_gradient((1+x3)*Max_Q_value_next),(-1*Q_value)) 56 | 57 | q_loss=tf.reduce_sum(tf.square(delta)/2) 58 | 59 | train_optimizer=tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 60 | 61 | 62 | #### save the model #### 63 | saver=tf.train.Saver() 64 | 65 | 66 | 67 | with tf.device('/gpu:0'): 68 | 69 | #### set the set to save data #### 70 | run_size = 100 71 | all_episode_length = np.zeros((run_size, 2000)) 72 | all_total_reward = np.zeros((run_size, 2000)) 73 | all_test_episode_length = np.zeros((run_size, 2000)) 74 | all_test_reward = np.zeros((run_size, 2000)) 75 | all_train_loss = np.zeros((run_size, 2000)) 76 | 77 | 78 | with tf.Session() as sess: 79 | for i_run in range(1,run_size+1): 80 | sess.run(tf.global_variables_initializer()) 81 | 82 | print('......start training data......') 83 | 84 | for i_eposide in range(1,1+2000): 85 | 86 | ### begin a new eposide ### 87 | observation_00 = env.reset() 88 | total_reward=0 89 | total_QQ_loss=0 90 | 91 | for i_step in range(max_eposide_length): 92 | 93 | ### greedy policy to select action ### 94 | if np.random.random() <= eplison: 95 | action_select_now=np.random.randint(2) 96 | 97 | else: 98 | ### use Q function to select action ### 99 | action_select_now=sess.run(test_action,feed_dict={x1:np.reshape(observation_00, [1, 4])}) 100 | action_select_now=int(action_select_now) 101 | 102 | observation_11,_,done_0,info=env.step(action_select_now) 103 | 104 | if done_0 is False: 105 | reward=0 106 | else: 107 | reward=-1 108 | ### training step ### 109 | _,train_loss=sess.run([train_optimizer,q_loss], feed_dict={x1:np.reshape( observation_00,[1,4]), x2: np.reshape( observation_11,[1,4]), x3:np.reshape(reward,[1,1]),x4:np.reshape([0,action_select_now],[1,2])}) 110 | 111 | total_QQ_loss +=train_loss 112 | 113 | observation_00 = observation_11 114 | 115 | 116 | if done_0 is True: 117 | reward=-1 118 | 119 | final_reward =reward* discount**(i_step) 120 | 121 | all_episode_length[i_run-1, i_eposide-1] = i_step + 1 122 | all_total_reward[i_run-1, i_eposide-1] = final_reward 123 | 124 | 125 | 126 | ### record average test performance ### 127 | test_size=10 128 | Small_test_eposide_length = np.zeros((1, test_size)) 129 | Small_test_reward = np.zeros((1, test_size)) 130 | 131 | for i_test_run in range(test_size): 132 | #env.render() 133 | observation_test_0 = env.reset() 134 | 135 | for i_test_length in range(max_eposide_length): 136 | action_test_now = test_action.eval(feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 137 | action_test_now=int(action_test_now) 138 | observation_test_1, _, test_done, test_info = 
env.step(int(action_test_now)) 139 | 140 | observation_test_0=observation_test_1 141 | 142 | if test_done is False: 143 | reward_test = 0, 144 | else: 145 | reward_test = -1 146 | 147 | if test_done is True: 148 | Small_test_eposide_length[0,i_test_run]=i_test_length+1 149 | Small_test_reward[0,i_test_run]=reward_test*(discount**(i_test_length)) 150 | 151 | 152 | break 153 | 154 | 155 | small_mean_test_length=np.mean(np.mean(Small_test_eposide_length,axis=0),axis=0) 156 | small_mean_test_reward=np.mean(np.mean(Small_test_reward,axis=0),axis=0) 157 | print('ith_run', i_run-1, 'the ith eposide', i_eposide, 'the train_length_eposide', i_step + 1, 158 | 'the test average length', small_mean_test_length , '..loss..', 159 | train_loss) 160 | all_test_episode_length[i_run-1, i_eposide-1]=small_mean_test_length 161 | all_test_reward[i_run-1, i_eposide-1]=small_mean_test_reward 162 | all_train_loss[i_run-1, i_eposide-1] = total_QQ_loss/(i_step+1) 163 | 164 | break 165 | ##### saved the model for best ...#### 166 | if i_eposide >= 2: 167 | if all_test_episode_length[i_run-1, i_eposide-1] == np.amax(all_test_episode_length): 168 | print('.....',all_test_episode_length[i_run-1, i_eposide-1]) 169 | print(np.amax(all_test_episode_length)) 170 | 171 | if not os.path.exists('./part4_neural_300/'): 172 | 173 | os.mkdir('./part4_neural_300/') 174 | saver.save(sess, "./part4_neural_300/") 175 | print('saved') 176 | 177 | 178 | ### save and plot performance during training and tes #### 179 | outfile1=all_total_reward 180 | outfile2=all_episode_length 181 | outfile3=all_train_loss 182 | outfile4=all_test_reward 183 | outfile5=all_test_episode_length 184 | 185 | 186 | np.save('part4_train_reward_300', outfile1) 187 | np.save('part4_train_eposide_length_300',outfile2) 188 | 189 | np.save('part4_train_loss_300', outfile3) 190 | np.save('part4_test_reward_300', outfile4) 191 | np.save('part4_test_length_300', outfile5) 192 | -------------------------------------------------------------------------------- /CartPole/online-Q-learning/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/readme: -------------------------------------------------------------------------------- 1 | 运行 py文件是 请将train_data_2 放到相应的文件夹, 或者自己改成相应的路径 2 | 3 | train data 2 由 offline Q Learning文件夹中的 cartpole_3_collect_data 生成 4 | 5 | ......saved.py 文件用于训练 和存储 模型 6 | ...... 
load.py 文件用于 加载存储好的模型参数, 用于测试训练好的模型效果 7 | -------------------------------------------------------------------------------- /CartPole/target-parameter/cartpole_7_target_load.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | env = gym.make('CartPole-v0') 10 | env._max_episode_steps = 300 11 | 12 | print("......Loading train_data......") 13 | 14 | 15 | train_data=np.load('train_data_2.npy') 16 | 17 | #### set variable and parameters #### 18 | x1 = tf.placeholder(tf.float32, shape=[None, 4]) 19 | x2 = tf.placeholder(tf.float32, shape=[None, 4]) 20 | x3 = tf.placeholder(tf.float32, shape=[None, 2]) 21 | x4 = tf.placeholder(tf.float32, shape=[None]) 22 | 23 | batch_size = 128 24 | discount = 0.99 25 | learn_rate = 0.0001 26 | input_size = 4 27 | hidden_size = 100 28 | output_size = 2 29 | max_eposide_length = 300 30 | eplison = 0.05 31 | 32 | Weight_1 = tf.Variable(tf.truncated_normal(shape=[input_size, hidden_size])) 33 | Weight_2 = tf.Variable(tf.truncated_normal(shape=[hidden_size, output_size])) 34 | Bias_1 = tf.Variable(tf.constant(0.1, shape=[hidden_size])) 35 | Bias_2 = tf.Variable(tf.constant(0.1, shape=[output_size])) 36 | 37 | Weight_old_1=tf.placeholder(tf.float32,shape=[input_size,hidden_size]) 38 | Weight_old_2=tf.placeholder(tf.float32,shape=[hidden_size,output_size]) 39 | Bias_old_1=tf.placeholder(tf.float32,shape=[hidden_size]) 40 | Bias_old_2=tf.placeholder(tf.float32,shape=[output_size]) 41 | 42 | 43 | middle_now = tf.matmul(x1, Weight_1) + Bias_1 44 | prediction_No = tf.nn.relu(middle_now) 45 | prediction_now = tf.matmul(prediction_No, Weight_2) + Bias_2 46 | 47 | middle_next = tf.matmul(x2, Weight_old_1) + Bias_old_1 48 | prediction_Ne = tf.nn.relu(middle_next) 49 | prediction_next = tf.matmul(prediction_Ne, Weight_old_2) + Bias_old_2 50 | # 51 | 52 | True_action = tf.cast(x3, tf.int32) 53 | test_action = tf.cast(tf.argmax(prediction_now, 1), tf.int32) 54 | Q_value = tf.gather_nd(prediction_now, True_action) 55 | 56 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 57 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 58 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 59 | 60 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 61 | 62 | saver = tf.train.Saver() 63 | 64 | 65 | with tf.device('/cpu:0'): 66 | with tf.Session() as sess: 67 | ## reload the weights ### 68 | saver.restore(sess, './part7_target/') 69 | eposide_length = [] 70 | expected_value = [] 71 | all_eposide_length = np.zeros((1, 10)) 72 | all_reward = np.zeros((1, 100)) 73 | 74 | for i_episode in range(10): 75 | 76 | observation_init = env.reset() 77 | observation_init = [observation_init] 78 | 79 | for t in range(300): 80 | 81 | if t == 0: 82 | 83 | Action = test_action.eval(feed_dict={x1: observation_init, x2: observation_init}) 84 | 85 | 86 | observation_curr, reward_curr, done, info = env.step(Action[0]) 87 | 88 | observation_next = [observation_curr] 89 | else: 90 | Action = test_action.eval(feed_dict={x1: observation_next, x2: observation_next}) 91 | 92 | observation_curr, reward_curr, done, info = env.step(Action[0]) 93 | observation_next = [observation_curr] 94 | 95 | if done is True: 96 | 97 | eposide_length.append(t + 1) 98 | reward = -1 99 | reward_return = reward * (discount ** (t)) 100 | expected_value.append(reward_return) 101 | 102 | break 103 | all_eposide_length[0, 
i_episode] = t + 1 104 | all_reward[0, i_episode] = reward_return 105 | 106 | all_eposide_length = np.mean(all_eposide_length, axis=0) 107 | all_reward = np.mean(all_reward, axis=0) 108 | 109 | 110 | 111 | print('the mean of episode length', np.mean(eposide_length)) 112 | print('the mean of reward ',np.mean(expected_value)) 113 | 114 | print('the standard deviation of episode length', np.std(eposide_length)) 115 | plt.plot(all_eposide_length) 116 | plt.xlabel('Num of episode') 117 | plt.ylabel('length of eposide') 118 | plt.show() 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /CartPole/target-parameter/cartpole_7_target_saved.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import tensorflow as tf 4 | import random 5 | import os 6 | 7 | env = gym.make('CartPole-v0') 8 | env._max_episode_steps = 300 9 | print("......Loading train_data......") 10 | 11 | train_data=np.load('train_data_2.npy') 12 | 13 | #### set variable and parameters #### 14 | x1 = tf.placeholder(tf.float32, shape=[None, 4]) 15 | x2 = tf.placeholder(tf.float32, shape=[None, 4]) 16 | x3 = tf.placeholder(tf.float32, shape=[None, 2]) 17 | x4 = tf.placeholder(tf.float32, shape=[None]) 18 | 19 | batch_size = 128 20 | discount = 0.99 21 | learn_rate = 0.0001 22 | input_size = 4 23 | hidden_size = 100 24 | output_size = 2 25 | max_eposide_length = 300 26 | eplison = 0.05 27 | 28 | Weight_1 = tf.Variable(tf.truncated_normal(shape=[input_size, hidden_size])) 29 | Weight_2 = tf.Variable(tf.truncated_normal(shape=[hidden_size, output_size])) 30 | Bias_1 = tf.Variable(tf.constant(0.1, shape=[hidden_size])) 31 | Bias_2 = tf.Variable(tf.constant(0.1, shape=[output_size])) 32 | 33 | ### the old weights ### 34 | Weight_old_1=tf.placeholder(tf.float32,shape=[input_size,hidden_size]) 35 | Weight_old_2=tf.placeholder(tf.float32,shape=[hidden_size,output_size]) 36 | Bias_old_1=tf.placeholder(tf.float32,shape=[hidden_size]) 37 | Bias_old_2=tf.placeholder(tf.float32,shape=[output_size]) 38 | 39 | 40 | middle_now = tf.matmul(x1, Weight_1) + Bias_1 41 | prediction_No = tf.nn.relu(middle_now) 42 | prediction_now = tf.matmul(prediction_No, Weight_2) + Bias_2 43 | 44 | ### calculate the target value by old weights ### 45 | middle_next = tf.matmul(x2, Weight_old_1) + Bias_old_1 46 | prediction_Ne = tf.nn.relu(middle_next) 47 | prediction_next = tf.matmul(prediction_Ne, Weight_old_2) + Bias_old_2 48 | 49 | 50 | True_action = tf.cast(x3, tf.int32) 51 | test_action = tf.cast(tf.argmax(prediction_now, 1), tf.int32) 52 | Q_value = tf.gather_nd(prediction_now, True_action) 53 | 54 | max_Q_value = tf.reduce_max(prediction_next, axis=1) 55 | delta = x4 + discount * tf.stop_gradient((1 + x4) * max_Q_value) - Q_value 56 | q_loss = tf.reduce_mean(tf.square(delta) / 2) 57 | 58 | train_optimizer = tf.train.AdamOptimizer(learn_rate).minimize(q_loss) 59 | 60 | saver = tf.train.Saver() 61 | 62 | with tf.device('/cpu:0'): 63 | eposide_size = 2000 64 | run_size = 1 65 | all_episode_length = np.zeros((run_size, int(eposide_size))) 66 | all_total_reward = np.zeros((run_size, int(eposide_size))) 67 | all_test_episode_length = np.zeros((run_size, int(eposide_size))) 68 | all_test_reward = np.zeros((run_size, int(eposide_size / 20))) 69 | all_train_loss = np.zeros((run_size, int(eposide_size / 20))) 70 | 71 | length_of_train = len(train_data) 72 | for i_run in range(1, run_size + 1): 73 | ### build the experience replay ### 74 | 75 | 
buffer_size = 1024 76 | mini_batch_size = 64 77 | 78 | length_of_train = len(train_data) 79 | buffer_sample = random.sample(range(0, length_of_train), buffer_size) 80 | buffer_replay = train_data[buffer_sample] 81 | 82 | buffer_observation_now = [] 83 | buffer_observation_next = [] 84 | buffer_action = [] 85 | buffer_reward = [] 86 | 87 | for i_sele in range(buffer_size): 88 | buffer_observation_now.append(buffer_replay[i_sele][0]) 89 | buffer_observation_next.append(buffer_replay[i_sele][1]) 90 | buffer_reward.append(buffer_replay[i_sele][2]) 91 | buffer_action.append(buffer_replay[i_sele][3]) 92 | 93 | with tf.Session() as sess: 94 | 95 | sess.run(tf.global_variables_initializer()) 96 | 97 | for i_eposide in range(1, 1 + eposide_size): 98 | 99 | observation_0 = env.reset() 100 | 101 | total_QQ_loss = 0 102 | 103 | for i_step in range(max_eposide_length): 104 | ### hold the old weights for target calculation ### 105 | if ((i_eposide - 1) % 5 == 0): 106 | old_weight_1, old_weight_2, old_bias_1, old_bias_2 = sess.run([Weight_1, Weight_2, Bias_1, Bias_2]) 107 | 108 | if np.random.random() <= eplison: 109 | action_train = np.random.randint(2) 110 | else: 111 | Q = sess.run(test_action, feed_dict={x1: np.reshape(observation_0, [1, 4])}) 112 | action_select_now = int(Q) 113 | 114 | 115 | observation_1, _, done_0, _ = env.step(action_select_now) 116 | 117 | if done_0: 118 | reward = -1 119 | else: 120 | reward = 0 121 | 122 | ### add new data to replay memory## 123 | buffer_observation_now = np.append(buffer_observation_now, np.reshape(observation_0, [1, 4]), 124 | axis=0) 125 | buffer_observation_next = np.append(buffer_observation_next, np.reshape(observation_1, [1, 4]), 126 | axis=0) 127 | buffer_action = np.append(buffer_action, [action_select_now], axis=0) 128 | buffer_reward = np.append(buffer_reward, [reward], axis=0) 129 | 130 | this_batch = random.sample(range(len(buffer_replay)), mini_batch_size) 131 | 132 | _, loss_train = sess.run([train_optimizer, q_loss], 133 | feed_dict={x1: buffer_observation_now[this_batch, :], 134 | x2: buffer_observation_next[this_batch, :], 135 | x3: np.concatenate((np.reshape( 136 | np.arange(mini_batch_size), 137 | [mini_batch_size, 1]), np.reshape( 138 | buffer_action[this_batch], 139 | [mini_batch_size, 1])), axis=1) 140 | , x4: buffer_reward[this_batch], 141 | Weight_old_1:old_weight_1, 142 | Weight_old_2:old_weight_2, 143 | Bias_old_1:old_bias_1, 144 | Bias_old_2:old_bias_2, 145 | }) 146 | total_QQ_loss += loss_train 147 | 148 | observation_0 = observation_1 149 | 150 | if (i_eposide - 1) % 20 == 0: 151 | if done_0 is True: 152 | if i_step + 1 == 300: 153 | report_reward = 0 154 | else: 155 | report_reward = -1 * discount ** (i_step) 156 | 157 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 158 | all_total_reward[i_run - 1, i_eposide - 1] = report_reward 159 | 160 | ### record average test performance ### 161 | test_size = 10 162 | Small_test_eposide_length = np.zeros((1, test_size)) 163 | Small_test_reward = np.zeros((1, test_size)) 164 | 165 | for i_test_run in range(1, test_size + 1): 166 | observation_test_0 = env.reset() 167 | 168 | for i_test_length in range(max_eposide_length): 169 | action_test_now = test_action.eval( 170 | feed_dict={x1: np.reshape(observation_test_0, [1, 4])}) 171 | action_test_now = int(action_test_now) 172 | observation_test_1, _, test_done, test_info = env.step(action_test_now) 173 | 174 | observation_test_0 = observation_test_1 175 | 176 | if test_done is True: 177 | if i_test_length + 1 == 300: 178 | 
reward_test = 0 179 | else: 180 | reward_test = -1 181 | Small_test_eposide_length[0, i_test_run - 1] = i_test_length + 1 182 | Small_test_reward[0, i_test_run - 1] = reward_test * ( 183 | discount ** (i_test_length)) 184 | 185 | 186 | break 187 | 188 | small_mean_test_length = np.mean(np.mean(Small_test_eposide_length, axis=0), axis=0) 189 | small_mean_test_reward = np.mean(np.mean(Small_test_reward, axis=0), axis=0) 190 | print('the ith running',i_run,'the ith eposide', i_eposide - 1, 'the test_average_length', 191 | small_mean_test_length, 192 | 'the total_test_length ', Small_test_eposide_length, '..loss..', 193 | total_QQ_loss / (i_step + 1)) 194 | all_test_episode_length[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_length 195 | # print((i_eposide-1)/20) 196 | # print(int((i_eposide-1)/20)) 197 | all_test_reward[i_run - 1, int((i_eposide - 1) / 20)] = small_mean_test_reward 198 | all_train_loss[i_run - 1, int((i_eposide - 1) / 20)] = total_QQ_loss / (i_step + 1) 199 | 200 | if all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)] == np.amax( 201 | all_test_episode_length): 202 | print('.....', all_test_episode_length[i_run-1, int((i_eposide - 1) / 20)]) 203 | print(np.amax(all_test_episode_length)) 204 | if not os.path.exists('./part7_target/'): 205 | os.mkdir('./part7_target/') 206 | saver.save(sess, "./part7_target/") 207 | print('saved') 208 | 209 | break 210 | else: 211 | if done_0 is True: 212 | reward = -1 213 | 214 | final_reward = reward * discount ** (i_step) 215 | 216 | all_episode_length[i_run - 1, i_eposide - 1] = i_step + 1 217 | all_total_reward[i_run - 1, i_eposide - 1] = final_reward 218 | 219 | break 220 | 221 | 222 | outfile1 = all_total_reward 223 | outfile2 = all_episode_length 224 | outfile3 = all_train_loss 225 | outfile4 = all_test_reward 226 | outfile5 = all_test_episode_length 227 | 228 | np.save('reward_data_train_part7', outfile1) 229 | np.save('length_data_train_part7', outfile2) 230 | 231 | np.save('loss_data_train_part7', outfile3) 232 | np.save('length_data_test_part7', outfile4) 233 | np.save('reward_data_test_part7', outfile5) 234 | 235 | mean_episode_len = np.mean(all_episode_length, axis=0) 236 | mean_total_reward = np.mean(all_total_reward, axis=0) 237 | mean_loss_train = np.mean(all_train_loss, axis=0) 238 | mean_test_eposide_length = np.mean(all_test_episode_length, axis=0) 239 | mean_test_reward = np.mean(all_test_reward, axis=0) 240 | 241 | std_episode_len = np.std(all_episode_length, axis=0) 242 | std_total_reward = np.std(all_total_reward, axis=0) 243 | -------------------------------------------------------------------------------- /CartPole/target-parameter/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/three-random-episode/3_random_episode.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | env = gym.make('CartPole-v0') 5 | 6 | ### set parameters and set ### 7 | discount_factor=0.99 8 | eposide_length=[] 9 | expect_value=[] 10 | 11 | for i_episode in range(3): 12 | 13 | observation_init = env.reset() 14 | for t in range(300): 15 | env.render() 16 | ### select action by uniform distribution ### 17 | action= np.random.uniform(0,1,1) 18 | action=np.round(action) 19 | action=int(action) 20 | 21 | observation, reward, done, info = env.step(action) 22 | 23 | if done: 24 | ### when each eposide ended record the 
return and eposide's length 25 | reward=-1 26 | reward_return=reward*(discount_factor**(t)) 27 | expect_value.append(reward_return) 28 | 29 | print("Episode length is {} ".format(t+1)) 30 | eposide_length.append(t+1) 31 | 32 | break 33 | 34 | print("the trajectories'length :",eposide_length) 35 | print("the return from the starting state:",expect_value) 36 | -------------------------------------------------------------------------------- /CartPole/three-random-episode/delete: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /CartPole/train_data_2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/CartPole/train_data_2.npy -------------------------------------------------------------------------------- /learning_curve/Capture_1.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/Capture_1.JPG -------------------------------------------------------------------------------- /learning_curve/DQN_PICTURE.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/DQN_PICTURE.JPG -------------------------------------------------------------------------------- /learning_curve/MsPacman0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/MsPacman0.png -------------------------------------------------------------------------------- /learning_curve/MsPacman301.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/MsPacman301.png -------------------------------------------------------------------------------- /learning_curve/Pong19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/Pong19.png -------------------------------------------------------------------------------- /learning_curve/Pong256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/Pong256.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_linear_0.001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_linear_0.001_length.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_linear_0.001_reward.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_linear_0.001_reward.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_neural_0.0001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_neural_0.0001_length.png -------------------------------------------------------------------------------- /learning_curve/batch_Q_learning_neural_0.0001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/batch_Q_learning_neural_0.0001_reward.png -------------------------------------------------------------------------------- /learning_curve/boxing0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing0.png -------------------------------------------------------------------------------- /learning_curve/boxing313.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing313.png -------------------------------------------------------------------------------- /learning_curve/boxing_128_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing_128_128.png -------------------------------------------------------------------------------- /learning_curve/boxing_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/boxing_28_28.png -------------------------------------------------------------------------------- /learning_curve/double_Q_learning_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/double_Q_learning_length.png -------------------------------------------------------------------------------- /learning_curve/double_Q_learning_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/double_Q_learning_reward.png -------------------------------------------------------------------------------- /learning_curve/experience_replay_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/experience_replay_length.png 
-------------------------------------------------------------------------------- /learning_curve/experience_replay_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/experience_replay_reward.png -------------------------------------------------------------------------------- /learning_curve/mapacman_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/mapacman_28_28.png -------------------------------------------------------------------------------- /learning_curve/mspacman_128_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/mspacman_128_128.png -------------------------------------------------------------------------------- /learning_curve/mspacman_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/mspacman_28_28.png -------------------------------------------------------------------------------- /learning_curve/online_Q_learning_neural_0.001_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/online_Q_learning_neural_0.001_length.png -------------------------------------------------------------------------------- /learning_curve/online_Q_learning_neural_0.001_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/online_Q_learning_neural_0.001_reward.png -------------------------------------------------------------------------------- /learning_curve/pong_128_128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/pong_128_128.png -------------------------------------------------------------------------------- /learning_curve/pong_28_28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/pong_28_28.png -------------------------------------------------------------------------------- /learning_curve/readme: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /learning_curve/target_parameter_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/target_parameter_length.png -------------------------------------------------------------------------------- 
/learning_curve/target_parameter_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohaomao/Reinforcment-Leanring-algorithm/a68e5ae7fc76ccf425e15f1339fb9cb592d615a8/learning_curve/target_parameter_reward.png -------------------------------------------------------------------------------- /readme.md: --------------------------------------------------------------------------------

# Classic reinforcement-learning algorithms on CartPole and several Atari games

# Installing the OpenAI Gym environment

------

##### **Assuming Python 3.5+ is already installed, gym can be installed in either of the following two ways**

```
pip install gym
```

Or:

```
git clone https://github.com/openai/gym
cd gym
pip install -e .
```

This basic installation is already enough to run some simple games such as CartPole.

Run the following snippet to verify that gym was installed successfully:

```
import gym
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample())  # take a random action
```

You can also install additional environment packages by appending the environment name in brackets to the last command of the second installation method:

```
pip install -e .[names]
```

In particular, setting `names` to `all` installs every environment, which requires extra dependencies such as cmake and a recent version of pip. Since we want the Atari environments here, and atari_py and cmake are often missing on personal machines,

**we can configure the Atari environment with the following steps (Windows):**

```
# upgrade pip
python -m pip install --upgrade pip
# install atari_py
pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py

git clone https://github.com/openai/gym
cd gym

# install cmake
pip install cmake

pip install -e .[atari]
```

> More documentation on the gym environments: http://gym.openai.com/docs/
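As a quick sanity check that the Atari installation works, a short random-action rollout along the lines below should open a Pong window. This is only a sketch: it assumes the `Pong-v0` environment used later in this project and a machine with a display, so drop `env.render()` on a headless server.

```
import gym

# Smoke test: load an Atari environment and play random actions for a few steps.
env = gym.make('Pong-v0')
obs = env.reset()
for _ in range(200):
    env.render()                                     # requires a display
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```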
# Training Deep-Q-Network models with a Google Colab GPU

Unlike CartPole, whose observations and outputs are small numeric vectors, the Atari environments return RGB frames (typically of shape (210, 160, 3)) and have larger discrete action spaces, so effective exploration requires far more computation. A large experience replay buffer also puts heavy demands on memory, which makes experiments on an ordinary CPU very time-consuming. Since Google released a free GPU service in January this year, attached to Google Drive and therefore convenient for file management, we use it to train our models.

### Obtaining Google Colab authorization:

You need to follow the displayed URL twice, log in to your Google account to authorize access, and paste the resulting keys back into the notebook to connect.

In Colab, a command prefixed with `!` is executed as if it were typed in a terminal, as shown below:

![](learning_curve/Capture_1.JPG)

```
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

from google.colab import auth
auth.authenticate_user()

from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()

import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
```

```
!mkdir -p drive
!google-drive-ocamlfuse drive
```

### Installing the classic Gym and Atari environments

Install the basic gym environment directly:

```
!pip install gym
```

Install the Atari dependencies swig and cmake:

```
!apt install git
!git clone https://github.com/openai/gym.git
cd gym
!apt-get install cmake
!apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
# then install the Atari environments #
!pip install -e '.[atari]'
```

You can now test the gym Atari environments directly in a Colab notebook.

[More Colab setup notes](https://www.234du.com/1154.html)

# Project implementation steps

------

## The Cart-Pole game:

We first implement and debug several classic reinforcement-learning algorithms, including batch (offline) Q-learning, online Q-learning, Deep Q-Network and Double Q-learning, and test each of them on the classic Cart-Pole balancing game.

Because the Cart-Pole environment returns a four-dimensional observation at every step, a small feed-forward neural network is sufficient as the action-value function approximator.

In the Atari games, however, the environment responds to an action with an image of the current game screen, so we replace the feed-forward network with a convolutional neural network as the action-value approximator and plug it into the algorithms listed above.

### Random Policy

For convenience we fix the discount factor to 0.99, set the reward to -1 on the last step of an episode and 0 otherwise, and cap the episode length at 300 steps with `env._max_episode_steps = 300`. (These settings apply to Cart-Pole only.)

CartPole has only two actions, 0 and 1. We first run a purely random policy to get familiar with the environment and to measure the average episode length and return.

The scripts are in the folders three-random-episode and hundred-random-episode.

You can call `env.render()` to watch the game as it runs. The average episode length and return are roughly 22 and -0.81 respectively.

### batch (offline) Q-learning

We first collect 2000 episodes of data under the random policy and then learn to control the pole purely from this fixed dataset by training the action-value function directly. The action-value function is represented either by a linear transformation or by a feed-forward network with a single hidden layer of 100 units. We run 5000 training updates in total, each on a batch of 1000 samples, with the Adam optimizer; the learning rates are 0.001 for the linear model and 0.0001 for the neural network (see the curves below).

We find that, compared with the feed-forward network, the linear action-value function learns to keep the pole up for 300 steps more quickly during training, but it overfits very easily; the feed-forward network learns more steadily and reaches a better final result.

***learning rate = 0.001, linear transformation***

![](learning_curve/batch_Q_learning_linear_0.001_length.png)

![](learning_curve/batch_Q_learning_linear_0.001_reward.png)

***learning rate = 0.0001, hidden layer (100) { linear transformation + ReLU }***

![](learning_curve/batch_Q_learning_neural_0.0001_length.png)

![](learning_curve/batch_Q_learning_neural_0.0001_reward.png)

***Remarks:*** Because Cart-Pole is a fairly simple task, the linearly approximated action-value function reaches the target in fewer training steps and is less sensitive to parameter initialization, but it overfits very easily and the amount of training is hard to tune. The single-hidden-layer feed-forward network, by contrast, needs more training steps to reach the target but learns more stably than the linear approximation and converges to a better final result.
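For reference, the batch update described above can be sketched roughly as follows. This is not the repo's code: it assumes the 2000 random episodes have already been flattened into NumPy arrays `states`, `actions`, `rewards`, `next_states` and `dones` (1.0 at terminal steps), and all variable names are illustrative.

```
import numpy as np
import tensorflow as tf

# Single-hidden-layer (100 units) Q-network for CartPole's 4-d state and 2 actions.
state_ph  = tf.placeholder(tf.float32, [None, 4])
action_ph = tf.placeholder(tf.int32,   [None])
target_ph = tf.placeholder(tf.float32, [None])

hidden   = tf.layers.dense(state_ph, 100, activation=tf.nn.relu)
q_values = tf.layers.dense(hidden, 2)                 # Q(s, .) for both actions
q_taken  = tf.reduce_sum(q_values * tf.one_hot(action_ph, 2), axis=1)

loss     = tf.reduce_mean(tf.square(target_ph - q_taken))
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(5000):                          # 5000 updates, 1000 samples each
        idx = np.random.randint(len(states), size=1000)
        # Bootstrapped target r + gamma * max_a Q(s', a), with no bootstrap at terminal steps.
        q_next  = sess.run(q_values, {state_ph: next_states[idx]})
        targets = rewards[idx] + 0.99 * np.max(q_next, axis=1) * (1.0 - dones[idx])
        sess.run(train_op, {state_ph: states[idx],
                            action_ph: actions[idx],
                            target_ph: targets})
```

Because the target is computed outside the graph and fed in through a placeholder, no gradient flows into the bootstrap term; when the target is built inside the graph instead, `tf.stop_gradient` plays the same role (as noted in the online Q-learning section below).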
### online Q-learning

From here on we use only neural networks to approximate the action-value function.

In offline Q-learning the feed-forward network is trained on a fixed, pre-collected dataset and the amount of data fed in per update can be chosen freely; too many updates or too large a batch easily leads to overfitting. Here, instead, each update uses only the most recent transition of the episode currently being played, i.e. the parameters of the action-value function are updated from a single sample at a time.

To help the model learn better and faster, during training we use epsilon-greedy Q-learning with epsilon = 0.05, i.e. a random action is taken with probability 0.05; during testing the action given by the value function is always used.

Based on the earlier experiments, the learning rate and optimizer are 0.001 and Adam; the other settings are the same as for the single-hidden-layer feed-forward network above. In addition, to reduce the bias introduced by parameter initialization, we train one hundred models and report the average episode length and return.

***Learning curves for online Q-learning***

![](learning_curve/online_Q_learning_neural_0.001_length.png)

![](learning_curve/online_Q_learning_neural_0.001_reward.png)

***Remarks***: Although the average episode length of about 120 steps is still far from the target of 300, the learning curves show that the model keeps learning to control the pole better and would gradually approach the target with more training time. Compared with offline Q-learning, online Q-learning learns more stably, although it takes more training time.

PS: Note that with the automatic gradient computation in TensorFlow, you must apply a stop-gradient operation to avoid adapting the learning target.

### Different Neural Size

Here we test the performance of online Q-learning with different hidden-layer sizes:

neural size = 30

or

neural size = 1000

### Experience Replay and Target Parameter

Deep Q-Network is a reinforcement-learning model proposed in recent years. Compared with traditional Q-learning it adds two important mechanisms: experience replay and a fixed set of target-network parameters.

The NIPS DQN adds an experience replay pool on top of basic deep Q-learning: transitions generated during training are stored and later sampled at random, which reduces the correlation between training samples and improves performance. The Nature DQN then makes a further improvement by adding a target Q network: the target Q value is computed by a dedicated target network rather than by the Q network that is currently being updated.

The purpose is to reduce the correlation between the target computation and the current values.

In the Nature DQN loss, L(θ) = E[(r + γ max_a' Q(s', a'; θ⁻) - Q(s, a; θ))²], the target Q value is computed with a separate set of parameters θ⁻. In the NIPS version, by contrast, the target network tracks the Q network in real time, so the target Q value is strongly correlated with the current Q value, which easily leads to over-estimation.

Hence a separate target Q network is used. Where do its parameters come from? Still from the Q network, but with a delayed update: every so often the current Q-network parameters are copied into the target Q network.

**Adding only Experience Replay to Q-learning gives the following results:**

![](learning_curve/experience_replay_length.png)

![](learning_curve/experience_replay_reward.png)

***Adding only the Target Parameter mechanism to Q-learning***

![](learning_curve/target_parameter_length.png)

![](learning_curve/target_parameter_reward.png)

***Remarks***: The learning curves show that each mechanism on its own already improves learning performance considerably, because they respectively reduce the correlation between training samples and the correlation between the Q values and the target Q values, lowering the risk of overfitting and over-estimation. It is therefore reasonable to expect an even larger improvement when both mechanisms are combined.

### Double Q-learning

In the target-parameter setup above, the target Q value and the current Q value come from the same Q network, just with parameters updated at different frequencies.

In double Q-learning, to reduce the bias introduced by the max operation in the target Q value, also known as the over-estimation problem, we train two networks: the current Q network selects the action and the target Q network evaluates the target Q value. The algorithm is shown below:

![](learning_curve/DQN_PICTURE.JPG)

***Learning curves for Double Q-learning***

![](learning_curve/double_Q_learning_length.png)

![](learning_curve/double_Q_learning_reward.png)

***Remarks***: Reducing the over-estimation in the target Q computation also appears to have a positive effect on the algorithm's performance.
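The selection/evaluation split that defines double Q-learning can be summarised in a few lines of NumPy. This is only a sketch with illustrative names: `q_online_next` and `q_target_next` stand for the two networks' Q-value outputs on the next states.

```
import numpy as np

def double_q_targets(rewards, dones, q_online_next, q_target_next, discount=0.99):
    """Double Q-learning targets for a batch of transitions.

    q_online_next, q_target_next: arrays of shape [batch, n_actions] with
    Q(s', .) under the online network and the target network respectively.
    """
    # The online network selects the greedy action ...
    best_actions = np.argmax(q_online_next, axis=1)
    # ... but the target network evaluates it.
    q_eval = q_target_next[np.arange(len(best_actions)), best_actions]
    # No bootstrapping on terminal transitions.
    return rewards + discount * q_eval * (1.0 - dones)
```

Compared with the single-network target r + discount * max_a Q_target(s', a), decoupling action selection from action evaluation is what reduces the over-estimation bias discussed above.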
## Atari games (Pong, Boxing, MsPacman):

Here we look at how the classic reinforcement-learning algorithms perform on the Atari games Pong, MsPacman and Boxing. Accordingly, we replace the feed-forward network and linear transformation above with a CNN as the action-value approximator.

***Network setup and data preprocessing***

Each frame returned by the environment is converted to a 64×64×1 grayscale image, and every four processed frames are stacked together (64×64×4) and stored as tf.uint8.

(In the Atari games above each action is repeated K times, with K sampled randomly from [2, 3, 4], to keep the setting realistic.)

The environment names are "Pong-v0", "MsPacman-v0" and "Boxing-v0" (without -ram).

The Q-value function is approximated with a convolutional neural network:

the first layer uses 6×6 filters with stride 2 and 16 channels, followed by a ReLU activation;

the second layer uses 4×4 filters with stride 2 and 32 channels, followed by a ReLU activation;

the third layer is a fully connected layer with 256 units, followed by a ReLU;

finally a linear transformation predicts the state-action values, with one output per action.

epsilon rate = 0.1, discount factor = 0.99, environment rewards clipped to -1, 0 or 1, minibatch size = 32.

The optimizer is RMSProp with learning rate 0.001.

Experience replay (at least 100000 transitions) and a target network (updated every 5 training episodes) are added to Q-learning.

### Random Policy

We first run a random policy and record the average return and episode length for each game.

### CNN + DQN for the three games

##### MsPacman:

In MsPacman a character moves around a fixed maze, avoiding enemies while collecting points. Each time it is caught by an enemy it loses one of its three lives; there is no time limit, and the goal is to score as many points as possible.

MsPacman's action space has 9 discrete actions (0 to 8), and the rewards given by gym take values in [0, 10, 50].

For simplicity, our code clips the reward to 0 or 1: whenever the score increases, the reward is set to 1 regardless of whether the raw reward was 10 or 50, and otherwise the reward is 0.

However, the game also lets us detect when a life is lost by reading the `info` dict returned at every step, e.g. {'ale.lives': 3}, whose value is the number of remaining lives. We can therefore set the reward to -1 whenever a life is lost, which makes the reward signal more informative (see the sketch in the results section below).

The game looks as follows:

![](learning_curve/MsPacman0.png)

![](learning_curve/MsPacman301.png)

##### Pong:

In Pong two differently coloured paddles play a table-tennis-like game: the red paddle is controlled by the computer and the green one by our agent. The ball starts moving between the two sides after an initial impulse, while the paddles can only move up and down, changing the ball's direction on contact. A side loses the point when it fails to return the ball past its baseline, and the other side then scores one point, shown above its half of the screen. The first side to reach 21 points wins; there is no time limit.

The agent's action space has 6 discrete actions (0 to 5), and the rewards given by gym take values in [-1, 0, 1], which already matches our setting.

The game looks as follows:

![](learning_curve/Pong19.png)

![](learning_curve/Pong256.png)

##### Boxing:

In Boxing two boxers fight on a stage: the black boxer is controlled by the computer and the white boxer is our agent. The goal is to win the match (score more points than the opponent) within 2 minutes. The boxers can move freely around the stage and throw punches. The action space has 18 discrete actions (0 to 17). The rewards are more varied than in the previous two games, taking values in [-1, -2, 0, 1, 2], which better reflects a boxing match (e.g. heavy punches score more), but for simplicity we again clip them to [-1, 0, 1].

The match looks as follows:

![](learning_curve/boxing0.png)

![](learning_curve/boxing313.png)

### Experimental results and suggestions

In the code for the three games we clip the reward to 1, 0 or -1, and we use tf.stop_gradient when computing the target Q value:

```
tf.add(x3 + discount * tf.stop_gradient((1+x3)*Max_Q_value_next), (-1 * Q_value))
```
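The life-loss penalty described in the MsPacman section can be implemented with a small helper along these lines (a sketch only; `shape_reward` and its arguments are illustrative names, and only `info['ale.lives']` comes from gym itself):

```
def shape_reward(raw_reward, info, lives_before):
    """Clip the reward to -1, 0 or 1: -1 when a life is lost (MsPacman),
    +1 for any score increase, 0 otherwise."""
    lives_now = info.get('ale.lives', lives_before)
    if lives_now < lives_before:
        reward = -1.0              # caught by an enemy: penalise the lost life
    elif raw_reward > 0:
        reward = 1.0               # any positive score change (10 or 50) counts as +1
    else:
        reward = 0.0
    return reward, lives_now

# Typical use inside the environment loop:
# obs, raw_reward, done, info = env.step(action)
# reward, lives = shape_reward(raw_reward, info, lives)
```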
Compared with training CartPole with DQN, the Atari games need far more training time:

CartPole: on a CPU, about 20 minutes of training already gives a fairly stable and effective agent.

Atari: on a Colab GPU, one episode takes a bit over a minute, and roughly 1000 episodes of training are needed before the agent shows reasonably good performance. During training the loss drops fairly quickly, but the test score (reward) only starts to show a gradual upward trend after several hundred thousand steps.

Because of the large amount of training required, we suggest evaluating the model roughly every 50000 steps. It is also worth trying different reshape sizes; for example, (28, 28) and (128, 128) differ enormously in image clarity, as shown below:

Boxing:

![Boxing_28_28](learning_curve/boxing_28_28.png)

![Boxing_128_128](learning_curve/boxing_128_128.png)

Pong:

![Pong_28_28](learning_curve/pong_28_28.png)

![Pong_128_128](learning_curve/pong_128_128.png)

MsPacman:

![MsPacman_28_28](learning_curve/mspacman_28_28.png)

![MsPacman_128_128](learning_curve/mspacman_128_128.png)

If your machine allows it, you can also build a larger experience replay buffer.

------

# Related links:

[Atari + Deep RL](https://arxiv.org/abs/1312.5602)

[Double Q-learning](http://papers.nips.cc/paper/3964-double-q-learning)

[Deep RL with Double Q-Learning](https://arxiv.org/abs/1509.06461)

[Human-level control through deep reinforcement learning](https://www.nature.com/articles/nature14236)

--------------------------------------------------------------------------------