├── .idea
    ├── Reinforcement_learning.iml
    ├── misc.xml
    ├── modules.xml
    ├── vcs.xml
    └── workspace.xml
├── Breakout
    ├── Breakout-DQN.py
    ├── Breakout_DQN_class.py
    ├── Breakout_PolicyGradient.py
    ├── Play_DQN.py
    └── breakout_dqn_pytorch.py
├── CartPole
    ├── CartPole_A2C_episodic.py
    ├── CartPole_C51.py
    ├── CartPole_DDQN.py
    ├── CartPole_DQN_NIPS2013.py
    ├── CartPole_DQN_Nature2015.py
    ├── CartPole_PAAC.py
    ├── CartPole_PAAC_multiproc.py
    ├── CartPole_PolicyGradient.py
    ├── CartPole_Q-Network.py
    ├── CartPole_Q-Network_reshape.py
    ├── Cartpole_A2C_nstep.py
    ├── Cartpole_A2C_onestep.py
    ├── cartpole_dqn.py
    ├── cartpole_ppo.py
    └── play_Cartpole.py
├── FrozenLake
    ├── FL_Q-Table.py
    ├── FL_Q-table_Stochastic.py
    ├── FL_Q-table_exp&dis.py
    ├── FrozenLake_Q-Network.ipynb
    └── FrozenLake_Q-Network.py
├── Pong
    ├── Pong_A2C_episodic.py
    └── Pong_PolicyGradient.py
├── README.md
├── Windygridworld
    ├── OptimalPolicy
    │   ├── optimal_Q-Learning.txt
    │   └── optimal_SARSA.txt
    ├── Q-learning_sarsa.py
    ├── QValue
    │   ├── Q-Learning_value.npy
    │   ├── Q-Learning_value.txt
    │   ├── SARSA_value.npy
    │   └── SARSA_value.txt
    ├── Readme.md
    ├── graph.png
    └── windygridworld.py
├── pendulum
    ├── pendulum_ddpg.py
    └── pendulum_ppo.py
└── readme
    ├── 1x1conv.gif
    ├── Play.gif
    ├── q-learning.PNG
    ├── sarsa.PNG
    └── windy.PNG

--------------------------------------------------------------------------------
/Breakout/Breakout-DQN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | import copy
5 | import numpy as np
6 | import random as ran
7 | import datetime
8 | import matplotlib.pyplot as plt
9 |
10 | from collections import deque
11 | from skimage.transform import resize
12 | from skimage.color import rgb2gray
13 |
14 | plt.ion()
15 | env = gym.make('BreakoutDeterministic-v3')
16 |
17 | DDQN = False
18 |
19 | # Number of replay samples drawn per training update (minibatch size)
20 | MINIBATCH = 32
21 | # List (deque) that stores the replay transitions
22 | REPLAY_MEMORY = deque()
23 |
24 | HISTORY_STEP = 4
25 | FRAMESKIP = 4
26 | TRAIN_INTERVAL = 4
27 | NO_STEP = 30
28 | TRAIN_START = 50000
29 | if DDQN:
30 |     FINAL_EXPLORATION = 0.01
31 |     TARGET_UPDATE = 30000
32 | else:
33 |     FINAL_EXPLORATION = 0.1
34 |     TARGET_UPDATE = 10000
35 |
36 |
37 | MEMORY_SIZE = 200000
38 | EXPLORATION = 1000000
39 | START_EXPLORATION = 1.
40 |
41 |
42 | INPUT = env.observation_space.shape
43 | OUTPUT = 3
44 | HEIGHT = 84
45 | WIDTH = 84
46 |
47 | # Hyperparameters
48 | LEARNING_RATE = 0.00025
49 |
50 | DISCOUNT = 0.99
51 | e = 1.
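# Note on the exploration schedule used further down in this file: e starts at
# START_EXPLORATION (1.0, fully random) and, once frame > TRAIN_START and while
# e > FINAL_EXPLORATION, is annealed linearly by
#     e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION
# on every frame, i.e. it reaches FINAL_EXPLORATION after EXPLORATION (1,000,000)
# trained frames, which matches the annealing schedule of the Nature DQN paper.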
52 | frame = 0 53 | model_path = "save/Breakout.ckpt" 54 | def cliped_error(x): 55 | return tf.where(tf.abs(x) < 1.0 , 0.5 * tf.square(x), tf.abs(x)-0.5) 56 | 57 | # input data 전처리 58 | 59 | 60 | def pre_proc(X): 61 | # 바로 전 frame과 비교하여 max를 취함으로써 flickering을 제거 62 | # x = np.maximum(X, X1) 63 | # 그레이 스케일링과 리사이징을 하여 데이터 크기 수정 64 | x = np.uint8(resize(rgb2gray(X), (84,84))*255) 65 | return x 66 | 67 | # DQN 모델 68 | def model(input1, f1, f2, f3, w1, w2): 69 | c1 = tf.nn.relu(tf.nn.conv2d(input1, f1, strides=[1, 4, 4, 1],data_format="NHWC", padding = "VALID")) 70 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1],data_format="NHWC", padding="VALID")) 71 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1,1,1,1],data_format="NHWC", padding="VALID")) 72 | 73 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]]) 74 | l2 = tf.nn.relu(tf.matmul(l1, w1)) 75 | 76 | pyx = tf.matmul(l2, w2) 77 | return pyx 78 | 79 | 80 | X = tf.placeholder("float", [None, 84, 84, 4]) 81 | 82 | # 메인 네트워크 Variable 83 | f1 = tf.get_variable("f1", shape=[8,8,4,32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 84 | f2 = tf.get_variable("f2", shape=[4,4,32,64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 85 | f3 = tf.get_variable("f3", shape=[3,3,64,64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 86 | 87 | w1 = tf.get_variable("w1", shape=[7*7*64,512], initializer=tf.contrib.layers.xavier_initializer()) 88 | w2 = tf.get_variable("w2", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 89 | 90 | py_x = model(X, f1, f2, f3 , w1, w2) 91 | 92 | # 타겟 네트워크 Variable 93 | f1_r = tf.get_variable("f1_r", shape=[8,8,4,32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 94 | f2_r = tf.get_variable("f2_r", shape=[4,4,32,64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 95 | f3_r = tf.get_variable("f3_r", shape=[3,3,64,64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 96 | 97 | w1_r = tf.get_variable("w1_r", shape=[7*7*64,512], initializer=tf.contrib.layers.xavier_initializer()) 98 | w2_r = tf.get_variable("w2_r", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 99 | 100 | py_x_r = model(X, f1_r, f2_r,f3_r, w1_r, w2_r) 101 | 102 | # 총 Reward를 저장해놓을 리스트 103 | rlist=[0] 104 | recent_rlist=[0] 105 | 106 | episode = 0 107 | epoch = 0 108 | epoch_score = deque() 109 | epoch_Q = deque() 110 | epoch_on = False 111 | average_Q = deque() 112 | average_reward = deque() 113 | no_life_game = False 114 | 115 | # Loss function 정의 116 | a= tf.placeholder(tf.int64, [None]) 117 | y = tf.placeholder(tf.float32, [None]) 118 | a_one_hot = tf.one_hot(a, OUTPUT, 1.0, 0.0) 119 | q_value = tf.reduce_sum(tf.multiply(py_x, a_one_hot), reduction_indices=1) 120 | error = tf.abs(y - q_value) 121 | 122 | quadratic_part = tf.clip_by_value(error, 0.0, 1.0) 123 | linear_part = error - quadratic_part 124 | loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part) 125 | 126 | optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE,momentum=0.95,epsilon= 0.01) 127 | train = optimizer.minimize(loss) 128 | 129 | saver = tf.train.Saver(max_to_keep=None) 130 | 131 | # 세션 정의 132 | with tf.Session() as sess: 133 | # 변수 초기화 134 | sess.run(tf.global_variables_initializer()) 135 | sess.run(w1_r.assign(w1)) 136 | sess.run(w2_r.assign(w2)) 137 | sess.run(f1_r.assign(f1)) 138 | sess.run(f2_r.assign(f2)) 139 | sess.run(f3_r.assign(f3)) 140 | 141 | # 에피소드 시작 142 | while np.mean(recent_rlist) < 500 : 143 | episode += 1 144 | 145 | # 가장 최근의 
100개 episode의 total reward 146 | if len(recent_rlist) > 100: 147 | del recent_rlist[0] 148 | 149 | history = np.zeros((84, 84, 5), dtype=np.uint8) 150 | rall = 0 151 | d = False 152 | ter = False 153 | count = 0 154 | s = env.reset() 155 | avg_max_Q = 0 156 | avg_loss = 0 157 | 158 | # 에피소드 시작할때 최대 30만큼 동안 아무 행동 하지않음 159 | # for _ in range(ran.randint(1, NO_STEP)): 160 | # s1, _, _, _ = env.step(0) 161 | 162 | # state의 초기화 163 | for i in range(HISTORY_STEP): 164 | history[:, :, i] = pre_proc(s) 165 | 166 | # 에피소드가 끝나기 전까지 반복 167 | while not d : 168 | # env.render() 169 | # 최근 4개의 프레임을 현재 프레임으로 바꿔줌 170 | 171 | frame +=1 172 | count+=1 173 | 174 | if e > FINAL_EXPLORATION and frame > TRAIN_START: 175 | e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION 176 | 177 | # 현재 state로 Q값을 계산 178 | Q = sess.run(py_x, feed_dict = {X : np.reshape(np.float32(history/255.), (1, 84, 84, 5))[:, :, :, 0:4]}) 179 | average_Q.append(np.max(Q)) 180 | avg_max_Q += np.max(Q) 181 | 182 | if e > np.random.rand(1): 183 | action = np.random.randint(OUTPUT) 184 | else: 185 | action = np.argmax(Q) 186 | 187 | if action == 0: 188 | real_a = 1 189 | elif action == 1: 190 | real_a = 4 191 | else: 192 | real_a = 5 193 | 194 | 195 | # 결정된 action으로 Environment에 입력 196 | s1, r, d, l = env.step(real_a) 197 | ter = d 198 | reward= np.clip(r, -1,1) 199 | 200 | 201 | # next state를 history에 저장 202 | history[:,:, 4] = pre_proc(s1) 203 | 204 | # 저장된 state를 Experience Replay memory에 저장 205 | REPLAY_MEMORY.append((np.copy(history[:,:,:]), action ,reward, ter)) 206 | history[:,:,:4] = history[:,:,1:] 207 | 208 | # 저장된 Frame이 1백만개 이상 넘어가면 맨 앞 Replay부터 삭제 209 | if len(REPLAY_MEMORY) > MEMORY_SIZE: 210 | REPLAY_MEMORY.popleft() 211 | # 총 reward 합 212 | rall += r 213 | 214 | # 5만 frame 이상부터 4개의 Frame마다 학습 215 | if frame > TRAIN_START : 216 | s_stack = deque() 217 | a_stack = deque() 218 | r_stack = deque() 219 | s1_stack = deque() 220 | d_stack = deque() 221 | y_stack = deque() 222 | 223 | sample = ran.sample(REPLAY_MEMORY, MINIBATCH) 224 | 225 | for s_r, a_r, r_r, d_r in sample: 226 | s_stack.append(s_r[:,:,:4]) 227 | a_stack.append(a_r) 228 | r_stack.append(r_r) 229 | s1_stack.append(s_r[:,:,1:]) 230 | d_stack.append(d_r) 231 | 232 | d_stack = np.array(d_stack) + 0 233 | 234 | Q1 = sess.run(py_x_r, feed_dict={X: np.float32(np.array(s1_stack) / 255.)}) 235 | 236 | y_stack = r_stack + (1 - d_stack) * DISCOUNT * np.max(Q1, axis=1) 237 | 238 | # 업데이트 된 Q값으로 main네트워크를 학습 239 | sess.run(train, feed_dict={X: np.float32(np.array(s_stack) / 255.), y: y_stack, a: a_stack}) 240 | 241 | # 3만개의 Frame마다 타겟 네트워크 업데이트 242 | if frame % TARGET_UPDATE == 0 : 243 | sess.run(w1_r.assign(w1)) 244 | sess.run(w2_r.assign(w2)) 245 | sess.run(f1_r.assign(f1)) 246 | sess.run(f2_r.assign(f2)) 247 | sess.run(f3_r.assign(f3)) 248 | 249 | # epoch(50000 Trained frame) 마다 plot 250 | if (frame - TRAIN_START) % 50000 == 0: 251 | epoch_on = True 252 | 253 | if epoch_on: 254 | plt.clf() 255 | epoch += 1 256 | epoch_score.append(np.mean(average_reward)) 257 | epoch_Q.append(np.mean(average_Q)) 258 | 259 | plt.subplot(211) 260 | plt.axis([0, epoch, 0, np.max(epoch_Q)*6/5]) 261 | plt.xlabel('Training Epochs') 262 | plt.ylabel('Average Action Value(Q)') 263 | plt.plot(epoch_Q) 264 | 265 | plt.subplot(212) 266 | plt.axis([0, epoch , 0, np.max(epoch_score)*6/5]) 267 | plt.xlabel('Training Epochs') 268 | plt.ylabel('Average Reward per Episode') 269 | plt.plot(epoch_score, "r") 270 | 271 | epoch_on = False 272 | average_reward = deque() 273 | average_Q = deque() 274 | 
plt.pause(0.05) 275 | plt.savefig("graph/{} epoch".format(epoch-1)) 276 | 277 | save_path = saver.save(sess, model_path, global_step=(epoch-1)) 278 | print("Model(episode :",episode, ") saved in file: ", save_path , " Now time : " ,datetime.datetime.now()) 279 | 280 | 281 | 282 | # 총 reward의 합을 list에 저장 283 | recent_rlist.append(rall) 284 | rlist.append(rall) 285 | average_reward.append(rall) 286 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | Avg_Max_Q:{5:2.5f} | " 287 | "Recent reward:{6:.5f} ".format(episode,frame, count, rall, e, avg_max_Q/float(count),np.mean(recent_rlist))) -------------------------------------------------------------------------------- /Breakout/Breakout_DQN_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import gym 5 | 6 | import numpy as np 7 | import random as ran 8 | import datetime 9 | import matplotlib.pyplot as plt 10 | 11 | from collections import deque 12 | from skimage.transform import resize 13 | from skimage.color import rgb2gray 14 | 15 | plt.ion() 16 | # DQN paper setting(frameskip = 4, repeat_action_probability = 0) 17 | # {}Deterministic : frameskip = 4 18 | # {}-v4 : repeat_action_probability 19 | env = gym.make('BreakoutDeterministic-v4') 20 | 21 | # 하이퍼 파라미터 22 | MINIBATCH_SIZE = 32 23 | HISTORY_SIZE = 4 24 | TRAIN_START = 1000 25 | FINAL_EXPLORATION = 0.1 26 | TARGET_UPDATE = 10000 27 | MEMORY_SIZE = 200000 28 | EXPLORATION = 1000000 29 | START_EXPLORATION = 1. 30 | INPUT = env.observation_space.shape 31 | OUTPUT = env.action_space.n 32 | HEIGHT = 84 33 | WIDTH = 84 34 | LEARNING_RATE = 0.00025 35 | DISCOUNT = 0.99 36 | EPSILON = 0.01 37 | MOMENTUM = 0.95 38 | 39 | model_path = "save/Breakout.ckpt" 40 | 41 | 42 | def pre_proc(X): 43 | '''입력데이터 전처리. 44 | 45 | Args: 46 | X(np.array): 받아온 이미지를 그레이 스케일링 후 84X84로 크기변경 47 | 그리고 정수값으로 저장하기위해(메모리 효율 높이기 위해) 255를 곱함 48 | 49 | Returns: 50 | np.array: 변경된 이미지 51 | ''' 52 | # 바로 전 frame과 비교하여 max를 취함으로써 flickering을 제거 53 | # x = np.maximum(X, X1) 54 | # 그레이 스케일링과 리사이징을 하여 데이터 크기 수정 55 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255) 56 | return x 57 | 58 | 59 | def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"): 60 | '''타겟네트워크에 메인네트워크의 Weight값을 복사. 61 | 62 | Args: 63 | dest_scope_name="target"(DQN): 'target'이라는 이름을 가진 객체를 가져옴 64 | src_scope_name="main"(DQN): 'main'이라는 이름을 가진 객체를 가져옴 65 | 66 | Returns: 67 | list: main의 trainable한 값들이 target의 값으로 복사된 값 68 | ''' 69 | op_holder = [] 70 | 71 | src_vars = tf.get_collection( 72 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name) 73 | dest_vars = tf.get_collection( 74 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name) 75 | 76 | for src_var, dest_var in zip(src_vars, dest_vars): 77 | op_holder.append(dest_var.assign(src_var.value())) 78 | 79 | return op_holder 80 | 81 | 82 | def get_init_state(history, s): 83 | '''에피소드 시작 State를 초기화. 
84 | 85 | Args: 86 | history(np.array): 5개의 프레임이 저장될 array 87 | s(list): 초기화된 이미지 88 | 89 | Note: 90 | history[:,:,:3]에 모두 초기화된 이미지(s)를 넣어줌 91 | ''' 92 | for i in range(HISTORY_SIZE): 93 | history[:, :, i] = pre_proc(s) 94 | 95 | 96 | def find_max_lifes(env): 97 | env.reset() 98 | _, _, _, info = env.step(0) 99 | return info['ale.lives'] 100 | 101 | 102 | def check_live(life, cur_life): 103 | if life > cur_life: 104 | return True 105 | else: 106 | return False 107 | 108 | 109 | def train_minibatch(mainDQN, targetDQN, mini_batch): 110 | '''미니배치로 가져온 sample데이터로 메인네트워크 학습 111 | 112 | Args: 113 | mainDQN(object): 메인 네트워크 114 | targetDQN(object): 타겟 네트워크 115 | minibatch: replay_memory에서 MINIBATCH 개수만큼 랜덤 sampling 해온 값 116 | 117 | Note: 118 | replay_memory에서 꺼내온 값으로 메인 네트워크를 학습 119 | ''' 120 | mini_batch = np.array(mini_batch).transpose() 121 | 122 | history = np.stack(mini_batch[0], axis=0) 123 | 124 | states = np.float32(history[:, :, :, :4]) / 255. 125 | actions = list(mini_batch[1]) 126 | rewards = list(mini_batch[2]) 127 | next_states = np.float32(history[:, :, :, 1:]) / 255. 128 | dones = mini_batch[3] 129 | 130 | # bool to binary 131 | dones = dones.astype(int) 132 | 133 | Q1 = targetDQN.get_q(next_states) 134 | 135 | y = rewards + (1 - dones) * DISCOUNT * np.max(Q1, axis=1) 136 | 137 | # 업데이트 된 Q값으로 main네트워크를 학습 138 | mainDQN.sess.run(mainDQN.train, feed_dict={mainDQN.X: states, mainDQN.Y: y, 139 | mainDQN.a: actions}) 140 | 141 | 142 | # 데이터 플롯 143 | def plot_data(epoch, epoch_score, average_reward, epoch_Q, average_Q, mainDQN): 144 | plt.clf() 145 | epoch_score.append(np.mean(average_reward)) 146 | epoch_Q.append(np.mean(average_Q)) 147 | 148 | plt.subplot(211) 149 | plt.axis([0, epoch, 0, np.max(epoch_Q) * 6 / 5]) 150 | plt.xlabel('Training Epochs') 151 | plt.ylabel('Average Action Value(Q)') 152 | plt.plot(epoch_Q) 153 | 154 | plt.subplot(212) 155 | plt.axis([0, epoch, 0, np.max(epoch_score) * 6 / 5]) 156 | plt.xlabel('Training Epochs') 157 | plt.ylabel('Average Reward per Episode') 158 | plt.plot(epoch_score, "r") 159 | 160 | plt.pause(0.05) 161 | plt.savefig("graph/{} epoch".format(epoch - 1)) 162 | 163 | save_path = mainDQN.saver.save(mainDQN.sess, model_path, global_step=(epoch - 1)) 164 | print("Model(epoch :", epoch, ") saved in file: ", save_path, " Now time : ", datetime.datetime.now()) 165 | 166 | 167 | # DQN 168 | class DQNAgent: 169 | def __init__(self, sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main'): 170 | self.sess = sess 171 | self.height = HEIGHT 172 | self.width = WIDTH 173 | self.history_size = HISTORY_SIZE 174 | self.output = OUTPUT 175 | self.name = NAME 176 | 177 | self.build_network() 178 | 179 | def build_network(self): 180 | with tf.variable_scope(self.name): 181 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size]) 182 | self.Y = tf.placeholder('float', [None]) 183 | self.a = tf.placeholder('int64', [None]) 184 | 185 | f1 = tf.get_variable("f1", shape=[8, 8, 4, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 186 | f2 = tf.get_variable("f2", shape=[4, 4, 32, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 187 | f3 = tf.get_variable("f3", shape=[3, 3, 64, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 188 | w1 = tf.get_variable("w1", shape=[7 * 7 * 64, 512], initializer=tf.contrib.layers.xavier_initializer()) 189 | w2 = tf.get_variable("w2", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 190 | 191 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, 
strides=[1, 4, 4, 1], padding="VALID")) 192 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1], padding="VALID")) 193 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 1, 1, 1], padding='VALID')) 194 | 195 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]]) 196 | l2 = tf.nn.relu(tf.matmul(l1, w1)) 197 | 198 | self.Q_pre = tf.matmul(l2, w2) 199 | 200 | a_one_hot = tf.one_hot(self.a, self.output, 1.0, 0.0) 201 | q_val = tf.reduce_sum(tf.multiply(self.Q_pre, a_one_hot), reduction_indices=1) 202 | 203 | # huber loss 204 | self.loss = tf.losses.huber_loss(self.Y, q_val) 205 | 206 | optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=EPSILON) 207 | self.train = optimizer.minimize(self.loss) 208 | 209 | self.saver = tf.train.Saver(max_to_keep=None) 210 | 211 | def get_q(self, history): 212 | return self.sess.run(self.Q_pre, feed_dict={self.X: np.reshape(history, 213 | [-1, 84, 84, 4])}) 214 | 215 | def get_action(self, q, e): 216 | if e > np.random.rand(1): 217 | action = np.random.randint(self.output) 218 | else: 219 | action = np.argmax(q) 220 | return action 221 | 222 | 223 | def main(): 224 | config = tf.ConfigProto() 225 | config.gpu_options.allow_growth = True 226 | with tf.Session(config=config) as sess: 227 | mainDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main') 228 | targetDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='target') 229 | 230 | sess.run(tf.global_variables_initializer()) 231 | 232 | # initial copy q_net -> target_net 233 | copy_ops = get_copy_var_ops(dest_scope_name="target", 234 | src_scope_name="main") 235 | sess.run(copy_ops) 236 | 237 | recent_rlist = deque(maxlen=100) 238 | e = 1. 239 | episode, epoch, frame = 0, 0, 0 240 | 241 | epoch_score, epoch_Q = deque(), deque() 242 | average_Q, average_reward = deque(), deque() 243 | 244 | epoch_on = False 245 | 246 | replay_memory = deque(maxlen=MEMORY_SIZE) 247 | 248 | max_life = find_max_lifes(env) 249 | # Train agent during 200 epoch 250 | while epoch <= 200: 251 | episode += 1 252 | 253 | history = np.zeros([84, 84, 5], dtype=np.uint8) 254 | rall, count = 0, 0 255 | d = False 256 | s = env.reset() 257 | life = max_life 258 | get_init_state(history, s) 259 | 260 | while not d: 261 | # env.render() 262 | 263 | frame += 1 264 | count += 1 265 | 266 | # e-greedy 267 | if e > FINAL_EXPLORATION and frame > TRAIN_START: 268 | e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION 269 | 270 | # 히스토리의 0~4까지 부분으로 Q값 예측 271 | Q = mainDQN.get_q(np.float32(history[:, :, :4]) / 255.) 272 | average_Q.append(np.max(Q)) 273 | 274 | # 액션 선택 275 | action = mainDQN.get_action(Q, e) 276 | 277 | # s1 : next frame / r : reward / d : done(terminal) / l : info(lives) 278 | s1, r, d, i = env.step(action) 279 | ter = check_live(life, i['ale.lives']) 280 | reward = np.clip(r, -1, 1) 281 | 282 | # 새로운 프레임을 히스토리 마지막에 넣어줌 283 | history[:, :, 4] = pre_proc(s1) 284 | 285 | # 메모리 저장 효율을 높이기 위해 5개의 프레임을 가진 히스토리를 저장 286 | # state와 next_state는 3개의 데이터가 겹침을 이용. 
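# (In other words: each replay entry appended below is a single (84, 84, 5) uint8 array
#  whose channels 0-3 form the state and channels 1-4 form the next state; because the
#  two share three frames, they are written to memory once instead of being stored as
#  separate state / next_state arrays.)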
287 | replay_memory.append((np.copy(history[:, :, :]), action, reward, ter)) 288 | history[:, :, :4] = history[:, :, 1:] 289 | 290 | rall += r 291 | 292 | if frame > TRAIN_START: 293 | # 프레임 스킵때마다 학습 294 | minibatch = ran.sample(replay_memory, MINIBATCH_SIZE) 295 | train_minibatch(mainDQN, targetDQN, minibatch) 296 | 297 | # 1만 프레임일때마다 target_net 업데이트 298 | if frame % TARGET_UPDATE == 0: 299 | copy_ops = get_copy_var_ops(dest_scope_name="target", 300 | src_scope_name="main") 301 | sess.run(copy_ops) 302 | 303 | # 1 epoch(trained 50000 frame)마다 plot 304 | if (frame - TRAIN_START) % 50000 == 0: 305 | epoch_on = True 306 | 307 | recent_rlist.append(rall) 308 | 309 | average_reward.append(rall) 310 | 311 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | " 312 | "Avg_Max_Q:{5:2.5f} | Recent reward:{6:.5f} ".format(episode, frame, count, rall, e, 313 | np.mean(average_Q), 314 | np.mean(recent_rlist))) 315 | 316 | if epoch_on: 317 | epoch += 1 318 | plot_data(epoch, epoch_score, average_reward, epoch_Q, average_Q, mainDQN) 319 | epoch_on = False 320 | average_reward = deque() 321 | average_Q = deque() 322 | 323 | 324 | if __name__ == "__main__": 325 | main() 326 | -------------------------------------------------------------------------------- /Breakout/Breakout_PolicyGradient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | from skimage.transform import resize 7 | from skimage.color import rgb2gray 8 | import copy 9 | 10 | env = gym.make('Breakout-v3') 11 | 12 | # 하이퍼 파라미터 13 | LEARNING_RATE = 0.00025 14 | INPUT = env.observation_space.shape 15 | OUTPUT = 3 16 | DISCOUNT = 0.99 17 | HEIGHT = 84 18 | WIDTH = 84 19 | HISTORY_SIZE = 4 20 | 21 | model_path = 'save/breakout-pg.ckpt' 22 | 23 | def pre_proc(X): 24 | '''입력데이터 전처리. 25 | 26 | Args: 27 | X(np.array): 받아온 이미지를 그레이 스케일링 후 84X84로 크기변경 28 | 그리고 정수값으로 저장하기위해(메모리 효율 높이기 위해) 255를 곱함 29 | 30 | Returns: 31 | np.array: 변경된 이미지 32 | ''' 33 | # 바로 전 frame과 비교하여 max를 취함으로써 flickering을 제거 34 | # x = np.maximum(X, X1) 35 | # 그레이 스케일링과 리사이징을 하여 데이터 크기 수정 36 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255) 37 | 38 | return x 39 | 40 | def get_init_state(history, s): 41 | '''에피소드 시작 State를 초기화. 
42 | 43 | Args: 44 | history(np.array): 5개의 프레임이 저장될 array 45 | s(list): 초기화된 이미지 46 | 47 | Note: 48 | history[:,:,:3]에 모두 초기화된 이미지(s)를 넣어줌 49 | ''' 50 | for i in range(HISTORY_SIZE): 51 | history[:, :, i] = pre_proc(s) 52 | 53 | 54 | def get_game_type(count, l, no_life_game, start_live): 55 | '''라이프가 있는 게임인지 판별 56 | 57 | Args: 58 | count(int): 에피소드 시작 후 첫 프레임인지 확인하기 위한 arg 59 | l(dict): 라이프 값들이 저장되어있는 dict ex) l['ale.lives'] 60 | no_life_game(bool): 라이프가 있는 게임일 경우, bool 값을 반환해주기 위한 arg 61 | start_live(int): 라이프가 있는 경우 라이프값을 초기화 하기 위한 arg 62 | 63 | Returns: 64 | list: 65 | no_life_game(bool): 라이프가 없는 게임이면 True, 있으면 False 66 | start_live(int): 라이프가 있는 게임이면 초기화된 라이프 67 | ''' 68 | if count == 1: 69 | start_live = l['ale.lives'] 70 | # 시작 라이프가 0일 경우, 라이프 없는 게임 71 | if start_live == 0: 72 | no_life_game = True 73 | else: 74 | no_life_game = False 75 | return [no_life_game, start_live] 76 | 77 | 78 | def get_terminal(start_live, l, reward, no_life_game, ter): 79 | '''목숨이 줄어들거나, negative reward를 받았을 때, terminal 처리 80 | 81 | Args: 82 | start_live(int): 라이프가 있는 게임일 경우, 현재 라이프 수 83 | l(dict): 다음 상태에서 라이프가 줄었는지 확인하기 위한 다음 frame의 라이프 info 84 | no_life_game(bool): 라이프가 없는 게임일 경우, negative reward를 받으면 terminal 처리를 해주기 위한 게임 타입 85 | ter(bool): terminal 처리를 저장할 arg 86 | 87 | Returns: 88 | list: 89 | ter(bool): terminal 상태 90 | start_live(int): 줄어든 라이프로 업데이트된 값 91 | ''' 92 | if no_life_game: 93 | # 목숨이 없는 게임일 경우 Terminal 처리 94 | if reward < 0: 95 | ter = True 96 | else: 97 | # 목숨 있는 게임일 경우 Terminal 처리 98 | if start_live > l['ale.lives']: 99 | ter = True 100 | start_live = l['ale.lives'] 101 | 102 | return [ter, start_live] 103 | 104 | def discount_rewards(r): 105 | '''Discounted reward를 구하기 위한 함수 106 | 107 | Args: 108 | r(np.array): reward 값이 저장된 array 109 | 110 | Returns: 111 | discounted_r(np.array): Discounted 된 reward가 저장된 array 112 | ''' 113 | discounted_r = np.zeros_like(r, dtype=np.float32) 114 | running_add = 0 115 | for t in reversed(range(len(r))): 116 | if r[t] < 0: # life가 줄었을때 마다 return 초기화 117 | running_add = 0 118 | running_add = running_add * DISCOUNT + r[t] 119 | discounted_r[t] = running_add 120 | 121 | discounted_r = discounted_r - discounted_r.mean() 122 | discounted_r = discounted_r / discounted_r.std() 123 | 124 | return discounted_r 125 | 126 | 127 | def train_episodic(PGagent, x, y, adv): 128 | '''에피소드당 학습을 하기위한 함수 129 | 130 | Args: 131 | PGagent(PolicyGradient): 학습될 네트워크 132 | x(np.array): State가 저장되어있는 array 133 | y(np.array): Action(one_hot)이 저장되어있는 array 134 | adv(np.array) : Discounted reward가 저장되어있는 array 135 | 136 | Returns: 137 | l(float): 네트워크에 의한 loss 138 | ''' 139 | 140 | l, _ = PGagent.sess.run([PGagent.loss, PGagent.train], feed_dict={PGagent.X: x, 141 | PGagent.Y: y, 142 | PGagent.adv: adv}) 143 | return l 144 | 145 | class PolicyGradient: 146 | def __init__(self, sess, input_size, output_size , name = 'main'): 147 | self.sess = sess 148 | self.input_size = input_size 149 | self.output_size = output_size 150 | self.height = HEIGHT 151 | self.width = WIDTH 152 | self.history_size = HISTORY_SIZE 153 | self.name = name 154 | self.build_network() 155 | 156 | def build_network(self): 157 | with tf.variable_scope(self.name): 158 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size]) 159 | self.Y = tf.placeholder('float', [None, self.output_size]) 160 | self.adv = tf.placeholder('float') 161 | 162 | f1 = tf.get_variable("f1", shape=[1, 1, 4, 1], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 163 | f2 = tf.get_variable("f2", 
shape=[8, 8, 1, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 164 | f3 = tf.get_variable("f3", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 165 | w1 = tf.get_variable("w1", shape=[9 * 9 * 32, 256], initializer=tf.contrib.layers.xavier_initializer()) 166 | w2 = tf.get_variable("w2", shape=[256, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 167 | 168 | # 1x1 conv layer 169 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 1, 1, 1], padding="VALID")) 170 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 4, 4, 1], padding="VALID")) 171 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 2, 2, 1], padding="VALID")) 172 | 173 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]]) 174 | l2 = tf.nn.relu(tf.matmul(l1, w1)) 175 | self.a_pre = tf.nn.softmax(tf.matmul(l2, w2)) 176 | 177 | self.log_p = tf.log(tf.clip_by_value(self.a_pre, 1e-10, 1.)) * self.Y 178 | self.log_lik = -self.log_p * self.adv 179 | self.loss = tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1)) 180 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 181 | self.saver = tf.train.Saver() 182 | 183 | def get_action(self, state, max_prob): 184 | action_p = self.sess.run(self.a_pre, feed_dict={self.X: np.reshape(np.float32(state/255.), 185 | [-1,HEIGHT,WIDTH,HISTORY_SIZE])}) 186 | # 각 액션의 확률로 액션을 결정 187 | max_prob.append(np.max(action_p)) 188 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 189 | 190 | return action 191 | # config = tf.ConfigProto(device_count ={'GPU' : 0}) 192 | def main(): 193 | with tf.Session() as sess: 194 | PGagent = PolicyGradient(sess, INPUT, OUTPUT) 195 | 196 | PGagent.sess.run(tf.global_variables_initializer()) 197 | 198 | episode = 0 199 | recent_rlist = deque(maxlen=100) 200 | recent_rlist.append(0) 201 | 202 | no_life_game = False 203 | # 최근 100개의 점수가 195점 넘을 때까지 학습 204 | while np.mean(recent_rlist) <= 195: 205 | episode += 1 206 | 207 | state_memory = deque() 208 | action_memory = deque() 209 | reward_memory = deque() 210 | 211 | history = np.zeros([84, 84, HISTORY_SIZE+1], dtype=np.uint8) 212 | rall, count = 0, 0 213 | done = False 214 | 215 | s = env.reset() 216 | max_prob = deque() 217 | get_init_state(history, s) 218 | start_lives = 0 219 | while not done: 220 | #env.render() 221 | count += 1 222 | # 액션 선택 223 | action = PGagent.get_action(history[:,:,:HISTORY_SIZE], max_prob) 224 | 225 | # action을 one_hot으로 표현 226 | y = np.zeros(OUTPUT) 227 | y[action] = 1 228 | 229 | # 학습속도 개선을 위한 액션수 줄임 230 | if action == 0: 231 | real_a = 1 232 | elif action == 1: 233 | real_a = 4 234 | else: 235 | real_a = 5 236 | 237 | s1, reward, done, l = env.step(real_a) 238 | 239 | ter = done 240 | rall += reward 241 | reward = np.clip(reward, -1, 1) 242 | 243 | # 라이프가 있는 게임인지 아닌지 판별 244 | no_life_game, start_lives = get_game_type(count, l, no_life_game, start_lives) 245 | 246 | # 라이프가 줄어들거나 negative 리워드를 받았을 때 terminal 처리를 해줌 247 | ter, start_lives = get_terminal(start_lives, l, reward, no_life_game, ter) 248 | 249 | # 죽었을때 학습을 하기위한 negative reward 250 | if ter: 251 | reward = -1 252 | 253 | state_memory.append(np.copy(np.float32(history[:,:,:HISTORY_SIZE]/255.))) 254 | action_memory.append(np.copy(y)) 255 | reward_memory.append(np.copy(reward)) 256 | 257 | # 새로운 프레임을 히스토리 마지막에 넣어줌 258 | history[:, :, HISTORY_SIZE] = pre_proc(s1) 259 | history[:, :, :HISTORY_SIZE] = history[:, :, 1:] 260 | 261 | # 에피소드가 끝났을때 학습 262 | if done: 263 | rewards = discount_rewards(np.vstack(reward_memory)) 264 | 265 | l = 
train_episodic(PGagent, np.stack(state_memory, axis=0), 266 | np.stack(action_memory, axis =0), rewards) 267 | 268 | recent_rlist.append(rall) 269 | 270 | print("[Episode {0:6d}] Step:{4:6d} Reward: {1:4f} Loss: {2:5.5f} Recent Reward: {3:4f} Max Prob: {5:5.5f}". 271 | format(episode, rall, l, np.mean(recent_rlist), count, np.mean(max_prob))) 272 | 273 | if episode % 10 == 0: 274 | PGagent.saver.save(PGagent.sess, model_path, global_step= episode) 275 | 276 | 277 | if __name__ == "__main__": 278 | main() 279 | 280 | 281 | 282 | 283 | 284 | -------------------------------------------------------------------------------- /Breakout/Play_DQN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 김성훈님 ( https://github.com/hunkim/ReinforcementZeroToAll/blob/master/07_3_dqn_2015_cartpole.py ) 3 | # 김태훈님 ( https://github.com/devsisters/DQN-tensorflow ) 4 | # 코드를 참조했습니다. 감사합니다! 5 | # 6 | import tensorflow as tf 7 | import gym 8 | 9 | import numpy as np 10 | import random as ran 11 | import datetime 12 | import matplotlib.pyplot as plt 13 | 14 | from collections import deque 15 | from skimage.transform import resize 16 | from skimage.color import rgb2gray 17 | 18 | plt.ion() 19 | # DQN paper setting(frameskip = 4, repeat_action_probability = 0) 20 | # {}Deterministic : frameskip = 4 21 | # {}-v4 : repeat_action_probability 22 | env = gym.make('BreakoutDeterministic-v4') 23 | 24 | # 하이퍼 파라미터 25 | MINIBATCH_SIZE = 32 26 | HISTORY_SIZE = 4 27 | TRAIN_START = 50000 28 | FINAL_EXPLORATION = 0.1 29 | TARGET_UPDATE = 10000 30 | MEMORY_SIZE = 400000 31 | EXPLORATION = 1000000 32 | START_EXPLORATION = 1. 33 | INPUT = env.observation_space.shape 34 | OUTPUT = env.action_space.n 35 | HEIGHT = 84 36 | WIDTH = 84 37 | LEARNING_RATE = 0.00025 38 | DISCOUNT = 0.99 39 | EPSILON = 0.01 40 | MOMENTUM = 0.95 41 | 42 | # 트레이닝된 모델 경로 43 | model_path = "save/Breakout.ckpt" 44 | 45 | 46 | def cliped_error(error): 47 | '''후버로스를 사용하여 error 클립. 48 | 49 | Args: 50 | error(tensor): 클립을 해야할 tensor 51 | 52 | Returns: 53 | tensor: -1 ~ 1 사이로 클립된 error 54 | ''' 55 | return tf.where(tf.abs(error) < 1.0, 0.5 * tf.square(error), tf.abs(error) - 0.5) 56 | 57 | 58 | def pre_proc(X): 59 | '''입력데이터 전처리. 60 | 61 | Args: 62 | X(np.array): 받아온 이미지를 그레이 스케일링 후 84X84로 크기변경 63 | 그리고 정수값으로 저장하기위해(메모리 효율 높이기 위해) 255를 곱함 64 | 65 | Returns: 66 | np.array: 변경된 이미지 67 | ''' 68 | # 바로 전 frame과 비교하여 max를 취함으로써 flickering을 제거 69 | # x = np.maximum(X, X1) 70 | # 그레이 스케일링과 리사이징을 하여 데이터 크기 수정 71 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255) 72 | return x 73 | 74 | 75 | def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"): 76 | '''타겟네트워크에 메인네트워크의 Weight값을 복사. 77 | 78 | Args: 79 | dest_scope_name="target"(DQN): 'target'이라는 이름을 가진 객체를 가져옴 80 | src_scope_name="main"(DQN): 'main'이라는 이름을 가진 객체를 가져옴 81 | 82 | Returns: 83 | list: main의 trainable한 값들이 target의 값으로 복사된 값 84 | ''' 85 | op_holder = [] 86 | 87 | src_vars = tf.get_collection( 88 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name) 89 | dest_vars = tf.get_collection( 90 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name) 91 | 92 | for src_var, dest_var in zip(src_vars, dest_vars): 93 | op_holder.append(dest_var.assign(src_var.value())) 94 | 95 | return op_holder 96 | 97 | 98 | def get_init_state(history, s): 99 | '''에피소드 시작 State를 초기화. 
100 | 101 | Args: 102 | history(np.array): 5개의 프레임이 저장될 array 103 | s(list): 초기화된 이미지 104 | 105 | Note: 106 | history[:,:,:3]에 모두 초기화된 이미지(s)를 넣어줌 107 | ''' 108 | for i in range(HISTORY_SIZE): 109 | history[:, :, i] = pre_proc(s) 110 | 111 | 112 | def get_game_type(count, l, no_life_game, start_live): 113 | '''라이프가 있는 게임인지 판별 114 | 115 | Args: 116 | count(int): 에피소드 시작 후 첫 프레임인지 확인하기 위한 arg 117 | l(dict): 라이프 값들이 저장되어있는 dict ex) l['ale.lives'] 118 | no_life_game(bool): 라이프가 있는 게임일 경우, bool 값을 반환해주기 위한 arg 119 | start_live(int): 라이프가 있는 경우 라이프값을 초기화 하기 위한 arg 120 | 121 | Returns: 122 | list: 123 | no_life_game(bool): 라이프가 없는 게임이면 True, 있으면 False 124 | start_live(int): 라이프가 있는 게임이면 초기화된 라이프 125 | ''' 126 | if count == 1: 127 | start_live = l['ale.lives'] 128 | # 시작 라이프가 0일 경우, 라이프 없는 게임 129 | if start_live == 0: 130 | no_life_game = True 131 | else: 132 | no_life_game = False 133 | return [no_life_game, start_live] 134 | 135 | 136 | def get_terminal(start_live, l, reward, no_life_game, ter): 137 | '''목숨이 줄어들거나, negative reward를 받았을 때, terminal 처리 138 | 139 | Args: 140 | start_live(int): 라이프가 있는 게임일 경우, 현재 라이프 수 141 | l(dict): 다음 상태에서 라이프가 줄었는지 확인하기 위한 다음 frame의 라이프 info 142 | no_life_game(bool): 라이프가 없는 게임일 경우, negative reward를 받으면 terminal 처리를 해주기 위한 게임 타입 143 | ter(bool): terminal 처리를 저장할 arg 144 | 145 | Returns: 146 | list: 147 | ter(bool): terminal 상태 148 | start_live(int): 줄어든 라이프로 업데이트된 값 149 | ''' 150 | if no_life_game: 151 | # 목숨이 없는 게임일 경우 Terminal 처리 152 | if reward < 0: 153 | ter = True 154 | else: 155 | # 목숨 있는 게임일 경우 Terminal 처리 156 | if start_live > l['ale.lives']: 157 | ter = True 158 | start_live = l['ale.lives'] 159 | 160 | return [ter, start_live] 161 | 162 | 163 | def train_minibatch(mainDQN, targetDQN, minibatch): 164 | '''미니배치로 가져온 sample데이터로 메인네트워크 학습 165 | 166 | Args: 167 | mainDQN(object): 메인 네트워크 168 | targetDQN(object): 타겟 네트워크 169 | minibatch: replay_memory에서 MINIBATCH 개수만큼 랜덤 sampling 해온 값 170 | 171 | Note: 172 | replay_memory에서 꺼내온 값으로 메인 네트워크를 학습 173 | ''' 174 | s_stack = [] 175 | a_stack = [] 176 | r_stack = [] 177 | s1_stack = [] 178 | d_stack = [] 179 | 180 | for s_r, a_r, r_r, d_r in minibatch: 181 | s_stack.append(s_r[:, :, :4]) 182 | a_stack.append(a_r) 183 | r_stack.append(r_r) 184 | s1_stack.append(s_r[:, :, 1:]) 185 | d_stack.append(d_r) 186 | 187 | # True, False 값을 1과 0으로 변환 188 | d_stack = np.array(d_stack) + 0 189 | 190 | Q1 = targetDQN.get_q(np.array(s1_stack)) 191 | 192 | y = r_stack + (1 - d_stack) * DISCOUNT * np.max(Q1, axis=1) 193 | 194 | # 업데이트 된 Q값으로 main네트워크를 학습 195 | mainDQN.sess.run(mainDQN.train, feed_dict={mainDQN.X: np.float32(np.array(s_stack) / 255.), mainDQN.Y: y, 196 | mainDQN.a: a_stack}) 197 | 198 | 199 | # 데이터 플롯 200 | def plot_data(epoch, epoch_score, average_reward, epoch_Q, average_Q, mainDQN): 201 | plt.clf() 202 | epoch_score.append(np.mean(average_reward)) 203 | epoch_Q.append(np.mean(average_Q)) 204 | 205 | plt.subplot(211) 206 | plt.axis([0, epoch, 0, np.max(epoch_Q) * 6 / 5]) 207 | plt.xlabel('Training Epochs') 208 | plt.ylabel('Average Action Value(Q)') 209 | plt.plot(epoch_Q) 210 | 211 | plt.subplot(212) 212 | plt.axis([0, epoch, 0, np.max(epoch_score) * 6 / 5]) 213 | plt.xlabel('Training Epochs') 214 | plt.ylabel('Average Reward per Episode') 215 | plt.plot(epoch_score, "r") 216 | 217 | plt.pause(0.05) 218 | plt.savefig("graph/{} epoch".format(epoch - 1)) 219 | 220 | save_path = mainDQN.saver.save(mainDQN.sess, model_path, global_step=(epoch - 1)) 221 | print("Model(epoch :", epoch, ") saved in file: ", save_path, " 
Now time : ", datetime.datetime.now()) 222 | 223 | 224 | # DQN 225 | class DQNAgent: 226 | def __init__(self, sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main'): 227 | self.sess = sess 228 | self.height = HEIGHT 229 | self.width = WIDTH 230 | self.history_size = HISTORY_SIZE 231 | self.output = OUTPUT 232 | self.name = NAME 233 | 234 | self.build_network() 235 | 236 | def build_network(self): 237 | with tf.variable_scope(self.name): 238 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size]) 239 | self.Y = tf.placeholder('float', [None]) 240 | self.a = tf.placeholder('int64', [None]) 241 | 242 | f1 = tf.get_variable("f1", shape=[8, 8, 4, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 243 | f2 = tf.get_variable("f2", shape=[4, 4, 32, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 244 | f3 = tf.get_variable("f3", shape=[3, 3, 64, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 245 | w1 = tf.get_variable("w1", shape=[7 * 7 * 64, 512], initializer=tf.contrib.layers.xavier_initializer()) 246 | w2 = tf.get_variable("w2", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 247 | 248 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 4, 4, 1], padding="VALID")) 249 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1], padding="VALID")) 250 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 1, 1, 1], padding='VALID')) 251 | 252 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]]) 253 | l2 = tf.nn.relu(tf.matmul(l1, w1)) 254 | 255 | self.Q_pre = tf.matmul(l2, w2) 256 | 257 | a_one_hot = tf.one_hot(self.a, self.output, 1.0, 0.0) 258 | q_val = tf.reduce_sum(tf.multiply(self.Q_pre, a_one_hot), reduction_indices=1) 259 | 260 | # error를 -1~1 사이로 클립 261 | error = cliped_error(self.Y - q_val) 262 | 263 | self.loss = tf.reduce_mean(error) 264 | 265 | optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=EPSILON) 266 | self.train = optimizer.minimize(self.loss) 267 | 268 | self.saver = tf.train.Saver(max_to_keep=None) 269 | 270 | def get_q(self, history): 271 | return self.sess.run(self.Q_pre, feed_dict={self.X: np.reshape(np.float32(history / 255.), 272 | [-1, 84, 84, 4])}) 273 | 274 | def get_action(self, q, e): 275 | if e > np.random.rand(1): 276 | action = np.random.randint(self.output) 277 | else: 278 | action = np.argmax(q) 279 | return action 280 | 281 | 282 | def main(): 283 | with tf.Session() as sess: 284 | mainDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main') 285 | targetDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='target') 286 | 287 | mainDQN.saver.restore(sess, model_path) 288 | recent_rlist = deque(maxlen=100) 289 | e = 1. 
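# Note: this script restores a trained checkpoint and only plays it back; inside the
# loop below e is overwritten with a fixed evaluation value of 0.05 on every step, so
# the initial value above is never annealed.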
290 | episode, epoch, frame = 0, 0, 0 291 | 292 | average_Q, average_reward = deque(), deque() 293 | 294 | # Train agent during 200 epoch 295 | while epoch <= 200: 296 | episode += 1 297 | 298 | history = np.zeros([84, 84, 5], dtype=np.uint8) 299 | rall, count = 0, 0 300 | d = False 301 | s = env.reset() 302 | 303 | get_init_state(history, s) 304 | while not d: 305 | # env.render() 306 | 307 | frame += 1 308 | count += 1 309 | 310 | # e-greedy(for test) 311 | e = 0.05 312 | 313 | # 히스토리의 0~4까지 부분으로 Q값 예측 314 | Q = mainDQN.get_q(history[:, :, :4]) 315 | average_Q.append(np.max(Q)) 316 | 317 | # 액션 선택 318 | action = mainDQN.get_action(Q, e) 319 | 320 | # s1 : next frame / r : reward / d : done(terminal) / l : info(lives) 321 | s1, r, d, l = env.step(action) 322 | 323 | # 새로운 프레임을 히스토리 마지막에 넣어줌 324 | history[:, :, 4] = pre_proc(s1) 325 | 326 | history[:, :, :4] = history[:, :, 1:] 327 | 328 | rall += r 329 | 330 | recent_rlist.append(rall) 331 | 332 | average_reward.append(rall) 333 | 334 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | " 335 | "Avg_Max_Q:{5:2.5f} | Recent reward:{6:.5f} ".format(episode, frame, count, rall, e, 336 | np.mean(average_Q), 337 | np.mean(recent_rlist))) 338 | 339 | 340 | if __name__ == "__main__": 341 | main() 342 | 343 | 344 | 345 | 346 | 347 | -------------------------------------------------------------------------------- /Breakout/breakout_dqn_pytorch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import torch 4 | import pylab 5 | import random 6 | import numpy as np 7 | from collections import deque 8 | from datetime import datetime 9 | from copy import deepcopy 10 | from skimage.transform import resize 11 | from skimage.color import rgb2gray 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | 17 | 18 | def find_max_lifes(env): 19 | env.reset() 20 | _, _, _, info = env.step(0) 21 | return info['ale.lives'] 22 | 23 | 24 | def check_live(life, cur_life): 25 | if life > cur_life: 26 | return True 27 | else: 28 | return False 29 | 30 | 31 | def pre_proc(X): 32 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255) 33 | return x 34 | 35 | 36 | def get_init_state(history, s): 37 | for i in range(HISTORY_SIZE): 38 | history[i, :, :] = pre_proc(s) 39 | 40 | 41 | class Flatten(nn.Module): 42 | def forward(self, input): 43 | return input.view(input.size(0), -1) 44 | 45 | 46 | # approximate Q function using Neural Network 47 | # state is input and Q Value of each action is output of network 48 | class DQN(nn.Module): 49 | def __init__(self, action_size): 50 | super(DQN, self).__init__() 51 | self.fc = nn.Sequential( 52 | nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4), 53 | nn.ReLU(), 54 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), 55 | nn.ReLU(), 56 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), 57 | nn.ReLU(), 58 | Flatten(), 59 | nn.Linear(7 * 7 * 64, 512), 60 | nn.ReLU(), 61 | nn.Linear(512, action_size) 62 | ) 63 | 64 | def forward(self, x): 65 | return self.fc(x) 66 | 67 | 68 | # DQN Agent for the Cartpole 69 | # it uses Neural Network to approximate q function 70 | # and replay memory & target q network 71 | class DQNAgent(): 72 | def __init__(self, action_size): 73 | # if you want to see Cartpole learning, then change to True 74 | self.render = False 75 | 
self.load_model = False 76 | 77 | # get size of action 78 | self.action_size = action_size 79 | 80 | # These are hyper parameters for the DQN 81 | self.discount_factor = 0.99 82 | self.learning_rate = 0.0001 83 | self.memory_size = 1000000 84 | self.epsilon = 1.0 85 | self.epsilon_min = 0.02 86 | self.explore_step = 1000000 87 | self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step 88 | self.batch_size = 32 89 | self.train_start = 100000 90 | self.update_target = 1000 91 | 92 | # create replay memory using deque 93 | self.memory = deque(maxlen=self.memory_size) 94 | 95 | # create main model and target model 96 | self.model = DQN(action_size) 97 | self.model.cuda() 98 | self.model.apply(self.weights_init) 99 | self.target_model = DQN(action_size) 100 | self.target_model.cuda() 101 | 102 | # self.optimizer = optim.RMSprop(params=self.model.parameters(),lr=self.learning_rate, eps=0.01, momentum=0.95) 103 | self.optimizer = optim.Adam(params=self.model.parameters(), lr=self.learning_rate) 104 | 105 | # initialize target model 106 | self.update_target_model() 107 | 108 | if self.load_model: 109 | self.model = torch.load('save_model/breakout_dqn') 110 | 111 | # weight xavier initialize 112 | def weights_init(self, m): 113 | classname = m.__class__.__name__ 114 | if classname.find('Linear') != -1: 115 | torch.nn.init.xavier_uniform(m.weight) 116 | print(m) 117 | elif classname.find('Conv') != -1: 118 | torch.nn.init.xavier_uniform(m.weight) 119 | print(m) 120 | 121 | # after some time interval update the target model to be same with model 122 | def update_target_model(self): 123 | self.target_model.load_state_dict(self.model.state_dict()) 124 | 125 | # get action from model using epsilon-greedy policy 126 | def get_action(self, state): 127 | if np.random.rand() <= self.epsilon: 128 | return random.randrange(self.action_size) 129 | else: 130 | state = torch.from_numpy(state).unsqueeze(0) 131 | state = Variable(state).float().cuda() 132 | action = self.model(state).data.cpu().max(1)[1] 133 | return int(action) 134 | 135 | # save sample to the replay memory 136 | def append_sample(self, history, action, reward, done): 137 | self.memory.append((history, action, reward, done)) 138 | 139 | def get_sample(self, frame): 140 | mini_batch = [] 141 | if frame >= self.memory_size: 142 | sample_range = self.memory_size 143 | else: 144 | sample_range = frame 145 | 146 | # history size 147 | sample_range -= (HISTORY_SIZE + 1) 148 | 149 | idx_sample = random.sample(range(sample_range), self.batch_size) 150 | for i in idx_sample: 151 | sample = [] 152 | for j in range(HISTORY_SIZE + 1): 153 | sample.append(self.memory[i + j]) 154 | 155 | sample = np.array(sample) 156 | mini_batch.append((np.stack(sample[:, 0], axis=0), sample[3, 1], sample[3, 2], sample[3, 3])) 157 | 158 | return mini_batch 159 | 160 | # pick samples randomly from replay memory (with batch_size) 161 | def train_model(self, frame): 162 | if self.epsilon > self.epsilon_min: 163 | self.epsilon -= self.epsilon_decay 164 | 165 | mini_batch = self.get_sample(frame) 166 | mini_batch = np.array(mini_batch).transpose() 167 | 168 | history = np.stack(mini_batch[0], axis=0) 169 | states = np.float32(history[:, :4, :, :]) / 255. 170 | actions = list(mini_batch[1]) 171 | rewards = list(mini_batch[2]) 172 | next_states = np.float32(history[:, 1:, :, :]) / 255. 
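# (history here has shape [batch, 5, 84, 84], channels-first as PyTorch's Conv2d expects;
#  states are channels 0-3 and next_states are channels 1-4 of the same 5-frame stack.)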
173 | dones = mini_batch[3] 174 | 175 | # bool to binary 176 | dones = dones.astype(int) 177 | 178 | # Q function of current state 179 | states = torch.Tensor(states) 180 | states = Variable(states).float().cuda() 181 | pred = self.model(states) 182 | 183 | # one-hot encoding 184 | a = torch.LongTensor(actions).view(-1, 1) 185 | 186 | one_hot_action = torch.FloatTensor(self.batch_size, self.action_size).zero_() 187 | one_hot_action.scatter_(1, a, 1) 188 | 189 | pred = torch.sum(pred.mul(Variable(one_hot_action).cuda()), dim=1) 190 | 191 | # Q function of next state 192 | next_states = torch.Tensor(next_states) 193 | next_states = Variable(next_states).float().cuda() 194 | next_pred = self.target_model(next_states).data.cpu() 195 | 196 | rewards = torch.FloatTensor(rewards) 197 | dones = torch.FloatTensor(dones) 198 | 199 | # Q Learning: get maximum Q value at s' from target model 200 | target = rewards + (1 - dones) * self.discount_factor * next_pred.max(1)[0] 201 | target = Variable(target).cuda() 202 | 203 | self.optimizer.zero_grad() 204 | 205 | # MSE Loss function 206 | loss = F.smooth_l1_loss(pred, target) 207 | loss.backward() 208 | 209 | # and train 210 | self.optimizer.step() 211 | 212 | 213 | if __name__ == "__main__": 214 | EPISODES = 500000 215 | HEIGHT = 84 216 | WIDTH = 84 217 | HISTORY_SIZE = 4 218 | 219 | env = gym.make('BreakoutDeterministic-v4') 220 | max_life = find_max_lifes(env) 221 | state_size = env.observation_space.shape 222 | # action_size = env.action_space.n 223 | action_size = 3 224 | scores, episodes = [], [] 225 | agent = DQNAgent(action_size) 226 | recent_reward = deque(maxlen=100) 227 | frame = 0 228 | memory_size = 0 229 | for e in range(EPISODES): 230 | done = False 231 | score = 0 232 | 233 | history = np.zeros([5, 84, 84], dtype=np.uint8) 234 | step = 0 235 | d = False 236 | state = env.reset() 237 | life = max_life 238 | 239 | get_init_state(history, state) 240 | 241 | while not done: 242 | step += 1 243 | frame += 1 244 | if agent.render: 245 | env.render() 246 | 247 | # get action for the current state and go one step in environment 248 | action = agent.get_action(np.float32(history[:4, :, :]) / 255.) 
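# The network outputs actions in {0, 1, 2} (action_size = 3); `action + 1` below maps
# them onto the ALE Breakout actions {1: FIRE, 2: RIGHT, 3: LEFT}, skipping 0 (NOOP)
# to shrink the action space and speed up learning.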
249 | 250 | next_state, reward, done, info = env.step(action + 1) 251 | 252 | pre_proc_next_state = pre_proc(next_state) 253 | history[4, :, :] = pre_proc_next_state 254 | ter = check_live(life, info['ale.lives']) 255 | 256 | life = info['ale.lives'] 257 | r = np.clip(reward, -1, 1) 258 | 259 | # save the sample to the replay memory 260 | agent.append_sample(deepcopy(pre_proc_next_state), action, r, ter) 261 | # every time step do the training 262 | if frame >= agent.train_start: 263 | agent.train_model(frame) 264 | if frame % agent.update_target == 0: 265 | agent.update_target_model() 266 | score += reward 267 | history[:4, :, :] = history[1:, :, :] 268 | 269 | if frame % 50000 == 0: 270 | print('now time : ', datetime.now()) 271 | scores.append(score) 272 | episodes.append(e) 273 | pylab.plot(episodes, scores, 'b') 274 | pylab.savefig("./save_graph/breakout_dqn.png") 275 | 276 | if done: 277 | recent_reward.append(score) 278 | # every episode, plot the play time 279 | print("episode:", e, " score:", score, " memory length:", 280 | len(agent.memory), " epsilon:", agent.epsilon, " steps:", step, 281 | " recent reward:", np.mean(recent_reward)) 282 | 283 | # if the mean of scores of last 10 episode is bigger than 400 284 | # stop training 285 | if np.mean(recent_reward) > 50: 286 | torch.save(agent.model, "./save_model/breakout_dqn") 287 | sys.exit() 288 | -------------------------------------------------------------------------------- /CartPole/CartPole_A2C_episodic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | 7 | env = gym.make('CartPole-v0') 8 | 9 | # 하이퍼 파라미터 10 | LEARNING_RATE = 0.005 11 | INPUT = env.observation_space.shape[0] 12 | OUTPUT = env.action_space.n 13 | DISCOUNT = 0.99 14 | 15 | 16 | def discount_rewards(r): 17 | '''Discounted reward를 구하기 위한 함수 18 | 19 | Args: 20 | r(np.array): reward 값이 저장된 array 21 | 22 | Returns: 23 | discounted_r(np.array): Discounted 된 reward가 저장된 array 24 | ''' 25 | discounted_r = np.zeros_like(r, dtype=np.float32) 26 | running_add = 0 27 | for t in reversed(range(len(r))): 28 | running_add = running_add * DISCOUNT + r[t] 29 | discounted_r[t] = running_add 30 | 31 | return discounted_r 32 | 33 | 34 | def train_episodic(A2Cagent, x, y, r): 35 | '''에피소드당 학습을 하기위한 함수 36 | 37 | Args: 38 | A2Cagent(ActorCritic): 학습될 네트워크 39 | x(np.array): State가 저장되어있는 array 40 | y(np.array): Action(one_hot)이 저장되어있는 array 41 | r(np.array) : Discounted reward가 저장되어있는 array 42 | 43 | Returns: 44 | l(float): 네트워크에 의한 loss 45 | ''' 46 | l, _ = A2Cagent.sess.run([A2Cagent.loss, A2Cagent.train], feed_dict={A2Cagent.X: x, A2Cagent.Y: y, A2Cagent.r: r}) 47 | return l 48 | 49 | 50 | def play_cartpole(A2Cagent): 51 | '''학습된 네트워크로 Play하기 위한 함수 52 | 53 | Args: 54 | A2Cagent(ActorCritic): 학습된 네트워크 55 | ''' 56 | print("Play Cartpole!") 57 | episode = 0 58 | while True: 59 | s = env.reset() 60 | done = False 61 | rall = 0 62 | episode += 1 63 | while not done: 64 | env.render() 65 | action_p = A2Cagent.get_action(s) 66 | s1, reward, done, _ = env.step(action_p) 67 | s = s1 68 | rall += reward 69 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall)) 70 | 71 | 72 | class ActorCritic: 73 | def __init__(self, sess, input_size, output_size): 74 | self.sess = sess 75 | self.input_size = input_size 76 | self.output_size = output_size 77 | 78 | self.build_network() 79 | 80 | def build_network(self): 81 | 82 | 
self.X = tf.placeholder('float', [None, self.input_size]) 83 | self.Y = tf.placeholder('float', [None, self.output_size]) 84 | 85 | self.r = tf.placeholder('float') 86 | 87 | # Actor Weight 88 | w1_a = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 89 | w2_a = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer()) 90 | 91 | # Critic Weight 92 | w1_c = tf.get_variable('w1_c', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 93 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer()) 94 | 95 | # Actor Critic Network 96 | l1_a = tf.nn.relu(tf.matmul(self.X, w1_a)) 97 | l1_c = tf.nn.relu(tf.matmul(self.X, w1_c)) 98 | self.a_prob = tf.nn.softmax(tf.matmul(l1_a, w2_a)) 99 | self.v = tf.matmul(l1_c, w2_c) 100 | 101 | # A_t = R_t - V(S_t) 102 | self.adv = self.r - self.v 103 | 104 | # Policy loss 105 | self.log_p = self.Y * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.)) 106 | self.log_lik = self.log_p * tf.stop_gradient(self.adv) 107 | self.p_loss = -tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1)) 108 | 109 | # entropy(for more exploration) 110 | self.entropy = -tf.reduce_mean(tf.reduce_sum(self.a_prob * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.)), axis=1)) 111 | 112 | # Value loss 113 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1) 114 | 115 | # Total loss 116 | self.loss = self.p_loss + self.v_loss - self.entropy * 0.01 117 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 118 | 119 | def get_action(self, state): 120 | state_t = np.reshape(state, [1, self.input_size]) 121 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t}) 122 | 123 | # 각 액션의 확률로 액션을 결정 124 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 125 | 126 | return action 127 | 128 | 129 | def main(): 130 | with tf.Session() as sess: 131 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT) 132 | 133 | A2Cagent.sess.run(tf.global_variables_initializer()) 134 | episode = 0 135 | recent_rlist = deque(maxlen=100) 136 | recent_rlist.append(0) 137 | 138 | # 최근 100개의 점수가 195점 넘을 때까지 학습 139 | while np.mean(recent_rlist) <= 195: 140 | episode += 1 141 | episode_memory = deque() 142 | rall = 0 143 | s = env.reset() 144 | done = False 145 | 146 | while not done: 147 | # 액션 선택 148 | action = A2Cagent.get_action(s) 149 | 150 | # action을 one_hot으로 표현 151 | y = np.zeros(OUTPUT) 152 | y[action] = 1 153 | 154 | s1, reward, done, _ = env.step(action) 155 | rall += reward 156 | 157 | # 에피소드 메모리에 저장 158 | episode_memory.append([s, y, reward]) 159 | s = s1 160 | 161 | # 에피소드가 끝났을때 학습 162 | if done: 163 | episode_memory = np.array(episode_memory) 164 | 165 | discounted_rewards = discount_rewards(np.vstack(episode_memory[:, 2])) 166 | 167 | discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std()) 168 | 169 | train_episodic(A2Cagent, np.vstack(episode_memory[:, 0]), np.vstack(episode_memory[:, 1]), 170 | discounted_rewards) 171 | 172 | recent_rlist.append(rall) 173 | 174 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f}".format(episode, rall, np.mean(recent_rlist))) 175 | 176 | play_cartpole(A2Cagent) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /CartPole/CartPole_C51.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | import random as ran 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | from collections import deque 11 | 12 | env = gym.make('CartPole-v1') 13 | 14 | # 하이퍼 파라미터 15 | MINIBATCH_SIZE = 64 16 | TRAIN_START = 1000 17 | FINAL_EXPLORATION = 0.01 18 | TARGET_UPDATE = 1000 19 | MEMORY_SIZE = 50000 20 | EXPLORATION = 20000 21 | START_EXPLORATION = 1. 22 | INPUT = env.observation_space.shape[0] 23 | OUTPUT = env.action_space.n 24 | LEARNING_RATE = 0.001 25 | DISCOUNT = 0.99 26 | VMIN = -10 27 | VMAX = 40 28 | CATEGORY = 51 29 | 30 | model_path = "save/CartPole_C51.ckpt" 31 | 32 | 33 | def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"): 34 | '''타겟네트워크에 메인네트워크의 Weight값을 복사. 35 | 36 | Args: 37 | dest_scope_name="target"(DQN): 'target'이라는 이름을 가진 객체를 가져옴 38 | src_scope_name="main"(DQN): 'main'이라는 이름을 가진 객체를 가져옴 39 | 40 | Returns: 41 | list: main의 trainable한 값들이 target의 값으로 복사된 값 42 | ''' 43 | op_holder = [] 44 | 45 | src_vars = tf.get_collection( 46 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name) 47 | dest_vars = tf.get_collection( 48 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name) 49 | 50 | for src_var, dest_var in zip(src_vars, dest_vars): 51 | op_holder.append(dest_var.assign(src_var.value())) 52 | 53 | return op_holder 54 | 55 | 56 | def train_minibatch(mainC51, targetC51, minibatch): 57 | '''미니배치로 가져온 sample데이터로 메인네트워크 학습 58 | 59 | Args: 60 | mainC51(object): 메인 네트워크 61 | targetC51(object): 타겟 네트워크 62 | minibatch: replay_memory에서 MINIBATCH 개수만큼 랜덤 sampling 해온 값 63 | 64 | Note: 65 | replay_memory에서 꺼내온 값으로 메인 네트워크를 학습 66 | ''' 67 | s_stack = [] 68 | a_stack = [] 69 | r_stack = [] 70 | s1_stack = [] 71 | d_stack = [] 72 | m_prob = [np.zeros((len(minibatch), mainC51.category_size)) for _ in range(OUTPUT)] 73 | 74 | for s_r, a_r, r_r, d_r, s1_r in minibatch: 75 | s_stack.append(s_r) 76 | a_stack.append(a_r) 77 | r_stack.append(r_r) 78 | s1_stack.append(s1_r) 79 | d_stack.append(d_r) 80 | 81 | # Categorical Algorithm 82 | target_sum_q = targetC51.sess.run(targetC51.soft_dist_Q, feed_dict={targetC51.X: np.vstack(s1_stack)}) 83 | 84 | # Get optimal action 85 | sum_q = mainC51.optimal_action(s1_stack) 86 | sum_q = sum_q.reshape([len(minibatch), OUTPUT], order='F') 87 | optimal_actions = np.argmax(sum_q, axis=1) 88 | 89 | for i in range(len(minibatch)): 90 | if d_stack[i]: 91 | # Compute the projection of Tz 92 | Tz = min(VMAX, max(VMIN, r_stack[i])) 93 | bj = (Tz - VMIN) / mainC51.delta_z 94 | m_l, m_u = math.floor(bj), math.ceil(bj) 95 | 96 | # Distribute probability Tz 97 | m_prob[a_stack[i]][i][int(m_l)] += (m_u - bj) 98 | m_prob[a_stack[i]][i][int(m_u)] += (bj - m_l) 99 | else: 100 | for j in range(mainC51.category_size): 101 | # Compute the projection of Tz 102 | Tz = min(VMAX, max(VMIN, r_stack[i] + DISCOUNT * mainC51.z[j])) 103 | bj = (Tz - VMIN) / mainC51.delta_z 104 | m_l, m_u = math.floor(bj), math.ceil(bj) 105 | 106 | # Distribute probability Tz 107 | m_prob[a_stack[i]][i][int(m_l)] += (m_u - bj) * target_sum_q[optimal_actions[i]][i][j] 108 | m_prob[a_stack[i]][i][int(m_u)] += (bj - m_l) * target_sum_q[optimal_actions[i]][i][j] 109 | 110 | mainC51.sess.run(mainC51.train, feed_dict={mainC51.X: np.vstack(s_stack), mainC51.Y: m_prob}) 111 | 112 | 113 | class C51Agent: 114 | def __init__(self, sess, INPUT, OUTPUT, VMAX, VMIN, CATEGORY, NAME='main'): 115 | self.sess = 
sess 116 | 117 | self.input_size = INPUT 118 | self.output_size = OUTPUT 119 | self.category_size = CATEGORY 120 | self.delta_z = (VMAX - VMIN) / float(self.category_size - 1) 121 | self.z = [VMIN + i * self.delta_z for i in range(self.category_size)] 122 | self.name = NAME 123 | 124 | self.build_network() 125 | 126 | def build_network(self): 127 | with tf.variable_scope(self.name): 128 | self.X = tf.placeholder('float', [None, self.input_size]) 129 | self.Y = tf.placeholder('float', [2, None, self.category_size]) 130 | 131 | self.dist_Q = [] 132 | 133 | w1 = tf.get_variable("w1", shape=[self.input_size, 256], initializer=tf.contrib.layers.xavier_initializer()) 134 | 135 | # Output weight 136 | for i in range(self.output_size): 137 | exec( 138 | 'w2_%s = tf.get_variable("w2_%s", shape=[256, self.category_size], initializer=tf.contrib.layers.xavier_initializer())' % ( 139 | i, i)) 140 | 141 | l1 = tf.nn.selu(tf.matmul(self.X, w1)) 142 | # Output Layer 143 | for i in range(self.output_size): 144 | exec('self.dist_Q.append(tf.matmul(l1, w2_%s))' % i) 145 | 146 | self.soft_dist_Q = tf.nn.softmax(self.dist_Q) 147 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.Y, logits=self.dist_Q)) 148 | optimizer = tf.train.AdamOptimizer(LEARNING_RATE) 149 | self.train = optimizer.minimize(self.loss) 150 | 151 | self.saver = tf.train.Saver(max_to_keep=None) 152 | 153 | def get_action(self, state, e): 154 | if e > np.random.rand(1): 155 | action = np.random.randint(self.output_size) 156 | else: 157 | sum_q = self.optimal_action(state) 158 | action = np.argmax(sum_q) 159 | return action 160 | 161 | def optimal_action(self, state): 162 | state = np.vstack(state) 163 | state = state.reshape([-1, self.input_size]) 164 | z = self.sess.run(self.soft_dist_Q, feed_dict={self.X: state}) 165 | z_stack = np.vstack(z) 166 | sum_q = np.sum(np.multiply(z_stack, np.array(self.z)), axis=1) 167 | return sum_q 168 | 169 | 170 | def main(): 171 | with tf.Session() as sess: 172 | mainC51 = C51Agent(sess, INPUT, OUTPUT, VMAX, VMIN, CATEGORY, NAME='main') 173 | targetC51 = C51Agent(sess, INPUT, OUTPUT, VMAX, VMIN, CATEGORY, NAME='target') 174 | 175 | sess.run(tf.global_variables_initializer()) 176 | 177 | # initial copy q_net -> target_net 178 | copy_ops = get_copy_var_ops(dest_scope_name="target", 179 | src_scope_name="main") 180 | sess.run(copy_ops) 181 | 182 | recent_rlist = deque(maxlen=100) 183 | recent_rlist.append(0) 184 | e = 1. 
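# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the nested loop in
# train_minibatch() above is the C51 categorical projection -- the Bellman-
# updated support r + DISCOUNT * z is clipped to [VMIN, VMAX] and its
# probability mass is split between the two nearest fixed atoms. A
# self-contained NumPy version of that projection for a single transition
# could look like the function below (all names are hypothetical, chosen only
# for this example; it also covers the corner case where the target lands
# exactly on an atom).
import numpy as np

def project_distribution(next_probs, reward, done,
                         v_min=-10.0, v_max=40.0, n_atoms=51, gamma=0.99):
    """Project the distributional Bellman target onto the fixed atom support."""
    delta_z = (v_max - v_min) / (n_atoms - 1)
    support = v_min + np.arange(n_atoms) * delta_z
    projected = np.zeros(n_atoms)

    if done:
        # Terminal transition: all probability mass collapses onto the reward.
        targets, probs = np.array([reward]), np.array([1.0])
    else:
        targets, probs = reward + gamma * support, np.asarray(next_probs)

    for tz, p in zip(np.clip(targets, v_min, v_max), probs):
        b = (tz - v_min) / delta_z
        l, u = int(np.floor(b)), int(np.ceil(b))
        if l == u:                      # target sits exactly on an atom
            projected[l] += p
        else:                           # split the mass between the two neighbours
            projected[l] += p * (u - b)
            projected[u] += p * (b - l)
    return projected                    # sums to 1 when next_probs sums to 1
# ---------------------------------------------------------------------------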
185 | episode, epoch, frame = 0, 0, 0 186 | 187 | replay_memory = deque(maxlen=MEMORY_SIZE) 188 | 189 | # Train agent 190 | while np.mean(recent_rlist) <= 495: 191 | episode += 1 192 | 193 | rall, count = 0, 0 194 | d = False 195 | s = env.reset() 196 | 197 | while not d: 198 | frame += 1 199 | count += 1 200 | 201 | # e-greedy 202 | if e > FINAL_EXPLORATION and frame > TRAIN_START: 203 | e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION 204 | 205 | # 액션 선택 206 | action = mainC51.get_action(s, e) 207 | 208 | # s1 : next frame / r : reward / d : done(terminal) / l : info(lives) 209 | s1, r, d, l = env.step(action) 210 | if d and count < env.spec.timestep_limit: 211 | reward = -1 212 | else: 213 | reward = r 214 | 215 | replay_memory.append((s, action, reward, d, s1)) 216 | s = s1 217 | 218 | rall += r 219 | 220 | if frame > TRAIN_START: 221 | minibatch = ran.sample(replay_memory, MINIBATCH_SIZE) 222 | train_minibatch(mainC51, targetC51, minibatch) 223 | 224 | if frame % TARGET_UPDATE == 0: 225 | copy_ops = get_copy_var_ops(dest_scope_name="target", 226 | src_scope_name="main") 227 | sess.run(copy_ops) 228 | 229 | recent_rlist.append(rall) 230 | 231 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | " 232 | "Recent reward:{5:.5f} ".format(episode, frame, count, rall, e, 233 | np.mean(recent_rlist))) 234 | 235 | 236 | if __name__ == "__main__": 237 | main() 238 | -------------------------------------------------------------------------------- /CartPole/CartPole_DDQN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import gym 4 | from gym import wrappers 5 | import numpy as np 6 | import random as ran 7 | 8 | env = gym.make('CartPole-v0') 9 | 10 | # 꺼내서 사용할 리플레이 갯수 11 | REPLAY = 10 12 | # 리플레이를 저장할 리스트 13 | REPLAY_MEMORY = [] 14 | # 미니배치 15 | MINIBATCH = 50 16 | 17 | INPUT = env.observation_space.shape[0] 18 | OUTPUT = env.action_space.n 19 | 20 | # 하이퍼파라미터 21 | LEARNING_LATE = 0.001 22 | DISCOUNT = 0.99 23 | model_path = "save/model.ckpt" 24 | 25 | 26 | # 두개의 네트워크 구성 27 | 28 | x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT)) 29 | 30 | y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT)) 31 | dropout = tf.placeholder(dtype=tf.float32) 32 | 33 | # Main 네트워크 34 | W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer()) 35 | W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer()) 36 | # W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer()) 37 | W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer()) 38 | 39 | b1 = tf.Variable(tf.zeros([1],dtype=tf.float32)) 40 | b2 = tf.Variable(tf.zeros([1],dtype=tf.float32)) 41 | 42 | _L1=tf.nn.relu(tf.matmul(x,W1)+b1) 43 | L1=tf.nn.dropout(_L1,dropout) 44 | _L2=tf.nn.relu(tf.matmul(L1,W2)+b2) 45 | L2=tf.nn.dropout(_L2,dropout) 46 | # L3=tf.nn.relu(tf.matmul(L2,W3)) 47 | Q_pre = tf.matmul(L2,W4) 48 | 49 | # Target 네트워크 50 | W1_r = tf.get_variable('W1_r',shape=[INPUT, 200]) 51 | W2_r = tf.get_variable('W2_r',shape=[200,200]) 52 | # W3_r = tf.get_variable('W3_r',shape=[200,150]) 53 | W4_r = tf.get_variable('W4_r',shape=[200, OUTPUT]) 54 | 55 | b1_r = tf.Variable(tf.zeros([1],dtype=tf.float32)) 56 | b2_r = tf.Variable(tf.zeros([1],dtype=tf.float32)) 57 | 58 | 59 | L1_r=tf.nn.relu(tf.matmul(x ,W1_r)+b1_r) 60 | L2_r=tf.nn.relu(tf.matmul(L1_r,W2_r)+b2_r) 
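# ---------------------------------------------------------------------------
# Illustrative note (not part of the original file): the update in the
# training loop further below follows the Double DQN rule -- the main network
# picks the best next action and the target network evaluates it, which
# reduces the overestimation bias of taking a max over the same noisy
# estimates. With the two networks' next-state Q-values held in NumPy arrays,
# the two target rules differ only in where the argmax/max is taken
# (hypothetical helper names, single-transition sketch):
import numpy as np

def dqn_target(reward, done, q_target_next, discount=0.99):
    # Vanilla DQN: the target network both selects and evaluates the action.
    return reward if done else reward + discount * np.max(q_target_next)

def double_dqn_target(reward, done, q_main_next, q_target_next, discount=0.99):
    # Double DQN: the main network selects, the target network evaluates.
    best_action = np.argmax(q_main_next)
    return reward if done else reward + discount * q_target_next[best_action]
# ---------------------------------------------------------------------------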
61 | # L3_r=tf.nn.relu(tf.matmul(L2_r,W3_r)) 62 | Q_pre_r = tf.matmul(L2_r,W4_r) 63 | 64 | # 총 Reward를 저장해놓을 리스트 65 | rlist=[0] 66 | recent_rlist=[0] 67 | 68 | episode = 0 69 | 70 | # Loss function 정의 71 | cost = tf.reduce_sum(tf.square(y-Q_pre)) 72 | optimizer = tf.train.AdamOptimizer(LEARNING_LATE, epsilon=0.01) 73 | train = optimizer.minimize(cost) 74 | 75 | 76 | saver = tf.train.Saver() 77 | 78 | # 세션 정의 79 | with tf.Session(config = tf.ConfigProto(device_count ={'GPU' : 0})) as sess: 80 | # 변수 초기화 81 | sess.run(tf.global_variables_initializer()) 82 | # Target 네트워크에 main 네트워크 값을 카피해줌 83 | sess.run(W1_r.assign(W1)) 84 | sess.run(W2_r.assign(W2)) 85 | sess.run(W4_r.assign(W4)) 86 | sess.run(b1_r.assign(b1)) 87 | sess.run(b2_r.assign(b2)) 88 | 89 | # 에피소드 시작 90 | while np.mean(recent_rlist) < 195 : 91 | episode += 1 92 | 93 | # state 초기화 94 | s = env.reset() 95 | if len(recent_rlist) > 200: 96 | del recent_rlist[0] 97 | # e-greedy 98 | e = 1. / ((episode/25)+1) 99 | 100 | rall = 0 101 | d = False 102 | count = 0 103 | 104 | # 에피소드가 끝나기 전까지 반복 105 | while not d and count < 10000 : 106 | 107 | #env.render() 108 | count += 1 109 | 110 | # state 값의 전처리 111 | s_t = np.reshape(s,[1,INPUT]) 112 | 113 | # 현재 상태의 Q값을 에측 114 | Q = sess.run(Q_pre, feed_dict={x:s_t, dropout: 1}) 115 | 116 | # e-greedy 정책으로 랜덤하게 action 결정 117 | if e > np.random.rand(1): 118 | a = env.action_space.sample() 119 | else: 120 | a = np.argmax(Q) 121 | 122 | # 결정된 action으로 Environment에 입력 123 | s1, r, d, _ = env.step(a) 124 | 125 | # Environment에서 반환한 Next_state, action, reward, done 값들을 126 | # Replay_memory에 저장 127 | REPLAY_MEMORY.append([s_t,a,r,s1,d,count]) 128 | 129 | # 저장된 값들이 50000개 이상 넘어가면 맨 앞 Replay부터 삭제 130 | if len(REPLAY_MEMORY) > 50000: 131 | del REPLAY_MEMORY[0] 132 | 133 | # 총 reward 합 134 | rall += r 135 | # state를 Next_state로 바꿈 136 | s = s1 137 | 138 | 139 | # 10번의 episode마다 학습 140 | if len(REPLAY_MEMORY) > 50: 141 | 142 | # 50번의 미니배치로 학습 143 | # 저장된 리플레이 중에 학습에 사용할 랜덤한 리플레이 샘플들을 가져옴 144 | for sample in ran.sample(REPLAY_MEMORY, REPLAY): 145 | 146 | s_t_r, a_r, r_r, s1_r, d_r ,count_r= sample 147 | 148 | # 꺼내온 리플레이의 state의 Q값을 예측 149 | Y = sess.run(Q_pre, feed_dict={x: s_t_r, dropout: 1}) 150 | 151 | if d_r: 152 | # 꺼내온 리플레이의 상태가 끝난 상황이라면 Negative Reward를 부여 153 | if count_r < env.spec.timestep_limit : 154 | Y[0, a_r] = -100 155 | else: 156 | # 끝나지 않았다면 Q값을 업데이트 157 | s1_t_r= np.reshape(s1_r,[1,INPUT]) 158 | Q1, Q = sess.run([Q_pre_r,Q_pre], feed_dict={x: s1_t_r, dropout:1}) 159 | Y[0, a_r] = r_r + DISCOUNT * Q1[0, np.argmax(Q)] 160 | 161 | # 업데이트 된 Q값으로 main네트워크를 학습 162 | _, loss = sess.run([train, cost], feed_dict={x: s_t_r, y: Y, dropout:1}) 163 | 164 | # 10번 마다 target 네트워크에 main 네트워크 값을 copy 165 | sess.run(W1_r.assign(W1)) 166 | sess.run(W2_r.assign(W2)) 167 | sess.run(W4_r.assign(W4)) 168 | sess.run(b1_r.assign(b1)) 169 | sess.run(b2_r.assign(b2)) 170 | print(loss) 171 | 172 | # 총 reward의 합을 list에 저장 173 | recent_rlist.append(rall) 174 | rlist.append(rall) 175 | print("Episode:{} steps:{} reward:{} average reward:{} recent reward:{}".format(episode, count, rall, 176 | np.mean(rlist), 177 | np.mean(recent_rlist))) 178 | 179 | save_path = saver.save(sess, model_path) 180 | print("Model saved in file: ",save_path) 181 | 182 | 183 | rlist=[] 184 | recent_rlist=[] 185 | 186 | 187 | with tf.Session() as sess: 188 | sess.run(tf.global_variables_initializer()) 189 | saver.restore(sess, model_path) 190 | 191 | print("Model restored form file: ", save_path) 192 | for episode in range(500): 193 | # state 초기화 194 
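# During this evaluation run the restored network is used greedily: the
# action is always argmax(Q), there is no epsilon-greedy exploration, and
# dropout is effectively disabled by feeding a keep-probability of 1.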
| s = env.reset() 195 | 196 | rall = 0 197 | d = False 198 | count = 0 199 | # 에피소드가 끝나기 전까지 반복 200 | while not d : 201 | env.render() 202 | count += 1 203 | # state 값의 전처리 204 | s_t = np.reshape(s, [1, INPUT]) 205 | 206 | # 현재 상태의 Q값을 에측 207 | Q = sess.run(Q_pre, feed_dict={x: s_t,dropout: 1}) 208 | a = np.argmax(Q) 209 | 210 | # 결정된 action으로 Environment에 입력 211 | s, r, d, _ = env.step(a) 212 | 213 | # 총 reward 합 214 | rall += r 215 | 216 | 217 | rlist.append(rall) 218 | 219 | print("Episode : {} steps : {} r={}. averge reward : {}".format(episode, count, rall, 220 | np.mean(rlist))) 221 | 222 | -------------------------------------------------------------------------------- /CartPole/CartPole_DQN_NIPS2013.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import gym 4 | import numpy as np 5 | import random as ran 6 | 7 | env = gym.make('CartPole-v1') 8 | 9 | # 꺼내서 사용할 리플레이 갯수 10 | REPLAY = 10 11 | # 리플레이를 저장할 리스트 12 | REPLAY_MEMORY = [] 13 | # 미니배치 14 | MINIBATCH = 50 15 | 16 | INPUT = env.observation_space.shape[0] 17 | OUTPUT = env.action_space.n 18 | 19 | # 하이퍼파라미터 20 | LEARNING_LATE = 0.01 21 | NUM_EPISODE = 2000 22 | 23 | DISCOUNT = 0.99 24 | 25 | 26 | # 네트워크 구성 27 | x=tf.placeholder(dtype=tf.float32, shape=(1,4)) 28 | 29 | W1 = tf.get_variable('W1',shape=[INPUT,10],initializer=tf.contrib.layers.xavier_initializer()) 30 | W2 = tf.get_variable('W4',shape=[10, OUTPUT],initializer=tf.contrib.layers.xavier_initializer()) 31 | 32 | L1=tf.nn.tanh(tf.matmul(x,W1)) 33 | Q_pre = tf.matmul(L1,W2) 34 | 35 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n)) 36 | 37 | # 손실 함수 38 | loss = tf.reduce_sum(tf.square(y-Q_pre)) 39 | optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_LATE) 40 | train = optimizer.minimize(loss) 41 | 42 | init = tf.global_variables_initializer() 43 | 44 | rList=[] 45 | 46 | with tf.Session() as sess: 47 | sess.run(init) 48 | for episode in range(5000): 49 | 50 | s = env.reset() 51 | 52 | e = 1. / ((episode/25)+1) 53 | rall = 0 54 | d = False 55 | count=0 56 | 57 | while not d: 58 | # env.render() 59 | count+=1 60 | 61 | # 현재 상태(s)로 Q값을 예측 62 | s_t = np.reshape(s,[1,INPUT]) 63 | Q = sess.run(Q_pre, feed_dict={x:s_t}) 64 | 65 | # e-greedy 를 사용하여 action값 구함 66 | if e > np.random.rand(1): 67 | a = env.action_space.sample() 68 | else: 69 | a = np.argmax(Q) 70 | 71 | # action을 취함 72 | s1, r, d, _ = env.step(a) 73 | 74 | # state, action, reward, next_state, done 을 메모리에 저장 75 | REPLAY_MEMORY.append([s_t,a,r,s1,d]) 76 | 77 | # 메모리에 50000개 이상의 값이 들어가면 가장 먼저 들어간 것부터 삭제 78 | if len(REPLAY_MEMORY) > 50000: 79 | del REPLAY_MEMORY[0] 80 | 81 | rall += r 82 | s = s1 83 | 84 | # 10 번의 스탭마다 미니배치로 학습 85 | if episode % 10 == 1 : 86 | 87 | for i in range(MINIBATCH): 88 | 89 | # 메모리에서 사용할 리플레이를 랜덤하게 가져옴 90 | for sample in ran.sample(REPLAY_MEMORY, REPLAY): 91 | 92 | s_t_r, a_r, r_r, s1_r ,d_r = sample 93 | 94 | # DQN 알고리즘으로 학습 95 | if d_r: 96 | Q[0, a_r] = -100 97 | else: 98 | s1_t_r= np.reshape(s1_r,[1,INPUT]) 99 | 100 | Q1 = sess.run(Q_pre, feed_dict={x: s1_t_r}) 101 | 102 | Q[0, a_r] = r_r + DISCOUNT * np.max(Q1) 103 | 104 | sess.run(train, feed_dict={x: s_t_r, y: Q}) 105 | 106 | 107 | 108 | rList.append(rall) 109 | print("Episode {} finished after {} timesteps with r={}. 
Running score: {}".format(episode, count, rall, np.mean(rList))) 110 | 111 | 112 | for episode in range(500): 113 | # state 초기화 114 | s = env.reset() 115 | 116 | rall = 0 117 | d = False 118 | count = 0 119 | # 에피소드가 끝나기 전까지 반복 120 | while not d : 121 | env.render() 122 | count += 1 123 | # state 값의 전처리 124 | s_t = np.reshape(s, [1, INPUT]) 125 | 126 | # 현재 상태의 Q값을 에측 127 | Q = sess.run(Q_pre, feed_dict={x: s_t}) 128 | a = np.argmax(Q) 129 | 130 | # 결정된 action으로 Environment에 입력 131 | s, r, d, _ = env.step(a) 132 | 133 | # 총 reward 합 134 | rall += r 135 | 136 | 137 | rList.append(rall) 138 | 139 | print("Episode : {} steps : {} r={}. averge reward : {}".format(episode, count, rall, 140 | np.mean(rList))) 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /CartPole/CartPole_DQN_Nature2015.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import gym 4 | from gym import wrappers 5 | import numpy as np 6 | import random as ran 7 | 8 | env = gym.make('CartPole-v0') 9 | 10 | # 꺼내서 사용할 리플레이 갯수 11 | REPLAY = 50 12 | # 리플레이를 저장할 리스트 13 | REPLAY_MEMORY = [] 14 | # 미니배치 15 | MINIBATCH = 50 16 | 17 | INPUT = env.observation_space.shape[0] 18 | OUTPUT = env.action_space.n 19 | 20 | # 하이퍼파라미터 21 | LEARNING_LATE = 0.001 22 | DISCOUNT = 0.99 23 | model_path = "save/model.ckpt" 24 | 25 | 26 | # 두개의 네트워크 구성 27 | 28 | x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT)) 29 | 30 | y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT)) 31 | dropout = tf.placeholder(dtype=tf.float32) 32 | 33 | # Main 네트워크 34 | W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer()) 35 | W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer()) 36 | # W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer()) 37 | W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer()) 38 | 39 | b1 = tf.Variable(tf.zeros([1],dtype=tf.float32)) 40 | b2 = tf.Variable(tf.zeros([1],dtype=tf.float32)) 41 | 42 | _L1=tf.nn.relu(tf.matmul(x,W1)+b1) 43 | L1=tf.nn.dropout(_L1,dropout) 44 | _L2=tf.nn.relu(tf.matmul(L1,W2)+b2) 45 | L2=tf.nn.dropout(_L2,dropout) 46 | # L3=tf.nn.relu(tf.matmul(L2,W3)) 47 | Q_pre = tf.matmul(L2,W4) 48 | 49 | # Target 네트워크 50 | W1_r = tf.get_variable('W1_r',shape=[INPUT, 200]) 51 | W2_r = tf.get_variable('W2_r',shape=[200,200]) 52 | # W3_r = tf.get_variable('W3_r',shape=[200,150]) 53 | W4_r = tf.get_variable('W4_r',shape=[200, OUTPUT]) 54 | 55 | b1_r = tf.Variable(tf.zeros([1],dtype=tf.float32)) 56 | b2_r = tf.Variable(tf.zeros([1],dtype=tf.float32)) 57 | 58 | 59 | L1_r=tf.nn.relu(tf.matmul(x ,W1_r)+b1_r) 60 | L2_r=tf.nn.relu(tf.matmul(L1_r,W2_r)+b2_r) 61 | # L3_r=tf.nn.relu(tf.matmul(L2_r,W3_r)) 62 | Q_pre_r = tf.matmul(L2_r,W4_r) 63 | 64 | # 총 Reward를 저장해놓을 리스트 65 | rlist=[0] 66 | recent_rlist=[0] 67 | 68 | episode = 0 69 | 70 | # Loss function 정의 71 | cost = tf.reduce_sum(tf.square(y-Q_pre)) 72 | optimizer = tf.train.AdamOptimizer(LEARNING_LATE, epsilon=0.01) 73 | train = optimizer.minimize(cost) 74 | 75 | 76 | saver = tf.train.Saver() 77 | 78 | # 세션 정의 79 | with tf.Session() as sess: 80 | # 변수 초기화 81 | sess.run(tf.global_variables_initializer()) 82 | # Target 네트워크에 main 네트워크 값을 카피해줌 83 | sess.run(W1_r.assign(W1)) 84 | sess.run(W2_r.assign(W2)) 85 | sess.run(W4_r.assign(W4)) 86 | sess.run(b1_r.assign(b1)) 87 | 
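# Copying the main-network weights into the separate target network (here and
# again after each periodic training pass below) is the Nature-2015
# stabilization trick: the bootstrapped Q-targets stay frozen between syncs
# instead of chasing a network that changes at every gradient step.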
sess.run(b2_r.assign(b2)) 88 | 89 | # 에피소드 시작 90 | while np.mean(recent_rlist) < 195 : 91 | episode += 1 92 | 93 | # state 초기화 94 | s = env.reset() 95 | if len(recent_rlist) > 200: 96 | del recent_rlist[0] 97 | # e-greedy 98 | e = 1. / ((episode/25)+1) 99 | 100 | rall = 0 101 | d = False 102 | count = 0 103 | 104 | # 에피소드가 끝나기 전까지 반복 105 | while not d and count < 10000 : 106 | 107 | #env.render() 108 | count += 1 109 | 110 | # state 값의 전처리 111 | s_t = np.reshape(s,[1,INPUT]) 112 | 113 | # 현재 상태의 Q값을 에측 114 | Q = sess.run(Q_pre, feed_dict={x:s_t, dropout: 1}) 115 | 116 | # e-greedy 정책으로 랜덤하게 action 결정 117 | if e > np.random.rand(1): 118 | a = env.action_space.sample() 119 | else: 120 | a = np.argmax(Q) 121 | 122 | # 결정된 action으로 Environment에 입력 123 | s1, r, d, _ = env.step(a) 124 | 125 | # Environment에서 반환한 Next_state, action, reward, done 값들을 126 | # Replay_memory에 저장 127 | REPLAY_MEMORY.append([s_t,a,r,s1,d,count]) 128 | 129 | # 저장된 값들이 50000개 이상 넘어가면 맨 앞 Replay부터 삭제 130 | if len(REPLAY_MEMORY) > 50000: 131 | del REPLAY_MEMORY[0] 132 | 133 | # 총 reward 합 134 | rall += r 135 | # state를 Next_state로 바꿈 136 | s = s1 137 | 138 | 139 | # 10번의 episode마다 학습 140 | if episode % 10 == 1 and len(REPLAY_MEMORY) > 50: 141 | 142 | # 50번의 미니배치로 학습 143 | # 저장된 리플레이 중에 학습에 사용할 랜덤한 리플레이 샘플들을 가져옴 144 | for sample in ran.sample(REPLAY_MEMORY, REPLAY): 145 | 146 | s_t_r, a_r, r_r, s1_r, d_r ,count_r= sample 147 | 148 | # 꺼내온 리플레이의 state의 Q값을 예측 149 | Q = sess.run(Q_pre, feed_dict={x: s_t_r, dropout: 1}) 150 | 151 | if d_r: 152 | # 꺼내온 리플레이의 상태가 끝난 상황이라면 Negative Reward를 부여 153 | if count_r < env.spec.timestep_limit : 154 | Q[0, a_r] = -100 155 | else: 156 | # 끝나지 않았다면 Q값을 업데이트 157 | s1_t_r= np.reshape(s1_r,[1,INPUT]) 158 | Q1 = sess.run(Q_pre_r, feed_dict={x: s1_t_r}) 159 | Q[0, a_r] = r_r + DISCOUNT * np.max(Q1) 160 | 161 | # 업데이트 된 Q값으로 main네트워크를 학습 162 | _, loss = sess.run([train, cost], feed_dict={x: s_t_r, y: Q, dropout:1}) 163 | 164 | # 10번 마다 target 네트워크에 main 네트워크 값을 copy 165 | sess.run(W1_r.assign(W1)) 166 | sess.run(W2_r.assign(W2)) 167 | sess.run(W4_r.assign(W4)) 168 | sess.run(b1_r.assign(b1)) 169 | sess.run(b2_r.assign(b2)) 170 | print(loss) 171 | 172 | # 총 reward의 합을 list에 저장 173 | recent_rlist.append(rall) 174 | rlist.append(rall) 175 | print("Episode:{} steps:{} reward:{} average reward:{} recent reward:{}".format(episode, count, rall, 176 | np.mean(rlist), 177 | np.mean(recent_rlist))) 178 | 179 | save_path = saver.save(sess, model_path) 180 | print("Model saved in file: ",save_path) 181 | 182 | 183 | rlist=[] 184 | recent_rlist=[] 185 | 186 | 187 | with tf.Session() as sess: 188 | sess.run(tf.global_variables_initializer()) 189 | saver.restore(sess, model_path) 190 | 191 | print("Model restored form file: ", save_path) 192 | for episode in range(500): 193 | # state 초기화 194 | s = env.reset() 195 | 196 | rall = 0 197 | d = False 198 | count = 0 199 | # 에피소드가 끝나기 전까지 반복 200 | while not d : 201 | env.render() 202 | count += 1 203 | # state 값의 전처리 204 | s_t = np.reshape(s, [1, INPUT]) 205 | 206 | # 현재 상태의 Q값을 에측 207 | Q = sess.run(Q_pre, feed_dict={x: s_t,dropout: 1}) 208 | a = np.argmax(Q) 209 | 210 | # 결정된 action으로 Environment에 입력 211 | s, r, d, _ = env.step(a) 212 | 213 | # 총 reward 합 214 | rall += r 215 | 216 | 217 | rlist.append(rall) 218 | 219 | print("Episode : {} steps : {} r={}. 
averge reward : {}".format(episode, count, rall, 220 | np.mean(rlist))) 221 | 222 | -------------------------------------------------------------------------------- /CartPole/CartPole_PAAC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | 7 | 8 | def make_batch(A2Cagent, sample): 9 | sample = np.stack(sample) 10 | discounted_return = np.empty([NSTEP, 1]) 11 | 12 | s = np.reshape(np.stack(sample[:, 0]), [NSTEP, A2Cagent.input_size]) 13 | s1 = np.reshape(np.stack(sample[:, 3]), [NSTEP, A2Cagent.input_size]) 14 | y = np.reshape(np.stack(sample[:, 1]), [NSTEP, A2Cagent.output_size]) 15 | r = np.reshape(np.stack(sample[:, 2]), [NSTEP, 1]) 16 | d = np.reshape(np.stack(sample[:, 4]), [NSTEP, 1]) 17 | 18 | value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s}) 19 | next_value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s1}) 20 | 21 | # Discounted Return 계산 22 | running_add = next_value[NSTEP - 1, 0] * d[NSTEP - 1, 0] 23 | for t in range(4, -1, -1): 24 | if d[t]: 25 | running_add = 0 26 | running_add = r[t] + DISCOUNT * running_add 27 | discounted_return[t, 0] = running_add 28 | 29 | # For critic 30 | target = r + DISCOUNT * d * next_value 31 | 32 | # For Actor 33 | adv = discounted_return - value 34 | 35 | return [s, target, y, adv] 36 | 37 | 38 | class ActorCritic: 39 | def __init__(self, sess, input_size, output_size): 40 | self.sess = sess 41 | self.input_size = input_size 42 | self.output_size = output_size 43 | 44 | self.build_network() 45 | 46 | def build_network(self): 47 | self.X = tf.placeholder('float', [None, self.input_size]) 48 | self.Y = tf.placeholder('float', [None, self.output_size]) 49 | self.adv = tf.placeholder('float') 50 | self.r = tf.placeholder('float') 51 | self.LR = tf.placeholder('float') 52 | 53 | # Common Weight 54 | w1 = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 55 | 56 | # Actor Weight 57 | w2_a = tf.get_variable('w2_a', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer()) 58 | 59 | # Critic Weight 60 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer()) 61 | 62 | # Common Layer 63 | l1 = tf.nn.selu(tf.matmul(self.X, w1)) 64 | 65 | # Actor Output 66 | self.a = tf.matmul(l1, w2_a) 67 | self.a_prob = tf.nn.softmax(tf.matmul(l1, w2_a)) 68 | 69 | # Critic Output 70 | self.v = tf.matmul(l1, w2_c) 71 | 72 | # Actor loss 73 | self.log_lik = tf.nn.softmax_cross_entropy_with_logits(labels=self.Y, logits=self.a) 74 | self.p_loss = tf.reduce_mean(self.log_lik * self.adv) 75 | 76 | # Critic loss 77 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1) 78 | 79 | # entropy(for more exploration) 80 | self.entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.a_prob, logits=self.a)) 81 | 82 | self.loss = self.p_loss - self.entropy * 0.01 + self.v_loss * 0.5 83 | 84 | optimizer = tf.train.RMSPropOptimizer(learning_rate=self.LR, epsilon=EPSILON, decay=ALPHA) 85 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 86 | gradients, _ = tf.clip_by_global_norm(gradients, 3.0) 87 | self.train = optimizer.apply_gradients(zip(gradients, variables)) 88 | 89 | def get_action(self, state): 90 | state_t = np.reshape(state, [1, self.input_size]) 91 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t}) 92 | 93 
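# Sampling the action from the softmax output (rather than taking the argmax)
# keeps the policy stochastic: that is what the policy-gradient loss above
# assumes, and it is also what provides exploration for this agent.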
| # 각 액션의 확률로 액션을 결정 94 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 95 | 96 | return action 97 | 98 | 99 | class Runner: 100 | def __init__(self, idx): 101 | self.env = gym.make('CartPole-v1') 102 | 103 | self.done = False 104 | self.s = self.env.reset() 105 | self.s1 = None 106 | self.sample = [] 107 | self.step = 0 108 | self.runner_idx = idx 109 | self.episode = 0 110 | self.rall = 0 111 | self.recent_rlist = deque(maxlen=100) 112 | self.recent_rlist.append(0) 113 | 114 | def run(self, A2Cagent): 115 | if self.done: 116 | self.episode += 1 117 | if self.runner_idx == 0: 118 | self.recent_rlist.append(self.rall) 119 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}".format(self.episode, self.rall, 120 | np.mean(self.recent_rlist))) 121 | self.done = False 122 | self.rall = 0 123 | self.step = 0 124 | self.s = self.env.reset() 125 | 126 | self.step += 1 127 | action = A2Cagent.get_action(self.s) 128 | 129 | # action을 one_hot으로 표현 130 | y = np.zeros(OUTPUT) 131 | y[action] = 1 132 | s1, reward, self.done, _ = self.env.step(action) 133 | 134 | self.rall += reward 135 | 136 | # negative reward 137 | if self.done and self.step < self.env.spec.timestep_limit: 138 | reward = -100 139 | 140 | self.sample.append([self.s, y, reward, s1, self.done]) 141 | self.s = s1 142 | 143 | 144 | def main(): 145 | with tf.Session() as sess: 146 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT) 147 | A2Cagent.sess.run(tf.global_variables_initializer()) 148 | 149 | step = 0 150 | runners = [Runner(i) for i in range(NENV)] 151 | 152 | while np.mean(runners[0].recent_rlist) <= 495: 153 | s_batch = [] 154 | target_batch = [] 155 | y_batch = [] 156 | adv_batch = [] 157 | 158 | learning_rate = LEARNING_RATE 159 | 160 | for t in range(NSTEP): 161 | for i in range(NENV): 162 | runners[i].run(A2Cagent) 163 | 164 | for i in range(NENV): 165 | batch = make_batch(A2Cagent, runners[i].sample) 166 | 167 | s_batch.extend(batch[0]) 168 | target_batch.extend(batch[1]) 169 | y_batch.extend(batch[2]) 170 | adv_batch.extend(batch[3]) 171 | 172 | runners[i].sample = [] 173 | 174 | feed_dict = {A2Cagent.X: s_batch, A2Cagent.r: target_batch, A2Cagent.Y: y_batch, A2Cagent.adv: adv_batch, 175 | A2Cagent.LR: learning_rate} 176 | 177 | # Train Network 178 | A2Cagent.sess.run([A2Cagent.train], feed_dict=feed_dict) 179 | 180 | step += NENV * NSTEP 181 | 182 | 183 | if __name__ == "__main__": 184 | env = gym.make('CartPole-v1') 185 | 186 | # 하이퍼 파라미터 187 | INPUT = env.observation_space.shape[0] 188 | OUTPUT = env.action_space.n 189 | DISCOUNT = 0.99 190 | NSTEP = 5 191 | NENV = 16 192 | EPSILON = 1e-5 193 | ALPHA = 0.99 194 | LEARNING_RATE = 7e-4 195 | main() 196 | -------------------------------------------------------------------------------- /CartPole/CartPole_PAAC_multiproc.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import torch.multiprocessing as mp 7 | 8 | import torch.nn as nn 9 | import torch 10 | 11 | from collections import deque 12 | 13 | from torch.distributions.categorical import Categorical 14 | 15 | 16 | def make_batch(sample, agent): 17 | sample = np.stack(sample) 18 | discounted_return = np.empty([NUM_STEP, 1]) 19 | 20 | s = np.reshape(np.stack(sample[:, 0]), [NUM_STEP, agent.input_size]) 21 | s1 = np.reshape(np.stack(sample[:, 3]), [NUM_STEP, agent.input_size]) 22 | y = sample[:, 1] 23 | r = np.reshape(np.stack(sample[:, 2]), 
[NUM_STEP, 1]) 24 | d = np.reshape(np.stack(sample[:, 4]), [NUM_STEP, 1]).astype(int) 25 | 26 | state = torch.from_numpy(s) 27 | state = state.float() 28 | _, value = agent.model(state) 29 | 30 | next_state = torch.from_numpy(s1) 31 | next_state = next_state.float() 32 | _, next_value = agent.model(next_state) 33 | 34 | value = value.data.numpy() 35 | next_value = next_value.data.numpy() 36 | 37 | # Discounted Return 38 | running_add = next_value[NUM_STEP - 1, 0] * (1 - d[NUM_STEP - 1, 0]) 39 | for t in range(NUM_STEP - 1, -1, -1): 40 | if d[t]: 41 | running_add = 0 42 | running_add = r[t] + DISCOUNT * running_add 43 | discounted_return[t, 0] = running_add 44 | 45 | # For critic 46 | target = r + DISCOUNT * (1 - d) * next_value 47 | 48 | # For Actor 49 | adv = discounted_return - value 50 | 51 | return [s, target, y, adv] 52 | 53 | 54 | class ActorCriticNetwork(nn.Module): 55 | def __init__(self, input_size, output_size): 56 | super(ActorCriticNetwork, self).__init__() 57 | self.feature = nn.Sequential( 58 | nn.Linear(input_size, 64), 59 | nn.ReLU(), 60 | nn.Linear(64, 64), 61 | nn.ReLU() 62 | ) 63 | self.actor = nn.Linear(64, output_size) 64 | self.critic = nn.Linear(64, 1) 65 | 66 | def forward(self, state): 67 | x = self.feature(state) 68 | policy = F.softmax(self.actor(x), dim=-1) 69 | value = self.critic(x) 70 | return policy, value 71 | 72 | 73 | # PAAC(Parallel Advantage Actor Critic) 74 | class ActorAgent(object): 75 | def __init__(self): 76 | self.model = ActorCriticNetwork(INPUT, OUTPUT) 77 | 78 | self.model.share_memory() 79 | 80 | self.output_size = OUTPUT 81 | self.input_size = INPUT 82 | 83 | def get_action(self, state): 84 | state = torch.from_numpy(state) 85 | state = state.float() 86 | policy, value = self.model(state) 87 | m = Categorical(policy) 88 | action = m.sample() 89 | return action.item() 90 | 91 | # after some time interval update the target model to be same with model 92 | def update_actor_model(self, target): 93 | self.model.load_state_dict(target.state_dict()) 94 | 95 | 96 | class LearnerAgent(object): 97 | def __init__(self): 98 | self.model = ActorCriticNetwork(INPUT, OUTPUT) 99 | # self.model.cuda() 100 | self.output_size = OUTPUT 101 | self.input_size = INPUT 102 | self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE) 103 | 104 | def train_model(self, s_batch, target_batch, y_batch, adv_batch): 105 | s_batch = torch.FloatTensor(s_batch) 106 | target_batch = torch.FloatTensor(target_batch) 107 | y_batch = torch.LongTensor(y_batch) 108 | adv_batch = torch.FloatTensor(adv_batch) 109 | 110 | # for multiply advantage 111 | policy, value = self.model(s_batch) 112 | m = Categorical(policy) 113 | 114 | # mse = nn.SmoothL1Loss() 115 | mse = nn.MSELoss() 116 | 117 | # Actor loss 118 | actor_loss = -m.log_prob(y_batch) * adv_batch.sum(1) 119 | 120 | # Entropy(for more exploration) 121 | entropy = m.entropy() 122 | # Critic loss 123 | critic_loss = mse(value, target_batch) 124 | 125 | # Total loss 126 | loss = actor_loss.mean() + 0.5 * critic_loss - 0.01 * entropy.mean() 127 | self.optimizer.zero_grad() 128 | loss.backward() 129 | 130 | self.optimizer.step() 131 | 132 | 133 | class Environment(object): 134 | def __init__(self, env, idx): 135 | self.env = env 136 | self.obs = self.env.reset() 137 | self.next_obs = None 138 | self.done = False 139 | self.env_idx = idx 140 | self.step = 0 141 | self.episode = 0 142 | self.rall = 0 143 | self.recent_rlist = deque(maxlen=100) 144 | self.recent_rlist.append(0) 145 | 146 | def run(self, agent): 147 | sample 
= [] 148 | for _ in range(NUM_STEP): 149 | self.step += 1 150 | action = agent.get_action(self.obs) 151 | self.next_obs, reward, self.done, _ = self.env.step(action) 152 | self.rall += reward 153 | 154 | # negative reward 155 | if self.done and self.step < self.env.spec.timestep_limit: 156 | reward = -100 157 | 158 | sample.append([self.obs[:], action, reward, self.next_obs[:], self.done]) 159 | 160 | self.obs = self.next_obs 161 | 162 | if self.done: 163 | self.episode += 1 164 | if self.env_idx == 0: 165 | self.recent_rlist.append(self.rall) 166 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}" 167 | .format(self.episode, self.rall, np.mean(self.recent_rlist))) 168 | 169 | self.obs = self.env.reset() 170 | self.done = False 171 | self.step = 0 172 | self.rall = 0 173 | 174 | return make_batch(sample, agent) 175 | 176 | 177 | def runner(env, cond, memory, actor): 178 | while True: 179 | with cond: 180 | sample = env.run(actor) 181 | memory.put(sample) 182 | 183 | # wait runner 184 | cond.wait() 185 | 186 | 187 | def learner(cond, memory, actor_agent, learner_agent): 188 | while True: 189 | if memory.full(): 190 | s_batch, target_batch, y_batch, adv_batch = [], [], [], [] 191 | # while memory.qsize() != 0: 192 | # if you use MacOS, use under condition. 193 | while not memory.empty(): 194 | batch = memory.get() 195 | 196 | s_batch.extend(batch[0]) 197 | target_batch.extend(batch[1]) 198 | y_batch.extend(batch[2]) 199 | adv_batch.extend(batch[3]) 200 | 201 | # train 202 | learner_agent.train_model(s_batch, target_batch, y_batch, adv_batch) 203 | actor_agent.update_actor_model(learner_agent.model) 204 | # resume running 205 | with cond: 206 | cond.notify_all() 207 | 208 | 209 | def main(): 210 | num_envs = NUM_ENV 211 | memory = mp.Queue(maxsize=NUM_ENV) 212 | cond = mp.Condition() 213 | 214 | # make agent and share memory 215 | actor_agent = ActorAgent() 216 | learner_agent = LearnerAgent() 217 | 218 | # sync model 219 | actor_agent.update_actor_model(learner_agent.model) 220 | 221 | # make envs 222 | envs = [Environment(gym.make('CartPole-v1'), i) for i in range(num_envs)] 223 | 224 | # Learner Process(only Learn) 225 | learn_proc = mp.Process(target=learner, args=(cond, memory, actor_agent, learner_agent)) 226 | 227 | # Runner Process(just run, not learn) 228 | runners = [] 229 | for idx, env in enumerate(envs): 230 | run_proc = mp.Process(target=runner, args=(env, cond, memory, actor_agent)) 231 | runners.append(run_proc) 232 | run_proc.start() 233 | 234 | learn_proc.start() 235 | 236 | for proc in runners: 237 | proc.join() 238 | 239 | learn_proc.join() 240 | 241 | 242 | if __name__ == '__main__': 243 | torch.manual_seed(23) 244 | env = gym.make('CartPole-v1') 245 | # Hyper parameter 246 | INPUT = env.observation_space.shape[0] 247 | OUTPUT = env.action_space.n 248 | DISCOUNT = 0.99 249 | NUM_STEP = 5 250 | NUM_ENV = 1 251 | EPSILON = 1e-5 252 | ALPHA = 0.99 253 | LEARNING_RATE = 0.0007 254 | env.close() 255 | 256 | main() 257 | -------------------------------------------------------------------------------- /CartPole/CartPole_PolicyGradient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | 7 | env = gym.make('CartPole-v0') 8 | 9 | # 하이퍼 파라미터 10 | LEARNING_RATE = 0.005 11 | INPUT = env.observation_space.shape[0] 12 | OUTPUT = env.action_space.n 13 | DISCOUNT = 0.99 14 | 15 | 16 | def discount_rewards(r): 
17 | '''Discounted reward를 구하기 위한 함수 18 | 19 | Args: 20 | r(np.array): reward 값이 저장된 array 21 | 22 | Returns: 23 | discounted_r(np.array): Discounted 된 reward가 저장된 array 24 | ''' 25 | discounted_r = np.zeros_like(r, dtype=np.float32) 26 | running_add = 0 27 | for t in reversed(range(len(r))): 28 | running_add = running_add * DISCOUNT + r[t] 29 | discounted_r[t] = running_add 30 | 31 | return discounted_r 32 | 33 | 34 | def train_episodic(PGagent, x, y, adv): 35 | '''에피소드당 학습을 하기위한 함수 36 | 37 | Args: 38 | PGagent(PolicyGradient): 학습될 네트워크 39 | x(np.array): State가 저장되어있는 array 40 | y(np.array): Action(one_hot)이 저장되어있는 array 41 | adv(np.array) : Discounted reward가 저장되어있는 array 42 | 43 | Returns: 44 | l(float): 네트워크에 의한 loss 45 | ''' 46 | l,_ = PGagent.sess.run([PGagent.loss, PGagent.train], feed_dict={PGagent.X: x, PGagent.Y: y, PGagent.adv : adv}) 47 | return l 48 | 49 | def play_cartpole(PGagent): 50 | '''학습된 네트워크로 Play하기 위한 함수 51 | 52 | Args: 53 | PGagent(PolicyGradient): 학습된 네트워크 54 | ''' 55 | print("Play Cartpole!") 56 | episode = 0 57 | while True: 58 | s = env.reset() 59 | done = False 60 | rall = 0 61 | episode += 1 62 | while not done: 63 | env.render() 64 | action_p = PGagent.sess.run(PGagent.a_pre, feed_dict={PGagent.X : s}) 65 | s1, reward, done, _ = env.step(np.argmax(action_p)) 66 | s = s1 67 | rall += reward 68 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall)) 69 | 70 | class PolicyGradient: 71 | def __init__(self, sess, input_size, output_size): 72 | self.sess = sess 73 | self.input_size = input_size 74 | self.output_size = output_size 75 | 76 | self.build_network() 77 | 78 | def build_network(self): 79 | self.X = tf.placeholder('float',[None, self.input_size]) 80 | self.Y = tf.placeholder('float', [None, self.output_size]) 81 | self.adv = tf.placeholder('float') 82 | 83 | w1 = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 84 | w2 = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer()) 85 | 86 | l1 = tf.nn.relu(tf.matmul(self.X, w1)) 87 | self.a_pre = tf.nn.softmax(tf.matmul(l1,w2)) 88 | 89 | self.log_p = self.Y * tf.log(self.a_pre) 90 | self.log_lik = self.log_p * self.adv 91 | self.loss = tf.reduce_mean(tf.reduce_sum(-self.log_lik, axis=1)) 92 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 93 | 94 | def get_action(self, state): 95 | state_t = np.reshape(state, [1, self.input_size]) 96 | action_p = self.sess.run(self.a_pre, feed_dict={self.X : state_t}) 97 | 98 | # 각 액션의 확률로 액션을 결정 99 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 100 | 101 | return action 102 | 103 | def main(): 104 | with tf.Session() as sess: 105 | PGagent = PolicyGradient(sess, INPUT, OUTPUT) 106 | 107 | sess.run(tf.global_variables_initializer()) 108 | episode = 0 109 | recent_rlist = deque(maxlen=100) 110 | recent_rlist.append(0) 111 | 112 | # 최근 100개의 점수가 195점 넘을 때까지 학습 113 | while np.mean(recent_rlist) <= 195: 114 | episode += 1 115 | episode_memory = deque() 116 | rall = 0 117 | s = env.reset() 118 | done = False 119 | 120 | while not done: 121 | # 액션 선택 122 | action = PGagent.get_action(s) 123 | 124 | # action을 one_hot으로 표현 125 | y = np.zeros(OUTPUT) 126 | y[action] = 1 127 | 128 | s1, reward, done, _ = env.step(action) 129 | rall += reward 130 | 131 | # 에피소드 메모리에 저장 132 | episode_memory.append([s, y, reward]) 133 | s = s1 134 | 135 | # 에피소드가 끝났을때 학습 136 | if done: 137 | episode_memory = np.array(episode_memory) 138 | 139 | 
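# Standardizing the discounted returns below (zero mean, unit variance) is a
# simple variance-reduction step: timesteps whose return is above the
# episode's mean have their actions reinforced, those below the mean are
# discouraged, and the gradient scale stays comparable across episodes.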
discounted_rewards = discount_rewards(np.vstack(episode_memory[:,2])) 140 | 141 | discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 142 | 1e-7) 143 | 144 | l = train_episodic(PGagent, np.vstack(episode_memory[:,0]), np.vstack(episode_memory[:,1]), 145 | discounted_rewards) 146 | 147 | recent_rlist.append(rall) 148 | 149 | print("[Episode {0:6f}] Reward: {1:4f} Loss: {2:5.5f} Recent Reward: {3:4f}".format(episode, rall, l, 150 | np.mean(recent_rlist))) 151 | 152 | play_cartpole(PGagent) 153 | 154 | if __name__ == "__main__": 155 | main() 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /CartPole/CartPole_Q-Network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import gym 4 | import numpy as np 5 | 6 | env = gym.make('CartPole-v0') 7 | 8 | # 네트워크 구성 9 | 10 | x=tf.placeholder(dtype=tf.float32, shape=(1,4)) 11 | 12 | input = env.observation_space.shape[0] 13 | 14 | W1=tf.get_variable('W1',shape=[input,10],initializer=tf.contrib.layers.xavier_initializer()) 15 | W2=tf.get_variable('W2',shape=[10,20],initializer=tf.contrib.layers.xavier_initializer()) 16 | W3=tf.get_variable('W3',shape=[20,15],initializer=tf.contrib.layers.xavier_initializer()) 17 | W4=tf.get_variable('W4',shape=[15,env.action_space.n],initializer=tf.contrib.layers.xavier_initializer()) 18 | 19 | 20 | L1=tf.nn.relu(tf.matmul(x,W1)) 21 | L2=tf.nn.relu(tf.matmul(L1,W2)) 22 | L3=tf.nn.relu(tf.matmul(L2,W3)) 23 | Q_pre = tf.matmul(L3,W4) 24 | 25 | 26 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n)) 27 | 28 | # 하이퍼 파라미터 정의 29 | learning_rate = 0.1 30 | num_episode = 2000 31 | e = 0.1 32 | discount_factor = 0.99 33 | rlist=[] 34 | 35 | # 손실 함수 정의 36 | cost = tf.reduce_sum(tf.square(y-Q_pre)) 37 | optimizer = tf.train.AdamOptimizer(learning_rate) 38 | train = optimizer.minimize(cost) 39 | 40 | init = tf.global_variables_initializer() 41 | 42 | with tf.Session() as sess: 43 | # 변수 초기화 44 | sess.run(init) 45 | for step in range(num_episode): 46 | # stats 초기화 47 | s = env.reset() 48 | # e-greedy 49 | e = 1. / ((step/50)+10) 50 | rall = 0 51 | d = False 52 | j=0 53 | s_t = sess.run(tf.expand_dims(s, axis=0)) 54 | while not d: 55 | # env.render() 56 | j+=1 57 | 58 | # reshape을 통한 state 전처리 59 | 60 | # 현재 state에 대한 Q값 예측 61 | Q = sess.run(Q_pre, feed_dict={x:s_t}) 62 | 63 | # e-greedy 를 통한 랜덤한 action 64 | if e > np.random.rand(1): 65 | a = env.action_space.sample() 66 | else: 67 | a = np.argmax(Q) 68 | 69 | # action 수행 70 | s1, r, d, _ = env.step(a) 71 | 72 | 73 | if d: 74 | # 에피소드가 끝났을때 Negative reward 부여 75 | Q[0, a] = -100 76 | else: 77 | # next_state값의 전처리 후 Q-learning 78 | s1_t = sess.run(tf.expand_dims(s1, axis=0)) 79 | Q1 = sess.run(Q_pre, feed_dict={x: s1_t}) 80 | Q[0, a] = r + discount_factor * np.max(Q1) 81 | 82 | sess.run(train, feed_dict={x: s_t, y: Q}) 83 | 84 | rall += r 85 | 86 | s_t = s1_t 87 | 88 | slist=[] 89 | rlist.append(rall) 90 | print("Episode {} finished after {} timesteps with r={}. 
Running score: {}".format(step, j, rall, np.mean(rlist))) 91 | 92 | print("성공한 확률" + str(sum(rlist) / num_episode) + "%") 93 | 94 | -------------------------------------------------------------------------------- /CartPole/CartPole_Q-Network_reshape.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import gym 4 | import numpy as np 5 | 6 | env = gym.make('CartPole-v0') 7 | 8 | # 네트워크 구성 9 | 10 | x=tf.placeholder(dtype=tf.float32, shape=(1,4)) 11 | 12 | input = env.observation_space.shape[0] 13 | 14 | W1=tf.get_variable('W1',shape=[input,10],initializer=tf.contrib.layers.xavier_initializer()) 15 | W2=tf.get_variable('W2',shape=[10,20],initializer=tf.contrib.layers.xavier_initializer()) 16 | W3=tf.get_variable('W3',shape=[20,15],initializer=tf.contrib.layers.xavier_initializer()) 17 | W4=tf.get_variable('W4',shape=[15,env.action_space.n],initializer=tf.contrib.layers.xavier_initializer()) 18 | 19 | 20 | L1=tf.nn.relu(tf.matmul(x,W1)) 21 | L2=tf.nn.relu(tf.matmul(L1,W2)) 22 | L3=tf.nn.relu(tf.matmul(L2,W3)) 23 | Q_pre = tf.matmul(L3,W4) 24 | 25 | 26 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n)) 27 | 28 | # 하이퍼 파라미터 정의 29 | learning_rate = 0.1 30 | num_episode = 2000 31 | e = 0.1 32 | discount_factor = 0.99 33 | rlist=[] 34 | 35 | # 손실 함수 정의 36 | cost = tf.reduce_sum(tf.square(y-Q_pre)) 37 | optimizer = tf.train.AdamOptimizer(learning_rate) 38 | train = optimizer.minimize(cost) 39 | 40 | init = tf.global_variables_initializer() 41 | 42 | with tf.Session() as sess: 43 | # 변수 초기화 44 | sess.run(init) 45 | for step in range(num_episode): 46 | # stats 초기화 47 | s = env.reset() 48 | # e-greedy 49 | e = 1. / ((step/50)+10) 50 | rall = 0 51 | d = False 52 | j=0 53 | 54 | while not d: 55 | # env.render() 56 | j+=1 57 | 58 | # reshape을 통한 state 전처리 59 | s_t = np.reshape(s,[1,input]) 60 | # 현재 state에 대한 Q값 예측 61 | Q = sess.run(Q_pre, feed_dict={x:s_t}) 62 | 63 | # e-greedy 를 통한 랜덤한 action 64 | if e > np.random.rand(1): 65 | a = env.action_space.sample() 66 | else: 67 | a = np.argmax(Q) 68 | 69 | # action 수행 70 | s1, r, d, _ = env.step(a) 71 | 72 | 73 | if d: 74 | # 에피소드가 끝났을때 Negative reward 부여 75 | Q[0, a] = -100 76 | else: 77 | # next_state값의 전처리 후 Q-learning 78 | s1_t= np.reshape(s1,[1,input]) 79 | Q1 = sess.run(Q_pre, feed_dict={x: s1_t}) 80 | Q[0, a] = r + discount_factor * np.max(Q1) 81 | 82 | sess.run(train, feed_dict={x: s_t, y: Q}) 83 | 84 | rall += r 85 | 86 | s = s1 87 | 88 | slist=[] 89 | rlist.append(rall) 90 | print("Episode {} finished after {} timesteps with r={}. 
Running score: {}".format(step, j, rall, np.mean(rlist))) 91 | 92 | print("성공한 확률" + str(sum(rlist) / num_episode) + "%") 93 | 94 | -------------------------------------------------------------------------------- /CartPole/Cartpole_A2C_nstep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | 7 | env = gym.make('CartPole-v1') 8 | 9 | # 하이퍼 파라미터 10 | LEARNING_RATE = 0.001 11 | INPUT = env.observation_space.shape[0] 12 | OUTPUT = env.action_space.n 13 | DISCOUNT = 0.99 14 | NSTEP = 5 15 | 16 | 17 | def train_nstep(A2Cagent, sample): 18 | sample = np.stack(sample) 19 | discounted_return = np.empty([NSTEP, 1]) 20 | 21 | s = np.reshape(np.stack(sample[:, 0]), [NSTEP, A2Cagent.input_size]) 22 | s1 = np.reshape(np.stack(sample[:, 3]), [NSTEP, A2Cagent.input_size]) 23 | y = np.reshape(np.stack(sample[:, 1]), [NSTEP, A2Cagent.output_size]) 24 | r = np.reshape(np.stack(sample[:, 2]), [NSTEP, 1]) 25 | d = np.reshape(np.stack(sample[:, 4]), [NSTEP, 1]) 26 | 27 | value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s}) 28 | next_value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s1}) 29 | 30 | # Discounted Return 계산 31 | running_add = next_value[NSTEP - 1, 0] * d[NSTEP - 1, 0] 32 | for t in range(4, -1, -1): 33 | if d[t]: 34 | running_add = 0 35 | running_add = r[t] + DISCOUNT * running_add 36 | discounted_return[t, 0] = running_add 37 | 38 | # For critic 39 | target = r + DISCOUNT * d * next_value 40 | 41 | # For Actor 42 | adv = discounted_return - value 43 | 44 | A2Cagent.sess.run([A2Cagent.train], feed_dict={A2Cagent.X: s, A2Cagent.r: target, A2Cagent.Y: y, A2Cagent.adv: adv}) 45 | 46 | 47 | class ActorCritic: 48 | def __init__(self, sess, input_size, output_size): 49 | self.sess = sess 50 | self.input_size = input_size 51 | self.output_size = output_size 52 | 53 | self.build_network() 54 | 55 | def build_network(self): 56 | self.X = tf.placeholder('float', [None, self.input_size]) 57 | self.Y = tf.placeholder('float', [None, self.output_size]) 58 | self.adv = tf.placeholder('float') 59 | self.r = tf.placeholder('float') 60 | 61 | # Actor Weight 62 | w1_a = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 63 | w2_a = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer()) 64 | 65 | # Critic Weight 66 | w1_c = tf.get_variable('w1_c', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 67 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer()) 68 | 69 | # Actor Critic Network 70 | l1_a = tf.nn.selu(tf.matmul(self.X, w1_a)) 71 | l1_c = tf.nn.selu(tf.matmul(self.X, w1_c)) 72 | 73 | self.a = tf.matmul(l1_a, w2_a) 74 | self.a_prob = tf.nn.softmax(tf.matmul(l1_a, w2_a)) 75 | self.v = tf.matmul(l1_c, w2_c) 76 | 77 | # Actor loss 78 | self.log_lik = tf.nn.softmax_cross_entropy_with_logits(labels=self.Y, logits=self.a) 79 | self.p_loss = tf.reduce_mean(self.log_lik * self.adv) 80 | 81 | # Critic loss 82 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1) 83 | 84 | # entropy(for more exploration) 85 | self.entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.a_prob, logits=self.a)) 86 | 87 | self.loss = self.p_loss + self.v_loss - self.entropy * 0.01 88 | self.train = 
tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 89 | 90 | def get_action(self, state): 91 | state_t = np.reshape(state, [1, self.input_size]) 92 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t}) 93 | 94 | # 각 액션의 확률로 액션을 결정 95 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 96 | 97 | return action 98 | 99 | 100 | def main(): 101 | with tf.Session() as sess: 102 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT) 103 | 104 | A2Cagent.sess.run(tf.global_variables_initializer()) 105 | episode = 0 106 | step = 0 107 | recent_rlist = deque(maxlen=100) 108 | recent_rlist.append(0) 109 | 110 | sample = [] 111 | 112 | # 최근 100개의 점수가 195점 넘을 때까지 학습 113 | while np.mean(recent_rlist) <= 195: 114 | episode += 1 115 | 116 | rall = 0 117 | count = 0 118 | s = env.reset() 119 | done = False 120 | 121 | while not done: 122 | count += 1 123 | step += 1 124 | # 액션 선택 125 | action = A2Cagent.get_action(s) 126 | 127 | # action을 one_hot으로 표현 128 | y = np.zeros(OUTPUT) 129 | y[action] = 1 130 | s1, reward, done, _ = env.step(action) 131 | rall += reward 132 | 133 | sample.append([s, y, reward, s1, done]) 134 | 135 | # negative reward 136 | if done and count < env.spec.timestep_limit: 137 | reward = -100 138 | 139 | if step % 5 == 0: 140 | train_nstep(A2Cagent, sample) 141 | sample = [] 142 | 143 | s = s1 144 | 145 | recent_rlist.append(rall) 146 | 147 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f}".format(episode, rall, np.mean(recent_rlist))) 148 | 149 | 150 | if __name__ == "__main__": 151 | main() 152 | -------------------------------------------------------------------------------- /CartPole/Cartpole_A2C_onestep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | 7 | env = gym.make('CartPole-v1') 8 | 9 | # 하이퍼 파라미터 10 | LEARNING_RATE = 0.001 11 | INPUT = env.observation_space.shape[0] 12 | OUTPUT = env.action_space.n 13 | DISCOUNT = 0.99 14 | print(env.spec.timestep_limit) 15 | 16 | def train_onestep(A2Cagent, s, y, r, s1, d): 17 | s = np.reshape(s, [1, A2Cagent.input_size]) 18 | s1 = np.reshape(s1, [1, A2Cagent.input_size]) 19 | y = np.reshape(y, [1, A2Cagent.output_size]) 20 | 21 | value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s}) 22 | if d: 23 | target = r 24 | adv = r - value 25 | else: 26 | next_value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X : s1}) 27 | target = r + DISCOUNT * next_value 28 | adv = r + DISCOUNT * next_value - value 29 | 30 | A2Cagent.sess.run([A2Cagent.train], feed_dict={A2Cagent.X: s, A2Cagent.r: target, A2Cagent.Y: y, A2Cagent.adv: adv}) 31 | 32 | class ActorCritic: 33 | def __init__(self, sess, input_size, output_size): 34 | self.sess = sess 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | 38 | self.build_network() 39 | 40 | def build_network(self): 41 | 42 | self.X = tf.placeholder('float', [None, self.input_size]) 43 | self.Y = tf.placeholder('float', [None, self.output_size]) 44 | self.adv = tf.placeholder('float') 45 | self.r = tf.placeholder('float') 46 | 47 | # Actor Weight 48 | w1_a = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer()) 49 | w2_a = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer()) 50 | 51 | # Critic Weight 52 | w1_c = tf.get_variable('w1_c', shape=[self.input_size, 128], 
initializer=tf.contrib.layers.xavier_initializer()) 53 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer()) 54 | 55 | # Actor Critic Network 56 | l1_a = tf.nn.relu(tf.matmul(self.X, w1_a)) 57 | l1_c = tf.nn.relu(tf.matmul(self.X, w1_c)) 58 | self.a_prob = tf.nn.softmax(tf.matmul(l1_a, w2_a)) 59 | self.v = tf.matmul(l1_c, w2_c) 60 | 61 | # Policy loss 62 | self.log_p = self.Y * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.)) 63 | self.log_lik = self.log_p * self.adv 64 | self.p_loss = -tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1)) 65 | 66 | # Value loss 67 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1) 68 | 69 | # entropy(for more exploration) 70 | self.entropy = -tf.reduce_mean( 71 | tf.reduce_sum(self.a_prob * tf.log(tf.clip_by_value(self.a_prob, 1e-10, 1.)), axis=1)) 72 | 73 | self.loss = self.p_loss + self.v_loss - self.entropy * 0.01 74 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 75 | 76 | def get_action(self, state): 77 | state_t = np.reshape(state, [1, self.input_size]) 78 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t}) 79 | 80 | # 각 액션의 확률로 액션을 결정 81 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 82 | 83 | return action 84 | 85 | 86 | def main(): 87 | with tf.Session() as sess: 88 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT) 89 | 90 | A2Cagent.sess.run(tf.global_variables_initializer()) 91 | episode = 0 92 | recent_rlist = deque(maxlen=100) 93 | recent_rlist.append(0) 94 | 95 | # 최근 100개의 점수가 195점 넘을 때까지 학습 96 | while np.mean(recent_rlist) <= 195: 97 | episode += 1 98 | 99 | rall = 0 100 | count = 0 101 | s = env.reset() 102 | done = False 103 | i = 1 104 | while not done: 105 | count += 1 106 | # 액션 선택 107 | action = A2Cagent.get_action(s) 108 | 109 | # action을 one_hot으로 표현 110 | y = np.zeros(OUTPUT) 111 | y[action] = 1 112 | 113 | s1, reward, done, _ = env.step(action) 114 | rall += reward 115 | 116 | # negative reward 117 | if done and count < env.spec.timestep_limit: 118 | reward = -100 119 | 120 | train_onestep(A2Cagent, s, y, reward, s1, done) 121 | 122 | s = s1 123 | 124 | recent_rlist.append(rall) 125 | 126 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f}".format(episode, rall, np.mean(recent_rlist))) 127 | 128 | if __name__ == "__main__": 129 | main() 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /CartPole/cartpole_dqn.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/CartPole/cartpole_dqn.py -------------------------------------------------------------------------------- /CartPole/cartpole_ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | import torch.multiprocessing as mp 8 | 9 | import torch.nn as nn 10 | import torch 11 | 12 | from collections import deque 13 | 14 | from torch.distributions.categorical import Categorical 15 | 16 | 17 | def make_batch(sample, agent): 18 | sample = np.stack(sample) 19 | discounted_return = np.empty([NUM_STEP, 1]) 20 | 21 | s = np.reshape(np.stack(sample[:, 0]), [NUM_STEP, agent.input_size]) 22 | s1 = np.reshape(np.stack(sample[:, 3]), [NUM_STEP, agent.input_size]) 23 | y = sample[:, 1] 24 | r = 
np.reshape(np.stack(sample[:, 2]), [NUM_STEP, 1]) 25 | d = np.reshape(np.stack(sample[:, 4]), [NUM_STEP, 1]) 26 | with torch.no_grad(): 27 | state = torch.from_numpy(s) 28 | state = state.float() 29 | _, value = agent.model_old(state) 30 | 31 | next_state = torch.from_numpy(s1) 32 | next_state = next_state.float() 33 | _, next_value = agent.model_old(next_state) 34 | 35 | value = value.data.numpy() 36 | next_value = next_value.data.numpy() 37 | 38 | # Discounted Return 39 | gae = 0 40 | for t in range(NUM_STEP - 1, -1, -1): 41 | delta = r[t] + DISCOUNT * next_value[t] * (1 - d[t]) - value[t] 42 | gae = delta + DISCOUNT * LAM * (1 - d[t]) * gae 43 | discounted_return[t, 0] = gae + value[t] 44 | 45 | # For critic 46 | target = r + DISCOUNT * (1 - d) * next_value 47 | 48 | # For Actor 49 | adv = discounted_return - value 50 | # adv = (adv - adv.mean()) / (adv.std() + 1e-5) 51 | 52 | return [s, target, y, adv] 53 | 54 | 55 | class ActorCriticNetwork(nn.Module): 56 | def __init__(self, input_size, output_size): 57 | super(ActorCriticNetwork, self).__init__() 58 | self.feature = nn.Sequential( 59 | nn.Linear(input_size, 64), 60 | nn.ReLU(), 61 | nn.Linear(64, 64), 62 | nn.ReLU() 63 | ) 64 | self.actor = nn.Linear(64, output_size) 65 | self.critic = nn.Linear(64, 1) 66 | 67 | def forward(self, state): 68 | x = self.feature(state) 69 | policy = F.softmax(self.actor(x), dim=-1) 70 | value = self.critic(x) 71 | return policy, value 72 | 73 | 74 | # PAAC(Parallel Advantage Actor Critic) 75 | class ActorAgent(object): 76 | def __init__(self): 77 | self.model_old = ActorCriticNetwork(INPUT, OUTPUT) 78 | self.model_old.share_memory() 79 | 80 | self.output_size = OUTPUT 81 | self.input_size = INPUT 82 | 83 | def get_action(self, state): 84 | state = torch.from_numpy(state) 85 | state = state.float() 86 | policy, value = self.model_old(state) 87 | m = Categorical(policy) 88 | action = m.sample() 89 | return action.item() 90 | 91 | # after some time interval update the target model to be same with model 92 | def update_actor_model(self, target): 93 | self.model_old.load_state_dict(target.state_dict()) 94 | 95 | @staticmethod 96 | def weights_init(m): 97 | class_name = m.__class__.__name__ 98 | if class_name.find('Linear') != -1: 99 | torch.nn.init.kaiming_uniform(m.weight) 100 | print(m) 101 | elif class_name.find('Conv') != -1: 102 | torch.nn.init.kaiming_uniform(m.weight) 103 | print(m) 104 | 105 | 106 | class LearnerAgent(object): 107 | def __init__(self): 108 | self.model = ActorCriticNetwork(INPUT, OUTPUT) 109 | # self.model.cuda() 110 | self.output_size = OUTPUT 111 | self.input_size = INPUT 112 | self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE, eps=1e-5) 113 | 114 | def train_model(self, s_batch, target_batch, y_batch, adv_batch, actor_agent): 115 | s_batch = torch.FloatTensor(s_batch) 116 | target_batch = torch.FloatTensor(target_batch) 117 | adv_batch = torch.FloatTensor(adv_batch) 118 | with torch.no_grad(): 119 | policy_old, value_old = actor_agent.model_old(s_batch) 120 | m_old = Categorical(policy_old) 121 | y_batch_old = torch.LongTensor(y_batch) 122 | log_prob_old = m_old.log_prob(y_batch_old) 123 | 124 | # for multiply advantage 125 | policy, value = self.model(s_batch) 126 | m = Categorical(policy) 127 | y_batch = m.sample() 128 | log_prob = m.log_prob(y_batch) 129 | entropy = m.entropy().mean() 130 | 131 | for i in range(EPOCH): 132 | minibatch = random.sample(range(len(s_batch)), BATCH_SIZE) 133 | ratio = torch.exp(log_prob[minibatch] - log_prob_old[minibatch]) 134 
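# The probability ratio above compares the current policy with the snapshot
# taken before the update (pi_new / pi_old). The clipped surrogate on the
# next lines keeps that ratio inside [1 - EPSILON, 1 + EPSILON] and takes the
# minimum of the clipped and unclipped terms, so a single minibatch cannot
# push the policy arbitrarily far from the one that collected the data --
# this is the core of PPO's clipped objective.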
| 135 | surr1 = ratio * adv_batch[minibatch].sum(1) 136 | surr2 = torch.clamp(ratio, 1.0 - EPSILON, 1.0 + EPSILON) * adv_batch[minibatch].sum(1) 137 | 138 | actor_loss = -torch.min(surr1, surr2).mean() 139 | critic_loss = F.mse_loss(value[minibatch], target_batch[minibatch])  # use the learner's own value head so the critic actually receives gradients 140 | 141 | self.optimizer.zero_grad() 142 | loss = actor_loss + V_COEF * critic_loss - 0.01 * entropy 143 | loss.backward(retain_graph=True) 144 | self.optimizer.step() 145 | 146 | 147 | class Environment(object): 148 | def __init__(self, env, idx): 149 | self.env = env 150 | self.obs = self.env.reset() 151 | self.next_obs = None 152 | self.done = False 153 | self.env_idx = idx 154 | self.step = 0 155 | self.episode = 0 156 | self.rall = 0 157 | self.recent_rlist = deque(maxlen=100) 158 | self.recent_rlist.append(0) 159 | 160 | def run(self, agent): 161 | sample = [] 162 | for _ in range(NUM_STEP): 163 | self.step += 1 164 | action = agent.get_action(self.obs) 165 | self.next_obs, reward, self.done, _ = self.env.step(action) 166 | self.rall += reward 167 | 168 | # treat early termination as a failure: no reward if the episode ends before the time limit 169 | if self.done and self.step < self.env.spec.timestep_limit: 170 | reward = 0 171 | 172 | sample.append([self.obs[:], action, reward, self.next_obs[:], self.done]) 173 | 174 | self.obs = self.next_obs 175 | 176 | if self.done: 177 | self.episode += 1 178 | if self.env_idx == 0: 179 | self.recent_rlist.append(self.rall) 180 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}" 181 | .format(self.episode, self.rall, np.mean(self.recent_rlist))) 182 | 183 | self.obs = self.env.reset() 184 | self.done = False 185 | self.step = 0 186 | self.rall = 0 187 | 188 | return make_batch(sample, agent) 189 | 190 | 191 | def runner(env, cond, memory, actor): 192 | while True: 193 | with cond: 194 | sample = env.run(actor) 195 | memory.put(sample) 196 | 197 | # wait until the learner finishes its update 198 | cond.wait() 199 | 200 | 201 | def learner(cond, memory, actor_agent, learner_agent): 202 | while True: 203 | if memory.full(): 204 | s_batch, target_batch, y_batch, adv_batch = [], [], [], [] 205 | # while memory.qsize() != 0: 206 | # On macOS, multiprocessing.Queue.qsize() is not implemented, so the branches below avoid it.
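# The queue is created in main() with maxsize=NUM_ENV and every runner puts exactly one rollout
# per iteration, so once memory.full() is true a single get() is enough when there is one
# environment, while the loop below drains the queue when there are several.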
207 | if NUM_ENV == 1: 208 | batch = memory.get() 209 | s_batch.extend(batch[0]) 210 | target_batch.extend(batch[1]) 211 | y_batch.extend(batch[2]) 212 | adv_batch.extend(batch[3]) 213 | else: 214 | while not memory.empty(): 215 | batch = memory.get() 216 | s_batch.extend(batch[0]) 217 | target_batch.extend(batch[1]) 218 | y_batch.extend(batch[2]) 219 | adv_batch.extend(batch[3]) 220 | 221 | # train 222 | learner_agent.train_model(s_batch, target_batch, y_batch, adv_batch, actor_agent) 223 | actor_agent.update_actor_model(learner_agent.model) 224 | # resume running 225 | with cond: 226 | cond.notify_all() 227 | 228 | 229 | def main(): 230 | num_envs = NUM_ENV 231 | memory = mp.Queue(maxsize=NUM_ENV) 232 | cond = mp.Condition() 233 | 234 | # make agent and share memory 235 | actor_agent = ActorAgent() 236 | learner_agent = LearnerAgent() 237 | 238 | # sync model 239 | actor_agent.update_actor_model(learner_agent.model) 240 | 241 | # make envs 242 | envs = [Environment(gym.make(ENV_ID), i) for i in range(num_envs)] 243 | 244 | # Learner Process(only Learn) 245 | learn_proc = mp.Process(target=learner, args=(cond, memory, actor_agent, learner_agent)) 246 | 247 | # Runner Process(just run, not learn) 248 | runners = [] 249 | for idx, env in enumerate(envs): 250 | run_proc = mp.Process(target=runner, args=(env, cond, memory, actor_agent)) 251 | runners.append(run_proc) 252 | run_proc.start() 253 | 254 | learn_proc.start() 255 | 256 | for proc in runners: 257 | proc.join() 258 | 259 | learn_proc.join() 260 | 261 | 262 | if __name__ == '__main__': 263 | torch.manual_seed(23) 264 | ENV_ID = 'CartPole-v0' 265 | env = gym.make(ENV_ID) 266 | # Hyper parameter 267 | INPUT = env.observation_space.shape[0] 268 | OUTPUT = env.action_space.n 269 | DISCOUNT = 0.99 270 | NUM_STEP = 128 271 | NUM_ENV = 4 272 | LAM = 0.95 273 | EPOCH = 5 274 | BATCH_SIZE = 32 275 | V_COEF = 1.0 276 | EPSILON = 0.2 277 | ALPHA = 0.99 278 | LEARNING_RATE = 0.0007 279 | env.close() 280 | 281 | main() 282 | -------------------------------------------------------------------------------- /CartPole/play_Cartpole.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import gym 3 | import numpy as np 4 | 5 | env = gym.make("CartPole-v0") 6 | 7 | print(env.observation_space) 8 | INPUT = env.observation_space.shape[0] 9 | OUTPUT = env.action_space.n 10 | 11 | # 하이퍼파라미터 12 | LEARNING_LATE = 0.001 13 | DISCOUNT = 0.99 14 | 15 | # Main 네트워크 16 | x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT)) 17 | 18 | y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT)) 19 | dropout = tf.placeholder(dtype=tf.float32) 20 | 21 | W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer()) 22 | W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer()) 23 | # W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer()) 24 | W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer()) 25 | 26 | b1 = tf.Variable(tf.zeros([1],dtype=tf.float32)) 27 | b2 = tf.Variable(tf.zeros([1],dtype=tf.float32)) 28 | 29 | _L1=tf.nn.relu(tf.matmul(x,W1)+b1) 30 | L1=tf.nn.dropout(_L1,dropout) 31 | _L2=tf.nn.relu(tf.matmul(L1,W2)+b2) 32 | L2=tf.nn.dropout(_L2,dropout) 33 | # L3=tf.nn.relu(tf.matmul(L2,W3)) 34 | Q_pre = tf.matmul(L2,W4) 35 | 36 | saver = tf.train.Saver() 37 | model_path = "save/model.ckpt" 38 | with tf.Session() as sess: 39 | rlist=[] 40 | 
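# Note: saver.restore() below only loads variable values; the network defined above must match
# the graph that produced save/model.ckpt (same variable names and shapes), presumably the
# checkpoint written by one of the CartPole training scripts in this repository.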
sess.run(tf.global_variables_initializer()) 41 | saver.restore(sess, model_path) 42 | 43 | print("Model restored from file: ", model_path) 44 | for episode in range(500): 45 | # state 초기화 46 | s = env.reset() 47 | e= 0.1 48 | rall = 0 49 | d = False 50 | count = 0 51 | # 에피소드가 끝나기 전까지 반복 52 | while not d and count < 5000: 53 | env.render() 54 | count += 1 55 | # state 값의 전처리 56 | s_t = np.reshape(s, [1, INPUT]) 57 | 58 | # 현재 상태의 Q값을 예측 59 | Q = sess.run(Q_pre, feed_dict={x: s_t, dropout: 1}) 60 | 61 | if e > np.random.rand(1): 62 | a = env.action_space.sample() 63 | else: 64 | a = np.argmax(Q) 65 | 66 | 67 | # 결정된 action으로 Environment에 입력 68 | 69 | s, r, d, _ = env.step(a) 70 | 71 | # 총 reward 합 72 | rall += r 73 | 74 | rlist.append(rall) 75 | 76 | print("Episode : {} steps : {} r={}. average reward : {}".format(episode, count, rall, 77 | np.mean(rlist))) 78 | 79 | -------------------------------------------------------------------------------- /FrozenLake/FL_Q-Table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import random as pr 6 | 7 | 8 | env = gym.make('FrozenLake-v1') 9 | # env.monitor.start('tmp/Frozenlake8x8-0.2', force= True) 10 | # Q-Table 초기화 11 | Q = np.zeros([env.observation_space.n,env.action_space.n]) 12 | 13 | 14 | num_episodes = 1000 15 | 16 | # reward 값과 state 값들을 저장 해놓을 list 17 | 18 | rList = [] 19 | sList = [] 20 | 21 | # Q값이 모두 같을때 랜덤한 action을 구해주기 위한 함수 22 | def rargmax(vector): 23 | m = np.amax(vector) 24 | indices = np.nonzero(vector ==m)[0] 25 | return pr.choice(indices) 26 | 27 | for i in range(num_episodes): 28 | # Environment 초기화와 변수 초기화 29 | s = env.reset() 30 | rAll = 0 31 | d = False 32 | j = 0 33 | sList=[] 34 | # The Q-Table 알고리즘 35 | while not d and j<250: 36 | j+=1 37 | # 가장 Q값이 높은 action을 결정함 38 | a = rargmax(Q[s,:]) 39 | 40 | # action을 통해서 next_state, reward, done, info를 받아온다 41 | s1,r,d,_ = env.step(a) 42 | if r == 1: 43 | print(sList) 44 | # Q-Learning 45 | Q[s,a]= r+ np.max(Q[s1,:]) 46 | s=s1 47 | rAll=rAll+r 48 | sList.append(s) 49 | 50 | rList.append(rAll) 51 | 52 | 53 | print ("Final Q-Table Values") 54 | print (" left down right up") 55 | print (Q) 56 | print("성공한 확률 : ", sum(rList)/num_episodes) 57 | plt.bar(range(len(rList)),rList, color="Blue") 58 | plt.show() -------------------------------------------------------------------------------- /FrozenLake/FL_Q-table_Stochastic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import random as pr 6 | 7 | env = gym.make('FrozenLake-v0') 8 | # env.monitor.start('tmp/Frozenlake8x8-0.2', force= True) 9 | # Q-Table 초기화 10 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 11 | 12 | num_episodes = 2000 13 | discount = 0.99 14 | learning_rate = 0.85 15 | 16 | # reward 값과 state 값들을 저장 해놓을 list 17 | 18 | rList = [] 19 | sList = [] 20 | 21 | 22 | # Q값이 모두 같을때 랜덤한 action을 구해주기 위한 함수 23 | def rargmax(vector): 24 | m = np.amax(vector) 25 | indices = np.nonzero(vector == m)[0] 26 | return pr.choice(indices) 27 | 28 | 29 | for i in range(num_episodes): 30 | # Environment 초기화와 변수 초기화 31 | s = env.reset() 32 | rAll = 0 33 | d = False 34 | j = 0 35 | sList = [] 36 | e = 1.
/ ((i / 10) + 1) 37 | # The Q-Table 알고리즘 38 | while not d and j < 250: 39 | j += 1 40 | 41 | # 가장 Q값이 높은 action을 결정함 42 | # exploration 을 통한 랜덤한 움직임 결정 43 | if e > np.random.rand(1): 44 | a = env.action_space.sample() 45 | else: 46 | a = rargmax(Q[s, :]) 47 | 48 | # action을 통해서 next_state, reward, done, info를 받아온다 49 | s1, r, d, _ = env.step(a) 50 | if r == 1: 51 | print("episode : ",i," state record : " ,sList) 52 | # Q-Learning 53 | # discount factor를 적용하여 최단거리로 학습을 할 수 있음(미래에 대한 가중치) 54 | Q[s, a] = Q[s,a]*(1-learning_rate) + learning_rate * (r + discount * np.max(Q[s1, :])) 55 | s = s1 56 | rAll = rAll + r 57 | sList.append(s) 58 | 59 | rList.append(rAll) 60 | 61 | print ("Final Q-Table Values") 62 | print (" left down right up") 63 | print (Q) 64 | print("성공한 확률 : ", sum(rList) / num_episodes) 65 | plt.bar(range(len(rList)), rList, color="Blue") 66 | plt.show() -------------------------------------------------------------------------------- /FrozenLake/FL_Q-table_exp&dis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import random as pr 6 | 7 | env = gym.make('FrozenLake-v1') 8 | # env.monitor.start('tmp/Frozenlake8x8-0.2', force= True) 9 | # Q-Table 초기화 10 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 11 | 12 | num_episodes = 1000 13 | discount = 0.99 14 | 15 | # reward 값과 state 값들을 저장 해놓을 list 16 | 17 | rList = [] 18 | sList = [] 19 | 20 | 21 | # Q값이 모두 같을때 랜덤한 action을 구해주기 위한 함수 22 | def rargmax(vector): 23 | m = np.amax(vector) 24 | indices = np.nonzero(vector == m)[0] 25 | return pr.choice(indices) 26 | 27 | 28 | for i in range(num_episodes): 29 | # Environment 초기화와 변수 초기화 30 | s = env.reset() 31 | rAll = 0 32 | d = False 33 | j = 0 34 | sList = [] 35 | e = 1.
/ ((i / 10) + 1) 36 | # The Q-Table 알고리즘 37 | while not d and j < 250: 38 | j += 1 39 | 40 | # 가장 Q값이 높은 action을 결정함 41 | # exploration 을 통한 랜덤한 움직임 결정 42 | if e > np.random.rand(1): 43 | a = env.action_space.sample() 44 | else: 45 | a = rargmax(Q[s, :]) 46 | 47 | # action을 통해서 next_state, reward, done, info를 받아온다 48 | s1, r, d, _ = env.step(a) 49 | if r == 1: 50 | print(sList) 51 | # Q-Learning 52 | # discount factor를 적용하여 최단거리로 학습을 할 수 있음(미래에 대한 가중치) 53 | Q[s, a] = r + discount * np.max(Q[s1, :]) 54 | s = s1 55 | rAll = rAll + r 56 | sList.append(s) 57 | 58 | rList.append(rAll) 59 | 60 | print ("Final Q-Table Values") 61 | print (" left down right up") 62 | print (Q) 63 | print("성공한 확률 : ", len(rList) / num_episodes) 64 | plt.bar(range(len(rList)), rList, color="Blue") 65 | plt.show() -------------------------------------------------------------------------------- /FrozenLake/FrozenLake_Q-Network.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 27, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import tensorflow as tf\n", 14 | "import gym\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 28, 21 | "metadata": { 22 | "collapsed": false, 23 | "deletable": true, 24 | "editable": true 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "[2017-02-15 21:14:22,417] Making new env: FrozenLake-v0\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "env = gym.make('FrozenLake-v0')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 29, 42 | "metadata": { 43 | "collapsed": false, 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "Tensor(\"Placeholder_13:0\", shape=(1, 16), dtype=float32) Tensor(\"Placeholder_14:0\", dtype=float32) Tensor(\"Variable_7/read:0\", shape=(16, 4), dtype=float32)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "x=tf.placeholder(dtype=tf.float32, shape=(1,env.observation_space.n))\n", 58 | "W=tf.Variable(tf.random_uniform((env.observation_space.n, env.action_space.n)))\n", 59 | "\n", 60 | "Q_pre = tf.matmul(x,W)\n", 61 | "\n", 62 | "\n", 63 | "y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))\n", 64 | "\n", 65 | "print(x,y,W)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 39, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "learning_rate = 0.1\n", 77 | "num_episode = 2000\n", 78 | "e = 0.1\n", 79 | "discount_factor = 0.99 \n", 80 | "rlist=[]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 31, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "cost = tf.reduce_sum(tf.square(y-Q_pre))\n", 92 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", 93 | "train = optimizer.minimize(cost)\n", 94 | "\n", 95 | "init = tf.global_variables_initializer()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 34, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "def one_hot(x):\n", 107 | " return np.identity(env.observation_space.n)[x:x+1]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 40, 113 | "metadata": 
{ 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "\u001b[41mS\u001b[0mFFF\n", 122 | "FHFH\n", 123 | "FFFH\n", 124 | "HFFG\n", 125 | " (Up)\n" 126 | ] 127 | }, 128 | { 129 | "ename": "KeyboardInterrupt", 130 | "evalue": "", 131 | "traceback": [ 132 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 133 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 134 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0mQ\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mr\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdiscount_factor\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mQ1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0msess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mone_hot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mQ\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mrall\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 135 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 766\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 767\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 768\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 136 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 962\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 963\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m--> 964\u001b[0;31m feed_dict_string, options, run_metadata)\n\u001b[0m\u001b[1;32m 965\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 137 | 
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1012\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1013\u001b[0m return self._do_call(_run_fn, self._session, feed_dict, fetch_list,\n\u001b[0;32m-> 1014\u001b[0;31m target_list, options, run_metadata)\n\u001b[0m\u001b[1;32m 1015\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1016\u001b[0m return self._do_call(_prun_fn, self._session, handle, feed_dict,\n", 138 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1019\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1021\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1022\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 139 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1001\u001b[0m return tf_session.TF_Run(session, options,\n\u001b[1;32m 1002\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1003\u001b[0;31m status, run_metadata)\n\u001b[0m\u001b[1;32m 1004\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1005\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 140 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 141 | ], 142 | "output_type": "error" 143 | } 144 | ], 145 | "source": [ 146 | "with tf.Session() as sess:\n", 147 | " sess.run(init)\n", 148 | " env.render()\n", 149 | " for step in range(num_episode):\n", 150 | " s=env.reset()\n", 151 | " rall = 0\n", 152 | " d = False\n", 153 | " \n", 154 | " while not d:\n", 155 | " Q = sess.run(Q_pre, 
feed_dict={x:one_hot(s)})\n", 156 | " \n", 157 | " if e > np.random.rand(1):\n", 158 | " a = env.action_space.sample()\n", 159 | " else:\n", 160 | " a = np.argmax(Q)\n", 161 | " \n", 162 | " s1,r,d,_ = env.step(a)\n", 163 | " \n", 164 | " Q1 = sess.run(Q_pre, feed_dict={x:one_hot(s1)})\n", 165 | " \n", 166 | " if d:\n", 167 | " Q[0,a]=r\n", 168 | " else:\n", 169 | " Q[0,a]=r + discount_factor * np.argmax(Q1)\n", 170 | " \n", 171 | " sess.run(train, feed_dict={x:one_hot(s), y:Q})\n", 172 | " \n", 173 | " rall+=r\n", 174 | " s=s1\n", 175 | " \n", 176 | " rlist.append(rall)\n", 177 | " \n", 178 | " \n", 179 | "print(\"성공한 확률\"+ str(sum(rlist)/num_episode)+\"%\")" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 3", 186 | "language": "python", 187 | "name": "python3" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 3.0 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython3", 199 | "version": "3.6.0" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 0 204 | } -------------------------------------------------------------------------------- /FrozenLake/FrozenLake_Q-Network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import gym 3 | import numpy as np 4 | 5 | env = gym.make('FrozenLake-v0') 6 | 7 | x=tf.placeholder(dtype=tf.float32, shape=(1,env.observation_space.n)) 8 | 9 | W1=tf.Variable(tf.random_uniform((env.observation_space.n, env.action_space.n),-0.1,0.1)) 10 | Q_pre = tf.matmul(x,W1) 11 | 12 | 13 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n)) 14 | 15 | 16 | 17 | learning_rate = 0.1 18 | num_episode = 2000 19 | e = 0.1 20 | discount_factor = 0.99 21 | rlist=[] 22 | slist=[] 23 | 24 | cost = tf.reduce_sum(tf.square(y-Q_pre)) 25 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 26 | train = optimizer.minimize(cost) 27 | 28 | init = tf.global_variables_initializer() 29 | 30 | 31 | def one_hot(x): 32 | return np.identity(env.observation_space.n)[x:x + 1] 33 | 34 | 35 | with tf.Session() as sess: 36 | sess.run(init) 37 | for step in range(num_episode): 38 | 39 | s = env.reset() 40 | e = 1. / ((step/50)+10) 41 | rall = 0 42 | d = False 43 | j=0 44 | while not d: 45 | j+=1 46 | Q = sess.run(Q_pre, feed_dict={x: one_hot(s)}) 47 | 48 | if e > np.random.rand(1): 49 | a = env.action_space.sample() 50 | else: 51 | a = np.argmax(Q) 52 | 53 | s1, r, d, _ = env.step(a) 54 | 55 | if d: 56 | Q[0, a] = r 57 | else: 58 | Q1 = sess.run(Q_pre, feed_dict={x: one_hot(s1)}) 59 | Q[0, a] = r + discount_factor * np.max(Q1) 60 | 61 | sess.run(train, feed_dict={x: one_hot(s), y: Q}) 62 | 63 | rall += r 64 | slist.append(s) 65 | s = s1 66 | print(slist) 67 | slist=[] 68 | rlist.append(rall) 69 | print("Episode {} finished after {} timesteps with r={}. 
Running score: {}".format(step, j, rall, np.mean(rlist))) 70 | 71 | print("성공한 확률" + str(sum(rlist) / num_episode) + "%") 72 | 73 | -------------------------------------------------------------------------------- /Pong/Pong_A2C_episodic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | from skimage.transform import resize 7 | from skimage.color import rgb2gray 8 | 9 | env = gym.make('PongDeterministic-v4') 10 | 11 | # 하이퍼 파라미터 12 | LEARNING_RATE = 0.00025 13 | INPUT = env.observation_space.shape[0] 14 | OUTPUT = 2 # 액션수를 두개로 줄임. UP or DOWN 15 | DISCOUNT = 0.99 16 | 17 | HEIGHT = 84 18 | WIDTH = 84 19 | HISTORY_SIZE = 4 20 | 21 | model_path = 'save/pong-pg.ckpt' 22 | 23 | def pre_proc(X): 24 | '''입력데이터 전처리. 25 | 26 | Args: 27 | X(np.array): 받아온 이미지를 그레이 스케일링 후 84X84로 크기변경 28 | 그리고 정수값으로 저장하기위해(메모리 효율 높이기 위해) 255를 곱함 29 | 30 | Returns: 31 | np.array: 변경된 이미지 32 | ''' 33 | # 바로 전 frame과 비교하여 max를 취함으로써 flickering을 제거 34 | # x = np.maximum(X, X1) 35 | # 그레이 스케일링과 리사이징을 하여 데이터 크기 수정 36 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255) 37 | 38 | return x 39 | 40 | def discount_rewards(r): 41 | '''Discounted reward를 구하기 위한 함수 42 | 43 | Args: 44 | r(np.array): reward 값이 저장된 array 45 | 46 | Returns: 47 | discounted_r(np.array): Discounted 된 reward가 저장된 array 48 | ''' 49 | discounted_r = np.zeros_like(r, dtype=np.float32) 50 | running_add = 0 51 | for t in reversed(range(len(r))): 52 | running_add = running_add * DISCOUNT + r[t] 53 | discounted_r[t] = running_add 54 | 55 | return discounted_r 56 | 57 | def get_init_state(history, s): 58 | '''에피소드 시작 State를 초기화. 59 | 60 | Args: 61 | history(np.array): 5개의 프레임이 저장될 array 62 | s(list): 초기화된 이미지 63 | 64 | Note: 65 | history[:,:,:3]에 모두 초기화된 이미지(s)를 넣어줌 66 | ''' 67 | for i in range(HISTORY_SIZE): 68 | history[:, :, i] = pre_proc(s) 69 | 70 | 71 | def train_episodic(A2Cagent, x, y, r): 72 | '''에피소드당 학습을 하기위한 함수 73 | 74 | Args: 75 | A2Cagent(ActorCritic): 학습될 네트워크 76 | x(np.array): State가 저장되어있는 array 77 | y(np.array): Action(one_hot)이 저장되어있는 array 78 | r(np.array) : Discounted reward가 저장되어있는 array 79 | 80 | Returns: 81 | l(float): 네트워크에 의한 loss 82 | ''' 83 | l, _ = A2Cagent.sess.run([A2Cagent.loss, A2Cagent.train], feed_dict={A2Cagent.X: x, A2Cagent.Y: y, A2Cagent.r: r}) 84 | return l 85 | 86 | 87 | def play_atari(A2Cagent): 88 | '''학습된 네트워크로 Play하기 위한 함수 89 | 90 | Args: 91 | PGagent(PolicyGradient): 학습된 네트워크 92 | ''' 93 | print("Play Atari!") 94 | episode = 0 95 | while True: 96 | s = env.reset() 97 | history = np.zeros([84, 84, 5], dtype=np.uint8) 98 | done = False 99 | rall = 0 100 | episode += 1 101 | get_init_state(history, s) 102 | while not done: 103 | env.render() 104 | action_p = A2Cagent.sess.run( 105 | A2Cagent.a_prob,feed_dict={A2Cagent.X: np.reshape(np.float32(history[:,:,:4] / 255.), [-1, 84, 84, 4])}) 106 | s1, reward, done, _ = env.step(np.argmax(action_p)+2) 107 | history[:, :, 4] = pre_proc(s1) 108 | history[:, :, :4] = history[:, :, 1:] 109 | rall += reward 110 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall)) 111 | 112 | 113 | class ActorCritic: 114 | def __init__(self, sess, input_size, output_size): 115 | self.sess = sess 116 | self.input_size = input_size 117 | self.output_size = output_size 118 | self.height = HEIGHT 119 | self.width = WIDTH 120 | self.history_size = HISTORY_SIZE 121 | self.build_network() 122 | 123 | def 
build_network(self): 124 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size]) 125 | self.Y = tf.placeholder('float', [None, self.output_size]) 126 | self.r = tf.placeholder('float') 127 | 128 | # Actor network 129 | f1_a = tf.get_variable("f1_a", shape=[1, 1, 4, 1], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 130 | f2_a = tf.get_variable("f2_a", shape=[4, 4, 1, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 131 | f3_a = tf.get_variable("f3_a", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 132 | w1_a = tf.get_variable("w1_a", shape=[6 * 6 * 32, 256], initializer=tf.contrib.layers.xavier_initializer()) 133 | w2_a = tf.get_variable("w2_a", shape=[256, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 134 | 135 | c1_a = tf.nn.relu(tf.nn.conv2d(self.X, f1_a, strides=[1, 1, 1, 1], padding="VALID")) 136 | c2_a = tf.nn.relu(tf.nn.conv2d(c1_a, f2_a, strides=[1, 4, 4, 1], padding="VALID")) 137 | c3_a = tf.nn.relu(tf.nn.conv2d(c2_a, f3_a, strides=[1, 3, 3, 1], padding="VALID")) 138 | 139 | l1_a = tf.reshape(c3_a, [-1, w1_a.get_shape().as_list()[0]]) 140 | l2_a = tf.nn.relu(tf.matmul(l1_a, w1_a)) 141 | self.a_prob = tf.nn.softmax(tf.matmul(l2_a, w2_a)) 142 | 143 | # Critic network(like DQN network) 144 | f1_c = tf.get_variable("f1_c", shape=[8, 8, 4, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 145 | f2_c = tf.get_variable("f2_c", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 146 | w1_c = tf.get_variable("w1_c", shape=[9 * 9 * 32, 256], initializer=tf.contrib.layers.xavier_initializer()) 147 | w2_c = tf.get_variable("w2_c", shape=[256, 1], initializer=tf.contrib.layers.xavier_initializer()) 148 | 149 | c1_c = tf.nn.relu(tf.nn.conv2d(self.X, f1_c, strides=[1, 4, 4, 1], padding="VALID")) 150 | c2_c = tf.nn.relu(tf.nn.conv2d(c1_c, f2_c, strides=[1, 2, 2, 1], padding="VALID")) 151 | 152 | l1_c = tf.reshape(c2_c, [-1, w1_c.get_shape().as_list()[0]]) 153 | l2_c = tf.nn.relu(tf.matmul(l1_c, w1_c)) 154 | self.v = tf.matmul(l2_c, w2_c) 155 | 156 | # A_t = R_t - V(S_t) 157 | self.adv = self.r - self.v 158 | 159 | # Policy loss 160 | self.log_p = self.Y * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.)) 161 | self.log_lik = self.log_p * tf.stop_gradient(self.adv) 162 | self.p_loss = -tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1)) 163 | 164 | # Value loss 165 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1) 166 | 167 | # Total loss 168 | self.loss = self.p_loss + self.v_loss 169 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 170 | self.saver = tf.train.Saver() 171 | 172 | def get_action(self, state, max_prob): 173 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: np.reshape(np.float32(state / 255.), 174 | [-1, HEIGHT, WIDTH, HISTORY_SIZE])}) 175 | # 각 액션의 확률로 액션을 결정 176 | max_prob.append(np.max(action_p)) 177 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 178 | return action 179 | 180 | def main(): 181 | with tf.Session() as sess: 182 | # VRAM이 부족하면 CPU로 학습 183 | # with tf.Session(config = tf.ConfigProto(device_count ={'GPU' : 0})) as sess: 184 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT) 185 | 186 | A2Cagent.sess.run(tf.global_variables_initializer()) 187 | 188 | episode = 0 189 | recent_rlist = deque(maxlen=100) 190 | recent_rlist.append(0) 191 | 192 | # 최근 100개의 점수가 19점 넘을 때까지 학습 193 | while np.mean(recent_rlist) <= 19: 194 | episode += 1 195 | 196 | state_memory = 
deque() 197 | action_memory = deque() 198 | reward_memory = deque() 199 | 200 | # 공의 움직임을 알아보기 위한 History 201 | history = np.zeros([84, 84, HISTORY_SIZE+1], dtype=np.uint8) 202 | rall, count = 0, 0 203 | done = False 204 | 205 | s = env.reset() 206 | max_prob = deque() 207 | get_init_state(history, s) 208 | 209 | while not done: 210 | # env.render() 211 | count += 1 212 | # 액션 선택 213 | action = A2Cagent.get_action(history[:,:,:HISTORY_SIZE], max_prob) 214 | 215 | # action을 one_hot으로 표현 216 | y = np.zeros(OUTPUT) 217 | y[action] = 1 218 | 219 | # 학습속도 개선을 위해 액션의 개수를 2개로 줄임 (UP or DOWN) 220 | s1, reward, done, l = env.step(action + 2) 221 | 222 | rall += reward 223 | reward = np.clip(reward, -1, 1) 224 | 225 | # 한 에피소드를 저장 226 | state_memory.append(np.copy(np.float32(history[:,:,:HISTORY_SIZE]/255.))) 227 | action_memory.append(np.copy(y)) 228 | reward_memory.append(np.copy(reward)) 229 | 230 | # 새로운 프레임을 히스토리 마지막에 넣어줌 231 | history[:, :, HISTORY_SIZE] = pre_proc(s1) 232 | history[:, :, :HISTORY_SIZE] = history[:, :, 1:] 233 | 234 | # 에피소드가 끝났을때 학습 235 | if done: 236 | # Discounted return을 구함 237 | rewards = discount_rewards(np.vstack(reward_memory)) 238 | 239 | train_episodic(A2Cagent, np.stack(state_memory, axis=0), np.stack(action_memory, axis =0), rewards) 240 | recent_rlist.append(rall) 241 | 242 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f} Max Prob: {3:5.10f}".format(episode, rall, np.mean(recent_rlist), np.mean(max_prob))) 243 | if episode % 10 == 0: 244 | A2Cagent.saver.save(A2Cagent.sess, model_path, global_step= episode) 245 | 246 | 247 | if __name__ == "__main__": 248 | main() 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Pong/Pong_PolicyGradient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | import gym 5 | from collections import deque 6 | from skimage.transform import resize 7 | from skimage.color import rgb2gray 8 | 9 | # {}Deterministic : frameskip = 4 10 | # {}-v4 : repeat_action_probability 11 | env = gym.make('PongDeterministic-v4') 12 | 13 | # 하이퍼 파라미터 14 | LEARNING_RATE = 0.00025 15 | INPUT = env.observation_space.shape 16 | OUTPUT = 2 17 | 18 | DISCOUNT = 0.99 19 | HEIGHT = 84 20 | WIDTH = 84 21 | HISTORY_SIZE = 4 22 | 23 | model_path = 'save/pong-pg.ckpt' 24 | 25 | 26 | def pre_proc(X): 27 | '''입력데이터 전처리. 28 | 29 | Args: 30 | X(np.array): 받아온 이미지를 그레이 스케일링 후 84X84로 크기변경 31 | 그리고 정수값으로 저장하기위해(메모리 효율 높이기 위해) 255를 곱함 32 | 33 | Returns: 34 | np.array: 변경된 이미지 35 | ''' 36 | # 바로 전 frame과 비교하여 max를 취함으로써 flickering을 제거 37 | # x = np.maximum(X, X1) 38 | # 그레이 스케일링과 리사이징을 하여 데이터 크기 수정 39 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255) 40 | 41 | return x 42 | 43 | def get_init_state(history, s): 44 | '''에피소드 시작 State를 초기화. 
45 | 46 | Args: 47 | history(np.array): 5개의 프레임이 저장될 array 48 | s(list): 초기화된 이미지 49 | 50 | Note: 51 | history[:,:,:3]에 모두 초기화된 이미지(s)를 넣어줌 52 | ''' 53 | for i in range(HISTORY_SIZE): 54 | history[:, :, i] = pre_proc(s) 55 | 56 | 57 | def discount_rewards(r): 58 | '''Discounted reward를 구하기 위한 함수 59 | 60 | Args: 61 | r(np.array): reward 값이 저장된 array 62 | 63 | Returns: 64 | discounted_r(np.array): Discounted 된 reward가 저장된 array 65 | ''' 66 | discounted_r = np.zeros_like(r, dtype=np.float32) 67 | running_add = 0 68 | for t in reversed(range(len(r))): 69 | 70 | if r[t] != 0: 71 | # 점수를 받으면 에피소드 내부의 작은 에피소드가 끝난 것으로 간주(for Pong) 72 | running_add = 0 73 | running_add = running_add * DISCOUNT + r[t] 74 | discounted_r[t] = running_add 75 | 76 | # normalizing 77 | discounted_r = discounted_r - discounted_r.mean() 78 | discounted_r = discounted_r / discounted_r.std() 79 | 80 | return discounted_r 81 | 82 | def train_episodic(PGagent, x, y, adv): 83 | '''에피소드당 학습을 하기위한 함수 84 | 85 | Args: 86 | PGagent(PolicyGradient): 학습될 네트워크 87 | x(np.array): State가 저장되어있는 array 88 | y(np.array): Action(one_hot)이 저장되어있는 array 89 | adv(np.array) : Discounted reward가 저장되어있는 array 90 | 91 | Returns: 92 | l(float): 네트워크에 의한 loss 93 | ''' 94 | 95 | l, _ = PGagent.sess.run([PGagent.loss, PGagent.train], feed_dict={PGagent.X: x, 96 | PGagent.Y: y, 97 | PGagent.adv: adv}) 98 | return l 99 | 100 | def play_atari(PGagent): 101 | '''학습된 네트워크로 Play하기 위한 함수 102 | 103 | Args: 104 | PGagent(PolicyGradient): 학습된 네트워크 105 | ''' 106 | print("Play Atari!") 107 | episode = 0 108 | while True: 109 | s = env.reset() 110 | history = np.zeros([84, 84, 5], dtype=np.uint8) 111 | done = False 112 | rall = 0 113 | episode += 1 114 | get_init_state(history, s) 115 | while not done: 116 | env.render() 117 | action_p = PGagent.sess.run( 118 | PGagent.a_pre,feed_dict={PGagent.X: np.reshape(np.float32(history[:,:,:4] / 255.), [-1, 84, 84, 4])}) 119 | s1, reward, done, _ = env.step(np.argmax(action_p)+2) 120 | history[:, :, 4] = pre_proc(s1) 121 | history[:, :, :4] = history[:, :, 1:] 122 | rall += reward 123 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall)) 124 | 125 | 126 | class PolicyGradient: 127 | def __init__(self, sess, input_size, output_size , name = 'main'): 128 | self.sess = sess 129 | self.input_size = input_size 130 | self.output_size = output_size 131 | self.height = HEIGHT 132 | self.width = WIDTH 133 | self.history_size = HISTORY_SIZE 134 | self.name = name 135 | self.build_network() 136 | 137 | def build_network(self): 138 | with tf.variable_scope(self.name): 139 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size]) 140 | self.Y = tf.placeholder('float', [None, self.output_size]) 141 | self.adv = tf.placeholder('float') 142 | 143 | f1 = tf.get_variable("f1", shape=[1, 1, 4, 1], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 144 | f2 = tf.get_variable("f2", shape=[4, 4, 1, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 145 | f3 = tf.get_variable("f3", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d()) 146 | w1 = tf.get_variable("w1", shape=[6*6*32, 256], initializer=tf.contrib.layers.xavier_initializer()) 147 | w2 = tf.get_variable("w2", shape=[256, OUTPUT], initializer=tf.contrib.layers.xavier_initializer()) 148 | 149 | # 1x1 conv layer 150 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 1, 1, 1], padding="VALID")) 151 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 3, 3, 1], padding="VALID")) 152 | c3 = 
tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 4, 4, 1], padding="VALID")) 153 | 154 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]]) 155 | l2 = tf.nn.relu(tf.matmul(l1, w1)) 156 | self.a_pre = tf.nn.softmax(tf.matmul(l2, w2)) 157 | 158 | # nan problem(log(0)) 159 | self.log_p = tf.log(tf.clip_by_value(self.a_pre, 1e-10, 1.)) * self.Y 160 | 161 | self.log_lik = -self.log_p * self.adv 162 | self.loss = tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1)) 163 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss) 164 | self.saver = tf.train.Saver() 165 | 166 | def get_action(self, state, max_prob): 167 | action_p = self.sess.run(self.a_pre, feed_dict={self.X: np.reshape(np.float32(state/255.), 168 | [-1,HEIGHT,WIDTH,HISTORY_SIZE])}) 169 | # 각 액션의 확률로 액션을 결정 170 | max_prob.append(np.max(action_p)) 171 | action = np.random.choice(np.arange(self.output_size), p=action_p[0]) 172 | 173 | return action 174 | # config = tf.ConfigProto(device_count ={'GPU' : 0}) 175 | def main(): 176 | with tf.Session() as sess: 177 | # VRAM이 부족하면 CPU로 학습 178 | # with tf.Session(config = tf.ConfigProto(device_count ={'GPU' : 0})) as sess: 179 | PGagent = PolicyGradient(sess, INPUT, OUTPUT) 180 | 181 | PGagent.sess.run(tf.global_variables_initializer()) 182 | 183 | episode = 0 184 | recent_rlist = deque(maxlen=100) 185 | recent_rlist.append(0) 186 | 187 | # 최근 100개의 점수가 19점 넘을 때까지 학습 188 | while np.mean(recent_rlist) <= 19: 189 | episode += 1 190 | 191 | state_memory = deque() 192 | action_memory = deque() 193 | reward_memory = deque() 194 | 195 | # 공의 움직임을 알아보기 위한 History 196 | history = np.zeros([84, 84, HISTORY_SIZE+1], dtype=np.uint8) 197 | rall, count = 0, 0 198 | done = False 199 | 200 | s = env.reset() 201 | max_prob = deque() 202 | get_init_state(history, s) 203 | 204 | while not done: 205 | # env.render() 206 | count += 1 207 | # 액션 선택 208 | action = PGagent.get_action(history[:,:,:HISTORY_SIZE], max_prob) 209 | 210 | # action을 one_hot으로 표현 211 | y = np.zeros(OUTPUT) 212 | y[action] = 1 213 | 214 | # 학습속도 개선을 위해 액션의 개수를 2개로 줄임 (UP or DOWN) 215 | s1, reward, done, l = env.step(action + 2) 216 | 217 | rall += reward 218 | reward = np.clip(reward, -1, 1) 219 | 220 | # 한 에피소드를 저장 221 | state_memory.append(np.copy(np.float32(history[:,:,:HISTORY_SIZE]/255.))) 222 | action_memory.append(np.copy(y)) 223 | reward_memory.append(np.copy(reward)) 224 | 225 | # 새로운 프레임을 히스토리 마지막에 넣어줌 226 | history[:, :, HISTORY_SIZE] = pre_proc(s1) 227 | history[:, :, :HISTORY_SIZE] = history[:, :, 1:] 228 | 229 | # 에피소드가 끝났을때 학습 230 | if done: 231 | # Discounted return을 구함 232 | rewards = discount_rewards(np.vstack(reward_memory)) 233 | 234 | l = train_episodic(PGagent, np.stack(state_memory, axis=0), 235 | np.stack(action_memory, axis =0), rewards) 236 | 237 | 238 | recent_rlist.append(rall) 239 | 240 | print("[Episode {0:6d}] Step:{4:6d} Reward: {1:4f} Loss: {2:5.5f} Recent Reward: {3:4f} Max Prob: {5:5.5f}". 
241 | format(episode, rall, l, np.mean(recent_rlist), count, np.mean(max_prob))) 242 | 243 | if episode % 10 == 0: 244 | PGagent.saver.save(PGagent.sess, model_path, global_step= episode) 245 | play_atari(PGagent) 246 | 247 | 248 | if __name__ == "__main__": 249 | main() 250 | 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning 2 | ###### 여러 환경에 적용해보는 강화학습 예제(파이토치로 옮기고 있습니다) 3 | # [Here is my new Repo for Policy Gradient!!](https://github.com/jcwleo/mario_rl) 4 | ------------------- 5 | 6 | ![Alt text](/readme/Play.gif) 7 | ###### [Breakout / Use DQN(Nature2015)] 8 | 9 | --------------- 10 | ## 1. Q-Learning / SARSA 11 | * FrozenLake(Gridword) 12 | * [Deterministic Q-Learning](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FL_Q-Table.py) 13 | * [Add Exploration & Discounted Factor](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FL_Q-table_exp%26dis.py) 14 | * [Stochastic Q-Learning](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FL_Q-table_Stochastic.py) 15 | * WindyGridWorld(in Sutton's book) 16 | * [Q-Learning / SARSA](https://github.com/jcwleo/Reinforcement_Learning/tree/master/Windygridworld) 17 | ## 2. Q-Network (Action-Value Function Approximation) 18 | * [FrozenLake(Gridword)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FrozenLake_Q-Network.py) 19 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_Q-Network.py) 20 | 21 | ## 3. DQN 22 | DQN(NIPS2013)은 (Experience Replay Memory / CNN) 을 사용. 23 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_DQN_NIPS2013.py) - Cartpole 같은 경우에는 CNN을 사용하지 않고 센서 정보를 통해서 학습 24 | 25 | DQN(Nature2015)은 (Experience Replay Memory / Target Network / CNN) 을 사용 26 | 27 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_DQN_Nature2015.py) 28 | * [Breakout(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Breakout/Breakout_DQN_class.py) 29 | * [Breakout(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Breakout/breakout_dqn_pytorch.py) 30 | * this code is made by pytorch and more efficient memory and train 31 | 32 | ## 5. Vanilla Policy Gradient(REINFORCE) 33 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_PolicyGradient.py) 34 | * [Pong(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Pong/Pong_PolicyGradient.py) 35 | * [Breakout(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Breakout/Breakout_PolicyGradient.py) 36 | 37 | ## 6. Advantage Actor Critic 38 | * episodic 39 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_A2C_episodic.py) 40 | * [Pong(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Pong/Pong_A2C_episodic.py) 41 | * one-step 42 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/Cartpole_A2C_onestep.py) 43 | * n-step 44 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/Cartpole_A2C_nstep.py) 45 | 46 | ## 7. 
Deep Deterministic Policy Gradient 47 | * [Pendulum(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/pendulum/pendulum_ddpg.py) 48 | 49 | ## 8. Parallel Advantage Actor Critic(is called 'A2C' in OpenAI) 50 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_PAAC.py)(used a single thread instead of multi thread) 51 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_PAAC_multiproc.py)(used multiprocessing in pytorch) 52 | * [Super Mario Bros](https://github.com/jcwleo/mario_rl)(used multiprocessing in pytorch) 53 | 54 | ## 9. C51(Distributional RL) 55 | * DDQN 56 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_C51.py) 57 | 58 | ## 10. PPO(Proximal Policy Optimization) 59 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/cartpole_ppo.py) 60 | -------------------------------------------------------------------------------- /Windygridworld/OptimalPolicy/optimal_Q-Learning.txt: -------------------------------------------------------------------------------- 1 | RRRRRRRRRD 2 | RRRRRRRDRD 3 | RRRRRRLURD 4 | UURRRRLLRD 5 | RRRRDLLLRD 6 | RRRRLLLLLD 7 | DDDLLLLLLL 8 | -------------------------------------------------------------------------------- /Windygridworld/OptimalPolicy/optimal_SARSA.txt: -------------------------------------------------------------------------------- 1 | LULRRRRRUD 2 | RLRRRRURDD 3 | LULURRLRLD 4 | DUDRRLLLRD 5 | RURLRLLLLD 6 | LLRDLLLLLD 7 | RRLLLLLLLL 8 | -------------------------------------------------------------------------------- /Windygridworld/Q-learning_sarsa.py: -------------------------------------------------------------------------------- 1 | from windygridworld import WindyGridWorld 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import random as rn 5 | 6 | 7 | def rargmax(vector): 8 | '''모두 같은 Q값일 때, 랜덤하게 액션을 정해주는 함수 9 | 10 | Args: 11 | vector(ndarray): Q-table 12 | 13 | Returns: 14 | action: 랜덤하게 정해진 action값 15 | 16 | ''' 17 | m = np.amax(vector) 18 | indices = np.nonzero(vector == m)[0] 19 | return rn.choice(indices) 20 | 21 | 22 | def array2index(array, width): 23 | ''' 24 | 25 | Args: 26 | array: gridworld array 27 | width: gridworld의 너비 28 | 29 | Returns: 30 | idx: 2D array인 array를 인덱스 값으로 바꾼 값 31 | ''' 32 | idx = array[0] * width + array[1] 33 | return idx 34 | 35 | 36 | def learning(max_step, learning_type,render): 37 | env = WindyGridWorld() 38 | 39 | # Q-table 생성 40 | Q = np.zeros([env.observation_space, env.action_space]) 41 | 42 | global_step = 0 43 | 44 | # 하이퍼파라미터 45 | alpha = 0.5 46 | epsilon = 0.1 47 | 48 | episode = 0 49 | plot_graph = [] 50 | 51 | while global_step <= max_step: 52 | episode += 1 53 | 54 | # 에피소드 리셋 55 | state = env.reset() 56 | 57 | done = False 58 | step = 0 59 | total_reward = 0 60 | while not done: 61 | if render: 62 | env.render() 63 | 64 | step += 1 65 | global_step += 1 66 | plot_graph.append(episode) 67 | 68 | # e-greedy 방법으로 action 결정 69 | if epsilon > np.random.rand(1): 70 | action = np.random.randint(env.action_space) 71 | else: 72 | action = rargmax(Q[array2index(state, env.width), :]) 73 | 74 | # 실제 액션 수행 75 | next_state, reward, done, _ = env.step(action) 76 | 77 | total_reward += reward 78 | 79 | # Q-learning일 때와 SARSA일 때를 구별하여 학습 80 | if learning_type == 'Q-Learning': 81 | # Q-learning 82 | # Q(s,a) = Q(s,a) + a * (reward + 
max_a(Q(s',a)) - Q(s,a)) 83 | Q[array2index(state, env.width), action] += ( 84 | alpha * (reward + np.max(Q[array2index(next_state, env.width), :]) 85 | - Q[array2index(state, env.width), action])) 86 | else: 87 | # SARSA 88 | # Q(s,a) = Q(s,a) + a * (reward + Q(s',a') - Q(s,a)) 89 | Q[array2index(state, env.width), action] += ( 90 | alpha * (reward + (Q[array2index(next_state, env.width), action]) 91 | - Q[array2index(state, env.width), action])) 92 | 93 | state = next_state[:] 94 | 95 | print('Learning Type : {} Episode : {:5.0f} Step : {:5.0f} reward : {:5.0f}' 96 | .format(learning_type,episode,step,total_reward)) 97 | 98 | # 학습된 Q값 저장 99 | np.save('QValue/{}_value'.format(learning_type), Q) 100 | np.savetxt('QValue/{}_value.txt'.format(learning_type), Q) 101 | 102 | direction = np.array(['L', 'U', 'R', 'D']) 103 | 104 | # 학습된 Optimal한 action 추출 105 | Q = np.argmax(Q, axis=1) 106 | optimal_policy = np.chararray([env.observation_space], unicode=True) 107 | for i in range(env.action_space): 108 | optimal_policy[Q == i] = direction[i] 109 | 110 | optimal_policy = optimal_policy.reshape([env.height, env.width]) 111 | 112 | # Optimal policy를 txt로 저장 113 | np.savetxt('OptimalPolicy/optimal_{}.txt'.format(learning_type), optimal_policy, delimiter='', fmt='%s') 114 | 115 | return plot_graph 116 | 117 | 118 | def main(): 119 | # 학습시킬 step 수 120 | max_step = 20000 121 | 122 | # 움직임을 실제 보고싶을시 True로 변경 123 | render = False 124 | 125 | # 각각 Q_learning과 Sarsa 학습 126 | q_graph = learning(max_step, 'Q-Learning',render) 127 | sarsa_graph = learning(max_step, 'SARSA', render) 128 | 129 | # Q_learning과 Sarsa 학습 그래프 Plot 130 | plt.xlim([0, max_step * 1.1]) 131 | plt.plot(q_graph, 'b', label='Q-learning') 132 | plt.plot(sarsa_graph, 'g', label='SARSA') 133 | plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, 134 | ncol=2, mode="expand", borderaxespad=0.) 
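# plot_graph stores the current episode index once per time step, so each curve shows cumulative
# episodes (y) against global time steps (x); a steeper curve means the agent finishes episodes
# in fewer steps, mirroring the windy gridworld figure in Sutton's book.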
135 | plt.savefig('graph.png') 136 | plt.show() 137 | 138 | 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /Windygridworld/QValue/Q-Learning_value.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/Windygridworld/QValue/Q-Learning_value.npy -------------------------------------------------------------------------------- /Windygridworld/QValue/Q-Learning_value.txt: -------------------------------------------------------------------------------- 1 | -1.606109091396425725e+01 -1.610001232580505715e+01 -1.584762083464220339e+01 -1.630073073954796925e+01 2 | -1.541752194222167915e+01 -1.538488113843050087e+01 -1.498136231988978295e+01 -1.529896876498860436e+01 3 | -1.466609184134511246e+01 -1.441765939708626121e+01 -1.399886281708481128e+01 -1.422399696898425248e+01 4 | -1.403975840801068209e+01 -1.304280009147792541e+01 -1.300000000000000000e+01 -1.381113840589452835e+01 5 | -1.398668998954349618e+01 -1.299979408507867618e+01 -1.200000000000000000e+01 -1.298531652716749818e+01 6 | -1.299976087288599125e+01 -1.199998733593630185e+01 -1.100000000000000000e+01 -1.199966338906388685e+01 7 | -1.199999982237042673e+01 -1.099999994034184070e+01 -1.000000000000000000e+01 -1.099999976158142090e+01 8 | -1.099999466260687875e+01 -9.999997589438164525e+00 -9.000000000000000000e+00 -9.999999498486896954e+00 9 | -9.999998310057273798e+00 -8.999999970180244446e+00 -8.000000000000000000e+00 -8.999999978395445410e+00 10 | -8.999998486836986444e+00 -7.999999981186673992e+00 -7.999999999693950592e+00 -7.000000000000000000e+00 11 | -1.626172585608986054e+01 -1.658059207685153780e+01 -1.597713827371924644e+01 -1.660554227998717991e+01 12 | -1.643381402014644266e+01 -1.557235125894735184e+01 -1.499991033583573596e+01 -1.543447798906282209e+01 13 | -1.517692580039580896e+01 -1.460284124118269844e+01 -1.399999999126103489e+01 -1.429390618888084674e+01 14 | -1.493757265185002225e+01 -1.399999736862767818e+01 -1.300000000000000000e+01 -1.398218581690473705e+01 15 | -1.334645919114836055e+01 -1.274473947216430680e+01 -1.199999999873805479e+01 -1.286422890813454067e+01 16 | -1.299607909255507820e+01 -1.197791104340741697e+01 -1.100000000000000000e+01 -1.198316236699534088e+01 17 | -1.094934285876594871e+01 -1.017485070106623724e+01 -9.997129935204984719e+00 -1.048353891316603281e+01 18 | -8.018247879035385495e+00 -6.017249159629500355e+00 -8.468204935173119452e+00 -5.283426284790039062e+00 19 | -7.383547711009669001e+00 -7.299712672339803810e+00 -6.999999777473749241e+00 -7.499088532475981950e+00 20 | -8.999998306851560770e+00 -7.999999938540030442e+00 -6.999999985098838806e+00 -6.000000000000000000e+00 21 | -1.698537158189627405e+01 -1.699999441852381565e+01 -1.600000000000000000e+01 -1.788986110612843561e+01 22 | -1.699956194601221071e+01 -1.599624101016233624e+01 -1.500000000000000000e+01 -1.699456858716657592e+01 23 | -1.599018404768081503e+01 -1.499480831099609190e+01 -1.400000000000000000e+01 -1.499915410552886819e+01 24 | -1.422119196246004691e+01 -1.304116248367921038e+01 -1.299999992047652242e+01 -1.303508915023628667e+01 25 | -1.399923735533061375e+01 -1.299974958775605494e+01 -1.200000000000000000e+01 -1.298437499053001076e+01 26 | -1.235948743736836164e+01 -1.174968104199036389e+01 -1.099930108758036340e+01 -1.139215684404715212e+01 27 | 0.000000000000000000e+00 0.000000000000000000e+00 
0.000000000000000000e+00 0.000000000000000000e+00 28 | -7.500000000000000000e+00 -4.079315185546875000e+00 -5.748177064951963899e+00 -4.430676720337942243e+00 29 | -8.353223943527794404e+00 -6.994773362917942272e+00 -5.999999989244431120e+00 -6.336669921875000000e+00 30 | -7.999977320199444186e+00 -6.999999992910221458e+00 -5.999998087922945089e+00 -5.000000000000000000e+00 31 | -1.799909823412379950e+01 -1.699981158508516899e+01 -1.699981705762422735e+01 -1.699986625193924894e+01 32 | -1.789990493222170187e+01 -1.600000000000000000e+01 -1.699999809259701067e+01 -1.600000000000000000e+01 33 | -1.512968564860241472e+01 -1.468519456868694917e+01 -1.399999791341254962e+01 -1.449757585816993632e+01 34 | -1.489052999652578535e+01 -1.399088390900487511e+01 -1.300000000000000000e+01 -1.393324809313147838e+01 35 | -1.300870325873785127e+01 -1.282990356841157364e+01 -1.199458390272461017e+01 -1.227136230468750000e+01 36 | -1.171322996557396401e+01 -1.148988889283562997e+01 -1.098229512021653065e+01 -1.125000000000000000e+01 37 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 38 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 39 | -5.855931133031845093e+00 -7.196364956585241224e+00 -4.999999999960096808e+00 -5.000000000000000000e+00 40 | -6.999999278542817116e+00 -5.999989555910574701e+00 -4.999985963106155396e+00 -4.000000000000000000e+00 41 | -1.699819723707243568e+01 -1.798778248505182731e+01 -1.600000000000000000e+01 -1.700000000000000000e+01 42 | -1.699856503371485417e+01 -1.697990005204028563e+01 -1.500000000000000000e+01 -1.574690184543958793e+01 43 | -1.578901819996925582e+01 -1.497437362996795684e+01 -1.400000000000000000e+01 -1.486274657066915239e+01 44 | -1.392810581613018428e+01 -1.343661317201522465e+01 -1.297968200089046853e+01 -1.361869725172174483e+01 45 | -1.201060468427422734e+01 -1.226785252888633870e+01 -1.195749734665353614e+01 -1.192961187653998500e+01 46 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 47 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 48 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 49 | -4.443339188583195210e+00 -5.911444380167361956e+00 -3.999991390166542260e+00 -4.000000000000000000e+00 50 | -5.999991093075700554e+00 -4.999995257228647461e+00 -3.999999522416652020e+00 -3.000000000000000000e+00 51 | -1.577010001503160197e+01 -1.551488599894391385e+01 -1.551246395474427864e+01 -1.576567094844221195e+01 52 | -1.549649187959380470e+01 -1.528496152793104912e+01 -1.486852728068114970e+01 -1.506486514796082865e+01 53 | -1.397552628060613067e+01 -1.462096647017417439e+01 -1.395051287552606745e+01 -1.421640167917921360e+01 54 | -1.414565940417493550e+01 -1.295734866739119262e+01 -1.287681184539292900e+01 -1.300000000000000000e+01 55 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 56 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 57 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 58 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 59 | 0.000000000000000000e+00 -5.999985351276880152e+00 -2.999999992316588759e+00 -9.999999850988388062e-01 60 | -4.999892869434290787e+00 
-3.999999378109350801e+00 -2.999969482339565729e+00 -2.000000000000000000e+00 61 | -1.536462809424847364e+01 -1.539057287892060799e+01 -1.529154700887497853e+01 -1.521031709612017480e+01 62 | -1.498813466990083576e+01 -1.459573974152747766e+01 -1.457605338710970244e+01 -1.450000000000000000e+01 63 | -1.419253091642605114e+01 -1.422051572091282168e+01 -1.373968121404789855e+01 -1.366981866991860173e+01 64 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 65 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 66 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 67 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 68 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 69 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 70 | -1.000000000000000000e+00 -2.999999970197677612e+00 -1.999999046325683594e+00 -1.999938964843750000e+00 71 | -------------------------------------------------------------------------------- /Windygridworld/QValue/SARSA_value.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/Windygridworld/QValue/SARSA_value.npy -------------------------------------------------------------------------------- /Windygridworld/QValue/SARSA_value.txt: -------------------------------------------------------------------------------- 1 | -2.750000000000000000e+01 -2.750000000000000000e+01 -2.755171852545975497e+01 -2.794097699769210053e+01 2 | -2.735494478117115236e+01 -2.700000000000000000e+01 -2.734153123529298313e+01 -2.734687984460050103e+01 3 | -2.689838912315719455e+01 -2.700000000000000000e+01 -2.736372676286155681e+01 -2.707966508952665663e+01 4 | -2.731632245055688202e+01 -2.700000000000000000e+01 -2.697839578178842856e+01 -2.700000000000000000e+01 5 | -2.657724991070490717e+01 -2.650000000000000000e+01 -2.599991075437475274e+01 -2.600000000000000000e+01 6 | -2.615094907709034544e+01 -2.500000000000000000e+01 -2.499999994869993714e+01 -2.550000000000000000e+01 7 | -2.405820291213153794e+01 -2.500000000000000000e+01 -2.399999999999540279e+01 -2.450000000000000000e+01 8 | -2.420340995284578156e+01 -2.300000000000000000e+01 -2.299999999999997513e+01 -2.350000000000000000e+01 9 | -2.379328314829689361e+01 -2.200000000000000000e+01 -2.200000000000000000e+01 -2.200000000000000000e+01 10 | -2.426928310791577559e+01 -1.800000000000000000e+01 -2.100000000000000000e+01 -1.599999998870652895e+01 11 | -2.700000000000000000e+01 -2.762966168297720060e+01 -2.676026358179247211e+01 -2.782720087022922684e+01 12 | -2.675502043365662530e+01 -2.725791131692798785e+01 -2.733043339699290897e+01 -2.720852208992571519e+01 13 | -2.708992262006324125e+01 -2.735905145148296924e+01 -2.707059957746382040e+01 -2.740283885071267633e+01 14 | -2.736746336446779537e+01 -2.743136641674209386e+01 -2.691921125211015209e+01 -2.700000000000000000e+01 15 | -2.645415901228469480e+01 -2.640191233158111572e+01 -2.596539275585391948e+01 -2.600000000000000000e+01 16 | -2.529348416526390508e+01 -2.571744495630264282e+01 -2.498471302248990611e+01 -2.500000000000000000e+01 17 | -2.428265391984366772e+01 -2.376977539062500000e+01 -2.382696147502261041e+01 -2.448913574218750000e+01 18 | 
-1.871860645846396665e+01 -1.518750000000000000e+01 -1.481575235491524722e+01 -1.821875000000000000e+01 19 | -1.957267516581529776e+01 -1.634765625000000000e+01 -1.610902905464172363e+01 -1.600000000000000000e+01 20 | -2.381555753075725335e+01 -1.747002146206796169e+01 -1.600000000000000000e+01 -1.499999999881756096e+01 21 | -2.800000000000000000e+01 -6.818945907169836573e+01 -2.804746459838423078e+01 -4.488060787718251277e+01 22 | -2.780552595379219838e+01 -2.744480348016970339e+01 -2.777906782193912250e+01 -2.821345237620777979e+01 23 | -2.740589903838792907e+01 -2.756148423787738722e+01 -2.761133809323506583e+01 -2.754469650995256558e+01 24 | -2.746726287278380596e+01 -2.666675722599029541e+01 -2.672322528627580951e+01 -2.700000000000000000e+01 25 | -2.702967000172590772e+01 -2.611713954806327820e+01 -2.591307265742956645e+01 -2.600000000000000000e+01 26 | -2.568309227073276091e+01 -2.490106201171875000e+01 -2.483078663441147427e+01 -2.500000000000000000e+01 27 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 28 | -1.179975311173511798e+01 -9.625000000000000000e+00 -8.824859619140625000e+00 -1.090625000000000000e+01 29 | -1.488830161209904190e+01 -1.873242187500000000e+01 -1.535912768449634314e+01 -1.500000000000000000e+01 30 | -1.977274485510834268e+01 -1.758000184083357453e+01 -1.500000000000000000e+01 -1.399999999987816501e+01 31 | -5.400000000000000000e+01 -5.597120564856491143e+01 -5.454495271566119641e+01 -5.203986278927537512e+01 32 | -4.981962152750929818e+01 -2.788261371571276825e+01 -6.038980781517233964e+01 -2.828242330827783491e+01 33 | -3.536586028046440333e+01 -2.777991488643285933e+01 -2.712984053489503466e+01 -2.711792195270746930e+01 34 | -2.810145826160442084e+01 -2.737530132710526232e+01 -2.669065260337092482e+01 -2.700000000000000000e+01 35 | -2.553866601464267205e+01 -2.589084246754646301e+01 -2.543151347043419364e+01 -2.550000000000000000e+01 36 | -2.444986838835328058e+01 -2.517120814323425293e+01 -2.446946968310250270e+01 -2.450000000000000000e+01 37 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 38 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 39 | -1.679571970998039276e+01 -1.623974609375000000e+01 -1.497991088032722473e+01 -1.500000000000000000e+01 40 | -1.588776146110567566e+01 -1.767090018984526978e+01 -1.400000000000000000e+01 -1.299999999999290878e+01 41 | -2.950000000000000000e+01 -5.307988381194701333e+01 -2.933618428396760436e+01 -5.596590163984805599e+01 42 | -2.899783305689743429e+01 -2.837034892286947141e+01 -2.837495737086192804e+01 -2.851418136812195669e+01 43 | -2.903510336858334995e+01 -2.769415967330536077e+01 -2.743419954331855237e+01 -2.754025260181720114e+01 44 | -2.595956777962328488e+01 -2.709387493133544922e+01 -2.605684741249711323e+01 -2.600000000000000000e+01 45 | -2.709679981338558719e+01 -2.572606408596038818e+01 -2.503135233563703110e+01 -2.550000000000000000e+01 46 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 47 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 48 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 49 | -1.092290295479242701e+01 -1.298925781250000000e+01 -1.218994140625000000e+01 -1.100000000000000000e+01 50 | -1.687865904550222496e+01 -1.834917828781726712e+01 -1.250000000000000000e+01 
-1.199999999999972999e+01 51 | -2.750000000000000000e+01 -3.325200196528108876e+01 -2.752523696001526332e+01 -2.795410195172147283e+01 52 | -2.764261279377782543e+01 -2.880099473852042991e+01 -2.768212347066484824e+01 -2.769669787462559185e+01 53 | -2.760426634377552091e+01 -2.743772949512522885e+01 -2.691082028037558516e+01 -2.733659871582978695e+01 54 | -2.622418781429856693e+01 -2.550917020792257972e+01 -2.562161888607650440e+01 -2.550000000000000000e+01 55 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 56 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 57 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 58 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 59 | 0.000000000000000000e+00 -1.718215614557266235e+01 -1.213046455383300781e+01 -1.350000000000000000e+01 60 | -1.167318339636358360e+01 -1.873467007229146475e+01 -1.150000000000000000e+01 -1.099999999999999822e+01 61 | -2.750000000000000000e+01 -2.923979778774805638e+01 -2.746730817452127127e+01 -2.750000000000000000e+01 62 | -2.733467742053835536e+01 -2.714173086277996561e+01 -2.695090967456259179e+01 -2.700000000000000000e+01 63 | -2.630331841953739058e+01 -2.650975755363469233e+01 -2.632902625338213198e+01 -2.700000000000000000e+01 64 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 65 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 66 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 67 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 68 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 69 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 70 | -1.000000000000000000e+00 -1.851891208253221066e+01 -1.050000000000000000e+01 -1.000000000000000000e+01 71 | -------------------------------------------------------------------------------- /Windygridworld/Readme.md: -------------------------------------------------------------------------------- 1 | # Windy Gridworld 2 | ![Alt text](../readme/windy.PNG) 3 | ![Alt text](../readme/sarsa.PNG) 4 | ![Alt text](../readme/q-learning.PNG) 5 | -------------------------------------------------------------------------------- /Windygridworld/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/Windygridworld/graph.png -------------------------------------------------------------------------------- /Windygridworld/windygridworld.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import time 4 | 5 | 6 | class WindyGridWorld: 7 | def __init__(self): 8 | self.width = 10 9 | self.height = 7 10 | self.grid = np.array(['O'] * 70).reshape([self.height, self.width]) 11 | self.weak_wind = [3, 4, 5, 8] 12 | self.strong_wind = [6, 7] 13 | self.action_space = 4 14 | self.observation_space = 70 15 | self.action = {0: 'left', 1: 'up', 2: 'right', 3: 'down'} 16 | self.goal = [3, 7] 17 | 18 | def reset(self): 19 | self.state = [3, 0] 20 | self.grid = np.array(['O'] * 
70).reshape([self.height, self.width]) 21 | self.grid[self.state[0], self.state[1]] = 'X' 22 | return self.state 23 | 24 | def render(self): 25 | time.sleep(0.1) 26 | os.system('cls') 27 | print(self.grid) 28 | 29 | def step(self, action): 30 | # original action 31 | if action == 0: 32 | if self.state[1] != 0: 33 | self.state[1] -= 1 34 | elif action == 1: 35 | if self.state[0] != 0: 36 | self.state[0] -= 1 37 | elif action == 2: 38 | if self.state[1] != self.width - 1: 39 | self.state[1] += 1 40 | elif action == 3: 41 | if self.state[0] != self.height - 1: 42 | self.state[0] += 1 43 | 44 | else: 45 | print('Invalid action.') 46 | 47 | # windy action 48 | if self.state[1] in self.weak_wind + self.strong_wind: 49 | if self.state[1] in self.weak_wind: 50 | if self.state[0] != 0: 51 | self.state[0] -= 1 52 | else: 53 | if self.state[0] >= 2: 54 | self.state[0] -= 2 55 | elif self.state[0] == 1: 56 | self.state[0] -= 1 57 | 58 | self.grid = np.array(['O'] * 70).reshape([self.height, self.width]) 59 | self.grid[self.state[0], self.state[1]] = 'X' 60 | 61 | if self.state == self.goal: 62 | return self.state, 0, True, None 63 | else: 64 | return self.state, -1, False, None 65 | -------------------------------------------------------------------------------- /pendulum/pendulum_ddpg.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import torch 4 | import pylab 5 | import random 6 | import argparse 7 | import numpy as np 8 | from collections import deque 9 | from datetime import datetime 10 | from copy import deepcopy 11 | from skimage.transform import resize 12 | from skimage.color import rgb2gray 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | 17 | 18 | class OrnsteinUhlenbeckActionNoise(object): 19 | def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2): 20 | self.action_dim = action_dim 21 | self.mu = mu 22 | self.theta = theta 23 | self.sigma = sigma 24 | self.X = np.ones(self.action_dim) * self.mu 25 | 26 | def reset(self): 27 | self.X = np.ones(self.action_dim) * self.mu 28 | 29 | def sample(self): 30 | dx = self.theta * (self.mu - self.X) 31 | dx = dx + self.sigma * np.random.randn(len(self.X)) 32 | self.X = self.X + dx 33 | return self.X 34 | 35 | 36 | class Flatten(nn.Module): 37 | def forward(self, input): 38 | return input.view(input.size(0), -1) 39 | 40 | 41 | class Actor(nn.Module): 42 | def __init__(self, obs_size, action_size, action_range): 43 | self.action_range = action_range 44 | super(Actor, self).__init__() 45 | self.network = nn.Sequential( 46 | nn.Linear(obs_size, 400), 47 | nn.ReLU(), 48 | nn.Linear(400, 300), 49 | nn.ReLU(), 50 | nn.Linear(300, action_size), 51 | nn.Tanh() 52 | ) 53 | 54 | def forward(self, x): 55 | return self.network(x) 56 | 57 | 58 | class Critic(nn.Module): 59 | def __init__(self, obs_size, action_size, action_range): 60 | self.action_range = action_range 61 | super(Critic, self).__init__() 62 | self.before_action = nn.Sequential( 63 | nn.Linear(obs_size, 400), 64 | nn.ReLU() 65 | ) 66 | self.after_action = nn.Sequential( 67 | nn.Linear(400 + action_size, 300), 68 | nn.ReLU(), 69 | nn.Linear(300, 1) 70 | ) 71 | 72 | def forward(self, x, action): 73 | x = self.before_action(x) 74 | x = torch.cat([x, action], dim=1) 75 | x = self.after_action(x) 76 | return x 77 | 78 | 79 | class DDPG(object): 80 | def __init__(self, options): 81 | # hyperparameter 82 | self.memory_size = options.get('memory_size', 1000000) 83 | 
self.action_size = options.get('action_size') 84 | self.action_range = options.get('action_range') 85 | self.obs_size = options.get('obs_size') 86 | self.batch_size = options.get('batch_size') 87 | self.actor_lr = options.get('actor_lr') 88 | self.critic_lr = options.get('critic_lr') 89 | self.gamma = options.get('gamma') 90 | self.decay = options.get('decay') 91 | self.tau = options.get('tau') 92 | 93 | # actor model 94 | self.actor = Actor(self.obs_size, self.action_size, self.action_range) 95 | self.actor_target = Actor(self.obs_size, self.action_size, self.action_range) 96 | 97 | # critic model 98 | self.critic = Critic(self.obs_size, self.action_size, self.action_range) 99 | self.critic_target = Critic(self.obs_size, self.action_size, self.action_range) 100 | 101 | # memory(uniformly) 102 | self.memory = deque(maxlen=self.memory_size) 103 | 104 | # explortion 105 | self.ou = OrnsteinUhlenbeckActionNoise(theta=args.ou_theta, sigma=args.ou_sigma, 106 | mu=args.ou_mu, action_dim=self.action_size) 107 | 108 | # optimizer 109 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr) 110 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr) 111 | 112 | # initialize target model 113 | self.actor_target.load_state_dict(self.actor.state_dict()) 114 | self.critic_target.load_state_dict(self.critic.state_dict()) 115 | 116 | def get_action(self, state): 117 | state = torch.from_numpy(state).float() 118 | model_action = self.actor(state).detach().numpy() * self.action_range 119 | action = model_action + self.ou.sample() * self.action_range 120 | return action 121 | 122 | def update_target_model(self): 123 | self._soft_update(self.actor_target, self.actor) 124 | self._soft_update(self.critic_target, self.critic) 125 | 126 | def _soft_update(self, target, source): 127 | for target_param, param in zip(target.parameters(), source.parameters()): 128 | target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) 129 | 130 | def append_sample(self, state, action, reward, next_state, done): 131 | self.memory.append((deepcopy(state), action, reward, deepcopy(next_state), done)) 132 | 133 | def _get_sample(self, batch_size): 134 | return random.sample(self.memory, batch_size) 135 | 136 | def train(self): 137 | minibatch = np.array(self._get_sample(self.batch_size)).transpose() 138 | 139 | states = np.vstack(minibatch[0]) 140 | actions = np.vstack(minibatch[1]) 141 | rewards = np.vstack(minibatch[2]) 142 | next_states = np.vstack(minibatch[3]) 143 | dones = np.vstack(minibatch[4].astype(int)) 144 | 145 | rewards = torch.Tensor(rewards) 146 | dones = torch.Tensor(dones) 147 | actions = torch.Tensor(actions) 148 | 149 | # critic update 150 | self.critic_optimizer.zero_grad() 151 | states = torch.Tensor(states) 152 | next_states = torch.Tensor(next_states) 153 | next_actions = self.actor_target(next_states) 154 | 155 | pred = self.critic(states, actions) 156 | next_pred = self.critic_target(next_states, next_actions) 157 | 158 | target = rewards + (1 - dones) * self.gamma * next_pred 159 | critic_loss = F.mse_loss(pred, target) 160 | critic_loss.backward() 161 | self.critic_optimizer.step() 162 | 163 | # actor update 164 | self.actor_optimizer.zero_grad() 165 | pred_actions = self.actor(states) 166 | actor_loss = self.critic(states, pred_actions).mean() 167 | actor_loss = -actor_loss 168 | actor_loss.backward() 169 | self.actor_optimizer.step() 170 | 171 | 172 | def main(args): 173 | env = gym.make(args.env) 174 | 175 | obs_size = 
env.observation_space.shape[0] 176 | action_size = env.action_space.shape[0] 177 | action_range = env.action_space.high[0] 178 | 179 | print(action_size, action_range) 180 | 181 | args_dict = vars(args) 182 | args_dict['action_size'] = action_size 183 | args_dict['obs_size'] = obs_size 184 | args_dict['action_range'] = action_range 185 | 186 | scores, episodes = [], [] 187 | agent = DDPG(args_dict) 188 | recent_reward = deque(maxlen=100) 189 | frame = 0 190 | 191 | for e in range(args.episode): 192 | score = 0 193 | step = 0 194 | done = False 195 | state = env.reset() 196 | state = np.reshape(state, [1, agent.obs_size]) 197 | while not done: 198 | step += 1 199 | frame += 1 200 | if args.render: 201 | env.render() 202 | 203 | # get action for the current state and go one step in environment 204 | action = agent.get_action(state) 205 | 206 | next_state, reward, done, info = env.step([action]) 207 | next_state = np.reshape(next_state, [1, agent.obs_size]) 208 | 209 | reward = float(reward[0, 0]) 210 | # save the sample to the replay memory 211 | agent.append_sample(state, action, reward, next_state, done) 212 | 213 | score += reward 214 | state = next_state 215 | if frame > agent.batch_size: 216 | agent.train() 217 | agent.update_target_model() 218 | 219 | if frame % 2000 == 0: 220 | print('now time : ', datetime.now()) 221 | scores.append(score) 222 | episodes.append(e) 223 | pylab.plot(episodes, scores, 'b') 224 | pylab.savefig("./save_graph/pendulum_ddpg.png") 225 | 226 | if done: 227 | recent_reward.append(score) 228 | # every episode, plot the play time 229 | print("episode:", e, " score:", score, " memory length:", 230 | len(agent.memory), " steps:", step, 231 | " recent reward:", np.mean(recent_reward)) 232 | 233 | # if the mean of scores of last 10 episode is bigger than 400 234 | # stop training 235 | 236 | 237 | if __name__ == '__main__': 238 | parser = argparse.ArgumentParser() 239 | 240 | parser.add_argument('--env', default='Pendulum-v0', type=str, help='open-ai gym environment') 241 | parser.add_argument('--episode', default=10000, type=int, help='the number of episode') 242 | parser.add_argument('--render', default=False, type=bool, help='is render') 243 | parser.add_argument('--memory_size', default=500000, type=int, help='replay memory size') 244 | parser.add_argument('--batch_size', default=64, type=int, help='minibatch size') 245 | parser.add_argument('--actor_lr', default=1e-4, type=float, help='actor learning rate') 246 | parser.add_argument('--critic_lr', default=1e-3, type=float, help='critic learning rate') 247 | parser.add_argument('--gamma', default=0.99, type=float, help='discounted factor') 248 | parser.add_argument('--decay', default=1e-2, type=int, help='critic weight decay') 249 | parser.add_argument('--tau', default=0.001, type=float, help='moving average for target network') 250 | parser.add_argument('--ou_theta', default=0.15, type=float, help='noise theta') 251 | parser.add_argument('--ou_sigma', default=0.2, type=float, help='noise sigma') 252 | parser.add_argument('--ou_mu', default=0.0, type=float, help='noise mu') 253 | 254 | args = parser.parse_args() 255 | print(vars(args)) 256 | main(args) 257 | -------------------------------------------------------------------------------- /pendulum/pendulum_ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import random 4 | 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | import torch.multiprocessing as mp 8 | 
9 | import torch.nn as nn 10 | import torch 11 | 12 | from collections import deque 13 | 14 | from torch.distributions.categorical import Categorical 15 | from torch.distributions.normal import Normal 16 | 17 | 18 | def make_batch(sample, agent): 19 | sample = np.stack(sample) 20 | discounted_return = np.empty([NUM_STEP, 1]) 21 | 22 | s = np.reshape(np.stack(sample[:, 0]), [NUM_STEP, agent.input_size]) 23 | s1 = np.reshape(np.stack(sample[:, 3]), [NUM_STEP, agent.input_size]) 24 | y = sample[:, 1] 25 | r = np.reshape(np.stack(sample[:, 2]), [NUM_STEP, 1]) 26 | d = np.reshape(np.stack(sample[:, 4]), [NUM_STEP, 1]) 27 | with torch.no_grad(): 28 | state = torch.from_numpy(s) 29 | state = state.float() 30 | _, _, _, value = agent.model_old(state) 31 | 32 | next_state = torch.from_numpy(s1) 33 | next_state = next_state.float() 34 | _, _, _, next_value = agent.model_old(next_state) 35 | 36 | value = value.data.numpy() 37 | next_value = next_value.data.numpy() 38 | 39 | # Discounted Return 40 | gae = 0 41 | for t in range(NUM_STEP - 1, -1, -1): 42 | delta = r[t] + DISCOUNT * next_value[t] * (1 - d[t]) - value[t] 43 | gae = delta + DISCOUNT * LAM * (1 - d[t]) * gae 44 | discounted_return[t, 0] = gae + value[t] 45 | 46 | # For critic 47 | target = r + DISCOUNT * (1 - d) * next_value 48 | 49 | # For Actor 50 | adv = discounted_return - value 51 | # adv = (adv - adv.mean()) / (adv.std() + 1e-5) 52 | 53 | return [s, target, y, adv] 54 | 55 | 56 | class ActorCriticNetwork(nn.Module): 57 | def __init__(self, input_size, output_size): 58 | super(ActorCriticNetwork, self).__init__() 59 | self.feature = nn.Sequential( 60 | nn.Linear(input_size, 256), 61 | nn.Tanh(), 62 | nn.Linear(256, 256), 63 | nn.Tanh() 64 | ) 65 | self.mu = nn.Linear(256, output_size) 66 | self.critic = nn.Linear(256, 1) 67 | self.mu.weight.data.mul_(0.1) 68 | self.mu.bias.data.mul_(0.0) 69 | self.critic.weight.data.mul_(0.1) 70 | self.critic.bias.data.mul_(0.0) 71 | 72 | def forward(self, state): 73 | x = self.feature(state) 74 | mu = self.mu(x) 75 | logstd = torch.zeros_like(mu) 76 | std = torch.exp(logstd) 77 | value = self.critic(x) 78 | return mu, std, logstd, value 79 | 80 | 81 | # PAAC(Parallel Advantage Actor Critic) 82 | class ActorAgent(object): 83 | def __init__(self): 84 | self.model_old = ActorCriticNetwork(INPUT, OUTPUT) 85 | self.model_old.share_memory() 86 | 87 | self.output_size = OUTPUT 88 | self.input_size = INPUT 89 | 90 | def get_action(self, state): 91 | state = torch.from_numpy(state).unsqueeze(0) 92 | state = state.float() 93 | mu, std, logstd, value = self.model_old(state) 94 | m = Normal(loc=mu,scale=std) 95 | action = m.sample() 96 | return action.item() 97 | 98 | # after some time interval update the target model to be same with model 99 | def update_actor_model(self, target): 100 | self.model_old.load_state_dict(target.state_dict()) 101 | 102 | @staticmethod 103 | def weights_init(m): 104 | class_name = m.__class__.__name__ 105 | if class_name.find('Linear') != -1: 106 | torch.nn.init.kaiming_uniform(m.weight) 107 | print(m) 108 | elif class_name.find('Conv') != -1: 109 | torch.nn.init.kaiming_uniform(m.weight) 110 | print(m) 111 | 112 | 113 | class LearnerAgent(object): 114 | def __init__(self): 115 | self.model = ActorCriticNetwork(INPUT, OUTPUT) 116 | # self.model.cuda() 117 | self.output_size = OUTPUT 118 | self.input_size = INPUT 119 | self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE, eps=1e-5) 120 | 121 | def train_model(self, s_batch, target_batch, y_batch, adv_batch, 
actor_agent): 122 | s_batch = torch.FloatTensor(s_batch) 123 | target_batch = torch.FloatTensor(target_batch) 124 | adv_batch = torch.FloatTensor(adv_batch) 125 | with torch.no_grad(): 126 | mu_old, std_old, logstd_old, value_old = actor_agent.model_old(s_batch) 127 | m_old = Normal(loc=mu_old, scale=std_old) 128 | y_batch_old = torch.FloatTensor(y_batch) 129 | log_prob_old = m_old.log_prob(y_batch_old) 130 | 131 | # for multiply advantage 132 | mu, std, logstd, value = self.model(s_batch) 133 | m = Normal(loc=mu, scale=std) 134 | y_batch = m.sample() 135 | log_prob = m.log_prob(y_batch) 136 | entropy = m.entropy().mean() 137 | 138 | for i in range(EPOCH): 139 | minibatch = random.sample(range(len(s_batch)), BATCH_SIZE) 140 | ratio = torch.exp(log_prob[minibatch] - log_prob_old[minibatch]) 141 | 142 | surr1 = ratio * adv_batch[minibatch,0].sum(0) 143 | surr2 = torch.clamp(ratio, 1.0 - EPSILON, 1.0 + EPSILON) * adv_batch[minibatch,0].sum(0) 144 | 145 | actor_loss = -torch.min(surr1, surr2).mean() 146 | critic_loss = F.mse_loss(value_old[minibatch], target_batch[minibatch]) 147 | 148 | self.optimizer.zero_grad() 149 | loss = actor_loss + V_COEF * critic_loss - 0.0 * entropy 150 | loss.backward(retain_graph=True) 151 | self.optimizer.step() 152 | 153 | 154 | class Environment(object): 155 | def __init__(self, env, idx): 156 | self.env = env 157 | self.obs = self.env.reset() 158 | self.next_obs = None 159 | self.done = False 160 | self.env_idx = idx 161 | self.step = 0 162 | self.episode = 0 163 | self.rall = 0 164 | self.recent_rlist = deque(maxlen=100) 165 | self.recent_rlist.append(0) 166 | 167 | def run(self, agent): 168 | sample = [] 169 | for _ in range(NUM_STEP): 170 | self.step += 1 171 | action = agent.get_action(self.obs) 172 | self.next_obs, reward, self.done, _ = self.env.step([action]) 173 | self.rall += reward 174 | 175 | # # negative reward 176 | # if self.done and self.step < self.env.spec.timestep_limit: 177 | # reward = 0 178 | 179 | sample.append([self.obs[:], action, reward, self.next_obs[:], self.done]) 180 | 181 | self.obs = self.next_obs 182 | 183 | if self.done: 184 | self.episode += 1 185 | if self.env_idx == 0: 186 | self.recent_rlist.append(self.rall) 187 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}" 188 | .format(self.episode, self.rall, np.mean(self.recent_rlist))) 189 | 190 | self.obs = self.env.reset() 191 | self.done = False 192 | self.step = 0 193 | self.rall = 0 194 | 195 | return make_batch(sample, agent) 196 | 197 | 198 | def runner(env, cond, memory, actor): 199 | while True: 200 | with cond: 201 | sample = env.run(actor) 202 | memory.put(sample) 203 | 204 | # wait runner 205 | cond.wait() 206 | 207 | 208 | def learner(cond, memory, actor_agent, learner_agent): 209 | while True: 210 | if memory.full(): 211 | s_batch, target_batch, y_batch, adv_batch = [], [], [], [] 212 | # while memory.qsize() != 0: 213 | # if you use MacOS, use under condition. 
214 | if NUM_ENV == 1: 215 | batch = memory.get() 216 | s_batch.extend(batch[0]) 217 | target_batch.extend(batch[1]) 218 | y_batch.extend(batch[2]) 219 | adv_batch.extend(batch[3]) 220 | else: 221 | while not memory.empty(): 222 | batch = memory.get() 223 | s_batch.extend(batch[0]) 224 | target_batch.extend(batch[1]) 225 | y_batch.extend(batch[2]) 226 | adv_batch.extend(batch[3]) 227 | 228 | # train 229 | learner_agent.train_model(s_batch, target_batch, y_batch, adv_batch, actor_agent) 230 | actor_agent.update_actor_model(learner_agent.model) 231 | # resume running 232 | with cond: 233 | cond.notify_all() 234 | 235 | 236 | def main(): 237 | num_envs = NUM_ENV 238 | memory = mp.Queue(maxsize=NUM_ENV) 239 | cond = mp.Condition() 240 | 241 | # make agent and share memory 242 | actor_agent = ActorAgent() 243 | learner_agent = LearnerAgent() 244 | 245 | # sync model 246 | actor_agent.update_actor_model(learner_agent.model) 247 | 248 | # make envs 249 | envs = [Environment(gym.make(ENV_ID), i) for i in range(num_envs)] 250 | 251 | # Learner Process(only Learn) 252 | learn_proc = mp.Process(target=learner, args=(cond, memory, actor_agent, learner_agent)) 253 | 254 | # Runner Process(just run, not learn) 255 | runners = [] 256 | for idx, env in enumerate(envs): 257 | run_proc = mp.Process(target=runner, args=(env, cond, memory, actor_agent)) 258 | runners.append(run_proc) 259 | run_proc.start() 260 | 261 | learn_proc.start() 262 | 263 | for proc in runners: 264 | proc.join() 265 | 266 | learn_proc.join() 267 | 268 | 269 | if __name__ == '__main__': 270 | torch.manual_seed(23) 271 | ENV_ID = 'Pendulum-v0' 272 | env = gym.make(ENV_ID) 273 | # Hyper parameter 274 | INPUT = env.observation_space.shape[0] 275 | OUTPUT = env.action_space.shape[0] 276 | DISCOUNT = 0.99 277 | NUM_STEP = 2048 278 | NUM_ENV = 1 279 | LAM = 0.95 280 | EPOCH = 10 281 | BATCH_SIZE = 64 282 | V_COEF = 1.0 283 | EPSILON = 0.2 284 | ALPHA = 0.99 285 | LEARNING_RATE = 0.0003 286 | env.close() 287 | 288 | main() -------------------------------------------------------------------------------- /readme/1x1conv.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/1x1conv.gif -------------------------------------------------------------------------------- /readme/Play.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/Play.gif -------------------------------------------------------------------------------- /readme/q-learning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/q-learning.PNG -------------------------------------------------------------------------------- /readme/sarsa.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/sarsa.PNG -------------------------------------------------------------------------------- /readme/windy.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/windy.PNG 
--------------------------------------------------------------------------------
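A note on reading the Windygridworld value dumps above: `Q-Learning_value.npy` and `SARSA_value.npy` are easier to interpret as a greedy policy than as raw numbers. The sketch below is a minimal reader, not part of the repository; it assumes each array has shape (70, 4), one row per cell of the 7x10 grid in row-major order and one column per action in the order used by `windygridworld.py` (0: left, 1: up, 2: right, 3: down). The file path and the reshape order are assumptions, so adjust them if the arrays are stored differently.

```python
import numpy as np

# Action order assumed to match windygridworld.py: 0 left, 1 up, 2 right, 3 down.
ARROWS = {0: '<', 1: '^', 2: '>', 3: 'v'}
HEIGHT, WIDTH = 7, 10  # grid size defined in WindyGridWorld.__init__


def greedy_policy_grid(path):
    q = np.load(path)                # assumed shape: (70, 4)
    q = q.reshape(HEIGHT, WIDTH, 4)  # assumed row-major state indexing
    rows = []
    for r in range(HEIGHT):
        row = ''
        for c in range(WIDTH):
            if np.all(q[r, c] == 0):  # all-zero rows in the dumps look like cells the agent never updated
                row += '.'
            else:
                row += ARROWS[int(np.argmax(q[r, c]))]
        rows.append(row)
    return '\n'.join(rows)


if __name__ == '__main__':
    print(greedy_policy_grid('Windygridworld/QValue/Q-Learning_value.npy'))
```

The `.txt` files next to the `.npy` files hold the same 70x4 table in plain text, so `np.loadtxt` on those should give an equivalent array.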
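`pendulum/pendulum_ddpg.py` trains the agent but never saves a checkpoint or runs a noise-free evaluation episode. The sketch below is one way to add that, under assumptions that are not in the original file: it supposes a line such as `torch.save(agent.actor.state_dict(), 'ddpg_actor.pth')` was added after training (the checkpoint name is hypothetical), and it imports `Actor` from the script, which should be safe because the argparse block is guarded by `if __name__ == '__main__':`.

```python
import gym
import torch

from pendulum_ddpg import Actor  # Actor network as defined in pendulum_ddpg.py

env = gym.make('Pendulum-v0')
obs_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
action_range = env.action_space.high[0]

actor = Actor(obs_size, action_size, action_range)
# Hypothetical checkpoint; pendulum_ddpg.py would need a torch.save(...) call to produce it.
actor.load_state_dict(torch.load('ddpg_actor.pth'))
actor.eval()

for episode in range(5):
    state = env.reset()
    done, score = False, 0.0
    while not done:
        with torch.no_grad():
            # Deterministic action: the tanh output scaled by the action range, no OU noise.
            action = actor(torch.from_numpy(state).float().unsqueeze(0)).numpy()[0] * action_range
        state, reward, done, _ = env.step(action)
        score += reward
    print('evaluation episode:', episode, 'score:', score)
```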
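For `pendulum/pendulum_ppo.py`, the piece that is easiest to lose inside `train_model` is the clipped surrogate objective itself, partly because that function computes `log_prob` once outside the epoch loop and reduces the advantages with `.sum(0)` over the minibatch. As a point of reference, here is a minimal standalone sketch of the standard per-sample clipped objective, using the same clip range `EPSILON = 0.2` as the script; the tensors at the bottom are placeholders standing in for the stored actions' old log-probabilities and the GAE advantages produced by `make_batch`.

```python
import torch

EPSILON = 0.2  # same clip range as pendulum_ppo.py


def clipped_surrogate_loss(log_prob, log_prob_old, advantages, epsilon=EPSILON):
    """PPO clipped objective for one minibatch, computed per sample and then averaged."""
    ratio = torch.exp(log_prob - log_prob_old)  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    return -torch.min(surr1, surr2).mean()


# Placeholder minibatch (64 samples, matching BATCH_SIZE in the script).
log_prob_old = torch.randn(64)                   # log pi_old(a|s) of the stored actions
log_prob = log_prob_old + 0.1 * torch.randn(64)  # log pi_new(a|s), recomputed each epoch
advantages = torch.randn(64)                     # GAE advantages from make_batch

print(clipped_surrogate_loss(log_prob, log_prob_old, advantages))
```

Keeping the advantages per sample and averaging only at the end is what gives the min/clamp combination its intended per-sample clipping effect.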