├── .idea
│   ├── Reinforcement_learning.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── Breakout
│   ├── Breakout-DQN.py
│   ├── Breakout_DQN_class.py
│   ├── Breakout_PolicyGradient.py
│   ├── Play_DQN.py
│   └── breakout_dqn_pytorch.py
├── CartPole
│   ├── CartPole_A2C_episodic.py
│   ├── CartPole_C51.py
│   ├── CartPole_DDQN.py
│   ├── CartPole_DQN_NIPS2013.py
│   ├── CartPole_DQN_Nature2015.py
│   ├── CartPole_PAAC.py
│   ├── CartPole_PAAC_multiproc.py
│   ├── CartPole_PolicyGradient.py
│   ├── CartPole_Q-Network.py
│   ├── CartPole_Q-Network_reshape.py
│   ├── Cartpole_A2C_nstep.py
│   ├── Cartpole_A2C_onestep.py
│   ├── cartpole_dqn.py
│   ├── cartpole_ppo.py
│   └── play_Cartpole.py
├── FrozenLake
│   ├── FL_Q-Table.py
│   ├── FL_Q-table_Stochastic.py
│   ├── FL_Q-table_exp&dis.py
│   ├── FrozenLake_Q-Network.ipynb
│   └── FrozenLake_Q-Network.py
├── Pong
│   ├── Pong_A2C_episodic.py
│   └── Pong_PolicyGradient.py
├── README.md
├── Windygridworld
│   ├── OptimalPolicy
│   │   ├── optimal_Q-Learning.txt
│   │   └── optimal_SARSA.txt
│   ├── Q-learning_sarsa.py
│   ├── QValue
│   │   ├── Q-Learning_value.npy
│   │   ├── Q-Learning_value.txt
│   │   ├── SARSA_value.npy
│   │   └── SARSA_value.txt
│   ├── Readme.md
│   ├── graph.png
│   └── windygridworld.py
├── pendulum
│   ├── pendulum_ddpg.py
│   └── pendulum_ppo.py
└── readme
    ├── 1x1conv.gif
    ├── Play.gif
    ├── q-learning.PNG
    ├── sarsa.PNG
    └── windy.PNG
/Breakout/Breakout-DQN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | import copy
5 | import numpy as np
6 | import random as ran
7 | import datetime
8 | import matplotlib.pyplot as plt
9 |
10 | from collections import deque
11 | from skimage.transform import resize
12 | from skimage.color import rgb2gray
13 |
14 | plt.ion()
15 | env = gym.make('BreakoutDeterministic-v3')
16 |
17 | DDQN = False
18 |
19 | # Number of replay transitions sampled per update
20 | MINIBATCH = 32
21 | # Replay memory buffer
22 | REPLAY_MEMORY = deque()
23 |
24 | HISTORY_STEP =4
25 | FRAMESKIP = 4
26 | TRAIN_INTERVAL = 4
27 | NO_STEP = 30
28 | TRAIN_START = 50000
29 | if DDQN:
30 | FINAL_EXPLORATION = 0.01
31 | TARGET_UPDATE = 30000
32 | else:
33 | FINAL_EXPLORATION = 0.1
34 | TARGET_UPDATE = 10000
35 |
36 |
37 | MEMORY_SIZE = 200000
38 | EXPLORATION = 1000000
39 | START_EXPLORATION = 1.
40 |
41 |
42 | INPUT = env.observation_space.shape
43 | OUTPUT = 3
44 | HEIGHT =84
45 | WIDTH = 84
46 |
47 | # Hyperparameters
48 | LEARNING_RATE = 0.00025
49 |
50 | DISCOUNT = 0.99
51 | e = 1.
52 | frame = 0
53 | model_path = "save/Breakout.ckpt"
54 | def cliped_error(x):
55 | return tf.where(tf.abs(x) < 1.0 , 0.5 * tf.square(x), tf.abs(x)-0.5)
56 |
57 | # Input preprocessing
58 |
59 |
60 | def pre_proc(X):
61 | # Remove flickering by taking the max over the previous frame
62 | # x = np.maximum(X, X1)
63 | # Grayscale and resize to shrink the input
64 | x = np.uint8(resize(rgb2gray(X), (84,84))*255)
65 | return x
66 |
67 | # DQN model
68 | def model(input1, f1, f2, f3, w1, w2):
69 | c1 = tf.nn.relu(tf.nn.conv2d(input1, f1, strides=[1, 4, 4, 1],data_format="NHWC", padding = "VALID"))
70 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1],data_format="NHWC", padding="VALID"))
71 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1,1,1,1],data_format="NHWC", padding="VALID"))
72 |
73 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]])
74 | l2 = tf.nn.relu(tf.matmul(l1, w1))
75 |
76 | pyx = tf.matmul(l2, w2)
77 | return pyx
78 |
79 |
80 | X = tf.placeholder("float", [None, 84, 84, 4])
81 |
82 | # Main network variables
83 | f1 = tf.get_variable("f1", shape=[8,8,4,32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
84 | f2 = tf.get_variable("f2", shape=[4,4,32,64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
85 | f3 = tf.get_variable("f3", shape=[3,3,64,64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
86 |
87 | w1 = tf.get_variable("w1", shape=[7*7*64,512], initializer=tf.contrib.layers.xavier_initializer())
88 | w2 = tf.get_variable("w2", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
89 |
90 | py_x = model(X, f1, f2, f3 , w1, w2)
91 |
92 | # Target network variables
93 | f1_r = tf.get_variable("f1_r", shape=[8,8,4,32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
94 | f2_r = tf.get_variable("f2_r", shape=[4,4,32,64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
95 | f3_r = tf.get_variable("f3_r", shape=[3,3,64,64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
96 |
97 | w1_r = tf.get_variable("w1_r", shape=[7*7*64,512], initializer=tf.contrib.layers.xavier_initializer())
98 | w2_r = tf.get_variable("w2_r", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
99 |
100 | py_x_r = model(X, f1_r, f2_r,f3_r, w1_r, w2_r)
101 |
102 | # Lists that store the total reward of each episode
103 | rlist=[0]
104 | recent_rlist=[0]
105 |
106 | episode = 0
107 | epoch = 0
108 | epoch_score = deque()
109 | epoch_Q = deque()
110 | epoch_on = False
111 | average_Q = deque()
112 | average_reward = deque()
113 | no_life_game = False
114 |
115 | # Loss function definition
116 | a= tf.placeholder(tf.int64, [None])
117 | y = tf.placeholder(tf.float32, [None])
118 | a_one_hot = tf.one_hot(a, OUTPUT, 1.0, 0.0)
119 | q_value = tf.reduce_sum(tf.multiply(py_x, a_one_hot), reduction_indices=1)
120 | error = tf.abs(y - q_value)
121 |
122 | quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
123 | linear_part = error - quadratic_part
124 | loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)
125 |
126 | optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE,momentum=0.95,epsilon= 0.01)
127 | train = optimizer.minimize(loss)
128 |
129 | saver = tf.train.Saver(max_to_keep=None)
130 |
131 | # Session
132 | with tf.Session() as sess:
133 | # Initialize variables
134 | sess.run(tf.global_variables_initializer())
135 | sess.run(w1_r.assign(w1))
136 | sess.run(w2_r.assign(w2))
137 | sess.run(f1_r.assign(f1))
138 | sess.run(f2_r.assign(f2))
139 | sess.run(f3_r.assign(f3))
140 |
141 | # Start running episodes
142 | while np.mean(recent_rlist) < 500 :
143 | episode += 1
144 |
145 | # Keep only the total rewards of the 100 most recent episodes
146 | if len(recent_rlist) > 100:
147 | del recent_rlist[0]
148 |
149 | history = np.zeros((84, 84, 5), dtype=np.uint8)
150 | rall = 0
151 | d = False
152 | ter = False
153 | count = 0
154 | s = env.reset()
155 | avg_max_Q = 0
156 | avg_loss = 0
157 |
158 | # At the start of an episode, take up to NO_STEP (30) no-op steps
159 | # for _ in range(ran.randint(1, NO_STEP)):
160 | # s1, _, _, _ = env.step(0)
161 |
162 | # Initialize the state
163 | for i in range(HISTORY_STEP):
164 | history[:, :, i] = pre_proc(s)
165 |
166 | # Repeat until the episode ends
167 | while not d :
168 | # env.render()
169 | # The 4 most recent frames form the current state
170 |
171 | frame +=1
172 | count+=1
173 |
174 | if e > FINAL_EXPLORATION and frame > TRAIN_START:
175 | e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION
176 |
177 | # Compute Q values for the current state
178 | Q = sess.run(py_x, feed_dict = {X : np.reshape(np.float32(history/255.), (1, 84, 84, 5))[:, :, :, 0:4]})
179 | average_Q.append(np.max(Q))
180 | avg_max_Q += np.max(Q)
181 |
182 | if e > np.random.rand(1):
183 | action = np.random.randint(OUTPUT)
184 | else:
185 | action = np.argmax(Q)
186 |
187 | if action == 0:
188 | real_a = 1
189 | elif action == 1:
190 | real_a = 4
191 | else:
192 | real_a = 5
193 |
194 |
195 | # Step the environment with the chosen action
196 | s1, r, d, l = env.step(real_a)
197 | ter = d
198 | reward= np.clip(r, -1,1)
199 |
200 |
201 | # Store the next frame at the end of the history
202 | history[:,:, 4] = pre_proc(s1)
203 |
204 | # Append the transition (5-frame history, action, reward, terminal) to the replay memory
205 | REPLAY_MEMORY.append((np.copy(history[:,:,:]), action ,reward, ter))
206 | history[:,:,:4] = history[:,:,1:]
207 |
208 | # Once the replay memory exceeds MEMORY_SIZE, drop the oldest entry
209 | if len(REPLAY_MEMORY) > MEMORY_SIZE:
210 | REPLAY_MEMORY.popleft()
211 | # Accumulate the total reward
212 | rall += r
213 |
214 | # Start training once the frame count exceeds TRAIN_START (50,000 frames)
215 | if frame > TRAIN_START :
216 | s_stack = deque()
217 | a_stack = deque()
218 | r_stack = deque()
219 | s1_stack = deque()
220 | d_stack = deque()
221 | y_stack = deque()
222 |
223 | sample = ran.sample(REPLAY_MEMORY, MINIBATCH)
224 |
225 | for s_r, a_r, r_r, d_r in sample:
226 | s_stack.append(s_r[:,:,:4])
227 | a_stack.append(a_r)
228 | r_stack.append(r_r)
229 | s1_stack.append(s_r[:,:,1:])
230 | d_stack.append(d_r)
231 |
232 | d_stack = np.array(d_stack) + 0
233 |
234 | Q1 = sess.run(py_x_r, feed_dict={X: np.float32(np.array(s1_stack) / 255.)})
235 |
236 | y_stack = r_stack + (1 - d_stack) * DISCOUNT * np.max(Q1, axis=1)
237 |
238 | # Train the main network toward the updated target Q values
239 | sess.run(train, feed_dict={X: np.float32(np.array(s_stack) / 255.), y: y_stack, a: a_stack})
240 |
241 | # Update the target network every TARGET_UPDATE frames
242 | if frame % TARGET_UPDATE == 0 :
243 | sess.run(w1_r.assign(w1))
244 | sess.run(w2_r.assign(w2))
245 | sess.run(f1_r.assign(f1))
246 | sess.run(f2_r.assign(f2))
247 | sess.run(f3_r.assign(f3))
248 |
249 | # Plot once per epoch (50,000 trained frames)
250 | if (frame - TRAIN_START) % 50000 == 0:
251 | epoch_on = True
252 |
253 | if epoch_on:
254 | plt.clf()
255 | epoch += 1
256 | epoch_score.append(np.mean(average_reward))
257 | epoch_Q.append(np.mean(average_Q))
258 |
259 | plt.subplot(211)
260 | plt.axis([0, epoch, 0, np.max(epoch_Q)*6/5])
261 | plt.xlabel('Training Epochs')
262 | plt.ylabel('Average Action Value(Q)')
263 | plt.plot(epoch_Q)
264 |
265 | plt.subplot(212)
266 | plt.axis([0, epoch , 0, np.max(epoch_score)*6/5])
267 | plt.xlabel('Training Epochs')
268 | plt.ylabel('Average Reward per Episode')
269 | plt.plot(epoch_score, "r")
270 |
271 | epoch_on = False
272 | average_reward = deque()
273 | average_Q = deque()
274 | plt.pause(0.05)
275 | plt.savefig("graph/{} epoch".format(epoch-1))
276 |
277 | save_path = saver.save(sess, model_path, global_step=(epoch-1))
278 | print("Model(episode :",episode, ") saved in file: ", save_path , " Now time : " ,datetime.datetime.now())
279 |
280 |
281 |
282 | # Record the episode's total reward
283 | recent_rlist.append(rall)
284 | rlist.append(rall)
285 | average_reward.append(rall)
286 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | Avg_Max_Q:{5:2.5f} | "
287 | "Recent reward:{6:.5f} ".format(episode,frame, count, rall, e, avg_max_Q/float(count),np.mean(recent_rlist)))
--------------------------------------------------------------------------------
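
Note on the loss above: splitting the TD error into a quadratic part clipped to [0, 1] and a linear remainder is exactly the Huber loss with delta = 1, which is also what the unused cliped_error helper computes. A minimal standalone NumPy sketch (not part of the repository) checking that the two forms agree:

import numpy as np

def huber(error, delta=1.0):
    # Standard Huber loss: quadratic near zero, linear in the tails.
    return np.where(np.abs(error) < delta,
                    0.5 * np.square(error),
                    delta * (np.abs(error) - 0.5 * delta))

def quadratic_plus_linear(error):
    # The decomposition used in Breakout-DQN.py's loss.
    abs_error = np.abs(error)
    quadratic_part = np.clip(abs_error, 0.0, 1.0)
    linear_part = abs_error - quadratic_part
    return 0.5 * np.square(quadratic_part) + linear_part

errors = np.array([-3.0, -0.5, 0.0, 0.2, 1.0, 4.0])
assert np.allclose(huber(errors), quadratic_plus_linear(errors))
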
/Breakout/Breakout_DQN_class.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import tensorflow as tf
4 | import gym
5 |
6 | import numpy as np
7 | import random as ran
8 | import datetime
9 | import matplotlib.pyplot as plt
10 |
11 | from collections import deque
12 | from skimage.transform import resize
13 | from skimage.color import rgb2gray
14 |
15 | plt.ion()
16 | # DQN paper setting(frameskip = 4, repeat_action_probability = 0)
17 | # {}Deterministic : frameskip = 4
18 | # {}-v4 : repeat_action_probability
19 | env = gym.make('BreakoutDeterministic-v4')
20 |
21 | # Hyperparameters
22 | MINIBATCH_SIZE = 32
23 | HISTORY_SIZE = 4
24 | TRAIN_START = 1000
25 | FINAL_EXPLORATION = 0.1
26 | TARGET_UPDATE = 10000
27 | MEMORY_SIZE = 200000
28 | EXPLORATION = 1000000
29 | START_EXPLORATION = 1.
30 | INPUT = env.observation_space.shape
31 | OUTPUT = env.action_space.n
32 | HEIGHT = 84
33 | WIDTH = 84
34 | LEARNING_RATE = 0.00025
35 | DISCOUNT = 0.99
36 | EPSILON = 0.01
37 | MOMENTUM = 0.95
38 |
39 | model_path = "save/Breakout.ckpt"
40 |
41 |
42 | def pre_proc(X):
43 | '''Preprocess an input frame.
44 |
45 | Args:
46 | X(np.array): raw frame; grayscaled, resized to 84x84,
47 | and multiplied by 255 so it can be stored as an integer (to save memory)
48 |
49 | Returns:
50 | np.array: the preprocessed frame
51 | '''
52 | # Remove flickering by taking the max over the previous frame
53 | # x = np.maximum(X, X1)
54 | # Grayscale and resize to shrink the input
55 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
56 | return x
57 |
58 |
59 | def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
60 | '''Copy the main network's weights to the target network.
61 |
62 | Args:
63 | dest_scope_name="target"(DQN): variable scope named 'target' (destination)
64 | src_scope_name="main"(DQN): variable scope named 'main' (source)
65 |
66 | Returns:
67 | list: assign ops that copy main's trainable variables into target
68 | '''
69 | op_holder = []
70 |
71 | src_vars = tf.get_collection(
72 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
73 | dest_vars = tf.get_collection(
74 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
75 |
76 | for src_var, dest_var in zip(src_vars, dest_vars):
77 | op_holder.append(dest_var.assign(src_var.value()))
78 |
79 | return op_holder
80 |
81 |
82 | def get_init_state(history, s):
83 | '''Initialize the state at the start of an episode.
84 |
85 | Args:
86 | history(np.array): array that will hold the 5 stacked frames
87 | s(list): the reset observation
88 |
89 | Note:
90 | The first HISTORY_SIZE slots of history are all filled with the preprocessed reset frame (s)
91 | '''
92 | for i in range(HISTORY_SIZE):
93 | history[:, :, i] = pre_proc(s)
94 |
95 |
96 | def find_max_lifes(env):
97 | env.reset()
98 | _, _, _, info = env.step(0)
99 | return info['ale.lives']
100 |
101 |
102 | def check_live(life, cur_life):
103 | if life > cur_life:
104 | return True
105 | else:
106 | return False
107 |
108 |
109 | def train_minibatch(mainDQN, targetDQN, mini_batch):
110 | '''Train the main network on a minibatch of sampled transitions.
111 |
112 | Args:
113 | mainDQN(object): the main network
114 | targetDQN(object): the target network
115 | mini_batch: MINIBATCH_SIZE transitions sampled at random from replay_memory
116 |
117 | Note:
118 | The sampled transitions are used to update the main network.
119 | '''
120 | mini_batch = np.array(mini_batch).transpose()
121 |
122 | history = np.stack(mini_batch[0], axis=0)
123 |
124 | states = np.float32(history[:, :, :, :4]) / 255.
125 | actions = list(mini_batch[1])
126 | rewards = list(mini_batch[2])
127 | next_states = np.float32(history[:, :, :, 1:]) / 255.
128 | dones = mini_batch[3]
129 |
130 | # bool to binary
131 | dones = dones.astype(int)
132 |
133 | Q1 = targetDQN.get_q(next_states)
134 |
135 | y = rewards + (1 - dones) * DISCOUNT * np.max(Q1, axis=1)
136 |
137 | # Train the main network toward the updated target Q values
138 | mainDQN.sess.run(mainDQN.train, feed_dict={mainDQN.X: states, mainDQN.Y: y,
139 | mainDQN.a: actions})
140 |
141 |
142 | # Plot training statistics
143 | def plot_data(epoch, epoch_score, average_reward, epoch_Q, average_Q, mainDQN):
144 | plt.clf()
145 | epoch_score.append(np.mean(average_reward))
146 | epoch_Q.append(np.mean(average_Q))
147 |
148 | plt.subplot(211)
149 | plt.axis([0, epoch, 0, np.max(epoch_Q) * 6 / 5])
150 | plt.xlabel('Training Epochs')
151 | plt.ylabel('Average Action Value(Q)')
152 | plt.plot(epoch_Q)
153 |
154 | plt.subplot(212)
155 | plt.axis([0, epoch, 0, np.max(epoch_score) * 6 / 5])
156 | plt.xlabel('Training Epochs')
157 | plt.ylabel('Average Reward per Episode')
158 | plt.plot(epoch_score, "r")
159 |
160 | plt.pause(0.05)
161 | plt.savefig("graph/{} epoch".format(epoch - 1))
162 |
163 | save_path = mainDQN.saver.save(mainDQN.sess, model_path, global_step=(epoch - 1))
164 | print("Model(epoch :", epoch, ") saved in file: ", save_path, " Now time : ", datetime.datetime.now())
165 |
166 |
167 | # DQN
168 | class DQNAgent:
169 | def __init__(self, sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main'):
170 | self.sess = sess
171 | self.height = HEIGHT
172 | self.width = WIDTH
173 | self.history_size = HISTORY_SIZE
174 | self.output = OUTPUT
175 | self.name = NAME
176 |
177 | self.build_network()
178 |
179 | def build_network(self):
180 | with tf.variable_scope(self.name):
181 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size])
182 | self.Y = tf.placeholder('float', [None])
183 | self.a = tf.placeholder('int64', [None])
184 |
185 | f1 = tf.get_variable("f1", shape=[8, 8, 4, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
186 | f2 = tf.get_variable("f2", shape=[4, 4, 32, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
187 | f3 = tf.get_variable("f3", shape=[3, 3, 64, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
188 | w1 = tf.get_variable("w1", shape=[7 * 7 * 64, 512], initializer=tf.contrib.layers.xavier_initializer())
189 | w2 = tf.get_variable("w2", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
190 |
191 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 4, 4, 1], padding="VALID"))
192 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1], padding="VALID"))
193 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 1, 1, 1], padding='VALID'))
194 |
195 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]])
196 | l2 = tf.nn.relu(tf.matmul(l1, w1))
197 |
198 | self.Q_pre = tf.matmul(l2, w2)
199 |
200 | a_one_hot = tf.one_hot(self.a, self.output, 1.0, 0.0)
201 | q_val = tf.reduce_sum(tf.multiply(self.Q_pre, a_one_hot), reduction_indices=1)
202 |
203 | # huber loss
204 | self.loss = tf.losses.huber_loss(self.Y, q_val)
205 |
206 | optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=EPSILON)
207 | self.train = optimizer.minimize(self.loss)
208 |
209 | self.saver = tf.train.Saver(max_to_keep=None)
210 |
211 | def get_q(self, history):
212 | return self.sess.run(self.Q_pre, feed_dict={self.X: np.reshape(history,
213 | [-1, 84, 84, 4])})
214 |
215 | def get_action(self, q, e):
216 | if e > np.random.rand(1):
217 | action = np.random.randint(self.output)
218 | else:
219 | action = np.argmax(q)
220 | return action
221 |
222 |
223 | def main():
224 | config = tf.ConfigProto()
225 | config.gpu_options.allow_growth = True
226 | with tf.Session(config=config) as sess:
227 | mainDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main')
228 | targetDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='target')
229 |
230 | sess.run(tf.global_variables_initializer())
231 |
232 | # initial copy q_net -> target_net
233 | copy_ops = get_copy_var_ops(dest_scope_name="target",
234 | src_scope_name="main")
235 | sess.run(copy_ops)
236 |
237 | recent_rlist = deque(maxlen=100)
238 | e = 1.
239 | episode, epoch, frame = 0, 0, 0
240 |
241 | epoch_score, epoch_Q = deque(), deque()
242 | average_Q, average_reward = deque(), deque()
243 |
244 | epoch_on = False
245 |
246 | replay_memory = deque(maxlen=MEMORY_SIZE)
247 |
248 | max_life = find_max_lifes(env)
249 | # Train the agent for 200 epochs
250 | while epoch <= 200:
251 | episode += 1
252 |
253 | history = np.zeros([84, 84, 5], dtype=np.uint8)
254 | rall, count = 0, 0
255 | d = False
256 | s = env.reset()
257 | life = max_life
258 | get_init_state(history, s)
259 |
260 | while not d:
261 | # env.render()
262 |
263 | frame += 1
264 | count += 1
265 |
266 | # e-greedy
267 | if e > FINAL_EXPLORATION and frame > TRAIN_START:
268 | e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION
269 |
270 | # Predict Q values from the first 4 frames of the history
271 | Q = mainDQN.get_q(np.float32(history[:, :, :4]) / 255.)
272 | average_Q.append(np.max(Q))
273 |
274 | # Select an action
275 | action = mainDQN.get_action(Q, e)
276 |
277 | # s1 : next frame / r : reward / d : done(terminal) / i : info(lives)
278 | s1, r, d, i = env.step(action)
279 | ter = check_live(life, i['ale.lives'])
280 | reward = np.clip(r, -1, 1)
281 |
282 | # Put the new frame at the end of the history
283 | history[:, :, 4] = pre_proc(s1)
284 |
285 | # Store a single 5-frame history per transition to save memory:
286 | # state and next_state overlap in 3 frames, so they can share storage.
287 | replay_memory.append((np.copy(history[:, :, :]), action, reward, ter))
288 | history[:, :, :4] = history[:, :, 1:]
289 |
290 | rall += r
291 |
292 | if frame > TRAIN_START:
293 | # Train at every (frame-skipped) step
294 | minibatch = ran.sample(replay_memory, MINIBATCH_SIZE)
295 | train_minibatch(mainDQN, targetDQN, minibatch)
296 |
297 | # Update target_net every TARGET_UPDATE (10,000) frames
298 | if frame % TARGET_UPDATE == 0:
299 | copy_ops = get_copy_var_ops(dest_scope_name="target",
300 | src_scope_name="main")
301 | sess.run(copy_ops)
302 |
303 | # Plot once per epoch (50,000 trained frames)
304 | if (frame - TRAIN_START) % 50000 == 0:
305 | epoch_on = True
306 |
307 | recent_rlist.append(rall)
308 |
309 | average_reward.append(rall)
310 |
311 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | "
312 | "Avg_Max_Q:{5:2.5f} | Recent reward:{6:.5f} ".format(episode, frame, count, rall, e,
313 | np.mean(average_Q),
314 | np.mean(recent_rlist)))
315 |
316 | if epoch_on:
317 | epoch += 1
318 | plot_data(epoch, epoch_score, average_reward, epoch_Q, average_Q, mainDQN)
319 | epoch_on = False
320 | average_reward = deque()
321 | average_Q = deque()
322 |
323 |
324 | if __name__ == "__main__":
325 | main()
326 |
--------------------------------------------------------------------------------
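
As the replay-memory comments above note, each stored transition is a single 5-frame history from which the 4-frame state and next state are sliced. A toy NumPy sketch of that overlap (illustrative 4x4 frames instead of 84x84):

import numpy as np

HEIGHT, WIDTH, HISTORY_SIZE = 4, 4, 4   # toy frame size for illustration

# One replay entry: HISTORY_SIZE + 1 stacked frames, oldest first.
history = np.stack([np.full((HEIGHT, WIDTH), t, dtype=np.uint8)
                    for t in range(HISTORY_SIZE + 1)], axis=2)

state = history[:, :, :HISTORY_SIZE]    # frames 0..3
next_state = history[:, :, 1:]          # frames 1..4

# The two views share three frames, so 5 frames are stored per
# transition instead of 8.
assert np.array_equal(state[:, :, 1:], next_state[:, :, :HISTORY_SIZE - 1])
print(state[0, 0], next_state[0, 0])    # [0 1 2 3] [1 2 3 4]
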
/Breakout/Breakout_PolicyGradient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 | from skimage.transform import resize
7 | from skimage.color import rgb2gray
8 | import copy
9 |
10 | env = gym.make('Breakout-v3')
11 |
12 | # Hyperparameters
13 | LEARNING_RATE = 0.00025
14 | INPUT = env.observation_space.shape
15 | OUTPUT = 3
16 | DISCOUNT = 0.99
17 | HEIGHT = 84
18 | WIDTH = 84
19 | HISTORY_SIZE = 4
20 |
21 | model_path = 'save/breakout-pg.ckpt'
22 |
23 | def pre_proc(X):
24 | '''Preprocess an input frame.
25 |
26 | Args:
27 | X(np.array): raw frame; grayscaled, resized to 84x84,
28 | and multiplied by 255 so it can be stored as an integer (to save memory)
29 |
30 | Returns:
31 | np.array: the preprocessed frame
32 | '''
33 | # Remove flickering by taking the max over the previous frame
34 | # x = np.maximum(X, X1)
35 | # Grayscale and resize to shrink the input
36 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
37 |
38 | return x
39 |
40 | def get_init_state(history, s):
41 | '''Initialize the state at the start of an episode.
42 |
43 | Args:
44 | history(np.array): array that will hold the 5 stacked frames
45 | s(list): the reset observation
46 |
47 | Note:
48 | The first HISTORY_SIZE slots of history are all filled with the preprocessed reset frame (s)
49 | '''
50 | for i in range(HISTORY_SIZE):
51 | history[:, :, i] = pre_proc(s)
52 |
53 |
54 | def get_game_type(count, l, no_life_game, start_live):
55 | '''Determine whether the game uses lives.
56 |
57 | Args:
58 | count(int): step counter, used to detect the first frame of the episode
59 | l(dict): info dict holding the life count, e.g. l['ale.lives']
60 | no_life_game(bool): flag to be returned, describing whether the game has lives
61 | start_live(int): used to initialize the life count when the game has lives
62 |
63 | Returns:
64 | list:
65 | no_life_game(bool): True if the game has no lives, False otherwise
66 | start_live(int): the initial life count when the game has lives
67 | '''
68 | if count == 1:
69 | start_live = l['ale.lives']
70 | # If the initial life count is 0, the game has no lives
71 | if start_live == 0:
72 | no_life_game = True
73 | else:
74 | no_life_game = False
75 | return [no_life_game, start_live]
76 |
77 |
78 | def get_terminal(start_live, l, reward, no_life_game, ter):
79 | '''Mark the transition as terminal when a life is lost or a negative reward is received.
80 |
81 | Args:
82 | start_live(int): current life count, for games that have lives
83 | l(dict): life info of the next frame, used to check whether a life was lost
84 | no_life_game(bool): game type; if the game has no lives, a negative reward triggers terminal handling
85 | ter(bool): flag that stores the terminal state
86 |
87 | Returns:
88 | list:
89 | ter(bool): the terminal state
90 | start_live(int): life count updated after a lost life
91 | '''
92 | if no_life_game:
93 | # Game without lives: treat a negative reward as terminal
94 | if reward < 0:
95 | ter = True
96 | else:
97 | # Game with lives: treat a lost life as terminal
98 | if start_live > l['ale.lives']:
99 | ter = True
100 | start_live = l['ale.lives']
101 |
102 | return [ter, start_live]
103 |
104 | def discount_rewards(r):
105 | '''Compute discounted rewards.
106 |
107 | Args:
108 | r(np.array): array of rewards
109 |
110 | Returns:
111 | discounted_r(np.array): array of discounted rewards
112 | '''
113 | discounted_r = np.zeros_like(r, dtype=np.float32)
114 | running_add = 0
115 | for t in reversed(range(len(r))):
116 | if r[t] < 0: # reset the running return whenever a life is lost
117 | running_add = 0
118 | running_add = running_add * DISCOUNT + r[t]
119 | discounted_r[t] = running_add
120 |
121 | discounted_r = discounted_r - discounted_r.mean()
122 | discounted_r = discounted_r / discounted_r.std()
123 |
124 | return discounted_r
125 |
126 |
127 | def train_episodic(PGagent, x, y, adv):
128 | '''Train the network once per episode.
129 |
130 | Args:
131 | PGagent(PolicyGradient): the network to be trained
132 | x(np.array): array of states
133 | y(np.array): array of one-hot actions
134 | adv(np.array) : array of discounted rewards
135 |
136 | Returns:
137 | l(float): the network's loss
138 | '''
139 |
140 | l, _ = PGagent.sess.run([PGagent.loss, PGagent.train], feed_dict={PGagent.X: x,
141 | PGagent.Y: y,
142 | PGagent.adv: adv})
143 | return l
144 |
145 | class PolicyGradient:
146 | def __init__(self, sess, input_size, output_size , name = 'main'):
147 | self.sess = sess
148 | self.input_size = input_size
149 | self.output_size = output_size
150 | self.height = HEIGHT
151 | self.width = WIDTH
152 | self.history_size = HISTORY_SIZE
153 | self.name = name
154 | self.build_network()
155 |
156 | def build_network(self):
157 | with tf.variable_scope(self.name):
158 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size])
159 | self.Y = tf.placeholder('float', [None, self.output_size])
160 | self.adv = tf.placeholder('float')
161 |
162 | f1 = tf.get_variable("f1", shape=[1, 1, 4, 1], initializer=tf.contrib.layers.xavier_initializer_conv2d())
163 | f2 = tf.get_variable("f2", shape=[8, 8, 1, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d())
164 | f3 = tf.get_variable("f3", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
165 | w1 = tf.get_variable("w1", shape=[9 * 9 * 32, 256], initializer=tf.contrib.layers.xavier_initializer())
166 | w2 = tf.get_variable("w2", shape=[256, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
167 |
168 | # 1x1 conv layer
169 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 1, 1, 1], padding="VALID"))
170 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 4, 4, 1], padding="VALID"))
171 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 2, 2, 1], padding="VALID"))
172 |
173 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]])
174 | l2 = tf.nn.relu(tf.matmul(l1, w1))
175 | self.a_pre = tf.nn.softmax(tf.matmul(l2, w2))
176 |
177 | self.log_p = tf.log(tf.clip_by_value(self.a_pre, 1e-10, 1.)) * self.Y
178 | self.log_lik = -self.log_p * self.adv
179 | self.loss = tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1))
180 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
181 | self.saver = tf.train.Saver()
182 |
183 | def get_action(self, state, max_prob):
184 | action_p = self.sess.run(self.a_pre, feed_dict={self.X: np.reshape(np.float32(state/255.),
185 | [-1,HEIGHT,WIDTH,HISTORY_SIZE])})
186 | # Sample an action according to the policy probabilities
187 | max_prob.append(np.max(action_p))
188 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
189 |
190 | return action
191 | # config = tf.ConfigProto(device_count ={'GPU' : 0})
192 | def main():
193 | with tf.Session() as sess:
194 | PGagent = PolicyGradient(sess, INPUT, OUTPUT)
195 |
196 | PGagent.sess.run(tf.global_variables_initializer())
197 |
198 | episode = 0
199 | recent_rlist = deque(maxlen=100)
200 | recent_rlist.append(0)
201 |
202 | no_life_game = False
203 | # Train until the mean of the last 100 scores exceeds 195
204 | while np.mean(recent_rlist) <= 195:
205 | episode += 1
206 |
207 | state_memory = deque()
208 | action_memory = deque()
209 | reward_memory = deque()
210 |
211 | history = np.zeros([84, 84, HISTORY_SIZE+1], dtype=np.uint8)
212 | rall, count = 0, 0
213 | done = False
214 |
215 | s = env.reset()
216 | max_prob = deque()
217 | get_init_state(history, s)
218 | start_lives = 0
219 | while not done:
220 | #env.render()
221 | count += 1
222 | # Select an action
223 | action = PGagent.get_action(history[:,:,:HISTORY_SIZE], max_prob)
224 |
225 | # One-hot encode the action
226 | y = np.zeros(OUTPUT)
227 | y[action] = 1
228 |
229 | # Reduce the action set to speed up learning
230 | if action == 0:
231 | real_a = 1
232 | elif action == 1:
233 | real_a = 4
234 | else:
235 | real_a = 5
236 |
237 | s1, reward, done, l = env.step(real_a)
238 |
239 | ter = done
240 | rall += reward
241 | reward = np.clip(reward, -1, 1)
242 |
243 | # Determine whether the game uses lives
244 | no_life_game, start_lives = get_game_type(count, l, no_life_game, start_lives)
245 |
246 | # Mark terminal when a life is lost or a negative reward is received
247 | ter, start_lives = get_terminal(start_lives, l, reward, no_life_game, ter)
248 |
249 | # Negative reward on death so the agent can learn from it
250 | if ter:
251 | reward = -1
252 |
253 | state_memory.append(np.copy(np.float32(history[:,:,:HISTORY_SIZE]/255.)))
254 | action_memory.append(np.copy(y))
255 | reward_memory.append(np.copy(reward))
256 |
257 | # Put the new frame at the end of the history
258 | history[:, :, HISTORY_SIZE] = pre_proc(s1)
259 | history[:, :, :HISTORY_SIZE] = history[:, :, 1:]
260 |
261 | # Train when the episode ends
262 | if done:
263 | rewards = discount_rewards(np.vstack(reward_memory))
264 |
265 | l = train_episodic(PGagent, np.stack(state_memory, axis=0),
266 | np.stack(action_memory, axis =0), rewards)
267 |
268 | recent_rlist.append(rall)
269 |
270 | print("[Episode {0:6d}] Step:{4:6d} Reward: {1:4f} Loss: {2:5.5f} Recent Reward: {3:4f} Max Prob: {5:5.5f}".
271 | format(episode, rall, l, np.mean(recent_rlist), count, np.mean(max_prob)))
272 |
273 | if episode % 10 == 0:
274 | PGagent.saver.save(PGagent.sess, model_path, global_step= episode)
275 |
276 |
277 | if __name__ == "__main__":
278 | main()
279 |
280 |
281 |
282 |
283 |
284 |
--------------------------------------------------------------------------------
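
The return computation in discount_rewards above resets the running sum at every negative reward (a lost life) and then standardizes the result. A standalone NumPy sketch with a short, hand-checkable reward sequence:

import numpy as np

DISCOUNT = 0.99

def discount_rewards(r):
    # Same scheme as above: reset the running return at negative
    # rewards, then normalize to zero mean and unit variance.
    discounted_r = np.zeros_like(r, dtype=np.float32)
    running_add = 0.0
    for t in reversed(range(len(r))):
        if r[t] < 0:
            running_add = 0.0
        running_add = running_add * DISCOUNT + r[t]
        discounted_r[t] = running_add
    return (discounted_r - discounted_r.mean()) / discounted_r.std()

rewards = np.array([0., 1., 0., -1., 0.])   # a point at t=1, a lost life at t=3
print(discount_rewards(rewards))
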
/Breakout/Play_DQN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Referenced code from Sung Kim ( https://github.com/hunkim/ReinforcementZeroToAll/blob/master/07_3_dqn_2015_cartpole.py )
3 | # and Taehoon Kim ( https://github.com/devsisters/DQN-tensorflow ).
4 | # Thank you!
5 | #
6 | import tensorflow as tf
7 | import gym
8 |
9 | import numpy as np
10 | import random as ran
11 | import datetime
12 | import matplotlib.pyplot as plt
13 |
14 | from collections import deque
15 | from skimage.transform import resize
16 | from skimage.color import rgb2gray
17 |
18 | plt.ion()
19 | # DQN paper setting(frameskip = 4, repeat_action_probability = 0)
20 | # {}Deterministic : frameskip = 4
21 | # {}-v4 : repeat_action_probability
22 | env = gym.make('BreakoutDeterministic-v4')
23 |
24 | # Hyperparameters
25 | MINIBATCH_SIZE = 32
26 | HISTORY_SIZE = 4
27 | TRAIN_START = 50000
28 | FINAL_EXPLORATION = 0.1
29 | TARGET_UPDATE = 10000
30 | MEMORY_SIZE = 400000
31 | EXPLORATION = 1000000
32 | START_EXPLORATION = 1.
33 | INPUT = env.observation_space.shape
34 | OUTPUT = env.action_space.n
35 | HEIGHT = 84
36 | WIDTH = 84
37 | LEARNING_RATE = 0.00025
38 | DISCOUNT = 0.99
39 | EPSILON = 0.01
40 | MOMENTUM = 0.95
41 |
42 | # Path to the trained model
43 | model_path = "save/Breakout.ckpt"
44 |
45 |
46 | def cliped_error(error):
47 | '''Clip the error using the Huber loss.
48 |
49 | Args:
50 | error(tensor): tensor to be clipped
51 |
52 | Returns:
53 | tensor: error, quadratic within -1 ~ 1 and linear outside
54 | '''
55 | return tf.where(tf.abs(error) < 1.0, 0.5 * tf.square(error), tf.abs(error) - 0.5)
56 |
57 |
58 | def pre_proc(X):
59 | '''Preprocess an input frame.
60 |
61 | Args:
62 | X(np.array): raw frame; grayscaled, resized to 84x84,
63 | and multiplied by 255 so it can be stored as an integer (to save memory)
64 |
65 | Returns:
66 | np.array: the preprocessed frame
67 | '''
68 | # Remove flickering by taking the max over the previous frame
69 | # x = np.maximum(X, X1)
70 | # Grayscale and resize to shrink the input
71 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
72 | return x
73 |
74 |
75 | def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
76 | '''Copy the main network's weights to the target network.
77 |
78 | Args:
79 | dest_scope_name="target"(DQN): variable scope named 'target' (destination)
80 | src_scope_name="main"(DQN): variable scope named 'main' (source)
81 |
82 | Returns:
83 | list: assign ops that copy main's trainable variables into target
84 | '''
85 | op_holder = []
86 |
87 | src_vars = tf.get_collection(
88 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
89 | dest_vars = tf.get_collection(
90 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
91 |
92 | for src_var, dest_var in zip(src_vars, dest_vars):
93 | op_holder.append(dest_var.assign(src_var.value()))
94 |
95 | return op_holder
96 |
97 |
98 | def get_init_state(history, s):
99 | '''Initialize the state at the start of an episode.
100 |
101 | Args:
102 | history(np.array): array that will hold the 5 stacked frames
103 | s(list): the reset observation
104 |
105 | Note:
106 | The first HISTORY_SIZE slots of history are all filled with the preprocessed reset frame (s)
107 | '''
108 | for i in range(HISTORY_SIZE):
109 | history[:, :, i] = pre_proc(s)
110 |
111 |
112 | def get_game_type(count, l, no_life_game, start_live):
113 | '''Determine whether the game uses lives.
114 |
115 | Args:
116 | count(int): step counter, used to detect the first frame of the episode
117 | l(dict): info dict holding the life count, e.g. l['ale.lives']
118 | no_life_game(bool): flag to be returned, describing whether the game has lives
119 | start_live(int): used to initialize the life count when the game has lives
120 |
121 | Returns:
122 | list:
123 | no_life_game(bool): True if the game has no lives, False otherwise
124 | start_live(int): the initial life count when the game has lives
125 | '''
126 | if count == 1:
127 | start_live = l['ale.lives']
128 | # If the initial life count is 0, the game has no lives
129 | if start_live == 0:
130 | no_life_game = True
131 | else:
132 | no_life_game = False
133 | return [no_life_game, start_live]
134 |
135 |
136 | def get_terminal(start_live, l, reward, no_life_game, ter):
137 | '''Mark the transition as terminal when a life is lost or a negative reward is received.
138 |
139 | Args:
140 | start_live(int): current life count, for games that have lives
141 | l(dict): life info of the next frame, used to check whether a life was lost
142 | no_life_game(bool): game type; if the game has no lives, a negative reward triggers terminal handling
143 | ter(bool): flag that stores the terminal state
144 |
145 | Returns:
146 | list:
147 | ter(bool): the terminal state
148 | start_live(int): life count updated after a lost life
149 | '''
150 | if no_life_game:
151 | # Game without lives: treat a negative reward as terminal
152 | if reward < 0:
153 | ter = True
154 | else:
155 | # Game with lives: treat a lost life as terminal
156 | if start_live > l['ale.lives']:
157 | ter = True
158 | start_live = l['ale.lives']
159 |
160 | return [ter, start_live]
161 |
162 |
163 | def train_minibatch(mainDQN, targetDQN, minibatch):
164 | '''Train the main network on a minibatch of sampled transitions.
165 |
166 | Args:
167 | mainDQN(object): the main network
168 | targetDQN(object): the target network
169 | minibatch: MINIBATCH_SIZE transitions sampled at random from replay_memory
170 |
171 | Note:
172 | The sampled transitions are used to update the main network.
173 | '''
174 | s_stack = []
175 | a_stack = []
176 | r_stack = []
177 | s1_stack = []
178 | d_stack = []
179 |
180 | for s_r, a_r, r_r, d_r in minibatch:
181 | s_stack.append(s_r[:, :, :4])
182 | a_stack.append(a_r)
183 | r_stack.append(r_r)
184 | s1_stack.append(s_r[:, :, 1:])
185 | d_stack.append(d_r)
186 |
187 | # Convert True/False to 1/0
188 | d_stack = np.array(d_stack) + 0
189 |
190 | Q1 = targetDQN.get_q(np.array(s1_stack))
191 |
192 | y = r_stack + (1 - d_stack) * DISCOUNT * np.max(Q1, axis=1)
193 |
194 | # Train the main network toward the updated target Q values
195 | mainDQN.sess.run(mainDQN.train, feed_dict={mainDQN.X: np.float32(np.array(s_stack) / 255.), mainDQN.Y: y,
196 | mainDQN.a: a_stack})
197 |
198 |
199 | # Plot training statistics
200 | def plot_data(epoch, epoch_score, average_reward, epoch_Q, average_Q, mainDQN):
201 | plt.clf()
202 | epoch_score.append(np.mean(average_reward))
203 | epoch_Q.append(np.mean(average_Q))
204 |
205 | plt.subplot(211)
206 | plt.axis([0, epoch, 0, np.max(epoch_Q) * 6 / 5])
207 | plt.xlabel('Training Epochs')
208 | plt.ylabel('Average Action Value(Q)')
209 | plt.plot(epoch_Q)
210 |
211 | plt.subplot(212)
212 | plt.axis([0, epoch, 0, np.max(epoch_score) * 6 / 5])
213 | plt.xlabel('Training Epochs')
214 | plt.ylabel('Average Reward per Episode')
215 | plt.plot(epoch_score, "r")
216 |
217 | plt.pause(0.05)
218 | plt.savefig("graph/{} epoch".format(epoch - 1))
219 |
220 | save_path = mainDQN.saver.save(mainDQN.sess, model_path, global_step=(epoch - 1))
221 | print("Model(epoch :", epoch, ") saved in file: ", save_path, " Now time : ", datetime.datetime.now())
222 |
223 |
224 | # DQN
225 | class DQNAgent:
226 | def __init__(self, sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main'):
227 | self.sess = sess
228 | self.height = HEIGHT
229 | self.width = WIDTH
230 | self.history_size = HISTORY_SIZE
231 | self.output = OUTPUT
232 | self.name = NAME
233 |
234 | self.build_network()
235 |
236 | def build_network(self):
237 | with tf.variable_scope(self.name):
238 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size])
239 | self.Y = tf.placeholder('float', [None])
240 | self.a = tf.placeholder('int64', [None])
241 |
242 | f1 = tf.get_variable("f1", shape=[8, 8, 4, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
243 | f2 = tf.get_variable("f2", shape=[4, 4, 32, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
244 | f3 = tf.get_variable("f3", shape=[3, 3, 64, 64], initializer=tf.contrib.layers.xavier_initializer_conv2d())
245 | w1 = tf.get_variable("w1", shape=[7 * 7 * 64, 512], initializer=tf.contrib.layers.xavier_initializer())
246 | w2 = tf.get_variable("w2", shape=[512, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
247 |
248 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 4, 4, 1], padding="VALID"))
249 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 2, 2, 1], padding="VALID"))
250 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 1, 1, 1], padding='VALID'))
251 |
252 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]])
253 | l2 = tf.nn.relu(tf.matmul(l1, w1))
254 |
255 | self.Q_pre = tf.matmul(l2, w2)
256 |
257 | a_one_hot = tf.one_hot(self.a, self.output, 1.0, 0.0)
258 | q_val = tf.reduce_sum(tf.multiply(self.Q_pre, a_one_hot), reduction_indices=1)
259 |
260 | # Clip the error with the Huber loss
261 | error = cliped_error(self.Y - q_val)
262 |
263 | self.loss = tf.reduce_mean(error)
264 |
265 | optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=EPSILON)
266 | self.train = optimizer.minimize(self.loss)
267 |
268 | self.saver = tf.train.Saver(max_to_keep=None)
269 |
270 | def get_q(self, history):
271 | return self.sess.run(self.Q_pre, feed_dict={self.X: np.reshape(np.float32(history / 255.),
272 | [-1, 84, 84, 4])})
273 |
274 | def get_action(self, q, e):
275 | if e > np.random.rand(1):
276 | action = np.random.randint(self.output)
277 | else:
278 | action = np.argmax(q)
279 | return action
280 |
281 |
282 | def main():
283 | with tf.Session() as sess:
284 | mainDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='main')
285 | targetDQN = DQNAgent(sess, HEIGHT, WIDTH, HISTORY_SIZE, OUTPUT, NAME='target')
286 |
287 | mainDQN.saver.restore(sess, model_path)
288 | recent_rlist = deque(maxlen=100)
289 | e = 1.
290 | episode, epoch, frame = 0, 0, 0
291 |
292 | average_Q, average_reward = deque(), deque()
293 |
294 | # Run episodes for up to 200 epochs
295 | while epoch <= 200:
296 | episode += 1
297 |
298 | history = np.zeros([84, 84, 5], dtype=np.uint8)
299 | rall, count = 0, 0
300 | d = False
301 | s = env.reset()
302 |
303 | get_init_state(history, s)
304 | while not d:
305 | # env.render()
306 |
307 | frame += 1
308 | count += 1
309 |
310 | # e-greedy(for test)
311 | e = 0.05
312 |
313 | # Predict Q values from the first 4 frames of the history
314 | Q = mainDQN.get_q(history[:, :, :4])
315 | average_Q.append(np.max(Q))
316 |
317 | # Select an action
318 | action = mainDQN.get_action(Q, e)
319 |
320 | # s1 : next frame / r : reward / d : done(terminal) / l : info(lives)
321 | s1, r, d, l = env.step(action)
322 |
323 | # Put the new frame at the end of the history
324 | history[:, :, 4] = pre_proc(s1)
325 |
326 | history[:, :, :4] = history[:, :, 1:]
327 |
328 | rall += r
329 |
330 | recent_rlist.append(rall)
331 |
332 | average_reward.append(rall)
333 |
334 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | "
335 | "Avg_Max_Q:{5:2.5f} | Recent reward:{6:.5f} ".format(episode, frame, count, rall, e,
336 | np.mean(average_Q),
337 | np.mean(recent_rlist)))
338 |
339 |
340 | if __name__ == "__main__":
341 | main()
342 |
343 |
344 |
345 |
346 |
347 |
--------------------------------------------------------------------------------
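
All of the DQN scripts above build the regression target as y = r + (1 - done) * DISCOUNT * max_a' Q_target(s', a'), so terminal transitions bootstrap nothing. A minimal NumPy sketch of that target for a toy batch (illustrative numbers only):

import numpy as np

DISCOUNT = 0.99

def build_targets(rewards, dones, q_next):
    # rewards: (batch,), dones: (batch,) of 0/1,
    # q_next: (batch, n_actions) from the target network.
    return rewards + (1 - dones) * DISCOUNT * np.max(q_next, axis=1)

rewards = np.array([1.0, 0.0, -1.0])
dones = np.array([0, 0, 1])
q_next = np.array([[0.5, 2.0],
                   [1.5, 1.0],
                   [3.0, 0.0]])
print(build_targets(rewards, dones, q_next))   # [2.98  1.485 -1.  ]
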
/Breakout/breakout_dqn_pytorch.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import torch
4 | import pylab
5 | import random
6 | import numpy as np
7 | from collections import deque
8 | from datetime import datetime
9 | from copy import deepcopy
10 | from skimage.transform import resize
11 | from skimage.color import rgb2gray
12 | import torch.nn as nn
13 | import torch.optim as optim
14 | import torch.nn.functional as F
15 | from torch.autograd import Variable
16 |
17 |
18 | def find_max_lifes(env):
19 | env.reset()
20 | _, _, _, info = env.step(0)
21 | return info['ale.lives']
22 |
23 |
24 | def check_live(life, cur_life):
25 | if life > cur_life:
26 | return True
27 | else:
28 | return False
29 |
30 |
31 | def pre_proc(X):
32 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
33 | return x
34 |
35 |
36 | def get_init_state(history, s):
37 | for i in range(HISTORY_SIZE):
38 | history[i, :, :] = pre_proc(s)
39 |
40 |
41 | class Flatten(nn.Module):
42 | def forward(self, input):
43 | return input.view(input.size(0), -1)
44 |
45 |
46 | # approximate Q function using Neural Network
47 | # state is input and Q Value of each action is output of network
48 | class DQN(nn.Module):
49 | def __init__(self, action_size):
50 | super(DQN, self).__init__()
51 | self.fc = nn.Sequential(
52 | nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4),
53 | nn.ReLU(),
54 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
55 | nn.ReLU(),
56 | nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
57 | nn.ReLU(),
58 | Flatten(),
59 | nn.Linear(7 * 7 * 64, 512),
60 | nn.ReLU(),
61 | nn.Linear(512, action_size)
62 | )
63 |
64 | def forward(self, x):
65 | return self.fc(x)
66 |
67 |
68 | # DQN agent for Breakout
69 | # it uses a neural network to approximate the Q function,
70 | # plus a replay memory and a target Q network
71 | class DQNAgent():
72 | def __init__(self, action_size):
73 | # if you want to watch the agent learning, then change to True
74 | self.render = False
75 | self.load_model = False
76 |
77 | # get size of action
78 | self.action_size = action_size
79 |
80 | # These are hyper parameters for the DQN
81 | self.discount_factor = 0.99
82 | self.learning_rate = 0.0001
83 | self.memory_size = 1000000
84 | self.epsilon = 1.0
85 | self.epsilon_min = 0.02
86 | self.explore_step = 1000000
87 | self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
88 | self.batch_size = 32
89 | self.train_start = 100000
90 | self.update_target = 1000
91 |
92 | # create replay memory using deque
93 | self.memory = deque(maxlen=self.memory_size)
94 |
95 | # create main model and target model
96 | self.model = DQN(action_size)
97 | self.model.cuda()
98 | self.model.apply(self.weights_init)
99 | self.target_model = DQN(action_size)
100 | self.target_model.cuda()
101 |
102 | # self.optimizer = optim.RMSprop(params=self.model.parameters(),lr=self.learning_rate, eps=0.01, momentum=0.95)
103 | self.optimizer = optim.Adam(params=self.model.parameters(), lr=self.learning_rate)
104 |
105 | # initialize target model
106 | self.update_target_model()
107 |
108 | if self.load_model:
109 | self.model = torch.load('save_model/breakout_dqn')
110 |
111 | # weight xavier initialize
112 | def weights_init(self, m):
113 | classname = m.__class__.__name__
114 | if classname.find('Linear') != -1:
115 | torch.nn.init.xavier_uniform(m.weight)
116 | print(m)
117 | elif classname.find('Conv') != -1:
118 | torch.nn.init.xavier_uniform(m.weight)
119 | print(m)
120 |
121 | # after some time interval update the target model to be same with model
122 | def update_target_model(self):
123 | self.target_model.load_state_dict(self.model.state_dict())
124 |
125 | # get action from model using epsilon-greedy policy
126 | def get_action(self, state):
127 | if np.random.rand() <= self.epsilon:
128 | return random.randrange(self.action_size)
129 | else:
130 | state = torch.from_numpy(state).unsqueeze(0)
131 | state = Variable(state).float().cuda()
132 | action = self.model(state).data.cpu().max(1)[1]
133 | return int(action)
134 |
135 | # save sample to the replay memory
136 | def append_sample(self, history, action, reward, done):
137 | self.memory.append((history, action, reward, done))
138 |
139 | def get_sample(self, frame):
140 | mini_batch = []
141 | if frame >= self.memory_size:
142 | sample_range = self.memory_size
143 | else:
144 | sample_range = frame
145 |
146 | # history size
147 | sample_range -= (HISTORY_SIZE + 1)
148 |
149 | idx_sample = random.sample(range(sample_range), self.batch_size)
150 | for i in idx_sample:
151 | sample = []
152 | for j in range(HISTORY_SIZE + 1):
153 | sample.append(self.memory[i + j])
154 |
155 | sample = np.array(sample)
156 | mini_batch.append((np.stack(sample[:, 0], axis=0), sample[3, 1], sample[3, 2], sample[3, 3]))
157 |
158 | return mini_batch
159 |
160 | # pick samples randomly from replay memory (with batch_size)
161 | def train_model(self, frame):
162 | if self.epsilon > self.epsilon_min:
163 | self.epsilon -= self.epsilon_decay
164 |
165 | mini_batch = self.get_sample(frame)
166 | mini_batch = np.array(mini_batch).transpose()
167 |
168 | history = np.stack(mini_batch[0], axis=0)
169 | states = np.float32(history[:, :4, :, :]) / 255.
170 | actions = list(mini_batch[1])
171 | rewards = list(mini_batch[2])
172 | next_states = np.float32(history[:, 1:, :, :]) / 255.
173 | dones = mini_batch[3]
174 |
175 | # bool to binary
176 | dones = dones.astype(int)
177 |
178 | # Q function of current state
179 | states = torch.Tensor(states)
180 | states = Variable(states).float().cuda()
181 | pred = self.model(states)
182 |
183 | # one-hot encoding
184 | a = torch.LongTensor(actions).view(-1, 1)
185 |
186 | one_hot_action = torch.FloatTensor(self.batch_size, self.action_size).zero_()
187 | one_hot_action.scatter_(1, a, 1)
188 |
189 | pred = torch.sum(pred.mul(Variable(one_hot_action).cuda()), dim=1)
190 |
191 | # Q function of next state
192 | next_states = torch.Tensor(next_states)
193 | next_states = Variable(next_states).float().cuda()
194 | next_pred = self.target_model(next_states).data.cpu()
195 |
196 | rewards = torch.FloatTensor(rewards)
197 | dones = torch.FloatTensor(dones)
198 |
199 | # Q Learning: get maximum Q value at s' from target model
200 | target = rewards + (1 - dones) * self.discount_factor * next_pred.max(1)[0]
201 | target = Variable(target).cuda()
202 |
203 | self.optimizer.zero_grad()
204 |
205 | # Huber (smooth L1) loss
206 | loss = F.smooth_l1_loss(pred, target)
207 | loss.backward()
208 |
209 | # and train
210 | self.optimizer.step()
211 |
212 |
213 | if __name__ == "__main__":
214 | EPISODES = 500000
215 | HEIGHT = 84
216 | WIDTH = 84
217 | HISTORY_SIZE = 4
218 |
219 | env = gym.make('BreakoutDeterministic-v4')
220 | max_life = find_max_lifes(env)
221 | state_size = env.observation_space.shape
222 | # action_size = env.action_space.n
223 | action_size = 3
224 | scores, episodes = [], []
225 | agent = DQNAgent(action_size)
226 | recent_reward = deque(maxlen=100)
227 | frame = 0
228 | memory_size = 0
229 | for e in range(EPISODES):
230 | done = False
231 | score = 0
232 |
233 | history = np.zeros([5, 84, 84], dtype=np.uint8)
234 | step = 0
235 | d = False
236 | state = env.reset()
237 | life = max_life
238 |
239 | get_init_state(history, state)
240 |
241 | while not done:
242 | step += 1
243 | frame += 1
244 | if agent.render:
245 | env.render()
246 |
247 | # get action for the current state and go one step in environment
248 | action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
249 |
250 | next_state, reward, done, info = env.step(action + 1)
251 |
252 | pre_proc_next_state = pre_proc(next_state)
253 | history[4, :, :] = pre_proc_next_state
254 | ter = check_live(life, info['ale.lives'])
255 |
256 | life = info['ale.lives']
257 | r = np.clip(reward, -1, 1)
258 |
259 | # save the sample to the replay memory
260 | agent.append_sample(deepcopy(pre_proc_next_state), action, r, ter)
261 | # every time step do the training
262 | if frame >= agent.train_start:
263 | agent.train_model(frame)
264 | if frame % agent.update_target == 0:
265 | agent.update_target_model()
266 | score += reward
267 | history[:4, :, :] = history[1:, :, :]
268 |
269 | if frame % 50000 == 0:
270 | print('now time : ', datetime.now())
271 | scores.append(score)
272 | episodes.append(e)
273 | pylab.plot(episodes, scores, 'b')
274 | pylab.savefig("./save_graph/breakout_dqn.png")
275 |
276 | if done:
277 | recent_reward.append(score)
278 | # every episode, plot the play time
279 | print("episode:", e, " score:", score, " memory length:",
280 | len(agent.memory), " epsilon:", agent.epsilon, " steps:", step,
281 | " recent reward:", np.mean(recent_reward))
282 |
283 | # if the mean score of the last 100 episodes is greater than 50,
284 | # save the model and stop training
285 | if np.mean(recent_reward) > 50:
286 | torch.save(agent.model, "./save_model/breakout_dqn")
287 | sys.exit()
288 |
--------------------------------------------------------------------------------
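
Unlike the TensorFlow scripts, this PyTorch agent stores one preprocessed frame per step and rebuilds each 4-frame history in get_sample from HISTORY_SIZE + 1 consecutive memory entries. A toy NumPy sketch of that reconstruction (1x1 frames and made-up memory contents, purely for illustration):

import numpy as np
from collections import deque

HISTORY_SIZE = 4

# Toy memory: each entry is (frame, action, reward, done) with a 1x1 frame.
memory = deque((np.full((1, 1), t, dtype=np.uint8), t % 3, 0.0, False)
               for t in range(10))

def rebuild(idx):
    # Read HISTORY_SIZE + 1 consecutive entries; the stacked frames give
    # the state (first 4) and next state (last 4), and action/reward/done
    # come from the transition between them (index HISTORY_SIZE - 1).
    window = [memory[idx + j] for j in range(HISTORY_SIZE + 1)]
    frames = np.stack([w[0] for w in window], axis=0)
    _, action, reward, done = window[HISTORY_SIZE - 1]
    return frames[:HISTORY_SIZE], frames[1:], action, reward, done

state, next_state, action, reward, done = rebuild(2)
print(state.squeeze(), next_state.squeeze())   # [2 3 4 5] [3 4 5 6]
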
/CartPole/CartPole_A2C_episodic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 |
7 | env = gym.make('CartPole-v0')
8 |
9 | # Hyperparameters
10 | LEARNING_RATE = 0.005
11 | INPUT = env.observation_space.shape[0]
12 | OUTPUT = env.action_space.n
13 | DISCOUNT = 0.99
14 |
15 |
16 | def discount_rewards(r):
17 | '''Compute discounted rewards.
18 |
19 | Args:
20 | r(np.array): array of rewards
21 |
22 | Returns:
23 | discounted_r(np.array): array of discounted rewards
24 | '''
25 | discounted_r = np.zeros_like(r, dtype=np.float32)
26 | running_add = 0
27 | for t in reversed(range(len(r))):
28 | running_add = running_add * DISCOUNT + r[t]
29 | discounted_r[t] = running_add
30 |
31 | return discounted_r
32 |
33 |
34 | def train_episodic(A2Cagent, x, y, r):
35 | '''Train the network once per episode.
36 |
37 | Args:
38 | A2Cagent(ActorCritic): the network to be trained
39 | x(np.array): array of states
40 | y(np.array): array of one-hot actions
41 | r(np.array) : array of discounted rewards
42 |
43 | Returns:
44 | l(float): the network's loss
45 | '''
46 | l, _ = A2Cagent.sess.run([A2Cagent.loss, A2Cagent.train], feed_dict={A2Cagent.X: x, A2Cagent.Y: y, A2Cagent.r: r})
47 | return l
48 |
49 |
50 | def play_cartpole(A2Cagent):
51 | '''Play CartPole with the trained network.
52 |
53 | Args:
54 | A2Cagent(ActorCritic): the trained network
55 | '''
56 | print("Play Cartpole!")
57 | episode = 0
58 | while True:
59 | s = env.reset()
60 | done = False
61 | rall = 0
62 | episode += 1
63 | while not done:
64 | env.render()
65 | action_p = A2Cagent.get_action(s)
66 | s1, reward, done, _ = env.step(action_p)
67 | s = s1
68 | rall += reward
69 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall))
70 |
71 |
72 | class ActorCritic:
73 | def __init__(self, sess, input_size, output_size):
74 | self.sess = sess
75 | self.input_size = input_size
76 | self.output_size = output_size
77 |
78 | self.build_network()
79 |
80 | def build_network(self):
81 |
82 | self.X = tf.placeholder('float', [None, self.input_size])
83 | self.Y = tf.placeholder('float', [None, self.output_size])
84 |
85 | self.r = tf.placeholder('float')
86 |
87 | # Actor Weight
88 | w1_a = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
89 | w2_a = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer())
90 |
91 | # Critic Weight
92 | w1_c = tf.get_variable('w1_c', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
93 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer())
94 |
95 | # Actor Critic Network
96 | l1_a = tf.nn.relu(tf.matmul(self.X, w1_a))
97 | l1_c = tf.nn.relu(tf.matmul(self.X, w1_c))
98 | self.a_prob = tf.nn.softmax(tf.matmul(l1_a, w2_a))
99 | self.v = tf.matmul(l1_c, w2_c)
100 |
101 | # A_t = R_t - V(S_t)
102 | self.adv = self.r - self.v
103 |
104 | # Policy loss
105 | self.log_p = self.Y * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.))
106 | self.log_lik = self.log_p * tf.stop_gradient(self.adv)
107 | self.p_loss = -tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1))
108 |
109 | # entropy(for more exploration)
110 | self.entropy = -tf.reduce_mean(tf.reduce_sum(self.a_prob * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.)), axis=1))
111 |
112 | # Value loss
113 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1)
114 |
115 | # Total loss
116 | self.loss = self.p_loss + self.v_loss - self.entropy * 0.01
117 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
118 |
119 | def get_action(self, state):
120 | state_t = np.reshape(state, [1, self.input_size])
121 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t})
122 |
123 | # Sample an action according to the policy probabilities
124 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
125 |
126 | return action
127 |
128 |
129 | def main():
130 | with tf.Session() as sess:
131 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT)
132 |
133 | A2Cagent.sess.run(tf.global_variables_initializer())
134 | episode = 0
135 | recent_rlist = deque(maxlen=100)
136 | recent_rlist.append(0)
137 |
138 | # Train until the mean of the last 100 scores exceeds 195
139 | while np.mean(recent_rlist) <= 195:
140 | episode += 1
141 | episode_memory = deque()
142 | rall = 0
143 | s = env.reset()
144 | done = False
145 |
146 | while not done:
147 | # Select an action
148 | action = A2Cagent.get_action(s)
149 |
150 | # One-hot encode the action
151 | y = np.zeros(OUTPUT)
152 | y[action] = 1
153 |
154 | s1, reward, done, _ = env.step(action)
155 | rall += reward
156 |
157 | # Store the transition in the episode memory
158 | episode_memory.append([s, y, reward])
159 | s = s1
160 |
161 | # Train when the episode ends
162 | if done:
163 | episode_memory = np.array(episode_memory)
164 |
165 | discounted_rewards = discount_rewards(np.vstack(episode_memory[:, 2]))
166 |
167 | discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std())
168 |
169 | train_episodic(A2Cagent, np.vstack(episode_memory[:, 0]), np.vstack(episode_memory[:, 1]),
170 | discounted_rewards)
171 |
172 | recent_rlist.append(rall)
173 |
174 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f}".format(episode, rall, np.mean(recent_rlist)))
175 |
176 | play_cartpole(A2Cagent)
177 |
178 |
179 | if __name__ == "__main__":
180 | main()
181 |
182 |
183 |
184 |
185 |
186 |
--------------------------------------------------------------------------------
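
The ActorCritic loss above combines a policy-gradient term weighted by the advantage A_t = R_t - V(s_t), a squared-error value loss, and an entropy bonus scaled by 0.01. A small NumPy sketch of those three ingredients for a toy batch (illustrative numbers, and without the stop-gradient that the TensorFlow graph applies to the advantage):

import numpy as np

def a2c_loss(action_probs, actions_one_hot, returns, values, beta=0.01):
    adv = returns - values                      # A_t = R_t - V(s_t)
    log_p = actions_one_hot * np.log(np.clip(action_probs, 1e-10, 1.))
    policy_loss = -np.mean(np.sum(log_p, axis=1) * adv)
    value_loss = np.mean(np.square(values - returns))
    entropy = -np.mean(np.sum(action_probs *
                              np.log(np.clip(action_probs, 1e-10, 1.)), axis=1))
    return policy_loss + value_loss - beta * entropy

probs = np.array([[0.7, 0.3], [0.4, 0.6]])     # policy outputs
onehot = np.array([[1., 0.], [0., 1.]])        # taken actions
returns = np.array([1.5, 0.5])                 # discounted returns
values = np.array([1.0, 1.0])                  # critic estimates
print(a2c_loss(probs, onehot, returns, values))
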
/CartPole/CartPole_C51.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import math
3 | import random as ran
4 |
5 | import tensorflow as tf
6 | import gym
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 |
10 | from collections import deque
11 |
12 | env = gym.make('CartPole-v1')
13 |
14 | # Hyperparameters
15 | MINIBATCH_SIZE = 64
16 | TRAIN_START = 1000
17 | FINAL_EXPLORATION = 0.01
18 | TARGET_UPDATE = 1000
19 | MEMORY_SIZE = 50000
20 | EXPLORATION = 20000
21 | START_EXPLORATION = 1.
22 | INPUT = env.observation_space.shape[0]
23 | OUTPUT = env.action_space.n
24 | LEARNING_RATE = 0.001
25 | DISCOUNT = 0.99
26 | VMIN = -10
27 | VMAX = 40
28 | CATEGORY = 51
29 |
30 | model_path = "save/CartPole_C51.ckpt"
31 |
32 |
33 | def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
34 | '''Copy the main network's weights to the target network.
35 |
36 | Args:
37 | dest_scope_name="target"(DQN): variable scope named 'target' (destination)
38 | src_scope_name="main"(DQN): variable scope named 'main' (source)
39 |
40 | Returns:
41 | list: assign ops that copy main's trainable variables into target
42 | '''
43 | op_holder = []
44 |
45 | src_vars = tf.get_collection(
46 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
47 | dest_vars = tf.get_collection(
48 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
49 |
50 | for src_var, dest_var in zip(src_vars, dest_vars):
51 | op_holder.append(dest_var.assign(src_var.value()))
52 |
53 | return op_holder
54 |
55 |
56 | def train_minibatch(mainC51, targetC51, minibatch):
57 | '''Train the main network on a minibatch of sampled transitions.
58 |
59 | Args:
60 | mainC51(object): the main network
61 | targetC51(object): the target network
62 | minibatch: MINIBATCH_SIZE transitions sampled at random from replay_memory
63 |
64 | Note:
65 | The sampled transitions are used to update the main network.
66 | '''
67 | s_stack = []
68 | a_stack = []
69 | r_stack = []
70 | s1_stack = []
71 | d_stack = []
72 | m_prob = [np.zeros((len(minibatch), mainC51.category_size)) for _ in range(OUTPUT)]
73 |
74 | for s_r, a_r, r_r, d_r, s1_r in minibatch:
75 | s_stack.append(s_r)
76 | a_stack.append(a_r)
77 | r_stack.append(r_r)
78 | s1_stack.append(s1_r)
79 | d_stack.append(d_r)
80 |
81 | # Categorical Algorithm
82 | target_sum_q = targetC51.sess.run(targetC51.soft_dist_Q, feed_dict={targetC51.X: np.vstack(s1_stack)})
83 |
84 | # Get optimal action
85 | sum_q = mainC51.optimal_action(s1_stack)
86 | sum_q = sum_q.reshape([len(minibatch), OUTPUT], order='F')
87 | optimal_actions = np.argmax(sum_q, axis=1)
88 |
89 | for i in range(len(minibatch)):
90 | if d_stack[i]:
91 | # Compute the projection of Tz
92 | Tz = min(VMAX, max(VMIN, r_stack[i]))
93 | bj = (Tz - VMIN) / mainC51.delta_z
94 | m_l, m_u = math.floor(bj), math.ceil(bj)
95 |
96 | # Distribute probability Tz
97 | m_prob[a_stack[i]][i][int(m_l)] += (m_u - bj)
98 | m_prob[a_stack[i]][i][int(m_u)] += (bj - m_l)
99 | else:
100 | for j in range(mainC51.category_size):
101 | # Compute the projection of Tz
102 | Tz = min(VMAX, max(VMIN, r_stack[i] + DISCOUNT * mainC51.z[j]))
103 | bj = (Tz - VMIN) / mainC51.delta_z
104 | m_l, m_u = math.floor(bj), math.ceil(bj)
105 |
106 | # Distribute probability Tz
107 | m_prob[a_stack[i]][i][int(m_l)] += (m_u - bj) * target_sum_q[optimal_actions[i]][i][j]
108 | m_prob[a_stack[i]][i][int(m_u)] += (bj - m_l) * target_sum_q[optimal_actions[i]][i][j]
109 |
110 | mainC51.sess.run(mainC51.train, feed_dict={mainC51.X: np.vstack(s_stack), mainC51.Y: m_prob})
111 |
112 |
113 | class C51Agent:
114 | def __init__(self, sess, INPUT, OUTPUT, VMAX, VMIN, CATEGORY, NAME='main'):
115 | self.sess = sess
116 |
117 | self.input_size = INPUT
118 | self.output_size = OUTPUT
119 | self.category_size = CATEGORY
120 | self.delta_z = (VMAX - VMIN) / float(self.category_size - 1)
121 | self.z = [VMIN + i * self.delta_z for i in range(self.category_size)]
122 | self.name = NAME
123 |
124 | self.build_network()
125 |
126 | def build_network(self):
127 | with tf.variable_scope(self.name):
128 | self.X = tf.placeholder('float', [None, self.input_size])
129 | self.Y = tf.placeholder('float', [2, None, self.category_size])
130 |
131 | self.dist_Q = []
132 |
133 | w1 = tf.get_variable("w1", shape=[self.input_size, 256], initializer=tf.contrib.layers.xavier_initializer())
134 |
135 | # Output weight
136 | for i in range(self.output_size):
137 | exec(
138 | 'w2_%s = tf.get_variable("w2_%s", shape=[256, self.category_size], initializer=tf.contrib.layers.xavier_initializer())' % (
139 | i, i))
140 |
141 | l1 = tf.nn.selu(tf.matmul(self.X, w1))
142 | # Output Layer
143 | for i in range(self.output_size):
144 | exec('self.dist_Q.append(tf.matmul(l1, w2_%s))' % i)
145 |
146 | self.soft_dist_Q = tf.nn.softmax(self.dist_Q)
147 | self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.Y, logits=self.dist_Q))
148 | optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
149 | self.train = optimizer.minimize(self.loss)
150 |
151 | self.saver = tf.train.Saver(max_to_keep=None)
152 |
153 | def get_action(self, state, e):
154 | if e > np.random.rand(1):
155 | action = np.random.randint(self.output_size)
156 | else:
157 | sum_q = self.optimal_action(state)
158 | action = np.argmax(sum_q)
159 | return action
160 |
161 | def optimal_action(self, state):
162 | state = np.vstack(state)
163 | state = state.reshape([-1, self.input_size])
164 | z = self.sess.run(self.soft_dist_Q, feed_dict={self.X: state})
165 | z_stack = np.vstack(z)
166 | sum_q = np.sum(np.multiply(z_stack, np.array(self.z)), axis=1)
167 | return sum_q
168 |
169 |
170 | def main():
171 | with tf.Session() as sess:
172 | mainC51 = C51Agent(sess, INPUT, OUTPUT, VMAX, VMIN, CATEGORY, NAME='main')
173 | targetC51 = C51Agent(sess, INPUT, OUTPUT, VMAX, VMIN, CATEGORY, NAME='target')
174 |
175 | sess.run(tf.global_variables_initializer())
176 |
177 | # initial copy q_net -> target_net
178 | copy_ops = get_copy_var_ops(dest_scope_name="target",
179 | src_scope_name="main")
180 | sess.run(copy_ops)
181 |
182 | recent_rlist = deque(maxlen=100)
183 | recent_rlist.append(0)
184 | e = 1.
185 | episode, epoch, frame = 0, 0, 0
186 |
187 | replay_memory = deque(maxlen=MEMORY_SIZE)
188 |
189 | # Train agent
190 | while np.mean(recent_rlist) <= 495:
191 | episode += 1
192 |
193 | rall, count = 0, 0
194 | d = False
195 | s = env.reset()
196 |
197 | while not d:
198 | frame += 1
199 | count += 1
200 |
201 | # e-greedy
202 | if e > FINAL_EXPLORATION and frame > TRAIN_START:
203 | e -= (START_EXPLORATION - FINAL_EXPLORATION) / EXPLORATION
204 |
205 |                 # Select an action
206 | action = mainC51.get_action(s, e)
207 |
208 |                 # s1: next state / r: reward / d: done(terminal) / l: info
209 | s1, r, d, l = env.step(action)
210 | if d and count < env.spec.timestep_limit:
211 | reward = -1
212 | else:
213 | reward = r
214 |
215 | replay_memory.append((s, action, reward, d, s1))
216 | s = s1
217 |
218 | rall += r
219 |
220 | if frame > TRAIN_START:
221 | minibatch = ran.sample(replay_memory, MINIBATCH_SIZE)
222 | train_minibatch(mainC51, targetC51, minibatch)
223 |
224 | if frame % TARGET_UPDATE == 0:
225 | copy_ops = get_copy_var_ops(dest_scope_name="target",
226 | src_scope_name="main")
227 | sess.run(copy_ops)
228 |
229 | recent_rlist.append(rall)
230 |
231 | print("Episode:{0:6d} | Frames:{1:9d} | Steps:{2:5d} | Reward:{3:3.0f} | e-greedy:{4:.5f} | "
232 | "Recent reward:{5:.5f} ".format(episode, frame, count, rall, e,
233 | np.mean(recent_rlist)))
234 |
235 |
236 | if __name__ == "__main__":
237 | main()
238 |
--------------------------------------------------------------------------------
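
The heart of train_minibatch above is the categorical (C51) projection of the shifted support r + gamma*z onto the fixed atoms. A self-contained NumPy sketch of that projection for a single transition, assuming the same VMIN/VMAX/CATEGORY settings as the script (all other names and values are illustrative):

import numpy as np

VMIN, VMAX, ATOMS, GAMMA = -10.0, 40.0, 51, 0.99
delta_z = (VMAX - VMIN) / (ATOMS - 1)
z = VMIN + np.arange(ATOMS) * delta_z              # fixed support (atoms)

def project(reward, done, next_dist):
    """Project the target distribution of r + gamma*z onto the fixed atoms."""
    m = np.zeros(ATOMS)
    # A terminal transition collapses to a point mass at the (clipped) reward.
    targets = np.atleast_1d(reward if done else reward + GAMMA * z)
    probs = np.atleast_1d(1.0 if done else next_dist)
    for Tz, p in zip(targets, probs):
        Tz = np.clip(Tz, VMIN, VMAX)
        bj = (Tz - VMIN) / delta_z                 # fractional atom index
        l, u = int(np.floor(bj)), int(np.ceil(bj))
        if l == u:                                 # landed exactly on an atom
            m[l] += p
        else:                                      # split the mass between neighbours
            m[l] += p * (u - bj)
            m[u] += p * (bj - l)
    return m

uniform = np.full(ATOMS, 1.0 / ATOMS)
print(project(1.0, False, uniform).sum())          # ~1.0: probability mass is preserved
print(project(1.0, True, uniform).argmax())        # 11: the atom at z = 1.0
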
/CartPole/CartPole_DDQN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | from gym import wrappers
5 | import numpy as np
6 | import random as ran
7 |
8 | env = gym.make('CartPole-v0')
9 |
10 | # Number of replay samples drawn per update
11 | REPLAY = 10
12 | # List that stores the replay memory
13 | REPLAY_MEMORY = []
14 | # Minibatch size
15 | MINIBATCH = 50
16 |
17 | INPUT = env.observation_space.shape[0]
18 | OUTPUT = env.action_space.n
19 |
20 | # Hyperparameters
21 | LEARNING_LATE = 0.001
22 | DISCOUNT = 0.99
23 | model_path = "save/model.ckpt"
24 |
25 |
26 | # Build the two networks
27 |
28 | x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT))
29 |
30 | y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT))
31 | dropout = tf.placeholder(dtype=tf.float32)
32 |
33 | # Main network
34 | W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer())
35 | W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer())
36 | # W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer())
37 | W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer())
38 |
39 | b1 = tf.Variable(tf.zeros([1],dtype=tf.float32))
40 | b2 = tf.Variable(tf.zeros([1],dtype=tf.float32))
41 |
42 | _L1=tf.nn.relu(tf.matmul(x,W1)+b1)
43 | L1=tf.nn.dropout(_L1,dropout)
44 | _L2=tf.nn.relu(tf.matmul(L1,W2)+b2)
45 | L2=tf.nn.dropout(_L2,dropout)
46 | # L3=tf.nn.relu(tf.matmul(L2,W3))
47 | Q_pre = tf.matmul(L2,W4)
48 |
49 | # Target network
50 | W1_r = tf.get_variable('W1_r',shape=[INPUT, 200])
51 | W2_r = tf.get_variable('W2_r',shape=[200,200])
52 | # W3_r = tf.get_variable('W3_r',shape=[200,150])
53 | W4_r = tf.get_variable('W4_r',shape=[200, OUTPUT])
54 |
55 | b1_r = tf.Variable(tf.zeros([1],dtype=tf.float32))
56 | b2_r = tf.Variable(tf.zeros([1],dtype=tf.float32))
57 |
58 |
59 | L1_r=tf.nn.relu(tf.matmul(x ,W1_r)+b1_r)
60 | L2_r=tf.nn.relu(tf.matmul(L1_r,W2_r)+b2_r)
61 | # L3_r=tf.nn.relu(tf.matmul(L2_r,W3_r))
62 | Q_pre_r = tf.matmul(L2_r,W4_r)
63 |
64 | # Lists that store the total rewards
65 | rlist=[0]
66 | recent_rlist=[0]
67 |
68 | episode = 0
69 |
70 | # Define the loss function
71 | cost = tf.reduce_sum(tf.square(y-Q_pre))
72 | optimizer = tf.train.AdamOptimizer(LEARNING_LATE, epsilon=0.01)
73 | train = optimizer.minimize(cost)
74 |
75 |
76 | saver = tf.train.Saver()
77 |
78 | # Create the session
79 | with tf.Session(config = tf.ConfigProto(device_count ={'GPU' : 0})) as sess:
80 |     # Initialize variables
81 | sess.run(tf.global_variables_initializer())
82 |     # Copy the main network's weights to the target network
83 | sess.run(W1_r.assign(W1))
84 | sess.run(W2_r.assign(W2))
85 | sess.run(W4_r.assign(W4))
86 | sess.run(b1_r.assign(b1))
87 | sess.run(b2_r.assign(b2))
88 |
89 |     # Start the episodes
90 | while np.mean(recent_rlist) < 195 :
91 | episode += 1
92 |
93 |         # Reset the state
94 | s = env.reset()
95 | if len(recent_rlist) > 200:
96 | del recent_rlist[0]
97 | # e-greedy
98 | e = 1. / ((episode/25)+1)
99 |
100 | rall = 0
101 | d = False
102 | count = 0
103 |
104 |         # Repeat until the episode ends
105 | while not d and count < 10000 :
106 |
107 | #env.render()
108 | count += 1
109 |
110 |             # Preprocess the state
111 | s_t = np.reshape(s,[1,INPUT])
112 |
113 |             # Predict the Q-values of the current state
114 | Q = sess.run(Q_pre, feed_dict={x:s_t, dropout: 1})
115 |
116 |             # Choose the action with the e-greedy policy
117 | if e > np.random.rand(1):
118 | a = env.action_space.sample()
119 | else:
120 | a = np.argmax(Q)
121 |
122 |             # Apply the chosen action to the environment
123 | s1, r, d, _ = env.step(a)
124 |
125 |             # Store the next_state, action, reward and done values
126 |             # returned by the environment in the replay memory
127 | REPLAY_MEMORY.append([s_t,a,r,s1,d,count])
128 |
129 |             # If more than 50000 transitions are stored, drop the oldest one
130 | if len(REPLAY_MEMORY) > 50000:
131 | del REPLAY_MEMORY[0]
132 |
133 |             # Accumulate the total reward
134 | rall += r
135 |             # Move on to the next state
136 | s = s1
137 |
138 |
139 |         # Train once more than 50 transitions have been stored
140 | if len(REPLAY_MEMORY) > 50:
141 |
142 |             # Sample random replays from the stored memory
143 |             # and use them to train the main network
144 | for sample in ran.sample(REPLAY_MEMORY, REPLAY):
145 |
146 | s_t_r, a_r, r_r, s1_r, d_r ,count_r= sample
147 |
148 |                 # Predict the Q-values for the sampled state
149 | Y = sess.run(Q_pre, feed_dict={x: s_t_r, dropout: 1})
150 |
151 | if d_r:
152 |                     # If the sampled transition ended the episode, assign a negative reward
153 | if count_r < env.spec.timestep_limit :
154 | Y[0, a_r] = -100
155 | else:
156 |                     # Otherwise update the Q-value with the Double DQN target
157 | s1_t_r= np.reshape(s1_r,[1,INPUT])
158 | Q1, Q = sess.run([Q_pre_r,Q_pre], feed_dict={x: s1_t_r, dropout:1})
159 | Y[0, a_r] = r_r + DISCOUNT * Q1[0, np.argmax(Q)]
160 |
161 |                 # Train the main network with the updated Q target
162 | _, loss = sess.run([train, cost], feed_dict={x: s_t_r, y: Y, dropout:1})
163 |
164 |             # Copy the main network's weights to the target network after training
165 | sess.run(W1_r.assign(W1))
166 | sess.run(W2_r.assign(W2))
167 | sess.run(W4_r.assign(W4))
168 | sess.run(b1_r.assign(b1))
169 | sess.run(b2_r.assign(b2))
170 | print(loss)
171 |
172 |         # Append the episode's total reward to the lists
173 | recent_rlist.append(rall)
174 | rlist.append(rall)
175 | print("Episode:{} steps:{} reward:{} average reward:{} recent reward:{}".format(episode, count, rall,
176 | np.mean(rlist),
177 | np.mean(recent_rlist)))
178 |
179 | save_path = saver.save(sess, model_path)
180 | print("Model saved in file: ",save_path)
181 |
182 |
183 | rlist=[]
184 | recent_rlist=[]
185 |
186 |
187 | with tf.Session() as sess:
188 | sess.run(tf.global_variables_initializer())
189 | saver.restore(sess, model_path)
190 |
191 |     print("Model restored from file: ", save_path)
192 | for episode in range(500):
193 |         # Reset the state
194 | s = env.reset()
195 |
196 | rall = 0
197 | d = False
198 | count = 0
199 |         # Repeat until the episode ends
200 | while not d :
201 | env.render()
202 | count += 1
203 |             # Preprocess the state
204 | s_t = np.reshape(s, [1, INPUT])
205 |
206 |             # Predict the Q-values of the current state
207 | Q = sess.run(Q_pre, feed_dict={x: s_t,dropout: 1})
208 | a = np.argmax(Q)
209 |
210 |             # Apply the chosen action to the environment
211 | s, r, d, _ = env.step(a)
212 |
213 |             # Accumulate the total reward
214 | rall += r
215 |
216 |
217 | rlist.append(rall)
218 |
219 |         print("Episode : {} steps : {} r={}. average reward : {}".format(episode, count, rall,
220 | np.mean(rlist)))
221 |
222 |
--------------------------------------------------------------------------------
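
The update `Y[0, a_r] = r_r + DISCOUNT * Q1[0, np.argmax(Q)]` above is the Double DQN target: the main network selects the next action and the target network evaluates it. A small NumPy sketch contrasting it with the vanilla DQN target (all numbers are made up):

import numpy as np

DISCOUNT = 0.99
r = 1.0
q_main_next   = np.array([0.9, 1.5, 0.3])   # online (main) net Q(s', .)
q_target_next = np.array([1.1, 0.7, 0.4])   # target net Q(s', .)

# Vanilla DQN: the target net both selects and evaluates the next action.
dqn_target = r + DISCOUNT * np.max(q_target_next)

# Double DQN: the online net selects, the target net evaluates.
a_star = np.argmax(q_main_next)
ddqn_target = r + DISCOUNT * q_target_next[a_star]

print(dqn_target, ddqn_target)   # 2.089 vs 1.693 -> less over-estimation
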
/CartPole/CartPole_DQN_NIPS2013.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | import numpy as np
5 | import random as ran
6 |
7 | env = gym.make('CartPole-v1')
8 |
9 | # Number of replay samples drawn per update
10 | REPLAY = 10
11 | # List that stores the replay memory
12 | REPLAY_MEMORY = []
13 | # Minibatch size
14 | MINIBATCH = 50
15 |
16 | INPUT = env.observation_space.shape[0]
17 | OUTPUT = env.action_space.n
18 |
19 | # Hyperparameters
20 | LEARNING_LATE = 0.01
21 | NUM_EPISODE = 2000
22 |
23 | DISCOUNT = 0.99
24 |
25 |
26 | # Build the network
27 | x=tf.placeholder(dtype=tf.float32, shape=(1,4))
28 |
29 | W1 = tf.get_variable('W1',shape=[INPUT,10],initializer=tf.contrib.layers.xavier_initializer())
30 | W2 = tf.get_variable('W4',shape=[10, OUTPUT],initializer=tf.contrib.layers.xavier_initializer())
31 |
32 | L1=tf.nn.tanh(tf.matmul(x,W1))
33 | Q_pre = tf.matmul(L1,W2)
34 |
35 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))
36 |
37 | # Loss function
38 | loss = tf.reduce_sum(tf.square(y-Q_pre))
39 | optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_LATE)
40 | train = optimizer.minimize(loss)
41 |
42 | init = tf.global_variables_initializer()
43 |
44 | rList=[]
45 |
46 | with tf.Session() as sess:
47 | sess.run(init)
48 | for episode in range(5000):
49 |
50 | s = env.reset()
51 |
52 | e = 1. / ((episode/25)+1)
53 | rall = 0
54 | d = False
55 | count=0
56 |
57 | while not d:
58 | # env.render()
59 | count+=1
60 |
61 |             # Predict the Q-values from the current state (s)
62 | s_t = np.reshape(s,[1,INPUT])
63 | Q = sess.run(Q_pre, feed_dict={x:s_t})
64 |
65 |             # Choose the action with e-greedy
66 | if e > np.random.rand(1):
67 | a = env.action_space.sample()
68 | else:
69 | a = np.argmax(Q)
70 |
71 |             # Take the action
72 | s1, r, d, _ = env.step(a)
73 |
74 |             # Store state, action, reward, next_state and done in the replay memory
75 | REPLAY_MEMORY.append([s_t,a,r,s1,d])
76 |
77 |             # If more than 50000 transitions are stored, drop the oldest one
78 | if len(REPLAY_MEMORY) > 50000:
79 | del REPLAY_MEMORY[0]
80 |
81 | rall += r
82 | s = s1
83 |
84 |         # Train with minibatches every 10 episodes
85 | if episode % 10 == 1 :
86 |
87 | for i in range(MINIBATCH):
88 |
89 |                 # Randomly sample replays from the memory
90 | for sample in ran.sample(REPLAY_MEMORY, REPLAY):
91 |
92 | s_t_r, a_r, r_r, s1_r ,d_r = sample
93 |
94 |                     # Learn with the DQN update rule
95 | if d_r:
96 | Q[0, a_r] = -100
97 | else:
98 | s1_t_r= np.reshape(s1_r,[1,INPUT])
99 |
100 | Q1 = sess.run(Q_pre, feed_dict={x: s1_t_r})
101 |
102 | Q[0, a_r] = r_r + DISCOUNT * np.max(Q1)
103 |
104 | sess.run(train, feed_dict={x: s_t_r, y: Q})
105 |
106 |
107 |
108 | rList.append(rall)
109 | print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(episode, count, rall, np.mean(rList)))
110 |
111 |
112 | for episode in range(500):
113 |         # Reset the state
114 | s = env.reset()
115 |
116 | rall = 0
117 | d = False
118 | count = 0
119 |         # Repeat until the episode ends
120 | while not d :
121 | env.render()
122 | count += 1
123 |             # Preprocess the state
124 | s_t = np.reshape(s, [1, INPUT])
125 |
126 |             # Predict the Q-values of the current state
127 | Q = sess.run(Q_pre, feed_dict={x: s_t})
128 | a = np.argmax(Q)
129 |
130 |             # Apply the chosen action to the environment
131 | s, r, d, _ = env.step(a)
132 |
133 |             # Accumulate the total reward
134 | rall += r
135 |
136 |
137 | rList.append(rall)
138 |
139 |         print("Episode : {} steps : {} r={}. average reward : {}".format(episode, count, rall,
140 | np.mean(rList)))
141 |
142 |
143 |
144 |
145 |
--------------------------------------------------------------------------------
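
The NIPS-2013 variant above keeps its replay memory as a plain Python list and trims it by hand. A short sketch of the same idea with collections.deque, which later scripts in this repository use (constants and names are illustrative):

import random
from collections import deque

MEMORY_SIZE = 50000
BATCH_SIZE = 10

# deque(maxlen=...) drops the oldest transition automatically,
# replacing the manual "del REPLAY_MEMORY[0]" bookkeeping.
replay_memory = deque(maxlen=MEMORY_SIZE)

def store(state, action, reward, next_state, done):
    replay_memory.append((state, action, reward, next_state, done))

def sample_batch():
    # Uniform random sampling breaks the correlation between
    # consecutive transitions before they are used for a gradient step.
    return random.sample(replay_memory, BATCH_SIZE)

for i in range(100):                      # dummy transitions
    store(i, i % 2, 1.0, i + 1, False)
print(len(sample_batch()))                # 10
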
/CartPole/CartPole_DQN_Nature2015.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | from gym import wrappers
5 | import numpy as np
6 | import random as ran
7 |
8 | env = gym.make('CartPole-v0')
9 |
10 | # Number of replay samples drawn per update
11 | REPLAY = 50
12 | # List that stores the replay memory
13 | REPLAY_MEMORY = []
14 | # Minibatch size
15 | MINIBATCH = 50
16 |
17 | INPUT = env.observation_space.shape[0]
18 | OUTPUT = env.action_space.n
19 |
20 | # Hyperparameters
21 | LEARNING_LATE = 0.001
22 | DISCOUNT = 0.99
23 | model_path = "save/model.ckpt"
24 |
25 |
26 | # Build the two networks
27 |
28 | x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT))
29 |
30 | y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT))
31 | dropout = tf.placeholder(dtype=tf.float32)
32 |
33 | # Main network
34 | W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer())
35 | W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer())
36 | # W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer())
37 | W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer())
38 |
39 | b1 = tf.Variable(tf.zeros([1],dtype=tf.float32))
40 | b2 = tf.Variable(tf.zeros([1],dtype=tf.float32))
41 |
42 | _L1=tf.nn.relu(tf.matmul(x,W1)+b1)
43 | L1=tf.nn.dropout(_L1,dropout)
44 | _L2=tf.nn.relu(tf.matmul(L1,W2)+b2)
45 | L2=tf.nn.dropout(_L2,dropout)
46 | # L3=tf.nn.relu(tf.matmul(L2,W3))
47 | Q_pre = tf.matmul(L2,W4)
48 |
49 | # Target network
50 | W1_r = tf.get_variable('W1_r',shape=[INPUT, 200])
51 | W2_r = tf.get_variable('W2_r',shape=[200,200])
52 | # W3_r = tf.get_variable('W3_r',shape=[200,150])
53 | W4_r = tf.get_variable('W4_r',shape=[200, OUTPUT])
54 |
55 | b1_r = tf.Variable(tf.zeros([1],dtype=tf.float32))
56 | b2_r = tf.Variable(tf.zeros([1],dtype=tf.float32))
57 |
58 |
59 | L1_r=tf.nn.relu(tf.matmul(x ,W1_r)+b1_r)
60 | L2_r=tf.nn.relu(tf.matmul(L1_r,W2_r)+b2_r)
61 | # L3_r=tf.nn.relu(tf.matmul(L2_r,W3_r))
62 | Q_pre_r = tf.matmul(L2_r,W4_r)
63 |
64 | # Lists that store the total rewards
65 | rlist=[0]
66 | recent_rlist=[0]
67 |
68 | episode = 0
69 |
70 | # Define the loss function
71 | cost = tf.reduce_sum(tf.square(y-Q_pre))
72 | optimizer = tf.train.AdamOptimizer(LEARNING_LATE, epsilon=0.01)
73 | train = optimizer.minimize(cost)
74 |
75 |
76 | saver = tf.train.Saver()
77 |
78 | # Create the session
79 | with tf.Session() as sess:
80 |     # Initialize variables
81 | sess.run(tf.global_variables_initializer())
82 |     # Copy the main network's weights to the target network
83 | sess.run(W1_r.assign(W1))
84 | sess.run(W2_r.assign(W2))
85 | sess.run(W4_r.assign(W4))
86 | sess.run(b1_r.assign(b1))
87 | sess.run(b2_r.assign(b2))
88 |
89 |     # Start the episodes
90 | while np.mean(recent_rlist) < 195 :
91 | episode += 1
92 |
93 |         # Reset the state
94 | s = env.reset()
95 | if len(recent_rlist) > 200:
96 | del recent_rlist[0]
97 | # e-greedy
98 | e = 1. / ((episode/25)+1)
99 |
100 | rall = 0
101 | d = False
102 | count = 0
103 |
104 |         # Repeat until the episode ends
105 | while not d and count < 10000 :
106 |
107 | #env.render()
108 | count += 1
109 |
110 |             # Preprocess the state
111 | s_t = np.reshape(s,[1,INPUT])
112 |
113 |             # Predict the Q-values of the current state
114 | Q = sess.run(Q_pre, feed_dict={x:s_t, dropout: 1})
115 |
116 |             # Choose the action with the e-greedy policy
117 | if e > np.random.rand(1):
118 | a = env.action_space.sample()
119 | else:
120 | a = np.argmax(Q)
121 |
122 |             # Apply the chosen action to the environment
123 | s1, r, d, _ = env.step(a)
124 |
125 |             # Store the next_state, action, reward and done values
126 |             # returned by the environment in the replay memory
127 | REPLAY_MEMORY.append([s_t,a,r,s1,d,count])
128 |
129 |             # If more than 50000 transitions are stored, drop the oldest one
130 | if len(REPLAY_MEMORY) > 50000:
131 | del REPLAY_MEMORY[0]
132 |
133 |             # Accumulate the total reward
134 | rall += r
135 |             # Move on to the next state
136 | s = s1
137 |
138 |
139 |         # Train every 10 episodes
140 | if episode % 10 == 1 and len(REPLAY_MEMORY) > 50:
141 |
142 |             # Sample random replays from the stored memory
143 |             # and use them to train the main network
144 | for sample in ran.sample(REPLAY_MEMORY, REPLAY):
145 |
146 | s_t_r, a_r, r_r, s1_r, d_r ,count_r= sample
147 |
148 |                 # Predict the Q-values for the sampled state
149 | Q = sess.run(Q_pre, feed_dict={x: s_t_r, dropout: 1})
150 |
151 | if d_r:
152 |                     # If the sampled transition ended the episode, assign a negative reward
153 | if count_r < env.spec.timestep_limit :
154 | Q[0, a_r] = -100
155 | else:
156 |                     # Otherwise update the Q-value with the bootstrapped target
157 | s1_t_r= np.reshape(s1_r,[1,INPUT])
158 | Q1 = sess.run(Q_pre_r, feed_dict={x: s1_t_r})
159 | Q[0, a_r] = r_r + DISCOUNT * np.max(Q1)
160 |
161 |                 # Train the main network with the updated Q target
162 | _, loss = sess.run([train, cost], feed_dict={x: s_t_r, y: Q, dropout:1})
163 |
164 |             # Copy the main network's weights to the target network (every 10 episodes)
165 | sess.run(W1_r.assign(W1))
166 | sess.run(W2_r.assign(W2))
167 | sess.run(W4_r.assign(W4))
168 | sess.run(b1_r.assign(b1))
169 | sess.run(b2_r.assign(b2))
170 | print(loss)
171 |
172 |         # Append the episode's total reward to the lists
173 | recent_rlist.append(rall)
174 | rlist.append(rall)
175 | print("Episode:{} steps:{} reward:{} average reward:{} recent reward:{}".format(episode, count, rall,
176 | np.mean(rlist),
177 | np.mean(recent_rlist)))
178 |
179 | save_path = saver.save(sess, model_path)
180 | print("Model saved in file: ",save_path)
181 |
182 |
183 | rlist=[]
184 | recent_rlist=[]
185 |
186 |
187 | with tf.Session() as sess:
188 | sess.run(tf.global_variables_initializer())
189 | saver.restore(sess, model_path)
190 |
191 |     print("Model restored from file: ", save_path)
192 | for episode in range(500):
193 |         # Reset the state
194 | s = env.reset()
195 |
196 | rall = 0
197 | d = False
198 | count = 0
199 |         # Repeat until the episode ends
200 | while not d :
201 | env.render()
202 | count += 1
203 |             # Preprocess the state
204 | s_t = np.reshape(s, [1, INPUT])
205 |
206 |             # Predict the Q-values of the current state
207 | Q = sess.run(Q_pre, feed_dict={x: s_t,dropout: 1})
208 | a = np.argmax(Q)
209 |
210 |             # Apply the chosen action to the environment
211 | s, r, d, _ = env.step(a)
212 |
213 |             # Accumulate the total reward
214 | rall += r
215 |
216 |
217 | rlist.append(rall)
218 |
219 |         print("Episode : {} steps : {} r={}. average reward : {}".format(episode, count, rall,
220 | np.mean(rlist)))
221 |
222 |
--------------------------------------------------------------------------------
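
The Nature-2015 script above synchronises the target network with five separate assign calls. A sketch of the scope-based copy used elsewhere in this repository (TF 1.x API; the toy network built here is illustrative only):

import tensorflow as tf   # TF 1.x, matching the rest of the repository

def build_net(scope, input_size=4, output_size=2):
    # A minimal scoped network, just enough to create trainable variables.
    with tf.variable_scope(scope):
        x = tf.placeholder(tf.float32, [None, input_size])
        w = tf.get_variable('w', [input_size, output_size])
        return x, tf.matmul(x, w)

x_main, q_main = build_net('main')
x_target, q_target = build_net('target')

# Collect the trainable variables of both scopes (creation order matches)
# and build assign ops that overwrite each target variable with main's.
main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='main')
target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target')
copy_ops = [t.assign(m) for m, t in zip(main_vars, target_vars)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(copy_ops)    # rerun this every TARGET_UPDATE steps
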
/CartPole/CartPole_PAAC.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 |
7 |
8 | def make_batch(A2Cagent, sample):
9 | sample = np.stack(sample)
10 | discounted_return = np.empty([NSTEP, 1])
11 |
12 | s = np.reshape(np.stack(sample[:, 0]), [NSTEP, A2Cagent.input_size])
13 | s1 = np.reshape(np.stack(sample[:, 3]), [NSTEP, A2Cagent.input_size])
14 | y = np.reshape(np.stack(sample[:, 1]), [NSTEP, A2Cagent.output_size])
15 | r = np.reshape(np.stack(sample[:, 2]), [NSTEP, 1])
16 | d = np.reshape(np.stack(sample[:, 4]), [NSTEP, 1])
17 |
18 | value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s})
19 | next_value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s1})
20 |
21 |     # Compute the discounted return
22 |     running_add = next_value[NSTEP - 1, 0] * (1 - d[NSTEP - 1, 0])
23 |     for t in range(NSTEP - 1, -1, -1):
24 | if d[t]:
25 | running_add = 0
26 | running_add = r[t] + DISCOUNT * running_add
27 | discounted_return[t, 0] = running_add
28 |
29 | # For critic
30 |     target = r + DISCOUNT * (1 - d) * next_value
31 |
32 | # For Actor
33 | adv = discounted_return - value
34 |
35 | return [s, target, y, adv]
36 |
37 |
38 | class ActorCritic:
39 | def __init__(self, sess, input_size, output_size):
40 | self.sess = sess
41 | self.input_size = input_size
42 | self.output_size = output_size
43 |
44 | self.build_network()
45 |
46 | def build_network(self):
47 | self.X = tf.placeholder('float', [None, self.input_size])
48 | self.Y = tf.placeholder('float', [None, self.output_size])
49 | self.adv = tf.placeholder('float')
50 | self.r = tf.placeholder('float')
51 | self.LR = tf.placeholder('float')
52 |
53 | # Common Weight
54 | w1 = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
55 |
56 | # Actor Weight
57 | w2_a = tf.get_variable('w2_a', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer())
58 |
59 | # Critic Weight
60 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer())
61 |
62 | # Common Layer
63 | l1 = tf.nn.selu(tf.matmul(self.X, w1))
64 |
65 | # Actor Output
66 | self.a = tf.matmul(l1, w2_a)
67 | self.a_prob = tf.nn.softmax(tf.matmul(l1, w2_a))
68 |
69 | # Critic Output
70 | self.v = tf.matmul(l1, w2_c)
71 |
72 | # Actor loss
73 | self.log_lik = tf.nn.softmax_cross_entropy_with_logits(labels=self.Y, logits=self.a)
74 | self.p_loss = tf.reduce_mean(self.log_lik * self.adv)
75 |
76 | # Critic loss
77 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1)
78 |
79 | # entropy(for more exploration)
80 | self.entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.a_prob, logits=self.a))
81 |
82 | self.loss = self.p_loss - self.entropy * 0.01 + self.v_loss * 0.5
83 |
84 | optimizer = tf.train.RMSPropOptimizer(learning_rate=self.LR, epsilon=EPSILON, decay=ALPHA)
85 | gradients, variables = zip(*optimizer.compute_gradients(self.loss))
86 | gradients, _ = tf.clip_by_global_norm(gradients, 3.0)
87 | self.train = optimizer.apply_gradients(zip(gradients, variables))
88 |
89 | def get_action(self, state):
90 | state_t = np.reshape(state, [1, self.input_size])
91 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t})
92 |
93 |         # Sample the action from the predicted action probabilities
94 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
95 |
96 | return action
97 |
98 |
99 | class Runner:
100 | def __init__(self, idx):
101 | self.env = gym.make('CartPole-v1')
102 |
103 | self.done = False
104 | self.s = self.env.reset()
105 | self.s1 = None
106 | self.sample = []
107 | self.step = 0
108 | self.runner_idx = idx
109 | self.episode = 0
110 | self.rall = 0
111 | self.recent_rlist = deque(maxlen=100)
112 | self.recent_rlist.append(0)
113 |
114 | def run(self, A2Cagent):
115 | if self.done:
116 | self.episode += 1
117 | if self.runner_idx == 0:
118 | self.recent_rlist.append(self.rall)
119 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}".format(self.episode, self.rall,
120 | np.mean(self.recent_rlist)))
121 | self.done = False
122 | self.rall = 0
123 | self.step = 0
124 | self.s = self.env.reset()
125 |
126 | self.step += 1
127 | action = A2Cagent.get_action(self.s)
128 |
129 |         # Encode the action as a one-hot vector
130 | y = np.zeros(OUTPUT)
131 | y[action] = 1
132 | s1, reward, self.done, _ = self.env.step(action)
133 |
134 | self.rall += reward
135 |
136 | # negative reward
137 | if self.done and self.step < self.env.spec.timestep_limit:
138 | reward = -100
139 |
140 | self.sample.append([self.s, y, reward, s1, self.done])
141 | self.s = s1
142 |
143 |
144 | def main():
145 | with tf.Session() as sess:
146 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT)
147 | A2Cagent.sess.run(tf.global_variables_initializer())
148 |
149 | step = 0
150 | runners = [Runner(i) for i in range(NENV)]
151 |
152 | while np.mean(runners[0].recent_rlist) <= 495:
153 | s_batch = []
154 | target_batch = []
155 | y_batch = []
156 | adv_batch = []
157 |
158 | learning_rate = LEARNING_RATE
159 |
160 | for t in range(NSTEP):
161 | for i in range(NENV):
162 | runners[i].run(A2Cagent)
163 |
164 | for i in range(NENV):
165 | batch = make_batch(A2Cagent, runners[i].sample)
166 |
167 | s_batch.extend(batch[0])
168 | target_batch.extend(batch[1])
169 | y_batch.extend(batch[2])
170 | adv_batch.extend(batch[3])
171 |
172 | runners[i].sample = []
173 |
174 | feed_dict = {A2Cagent.X: s_batch, A2Cagent.r: target_batch, A2Cagent.Y: y_batch, A2Cagent.adv: adv_batch,
175 | A2Cagent.LR: learning_rate}
176 |
177 | # Train Network
178 | A2Cagent.sess.run([A2Cagent.train], feed_dict=feed_dict)
179 |
180 | step += NENV * NSTEP
181 |
182 |
183 | if __name__ == "__main__":
184 | env = gym.make('CartPole-v1')
185 |
186 |     # Hyperparameters
187 | INPUT = env.observation_space.shape[0]
188 | OUTPUT = env.action_space.n
189 | DISCOUNT = 0.99
190 | NSTEP = 5
191 | NENV = 16
192 | EPSILON = 1e-5
193 | ALPHA = 0.99
194 | LEARNING_RATE = 7e-4
195 | main()
196 |
--------------------------------------------------------------------------------
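
make_batch above computes a 5-step discounted return that bootstraps from the critic's value of the final next-state and is cut off at episode boundaries by the done mask. A tiny NumPy sketch of that backward recursion with made-up rewards and bootstrap value:

import numpy as np

DISCOUNT, NSTEP = 0.99, 5
r = np.array([1.0, 1.0, 1.0, 1.0, 1.0])     # rewards of the 5-step rollout
d = np.array([0, 0, 1, 0, 0])               # the episode ended after step 2
bootstrap = 10.0                            # critic value V of the last next-state

ret = np.zeros(NSTEP)
running = bootstrap * (1 - d[-1])           # only bootstrap if the rollout did not end
for t in range(NSTEP - 1, -1, -1):
    if d[t]:
        running = 0.0                       # do not leak value across episode boundaries
    running = r[t] + DISCOUNT * running
    ret[t] = running

print(ret)   # approx. [ 2.9701  1.99    1.     11.791  10.9  ]
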
/CartPole/CartPole_PAAC_multiproc.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 |
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | import torch.multiprocessing as mp
7 |
8 | import torch.nn as nn
9 | import torch
10 |
11 | from collections import deque
12 |
13 | from torch.distributions.categorical import Categorical
14 |
15 |
16 | def make_batch(sample, agent):
17 | sample = np.stack(sample)
18 | discounted_return = np.empty([NUM_STEP, 1])
19 |
20 | s = np.reshape(np.stack(sample[:, 0]), [NUM_STEP, agent.input_size])
21 | s1 = np.reshape(np.stack(sample[:, 3]), [NUM_STEP, agent.input_size])
22 | y = sample[:, 1]
23 | r = np.reshape(np.stack(sample[:, 2]), [NUM_STEP, 1])
24 | d = np.reshape(np.stack(sample[:, 4]), [NUM_STEP, 1]).astype(int)
25 |
26 | state = torch.from_numpy(s)
27 | state = state.float()
28 | _, value = agent.model(state)
29 |
30 | next_state = torch.from_numpy(s1)
31 | next_state = next_state.float()
32 | _, next_value = agent.model(next_state)
33 |
34 | value = value.data.numpy()
35 | next_value = next_value.data.numpy()
36 |
37 | # Discounted Return
38 | running_add = next_value[NUM_STEP - 1, 0] * (1 - d[NUM_STEP - 1, 0])
39 | for t in range(NUM_STEP - 1, -1, -1):
40 | if d[t]:
41 | running_add = 0
42 | running_add = r[t] + DISCOUNT * running_add
43 | discounted_return[t, 0] = running_add
44 |
45 | # For critic
46 | target = r + DISCOUNT * (1 - d) * next_value
47 |
48 | # For Actor
49 | adv = discounted_return - value
50 |
51 | return [s, target, y, adv]
52 |
53 |
54 | class ActorCriticNetwork(nn.Module):
55 | def __init__(self, input_size, output_size):
56 | super(ActorCriticNetwork, self).__init__()
57 | self.feature = nn.Sequential(
58 | nn.Linear(input_size, 64),
59 | nn.ReLU(),
60 | nn.Linear(64, 64),
61 | nn.ReLU()
62 | )
63 | self.actor = nn.Linear(64, output_size)
64 | self.critic = nn.Linear(64, 1)
65 |
66 | def forward(self, state):
67 | x = self.feature(state)
68 | policy = F.softmax(self.actor(x), dim=-1)
69 | value = self.critic(x)
70 | return policy, value
71 |
72 |
73 | # PAAC(Parallel Advantage Actor Critic)
74 | class ActorAgent(object):
75 | def __init__(self):
76 | self.model = ActorCriticNetwork(INPUT, OUTPUT)
77 |
78 | self.model.share_memory()
79 |
80 | self.output_size = OUTPUT
81 | self.input_size = INPUT
82 |
83 | def get_action(self, state):
84 | state = torch.from_numpy(state)
85 | state = state.float()
86 | policy, value = self.model(state)
87 | m = Categorical(policy)
88 | action = m.sample()
89 | return action.item()
90 |
91 |     # periodically update the actor's model to match the learner's model
92 | def update_actor_model(self, target):
93 | self.model.load_state_dict(target.state_dict())
94 |
95 |
96 | class LearnerAgent(object):
97 | def __init__(self):
98 | self.model = ActorCriticNetwork(INPUT, OUTPUT)
99 | # self.model.cuda()
100 | self.output_size = OUTPUT
101 | self.input_size = INPUT
102 | self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE)
103 |
104 | def train_model(self, s_batch, target_batch, y_batch, adv_batch):
105 | s_batch = torch.FloatTensor(s_batch)
106 | target_batch = torch.FloatTensor(target_batch)
107 | y_batch = torch.LongTensor(y_batch)
108 | adv_batch = torch.FloatTensor(adv_batch)
109 |
110 |         # recompute policy and value so the log-probs can be weighted by the advantage
111 | policy, value = self.model(s_batch)
112 | m = Categorical(policy)
113 |
114 | # mse = nn.SmoothL1Loss()
115 | mse = nn.MSELoss()
116 |
117 | # Actor loss
118 | actor_loss = -m.log_prob(y_batch) * adv_batch.sum(1)
119 |
120 | # Entropy(for more exploration)
121 | entropy = m.entropy()
122 | # Critic loss
123 | critic_loss = mse(value, target_batch)
124 |
125 | # Total loss
126 | loss = actor_loss.mean() + 0.5 * critic_loss - 0.01 * entropy.mean()
127 | self.optimizer.zero_grad()
128 | loss.backward()
129 |
130 | self.optimizer.step()
131 |
132 |
133 | class Environment(object):
134 | def __init__(self, env, idx):
135 | self.env = env
136 | self.obs = self.env.reset()
137 | self.next_obs = None
138 | self.done = False
139 | self.env_idx = idx
140 | self.step = 0
141 | self.episode = 0
142 | self.rall = 0
143 | self.recent_rlist = deque(maxlen=100)
144 | self.recent_rlist.append(0)
145 |
146 | def run(self, agent):
147 | sample = []
148 | for _ in range(NUM_STEP):
149 | self.step += 1
150 | action = agent.get_action(self.obs)
151 | self.next_obs, reward, self.done, _ = self.env.step(action)
152 | self.rall += reward
153 |
154 | # negative reward
155 | if self.done and self.step < self.env.spec.timestep_limit:
156 | reward = -100
157 |
158 | sample.append([self.obs[:], action, reward, self.next_obs[:], self.done])
159 |
160 | self.obs = self.next_obs
161 |
162 | if self.done:
163 | self.episode += 1
164 | if self.env_idx == 0:
165 | self.recent_rlist.append(self.rall)
166 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}"
167 | .format(self.episode, self.rall, np.mean(self.recent_rlist)))
168 |
169 | self.obs = self.env.reset()
170 | self.done = False
171 | self.step = 0
172 | self.rall = 0
173 |
174 | return make_batch(sample, agent)
175 |
176 |
177 | def runner(env, cond, memory, actor):
178 | while True:
179 | with cond:
180 | sample = env.run(actor)
181 | memory.put(sample)
182 |
183 |             # wait until the learner has consumed the batch and updated the model
184 | cond.wait()
185 |
186 |
187 | def learner(cond, memory, actor_agent, learner_agent):
188 | while True:
189 | if memory.full():
190 | s_batch, target_batch, y_batch, adv_batch = [], [], [], []
191 | # while memory.qsize() != 0:
192 |             # on macOS Queue.qsize() is not implemented, so use the condition below
193 | while not memory.empty():
194 | batch = memory.get()
195 |
196 | s_batch.extend(batch[0])
197 | target_batch.extend(batch[1])
198 | y_batch.extend(batch[2])
199 | adv_batch.extend(batch[3])
200 |
201 | # train
202 | learner_agent.train_model(s_batch, target_batch, y_batch, adv_batch)
203 | actor_agent.update_actor_model(learner_agent.model)
204 | # resume running
205 | with cond:
206 | cond.notify_all()
207 |
208 |
209 | def main():
210 | num_envs = NUM_ENV
211 | memory = mp.Queue(maxsize=NUM_ENV)
212 | cond = mp.Condition()
213 |
214 | # make agent and share memory
215 | actor_agent = ActorAgent()
216 | learner_agent = LearnerAgent()
217 |
218 | # sync model
219 | actor_agent.update_actor_model(learner_agent.model)
220 |
221 | # make envs
222 | envs = [Environment(gym.make('CartPole-v1'), i) for i in range(num_envs)]
223 |
224 | # Learner Process(only Learn)
225 | learn_proc = mp.Process(target=learner, args=(cond, memory, actor_agent, learner_agent))
226 |
227 | # Runner Process(just run, not learn)
228 | runners = []
229 | for idx, env in enumerate(envs):
230 | run_proc = mp.Process(target=runner, args=(env, cond, memory, actor_agent))
231 | runners.append(run_proc)
232 | run_proc.start()
233 |
234 | learn_proc.start()
235 |
236 | for proc in runners:
237 | proc.join()
238 |
239 | learn_proc.join()
240 |
241 |
242 | if __name__ == '__main__':
243 | torch.manual_seed(23)
244 | env = gym.make('CartPole-v1')
245 | # Hyper parameter
246 | INPUT = env.observation_space.shape[0]
247 | OUTPUT = env.action_space.n
248 | DISCOUNT = 0.99
249 | NUM_STEP = 5
250 | NUM_ENV = 1
251 | EPSILON = 1e-5
252 | ALPHA = 0.99
253 | LEARNING_RATE = 0.0007
254 | env.close()
255 |
256 | main()
257 |
--------------------------------------------------------------------------------
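
train_model above combines a policy-gradient term, a value (MSE) term and an entropy bonus into one loss. A compact PyTorch sketch of that combination on dummy tensors (the 0.5 and 0.01 coefficients match the script; everything else is illustrative):

import torch
import torch.nn.functional as F
from torch.distributions import Categorical

# Dummy batch: logits from an actor head, values from a critic head.
logits = torch.randn(8, 2, requires_grad=True)
values = torch.randn(8, 1, requires_grad=True)
actions = torch.randint(0, 2, (8,))
advantages = torch.randn(8)
targets = torch.randn(8, 1)

dist = Categorical(logits=logits)

# Policy gradient: raise the log-prob of actions with positive advantage.
actor_loss = -(dist.log_prob(actions) * advantages).mean()
# Critic regression towards the bootstrapped targets.
critic_loss = F.mse_loss(values, targets)
# Entropy bonus keeps the policy from collapsing too early.
entropy = dist.entropy().mean()

loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy
loss.backward()
print(loss.item())
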
/CartPole/CartPole_PolicyGradient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 |
7 | env = gym.make('CartPole-v0')
8 |
9 | # Hyperparameters
10 | LEARNING_RATE = 0.005
11 | INPUT = env.observation_space.shape[0]
12 | OUTPUT = env.action_space.n
13 | DISCOUNT = 0.99
14 |
15 |
16 | def discount_rewards(r):
17 |     '''Compute the discounted rewards.
18 | 
19 |     Args:
20 |         r(np.array): array of rewards
21 | 
22 |     Returns:
23 |         discounted_r(np.array): array of discounted rewards
24 |     '''
25 | discounted_r = np.zeros_like(r, dtype=np.float32)
26 | running_add = 0
27 | for t in reversed(range(len(r))):
28 | running_add = running_add * DISCOUNT + r[t]
29 | discounted_r[t] = running_add
30 |
31 | return discounted_r
32 |
33 |
34 | def train_episodic(PGagent, x, y, adv):
35 |     '''Run one training step per episode.
36 | 
37 |     Args:
38 |         PGagent(PolicyGradient): network to be trained
39 |         x(np.array): array of states
40 |         y(np.array): array of one-hot actions
41 |         adv(np.array) : array of discounted rewards
42 | 
43 |     Returns:
44 |         l(float): loss of the network
45 |     '''
46 | l,_ = PGagent.sess.run([PGagent.loss, PGagent.train], feed_dict={PGagent.X: x, PGagent.Y: y, PGagent.adv : adv})
47 | return l
48 |
49 | def play_cartpole(PGagent):
50 |     '''Play CartPole with the trained network.
51 | 
52 |     Args:
53 |         PGagent(PolicyGradient): trained network
54 |     '''
55 | print("Play Cartpole!")
56 | episode = 0
57 | while True:
58 | s = env.reset()
59 | done = False
60 | rall = 0
61 | episode += 1
62 | while not done:
63 | env.render()
64 |             action_p = PGagent.sess.run(PGagent.a_pre, feed_dict={PGagent.X: np.reshape(s, [1, PGagent.input_size])})
65 | s1, reward, done, _ = env.step(np.argmax(action_p))
66 | s = s1
67 | rall += reward
68 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall))
69 |
70 | class PolicyGradient:
71 | def __init__(self, sess, input_size, output_size):
72 | self.sess = sess
73 | self.input_size = input_size
74 | self.output_size = output_size
75 |
76 | self.build_network()
77 |
78 | def build_network(self):
79 | self.X = tf.placeholder('float',[None, self.input_size])
80 | self.Y = tf.placeholder('float', [None, self.output_size])
81 | self.adv = tf.placeholder('float')
82 |
83 | w1 = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
84 | w2 = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer())
85 |
86 | l1 = tf.nn.relu(tf.matmul(self.X, w1))
87 | self.a_pre = tf.nn.softmax(tf.matmul(l1,w2))
88 |
89 | self.log_p = self.Y * tf.log(self.a_pre)
90 | self.log_lik = self.log_p * self.adv
91 | self.loss = tf.reduce_mean(tf.reduce_sum(-self.log_lik, axis=1))
92 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
93 |
94 | def get_action(self, state):
95 | state_t = np.reshape(state, [1, self.input_size])
96 | action_p = self.sess.run(self.a_pre, feed_dict={self.X : state_t})
97 |
98 |         # Sample the action from the predicted action probabilities
99 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
100 |
101 | return action
102 |
103 | def main():
104 | with tf.Session() as sess:
105 | PGagent = PolicyGradient(sess, INPUT, OUTPUT)
106 |
107 | sess.run(tf.global_variables_initializer())
108 | episode = 0
109 | recent_rlist = deque(maxlen=100)
110 | recent_rlist.append(0)
111 |
112 |         # Train until the mean of the last 100 scores exceeds 195
113 | while np.mean(recent_rlist) <= 195:
114 | episode += 1
115 | episode_memory = deque()
116 | rall = 0
117 | s = env.reset()
118 | done = False
119 |
120 | while not done:
121 |                 # Select an action
122 | action = PGagent.get_action(s)
123 |
124 |                 # Encode the action as a one-hot vector
125 | y = np.zeros(OUTPUT)
126 | y[action] = 1
127 |
128 | s1, reward, done, _ = env.step(action)
129 | rall += reward
130 |
131 |                 # Store the transition in the episode memory
132 | episode_memory.append([s, y, reward])
133 | s = s1
134 |
135 |                 # Train when the episode ends
136 | if done:
137 | episode_memory = np.array(episode_memory)
138 |
139 | discounted_rewards = discount_rewards(np.vstack(episode_memory[:,2]))
140 |
141 | discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() +
142 | 1e-7)
143 |
144 | l = train_episodic(PGagent, np.vstack(episode_memory[:,0]), np.vstack(episode_memory[:,1]),
145 | discounted_rewards)
146 |
147 | recent_rlist.append(rall)
148 |
149 | print("[Episode {0:6f}] Reward: {1:4f} Loss: {2:5.5f} Recent Reward: {3:4f}".format(episode, rall, l,
150 | np.mean(recent_rlist)))
151 |
152 | play_cartpole(PGagent)
153 |
154 | if __name__ == "__main__":
155 | main()
156 |
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
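
The loss in build_network above is plain REINFORCE: the one-hot action selects log pi(a|s), which is then weighted by the normalised discounted return. A NumPy sketch of that loss for a single step (all values are made up):

import numpy as np

action_probs = np.array([0.7, 0.3])   # softmax output pi(.|s)
one_hot      = np.array([0.0, 1.0])   # the action that was actually taken
advantage    = 1.5                    # normalised discounted return for this step

# -log pi(a|s) * advantage  (the Y * log(a_pre) * adv term, summed over actions)
loss = -np.sum(one_hot * np.log(action_probs)) * advantage
print(loss)   # ~1.806: raising pi(a|s) lowers the loss when the advantage is positive
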
/CartPole/CartPole_Q-Network.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | import numpy as np
5 |
6 | env = gym.make('CartPole-v0')
7 |
8 | # Build the network
9 |
10 | x=tf.placeholder(dtype=tf.float32, shape=(1,4))
11 |
12 | input = env.observation_space.shape[0]
13 |
14 | W1=tf.get_variable('W1',shape=[input,10],initializer=tf.contrib.layers.xavier_initializer())
15 | W2=tf.get_variable('W2',shape=[10,20],initializer=tf.contrib.layers.xavier_initializer())
16 | W3=tf.get_variable('W3',shape=[20,15],initializer=tf.contrib.layers.xavier_initializer())
17 | W4=tf.get_variable('W4',shape=[15,env.action_space.n],initializer=tf.contrib.layers.xavier_initializer())
18 |
19 |
20 | L1=tf.nn.relu(tf.matmul(x,W1))
21 | L2=tf.nn.relu(tf.matmul(L1,W2))
22 | L3=tf.nn.relu(tf.matmul(L2,W3))
23 | Q_pre = tf.matmul(L3,W4)
24 |
25 |
26 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))
27 |
28 | # Define the hyperparameters
29 | learning_rate = 0.1
30 | num_episode = 2000
31 | e = 0.1
32 | discount_factor = 0.99
33 | rlist=[]
34 |
35 | # Define the loss function
36 | cost = tf.reduce_sum(tf.square(y-Q_pre))
37 | optimizer = tf.train.AdamOptimizer(learning_rate)
38 | train = optimizer.minimize(cost)
39 |
40 | init = tf.global_variables_initializer()
41 |
42 | with tf.Session() as sess:
43 |     # Initialize variables
44 | sess.run(init)
45 | for step in range(num_episode):
46 |         # Reset the state
47 | s = env.reset()
48 | # e-greedy
49 | e = 1. / ((step/50)+10)
50 | rall = 0
51 | d = False
52 | j=0
53 | s_t = sess.run(tf.expand_dims(s, axis=0))
54 | while not d:
55 | # env.render()
56 | j+=1
57 |
58 |             # (state preprocessing is done above with tf.expand_dims)
59 |
60 |             # Predict the Q-values for the current state
61 | Q = sess.run(Q_pre, feed_dict={x:s_t})
62 |
63 |             # Random action via e-greedy
64 | if e > np.random.rand(1):
65 | a = env.action_space.sample()
66 | else:
67 | a = np.argmax(Q)
68 |
69 |             # Take the action
70 | s1, r, d, _ = env.step(a)
71 |
72 |
73 | if d:
74 |                 # Assign a negative reward when the episode ends
75 | Q[0, a] = -100
76 | else:
77 |                 # Preprocess the next state, then apply the Q-learning update
78 | s1_t = sess.run(tf.expand_dims(s1, axis=0))
79 | Q1 = sess.run(Q_pre, feed_dict={x: s1_t})
80 | Q[0, a] = r + discount_factor * np.max(Q1)
81 |
82 | sess.run(train, feed_dict={x: s_t, y: Q})
83 |
84 | rall += r
85 |
86 | s_t = s1_t
87 |
88 | slist=[]
89 | rlist.append(rall)
90 | print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(step, j, rall, np.mean(rlist)))
91 |
92 |     print("Success rate: " + str(sum(rlist) / num_episode) + "%")
93 |
94 |
--------------------------------------------------------------------------------
/CartPole/CartPole_Q-Network_reshape.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import tensorflow as tf
3 | import gym
4 | import numpy as np
5 |
6 | env = gym.make('CartPole-v0')
7 |
8 | # Build the network
9 |
10 | x=tf.placeholder(dtype=tf.float32, shape=(1,4))
11 |
12 | input = env.observation_space.shape[0]
13 |
14 | W1=tf.get_variable('W1',shape=[input,10],initializer=tf.contrib.layers.xavier_initializer())
15 | W2=tf.get_variable('W2',shape=[10,20],initializer=tf.contrib.layers.xavier_initializer())
16 | W3=tf.get_variable('W3',shape=[20,15],initializer=tf.contrib.layers.xavier_initializer())
17 | W4=tf.get_variable('W4',shape=[15,env.action_space.n],initializer=tf.contrib.layers.xavier_initializer())
18 |
19 |
20 | L1=tf.nn.relu(tf.matmul(x,W1))
21 | L2=tf.nn.relu(tf.matmul(L1,W2))
22 | L3=tf.nn.relu(tf.matmul(L2,W3))
23 | Q_pre = tf.matmul(L3,W4)
24 |
25 |
26 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))
27 |
28 | # Define the hyperparameters
29 | learning_rate = 0.1
30 | num_episode = 2000
31 | e = 0.1
32 | discount_factor = 0.99
33 | rlist=[]
34 |
35 | # Define the loss function
36 | cost = tf.reduce_sum(tf.square(y-Q_pre))
37 | optimizer = tf.train.AdamOptimizer(learning_rate)
38 | train = optimizer.minimize(cost)
39 |
40 | init = tf.global_variables_initializer()
41 |
42 | with tf.Session() as sess:
43 |     # Initialize variables
44 | sess.run(init)
45 | for step in range(num_episode):
46 |         # Reset the state
47 | s = env.reset()
48 | # e-greedy
49 | e = 1. / ((step/50)+10)
50 | rall = 0
51 | d = False
52 | j=0
53 |
54 | while not d:
55 | # env.render()
56 | j+=1
57 |
58 |             # Preprocess the state with reshape
59 | s_t = np.reshape(s,[1,input])
60 |             # Predict the Q-values for the current state
61 | Q = sess.run(Q_pre, feed_dict={x:s_t})
62 |
63 |             # Random action via e-greedy
64 | if e > np.random.rand(1):
65 | a = env.action_space.sample()
66 | else:
67 | a = np.argmax(Q)
68 |
69 |             # Take the action
70 | s1, r, d, _ = env.step(a)
71 |
72 |
73 | if d:
74 |                 # Assign a negative reward when the episode ends
75 | Q[0, a] = -100
76 | else:
77 |                 # Preprocess the next state, then apply the Q-learning update
78 | s1_t= np.reshape(s1,[1,input])
79 | Q1 = sess.run(Q_pre, feed_dict={x: s1_t})
80 | Q[0, a] = r + discount_factor * np.max(Q1)
81 |
82 | sess.run(train, feed_dict={x: s_t, y: Q})
83 |
84 | rall += r
85 |
86 | s = s1
87 |
88 | slist=[]
89 | rlist.append(rall)
90 | print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(step, j, rall, np.mean(rlist)))
91 |
92 |     print("Success rate: " + str(sum(rlist) / num_episode) + "%")
93 |
94 |
--------------------------------------------------------------------------------
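
Both Q-Network scripts above build their regression target by overwriting only the taken action's Q-value with r + gamma * max_a' Q(s', a') (or a fixed negative reward at termination). A short NumPy sketch of that target construction (numbers are made up):

import numpy as np

DISCOUNT = 0.99
q_s  = np.array([[0.2, 0.5]])        # network output Q(s, .)
q_s1 = np.array([[0.1, 0.8]])        # network output Q(s', .)
a, r, done = 1, 1.0, False

target = q_s.copy()
# Only the taken action's entry is replaced; the other entries keep the
# network's own prediction, so their error (and gradient) is zero.
target[0, a] = -100 if done else r + DISCOUNT * np.max(q_s1)
print(target)   # [[0.2   1.792]]
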
/CartPole/Cartpole_A2C_nstep.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 |
7 | env = gym.make('CartPole-v1')
8 |
9 | # Hyperparameters
10 | LEARNING_RATE = 0.001
11 | INPUT = env.observation_space.shape[0]
12 | OUTPUT = env.action_space.n
13 | DISCOUNT = 0.99
14 | NSTEP = 5
15 |
16 |
17 | def train_nstep(A2Cagent, sample):
18 | sample = np.stack(sample)
19 | discounted_return = np.empty([NSTEP, 1])
20 |
21 | s = np.reshape(np.stack(sample[:, 0]), [NSTEP, A2Cagent.input_size])
22 | s1 = np.reshape(np.stack(sample[:, 3]), [NSTEP, A2Cagent.input_size])
23 | y = np.reshape(np.stack(sample[:, 1]), [NSTEP, A2Cagent.output_size])
24 | r = np.reshape(np.stack(sample[:, 2]), [NSTEP, 1])
25 | d = np.reshape(np.stack(sample[:, 4]), [NSTEP, 1])
26 |
27 | value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s})
28 | next_value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s1})
29 |
30 |     # Compute the discounted return
31 |     running_add = next_value[NSTEP - 1, 0] * (1 - d[NSTEP - 1, 0])
32 |     for t in range(NSTEP - 1, -1, -1):
33 | if d[t]:
34 | running_add = 0
35 | running_add = r[t] + DISCOUNT * running_add
36 | discounted_return[t, 0] = running_add
37 |
38 | # For critic
39 |     target = r + DISCOUNT * (1 - d) * next_value
40 |
41 | # For Actor
42 | adv = discounted_return - value
43 |
44 | A2Cagent.sess.run([A2Cagent.train], feed_dict={A2Cagent.X: s, A2Cagent.r: target, A2Cagent.Y: y, A2Cagent.adv: adv})
45 |
46 |
47 | class ActorCritic:
48 | def __init__(self, sess, input_size, output_size):
49 | self.sess = sess
50 | self.input_size = input_size
51 | self.output_size = output_size
52 |
53 | self.build_network()
54 |
55 | def build_network(self):
56 | self.X = tf.placeholder('float', [None, self.input_size])
57 | self.Y = tf.placeholder('float', [None, self.output_size])
58 | self.adv = tf.placeholder('float')
59 | self.r = tf.placeholder('float')
60 |
61 | # Actor Weight
62 | w1_a = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
63 | w2_a = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer())
64 |
65 | # Critic Weight
66 | w1_c = tf.get_variable('w1_c', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
67 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer())
68 |
69 | # Actor Critic Network
70 | l1_a = tf.nn.selu(tf.matmul(self.X, w1_a))
71 | l1_c = tf.nn.selu(tf.matmul(self.X, w1_c))
72 |
73 | self.a = tf.matmul(l1_a, w2_a)
74 | self.a_prob = tf.nn.softmax(tf.matmul(l1_a, w2_a))
75 | self.v = tf.matmul(l1_c, w2_c)
76 |
77 | # Actor loss
78 | self.log_lik = tf.nn.softmax_cross_entropy_with_logits(labels=self.Y, logits=self.a)
79 | self.p_loss = tf.reduce_mean(self.log_lik * self.adv)
80 |
81 | # Critic loss
82 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1)
83 |
84 | # entropy(for more exploration)
85 | self.entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.a_prob, logits=self.a))
86 |
87 | self.loss = self.p_loss + self.v_loss - self.entropy * 0.01
88 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
89 |
90 | def get_action(self, state):
91 | state_t = np.reshape(state, [1, self.input_size])
92 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t})
93 |
94 |         # Sample the action from the predicted action probabilities
95 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
96 |
97 | return action
98 |
99 |
100 | def main():
101 | with tf.Session() as sess:
102 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT)
103 |
104 | A2Cagent.sess.run(tf.global_variables_initializer())
105 | episode = 0
106 | step = 0
107 | recent_rlist = deque(maxlen=100)
108 | recent_rlist.append(0)
109 |
110 | sample = []
111 |
112 |         # Train until the mean of the last 100 scores exceeds 195
113 | while np.mean(recent_rlist) <= 195:
114 | episode += 1
115 |
116 | rall = 0
117 | count = 0
118 | s = env.reset()
119 | done = False
120 |
121 | while not done:
122 | count += 1
123 | step += 1
124 |                 # Select an action
125 | action = A2Cagent.get_action(s)
126 |
127 |                 # Encode the action as a one-hot vector
128 | y = np.zeros(OUTPUT)
129 | y[action] = 1
130 | s1, reward, done, _ = env.step(action)
131 | rall += reward
132 |
133 |                 # negative reward when the episode ends before the time limit
134 |                 if done and count < env.spec.timestep_limit:
135 |                     reward = -100
136 | 
137 |                 sample.append([s, y, reward, s1, done])
138 |
139 | if step % 5 == 0:
140 | train_nstep(A2Cagent, sample)
141 | sample = []
142 |
143 | s = s1
144 |
145 | recent_rlist.append(rall)
146 |
147 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f}".format(episode, rall, np.mean(recent_rlist)))
148 |
149 |
150 | if __name__ == "__main__":
151 | main()
152 |
--------------------------------------------------------------------------------
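
train_nstep above uses two different signals: the bootstrapped n-step return minus V(s) as the actor's advantage, and the one-step r + gamma*V(s') as the critic's regression target. A minimal numeric sketch on a single transition (all values are made up):

DISCOUNT = 0.99
r = 1.0
v_s, v_s1 = 4.0, 5.0          # critic estimates V(s), V(s')
n_step_return = 6.0           # bootstrapped discounted return from the rollout

critic_target = r + DISCOUNT * v_s1   # what the critic is regressed towards
advantage = n_step_return - v_s       # what scales the policy gradient

print(critic_target, advantage)       # 5.95 2.0
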
/CartPole/Cartpole_A2C_onestep.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 |
7 | env = gym.make('CartPole-v1')
8 |
9 | # Hyperparameters
10 | LEARNING_RATE = 0.001
11 | INPUT = env.observation_space.shape[0]
12 | OUTPUT = env.action_space.n
13 | DISCOUNT = 0.99
14 | print(env.spec.timestep_limit)
15 |
16 | def train_onestep(A2Cagent, s, y, r, s1, d):
17 | s = np.reshape(s, [1, A2Cagent.input_size])
18 | s1 = np.reshape(s1, [1, A2Cagent.input_size])
19 | y = np.reshape(y, [1, A2Cagent.output_size])
20 |
21 | value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X: s})
22 | if d:
23 | target = r
24 | adv = r - value
25 | else:
26 | next_value = A2Cagent.sess.run(A2Cagent.v, feed_dict={A2Cagent.X : s1})
27 | target = r + DISCOUNT * next_value
28 | adv = r + DISCOUNT * next_value - value
29 |
30 | A2Cagent.sess.run([A2Cagent.train], feed_dict={A2Cagent.X: s, A2Cagent.r: target, A2Cagent.Y: y, A2Cagent.adv: adv})
31 |
32 | class ActorCritic:
33 | def __init__(self, sess, input_size, output_size):
34 | self.sess = sess
35 | self.input_size = input_size
36 | self.output_size = output_size
37 |
38 | self.build_network()
39 |
40 | def build_network(self):
41 |
42 | self.X = tf.placeholder('float', [None, self.input_size])
43 | self.Y = tf.placeholder('float', [None, self.output_size])
44 | self.adv = tf.placeholder('float')
45 | self.r = tf.placeholder('float')
46 |
47 | # Actor Weight
48 | w1_a = tf.get_variable('w1', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
49 | w2_a = tf.get_variable('w2', shape=[128, self.output_size], initializer=tf.contrib.layers.xavier_initializer())
50 |
51 | # Critic Weight
52 | w1_c = tf.get_variable('w1_c', shape=[self.input_size, 128], initializer=tf.contrib.layers.xavier_initializer())
53 | w2_c = tf.get_variable('w2_c', shape=[128, 1], initializer=tf.contrib.layers.xavier_initializer())
54 |
55 | # Actor Critic Network
56 | l1_a = tf.nn.relu(tf.matmul(self.X, w1_a))
57 | l1_c = tf.nn.relu(tf.matmul(self.X, w1_c))
58 | self.a_prob = tf.nn.softmax(tf.matmul(l1_a, w2_a))
59 | self.v = tf.matmul(l1_c, w2_c)
60 |
61 | # Policy loss
62 | self.log_p = self.Y * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.))
63 | self.log_lik = self.log_p * self.adv
64 | self.p_loss = -tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1))
65 |
66 | # Value loss
67 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1)
68 |
69 | # entropy(for more exploration)
70 | self.entropy = -tf.reduce_mean(
71 | tf.reduce_sum(self.a_prob * tf.log(tf.clip_by_value(self.a_prob, 1e-10, 1.)), axis=1))
72 |
73 | self.loss = self.p_loss + self.v_loss - self.entropy * 0.01
74 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
75 |
76 | def get_action(self, state):
77 | state_t = np.reshape(state, [1, self.input_size])
78 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: state_t})
79 |
80 |         # Sample the action from the predicted action probabilities
81 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
82 |
83 | return action
84 |
85 |
86 | def main():
87 | with tf.Session() as sess:
88 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT)
89 |
90 | A2Cagent.sess.run(tf.global_variables_initializer())
91 | episode = 0
92 | recent_rlist = deque(maxlen=100)
93 | recent_rlist.append(0)
94 |
95 |         # Train until the mean of the last 100 scores exceeds 195
96 | while np.mean(recent_rlist) <= 195:
97 | episode += 1
98 |
99 | rall = 0
100 | count = 0
101 | s = env.reset()
102 | done = False
103 | i = 1
104 | while not done:
105 | count += 1
106 |                 # Select an action
107 | action = A2Cagent.get_action(s)
108 |
109 |                 # Encode the action as a one-hot vector
110 | y = np.zeros(OUTPUT)
111 | y[action] = 1
112 |
113 | s1, reward, done, _ = env.step(action)
114 | rall += reward
115 |
116 | # negative reward
117 | if done and count < env.spec.timestep_limit:
118 | reward = -100
119 |
120 | train_onestep(A2Cagent, s, y, reward, s1, done)
121 |
122 | s = s1
123 |
124 | recent_rlist.append(rall)
125 |
126 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f}".format(episode, rall, np.mean(recent_rlist)))
127 |
128 | if __name__ == "__main__":
129 | main()
130 |
131 |
132 |
133 |
134 |
135 |
--------------------------------------------------------------------------------
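
train_onestep above is the one-step actor-critic update: the TD error r + gamma*V(s') - V(s) acts as the advantage, and the bootstrapped value is the critic's target; at a terminal state the bootstrap is dropped. A minimal sketch (values are made up):

DISCOUNT = 0.99

def one_step_targets(r, v_s, v_s1, done):
    """Return (critic_target, advantage) for a single transition."""
    if done:
        target = r                      # no bootstrapping at a terminal state
    else:
        target = r + DISCOUNT * v_s1
    advantage = target - v_s            # TD error, used to scale log pi(a|s)
    return target, advantage

print(one_step_targets(1.0, 4.0, 5.0, False))    # (5.95, 1.95)
print(one_step_targets(-100.0, 4.0, 5.0, True))  # (-100.0, -104.0)
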
/CartPole/cartpole_dqn.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/CartPole/cartpole_dqn.py
--------------------------------------------------------------------------------
/CartPole/cartpole_ppo.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import random
4 |
5 | import torch.nn.functional as F
6 | import torch.optim as optim
7 | import torch.multiprocessing as mp
8 |
9 | import torch.nn as nn
10 | import torch
11 |
12 | from collections import deque
13 |
14 | from torch.distributions.categorical import Categorical
15 |
16 |
17 | def make_batch(sample, agent):
18 | sample = np.stack(sample)
19 | discounted_return = np.empty([NUM_STEP, 1])
20 |
21 | s = np.reshape(np.stack(sample[:, 0]), [NUM_STEP, agent.input_size])
22 | s1 = np.reshape(np.stack(sample[:, 3]), [NUM_STEP, agent.input_size])
23 | y = sample[:, 1]
24 | r = np.reshape(np.stack(sample[:, 2]), [NUM_STEP, 1])
25 | d = np.reshape(np.stack(sample[:, 4]), [NUM_STEP, 1])
26 | with torch.no_grad():
27 | state = torch.from_numpy(s)
28 | state = state.float()
29 | _, value = agent.model_old(state)
30 |
31 | next_state = torch.from_numpy(s1)
32 | next_state = next_state.float()
33 | _, next_value = agent.model_old(next_state)
34 |
35 | value = value.data.numpy()
36 | next_value = next_value.data.numpy()
37 |
38 | # Discounted Return
39 | gae = 0
40 | for t in range(NUM_STEP - 1, -1, -1):
41 | delta = r[t] + DISCOUNT * next_value[t] * (1 - d[t]) - value[t]
42 | gae = delta + DISCOUNT * LAM * (1 - d[t]) * gae
43 | discounted_return[t, 0] = gae + value[t]
44 |
45 | # For critic
46 | target = r + DISCOUNT * (1 - d) * next_value
47 |
48 | # For Actor
49 | adv = discounted_return - value
50 | # adv = (adv - adv.mean()) / (adv.std() + 1e-5)
51 |
52 | return [s, target, y, adv]
53 |
54 |
55 | class ActorCriticNetwork(nn.Module):
56 | def __init__(self, input_size, output_size):
57 | super(ActorCriticNetwork, self).__init__()
58 | self.feature = nn.Sequential(
59 | nn.Linear(input_size, 64),
60 | nn.ReLU(),
61 | nn.Linear(64, 64),
62 | nn.ReLU()
63 | )
64 | self.actor = nn.Linear(64, output_size)
65 | self.critic = nn.Linear(64, 1)
66 |
67 | def forward(self, state):
68 | x = self.feature(state)
69 | policy = F.softmax(self.actor(x), dim=-1)
70 | value = self.critic(x)
71 | return policy, value
72 |
73 |
74 | # PAAC(Parallel Advantage Actor Critic)
75 | class ActorAgent(object):
76 | def __init__(self):
77 | self.model_old = ActorCriticNetwork(INPUT, OUTPUT)
78 | self.model_old.share_memory()
79 |
80 | self.output_size = OUTPUT
81 | self.input_size = INPUT
82 |
83 | def get_action(self, state):
84 | state = torch.from_numpy(state)
85 | state = state.float()
86 | policy, value = self.model_old(state)
87 | m = Categorical(policy)
88 | action = m.sample()
89 | return action.item()
90 |
91 | # after some time interval update the target model to be same with model
92 | def update_actor_model(self, target):
93 | self.model_old.load_state_dict(target.state_dict())
94 |
95 | @staticmethod
96 | def weights_init(m):
97 | class_name = m.__class__.__name__
98 | if class_name.find('Linear') != -1:
99 | torch.nn.init.kaiming_uniform(m.weight)
100 | print(m)
101 | elif class_name.find('Conv') != -1:
102 | torch.nn.init.kaiming_uniform(m.weight)
103 | print(m)
104 |
105 |
106 | class LearnerAgent(object):
107 | def __init__(self):
108 | self.model = ActorCriticNetwork(INPUT, OUTPUT)
109 | # self.model.cuda()
110 | self.output_size = OUTPUT
111 | self.input_size = INPUT
112 | self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE, eps=1e-5)
113 |
114 | def train_model(self, s_batch, target_batch, y_batch, adv_batch, actor_agent):
115 | s_batch = torch.FloatTensor(s_batch)
116 | target_batch = torch.FloatTensor(target_batch)
117 | adv_batch = torch.FloatTensor(adv_batch)
118 | with torch.no_grad():
119 | policy_old, value_old = actor_agent.model_old(s_batch)
120 | m_old = Categorical(policy_old)
121 | y_batch_old = torch.LongTensor(y_batch)
122 | log_prob_old = m_old.log_prob(y_batch_old)
123 |
124 |         # current policy's log-prob of the actions actually taken (needed for the PPO ratio)
125 |         policy, value = self.model(s_batch)
126 |         m = Categorical(policy)
127 |         # use the stored actions instead of re-sampling, otherwise the ratio is meaningless
128 |         log_prob = m.log_prob(y_batch_old)
129 | entropy = m.entropy().mean()
130 |
131 | for i in range(EPOCH):
132 | minibatch = random.sample(range(len(s_batch)), BATCH_SIZE)
133 | ratio = torch.exp(log_prob[minibatch] - log_prob_old[minibatch])
134 |
135 | surr1 = ratio * adv_batch[minibatch].sum(1)
136 | surr2 = torch.clamp(ratio, 1.0 - EPSILON, 1.0 + EPSILON) * adv_batch[minibatch].sum(1)
137 |
138 | actor_loss = -torch.min(surr1, surr2).mean()
139 |             critic_loss = F.mse_loss(value[minibatch], target_batch[minibatch])  # use the current critic so gradients flow
140 |
141 | self.optimizer.zero_grad()
142 | loss = actor_loss + V_COEF * critic_loss - 0.01 * entropy
143 | loss.backward(retain_graph=True)
144 | self.optimizer.step()
145 |
146 |
147 | class Environment(object):
148 | def __init__(self, env, idx):
149 | self.env = env
150 | self.obs = self.env.reset()
151 | self.next_obs = None
152 | self.done = False
153 | self.env_idx = idx
154 | self.step = 0
155 | self.episode = 0
156 | self.rall = 0
157 | self.recent_rlist = deque(maxlen=100)
158 | self.recent_rlist.append(0)
159 |
160 | def run(self, agent):
161 | sample = []
162 | for _ in range(NUM_STEP):
163 | self.step += 1
164 | action = agent.get_action(self.obs)
165 | self.next_obs, reward, self.done, _ = self.env.step(action)
166 | self.rall += reward
167 |
168 |                 # zero the reward if the episode ends before the time limit
169 | if self.done and self.step < self.env.spec.timestep_limit:
170 | reward = 0
171 |
172 | sample.append([self.obs[:], action, reward, self.next_obs[:], self.done])
173 |
174 | self.obs = self.next_obs
175 |
176 | if self.done:
177 | self.episode += 1
178 | if self.env_idx == 0:
179 | self.recent_rlist.append(self.rall)
180 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}"
181 | .format(self.episode, self.rall, np.mean(self.recent_rlist)))
182 |
183 | self.obs = self.env.reset()
184 | self.done = False
185 | self.step = 0
186 | self.rall = 0
187 |
188 | return make_batch(sample, agent)
189 |
190 |
191 | def runner(env, cond, memory, actor):
192 | while True:
193 | with cond:
194 | sample = env.run(actor)
195 | memory.put(sample)
196 |
197 |             # wait until the learner finishes updating the model
198 | cond.wait()
199 |
200 |
201 | def learner(cond, memory, actor_agent, learner_agent):
202 | while True:
203 | if memory.full():
204 | s_batch, target_batch, y_batch, adv_batch = [], [], [], []
205 | # while memory.qsize() != 0:
206 |             # on macOS, multiprocessing.Queue.qsize() is not implemented, so use the branch below instead.
207 | if NUM_ENV == 1:
208 | batch = memory.get()
209 | s_batch.extend(batch[0])
210 | target_batch.extend(batch[1])
211 | y_batch.extend(batch[2])
212 | adv_batch.extend(batch[3])
213 | else:
214 | while not memory.empty():
215 | batch = memory.get()
216 | s_batch.extend(batch[0])
217 | target_batch.extend(batch[1])
218 | y_batch.extend(batch[2])
219 | adv_batch.extend(batch[3])
220 |
221 | # train
222 | learner_agent.train_model(s_batch, target_batch, y_batch, adv_batch, actor_agent)
223 | actor_agent.update_actor_model(learner_agent.model)
224 | # resume running
225 | with cond:
226 | cond.notify_all()
227 |
228 |
229 | def main():
230 | num_envs = NUM_ENV
231 | memory = mp.Queue(maxsize=NUM_ENV)
232 | cond = mp.Condition()
233 |
234 | # make agent and share memory
235 | actor_agent = ActorAgent()
236 | learner_agent = LearnerAgent()
237 |
238 | # sync model
239 | actor_agent.update_actor_model(learner_agent.model)
240 |
241 | # make envs
242 | envs = [Environment(gym.make(ENV_ID), i) for i in range(num_envs)]
243 |
244 | # Learner Process(only Learn)
245 | learn_proc = mp.Process(target=learner, args=(cond, memory, actor_agent, learner_agent))
246 |
247 | # Runner Process(just run, not learn)
248 | runners = []
249 | for idx, env in enumerate(envs):
250 | run_proc = mp.Process(target=runner, args=(env, cond, memory, actor_agent))
251 | runners.append(run_proc)
252 | run_proc.start()
253 |
254 | learn_proc.start()
255 |
256 | for proc in runners:
257 | proc.join()
258 |
259 | learn_proc.join()
260 |
261 |
262 | if __name__ == '__main__':
263 | torch.manual_seed(23)
264 | ENV_ID = 'CartPole-v0'
265 | env = gym.make(ENV_ID)
266 | # Hyper parameter
267 | INPUT = env.observation_space.shape[0]
268 | OUTPUT = env.action_space.n
269 | DISCOUNT = 0.99
270 | NUM_STEP = 128
271 | NUM_ENV = 4
272 | LAM = 0.95
273 | EPOCH = 5
274 | BATCH_SIZE = 32
275 | V_COEF = 1.0
276 | EPSILON = 0.2
277 | ALPHA = 0.99
278 | LEARNING_RATE = 0.0007
279 | env.close()
280 |
281 | main()
282 |
--------------------------------------------------------------------------------
/CartPole/play_Cartpole.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import gym
3 | import numpy as np
4 |
5 | env = gym.make("CartPole-v0")
6 |
7 | print(env.observation_space)
8 | INPUT = env.observation_space.shape[0]
9 | OUTPUT = env.action_space.n
10 |
11 | # Hyperparameters
12 | LEARNING_RATE = 0.001
13 | DISCOUNT = 0.99
14 |
15 | # Main network
16 | x=tf.placeholder(dtype=tf.float32, shape=(None, INPUT))
17 |
18 | y=tf.placeholder(dtype=tf.float32, shape=(None, OUTPUT))
19 | dropout = tf.placeholder(dtype=tf.float32)
20 |
21 | W1 = tf.get_variable('W1',shape=[INPUT, 200],initializer=tf.contrib.layers.xavier_initializer())
22 | W2 = tf.get_variable('W2',shape=[200,200],initializer=tf.contrib.layers.xavier_initializer())
23 | # W3 = tf.get_variable('W3',shape=[200,150],initializer=tf.contrib.layers.xavier_initializer())
24 | W4 = tf.get_variable('W4',shape=[200, OUTPUT],initializer=tf.contrib.layers.xavier_initializer())
25 |
26 | b1 = tf.Variable(tf.zeros([1],dtype=tf.float32))
27 | b2 = tf.Variable(tf.zeros([1],dtype=tf.float32))
28 |
29 | _L1=tf.nn.relu(tf.matmul(x,W1)+b1)
30 | L1=tf.nn.dropout(_L1,dropout)
31 | _L2=tf.nn.relu(tf.matmul(L1,W2)+b2)
32 | L2=tf.nn.dropout(_L2,dropout)
33 | # L3=tf.nn.relu(tf.matmul(L2,W3))
34 | Q_pre = tf.matmul(L2,W4)
35 |
36 | saver = tf.train.Saver()
37 | model_path = "save/model.ckpt"
38 | with tf.Session() as sess:
39 | rlist=[]
40 | sess.run(tf.global_variables_initializer())
41 | saver.restore(sess, model_path)
42 |
43 |     print("Model restored from file: ", model_path)
44 | for episode in range(500):
45 |         # Reset the state
46 | s = env.reset()
47 | e= 0.1
48 | rall = 0
49 | d = False
50 | count = 0
51 |         # Loop until the episode ends
52 |         while not d and count < 5000:
53 |             env.render()
54 |             count += 1
55 |             # Preprocess the state
56 |             s_t = np.reshape(s, [1, INPUT])
57 |
58 |             # Predict Q-values for the current state
59 | Q = sess.run(Q_pre, feed_dict={x: s_t, dropout: 1})
60 |
61 | if e > np.random.rand(1):
62 | a = env.action_space.sample()
63 | else:
64 | a = np.argmax(Q)
65 |
66 |
67 |             # Apply the chosen action to the environment
68 |
69 |             s, r, d, _ = env.step(a)
70 |
71 |             # Accumulate the total reward
72 | rall += r
73 |
74 | rlist.append(rall)
75 |
76 |         print("Episode : {} steps : {} r={}. average reward : {}".format(episode, count, rall,
77 | np.mean(rlist)))
78 |
79 |
--------------------------------------------------------------------------------
/FrozenLake/FL_Q-Table.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import random as pr
6 |
7 |
8 | env = gym.make('FrozenLake-v1')
9 | # env.monitor.start('tmp/Frozenlake8x8-0.2', force= True)
10 | # Initialize the Q-table
11 | Q = np.zeros([env.observation_space.n,env.action_space.n])
12 |
13 |
14 | num_episodes = 1000
15 |
16 | # Lists to store the rewards and visited states
17 |
18 | rList = []
19 | sList = []
20 |
21 | # Pick a random action among ties when all Q-values are equal
22 | def rargmax(vector):
23 | m = np.amax(vector)
24 | indices = np.nonzero(vector ==m)[0]
25 | return pr.choice(indices)
26 |
27 | for i in range(num_episodes):
28 |     # Reset the environment and episode variables
29 | s = env.reset()
30 | rAll = 0
31 | d = False
32 | j = 0
33 | sList=[]
34 |     # Q-table learning loop
35 |     while not d and j<250:
36 |         j+=1
37 |         # Choose the action with the highest Q-value
38 |         a = rargmax(Q[s,:])
39 |
40 |         # Take the action and receive next_state, reward, done, info
41 | s1,r,d,_ = env.step(a)
42 | if r == 1:
43 | print(sList)
44 | # Q-Learning
45 | Q[s,a]= r+ np.max(Q[s1,:])
46 | s=s1
47 | rAll=rAll+r
48 | sList.append(s)
49 |
50 | rList.append(rAll)
51 |
52 |
53 | print ("Final Q-Table Values")
54 | print (" left down right up")
55 | print (Q)
56 | print("Success rate : ", sum(rList)/num_episodes)
57 | plt.bar(range(len(rList)),rList, color="Blue")
58 | plt.show()
--------------------------------------------------------------------------------
/FrozenLake/FL_Q-table_Stochastic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import random as pr
6 |
7 | env = gym.make('FrozenLake-v0')
8 | # env.monitor.start('tmp/Frozenlake8x8-0.2', force= True)
9 | # Initialize the Q-table
10 | Q = np.zeros([env.observation_space.n, env.action_space.n])
11 |
12 | num_episodes = 2000
13 | discount = 0.99
14 | learning_rate = 0.85
15 |
16 | # Lists to store the rewards and visited states
17 |
18 | rList = []
19 | sList = []
20 |
21 |
22 | # Pick a random action among ties when all Q-values are equal
23 | def rargmax(vector):
24 | m = np.amax(vector)
25 | indices = np.nonzero(vector == m)[0]
26 | return pr.choice(indices)
27 |
28 |
29 | for i in range(num_episodes):
30 |     # Reset the environment and episode variables
31 | s = env.reset()
32 | rAll = 0
33 | d = False
34 | j = 0
35 | sList = []
36 | e = 1. / ((i / 10) + 1)
37 |     # Q-table learning loop
38 |     while not d and j < 250:
39 |         j += 1
40 |
41 |         # Choose the action with the highest Q-value,
42 |         # or a random action for exploration (epsilon-greedy)
43 | if e > np.random.rand(1):
44 | a = env.action_space.sample()
45 | else:
46 | a = rargmax(Q[s, :])
47 |
48 |         # Take the action and receive next_state, reward, done, info
49 | s1, r, d, _ = env.step(a)
50 | if r == 1:
51 | print("episode : ",i," state record : " ,sList)
52 | # Q-Learning
53 |         # The discount factor weights future rewards, so the agent learns the shortest path
54 | Q[s, a] = Q[s,a]*(1-learning_rate) + learning_rate * (r + discount * np.max(Q[s1, :]))
55 | s = s1
56 | rAll = rAll + r
57 | sList.append(s)
58 |
59 | rList.append(rAll)
60 |
61 | print ("Final Q-Table Values")
62 | print (" left down right up")
63 | print (Q)
64 | print("Success rate : ", sum(rList) / num_episodes)
65 | plt.bar(range(len(rList)), rList, color="Blue")
66 | plt.show()
--------------------------------------------------------------------------------
/FrozenLake/FL_Q-table_exp&dis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import gym
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import random as pr
6 |
7 | env = gym.make('FrozenLake-v1')
8 | # env.monitor.start('tmp/Frozenlake8x8-0.2', force= True)
9 | # Initialize the Q-table
10 | Q = np.zeros([env.observation_space.n, env.action_space.n])
11 |
12 | num_episodes = 1000
13 | discount = 0.99
14 |
15 | # Lists to store the rewards and visited states
16 |
17 | rList = []
18 | sList = []
19 |
20 |
21 | # Pick a random action among ties when all Q-values are equal
22 | def rargmax(vector):
23 | m = np.amax(vector)
24 | indices = np.nonzero(vector == m)[0]
25 | return pr.choice(indices)
26 |
27 |
28 | for i in range(num_episodes):
29 |     # Reset the environment and episode variables
30 | s = env.reset()
31 | rAll = 0
32 | d = False
33 | j = 0
34 | sList = []
35 | e = 1. / ((i / 10) + 1)
36 |     # Q-table learning loop
37 |     while not d and j < 250:
38 |         j += 1
39 |
40 |         # Choose the action with the highest Q-value,
41 |         # or a random action for exploration (epsilon-greedy)
42 | if e > np.random.rand(1):
43 | a = env.action_space.sample()
44 | else:
45 | a = rargmax(Q[s, :])
46 |
47 |         # Take the action and receive next_state, reward, done, info
48 | s1, r, d, _ = env.step(a)
49 | if r == 1:
50 | print(sList)
51 | # Q-Learning
52 |         # The discount factor weights future rewards, so the agent learns the shortest path
53 | Q[s, a] = r + discount * np.max(Q[s1, :])
54 | s = s1
55 | rAll = rAll + r
56 | sList.append(s)
57 |
58 | rList.append(rAll)
59 |
60 | print ("Final Q-Table Values")
61 | print (" left down right up")
62 | print (Q)
63 | print("Success rate : ", sum(rList) / num_episodes)
64 | plt.bar(range(len(rList)), rList, color="Blue")
65 | plt.show()
--------------------------------------------------------------------------------
/FrozenLake/FrozenLake_Q-Network.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 27,
6 | "metadata": {
7 | "collapsed": false,
8 | "deletable": true,
9 | "editable": true
10 | },
11 | "outputs": [],
12 | "source": [
13 | "import tensorflow as tf\n",
14 | "import gym\n",
15 | "import numpy as np"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 28,
21 | "metadata": {
22 | "collapsed": false,
23 | "deletable": true,
24 | "editable": true
25 | },
26 | "outputs": [
27 | {
28 | "name": "stderr",
29 | "output_type": "stream",
30 | "text": [
31 | "[2017-02-15 21:14:22,417] Making new env: FrozenLake-v0\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "env = gym.make('FrozenLake-v0')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 29,
42 | "metadata": {
43 | "collapsed": false,
44 | "deletable": true,
45 | "editable": true
46 | },
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Tensor(\"Placeholder_13:0\", shape=(1, 16), dtype=float32) Tensor(\"Placeholder_14:0\", dtype=float32) Tensor(\"Variable_7/read:0\", shape=(16, 4), dtype=float32)\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "x=tf.placeholder(dtype=tf.float32, shape=(1,env.observation_space.n))\n",
58 | "W=tf.Variable(tf.random_uniform((env.observation_space.n, env.action_space.n)))\n",
59 | "\n",
60 | "Q_pre = tf.matmul(x,W)\n",
61 | "\n",
62 | "\n",
63 | "y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))\n",
64 | "\n",
65 | "print(x,y,W)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 39,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": [
76 | "learning_rate = 0.1\n",
77 | "num_episode = 2000\n",
78 | "e = 0.1\n",
79 | "discount_factor = 0.99 \n",
80 | "rlist=[]"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 31,
86 | "metadata": {
87 | "collapsed": false
88 | },
89 | "outputs": [],
90 | "source": [
91 | "cost = tf.reduce_sum(tf.square(y-Q_pre))\n",
92 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
93 | "train = optimizer.minimize(cost)\n",
94 | "\n",
95 | "init = tf.global_variables_initializer()"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 34,
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "def one_hot(x):\n",
107 | " return np.identity(env.observation_space.n)[x:x+1]"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 40,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "\u001b[41mS\u001b[0mFFF\n",
122 | "FHFH\n",
123 | "FFFH\n",
124 | "HFFG\n",
125 | " (Up)\n"
126 | ]
127 | },
128 | {
129 | "ename": "KeyboardInterrupt",
130 | "evalue": "",
131 | "traceback": [
132 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
133 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
134 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0mQ\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mr\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdiscount_factor\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mQ1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0msess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mone_hot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mQ\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mrall\u001b[0m\u001b[0;34m+=\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
135 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[0;32m--> 766\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 767\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 768\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
136 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run\u001b[0;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 962\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 963\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[0;32m--> 964\u001b[0;31m feed_dict_string, options, run_metadata)\n\u001b[0m\u001b[1;32m 965\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
137 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_run\u001b[0;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[1;32m 1012\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1013\u001b[0m return self._do_call(_run_fn, self._session, feed_dict, fetch_list,\n\u001b[0;32m-> 1014\u001b[0;31m target_list, options, run_metadata)\n\u001b[0m\u001b[1;32m 1015\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1016\u001b[0m return self._do_call(_prun_fn, self._session, handle, feed_dict,\n",
138 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_do_call\u001b[0;34m(self, fn, *args)\u001b[0m\n\u001b[1;32m 1019\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1021\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1022\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
139 | "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[0;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[1;32m 1001\u001b[0m return tf_session.TF_Run(session, options,\n\u001b[1;32m 1002\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_list\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1003\u001b[0;31m status, run_metadata)\n\u001b[0m\u001b[1;32m 1004\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1005\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
140 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
141 | ],
142 | "output_type": "error"
143 | }
144 | ],
145 | "source": [
146 | "with tf.Session() as sess:\n",
147 | " sess.run(init)\n",
148 | " env.render()\n",
149 | " for step in range(num_episode):\n",
150 | " s=env.reset()\n",
151 | " rall = 0\n",
152 | " d = False\n",
153 | " \n",
154 | " while not d:\n",
155 | " Q = sess.run(Q_pre, feed_dict={x:one_hot(s)})\n",
156 | " \n",
157 | " if e > np.random.rand(1):\n",
158 | " a = env.action_space.sample()\n",
159 | " else:\n",
160 | " a = np.argmax(Q)\n",
161 | " \n",
162 | " s1,r,d,_ = env.step(a)\n",
163 | " \n",
164 | " Q1 = sess.run(Q_pre, feed_dict={x:one_hot(s1)})\n",
165 | " \n",
166 | " if d:\n",
167 | " Q[0,a]=r\n",
168 | " else:\n",
169 | "                Q[0,a]=r + discount_factor * np.max(Q1)\n",
170 | " \n",
171 | " sess.run(train, feed_dict={x:one_hot(s), y:Q})\n",
172 | " \n",
173 | " rall+=r\n",
174 | " s=s1\n",
175 | " \n",
176 | " rlist.append(rall)\n",
177 | " \n",
178 | " \n",
179 | "print(\"Success rate: \" + str(sum(rlist)/num_episode))"
180 | ]
181 | }
182 | ],
183 | "metadata": {
184 | "kernelspec": {
185 | "display_name": "Python 3",
186 | "language": "python",
187 | "name": "python3"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3.0
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.6.0"
200 | }
201 | },
202 | "nbformat": 4,
203 | "nbformat_minor": 0
204 | }
--------------------------------------------------------------------------------
/FrozenLake/FrozenLake_Q-Network.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import gym
3 | import numpy as np
4 |
5 | env = gym.make('FrozenLake-v0')
6 |
7 | x=tf.placeholder(dtype=tf.float32, shape=(1,env.observation_space.n))
8 |
9 | W1=tf.Variable(tf.random_uniform((env.observation_space.n, env.action_space.n),-0.1,0.1))
10 | Q_pre = tf.matmul(x,W1)
11 |
12 |
13 | y=tf.placeholder(dtype=tf.float32, shape=(1, env.action_space.n))
14 |
15 |
16 |
17 | learning_rate = 0.1
18 | num_episode = 2000
19 | e = 0.1
20 | discount_factor = 0.99
21 | rlist=[]
22 | slist=[]
23 |
24 | cost = tf.reduce_sum(tf.square(y-Q_pre))
25 | optimizer = tf.train.GradientDescentOptimizer(learning_rate)
26 | train = optimizer.minimize(cost)
27 |
28 | init = tf.global_variables_initializer()
29 |
30 |
31 | def one_hot(x):
32 | return np.identity(env.observation_space.n)[x:x + 1]
33 |
34 |
35 | with tf.Session() as sess:
36 | sess.run(init)
37 | for step in range(num_episode):
38 |
39 | s = env.reset()
40 | e = 1. / ((step/50)+10)
41 | rall = 0
42 | d = False
43 | j=0
44 | while not d:
45 | j+=1
46 | Q = sess.run(Q_pre, feed_dict={x: one_hot(s)})
47 |
48 | if e > np.random.rand(1):
49 | a = env.action_space.sample()
50 | else:
51 | a = np.argmax(Q)
52 |
53 | s1, r, d, _ = env.step(a)
54 |
55 | if d:
56 | Q[0, a] = r
57 | else:
58 | Q1 = sess.run(Q_pre, feed_dict={x: one_hot(s1)})
59 | Q[0, a] = r + discount_factor * np.max(Q1)
60 |
61 | sess.run(train, feed_dict={x: one_hot(s), y: Q})
62 |
63 | rall += r
64 | slist.append(s)
65 | s = s1
66 | print(slist)
67 | slist=[]
68 | rlist.append(rall)
69 | print("Episode {} finished after {} timesteps with r={}. Running score: {}".format(step, j, rall, np.mean(rlist)))
70 |
71 | print("Success rate: " + str(sum(rlist) / num_episode))
72 |
73 |
--------------------------------------------------------------------------------
/Pong/Pong_A2C_episodic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 | from skimage.transform import resize
7 | from skimage.color import rgb2gray
8 |
9 | env = gym.make('PongDeterministic-v4')
10 |
11 | # Hyperparameters
12 | LEARNING_RATE = 0.00025
13 | INPUT = env.observation_space.shape[0]
14 | OUTPUT = 2 # reduce the action set to two: UP or DOWN
15 | DISCOUNT = 0.99
16 |
17 | HEIGHT = 84
18 | WIDTH = 84
19 | HISTORY_SIZE = 4
20 |
21 | model_path = 'save/pong-pg.ckpt'
22 |
23 | def pre_proc(X):
24 |     '''Preprocess the input frame.
25 |
26 |     Args:
27 |         X(np.array): raw frame; converted to grayscale, resized to 84x84,
28 |             and multiplied by 255 so it can be stored as integers (to save memory)
29 |
30 |     Returns:
31 |         np.array: the processed frame
32 |     '''
33 |     # taking the max with the previous frame would remove flickering
34 |     # x = np.maximum(X, X1)
35 |     # grayscale and resize to shrink the data
36 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
37 |
38 | return x
39 |
40 | def discount_rewards(r):
41 |     '''Compute the discounted rewards.
42 |
43 |     Args:
44 |         r(np.array): array of rewards
45 |
46 |     Returns:
47 |         discounted_r(np.array): array of discounted rewards
48 | '''
49 | discounted_r = np.zeros_like(r, dtype=np.float32)
50 | running_add = 0
51 | for t in reversed(range(len(r))):
52 | running_add = running_add * DISCOUNT + r[t]
53 | discounted_r[t] = running_add
54 |
55 | return discounted_r
56 |
57 | def get_init_state(history, s):
58 |     '''Initialize the state at the start of an episode.
59 |
60 |     Args:
61 |         history(np.array): array that will hold five frames
62 |         s(list): the initial frame
63 |
64 |     Note:
65 |         fills the first HISTORY_SIZE channels of history with the preprocessed frame s
66 | '''
67 | for i in range(HISTORY_SIZE):
68 | history[:, :, i] = pre_proc(s)
69 |
70 |
71 | def train_episodic(A2Cagent, x, y, r):
72 |     '''Train the network once per episode.
73 |
74 |     Args:
75 |         A2Cagent(ActorCritic): network to train
76 |         x(np.array): array of states
77 |         y(np.array): array of one-hot encoded actions
78 |         r(np.array) : array of discounted rewards
79 |
80 |     Returns:
81 |         l(float): loss returned by the network
82 | '''
83 | l, _ = A2Cagent.sess.run([A2Cagent.loss, A2Cagent.train], feed_dict={A2Cagent.X: x, A2Cagent.Y: y, A2Cagent.r: r})
84 | return l
85 |
86 |
87 | def play_atari(A2Cagent):
88 |     '''Play with the trained network.
89 |
90 |     Args:
91 |         A2Cagent(ActorCritic): the trained network
92 | '''
93 | print("Play Atari!")
94 | episode = 0
95 | while True:
96 | s = env.reset()
97 | history = np.zeros([84, 84, 5], dtype=np.uint8)
98 | done = False
99 | rall = 0
100 | episode += 1
101 | get_init_state(history, s)
102 | while not done:
103 | env.render()
104 | action_p = A2Cagent.sess.run(
105 | A2Cagent.a_prob,feed_dict={A2Cagent.X: np.reshape(np.float32(history[:,:,:4] / 255.), [-1, 84, 84, 4])})
106 | s1, reward, done, _ = env.step(np.argmax(action_p)+2)
107 | history[:, :, 4] = pre_proc(s1)
108 | history[:, :, :4] = history[:, :, 1:]
109 | rall += reward
110 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall))
111 |
112 |
113 | class ActorCritic:
114 | def __init__(self, sess, input_size, output_size):
115 | self.sess = sess
116 | self.input_size = input_size
117 | self.output_size = output_size
118 | self.height = HEIGHT
119 | self.width = WIDTH
120 | self.history_size = HISTORY_SIZE
121 | self.build_network()
122 |
123 | def build_network(self):
124 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size])
125 | self.Y = tf.placeholder('float', [None, self.output_size])
126 | self.r = tf.placeholder('float')
127 |
128 | # Actor network
129 | f1_a = tf.get_variable("f1_a", shape=[1, 1, 4, 1], initializer=tf.contrib.layers.xavier_initializer_conv2d())
130 | f2_a = tf.get_variable("f2_a", shape=[4, 4, 1, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d())
131 | f3_a = tf.get_variable("f3_a", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
132 | w1_a = tf.get_variable("w1_a", shape=[6 * 6 * 32, 256], initializer=tf.contrib.layers.xavier_initializer())
133 | w2_a = tf.get_variable("w2_a", shape=[256, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
134 |
135 | c1_a = tf.nn.relu(tf.nn.conv2d(self.X, f1_a, strides=[1, 1, 1, 1], padding="VALID"))
136 | c2_a = tf.nn.relu(tf.nn.conv2d(c1_a, f2_a, strides=[1, 4, 4, 1], padding="VALID"))
137 | c3_a = tf.nn.relu(tf.nn.conv2d(c2_a, f3_a, strides=[1, 3, 3, 1], padding="VALID"))
138 |
139 | l1_a = tf.reshape(c3_a, [-1, w1_a.get_shape().as_list()[0]])
140 | l2_a = tf.nn.relu(tf.matmul(l1_a, w1_a))
141 | self.a_prob = tf.nn.softmax(tf.matmul(l2_a, w2_a))
142 |
143 | # Critic network(like DQN network)
144 | f1_c = tf.get_variable("f1_c", shape=[8, 8, 4, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d())
145 | f2_c = tf.get_variable("f2_c", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
146 | w1_c = tf.get_variable("w1_c", shape=[9 * 9 * 32, 256], initializer=tf.contrib.layers.xavier_initializer())
147 | w2_c = tf.get_variable("w2_c", shape=[256, 1], initializer=tf.contrib.layers.xavier_initializer())
148 |
149 | c1_c = tf.nn.relu(tf.nn.conv2d(self.X, f1_c, strides=[1, 4, 4, 1], padding="VALID"))
150 | c2_c = tf.nn.relu(tf.nn.conv2d(c1_c, f2_c, strides=[1, 2, 2, 1], padding="VALID"))
151 |
152 | l1_c = tf.reshape(c2_c, [-1, w1_c.get_shape().as_list()[0]])
153 | l2_c = tf.nn.relu(tf.matmul(l1_c, w1_c))
154 | self.v = tf.matmul(l2_c, w2_c)
155 |
156 | # A_t = R_t - V(S_t)
157 | self.adv = self.r - self.v
158 |
159 | # Policy loss
160 | self.log_p = self.Y * tf.log(tf.clip_by_value(self.a_prob,1e-10,1.))
161 | self.log_lik = self.log_p * tf.stop_gradient(self.adv)
162 | self.p_loss = -tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1))
163 |
164 | # Value loss
165 | self.v_loss = tf.reduce_mean(tf.square(self.v - self.r), axis=1)
166 |
167 | # Total loss
168 | self.loss = self.p_loss + self.v_loss
169 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
170 | self.saver = tf.train.Saver()
171 |
172 | def get_action(self, state, max_prob):
173 | action_p = self.sess.run(self.a_prob, feed_dict={self.X: np.reshape(np.float32(state / 255.),
174 | [-1, HEIGHT, WIDTH, HISTORY_SIZE])})
175 |         # sample an action according to the policy's probabilities
176 | max_prob.append(np.max(action_p))
177 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
178 | return action
179 |
180 | def main():
181 | with tf.Session() as sess:
182 |     # if VRAM is insufficient, train on the CPU instead:
183 | # with tf.Session(config = tf.ConfigProto(device_count ={'GPU' : 0})) as sess:
184 | A2Cagent = ActorCritic(sess, INPUT, OUTPUT)
185 |
186 | A2Cagent.sess.run(tf.global_variables_initializer())
187 |
188 | episode = 0
189 | recent_rlist = deque(maxlen=100)
190 | recent_rlist.append(0)
191 |
192 |         # train until the mean reward over the last 100 episodes exceeds 19
193 | while np.mean(recent_rlist) <= 19:
194 | episode += 1
195 |
196 | state_memory = deque()
197 | action_memory = deque()
198 | reward_memory = deque()
199 |
200 |             # frame history so the network can see the ball's motion
201 | history = np.zeros([84, 84, HISTORY_SIZE+1], dtype=np.uint8)
202 | rall, count = 0, 0
203 | done = False
204 |
205 | s = env.reset()
206 | max_prob = deque()
207 | get_init_state(history, s)
208 |
209 | while not done:
210 | # env.render()
211 | count += 1
212 |                 # choose an action
213 |                 action = A2Cagent.get_action(history[:,:,:HISTORY_SIZE], max_prob)
214 |
215 |                 # encode the action as a one-hot vector
216 |                 y = np.zeros(OUTPUT)
217 |                 y[action] = 1
218 |
219 |                 # only two actions (UP or DOWN) are used to speed up training
220 | s1, reward, done, l = env.step(action + 2)
221 |
222 | rall += reward
223 | reward = np.clip(reward, -1, 1)
224 |
225 |                 # store this episode's transitions
226 | state_memory.append(np.copy(np.float32(history[:,:,:HISTORY_SIZE]/255.)))
227 | action_memory.append(np.copy(y))
228 | reward_memory.append(np.copy(reward))
229 |
230 |                 # append the newest frame at the end of the history
231 | history[:, :, HISTORY_SIZE] = pre_proc(s1)
232 | history[:, :, :HISTORY_SIZE] = history[:, :, 1:]
233 |
234 |                 # train when the episode ends
235 |                 if done:
236 |                     # compute the discounted return
237 | rewards = discount_rewards(np.vstack(reward_memory))
238 |
239 | train_episodic(A2Cagent, np.stack(state_memory, axis=0), np.stack(action_memory, axis =0), rewards)
240 | recent_rlist.append(rall)
241 |
242 | print("[Episode {0:6d}] Reward: {1:4f} Recent Reward: {2:4f} Max Prob: {3:5.10f}".format(episode, rall, np.mean(recent_rlist), np.mean(max_prob)))
243 | if episode % 10 == 0:
244 | A2Cagent.saver.save(A2Cagent.sess, model_path, global_step= episode)
245 |
246 |
247 | if __name__ == "__main__":
248 | main()
249 |
250 |
251 |
252 |
253 |
254 |
--------------------------------------------------------------------------------
/Pong/Pong_PolicyGradient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import numpy as np
3 | import tensorflow as tf
4 | import gym
5 | from collections import deque
6 | from skimage.transform import resize
7 | from skimage.color import rgb2gray
8 |
9 | # {}Deterministic : frameskip = 4
10 | # {}-v4 : repeat_action_probability
11 | env = gym.make('PongDeterministic-v4')
12 |
13 | # Hyperparameters
14 | LEARNING_RATE = 0.00025
15 | INPUT = env.observation_space.shape
16 | OUTPUT = 2
17 |
18 | DISCOUNT = 0.99
19 | HEIGHT = 84
20 | WIDTH = 84
21 | HISTORY_SIZE = 4
22 |
23 | model_path = 'save/pong-pg.ckpt'
24 |
25 |
26 | def pre_proc(X):
27 |     '''Preprocess the input frame.
28 |
29 |     Args:
30 |         X(np.array): raw frame; converted to grayscale, resized to 84x84,
31 |             and multiplied by 255 so it can be stored as integers (to save memory)
32 |
33 |     Returns:
34 |         np.array: the processed frame
35 |     '''
36 |     # taking the max with the previous frame would remove flickering
37 |     # x = np.maximum(X, X1)
38 |     # grayscale and resize to shrink the data
39 | x = np.uint8(resize(rgb2gray(X), (HEIGHT, WIDTH), mode='reflect') * 255)
40 |
41 | return x
42 |
43 | def get_init_state(history, s):
44 |     '''Initialize the state at the start of an episode.
45 |
46 |     Args:
47 |         history(np.array): array that will hold five frames
48 |         s(list): the initial frame
49 |
50 |     Note:
51 |         fills the first HISTORY_SIZE channels of history with the preprocessed frame s
52 | '''
53 | for i in range(HISTORY_SIZE):
54 | history[:, :, i] = pre_proc(s)
55 |
56 |
57 | def discount_rewards(r):
58 |     '''Compute the discounted rewards.
59 |
60 |     Args:
61 |         r(np.array): array of rewards
62 |
63 |     Returns:
64 |         discounted_r(np.array): array of discounted rewards
65 | '''
66 | discounted_r = np.zeros_like(r, dtype=np.float32)
67 | running_add = 0
68 | for t in reversed(range(len(r))):
69 |
70 | if r[t] != 0:
71 |             # a nonzero reward marks the end of a "mini-episode" within the game (for Pong)
72 | running_add = 0
73 | running_add = running_add * DISCOUNT + r[t]
74 | discounted_r[t] = running_add
75 |
76 | # normalizing
77 | discounted_r = discounted_r - discounted_r.mean()
78 | discounted_r = discounted_r / discounted_r.std()
79 |
80 | return discounted_r
81 |
82 | def train_episodic(PGagent, x, y, adv):
83 |     '''Train the network once per episode.
84 |
85 |     Args:
86 |         PGagent(PolicyGradient): network to train
87 |         x(np.array): array of states
88 |         y(np.array): array of one-hot encoded actions
89 |         adv(np.array) : array of discounted rewards
90 |
91 |     Returns:
92 |         l(float): loss returned by the network
93 | '''
94 |
95 | l, _ = PGagent.sess.run([PGagent.loss, PGagent.train], feed_dict={PGagent.X: x,
96 | PGagent.Y: y,
97 | PGagent.adv: adv})
98 | return l
99 |
100 | def play_atari(PGagent):
101 |     '''Play with the trained network.
102 |
103 |     Args:
104 |         PGagent(PolicyGradient): the trained network
105 | '''
106 | print("Play Atari!")
107 | episode = 0
108 | while True:
109 | s = env.reset()
110 | history = np.zeros([84, 84, 5], dtype=np.uint8)
111 | done = False
112 | rall = 0
113 | episode += 1
114 | get_init_state(history, s)
115 | while not done:
116 | env.render()
117 | action_p = PGagent.sess.run(
118 | PGagent.a_pre,feed_dict={PGagent.X: np.reshape(np.float32(history[:,:,:4] / 255.), [-1, 84, 84, 4])})
119 | s1, reward, done, _ = env.step(np.argmax(action_p)+2)
120 | history[:, :, 4] = pre_proc(s1)
121 | history[:, :, :4] = history[:, :, 1:]
122 | rall += reward
123 | print("[Episode {0:6f}] Reward: {1:4f} ".format(episode, rall))
124 |
125 |
126 | class PolicyGradient:
127 | def __init__(self, sess, input_size, output_size , name = 'main'):
128 | self.sess = sess
129 | self.input_size = input_size
130 | self.output_size = output_size
131 | self.height = HEIGHT
132 | self.width = WIDTH
133 | self.history_size = HISTORY_SIZE
134 | self.name = name
135 | self.build_network()
136 |
137 | def build_network(self):
138 | with tf.variable_scope(self.name):
139 | self.X = tf.placeholder('float', [None, self.height, self.width, self.history_size])
140 | self.Y = tf.placeholder('float', [None, self.output_size])
141 | self.adv = tf.placeholder('float')
142 |
143 | f1 = tf.get_variable("f1", shape=[1, 1, 4, 1], initializer=tf.contrib.layers.xavier_initializer_conv2d())
144 | f2 = tf.get_variable("f2", shape=[4, 4, 1, 16], initializer=tf.contrib.layers.xavier_initializer_conv2d())
145 | f3 = tf.get_variable("f3", shape=[4, 4, 16, 32], initializer=tf.contrib.layers.xavier_initializer_conv2d())
146 | w1 = tf.get_variable("w1", shape=[6*6*32, 256], initializer=tf.contrib.layers.xavier_initializer())
147 | w2 = tf.get_variable("w2", shape=[256, OUTPUT], initializer=tf.contrib.layers.xavier_initializer())
148 |
149 | # 1x1 conv layer
150 | c1 = tf.nn.relu(tf.nn.conv2d(self.X, f1, strides=[1, 1, 1, 1], padding="VALID"))
151 | c2 = tf.nn.relu(tf.nn.conv2d(c1, f2, strides=[1, 3, 3, 1], padding="VALID"))
152 | c3 = tf.nn.relu(tf.nn.conv2d(c2, f3, strides=[1, 4, 4, 1], padding="VALID"))
153 |
154 | l1 = tf.reshape(c3, [-1, w1.get_shape().as_list()[0]])
155 | l2 = tf.nn.relu(tf.matmul(l1, w1))
156 | self.a_pre = tf.nn.softmax(tf.matmul(l2, w2))
157 |
158 |             # clip probabilities to avoid NaN from log(0)
159 | self.log_p = tf.log(tf.clip_by_value(self.a_pre, 1e-10, 1.)) * self.Y
160 |
161 | self.log_lik = -self.log_p * self.adv
162 | self.loss = tf.reduce_mean(tf.reduce_sum(self.log_lik, axis=1))
163 | self.train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
164 | self.saver = tf.train.Saver()
165 |
166 | def get_action(self, state, max_prob):
167 | action_p = self.sess.run(self.a_pre, feed_dict={self.X: np.reshape(np.float32(state/255.),
168 | [-1,HEIGHT,WIDTH,HISTORY_SIZE])})
169 |         # sample an action according to the policy's probabilities
170 | max_prob.append(np.max(action_p))
171 | action = np.random.choice(np.arange(self.output_size), p=action_p[0])
172 |
173 | return action
174 | # config = tf.ConfigProto(device_count ={'GPU' : 0})
175 | def main():
176 | with tf.Session() as sess:
177 |     # if VRAM is insufficient, train on the CPU instead:
178 | # with tf.Session(config = tf.ConfigProto(device_count ={'GPU' : 0})) as sess:
179 | PGagent = PolicyGradient(sess, INPUT, OUTPUT)
180 |
181 | PGagent.sess.run(tf.global_variables_initializer())
182 |
183 | episode = 0
184 | recent_rlist = deque(maxlen=100)
185 | recent_rlist.append(0)
186 |
187 |         # train until the mean reward over the last 100 episodes exceeds 19
188 | while np.mean(recent_rlist) <= 19:
189 | episode += 1
190 |
191 | state_memory = deque()
192 | action_memory = deque()
193 | reward_memory = deque()
194 |
195 |             # frame history so the network can see the ball's motion
196 | history = np.zeros([84, 84, HISTORY_SIZE+1], dtype=np.uint8)
197 | rall, count = 0, 0
198 | done = False
199 |
200 | s = env.reset()
201 | max_prob = deque()
202 | get_init_state(history, s)
203 |
204 | while not done:
205 | # env.render()
206 | count += 1
207 |                 # choose an action
208 |                 action = PGagent.get_action(history[:,:,:HISTORY_SIZE], max_prob)
209 |
210 |                 # encode the action as a one-hot vector
211 |                 y = np.zeros(OUTPUT)
212 |                 y[action] = 1
213 |
214 |                 # only two actions (UP or DOWN) are used to speed up training
215 | s1, reward, done, l = env.step(action + 2)
216 |
217 | rall += reward
218 | reward = np.clip(reward, -1, 1)
219 |
220 |                 # store this episode's transitions
221 | state_memory.append(np.copy(np.float32(history[:,:,:HISTORY_SIZE]/255.)))
222 | action_memory.append(np.copy(y))
223 | reward_memory.append(np.copy(reward))
224 |
225 |                 # append the newest frame at the end of the history
226 | history[:, :, HISTORY_SIZE] = pre_proc(s1)
227 | history[:, :, :HISTORY_SIZE] = history[:, :, 1:]
228 |
229 |                 # train when the episode ends
230 |                 if done:
231 |                     # compute the discounted return
232 | rewards = discount_rewards(np.vstack(reward_memory))
233 |
234 | l = train_episodic(PGagent, np.stack(state_memory, axis=0),
235 | np.stack(action_memory, axis =0), rewards)
236 |
237 |
238 | recent_rlist.append(rall)
239 |
240 | print("[Episode {0:6d}] Step:{4:6d} Reward: {1:4f} Loss: {2:5.5f} Recent Reward: {3:4f} Max Prob: {5:5.5f}".
241 | format(episode, rall, l, np.mean(recent_rlist), count, np.mean(max_prob)))
242 |
243 | if episode % 10 == 0:
244 | PGagent.saver.save(PGagent.sess, model_path, global_step= episode)
245 | play_atari(PGagent)
246 |
247 |
248 | if __name__ == "__main__":
249 | main()
250 |
251 |
252 |
253 |
254 |
255 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning
2 | ###### Reinforcement learning examples applied to various environments (being ported to PyTorch)
3 | # [Here is my new Repo for Policy Gradient!!](https://github.com/jcwleo/mario_rl)
4 | -------------------
5 |
6 | 
7 | ###### [Breakout / Use DQN(Nature2015)]
8 |
9 | ---------------
10 | ## 1. Q-Learning / SARSA
11 | * FrozenLake(Gridworld)
12 | * [Deterministic Q-Learning](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FL_Q-Table.py)
13 | * [Add Exploration & Discounted Factor](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FL_Q-table_exp%26dis.py)
14 | * [Stochastic Q-Learning](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FL_Q-table_Stochastic.py)
15 | * WindyGridWorld(in Sutton's book)
16 | * [Q-Learning / SARSA](https://github.com/jcwleo/Reinforcement_Learning/tree/master/Windygridworld)
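
As a quick reference, here is a minimal sketch of the tabular updates these scripts implement; `Q`, `alpha`, and `discount` are illustrative names, not taken from any one file:

```python
import numpy as np

def q_learning_update(Q, s, a, r, s1, alpha=0.5, discount=0.99):
    # Q-Learning bootstraps from the greedy action at the next state:
    # Q(s,a) <- Q(s,a) + alpha * (r + discount * max_a' Q(s',a') - Q(s,a))
    Q[s, a] += alpha * (r + discount * np.max(Q[s1, :]) - Q[s, a])

def sarsa_update(Q, s, a, r, s1, a1, alpha=0.5, discount=0.99):
    # SARSA bootstraps from the action a1 actually taken at the next state:
    # Q(s,a) <- Q(s,a) + alpha * (r + discount * Q(s',a') - Q(s,a))
    Q[s, a] += alpha * (r + discount * Q[s1, a1] - Q[s, a])
```
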
17 | ## 2. Q-Network (Action-Value Function Approximation)
18 | * [FrozenLake(Gridworld)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/FrozenLake/FrozenLake_Q-Network.py)
19 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_Q-Network.py)
20 |
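The idea is the same as the Q-table, except Q(s, ·) now comes from a function approximator, so the update becomes a regression toward a TD target. A minimal sketch of that target (function and argument names are illustrative):

```python
import numpy as np

def td_target(reward, done, q_next, discount=0.99):
    # target for the chosen action: r at terminal states, else r + discount * max_a' Q(s', a')
    return reward if done else reward + discount * np.max(q_next)
```
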
21 | ## 3. DQN
22 | DQN(NIPS2013) uses an Experience Replay Memory and a CNN.
23 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_DQN_NIPS2013.py) - for CartPole, the network learns from the raw sensor observations instead of using a CNN
24 |
25 | DQN(Nature2015) uses an Experience Replay Memory, a Target Network, and a CNN (a minimal sketch of these two ingredients follows the list below).
26 |
27 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_DQN_Nature2015.py)
28 | * [Breakout(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Breakout/Breakout_DQN_class.py)
29 | * [Breakout(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Breakout/breakout_dqn_pytorch.py)
30 |    * this version is written in PyTorch and is more memory-efficient and faster to train
31 |
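A minimal sketch of the two Nature-2015 ingredients, experience replay and a periodically synced target network; class and constant names are illustrative, not those of the scripts above:

```python
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity=200000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        return random.sample(self.buffer, batch_size)

# Every TARGET_UPDATE steps the online weights are copied into a frozen target network,
# and TD targets are built with it: y = r + DISCOUNT * max_a' Q_target(s', a') (y = r at terminal states).
```
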
32 | ## 5. Vanilla Policy Gradient(REINFORCE)
33 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_PolicyGradient.py)
34 | * [Pong(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Pong/Pong_PolicyGradient.py)
35 | * [Breakout(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Breakout/Breakout_PolicyGradient.py)
36 |
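A minimal sketch of the REINFORCE objective these scripts optimize, written here in PyTorch for brevity (the Atari versions above use TensorFlow); `log_probs` and `returns` are assumed to be collected over one episode:

```python
import torch

def reinforce_loss(log_probs, returns):
    # maximize E[log pi(a_t|s_t) * G_t], i.e. minimize its negative
    returns = torch.as_tensor(returns, dtype=torch.float32)
    return -(torch.stack(log_probs) * returns).sum()
```
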
37 | ## 6. Advantage Actor Critic
38 | * episodic
39 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_A2C_episodic.py)
40 | * [Pong(atari)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/Pong/Pong_A2C_episodic.py)
41 | * one-step
42 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/Cartpole_A2C_onestep.py)
43 | * n-step
44 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/Cartpole_A2C_nstep.py)
45 |
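A minimal sketch of the one-step advantage actor-critic losses; the tensors `log_prob`, `value`, `next_value`, `reward`, and `done` are assumed given:

```python
import torch
import torch.nn.functional as F

def a2c_losses(log_prob, value, next_value, reward, done, discount=0.99):
    # one-step target and advantage: A_t = r + gamma * V(s') * (1 - done) - V(s)
    target = reward + discount * next_value * (1 - done)
    advantage = (target - value).detach()        # stop gradients through the advantage
    actor_loss = -(log_prob * advantage).mean()
    critic_loss = F.mse_loss(value, target.detach())
    return actor_loss, critic_loss
```
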
46 | ## 7. Deep Deterministic Policy Gradient
47 | * [Pendulum(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/pendulum/pendulum_ddpg.py)
48 |
49 | ## 8. Parallel Advantage Actor Critic (called 'A2C' by OpenAI)
50 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_PAAC.py) (uses a single thread instead of multiple threads)
51 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_PAAC_multiproc.py) (uses PyTorch multiprocessing)
52 | * [Super Mario Bros](https://github.com/jcwleo/mario_rl) (uses PyTorch multiprocessing)
53 |
54 | ## 9. C51(Distributional RL)
55 | * DDQN
56 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/CartPole_C51.py)
57 |
58 | ## 10. PPO(Proximal Policy Optimization)
59 | * [CartPole(Classic Control)](https://github.com/jcwleo/Reinforcement_Learning/blob/master/CartPole/cartpole_ppo.py)
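
A minimal sketch of the clipped surrogate objective that `cartpole_ppo.py` optimizes; `log_prob`, `log_prob_old`, and `adv` are assumed to be per-sample tensors and `epsilon` is the clip range:

```python
import torch

def ppo_actor_loss(log_prob, log_prob_old, adv, epsilon=0.2):
    ratio = torch.exp(log_prob - log_prob_old)                    # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -torch.min(surr1, surr2).mean()
```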
60 |
--------------------------------------------------------------------------------
/Windygridworld/OptimalPolicy/optimal_Q-Learning.txt:
--------------------------------------------------------------------------------
1 | RRRRRRRRRD
2 | RRRRRRRDRD
3 | RRRRRRLURD
4 | UURRRRLLRD
5 | RRRRDLLLRD
6 | RRRRLLLLLD
7 | DDDLLLLLLL
8 |
--------------------------------------------------------------------------------
/Windygridworld/OptimalPolicy/optimal_SARSA.txt:
--------------------------------------------------------------------------------
1 | LULRRRRRUD
2 | RLRRRRURDD
3 | LULURRLRLD
4 | DUDRRLLLRD
5 | RURLRLLLLD
6 | LLRDLLLLLD
7 | RRLLLLLLLL
8 |
--------------------------------------------------------------------------------
/Windygridworld/Q-learning_sarsa.py:
--------------------------------------------------------------------------------
1 | from windygridworld import WindyGridWorld
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import random as rn
5 |
6 |
7 | def rargmax(vector):
8 |     '''Choose an action at random when several Q-values are tied for the maximum.
9 |
10 |     Args:
11 |         vector(ndarray): Q-table
12 |
13 |     Returns:
14 |         action: the randomly chosen action
15 |
16 | '''
17 | m = np.amax(vector)
18 | indices = np.nonzero(vector == m)[0]
19 | return rn.choice(indices)
20 |
21 |
22 | def array2index(array, width):
23 | '''
24 |
25 | Args:
26 | array: gridworld array
27 |         width: width of the gridworld
28 |
29 |     Returns:
30 |         idx: the 2D coordinate converted to a flat index
31 | '''
32 | idx = array[0] * width + array[1]
33 | return idx
34 |
35 |
36 | def learning(max_step, learning_type,render):
37 | env = WindyGridWorld()
38 |
39 |     # Create the Q-table
40 |     Q = np.zeros([env.observation_space, env.action_space])
41 |
42 |     global_step = 0
43 |
44 |     # Hyperparameters
45 | alpha = 0.5
46 | epsilon = 0.1
47 |
48 | episode = 0
49 | plot_graph = []
50 |
51 | while global_step <= max_step:
52 | episode += 1
53 |
54 |         # Reset the episode
55 | state = env.reset()
56 |
57 | done = False
58 | step = 0
59 | total_reward = 0
60 | while not done:
61 | if render:
62 | env.render()
63 |
64 | step += 1
65 | global_step += 1
66 | plot_graph.append(episode)
67 |
68 |             # choose an action with the e-greedy policy
69 | if epsilon > np.random.rand(1):
70 | action = np.random.randint(env.action_space)
71 | else:
72 | action = rargmax(Q[array2index(state, env.width), :])
73 |
74 |             # take the action in the environment
75 | next_state, reward, done, _ = env.step(action)
76 |
77 | total_reward += reward
78 |
79 |             # update differently for Q-Learning and SARSA
80 | if learning_type == 'Q-Learning':
81 | # Q-learning
82 | # Q(s,a) = Q(s,a) + a * (reward + max_a(Q(s',a)) - Q(s,a))
83 | Q[array2index(state, env.width), action] += (
84 | alpha * (reward + np.max(Q[array2index(next_state, env.width), :])
85 | - Q[array2index(state, env.width), action]))
86 | else:
87 | # SARSA
88 |                 # Q(s,a) = Q(s,a) + a * (reward + Q(s',a') - Q(s,a)); here the current action index is reused at s' as a'
89 | Q[array2index(state, env.width), action] += (
90 | alpha * (reward + (Q[array2index(next_state, env.width), action])
91 | - Q[array2index(state, env.width), action]))
92 |
93 | state = next_state[:]
94 |
95 | print('Learning Type : {} Episode : {:5.0f} Step : {:5.0f} reward : {:5.0f}'
96 | .format(learning_type,episode,step,total_reward))
97 |
98 |     # Save the learned Q-values
99 |     np.save('QValue/{}_value'.format(learning_type), Q)
100 |     np.savetxt('QValue/{}_value.txt'.format(learning_type), Q)
101 |
102 |     direction = np.array(['L', 'U', 'R', 'D'])
103 |
104 |     # Extract the learned optimal action for each state
105 | Q = np.argmax(Q, axis=1)
106 | optimal_policy = np.chararray([env.observation_space], unicode=True)
107 | for i in range(env.action_space):
108 | optimal_policy[Q == i] = direction[i]
109 |
110 | optimal_policy = optimal_policy.reshape([env.height, env.width])
111 |
112 |     # Save the optimal policy as a txt file
113 | np.savetxt('OptimalPolicy/optimal_{}.txt'.format(learning_type), optimal_policy, delimiter='', fmt='%s')
114 |
115 | return plot_graph
116 |
117 |
118 | def main():
119 |     # number of steps to train
120 |     max_step = 20000
121 |
122 |     # set to True to watch the agent move
123 |     render = False
124 |
125 |     # train with Q-Learning and SARSA respectively
126 |     q_graph = learning(max_step, 'Q-Learning',render)
127 |     sarsa_graph = learning(max_step, 'SARSA', render)
128 |
129 |     # plot the Q-Learning and SARSA learning curves
130 | plt.xlim([0, max_step * 1.1])
131 | plt.plot(q_graph, 'b', label='Q-learning')
132 | plt.plot(sarsa_graph, 'g', label='SARSA')
133 | plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
134 | ncol=2, mode="expand", borderaxespad=0.)
135 | plt.savefig('graph.png')
136 | plt.show()
137 |
138 |
139 | if __name__ == '__main__':
140 | main()
141 |
--------------------------------------------------------------------------------
/Windygridworld/QValue/Q-Learning_value.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/Windygridworld/QValue/Q-Learning_value.npy
--------------------------------------------------------------------------------
/Windygridworld/QValue/Q-Learning_value.txt:
--------------------------------------------------------------------------------
1 | -1.606109091396425725e+01 -1.610001232580505715e+01 -1.584762083464220339e+01 -1.630073073954796925e+01
2 | -1.541752194222167915e+01 -1.538488113843050087e+01 -1.498136231988978295e+01 -1.529896876498860436e+01
3 | -1.466609184134511246e+01 -1.441765939708626121e+01 -1.399886281708481128e+01 -1.422399696898425248e+01
4 | -1.403975840801068209e+01 -1.304280009147792541e+01 -1.300000000000000000e+01 -1.381113840589452835e+01
5 | -1.398668998954349618e+01 -1.299979408507867618e+01 -1.200000000000000000e+01 -1.298531652716749818e+01
6 | -1.299976087288599125e+01 -1.199998733593630185e+01 -1.100000000000000000e+01 -1.199966338906388685e+01
7 | -1.199999982237042673e+01 -1.099999994034184070e+01 -1.000000000000000000e+01 -1.099999976158142090e+01
8 | -1.099999466260687875e+01 -9.999997589438164525e+00 -9.000000000000000000e+00 -9.999999498486896954e+00
9 | -9.999998310057273798e+00 -8.999999970180244446e+00 -8.000000000000000000e+00 -8.999999978395445410e+00
10 | -8.999998486836986444e+00 -7.999999981186673992e+00 -7.999999999693950592e+00 -7.000000000000000000e+00
11 | -1.626172585608986054e+01 -1.658059207685153780e+01 -1.597713827371924644e+01 -1.660554227998717991e+01
12 | -1.643381402014644266e+01 -1.557235125894735184e+01 -1.499991033583573596e+01 -1.543447798906282209e+01
13 | -1.517692580039580896e+01 -1.460284124118269844e+01 -1.399999999126103489e+01 -1.429390618888084674e+01
14 | -1.493757265185002225e+01 -1.399999736862767818e+01 -1.300000000000000000e+01 -1.398218581690473705e+01
15 | -1.334645919114836055e+01 -1.274473947216430680e+01 -1.199999999873805479e+01 -1.286422890813454067e+01
16 | -1.299607909255507820e+01 -1.197791104340741697e+01 -1.100000000000000000e+01 -1.198316236699534088e+01
17 | -1.094934285876594871e+01 -1.017485070106623724e+01 -9.997129935204984719e+00 -1.048353891316603281e+01
18 | -8.018247879035385495e+00 -6.017249159629500355e+00 -8.468204935173119452e+00 -5.283426284790039062e+00
19 | -7.383547711009669001e+00 -7.299712672339803810e+00 -6.999999777473749241e+00 -7.499088532475981950e+00
20 | -8.999998306851560770e+00 -7.999999938540030442e+00 -6.999999985098838806e+00 -6.000000000000000000e+00
21 | -1.698537158189627405e+01 -1.699999441852381565e+01 -1.600000000000000000e+01 -1.788986110612843561e+01
22 | -1.699956194601221071e+01 -1.599624101016233624e+01 -1.500000000000000000e+01 -1.699456858716657592e+01
23 | -1.599018404768081503e+01 -1.499480831099609190e+01 -1.400000000000000000e+01 -1.499915410552886819e+01
24 | -1.422119196246004691e+01 -1.304116248367921038e+01 -1.299999992047652242e+01 -1.303508915023628667e+01
25 | -1.399923735533061375e+01 -1.299974958775605494e+01 -1.200000000000000000e+01 -1.298437499053001076e+01
26 | -1.235948743736836164e+01 -1.174968104199036389e+01 -1.099930108758036340e+01 -1.139215684404715212e+01
27 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
28 | -7.500000000000000000e+00 -4.079315185546875000e+00 -5.748177064951963899e+00 -4.430676720337942243e+00
29 | -8.353223943527794404e+00 -6.994773362917942272e+00 -5.999999989244431120e+00 -6.336669921875000000e+00
30 | -7.999977320199444186e+00 -6.999999992910221458e+00 -5.999998087922945089e+00 -5.000000000000000000e+00
31 | -1.799909823412379950e+01 -1.699981158508516899e+01 -1.699981705762422735e+01 -1.699986625193924894e+01
32 | -1.789990493222170187e+01 -1.600000000000000000e+01 -1.699999809259701067e+01 -1.600000000000000000e+01
33 | -1.512968564860241472e+01 -1.468519456868694917e+01 -1.399999791341254962e+01 -1.449757585816993632e+01
34 | -1.489052999652578535e+01 -1.399088390900487511e+01 -1.300000000000000000e+01 -1.393324809313147838e+01
35 | -1.300870325873785127e+01 -1.282990356841157364e+01 -1.199458390272461017e+01 -1.227136230468750000e+01
36 | -1.171322996557396401e+01 -1.148988889283562997e+01 -1.098229512021653065e+01 -1.125000000000000000e+01
37 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
38 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
39 | -5.855931133031845093e+00 -7.196364956585241224e+00 -4.999999999960096808e+00 -5.000000000000000000e+00
40 | -6.999999278542817116e+00 -5.999989555910574701e+00 -4.999985963106155396e+00 -4.000000000000000000e+00
41 | -1.699819723707243568e+01 -1.798778248505182731e+01 -1.600000000000000000e+01 -1.700000000000000000e+01
42 | -1.699856503371485417e+01 -1.697990005204028563e+01 -1.500000000000000000e+01 -1.574690184543958793e+01
43 | -1.578901819996925582e+01 -1.497437362996795684e+01 -1.400000000000000000e+01 -1.486274657066915239e+01
44 | -1.392810581613018428e+01 -1.343661317201522465e+01 -1.297968200089046853e+01 -1.361869725172174483e+01
45 | -1.201060468427422734e+01 -1.226785252888633870e+01 -1.195749734665353614e+01 -1.192961187653998500e+01
46 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
47 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
48 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
49 | -4.443339188583195210e+00 -5.911444380167361956e+00 -3.999991390166542260e+00 -4.000000000000000000e+00
50 | -5.999991093075700554e+00 -4.999995257228647461e+00 -3.999999522416652020e+00 -3.000000000000000000e+00
51 | -1.577010001503160197e+01 -1.551488599894391385e+01 -1.551246395474427864e+01 -1.576567094844221195e+01
52 | -1.549649187959380470e+01 -1.528496152793104912e+01 -1.486852728068114970e+01 -1.506486514796082865e+01
53 | -1.397552628060613067e+01 -1.462096647017417439e+01 -1.395051287552606745e+01 -1.421640167917921360e+01
54 | -1.414565940417493550e+01 -1.295734866739119262e+01 -1.287681184539292900e+01 -1.300000000000000000e+01
55 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
56 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
57 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
58 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
59 | 0.000000000000000000e+00 -5.999985351276880152e+00 -2.999999992316588759e+00 -9.999999850988388062e-01
60 | -4.999892869434290787e+00 -3.999999378109350801e+00 -2.999969482339565729e+00 -2.000000000000000000e+00
61 | -1.536462809424847364e+01 -1.539057287892060799e+01 -1.529154700887497853e+01 -1.521031709612017480e+01
62 | -1.498813466990083576e+01 -1.459573974152747766e+01 -1.457605338710970244e+01 -1.450000000000000000e+01
63 | -1.419253091642605114e+01 -1.422051572091282168e+01 -1.373968121404789855e+01 -1.366981866991860173e+01
64 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
65 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
66 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
67 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
68 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
69 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
70 | -1.000000000000000000e+00 -2.999999970197677612e+00 -1.999999046325683594e+00 -1.999938964843750000e+00
71 |
--------------------------------------------------------------------------------
/Windygridworld/QValue/SARSA_value.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/Windygridworld/QValue/SARSA_value.npy
--------------------------------------------------------------------------------
/Windygridworld/QValue/SARSA_value.txt:
--------------------------------------------------------------------------------
1 | -2.750000000000000000e+01 -2.750000000000000000e+01 -2.755171852545975497e+01 -2.794097699769210053e+01
2 | -2.735494478117115236e+01 -2.700000000000000000e+01 -2.734153123529298313e+01 -2.734687984460050103e+01
3 | -2.689838912315719455e+01 -2.700000000000000000e+01 -2.736372676286155681e+01 -2.707966508952665663e+01
4 | -2.731632245055688202e+01 -2.700000000000000000e+01 -2.697839578178842856e+01 -2.700000000000000000e+01
5 | -2.657724991070490717e+01 -2.650000000000000000e+01 -2.599991075437475274e+01 -2.600000000000000000e+01
6 | -2.615094907709034544e+01 -2.500000000000000000e+01 -2.499999994869993714e+01 -2.550000000000000000e+01
7 | -2.405820291213153794e+01 -2.500000000000000000e+01 -2.399999999999540279e+01 -2.450000000000000000e+01
8 | -2.420340995284578156e+01 -2.300000000000000000e+01 -2.299999999999997513e+01 -2.350000000000000000e+01
9 | -2.379328314829689361e+01 -2.200000000000000000e+01 -2.200000000000000000e+01 -2.200000000000000000e+01
10 | -2.426928310791577559e+01 -1.800000000000000000e+01 -2.100000000000000000e+01 -1.599999998870652895e+01
11 | -2.700000000000000000e+01 -2.762966168297720060e+01 -2.676026358179247211e+01 -2.782720087022922684e+01
12 | -2.675502043365662530e+01 -2.725791131692798785e+01 -2.733043339699290897e+01 -2.720852208992571519e+01
13 | -2.708992262006324125e+01 -2.735905145148296924e+01 -2.707059957746382040e+01 -2.740283885071267633e+01
14 | -2.736746336446779537e+01 -2.743136641674209386e+01 -2.691921125211015209e+01 -2.700000000000000000e+01
15 | -2.645415901228469480e+01 -2.640191233158111572e+01 -2.596539275585391948e+01 -2.600000000000000000e+01
16 | -2.529348416526390508e+01 -2.571744495630264282e+01 -2.498471302248990611e+01 -2.500000000000000000e+01
17 | -2.428265391984366772e+01 -2.376977539062500000e+01 -2.382696147502261041e+01 -2.448913574218750000e+01
18 | -1.871860645846396665e+01 -1.518750000000000000e+01 -1.481575235491524722e+01 -1.821875000000000000e+01
19 | -1.957267516581529776e+01 -1.634765625000000000e+01 -1.610902905464172363e+01 -1.600000000000000000e+01
20 | -2.381555753075725335e+01 -1.747002146206796169e+01 -1.600000000000000000e+01 -1.499999999881756096e+01
21 | -2.800000000000000000e+01 -6.818945907169836573e+01 -2.804746459838423078e+01 -4.488060787718251277e+01
22 | -2.780552595379219838e+01 -2.744480348016970339e+01 -2.777906782193912250e+01 -2.821345237620777979e+01
23 | -2.740589903838792907e+01 -2.756148423787738722e+01 -2.761133809323506583e+01 -2.754469650995256558e+01
24 | -2.746726287278380596e+01 -2.666675722599029541e+01 -2.672322528627580951e+01 -2.700000000000000000e+01
25 | -2.702967000172590772e+01 -2.611713954806327820e+01 -2.591307265742956645e+01 -2.600000000000000000e+01
26 | -2.568309227073276091e+01 -2.490106201171875000e+01 -2.483078663441147427e+01 -2.500000000000000000e+01
27 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
28 | -1.179975311173511798e+01 -9.625000000000000000e+00 -8.824859619140625000e+00 -1.090625000000000000e+01
29 | -1.488830161209904190e+01 -1.873242187500000000e+01 -1.535912768449634314e+01 -1.500000000000000000e+01
30 | -1.977274485510834268e+01 -1.758000184083357453e+01 -1.500000000000000000e+01 -1.399999999987816501e+01
31 | -5.400000000000000000e+01 -5.597120564856491143e+01 -5.454495271566119641e+01 -5.203986278927537512e+01
32 | -4.981962152750929818e+01 -2.788261371571276825e+01 -6.038980781517233964e+01 -2.828242330827783491e+01
33 | -3.536586028046440333e+01 -2.777991488643285933e+01 -2.712984053489503466e+01 -2.711792195270746930e+01
34 | -2.810145826160442084e+01 -2.737530132710526232e+01 -2.669065260337092482e+01 -2.700000000000000000e+01
35 | -2.553866601464267205e+01 -2.589084246754646301e+01 -2.543151347043419364e+01 -2.550000000000000000e+01
36 | -2.444986838835328058e+01 -2.517120814323425293e+01 -2.446946968310250270e+01 -2.450000000000000000e+01
37 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
38 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
39 | -1.679571970998039276e+01 -1.623974609375000000e+01 -1.497991088032722473e+01 -1.500000000000000000e+01
40 | -1.588776146110567566e+01 -1.767090018984526978e+01 -1.400000000000000000e+01 -1.299999999999290878e+01
41 | -2.950000000000000000e+01 -5.307988381194701333e+01 -2.933618428396760436e+01 -5.596590163984805599e+01
42 | -2.899783305689743429e+01 -2.837034892286947141e+01 -2.837495737086192804e+01 -2.851418136812195669e+01
43 | -2.903510336858334995e+01 -2.769415967330536077e+01 -2.743419954331855237e+01 -2.754025260181720114e+01
44 | -2.595956777962328488e+01 -2.709387493133544922e+01 -2.605684741249711323e+01 -2.600000000000000000e+01
45 | -2.709679981338558719e+01 -2.572606408596038818e+01 -2.503135233563703110e+01 -2.550000000000000000e+01
46 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
47 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
48 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
49 | -1.092290295479242701e+01 -1.298925781250000000e+01 -1.218994140625000000e+01 -1.100000000000000000e+01
50 | -1.687865904550222496e+01 -1.834917828781726712e+01 -1.250000000000000000e+01 -1.199999999999972999e+01
51 | -2.750000000000000000e+01 -3.325200196528108876e+01 -2.752523696001526332e+01 -2.795410195172147283e+01
52 | -2.764261279377782543e+01 -2.880099473852042991e+01 -2.768212347066484824e+01 -2.769669787462559185e+01
53 | -2.760426634377552091e+01 -2.743772949512522885e+01 -2.691082028037558516e+01 -2.733659871582978695e+01
54 | -2.622418781429856693e+01 -2.550917020792257972e+01 -2.562161888607650440e+01 -2.550000000000000000e+01
55 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
56 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
57 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
58 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
59 | 0.000000000000000000e+00 -1.718215614557266235e+01 -1.213046455383300781e+01 -1.350000000000000000e+01
60 | -1.167318339636358360e+01 -1.873467007229146475e+01 -1.150000000000000000e+01 -1.099999999999999822e+01
61 | -2.750000000000000000e+01 -2.923979778774805638e+01 -2.746730817452127127e+01 -2.750000000000000000e+01
62 | -2.733467742053835536e+01 -2.714173086277996561e+01 -2.695090967456259179e+01 -2.700000000000000000e+01
63 | -2.630331841953739058e+01 -2.650975755363469233e+01 -2.632902625338213198e+01 -2.700000000000000000e+01
64 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
65 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
66 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
67 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
68 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
69 | 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00
70 | -1.000000000000000000e+00 -1.851891208253221066e+01 -1.050000000000000000e+01 -1.000000000000000000e+01
71 |
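
Note: Q-Learning_value.txt and SARSA_value.txt above each store the learned action values as a 70 x 4 table: one row per cell of the 7 x 10 Windy Gridworld (assuming the row-major state index row * 10 + col) and one column per action in the order {0: left, 1: up, 2: right, 3: down} defined in windygridworld.py below. A minimal loading sketch, assuming it is run from the Windygridworld directory:

import numpy as np

# rows = states (row-major over the 7 x 10 grid), columns = actions (left, up, right, down)
q = np.loadtxt('QValue/SARSA_value.txt')   # shape (70, 4)
q = q.reshape(7, 10, 4)                    # (grid row, grid column, action)

# greedy action at the start state [3, 0] used by windygridworld.py
print(q[3, 0], int(q[3, 0].argmax()))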
--------------------------------------------------------------------------------
/Windygridworld/Readme.md:
--------------------------------------------------------------------------------
1 | # Windy Gridworld
2 | 
3 | 
4 | 
5 |
--------------------------------------------------------------------------------
/Windygridworld/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/Windygridworld/graph.png
--------------------------------------------------------------------------------
/Windygridworld/windygridworld.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import time
4 |
5 |
6 | class WindyGridWorld:
7 | def __init__(self):
8 | self.width = 10
9 | self.height = 7
10 | self.grid = np.array(['O'] * 70).reshape([self.height, self.width])
11 | self.weak_wind = [3, 4, 5, 8]
12 | self.strong_wind = [6, 7]
13 | self.action_space = 4
14 | self.observation_space = 70
15 | self.action = {0: 'left', 1: 'up', 2: 'right', 3: 'down'}
16 | self.goal = [3, 7]
17 |
18 | def reset(self):
19 | self.state = [3, 0]
20 | self.grid = np.array(['O'] * 70).reshape([self.height, self.width])
21 | self.grid[self.state[0], self.state[1]] = 'X'
22 | return self.state
23 |
24 | def render(self, ):
25 | time.sleep(0.1)
26 | os.system('cls')
27 | print(self.grid)
28 |
29 | def step(self, action):
30 | # original action
31 | if action == 0:
32 | if self.state[1] != 0:
33 | self.state[1] -= 1
34 | elif action == 1:
35 | if self.state[0] != 0:
36 | self.state[0] -= 1
37 | elif action == 2:
38 | if self.state[1] != self.width - 1:
39 | self.state[1] += 1
40 | elif action == 3:
41 | if self.state[0] != self.height - 1:
42 | self.state[0] += 1
43 |
44 | else:
45 | print('Invalid action.')
46 |
47 | # windy action
48 | if self.state[1] in self.weak_wind + self.strong_wind:
49 | if self.state[1] in self.weak_wind:
50 | if self.state[0] != 0:
51 | self.state[0] -= 1
52 | else:
53 | if self.state[0] >= 2:
54 | self.state[0] -= 2
55 | elif self.state[0] == 1:
56 | self.state[0] -= 1
57 |
58 | self.grid = np.array(['O'] * 70).reshape([self.height, self.width])
59 | self.grid[self.state[0], self.state[1]] = 'X'
60 |
61 | if self.state == self.goal:
62 | return self.state, 0, True, None
63 | else:
64 | return self.state, -1, False, None
65 |
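
A quick sanity check for the environment above is to roll it out with random actions; a minimal sketch, assuming windygridworld.py is importable from the working directory (render() is skipped because it clears the screen with os.system('cls'), which only works on Windows):

import random
from windygridworld import WindyGridWorld

env = WindyGridWorld()
state = env.reset()
total_reward = 0
for t in range(10000):                              # cap the rollout; a random walk can be long
    action = random.randrange(env.action_space)     # {0: left, 1: up, 2: right, 3: down}
    state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break
print('steps:', t + 1, 'return:', total_reward)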
--------------------------------------------------------------------------------
/pendulum/pendulum_ddpg.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import torch
4 | import pylab
5 | import random
6 | import argparse
7 | import numpy as np
8 | from collections import deque
9 | from datetime import datetime
10 | from copy import deepcopy
11 | from skimage.transform import resize
12 | from skimage.color import rgb2gray
13 | import torch.nn as nn
14 | import torch.optim as optim
15 | import torch.nn.functional as F
16 |
17 |
18 | class OrnsteinUhlenbeckActionNoise(object):
19 | def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
20 | self.action_dim = action_dim
21 | self.mu = mu
22 | self.theta = theta
23 | self.sigma = sigma
24 | self.X = np.ones(self.action_dim) * self.mu
25 |
26 | def reset(self):
27 | self.X = np.ones(self.action_dim) * self.mu
28 |
29 | def sample(self):
30 | dx = self.theta * (self.mu - self.X)
31 | dx = dx + self.sigma * np.random.randn(len(self.X))
32 | self.X = self.X + dx
33 | return self.X
34 |
35 |
36 | class Flatten(nn.Module):
37 | def forward(self, input):
38 | return input.view(input.size(0), -1)
39 |
40 |
41 | class Actor(nn.Module):
42 | def __init__(self, obs_size, action_size, action_range):
43 | self.action_range = action_range
44 | super(Actor, self).__init__()
45 | self.network = nn.Sequential(
46 | nn.Linear(obs_size, 400),
47 | nn.ReLU(),
48 | nn.Linear(400, 300),
49 | nn.ReLU(),
50 | nn.Linear(300, action_size),
51 | nn.Tanh()
52 | )
53 |
54 | def forward(self, x):
55 | return self.network(x)
56 |
57 |
58 | class Critic(nn.Module):
59 | def __init__(self, obs_size, action_size, action_range):
60 | self.action_range = action_range
61 | super(Critic, self).__init__()
62 | self.before_action = nn.Sequential(
63 | nn.Linear(obs_size, 400),
64 | nn.ReLU()
65 | )
66 | self.after_action = nn.Sequential(
67 | nn.Linear(400 + action_size, 300),
68 | nn.ReLU(),
69 | nn.Linear(300, 1)
70 | )
71 |
72 | def forward(self, x, action):
73 | x = self.before_action(x)
74 | x = torch.cat([x, action], dim=1)
75 | x = self.after_action(x)
76 | return x
77 |
78 |
79 | class DDPG(object):
80 | def __init__(self, options):
81 | # hyperparameter
82 | self.memory_size = options.get('memory_size', 1000000)
83 | self.action_size = options.get('action_size')
84 | self.action_range = options.get('action_range')
85 | self.obs_size = options.get('obs_size')
86 | self.batch_size = options.get('batch_size')
87 | self.actor_lr = options.get('actor_lr')
88 | self.critic_lr = options.get('critic_lr')
89 | self.gamma = options.get('gamma')
90 | self.decay = options.get('decay')
91 | self.tau = options.get('tau')
92 |
93 | # actor model
94 | self.actor = Actor(self.obs_size, self.action_size, self.action_range)
95 | self.actor_target = Actor(self.obs_size, self.action_size, self.action_range)
96 |
97 | # critic model
98 | self.critic = Critic(self.obs_size, self.action_size, self.action_range)
99 | self.critic_target = Critic(self.obs_size, self.action_size, self.action_range)
100 |
101 | # memory(uniformly)
102 | self.memory = deque(maxlen=self.memory_size)
103 |
104 | # exploration
105 | self.ou = OrnsteinUhlenbeckActionNoise(theta=options.get('ou_theta', 0.15), sigma=options.get('ou_sigma', 0.2),
106 | mu=options.get('ou_mu', 0.0), action_dim=self.action_size)
107 |
108 | # optimizer
109 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
110 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
111 |
112 | # initialize target model
113 | self.actor_target.load_state_dict(self.actor.state_dict())
114 | self.critic_target.load_state_dict(self.critic.state_dict())
115 |
116 | def get_action(self, state):
117 | state = torch.from_numpy(state).float()
118 | model_action = self.actor(state).detach().numpy() * self.action_range
119 | action = model_action + self.ou.sample() * self.action_range
120 | return action
121 |
122 | def update_target_model(self):
123 | self._soft_update(self.actor_target, self.actor)
124 | self._soft_update(self.critic_target, self.critic)
125 |
126 | def _soft_update(self, target, source):
127 | for target_param, param in zip(target.parameters(), source.parameters()):
128 | target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
129 |
130 | def append_sample(self, state, action, reward, next_state, done):
131 | self.memory.append((deepcopy(state), action, reward, deepcopy(next_state), done))
132 |
133 | def _get_sample(self, batch_size):
134 | return random.sample(self.memory, batch_size)
135 |
136 | def train(self):
137 | minibatch = np.array(self._get_sample(self.batch_size)).transpose()
138 |
139 | states = np.vstack(minibatch[0])
140 | actions = np.vstack(minibatch[1])
141 | rewards = np.vstack(minibatch[2])
142 | next_states = np.vstack(minibatch[3])
143 | dones = np.vstack(minibatch[4].astype(int))
144 |
145 | rewards = torch.Tensor(rewards)
146 | dones = torch.Tensor(dones)
147 | actions = torch.Tensor(actions)
148 |
149 | # critic update
150 | self.critic_optimizer.zero_grad()
151 | states = torch.Tensor(states)
152 | next_states = torch.Tensor(next_states)
153 | next_actions = self.actor_target(next_states)
154 |
155 | pred = self.critic(states, actions)
156 | next_pred = self.critic_target(next_states, next_actions)
157 |
158 | target = rewards + (1 - dones) * self.gamma * next_pred
159 | critic_loss = F.mse_loss(pred, target)
160 | critic_loss.backward()
161 | self.critic_optimizer.step()
162 |
163 | # actor update
164 | self.actor_optimizer.zero_grad()
165 | pred_actions = self.actor(states)
166 | actor_loss = self.critic(states, pred_actions).mean()
167 | actor_loss = -actor_loss
168 | actor_loss.backward()
169 | self.actor_optimizer.step()
170 |
171 |
172 | def main(args):
173 | env = gym.make(args.env)
174 |
175 | obs_size = env.observation_space.shape[0]
176 | action_size = env.action_space.shape[0]
177 | action_range = env.action_space.high[0]
178 |
179 | print(action_size, action_range)
180 |
181 | args_dict = vars(args)
182 | args_dict['action_size'] = action_size
183 | args_dict['obs_size'] = obs_size
184 | args_dict['action_range'] = action_range
185 |
186 | scores, episodes = [], []
187 | agent = DDPG(args_dict)
188 | recent_reward = deque(maxlen=100)
189 | frame = 0
190 |
191 | for e in range(args.episode):
192 | score = 0
193 | step = 0
194 | done = False
195 | state = env.reset()
196 | state = np.reshape(state, [1, agent.obs_size])
197 | while not done:
198 | step += 1
199 | frame += 1
200 | if args.render:
201 | env.render()
202 |
203 | # get action for the current state and go one step in environment
204 | action = agent.get_action(state)
205 |
206 | next_state, reward, done, info = env.step([action])
207 | next_state = np.reshape(next_state, [1, agent.obs_size])
208 |
209 | reward = float(reward[0, 0])
210 | # save the sample to the replay memory
211 | agent.append_sample(state, action, reward, next_state, done)
212 |
213 | score += reward
214 | state = next_state
215 | if frame > agent.batch_size:
216 | agent.train()
217 | agent.update_target_model()
218 |
219 | if frame % 2000 == 0:
220 | print('now time : ', datetime.now())
221 | scores.append(score)
222 | episodes.append(e)
223 | pylab.plot(episodes, scores, 'b')
224 | pylab.savefig("./save_graph/pendulum_ddpg.png")
225 |
226 | if done:
227 | recent_reward.append(score)
228 | # every episode, plot the play time
229 | print("episode:", e, " score:", score, " memory length:",
230 | len(agent.memory), " steps:", step,
231 | " recent reward:", np.mean(recent_reward))
232 |
233 | # no early-stopping criterion is applied here;
234 | # training runs for the full --episode count
235 |
236 |
237 | if __name__ == '__main__':
238 | parser = argparse.ArgumentParser()
239 |
240 | parser.add_argument('--env', default='Pendulum-v0', type=str, help='open-ai gym environment')
241 | parser.add_argument('--episode', default=10000, type=int, help='the number of episode')
242 | parser.add_argument('--render', action='store_true', help='render the environment')
243 | parser.add_argument('--memory_size', default=500000, type=int, help='replay memory size')
244 | parser.add_argument('--batch_size', default=64, type=int, help='minibatch size')
245 | parser.add_argument('--actor_lr', default=1e-4, type=float, help='actor learning rate')
246 | parser.add_argument('--critic_lr', default=1e-3, type=float, help='critic learning rate')
247 | parser.add_argument('--gamma', default=0.99, type=float, help='discounted factor')
248 | parser.add_argument('--decay', default=1e-2, type=float, help='critic weight decay')
249 | parser.add_argument('--tau', default=0.001, type=float, help='moving average for target network')
250 | parser.add_argument('--ou_theta', default=0.15, type=float, help='noise theta')
251 | parser.add_argument('--ou_sigma', default=0.2, type=float, help='noise sigma')
252 | parser.add_argument('--ou_mu', default=0.0, type=float, help='noise mu')
253 |
254 | args = parser.parse_args()
255 | print(vars(args))
256 | main(args)
257 |
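
The OrnsteinUhlenbeckActionNoise class above supplies temporally correlated exploration noise for the deterministic actor. A small standalone check of its behaviour, assuming it is run from the pendulum directory (importing the module is safe because the training entry point is guarded by __main__, but the file's other dependencies such as gym, torch, pylab and skimage must be installed):

import numpy as np
from pendulum_ddpg import OrnsteinUhlenbeckActionNoise

noise = OrnsteinUhlenbeckActionNoise(action_dim=1, mu=0.0, theta=0.15, sigma=0.2)
samples = np.array([noise.sample() for _ in range(1000)])
# mean-reverting around mu, with correlated consecutive steps (unlike i.i.d. Gaussian noise)
print(samples.mean(), samples.std())
noise.reset()   # re-initialise the internal state X back to mu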
--------------------------------------------------------------------------------
/pendulum/pendulum_ppo.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import random
4 |
5 | import torch.nn.functional as F
6 | import torch.optim as optim
7 | import torch.multiprocessing as mp
8 |
9 | import torch.nn as nn
10 | import torch
11 |
12 | from collections import deque
13 |
14 | from torch.distributions.categorical import Categorical
15 | from torch.distributions.normal import Normal
16 |
17 |
18 | def make_batch(sample, agent):
19 | sample = np.stack(sample)
20 | discounted_return = np.empty([NUM_STEP, 1])
21 |
22 | s = np.reshape(np.stack(sample[:, 0]), [NUM_STEP, agent.input_size])
23 | s1 = np.reshape(np.stack(sample[:, 3]), [NUM_STEP, agent.input_size])
24 | y = sample[:, 1]
25 | r = np.reshape(np.stack(sample[:, 2]), [NUM_STEP, 1])
26 | d = np.reshape(np.stack(sample[:, 4]), [NUM_STEP, 1])
27 | with torch.no_grad():
28 | state = torch.from_numpy(s)
29 | state = state.float()
30 | _, _, _, value = agent.model_old(state)
31 |
32 | next_state = torch.from_numpy(s1)
33 | next_state = next_state.float()
34 | _, _, _, next_value = agent.model_old(next_state)
35 |
36 | value = value.data.numpy()
37 | next_value = next_value.data.numpy()
38 |
39 | # Discounted Return
40 | gae = 0
41 | for t in range(NUM_STEP - 1, -1, -1):
42 | delta = r[t] + DISCOUNT * next_value[t] * (1 - d[t]) - value[t]
43 | gae = delta + DISCOUNT * LAM * (1 - d[t]) * gae
44 | discounted_return[t, 0] = gae + value[t]
45 |
46 | # For critic
47 | target = r + DISCOUNT * (1 - d) * next_value
48 |
49 | # For Actor
50 | adv = discounted_return - value
51 | # adv = (adv - adv.mean()) / (adv.std() + 1e-5)
52 |
53 | return [s, target, y, adv]
54 |
55 |
56 | class ActorCriticNetwork(nn.Module):
57 | def __init__(self, input_size, output_size):
58 | super(ActorCriticNetwork, self).__init__()
59 | self.feature = nn.Sequential(
60 | nn.Linear(input_size, 256),
61 | nn.Tanh(),
62 | nn.Linear(256, 256),
63 | nn.Tanh()
64 | )
65 | self.mu = nn.Linear(256, output_size)
66 | self.critic = nn.Linear(256, 1)
67 | self.mu.weight.data.mul_(0.1)
68 | self.mu.bias.data.mul_(0.0)
69 | self.critic.weight.data.mul_(0.1)
70 | self.critic.bias.data.mul_(0.0)
71 |
72 | def forward(self, state):
73 | x = self.feature(state)
74 | mu = self.mu(x)
75 | logstd = torch.zeros_like(mu)
76 | std = torch.exp(logstd)
77 | value = self.critic(x)
78 | return mu, std, logstd, value
79 |
80 |
81 | # PAAC(Parallel Advantage Actor Critic)
82 | class ActorAgent(object):
83 | def __init__(self):
84 | self.model_old = ActorCriticNetwork(INPUT, OUTPUT)
85 | self.model_old.share_memory()
86 |
87 | self.output_size = OUTPUT
88 | self.input_size = INPUT
89 |
90 | def get_action(self, state):
91 | state = torch.from_numpy(state).unsqueeze(0)
92 | state = state.float()
93 | mu, std, logstd, value = self.model_old(state)
94 | m = Normal(loc=mu,scale=std)
95 | action = m.sample()
96 | return action.item()
97 |
98 | # after some time interval update the target model to be same with model
99 | def update_actor_model(self, target):
100 | self.model_old.load_state_dict(target.state_dict())
101 |
102 | @staticmethod
103 | def weights_init(m):
104 | class_name = m.__class__.__name__
105 | if class_name.find('Linear') != -1:
106 | torch.nn.init.kaiming_uniform(m.weight)
107 | print(m)
108 | elif class_name.find('Conv') != -1:
109 | torch.nn.init.kaiming_uniform(m.weight)
110 | print(m)
111 |
112 |
113 | class LearnerAgent(object):
114 | def __init__(self):
115 | self.model = ActorCriticNetwork(INPUT, OUTPUT)
116 | # self.model.cuda()
117 | self.output_size = OUTPUT
118 | self.input_size = INPUT
119 | self.optimizer = optim.Adam(self.model.parameters(), lr=LEARNING_RATE, eps=1e-5)
120 |
121 | def train_model(self, s_batch, target_batch, y_batch, adv_batch, actor_agent):
122 | s_batch = torch.FloatTensor(s_batch)
123 | target_batch = torch.FloatTensor(target_batch)
124 | adv_batch = torch.FloatTensor(adv_batch)
125 | with torch.no_grad():
126 | mu_old, std_old, logstd_old, value_old = actor_agent.model_old(s_batch)
127 | m_old = Normal(loc=mu_old, scale=std_old)
128 | y_batch_old = torch.FloatTensor(y_batch).view(-1, 1)  # shape [N, 1] to match mu_old
129 | log_prob_old = m_old.log_prob(y_batch_old)
130 |
131 | # evaluate the current policy on the collected states
132 | mu, std, logstd, value = self.model(s_batch)
133 | m = Normal(loc=mu, scale=std)
134 | # use the stored actions (not fresh samples) so the ratio compares the same actions
135 | log_prob = m.log_prob(y_batch_old)
136 | entropy = m.entropy().mean()
137 |
138 | for i in range(EPOCH):
139 | minibatch = random.sample(range(len(s_batch)), BATCH_SIZE)
140 | ratio = torch.exp(log_prob[minibatch] - log_prob_old[minibatch])
141 |
142 | surr1 = ratio * adv_batch[minibatch]
143 | surr2 = torch.clamp(ratio, 1.0 - EPSILON, 1.0 + EPSILON) * adv_batch[minibatch]
144 |
145 | actor_loss = -torch.min(surr1, surr2).mean()
146 | critic_loss = F.mse_loss(value[minibatch], target_batch[minibatch])  # use the differentiable value, not value_old
147 |
148 | self.optimizer.zero_grad()
149 | loss = actor_loss + V_COEF * critic_loss - 0.0 * entropy
150 | loss.backward(retain_graph=True)
151 | self.optimizer.step()
152 |
153 |
154 | class Environment(object):
155 | def __init__(self, env, idx):
156 | self.env = env
157 | self.obs = self.env.reset()
158 | self.next_obs = None
159 | self.done = False
160 | self.env_idx = idx
161 | self.step = 0
162 | self.episode = 0
163 | self.rall = 0
164 | self.recent_rlist = deque(maxlen=100)
165 | self.recent_rlist.append(0)
166 |
167 | def run(self, agent):
168 | sample = []
169 | for _ in range(NUM_STEP):
170 | self.step += 1
171 | action = agent.get_action(self.obs)
172 | self.next_obs, reward, self.done, _ = self.env.step([action])
173 | self.rall += reward
174 |
175 | # # negative reward
176 | # if self.done and self.step < self.env.spec.timestep_limit:
177 | # reward = 0
178 |
179 | sample.append([self.obs[:], action, reward, self.next_obs[:], self.done])
180 |
181 | self.obs = self.next_obs
182 |
183 | if self.done:
184 | self.episode += 1
185 | if self.env_idx == 0:
186 | self.recent_rlist.append(self.rall)
187 | print("[Episode {0:6d}] Reward: {1:4.2f} Recent Reward: {2:4.2f}"
188 | .format(self.episode, self.rall, np.mean(self.recent_rlist)))
189 |
190 | self.obs = self.env.reset()
191 | self.done = False
192 | self.step = 0
193 | self.rall = 0
194 |
195 | return make_batch(sample, agent)
196 |
197 |
198 | def runner(env, cond, memory, actor):
199 | while True:
200 | with cond:
201 | sample = env.run(actor)
202 | memory.put(sample)
203 |
204 | # wait runner
205 | cond.wait()
206 |
207 |
208 | def learner(cond, memory, actor_agent, learner_agent):
209 | while True:
210 | if memory.full():
211 | s_batch, target_batch, y_batch, adv_batch = [], [], [], []
212 | # while memory.qsize() != 0:
213 | # Queue.qsize() is not implemented on macOS, so the branch below is used instead.
214 | if NUM_ENV == 1:
215 | batch = memory.get()
216 | s_batch.extend(batch[0])
217 | target_batch.extend(batch[1])
218 | y_batch.extend(batch[2])
219 | adv_batch.extend(batch[3])
220 | else:
221 | while not memory.empty():
222 | batch = memory.get()
223 | s_batch.extend(batch[0])
224 | target_batch.extend(batch[1])
225 | y_batch.extend(batch[2])
226 | adv_batch.extend(batch[3])
227 |
228 | # train
229 | learner_agent.train_model(s_batch, target_batch, y_batch, adv_batch, actor_agent)
230 | actor_agent.update_actor_model(learner_agent.model)
231 | # resume running
232 | with cond:
233 | cond.notify_all()
234 |
235 |
236 | def main():
237 | num_envs = NUM_ENV
238 | memory = mp.Queue(maxsize=NUM_ENV)
239 | cond = mp.Condition()
240 |
241 | # make agent and share memory
242 | actor_agent = ActorAgent()
243 | learner_agent = LearnerAgent()
244 |
245 | # sync model
246 | actor_agent.update_actor_model(learner_agent.model)
247 |
248 | # make envs
249 | envs = [Environment(gym.make(ENV_ID), i) for i in range(num_envs)]
250 |
251 | # Learner Process(only Learn)
252 | learn_proc = mp.Process(target=learner, args=(cond, memory, actor_agent, learner_agent))
253 |
254 | # Runner Process(just run, not learn)
255 | runners = []
256 | for idx, env in enumerate(envs):
257 | run_proc = mp.Process(target=runner, args=(env, cond, memory, actor_agent))
258 | runners.append(run_proc)
259 | run_proc.start()
260 |
261 | learn_proc.start()
262 |
263 | for proc in runners:
264 | proc.join()
265 |
266 | learn_proc.join()
267 |
268 |
269 | if __name__ == '__main__':
270 | torch.manual_seed(23)
271 | ENV_ID = 'Pendulum-v0'
272 | env = gym.make(ENV_ID)
273 | # Hyper parameter
274 | INPUT = env.observation_space.shape[0]
275 | OUTPUT = env.action_space.shape[0]
276 | DISCOUNT = 0.99
277 | NUM_STEP = 2048
278 | NUM_ENV = 1
279 | LAM = 0.95
280 | EPOCH = 10
281 | BATCH_SIZE = 64
282 | V_COEF = 1.0
283 | EPSILON = 0.2
284 | ALPHA = 0.99
285 | LEARNING_RATE = 0.0003
286 | env.close()
287 |
288 | main()
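
For reference, the surr1/surr2 terms built in train_model follow the standard PPO clipped surrogate with EPSILON = 0.2; a small standalone illustration on toy, hypothetical per-sample values:

import torch

EPSILON = 0.2
# hypothetical new/old log-probabilities of the stored actions and their advantages
log_prob     = torch.tensor([-1.0, -0.5, -2.0])
log_prob_old = torch.tensor([-1.2, -0.4, -1.0])
adv          = torch.tensor([ 1.0, -0.5,  2.0])

ratio = torch.exp(log_prob - log_prob_old)                      # pi_new / pi_old per sample
surr1 = ratio * adv
surr2 = torch.clamp(ratio, 1.0 - EPSILON, 1.0 + EPSILON) * adv
actor_loss = -torch.min(surr1, surr2).mean()                    # maximise the clipped surrogate
print(ratio, actor_loss)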
--------------------------------------------------------------------------------
/readme/1x1conv.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/1x1conv.gif
--------------------------------------------------------------------------------
/readme/Play.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/Play.gif
--------------------------------------------------------------------------------
/readme/q-learning.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/q-learning.PNG
--------------------------------------------------------------------------------
/readme/sarsa.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/sarsa.PNG
--------------------------------------------------------------------------------
/readme/windy.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcwleo/Reinforcement_Learning/3487878954165377e726978196362db108b03864/readme/windy.PNG
--------------------------------------------------------------------------------