├── .gitignore
├── 1 - Lectures
│   ├── 2019 OSS Summer - RL basic, Day 1.pdf
│   └── 2019 OSS Summer - RL basic, Day 2.pdf
├── 2 - Examples
│   ├── 1 - Day 1
│   │   ├── 1 - Policy Iteration
│   │   │   ├── environment.py
│   │   │   └── policy_iteration.py
│   │   ├── 2 - Value Iteration
│   │   │   ├── environment.py
│   │   │   └── value_iteration.py
│   │   ├── 3 - Monte-Carlo
│   │   │   ├── environment.py
│   │   │   └── mc_agent.py
│   │   ├── 4 - SARSA
│   │   │   ├── environment.py
│   │   │   └── sarsa_agent.py
│   │   └── 5 - Q-learning
│   │       ├── environment.py
│   │       └── q_learning_agent.py
│   ├── 2 - Day 2
│   │   ├── 1 - Deep SARSA
│   │   │   ├── deep_sarsa_agent.py
│   │   │   ├── environment.py
│   │   │   ├── save_graph
│   │   │   │   └── deep_sarsa_trained.png
│   │   │   └── save_model
│   │   │       └── deep_sarsa_trained.h5
│   │   ├── 2 - REINFORCE
│   │   │   ├── environment.py
│   │   │   ├── reinforce_agent.py
│   │   │   ├── save_graph
│   │   │   │   └── reinforce_trained.png
│   │   │   └── save_model
│   │   │       └── reinforce_trained.h5
│   │   ├── 3 - DQN
│   │   │   ├── cartpole_dqn.py
│   │   │   ├── save_graph
│   │   │   │   └── cartpole_dqn.png
│   │   │   └── save_model
│   │   │       ├── cartpole_dqn.h5
│   │   │       └── cartpole_dqn_trained.h5
│   │   └── 4 - Actor-Critic
│   │       ├── cartpole_a2c.py
│   │       ├── save_graph
│   │       │   └── cartpole_a2c.png
│   │       └── save_model
│   │           ├── cartpole_actor.h5
│   │           ├── cartpole_actor_trained.h5
│   │           ├── cartpole_critic.h5
│   │           └── cartpole_critic_trained.h5
│   └── Images
│       ├── circle.png
│       ├── down.png
│       ├── left.png
│       ├── rectangle.png
│       ├── right.png
│       ├── triangle.png
│       └── up.png
├── 3 - Assignments
│   ├── 1 - Day 1 (Maze for SARSA and Q-learning)
│   │   ├── Problems
│   │   │   ├── 2019 OSS Summer - Maze Specification.pdf
│   │   │   ├── assignment.py
│   │   │   └── environment.py
│   │   └── Solutions
│   │       ├── maze_q_learning.py
│   │       └── maze_sarsa.py
│   └── 2 - Day 2 (LunarLander for DQN)
│       ├── Problems
│       │   ├── 2019 OSS Summer - LunarLander Specification.pdf
│       │   ├── 2019 OSS Summer - SWIG Installation.pdf
│       │   └── lunarlander_dqn.py
│       └── Solutions
│           └── lunarlander_dqn_solve.py
├── LICENSE
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /1 - Lectures/2019 OSS Summer - RL basic, Day 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/1 - Lectures/2019 OSS Summer - RL basic, Day 1.pdf -------------------------------------------------------------------------------- /1 - Lectures/2019 OSS Summer - RL basic, Day 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/1 - Lectures/2019 OSS Summer - RL basic, Day 2.pdf -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/1 - Policy Iteration/environment.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import Button 3 | import time 4 | import numpy as np 5 | from PIL import ImageTk, Image 6 | 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드월드 세로 10 | WIDTH = 5 # 그리드월드 가로 11 | TRANSITION_PROB = 1 12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # 상, 하, 좌, 우 13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 좌표로 나타낸 행동 14 | REWARDS = [] 15 | 16 | 17 | class GraphicDisplay(tk.Tk): 18 | def __init__(self, agent): 19 | super(GraphicDisplay, self).__init__() 20 | self.title('Policy Iteration') 21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) 22 | self.texts = [] 23 | self.arrows = [] 24 | self.env = Env() 25 | self.agent = agent 26 | self.evaluation_count = 0 27 | self.improvement_count = 0 28 | self.is_moving = 0 29 | (self.up, self.down, self.left, self.right), self.shapes = self.load_images() 30 | self.canvas = self._build_canvas() 31 | self.text_reward(2, 2, "R : 1.0") 32 | self.text_reward(1, 2, "R : -1.0") 33 | self.text_reward(2, 1, "R : -1.0") 34 | 35 | def _build_canvas(self): 36 | canvas = tk.Canvas(self, bg='white', 37 | height=HEIGHT * UNIT, 38 | width=WIDTH * UNIT) 39 | # 버튼 초기화 40 | iteration_button = Button(self, text="Evaluate", 41 | command=self.evaluate_policy) 42 | iteration_button.configure(width=10, activebackground="#33B5E5") 43 | canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10, 44 | 
window=iteration_button) 45 | policy_button = Button(self, text="Improve", 46 | command=self.improve_policy) 47 | policy_button.configure(width=10, activebackground="#33B5E5") 48 | canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10, 49 | window=policy_button) 50 | policy_button = Button(self, text="move", command=self.move_by_policy) 51 | policy_button.configure(width=10, activebackground="#33B5E5") 52 | canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10, 53 | window=policy_button) 54 | policy_button = Button(self, text="reset", command=self.reset) 55 | policy_button.configure(width=10, activebackground="#33B5E5") 56 | canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10, 57 | window=policy_button) 58 | 59 | # 그리드 생성 60 | for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 61 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT 62 | canvas.create_line(x0, y0, x1, y1) 63 | for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 64 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row 65 | canvas.create_line(x0, y0, x1, y1) 66 | 67 | # 캔버스에 이미지 추가 68 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 69 | canvas.create_image(250, 150, image=self.shapes[1]) 70 | canvas.create_image(150, 250, image=self.shapes[1]) 71 | canvas.create_image(250, 250, image=self.shapes[2]) 72 | 73 | canvas.pack() 74 | 75 | return canvas 76 | 77 | def load_images(self): 78 | up = PhotoImage(Image.open("../../Images/up.png").resize((13, 13))) 79 | right = PhotoImage(Image.open("../../Images/right.png").resize((13, 13))) 80 | left = PhotoImage(Image.open("../../Images/left.png").resize((13, 13))) 81 | down = PhotoImage(Image.open("../../Images/down.png").resize((13, 13))) 82 | rectangle = PhotoImage(Image.open("../../Images/rectangle.png").resize((65, 65))) 83 | triangle = PhotoImage(Image.open("../../Images/triangle.png").resize((65, 65))) 84 | circle = PhotoImage(Image.open("../../Images/circle.png").resize((65, 65))) 85 | return (up, down, left, right), (rectangle, triangle, circle) 86 | 87 | def reset(self): 88 | if self.is_moving == 0: 89 | self.evaluation_count = 0 90 | self.improvement_count = 0 91 | for i in self.texts: 92 | self.canvas.delete(i) 93 | 94 | for i in self.arrows: 95 | self.canvas.delete(i) 96 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] 97 | self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH 98 | for _ in range(HEIGHT)]) 99 | self.agent.policy_table[2][2] = [] 100 | x, y = self.canvas.coords(self.rectangle) 101 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 102 | 103 | def text_value(self, row, col, contents, font='Helvetica', size=10, 104 | style='normal', anchor="nw"): 105 | origin_x, origin_y = 85, 70 106 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 107 | font = (font, str(size), style) 108 | text = self.canvas.create_text(x, y, fill="black", text=contents, 109 | font=font, anchor=anchor) 110 | return self.texts.append(text) 111 | 112 | def text_reward(self, row, col, contents, font='Helvetica', size=10, 113 | style='normal', anchor="nw"): 114 | origin_x, origin_y = 5, 5 115 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 116 | font = (font, str(size), style) 117 | text = self.canvas.create_text(x, y, fill="black", text=contents, 118 | font=font, anchor=anchor) 119 | return self.texts.append(text) 120 | 121 | def rectangle_move(self, action): 122 | base_action = np.array([0, 0]) 123 | location = self.find_rectangle() 124 | self.render() 125 | if action == 0 and location[0] > 0: # 상 126 
| base_action[1] -= UNIT 127 | elif action == 1 and location[0] < HEIGHT - 1: # 하 128 | base_action[1] += UNIT 129 | elif action == 2 and location[1] > 0: # 좌 130 | base_action[0] -= UNIT 131 | elif action == 3 and location[1] < WIDTH - 1: # 우 132 | base_action[0] += UNIT 133 | # move agent 134 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 135 | 136 | def find_rectangle(self): 137 | temp = self.canvas.coords(self.rectangle) 138 | x = (temp[0] / 100) - 0.5 139 | y = (temp[1] / 100) - 0.5 140 | return int(y), int(x) 141 | 142 | def move_by_policy(self): 143 | if self.improvement_count != 0 and self.is_moving != 1: 144 | self.is_moving = 1 145 | 146 | x, y = self.canvas.coords(self.rectangle) 147 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 148 | 149 | x, y = self.find_rectangle() 150 | while len(self.agent.policy_table[x][y]) != 0: 151 | self.after(100, 152 | self.rectangle_move(self.agent.get_action([x, y]))) 153 | x, y = self.find_rectangle() 154 | self.is_moving = 0 155 | 156 | def draw_one_arrow(self, col, row, policy): 157 | if col == 2 and row == 2: 158 | return 159 | 160 | if policy[0] > 0: # up 161 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) 162 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 163 | image=self.up)) 164 | if policy[1] > 0: # down 165 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) 166 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 167 | image=self.down)) 168 | if policy[2] > 0: # left 169 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) 170 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 171 | image=self.left)) 172 | if policy[3] > 0: # right 173 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) 174 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 175 | image=self.right)) 176 | 177 | def draw_from_policy(self, policy_table): 178 | for i in range(HEIGHT): 179 | for j in range(WIDTH): 180 | self.draw_one_arrow(i, j, policy_table[i][j]) 181 | 182 | def print_value_table(self, value_table): 183 | for i in range(WIDTH): 184 | for j in range(HEIGHT): 185 | self.text_value(i, j, value_table[i][j]) 186 | 187 | def render(self): 188 | time.sleep(0.1) 189 | self.canvas.tag_raise(self.rectangle) 190 | self.update() 191 | 192 | def evaluate_policy(self): 193 | self.evaluation_count += 1 194 | for i in self.texts: 195 | self.canvas.delete(i) 196 | self.agent.policy_evaluation() 197 | self.print_value_table(self.agent.value_table) 198 | 199 | def improve_policy(self): 200 | self.improvement_count += 1 201 | for i in self.arrows: 202 | self.canvas.delete(i) 203 | self.agent.policy_improvement() 204 | self.draw_from_policy(self.agent.policy_table) 205 | 206 | 207 | class Env: 208 | def __init__(self): 209 | self.transition_probability = TRANSITION_PROB 210 | self.width = WIDTH 211 | self.height = HEIGHT 212 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)] 213 | self.possible_actions = POSSIBLE_ACTIONS 214 | self.reward[2][2] = 1 # (2,2) 좌표 동그라미 위치에 보상 1 215 | self.reward[1][2] = -1 # (1,2) 좌표 세모 위치에 보상 -1 216 | self.reward[2][1] = -1 # (2,1) 좌표 세모 위치에 보상 -1 217 | self.all_state = [] 218 | 219 | for x in range(WIDTH): 220 | for y in range(HEIGHT): 221 | state = [x, y] 222 | self.all_state.append(state) 223 | 224 | def get_reward(self, state, action): 225 | next_state = self.state_after_action(state, action) 226 | return self.reward[next_state[0]][next_state[1]] 227 | 228 | def state_after_action(self, state, action_index): 
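# Illustrative usage sketch (not from the repository; it assumes the ACTIONS list
# and the check_boundary() helper defined in this file): the action index selects a
# coordinate offset, and the resulting state is clamped to the 5x5 grid.
#
#   env = Env()
#   env.state_after_action([0, 0], 1)   # ACTIONS[1] == (1, 0)  -> [1, 0]
#   env.state_after_action([0, 0], 0)   # ACTIONS[0] == (-1, 0) -> clamped to [0, 0]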
229 | action = ACTIONS[action_index] 230 | return self.check_boundary([state[0] + action[0], state[1] + action[1]]) 231 | 232 | @staticmethod 233 | def check_boundary(state): 234 | state[0] = (0 if state[0] < 0 else WIDTH - 1 235 | if state[0] > WIDTH - 1 else state[0]) 236 | state[1] = (0 if state[1] < 0 else HEIGHT - 1 237 | if state[1] > HEIGHT - 1 else state[1]) 238 | return state 239 | 240 | def get_transition_prob(self, state, action): 241 | return self.transition_probability 242 | 243 | def get_all_states(self): 244 | return self.all_state 245 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/1 - Policy Iteration/policy_iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from environment import GraphicDisplay, Env 4 | 5 | 6 | class PolicyIteration: 7 | def __init__(self, env): 8 | # 환경에 대한 객체 선언 9 | self.env = env 10 | # 가치함수를 2차원 리스트로 초기화 11 | self.value_table = [[0.0] * env.width for _ in range(env.height)] 12 | # 상 하 좌 우 동일한 확률로 정책 초기화 13 | self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width 14 | for _ in range(env.height)] 15 | # 마침 상태의 설정 16 | self.policy_table[2][2] = [] 17 | # 감가율 18 | self.discount_factor = 0.9 19 | 20 | def policy_evaluation(self): 21 | 22 | # 다음 가치함수 초기화 23 | next_value_table = [[0.00] * self.env.width 24 | for _ in range(self.env.height)] 25 | 26 | # 모든 상태에 대해서 벨만 기대방정식을 계산 27 | for state in self.env.get_all_states(): 28 | value = 0.0 29 | # 마침 상태의 가치 함수 = 0 30 | if state == [2, 2]: 31 | next_value_table[state[0]][state[1]] = value 32 | continue 33 | 34 | # 벨만 기대 방정식 35 | for action in self.env.possible_actions: 36 | next_state = self.env.state_after_action(state, action) 37 | reward = self.env.get_reward(state, action) 38 | next_value = self.get_value(next_state) 39 | value += (self.get_policy(state)[action] * 40 | (reward + self.discount_factor * next_value)) 41 | 42 | next_value_table[state[0]][state[1]] = round(value, 2) 43 | 44 | self.value_table = next_value_table 45 | 46 | # 현재 가치 함수에 대해서 탐욕 정책 발전 47 | def policy_improvement(self): 48 | next_policy = self.policy_table 49 | for state in self.env.get_all_states(): 50 | if state == [2, 2]: 51 | continue 52 | value = -99999 53 | max_index = [] 54 | # 반환할 정책 초기화 55 | result = [0.0, 0.0, 0.0, 0.0] 56 | 57 | # 모든 행동에 대해서 [보상 + (감가율 * 다음 상태 가치함수)] 계산 58 | for index, action in enumerate(self.env.possible_actions): 59 | next_state = self.env.state_after_action(state, action) 60 | reward = self.env.get_reward(state, action) 61 | next_value = self.get_value(next_state) 62 | temp = reward + self.discount_factor * next_value 63 | 64 | # 받을 보상이 최대인 행동의 index(최대가 복수라면 모두)를 추출 65 | if temp == value: 66 | max_index.append(index) 67 | elif temp > value: 68 | value = temp 69 | max_index.clear() 70 | max_index.append(index) 71 | 72 | # 행동의 확률 계산 73 | prob = 1 / len(max_index) 74 | 75 | for index in max_index: 76 | result[index] = prob 77 | 78 | next_policy[state[0]][state[1]] = result 79 | 80 | self.policy_table = next_policy 81 | 82 | # 특정 상태에서 정책에 따른 행동을 반환 83 | def get_action(self, state): 84 | # 0 ~ 1 사이의 값을 무작위로 추출 85 | random_pick = random.randrange(100) / 100 86 | 87 | policy = self.get_policy(state) 88 | policy_sum = 0.0 89 | # 정책에 담긴 행동 중에 무작위로 한 행동을 추출 90 | for index, value in enumerate(policy): 91 | policy_sum += value 92 | if random_pick < policy_sum: 93 | return index 94 | 95 | # 상태에 따른 정책 반환 96 | def get_policy(self, state): 97 | if state == [2, 2]: 
98 | return 0.0 99 | return self.policy_table[state[0]][state[1]] 100 | 101 | # 가치 함수의 값을 반환 102 | def get_value(self, state): 103 | # 소숫점 둘째 자리까지만 계산 104 | return round(self.value_table[state[0]][state[1]], 2) 105 | 106 | if __name__ == "__main__": 107 | env = Env() 108 | policy_iteration = PolicyIteration(env) 109 | grid_world = GraphicDisplay(policy_iteration) 110 | grid_world.mainloop() 111 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/2 - Value Iteration/environment.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import time 3 | import numpy as np 4 | import random 5 | from PIL import ImageTk, Image 6 | 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드월드 세로 10 | WIDTH = 5 # 그리드월드 가로 11 | TRANSITION_PROB = 1 12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # 상, 하, 좌, 우 13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 좌표로 나타낸 행동 14 | REWARDS = [] 15 | 16 | 17 | class GraphicDisplay(tk.Tk): 18 | def __init__(self, value_iteration): 19 | super(GraphicDisplay, self).__init__() 20 | self.title('Value Iteration') 21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) 22 | self.texts = [] 23 | self.arrows = [] 24 | self.env = Env() 25 | self.agent = value_iteration 26 | self.iteration_count = 0 27 | self.improvement_count = 0 28 | self.is_moving = 0 29 | (self.up, self.down, self.left, 30 | self.right), self.shapes = self.load_images() 31 | self.canvas = self._build_canvas() 32 | self.text_reward(2, 2, "R : 1.0") 33 | self.text_reward(1, 2, "R : -1.0") 34 | self.text_reward(2, 1, "R : -1.0") 35 | 36 | def _build_canvas(self): 37 | canvas = tk.Canvas(self, bg='white', 38 | height=HEIGHT * UNIT, 39 | width=WIDTH * UNIT) 40 | # 버튼 초기화 41 | iteration_button = tk.Button(self, text="Calculate", 42 | command=self.calculate_value) 43 | iteration_button.configure(width=10, activebackground="#33B5E5") 44 | canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, 45 | window=iteration_button) 46 | 47 | policy_button = tk.Button(self, text="Print Policy", 48 | command=self.print_optimal_policy) 49 | policy_button.configure(width=10, activebackground="#33B5E5") 50 | canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, 51 | window=policy_button) 52 | 53 | policy_button = tk.Button(self, text="Move", 54 | command=self.move_by_policy) 55 | policy_button.configure(width=10, activebackground="#33B5E5") 56 | canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, 57 | window=policy_button) 58 | 59 | policy_button = tk.Button(self, text="Clear", command=self.clear) 60 | policy_button.configure(width=10, activebackground="#33B5E5") 61 | canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, 62 | window=policy_button) 63 | 64 | # 그리드 생성 65 | for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 66 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT 67 | canvas.create_line(x0, y0, x1, y1) 68 | for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 69 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row 70 | canvas.create_line(x0, y0, x1, y1) 71 | 72 | # 캔버스에 이미지 추가 73 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 74 | canvas.create_image(250, 150, image=self.shapes[1]) 75 | canvas.create_image(150, 250, image=self.shapes[1]) 76 | canvas.create_image(250, 250, image=self.shapes[2]) 77 | 78 | canvas.pack() 79 | 80 | return canvas 81 | 82 | def load_images(self): 83 | PhotoImage = ImageTk.PhotoImage 84 | up = 
PhotoImage(Image.open("../../Images/up.png").resize((13, 13))) 85 | right = PhotoImage(Image.open("../../Images/right.png").resize((13, 13))) 86 | left = PhotoImage(Image.open("../../Images/left.png").resize((13, 13))) 87 | down = PhotoImage(Image.open("../../Images/down.png").resize((13, 13))) 88 | rectangle = PhotoImage( 89 | Image.open("../../Images/rectangle.png").resize((65, 65))) 90 | triangle = PhotoImage( 91 | Image.open("../../Images/triangle.png").resize((65, 65))) 92 | circle = PhotoImage(Image.open("../../Images/circle.png").resize((65, 65))) 93 | return (up, down, left, right), (rectangle, triangle, circle) 94 | 95 | def clear(self): 96 | 97 | if self.is_moving == 0: 98 | self.iteration_count = 0 99 | self.improvement_count = 0 100 | for i in self.texts: 101 | self.canvas.delete(i) 102 | 103 | for i in self.arrows: 104 | self.canvas.delete(i) 105 | 106 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] 107 | 108 | x, y = self.canvas.coords(self.rectangle) 109 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 110 | 111 | def reset(self): 112 | self.update() 113 | time.sleep(0.5) 114 | self.canvas.delete(self.rectangle) 115 | return self.canvas.coords(self.rectangle) 116 | 117 | def text_value(self, row, col, contents, font='Helvetica', size=12, 118 | style='normal', anchor="nw"): 119 | origin_x, origin_y = 85, 70 120 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 121 | font = (font, str(size), style) 122 | text = self.canvas.create_text(x, y, fill="black", text=contents, 123 | font=font, anchor=anchor) 124 | return self.texts.append(text) 125 | 126 | def text_reward(self, row, col, contents, font='Helvetica', size=12, 127 | style='normal', anchor="nw"): 128 | origin_x, origin_y = 5, 5 129 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 130 | font = (font, str(size), style) 131 | text = self.canvas.create_text(x, y, fill="black", text=contents, 132 | font=font, anchor=anchor) 133 | return self.texts.append(text) 134 | 135 | def rectangle_move(self, action): 136 | base_action = np.array([0, 0]) 137 | location = self.find_rectangle() 138 | self.render() 139 | if action == 0 and location[0] > 0: # up 140 | base_action[1] -= UNIT 141 | elif action == 1 and location[0] < HEIGHT - 1: # down 142 | base_action[1] += UNIT 143 | elif action == 2 and location[1] > 0: # left 144 | base_action[0] -= UNIT 145 | elif action == 3 and location[1] < WIDTH - 1: # right 146 | base_action[0] += UNIT 147 | 148 | self.canvas.move(self.rectangle, base_action[0], 149 | base_action[1]) # move agent 150 | 151 | def find_rectangle(self): 152 | temp = self.canvas.coords(self.rectangle) 153 | x = (temp[0] / 100) - 0.5 154 | y = (temp[1] / 100) - 0.5 155 | return int(y), int(x) 156 | 157 | def move_by_policy(self): 158 | 159 | if self.improvement_count != 0 and self.is_moving != 1: 160 | self.is_moving = 1 161 | x, y = self.canvas.coords(self.rectangle) 162 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 163 | 164 | x, y = self.find_rectangle() 165 | while len(self.agent.get_action([x, y])) != 0: 166 | action = random.sample(self.agent.get_action([x, y]), 1)[0] 167 | self.after(100, self.rectangle_move(action)) 168 | x, y = self.find_rectangle() 169 | self.is_moving = 0 170 | 171 | def draw_one_arrow(self, col, row, action): 172 | if col == 2 and row == 2: 173 | return 174 | if action == 0: # up 175 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) 176 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 177 | 
image=self.up)) 178 | elif action == 1: # down 179 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) 180 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 181 | image=self.down)) 182 | elif action == 3: # right 183 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) 184 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 185 | image=self.right)) 186 | elif action == 2: # left 187 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) 188 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 189 | image=self.left)) 190 | 191 | def draw_from_values(self, state, action_list): 192 | i = state[0] 193 | j = state[1] 194 | for action in action_list: 195 | self.draw_one_arrow(i, j, action) 196 | 197 | def print_values(self, values): 198 | for i in range(WIDTH): 199 | for j in range(HEIGHT): 200 | self.text_value(i, j, values[i][j]) 201 | 202 | def render(self): 203 | time.sleep(0.1) 204 | self.canvas.tag_raise(self.rectangle) 205 | self.update() 206 | 207 | def calculate_value(self): 208 | self.iteration_count += 1 209 | for i in self.texts: 210 | self.canvas.delete(i) 211 | self.agent.value_iteration() 212 | self.print_values(self.agent.value_table) 213 | 214 | def print_optimal_policy(self): 215 | self.improvement_count += 1 216 | for i in self.arrows: 217 | self.canvas.delete(i) 218 | for state in self.env.get_all_states(): 219 | action = self.agent.get_action(state) 220 | self.draw_from_values(state, action) 221 | 222 | 223 | class Env: 224 | def __init__(self): 225 | self.transition_probability = TRANSITION_PROB 226 | self.width = WIDTH # Width of Grid World 227 | self.height = HEIGHT # Height of GridWorld 228 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)] 229 | self.possible_actions = POSSIBLE_ACTIONS 230 | self.reward[2][2] = 1 # reward 1 for circle 231 | self.reward[1][2] = -1 # reward -1 for triangle 232 | self.reward[2][1] = -1 # reward -1 for triangle 233 | self.all_state = [] 234 | 235 | for x in range(WIDTH): 236 | for y in range(HEIGHT): 237 | state = [x, y] 238 | self.all_state.append(state) 239 | 240 | def get_reward(self, state, action): 241 | next_state = self.state_after_action(state, action) 242 | return self.reward[next_state[0]][next_state[1]] 243 | 244 | def state_after_action(self, state, action_index): 245 | action = ACTIONS[action_index] 246 | return self.check_boundary([state[0] + action[0], state[1] + action[1]]) 247 | 248 | @staticmethod 249 | def check_boundary(state): 250 | state[0] = (0 if state[0] < 0 else WIDTH - 1 251 | if state[0] > WIDTH - 1 else state[0]) 252 | state[1] = (0 if state[1] < 0 else HEIGHT - 1 253 | if state[1] > HEIGHT - 1 else state[1]) 254 | return state 255 | 256 | def get_transition_prob(self, state, action): 257 | return self.transition_probability 258 | 259 | def get_all_states(self): 260 | return self.all_state 261 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/2 - Value Iteration/value_iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from environment import GraphicDisplay, Env 3 | 4 | class ValueIteration: 5 | def __init__(self, env): 6 | # 환경 객체 생성 7 | self.env = env 8 | # 가치 함수를 2차원 리스트로 초기화 9 | self.value_table = [[0.0] * env.width for _ in range(env.height)] 10 | # 감가율 11 | self.discount_factor = 0.9 12 | 13 | # 가치 이터레이션 14 | # 벨만 최적 방정식을 통해 다음 가치 함수 계산 15 | def value_iteration(self): 16 | next_value_table = [[0.0] * 
self.env.width for _ in 17 | range(self.env.height)] 18 | for state in self.env.get_all_states(): 19 | if state == [2, 2]: 20 | next_value_table[state[0]][state[1]] = 0.0 21 | continue 22 | # 가치 함수를 위한 빈 리스트 23 | value_list = [] 24 | 25 | # 가능한 모든 행동에 대해 계산 26 | for action in self.env.possible_actions: 27 | next_state = self.env.state_after_action(state, action) 28 | reward = self.env.get_reward(state, action) 29 | next_value = self.get_value(next_state) 30 | value_list.append((reward + self.discount_factor * next_value)) 31 | # 최댓값을 다음 가치 함수로 대입 32 | next_value_table[state[0]][state[1]] = round(max(value_list), 2) 33 | self.value_table = next_value_table 34 | 35 | # 현재 가치 함수로부터 행동을 반환 36 | def get_action(self, state): 37 | action_list = [] 38 | max_value = -99999 39 | 40 | if state == [2, 2]: 41 | return [] 42 | 43 | # 모든 행동에 대해 큐함수 (보상 + (감가율 * 다음 상태 가치함수))를 계산 44 | # 최대 큐 함수를 가진 행동(복수일 경우 여러 개)을 반환 45 | for action in self.env.possible_actions: 46 | 47 | next_state = self.env.state_after_action(state, action) 48 | reward = self.env.get_reward(state, action) 49 | next_value = self.get_value(next_state) 50 | value = (reward + self.discount_factor * next_value) 51 | 52 | if value > max_value: 53 | action_list.clear() 54 | action_list.append(action) 55 | max_value = value 56 | elif value == max_value: 57 | action_list.append(action) 58 | 59 | return action_list 60 | 61 | def get_value(self, state): 62 | return round(self.value_table[state[0]][state[1]], 2) 63 | 64 | if __name__ == "__main__": 65 | env = Env() 66 | value_iteration = ValueIteration(env) 67 | grid_world = GraphicDisplay(value_iteration) 68 | grid_world.mainloop() 69 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/3 - Monte-Carlo/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드 월드 세로 10 | WIDTH = 5 # 그리드 월드 가로 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('monte carlo') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # 그리드 생성 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # 캔버스에 이미지 추가 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | canvas.pack() 43 | 44 | return canvas 45 | 46 | def load_images(self): 47 | rectangle = PhotoImage( 48 | Image.open("../../Images/rectangle.png").resize((65, 65))) 49 | triangle = PhotoImage( 50 | Image.open("../../Images/triangle.png").resize((65, 65))) 51 | circle = PhotoImage( 52 | 
Image.open("../../Images/circle.png").resize((65, 65))) 53 | 54 | return rectangle, triangle, circle 55 | 56 | @staticmethod 57 | def coords_to_state(coords): 58 | x = int((coords[0] - 50) / 100) 59 | y = int((coords[1] - 50) / 100) 60 | return [x, y] 61 | 62 | def reset(self): 63 | self.update() 64 | time.sleep(0.5) 65 | x, y = self.canvas.coords(self.rectangle) 66 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 67 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 68 | 69 | def step(self, action): 70 | state = self.canvas.coords(self.rectangle) 71 | base_action = np.array([0, 0]) 72 | self.render() 73 | 74 | if action == 0: # 상 75 | if state[1] > UNIT: 76 | base_action[1] -= UNIT 77 | elif action == 1: # 하 78 | if state[1] < (HEIGHT - 1) * UNIT: 79 | base_action[1] += UNIT 80 | elif action == 2: # 좌 81 | if state[0] > UNIT: 82 | base_action[0] -= UNIT 83 | elif action == 3: # 우 84 | if state[0] < (WIDTH - 1) * UNIT: 85 | base_action[0] += UNIT 86 | # 에이전트 이동 87 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 88 | # 에이전트(빨간 네모)를 가장 상위로 배치 89 | self.canvas.tag_raise(self.rectangle) 90 | 91 | next_state = self.canvas.coords(self.rectangle) 92 | 93 | # 보상 함수 94 | if next_state == self.canvas.coords(self.circle): 95 | reward = 100 96 | done = True 97 | elif next_state in [self.canvas.coords(self.triangle1), 98 | self.canvas.coords(self.triangle2)]: 99 | reward = -100 100 | done = True 101 | else: 102 | reward = 0 103 | done = False 104 | 105 | next_state = self.coords_to_state(next_state) 106 | 107 | return next_state, reward, done 108 | 109 | def render(self): 110 | time.sleep(0.03) 111 | self.update() 112 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/3 - Monte-Carlo/mc_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict 4 | from environment import Env 5 | 6 | 7 | # 몬테카를로 에이전트 (모든 에피소드 각각의 샘플로 부터 학습) 8 | class MCAgent: 9 | def __init__(self, actions): 10 | self.width = 5 11 | self.height = 5 12 | self.actions = actions 13 | self.learning_rate = 0.01 14 | self.discount_factor = 0.9 15 | self.epsilon = 0.1 16 | self.samples = [] 17 | self.value_table = defaultdict(float) 18 | 19 | # 메모리에 샘플을 추가 20 | def save_sample(self, state, reward, done): 21 | self.samples.append([state, reward, done]) 22 | 23 | # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트 24 | def update(self): 25 | G_t = 0 26 | visit_state = [] 27 | for reward in reversed(self.samples): 28 | state = str(reward[0]) 29 | if state not in visit_state: 30 | visit_state.append(state) 31 | G_t = self.discount_factor * (reward[1] + G_t) 32 | value = self.value_table[state] 33 | self.value_table[state] = (value + 34 | self.learning_rate * (G_t - value)) 35 | 36 | # 큐 함수에 따라서 행동을 반환 37 | # 입실론 탐욕 정책에 따라서 행동을 반환 38 | def get_action(self, state): 39 | if np.random.rand() < self.epsilon: 40 | # 랜덤 행동 41 | action = np.random.choice(self.actions) 42 | else: 43 | # 큐 함수에 따른 행동 44 | next_state = self.possible_next_state(state) 45 | action = self.arg_max(next_state) 46 | return int(action) 47 | 48 | # 후보가 여럿이면 arg_max를 계산하고 무작위로 하나를 반환 49 | @staticmethod 50 | def arg_max(next_state): 51 | max_index_list = [] 52 | max_value = next_state[0] 53 | for index, value in enumerate(next_state): 54 | if value > max_value: 55 | max_index_list.clear() 56 | max_value = value 57 | max_index_list.append(index) 58 | elif value == max_value: 59 | 
max_index_list.append(index) 60 | return random.choice(max_index_list) 61 | 62 | # 가능한 다음 모든 상태들을 반환 63 | def possible_next_state(self, state): 64 | col, row = state 65 | next_state = [0.0] * 4 66 | 67 | if row != 0: 68 | next_state[0] = self.value_table[str([col, row - 1])] 69 | else: 70 | next_state[0] = self.value_table[str(state)] 71 | if row != self.height - 1: 72 | next_state[1] = self.value_table[str([col, row + 1])] 73 | else: 74 | next_state[1] = self.value_table[str(state)] 75 | if col != 0: 76 | next_state[2] = self.value_table[str([col - 1, row])] 77 | else: 78 | next_state[2] = self.value_table[str(state)] 79 | if col != self.width - 1: 80 | next_state[3] = self.value_table[str([col + 1, row])] 81 | else: 82 | next_state[3] = self.value_table[str(state)] 83 | 84 | return next_state 85 | 86 | 87 | # 메인 함수 88 | if __name__ == "__main__": 89 | env = Env() 90 | agent = MCAgent(actions=list(range(env.n_actions))) 91 | 92 | for episode in range(1000): 93 | state = env.reset() 94 | action = agent.get_action(state) 95 | 96 | while True: 97 | env.render() 98 | 99 | # 다음 상태로 이동 100 | # 보상은 숫자이고, 완료 여부는 boolean 101 | next_state, reward, done = env.step(action) 102 | agent.save_sample(next_state, reward, done) 103 | 104 | # 다음 행동 받아옴 105 | action = agent.get_action(next_state) 106 | 107 | # 에피소드가 완료됐을 때, 큐 함수 업데이트 108 | if done: 109 | print("episode : ", episode) 110 | agent.update() 111 | agent.samples.clear() 112 | break 113 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/4 - SARSA/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 필셀 수 9 | HEIGHT = 5 # 그리드 월드 가로 10 | WIDTH = 5 # 그리드 월드 세로 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('SARSA') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # 그리드 생성 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # 캔버스에 이미지 추가 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | canvas.pack() 43 | 44 | return canvas 45 | 46 | def load_images(self): 47 | rectangle = PhotoImage( 48 | Image.open("../../Images/rectangle.png").resize((65, 65))) 49 | triangle = PhotoImage( 50 | Image.open("../../Images/triangle.png").resize((65, 65))) 51 | circle = PhotoImage( 52 | Image.open("../../Images/circle.png").resize((65, 65))) 53 | 54 | return rectangle, triangle, circle 55 | 56 | def text_value(self, row, col, contents, action, font='Helvetica', size=10, 57 | style='normal', anchor="nw"): 58 | if action 
== 0: 59 | origin_x, origin_y = 7, 42 60 | elif action == 1: 61 | origin_x, origin_y = 85, 42 62 | elif action == 2: 63 | origin_x, origin_y = 42, 5 64 | else: 65 | origin_x, origin_y = 42, 77 66 | 67 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 68 | font = (font, str(size), style) 69 | text = self.canvas.create_text(x, y, fill="black", text=contents, 70 | font=font, anchor=anchor) 71 | return self.texts.append(text) 72 | 73 | def print_value_all(self, q_table): 74 | for i in self.texts: 75 | self.canvas.delete(i) 76 | self.texts.clear() 77 | for x in range(HEIGHT): 78 | for y in range(WIDTH): 79 | for action in range(0, 4): 80 | state = [x, y] 81 | if str(state) in q_table.keys(): 82 | temp = q_table[str(state)][action] 83 | self.text_value(y, x, round(temp, 2), action) 84 | 85 | def coords_to_state(self, coords): 86 | x = int((coords[0] - 50) / 100) 87 | y = int((coords[1] - 50) / 100) 88 | return [x, y] 89 | 90 | def reset(self): 91 | self.update() 92 | time.sleep(0.5) 93 | x, y = self.canvas.coords(self.rectangle) 94 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 95 | self.render() 96 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 97 | 98 | def step(self, action): 99 | state = self.canvas.coords(self.rectangle) 100 | base_action = np.array([0, 0]) 101 | self.render() 102 | 103 | if action == 0: # 상 104 | if state[1] > UNIT: 105 | base_action[1] -= UNIT 106 | elif action == 1: # 하 107 | if state[1] < (HEIGHT - 1) * UNIT: 108 | base_action[1] += UNIT 109 | elif action == 2: # 좌 110 | if state[0] > UNIT: 111 | base_action[0] -= UNIT 112 | elif action == 3: # 우 113 | if state[0] < (WIDTH - 1) * UNIT: 114 | base_action[0] += UNIT 115 | 116 | # 에이전트 이동 117 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 118 | # 에이전트(빨간 네모)를 가장 상위로 배치 119 | self.canvas.tag_raise(self.rectangle) 120 | next_state = self.canvas.coords(self.rectangle) 121 | 122 | # 보상 함수 123 | if next_state == self.canvas.coords(self.circle): 124 | reward = 100 125 | done = True 126 | elif next_state in [self.canvas.coords(self.triangle1), 127 | self.canvas.coords(self.triangle2)]: 128 | reward = -100 129 | done = True 130 | else: 131 | reward = 0 132 | done = False 133 | 134 | next_state = self.coords_to_state(next_state) 135 | 136 | 137 | 138 | return next_state, reward, done 139 | 140 | def render(self): 141 | time.sleep(0.03) 142 | self.update() 143 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/4 - SARSA/sarsa_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict 4 | from environment import Env 5 | 6 | 7 | class SARSAgent: 8 | def __init__(self, actions): 9 | self.actions = actions 10 | self.learning_rate = 0.01 11 | self.discount_factor = 0.9 12 | self.epsilon = 0.1 13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 14 | 15 | # 의 샘플로부터 큐함수를 업데이트 16 | def learn(self, state, action, reward, next_state, next_action): 17 | current_q = self.q_table[state][action] 18 | next_state_q = self.q_table[next_state][next_action] 19 | new_q = (current_q + self.learning_rate * 20 | (reward + self.discount_factor * next_state_q - current_q)) 21 | self.q_table[state][action] = new_q 22 | 23 | # 입실론 탐욕 정책에 따라서 행동을 반환 24 | def get_action(self, state): 25 | if np.random.rand() < self.epsilon: 26 | # 무작위 행동 반환 27 | action = np.random.choice(self.actions) 28 | else: 29 | # 큐함수에 따른 행동 반환 30 | 
state_action = self.q_table[state] 31 | action = self.arg_max(state_action) 32 | return action 33 | 34 | @staticmethod 35 | def arg_max(state_action): 36 | max_index_list = [] 37 | max_value = state_action[0] 38 | for index, value in enumerate(state_action): 39 | if value > max_value: 40 | max_index_list.clear() 41 | max_value = value 42 | max_index_list.append(index) 43 | elif value == max_value: 44 | max_index_list.append(index) 45 | return random.choice(max_index_list) 46 | 47 | if __name__ == "__main__": 48 | env = Env() 49 | agent = SARSAgent(actions=list(range(env.n_actions))) 50 | 51 | for episode in range(1000): 52 | # 게임 환경과 상태를 초기화 53 | state = env.reset() 54 | # 현재 상태에 대한 행동을 선택 55 | action = agent.get_action(str(state)) 56 | 57 | while True: 58 | env.render() 59 | 60 | # 행동을 위한 후 다음상태 보상 에피소드의 종료 여부를 받아옴 61 | next_state, reward, done = env.step(action) 62 | # 다음 상태에서의 다음 행동 선택 63 | next_action = agent.get_action(str(next_state)) 64 | 65 | # 로 큐함수를 업데이트 66 | agent.learn(str(state), action, reward, str(next_state), next_action) 67 | 68 | state = next_state 69 | action = next_action 70 | 71 | # 모든 큐함수를 화면에 표시 72 | env.print_value_all(agent.q_table) 73 | 74 | if done: 75 | break 76 | 77 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/5 - Q-learning/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드월드 세로 10 | WIDTH = 5 # 그리드월드 가로 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('Q Learning') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # 그리드 생성 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # 캔버스에 이미지 추가 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | canvas.pack() 43 | 44 | return canvas 45 | 46 | def load_images(self): 47 | rectangle = PhotoImage( 48 | Image.open("../../Images/rectangle.png").resize((65, 65))) 49 | triangle = PhotoImage( 50 | Image.open("../../Images/triangle.png").resize((65, 65))) 51 | circle = PhotoImage( 52 | Image.open("../../Images/circle.png").resize((65, 65))) 53 | 54 | return rectangle, triangle, circle 55 | 56 | def text_value(self, row, col, contents, action, font='Helvetica', size=10, 57 | style='normal', anchor="nw"): 58 | 59 | if action == 0: 60 | origin_x, origin_y = 7, 42 61 | elif action == 1: 62 | origin_x, origin_y = 85, 42 63 | elif action == 2: 64 | origin_x, origin_y = 42, 5 65 | else: 66 | origin_x, origin_y = 42, 77 67 | 68 | x, y = origin_y + 
(UNIT * col), origin_x + (UNIT * row) 69 | font = (font, str(size), style) 70 | text = self.canvas.create_text(x, y, fill="black", text=contents, 71 | font=font, anchor=anchor) 72 | return self.texts.append(text) 73 | 74 | def print_value_all(self, q_table): 75 | for i in self.texts: 76 | self.canvas.delete(i) 77 | self.texts.clear() 78 | for i in range(HEIGHT): 79 | for j in range(WIDTH): 80 | for action in range(0, 4): 81 | state = [i, j] 82 | if str(state) in q_table.keys(): 83 | temp = q_table[str(state)][action] 84 | self.text_value(j, i, round(temp, 2), action) 85 | 86 | def coords_to_state(self, coords): 87 | x = int((coords[0] - 50) / 100) 88 | y = int((coords[1] - 50) / 100) 89 | return [x, y] 90 | 91 | def state_to_coords(self, state): 92 | x = int(state[0] * 100 + 50) 93 | y = int(state[1] * 100 + 50) 94 | return [x, y] 95 | 96 | def reset(self): 97 | self.update() 98 | time.sleep(0.5) 99 | x, y = self.canvas.coords(self.rectangle) 100 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 101 | self.render() 102 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 103 | 104 | def step(self, action): 105 | state = self.canvas.coords(self.rectangle) 106 | base_action = np.array([0, 0]) 107 | self.render() 108 | 109 | if action == 0: # 상 110 | if state[1] > UNIT: 111 | base_action[1] -= UNIT 112 | elif action == 1: # 하 113 | if state[1] < (HEIGHT - 1) * UNIT: 114 | base_action[1] += UNIT 115 | elif action == 2: # 좌 116 | if state[0] > UNIT: 117 | base_action[0] -= UNIT 118 | elif action == 3: # 우 119 | if state[0] < (WIDTH - 1) * UNIT: 120 | base_action[0] += UNIT 121 | 122 | # 에이전트 이동 123 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 124 | # 에이전트(빨간 네모)를 가장 상위로 배치 125 | self.canvas.tag_raise(self.rectangle) 126 | next_state = self.canvas.coords(self.rectangle) 127 | 128 | # 보상 함수 129 | if next_state == self.canvas.coords(self.circle): 130 | reward = 100 131 | done = True 132 | elif next_state in [self.canvas.coords(self.triangle1), 133 | self.canvas.coords(self.triangle2)]: 134 | reward = -100 135 | done = True 136 | else: 137 | reward = 0 138 | done = False 139 | 140 | next_state = self.coords_to_state(next_state) 141 | return next_state, reward, done 142 | 143 | def render(self): 144 | time.sleep(0.03) 145 | self.update() 146 | -------------------------------------------------------------------------------- /2 - Examples/1 - Day 1/5 - Q-learning/q_learning_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from environment import Env 4 | from collections import defaultdict 5 | 6 | class QLearningAgent: 7 | def __init__(self, actions): 8 | # 행동 = [0, 1, 2, 3] 순서대로 상, 하, 좌, 우 9 | self.actions = actions 10 | self.learning_rate = 0.01 11 | self.discount_factor = 0.9 12 | self.epsilon = 0.9 13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 14 | 15 | # 샘플로부터 큐함수 업데이트 16 | def learn(self, state, action, reward, next_state): 17 | q_1 = self.q_table[state][action] 18 | # 벨만 최적 방정식을 사용한 큐함수의 업데이트 19 | q_2 = reward + self.discount_factor * max(self.q_table[next_state]) 20 | self.q_table[state][action] += self.learning_rate * (q_2 - q_1) 21 | 22 | # 큐함수에 의거하여 입실론 탐욕 정책에 따라서 행동을 반환 23 | def get_action(self, state): 24 | if np.random.rand() < self.epsilon: 25 | # 무작위 행동 반환 26 | action = np.random.choice(self.actions) 27 | else: 28 | # 큐함수에 따른 행동 반환 29 | state_action = self.q_table[state] 30 | action = self.arg_max(state_action) 31 | return action 32 | 33 
| @staticmethod 34 | def arg_max(state_action): 35 | max_index_list = [] 36 | max_value = state_action[0] 37 | for index, value in enumerate(state_action): 38 | if value > max_value: 39 | max_index_list.clear() 40 | max_value = value 41 | max_index_list.append(index) 42 | elif value == max_value: 43 | max_index_list.append(index) 44 | return random.choice(max_index_list) 45 | 46 | if __name__ == "__main__": 47 | env = Env() 48 | agent = QLearningAgent(actions=list(range(env.n_actions))) 49 | 50 | for episode in range(1000): 51 | state = env.reset() 52 | 53 | while True: 54 | env.render() 55 | 56 | # 현재 상태에 대한 행동 선택 57 | action = agent.get_action(str(state)) 58 | # 행동을 취한 후 다음 상태, 보상 에피소드의 종료여부를 받아옴 59 | next_state, reward, done = env.step(action) 60 | 61 | # 로 큐함수를 업데이트 62 | agent.learn(str(state), action, reward, str(next_state)) 63 | state = next_state 64 | # 모든 큐함수를 화면에 표시 65 | env.print_value_all(agent.q_table) 66 | 67 | if done: 68 | break 69 | -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/1 - Deep SARSA/deep_sarsa_agent.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pylab 3 | import random 4 | import numpy as np 5 | from environment import Env 6 | from keras.layers import Dense 7 | from keras.optimizers import Adam 8 | from keras.models import Sequential 9 | 10 | EPISODES = 1000 11 | 12 | 13 | # 그리드월드 예제에서의 딥살사 에이전트 14 | class DeepSARSAgent: 15 | def __init__(self): 16 | self.load_model = False 17 | # 에이전트가 가능한 모든 행동 정의 18 | self.action_space = [0, 1, 2, 3, 4] 19 | # 상태의 크기와 행동의 크기 정의 20 | self.action_size = len(self.action_space) 21 | self.state_size = 15 22 | self.discount_factor = 0.99 23 | self.learning_rate = 0.001 24 | 25 | self.epsilon = 1. 
# exploration 26 | self.epsilon_decay = .9999 27 | self.epsilon_min = 0.01 28 | self.model = self.build_model() 29 | 30 | if self.load_model: 31 | self.epsilon = 0.05 32 | self.model.load_weights('./save_model/deep_sarsa_trained.h5') 33 | 34 | # 상태가 입력 큐함수가 출력인 인공신경망 생성 35 | def build_model(self): 36 | model = Sequential() 37 | model.add(Dense(30, input_dim=self.state_size, activation='relu')) 38 | model.add(Dense(30, activation='relu')) 39 | model.add(Dense(self.action_size, activation='linear')) 40 | model.summary() 41 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 42 | return model 43 | 44 | # 입실론 탐욕 방법으로 행동 선택 45 | def get_action(self, state): 46 | if np.random.rand() <= self.epsilon: 47 | # 무작위 행동 반환 48 | return random.randrange(self.action_size) 49 | else: 50 | # 모델로부터 행동 산출 51 | state = np.float32(state) 52 | q_values = self.model.predict(state) 53 | return np.argmax(q_values[0]) 54 | 55 | def train_model(self, state, action, reward, next_state, next_action, done): 56 | if self.epsilon > self.epsilon_min: 57 | self.epsilon *= self.epsilon_decay 58 | 59 | state = np.float32(state) 60 | next_state = np.float32(next_state) 61 | target = self.model.predict(state)[0] 62 | # 살사의 큐함수 업데이트 식 63 | if done: 64 | target[action] = reward 65 | else: 66 | target[action] = (reward + self.discount_factor * 67 | self.model.predict(next_state)[0][next_action]) 68 | 69 | # 출력 값 reshape 70 | target = np.reshape(target, [1, 5]) 71 | # 인공신경망 업데이트 72 | self.model.fit(state, target, epochs=1, verbose=0) 73 | 74 | 75 | if __name__ == "__main__": 76 | # 환경과 에이전트 생성 77 | env = Env() 78 | agent = DeepSARSAgent() 79 | 80 | global_step = 0 81 | scores, episodes = [], [] 82 | 83 | for e in range(EPISODES): 84 | done = False 85 | score = 0 86 | state = env.reset() 87 | state = np.reshape(state, [1, 15]) 88 | 89 | while not done: 90 | # env 초기화 91 | global_step += 1 92 | 93 | # 현재 상태에 대한 행동 선택 94 | action = agent.get_action(state) 95 | # 선택한 행동으로 환경에서 한 타임스텝 진행 후 샘플 수집 96 | next_state, reward, done = env.step(action) 97 | next_state = np.reshape(next_state, [1, 15]) 98 | next_action = agent.get_action(next_state) 99 | # 샘플로 모델 학습 100 | agent.train_model(state, action, reward, next_state, next_action, 101 | done) 102 | state = next_state 103 | score += reward 104 | 105 | state = copy.deepcopy(next_state) 106 | 107 | if done: 108 | # 에피소드마다 학습 결과 출력 109 | scores.append(score) 110 | episodes.append(e) 111 | pylab.plot(episodes, scores, 'b') 112 | pylab.savefig("./save_graph/deep_sarsa_.png") 113 | print("episode:", e, " score:", score, "global_step", 114 | global_step, " epsilon:", agent.epsilon) 115 | 116 | # 100 에피소드마다 모델 저장 117 | if e % 100 == 0: 118 | agent.model.save_weights("./save_model/deep_sarsa.h5") 119 | -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/1 - Deep SARSA/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | PhotoImage = ImageTk.PhotoImage 7 | UNIT = 50 # 픽셀 수 8 | HEIGHT = 5 # 그리드 세로 9 | WIDTH = 5 # 그리드 가로 10 | 11 | np.random.seed(1) 12 | 13 | 14 | class Env(tk.Tk): 15 | def __init__(self): 16 | super(Env, self).__init__() 17 | self.action_space = ['u', 'd', 'l', 'r'] 18 | self.action_size = len(self.action_space) 19 | self.title('DeepSARSA') 20 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 21 | self.shapes = self.load_images() 22 | self.canvas = 
self._build_canvas() 23 | self.counter = 0 24 | self.rewards = [] 25 | self.goal = [] 26 | # 장애물 설정 27 | self.set_reward([0, 1], -1) 28 | self.set_reward([1, 2], -1) 29 | self.set_reward([2, 3], -1) 30 | # 목표 지점 설정 31 | self.set_reward([4, 4], 1) 32 | 33 | def _build_canvas(self): 34 | canvas = tk.Canvas(self, bg='white', 35 | height=HEIGHT * UNIT, 36 | width=WIDTH * UNIT) 37 | # 그리드 생성 38 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 40 | canvas.create_line(x0, y0, x1, y1) 41 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 42 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 43 | canvas.create_line(x0, y0, x1, y1) 44 | 45 | self.rewards = [] 46 | self.goal = [] 47 | # 캔버스에 이미지 추가 48 | x, y = UNIT/2, UNIT/2 49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) 50 | 51 | canvas.pack() 52 | 53 | return canvas 54 | 55 | def load_images(self): 56 | rectangle = PhotoImage( 57 | Image.open("../../Images/rectangle.png").resize((30, 30))) 58 | triangle = PhotoImage( 59 | Image.open("../../Images/triangle.png").resize((30, 30))) 60 | circle = PhotoImage( 61 | Image.open("../../Images/circle.png").resize((30, 30))) 62 | 63 | return rectangle, triangle, circle 64 | 65 | def reset_reward(self): 66 | 67 | for reward in self.rewards: 68 | self.canvas.delete(reward['figure']) 69 | 70 | self.rewards.clear() 71 | self.goal.clear() 72 | self.set_reward([0, 1], -1) 73 | self.set_reward([1, 2], -1) 74 | self.set_reward([2, 3], -1) 75 | 76 | # #goal 77 | self.set_reward([4, 4], 1) 78 | 79 | def set_reward(self, state, reward): 80 | state = [int(state[0]), int(state[1])] 81 | x = int(state[0]) 82 | y = int(state[1]) 83 | temp = {} 84 | if reward > 0: 85 | temp['reward'] = reward 86 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 87 | (UNIT * y) + UNIT / 2, 88 | image=self.shapes[2]) 89 | 90 | self.goal.append(temp['figure']) 91 | 92 | 93 | elif reward < 0: 94 | temp['direction'] = -1 95 | temp['reward'] = reward 96 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 97 | (UNIT * y) + UNIT / 2, 98 | image=self.shapes[1]) 99 | 100 | temp['coords'] = self.canvas.coords(temp['figure']) 101 | temp['state'] = state 102 | self.rewards.append(temp) 103 | 104 | # new methods 105 | 106 | def check_if_reward(self, state): 107 | check_list = dict() 108 | check_list['if_goal'] = False 109 | rewards = 0 110 | 111 | for reward in self.rewards: 112 | if reward['state'] == state: 113 | rewards += reward['reward'] 114 | if reward['reward'] == 1: 115 | check_list['if_goal'] = True 116 | 117 | check_list['rewards'] = rewards 118 | 119 | return check_list 120 | 121 | def coords_to_state(self, coords): 122 | x = int((coords[0] - UNIT / 2) / UNIT) 123 | y = int((coords[1] - UNIT / 2) / UNIT) 124 | return [x, y] 125 | 126 | def reset(self): 127 | self.update() 128 | time.sleep(0.5) 129 | x, y = self.canvas.coords(self.rectangle) 130 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 131 | self.reset_reward() 132 | return self.get_state() 133 | 134 | def step(self, action): 135 | self.counter += 1 136 | self.render() 137 | 138 | if self.counter % 2 == 1: 139 | self.rewards = self.move_rewards() 140 | 141 | next_coords = self.move(self.rectangle, action) 142 | check = self.check_if_reward(self.coords_to_state(next_coords)) 143 | done = check['if_goal'] 144 | reward = check['rewards'] 145 | 146 | self.canvas.tag_raise(self.rectangle) 147 | 148 | s_ = self.get_state() 149 | 150 | return s_, reward, done 151 | 152 | def 
get_state(self): 153 | 154 | location = self.coords_to_state(self.canvas.coords(self.rectangle)) 155 | agent_x = location[0] 156 | agent_y = location[1] 157 | 158 | states = list() 159 | 160 | for reward in self.rewards: 161 | reward_location = reward['state'] 162 | states.append(reward_location[0] - agent_x) 163 | states.append(reward_location[1] - agent_y) 164 | if reward['reward'] < 0: 165 | states.append(-1) 166 | states.append(reward['direction']) 167 | else: 168 | states.append(1) 169 | 170 | return states 171 | 172 | def move_rewards(self): 173 | new_rewards = [] 174 | for temp in self.rewards: 175 | if temp['reward'] == 1: 176 | new_rewards.append(temp) 177 | continue 178 | temp['coords'] = self.move_const(temp) 179 | temp['state'] = self.coords_to_state(temp['coords']) 180 | new_rewards.append(temp) 181 | return new_rewards 182 | 183 | def move_const(self, target): 184 | 185 | s = self.canvas.coords(target['figure']) 186 | 187 | base_action = np.array([0, 0]) 188 | 189 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: 190 | target['direction'] = 1 191 | elif s[0] == UNIT / 2: 192 | target['direction'] = -1 193 | 194 | if target['direction'] == -1: 195 | base_action[0] += UNIT 196 | elif target['direction'] == 1: 197 | base_action[0] -= UNIT 198 | 199 | if (target['figure'] is not self.rectangle 200 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): 201 | base_action = np.array([0, 0]) 202 | 203 | self.canvas.move(target['figure'], base_action[0], base_action[1]) 204 | 205 | s_ = self.canvas.coords(target['figure']) 206 | 207 | return s_ 208 | 209 | def move(self, target, action): 210 | s = self.canvas.coords(target) 211 | 212 | base_action = np.array([0, 0]) 213 | 214 | if action == 0: # 상 215 | if s[1] > UNIT: 216 | base_action[1] -= UNIT 217 | elif action == 1: # 하 218 | if s[1] < (HEIGHT - 1) * UNIT: 219 | base_action[1] += UNIT 220 | elif action == 2: # 우 221 | if s[0] < (WIDTH - 1) * UNIT: 222 | base_action[0] += UNIT 223 | elif action == 3: # 좌 224 | if s[0] > UNIT: 225 | base_action[0] -= UNIT 226 | 227 | self.canvas.move(target, base_action[0], base_action[1]) 228 | 229 | s_ = self.canvas.coords(target) 230 | 231 | return s_ 232 | 233 | def render(self): 234 | # 게임 속도 조정 235 | time.sleep(0.05) 236 | self.update() 237 | -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/1 - Deep SARSA/save_graph/deep_sarsa_trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/1 - Deep SARSA/save_graph/deep_sarsa_trained.png -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/1 - Deep SARSA/save_model/deep_sarsa_trained.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/1 - Deep SARSA/save_model/deep_sarsa_trained.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/2 - REINFORCE/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | PhotoImage = ImageTk.PhotoImage 7 | UNIT = 50 # 픽셀 수 8 | HEIGHT = 5 # 그리드월드 세로 9 | WIDTH = 5 # 그리드월드 가로 10 | 11 | 
np.random.seed(1) 12 | 13 | 14 | class Env(tk.Tk): 15 | def __init__(self): 16 | super(Env, self).__init__() 17 | self.action_space = ['u', 'd', 'l', 'r'] 18 | self.action_size = len(self.action_space) 19 | self.title('Reinforce') 20 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 21 | self.shapes = self.load_images() 22 | self.canvas = self._build_canvas() 23 | self.counter = 0 24 | self.rewards = [] 25 | self.goal = [] 26 | # 장애물 설정 27 | self.set_reward([0, 1], -1) 28 | self.set_reward([1, 2], -1) 29 | self.set_reward([2, 3], -1) 30 | # 목표지점 설정 31 | self.set_reward([4, 4], 1) 32 | 33 | def _build_canvas(self): 34 | canvas = tk.Canvas(self, bg='white', 35 | height=HEIGHT * UNIT, 36 | width=WIDTH * UNIT) 37 | # 그리드 생성 38 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 40 | canvas.create_line(x0, y0, x1, y1) 41 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 42 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 43 | canvas.create_line(x0, y0, x1, y1) 44 | 45 | self.rewards = [] 46 | self.goal = [] 47 | # 캔버스에 이미지 추가 48 | x, y = UNIT/2, UNIT/2 49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) 50 | 51 | canvas.pack() 52 | 53 | return canvas 54 | 55 | def load_images(self): 56 | rectangle = PhotoImage( 57 | Image.open("../../Images/rectangle.png").resize((30, 30))) 58 | triangle = PhotoImage( 59 | Image.open("../../Images/triangle.png").resize((30, 30))) 60 | circle = PhotoImage( 61 | Image.open("../../Images/circle.png").resize((30, 30))) 62 | 63 | return rectangle, triangle, circle 64 | 65 | def reset_reward(self): 66 | 67 | for reward in self.rewards: 68 | self.canvas.delete(reward['figure']) 69 | 70 | self.rewards.clear() 71 | self.goal.clear() 72 | self.set_reward([0, 1], -1) 73 | self.set_reward([1, 2], -1) 74 | self.set_reward([2, 3], -1) 75 | 76 | # 목표 지점 77 | self.set_reward([4, 4], 1) 78 | 79 | def set_reward(self, state, reward): 80 | state = [int(state[0]), int(state[1])] 81 | x = int(state[0]) 82 | y = int(state[1]) 83 | temp = {} 84 | if reward > 0: 85 | temp['reward'] = reward 86 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 87 | (UNIT * y) + UNIT / 2, 88 | image=self.shapes[2]) 89 | 90 | self.goal.append(temp['figure']) 91 | 92 | 93 | elif reward < 0: 94 | temp['direction'] = -1 95 | temp['reward'] = reward 96 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 97 | (UNIT * y) + UNIT / 2, 98 | image=self.shapes[1]) 99 | 100 | temp['coords'] = self.canvas.coords(temp['figure']) 101 | temp['state'] = state 102 | self.rewards.append(temp) 103 | 104 | def check_if_reward(self, state): 105 | check_list = dict() 106 | check_list['if_goal'] = False 107 | rewards = 0 108 | 109 | for reward in self.rewards: 110 | if reward['state'] == state: 111 | rewards += reward['reward'] 112 | if reward['reward'] > 0: 113 | check_list['if_goal'] = True 114 | 115 | check_list['rewards'] = rewards 116 | 117 | return check_list 118 | 119 | def coords_to_state(self, coords): 120 | x = int((coords[0] - UNIT / 2) / UNIT) 121 | y = int((coords[1] - UNIT / 2) / UNIT) 122 | return [x, y] 123 | 124 | def reset(self): 125 | self.update() 126 | x, y = self.canvas.coords(self.rectangle) 127 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 128 | self.reset_reward() 129 | return self.get_state() 130 | 131 | def step(self, action): 132 | self.counter += 1 133 | self.render() 134 | 135 | if self.counter % 2 == 1: 136 | self.rewards = self.move_rewards() 137 | 138 | next_coords = 
self.move(self.rectangle, action) 139 | check = self.check_if_reward(self.coords_to_state(next_coords)) 140 | done = check['if_goal'] 141 | reward = check['rewards'] 142 | reward -= 0.1 143 | self.canvas.tag_raise(self.rectangle) 144 | 145 | s_ = self.get_state() 146 | 147 | return s_, reward, done 148 | 149 | def get_state(self): 150 | 151 | location = self.coords_to_state(self.canvas.coords(self.rectangle)) 152 | agent_x = location[0] 153 | agent_y = location[1] 154 | 155 | states = list() 156 | 157 | for reward in self.rewards: 158 | reward_location = reward['state'] 159 | states.append(reward_location[0] - agent_x) 160 | states.append(reward_location[1] - agent_y) 161 | if reward['reward'] < 0: 162 | states.append(-1) 163 | states.append(reward['direction']) 164 | else: 165 | states.append(1) 166 | 167 | return states 168 | 169 | def move_rewards(self): 170 | new_rewards = [] 171 | for temp in self.rewards: 172 | if temp['reward'] > 0: 173 | new_rewards.append(temp) 174 | continue 175 | temp['coords'] = self.move_const(temp) 176 | temp['state'] = self.coords_to_state(temp['coords']) 177 | new_rewards.append(temp) 178 | return new_rewards 179 | 180 | def move_const(self, target): 181 | 182 | s = self.canvas.coords(target['figure']) 183 | 184 | base_action = np.array([0, 0]) 185 | 186 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: 187 | target['direction'] = 1 188 | elif s[0] == UNIT / 2: 189 | target['direction'] = -1 190 | 191 | if target['direction'] == -1: 192 | base_action[0] += UNIT 193 | elif target['direction'] == 1: 194 | base_action[0] -= UNIT 195 | 196 | if (target['figure'] is not self.rectangle 197 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): 198 | base_action = np.array([0, 0]) 199 | 200 | self.canvas.move(target['figure'], base_action[0], base_action[1]) 201 | 202 | s_ = self.canvas.coords(target['figure']) 203 | 204 | return s_ 205 | 206 | def move(self, target, action): 207 | s = self.canvas.coords(target) 208 | 209 | base_action = np.array([0, 0]) 210 | 211 | if action == 0: # 상 212 | if s[1] > UNIT: 213 | base_action[1] -= UNIT 214 | elif action == 1: # 하 215 | if s[1] < (HEIGHT - 1) * UNIT: 216 | base_action[1] += UNIT 217 | elif action == 2: # 우 218 | if s[0] < (WIDTH - 1) * UNIT: 219 | base_action[0] += UNIT 220 | elif action == 3: # 좌 221 | if s[0] > UNIT: 222 | base_action[0] -= UNIT 223 | 224 | self.canvas.move(target, base_action[0], base_action[1]) 225 | 226 | s_ = self.canvas.coords(target) 227 | 228 | return s_ 229 | 230 | def render(self): 231 | # 게임 속도 조정 232 | time.sleep(0.07) 233 | self.update() 234 | -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/2 - REINFORCE/reinforce_agent.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pylab 3 | import numpy as np 4 | from environment import Env 5 | from keras.layers import Dense 6 | from keras.optimizers import Adam 7 | from keras.models import Sequential 8 | from keras import backend as K 9 | 10 | EPISODES = 2500 11 | 12 | # 그리드월드 예제에서의 REINFORCE 에이전트 13 | class ReinforceAgent: 14 | def __init__(self): 15 | self.load_model = False 16 | # 가능한 모든 행동 정의 17 | self.action_space = [0, 1, 2, 3, 4] 18 | # 상태와 행동의 크기 정의 19 | self.action_size = len(self.action_space) 20 | self.state_size = 15 21 | self.discount_factor = 0.99 22 | self.learning_rate = 0.001 23 | 24 | self.model = self.build_model() 25 | self.optimizer = self.optimizer() 26 | self.states, self.actions, self.rewards = [], [], [] 27 | 
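        # (Added explanatory note; not part of the original agent code.)
        # self.optimizer(), defined further down, builds the REINFORCE loss
        #   L = -sum_t log(pi(a_t | s_t)) * G_t
        # where G_t is the return computed by discount_rewards(). A small
        # worked example using the discount_factor of 0.99 above and a
        # hypothetical reward sequence [0, 0, 1]:
        #   G_2 = 1
        #   G_1 = 0 + 0.99 * 1    = 0.99
        #   G_0 = 0 + 0.99 * 0.99 = 0.9801
        # train_model() then normalizes these returns (subtract the mean,
        # divide by the standard deviation) before passing them, together
        # with the stored states and one-hot actions, to the training function.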
28 | if self.load_model: 29 | self.model.load_weights('./save_model/reinforce_trained.h5') 30 | 31 | # 상태가 입력, 각 행동의 확률이 출력인 인공신경망 생성 32 | def build_model(self): 33 | model = Sequential() 34 | model.add(Dense(24, input_dim=self.state_size, activation='relu')) 35 | model.add(Dense(24, activation='relu')) 36 | model.add(Dense(self.action_size, activation='softmax')) 37 | model.summary() 38 | return model 39 | 40 | # 정책신경망을 업데이트 하기 위한 오류함수와 훈련함수의 생성 41 | def optimizer(self): 42 | action = K.placeholder(shape=[None, 5]) 43 | discounted_rewards = K.placeholder(shape=[None, ]) 44 | 45 | # 크로스 엔트로피 오류함수 계산 46 | action_prob = K.sum(action * self.model.output, axis=1) 47 | cross_entropy = K.log(action_prob) * discounted_rewards 48 | loss = -K.sum(cross_entropy) 49 | 50 | # 정책신경망을 업데이트하는 훈련함수 생성 51 | optimizer = Adam(lr=self.learning_rate) 52 | updates = optimizer.get_updates(self.model.trainable_weights,[], 53 | loss) 54 | train = K.function([self.model.input, action, discounted_rewards], [], 55 | updates=updates) 56 | 57 | return train 58 | 59 | # 정책신경망으로 행동 선택 60 | def get_action(self, state): 61 | policy = self.model.predict(state)[0] 62 | return np.random.choice(self.action_size, 1, p=policy)[0] 63 | 64 | # 반환값 계산 65 | def discount_rewards(self, rewards): 66 | discounted_rewards = np.zeros_like(rewards) 67 | running_add = 0 68 | for t in reversed(range(0, len(rewards))): 69 | running_add = running_add * self.discount_factor + rewards[t] 70 | discounted_rewards[t] = running_add 71 | return discounted_rewards 72 | 73 | # 한 에피소드 동안의 상태, 행동, 보상을 저장 74 | def append_sample(self, state, action, reward): 75 | self.states.append(state[0]) 76 | self.rewards.append(reward) 77 | act = np.zeros(self.action_size) 78 | act[action] = 1 79 | self.actions.append(act) 80 | 81 | # 정책신경망 업데이트 82 | def train_model(self): 83 | discounted_rewards = np.float32(self.discount_rewards(self.rewards)) 84 | discounted_rewards -= np.mean(discounted_rewards) 85 | discounted_rewards /= np.std(discounted_rewards) 86 | 87 | self.optimizer([self.states, self.actions, discounted_rewards]) 88 | self.states, self.actions, self.rewards = [], [], [] 89 | 90 | 91 | if __name__ == "__main__": 92 | # 환경과 에이전트의 생성 93 | env = Env() 94 | agent = ReinforceAgent() 95 | 96 | global_step = 0 97 | scores, episodes = [], [] 98 | 99 | for e in range(EPISODES): 100 | done = False 101 | score = 0 102 | # env 초기화 103 | state = env.reset() 104 | state = np.reshape(state, [1, 15]) 105 | 106 | while not done: 107 | global_step += 1 108 | # 현재 상태에 대한 행동 선택 109 | action = agent.get_action(state) 110 | # 선택한 행동으로 환경에서 한 타임스탭 진행 후 샘플 수집 111 | next_state, reward, done = env.step(action) 112 | next_state = np.reshape(next_state, [1, 15]) 113 | 114 | agent.append_sample(state, action, reward) 115 | score += reward 116 | state = copy.deepcopy(next_state) 117 | 118 | if done: 119 | # 에피소드마다 정책신경망 업데이트 120 | agent.train_model() 121 | scores.append(score) 122 | episodes.append(e) 123 | score = round(score,2) 124 | print("episode:", e, " score:", score, " time_step:", 125 | global_step) 126 | 127 | # 100 에피소드마다 학습 결과 출력 및 모델 저장 128 | if e % 100 == 0: 129 | pylab.plot(episodes, scores, 'b') 130 | pylab.savefig("./save_graph/reinforce.png") 131 | agent.model.save_weights("./save_model/reinforce.h5") 132 | -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/2 - REINFORCE/save_graph/reinforce_trained.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/2 - REINFORCE/save_graph/reinforce_trained.png -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/2 - REINFORCE/save_model/reinforce_trained.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/2 - REINFORCE/save_model/reinforce_trained.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/3 - DQN/cartpole_dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import random 5 | import numpy as np 6 | from collections import deque 7 | from keras.layers import Dense 8 | from keras.optimizers import Adam 9 | from keras.models import Sequential 10 | 11 | EPISODES = 300 12 | 13 | 14 | # 카트폴 예제에서의 DQN 에이전트 15 | class DQNAgent: 16 | def __init__(self, state_size, action_size): 17 | self.render = False 18 | self.load_model = False 19 | 20 | # 상태와 행동의 크기 정의 21 | self.state_size = state_size 22 | self.action_size = action_size 23 | 24 | # DQN 하이퍼파라미터 25 | self.discount_factor = 0.99 26 | self.learning_rate = 0.001 27 | self.epsilon = 1.0 28 | self.epsilon_decay = 0.999 29 | self.epsilon_min = 0.01 30 | self.batch_size = 64 31 | self.train_start = 1000 32 | 33 | # 리플레이 메모리, 최대 크기 2000 34 | self.memory = deque(maxlen=2000) 35 | 36 | # 모델과 타깃 모델 생성 37 | self.model = self.build_model() 38 | self.target_model = self.build_model() 39 | 40 | # 타깃 모델 초기화 41 | self.update_target_model() 42 | 43 | if self.load_model: 44 | self.model.load_weights("./save_model/cartpole_dqn_trained.h5") 45 | 46 | # 상태가 입력, 큐함수가 출력인 인공신경망 생성 47 | def build_model(self): 48 | model = Sequential() 49 | model.add(Dense(24, input_dim=self.state_size, activation='relu', 50 | kernel_initializer='he_uniform')) 51 | model.add(Dense(24, activation='relu', 52 | kernel_initializer='he_uniform')) 53 | model.add(Dense(self.action_size, activation='linear', 54 | kernel_initializer='he_uniform')) 55 | model.summary() 56 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 57 | return model 58 | 59 | # 타깃 모델을 모델의 가중치로 업데이트 60 | def update_target_model(self): 61 | self.target_model.set_weights(self.model.get_weights()) 62 | 63 | # 입실론 탐욕 정책으로 행동 선택 64 | def get_action(self, state): 65 | if np.random.rand() <= self.epsilon: 66 | return random.randrange(self.action_size) 67 | else: 68 | q_value = self.model.predict(state) 69 | return np.argmax(q_value[0]) 70 | 71 | # 샘플 을 리플레이 메모리에 저장 72 | def append_sample(self, state, action, reward, next_state, done): 73 | self.memory.append((state, action, reward, next_state, done)) 74 | 75 | # 리플레이 메모리에서 무작위로 추출한 배치로 모델 학습 76 | def train_model(self): 77 | if self.epsilon > self.epsilon_min: 78 | self.epsilon *= self.epsilon_decay 79 | 80 | # 메모리에서 배치 크기만큼 무작위로 샘플 추출 81 | mini_batch = random.sample(self.memory, self.batch_size) 82 | 83 | states = np.zeros((self.batch_size, self.state_size)) 84 | next_states = np.zeros((self.batch_size, self.state_size)) 85 | actions, rewards, dones = [], [], [] 86 | 87 | for i in range(self.batch_size): 88 | states[i] = mini_batch[i][0] 89 | actions.append(mini_batch[i][1]) 90 | rewards.append(mini_batch[i][2]) 91 | next_states[i] = mini_batch[i][3] 92 | 
dones.append(mini_batch[i][4]) 93 | 94 | # 현재 상태에 대한 모델의 큐함수 95 | # 다음 상태에 대한 타깃 모델의 큐함수 96 | target = self.model.predict(states) 97 | target_val = self.target_model.predict(next_states) 98 | 99 | # 벨만 최적 방정식을 이용한 업데이트 타깃 100 | for i in range(self.batch_size): 101 | if dones[i]: 102 | target[i][actions[i]] = rewards[i] 103 | else: 104 | target[i][actions[i]] = rewards[i] + self.discount_factor * ( 105 | np.amax(target_val[i])) 106 | 107 | self.model.fit(states, target, batch_size=self.batch_size, 108 | epochs=1, verbose=0) 109 | 110 | 111 | if __name__ == "__main__": 112 | # CartPole-v1 환경, 최대 타임스텝 수가 500 113 | env = gym.make('CartPole-v1') 114 | state_size = env.observation_space.shape[0] 115 | action_size = env.action_space.n 116 | 117 | # DQN 에이전트 생성 118 | agent = DQNAgent(state_size, action_size) 119 | 120 | scores, episodes = [], [] 121 | 122 | for e in range(EPISODES): 123 | done = False 124 | score = 0 125 | # env 초기화 126 | state = env.reset() 127 | state = np.reshape(state, [1, state_size]) 128 | 129 | while not done: 130 | if agent.render: 131 | env.render() 132 | 133 | # 현재 상태로 행동을 선택 134 | action = agent.get_action(state) 135 | # 선택한 행동으로 환경에서 한 타임스텝 진행 136 | next_state, reward, done, info = env.step(action) 137 | next_state = np.reshape(next_state, [1, state_size]) 138 | # 에피소드가 중간에 끝나면 -100 보상 139 | reward = reward if not done or score == 499 else -100 140 | 141 | # 리플레이 메모리에 샘플 저장 142 | agent.append_sample(state, action, reward, next_state, done) 143 | # 매 타임스텝마다 학습 144 | if len(agent.memory) >= agent.train_start: 145 | agent.train_model() 146 | 147 | score += reward 148 | state = next_state 149 | 150 | if done: 151 | # 각 에피소드마다 타깃 모델을 모델의 가중치로 업데이트 152 | agent.update_target_model() 153 | 154 | score = score if score == 500 else score + 100 155 | # 에피소드마다 학습 결과 출력 156 | scores.append(score) 157 | episodes.append(e) 158 | pylab.plot(episodes, scores, 'b') 159 | pylab.savefig("./save_graph/cartpole_dqn.png") 160 | print("episode:", e, " score:", score, " memory length:", 161 | len(agent.memory), " epsilon:", agent.epsilon) 162 | 163 | # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단 164 | if np.mean(scores[-min(10, len(scores)):]) > 490: 165 | agent.model.save_weights("./save_model/cartpole_dqn.h5") 166 | sys.exit() -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/3 - DQN/save_graph/cartpole_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/3 - DQN/save_graph/cartpole_dqn.png -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/3 - DQN/save_model/cartpole_dqn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/3 - DQN/save_model/cartpole_dqn.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/3 - DQN/save_model/cartpole_dqn_trained.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/3 - DQN/save_model/cartpole_dqn_trained.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/4 
- Actor-Critic/cartpole_a2c.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import numpy as np 5 | from keras.layers import Dense 6 | from keras.models import Sequential 7 | from keras.optimizers import Adam 8 | from keras import backend as K 9 | 10 | EPISODES = 1000 11 | 12 | 13 | # 카트폴 예제에서의 액터-크리틱(A2C) 에이전트 14 | class A2CAgent: 15 | def __init__(self, state_size, action_size): 16 | self.render = False 17 | self.load_model = False 18 | # 상태와 행동의 크기 정의 19 | self.state_size = state_size 20 | self.action_size = action_size 21 | self.value_size = 1 22 | 23 | # 액터-크리틱 하이퍼파라미터 24 | self.discount_factor = 0.99 25 | self.actor_lr = 0.001 26 | self.critic_lr = 0.005 27 | 28 | # 정책신경망과 가치신경망 생성 29 | self.actor = self.build_actor() 30 | self.critic = self.build_critic() 31 | self.actor_updater = self.actor_optimizer() 32 | self.critic_updater = self.critic_optimizer() 33 | 34 | if self.load_model: 35 | self.actor.load_weights("./save_model/cartpole_actor_trained.h5") 36 | self.critic.load_weights("./save_model/cartpole_critic_trained.h5") 37 | 38 | # actor: 상태를 받아 각 행동의 확률을 계산 39 | def build_actor(self): 40 | actor = Sequential() 41 | actor.add(Dense(24, input_dim=self.state_size, activation='relu', 42 | kernel_initializer='he_uniform')) 43 | actor.add(Dense(self.action_size, activation='softmax', 44 | kernel_initializer='he_uniform')) 45 | actor.summary() 46 | return actor 47 | 48 | # critic: 상태를 받아서 상태의 가치를 계산 49 | def build_critic(self): 50 | critic = Sequential() 51 | critic.add(Dense(24, input_dim=self.state_size, activation='relu', 52 | kernel_initializer='he_uniform')) 53 | critic.add(Dense(24, input_dim=self.state_size, activation='relu', 54 | kernel_initializer='he_uniform')) 55 | critic.add(Dense(self.value_size, activation='linear', 56 | kernel_initializer='he_uniform')) 57 | critic.summary() 58 | return critic 59 | 60 | # 정책신경망의 출력을 받아 확률적으로 행동을 선택 61 | def get_action(self, state): 62 | policy = self.actor.predict(state, batch_size=1).flatten() 63 | return np.random.choice(self.action_size, 1, p=policy)[0] 64 | 65 | # 정책신경망을 업데이트하는 함수 66 | def actor_optimizer(self): 67 | action = K.placeholder(shape=[None, self.action_size]) 68 | advantage = K.placeholder(shape=[None, ]) 69 | 70 | action_prob = K.sum(action * self.actor.output, axis=1) 71 | cross_entropy = K.log(action_prob) * advantage 72 | loss = -K.sum(cross_entropy) 73 | 74 | optimizer = Adam(lr=self.actor_lr) 75 | updates = optimizer.get_updates(self.actor.trainable_weights, [], loss) 76 | train = K.function([self.actor.input, action, advantage], [], 77 | updates=updates) 78 | return train 79 | 80 | # 가치신경망을 업데이트하는 함수 81 | def critic_optimizer(self): 82 | target = K.placeholder(shape=[None, ]) 83 | 84 | loss = K.mean(K.square(target - self.critic.output)) 85 | 86 | optimizer = Adam(lr=self.critic_lr) 87 | updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) 88 | train = K.function([self.critic.input, target], [], updates=updates) 89 | 90 | return train 91 | 92 | # 각 타임스텝마다 정책신경망과 가치신경망을 업데이트 93 | def train_model(self, state, action, reward, next_state, done): 94 | value = self.critic.predict(state)[0] 95 | next_value = self.critic.predict(next_state)[0] 96 | 97 | act = np.zeros([1, self.action_size]) 98 | act[0][action] = 1 99 | 100 | # 벨만 기대 방정식를 이용한 어드벤티지와 업데이트 타깃 101 | if done: 102 | advantage = reward - value 103 | target = [reward] 104 | else: 105 | advantage = (reward + self.discount_factor * next_value) - value 106 | 
target = reward + self.discount_factor * next_value 107 | 108 | self.actor_updater([state, act, advantage]) 109 | self.critic_updater([state, target]) 110 | 111 | 112 | if __name__ == "__main__": 113 | # CartPole-v1 환경, 최대 타임스텝 수가 500 114 | env = gym.make('CartPole-v1') 115 | # 환경으로부터 상태와 행동의 크기를 받아옴 116 | state_size = env.observation_space.shape[0] 117 | action_size = env.action_space.n 118 | 119 | # 액터-크리틱(A2C) 에이전트 생성 120 | agent = A2CAgent(state_size, action_size) 121 | 122 | scores, episodes = [], [] 123 | 124 | for e in range(EPISODES): 125 | done = False 126 | score = 0 127 | state = env.reset() 128 | state = np.reshape(state, [1, state_size]) 129 | 130 | while not done: 131 | if agent.render: 132 | env.render() 133 | 134 | action = agent.get_action(state) 135 | next_state, reward, done, info = env.step(action) 136 | next_state = np.reshape(next_state, [1, state_size]) 137 | # 에피소드가 중간에 끝나면 -100 보상 138 | reward = reward if not done or score == 499 else -100 139 | 140 | agent.train_model(state, action, reward, next_state, done) 141 | 142 | score += reward 143 | state = next_state 144 | 145 | if done: 146 | # 에피소드마다 학습 결과 출력 147 | score = score if score == 500.0 else score + 100 148 | scores.append(score) 149 | episodes.append(e) 150 | pylab.plot(episodes, scores, 'b') 151 | pylab.savefig("./save_graph/cartpole_a2c.png") 152 | print("episode:", e, " score:", score) 153 | 154 | # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단 155 | if np.mean(scores[-min(10, len(scores)):]) > 490: 156 | agent.actor.save_weights("./save_model/cartpole_actor.h5") 157 | agent.critic.save_weights( 158 | "./save_model/cartpole_critic.h5") 159 | sys.exit() -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/4 - Actor-Critic/save_graph/cartpole_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/4 - Actor-Critic/save_graph/cartpole_a2c.png -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_actor.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_actor_trained.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_actor_trained.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_critic.h5 -------------------------------------------------------------------------------- /2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_critic_trained.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/2 - Day 2/4 - Actor-Critic/save_model/cartpole_critic_trained.h5 -------------------------------------------------------------------------------- /2 - Examples/Images/circle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/circle.png -------------------------------------------------------------------------------- /2 - Examples/Images/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/down.png -------------------------------------------------------------------------------- /2 - Examples/Images/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/left.png -------------------------------------------------------------------------------- /2 - Examples/Images/rectangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/rectangle.png -------------------------------------------------------------------------------- /2 - Examples/Images/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/right.png -------------------------------------------------------------------------------- /2 - Examples/Images/triangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/triangle.png -------------------------------------------------------------------------------- /2 - Examples/Images/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/2 - Examples/Images/up.png -------------------------------------------------------------------------------- /3 - Assignments/1 - Day 1 (Maze for SARSA and Q-learning)/Problems/2019 OSS Summer - Maze Specification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/3 - Assignments/1 - Day 1 (Maze for SARSA and Q-learning)/Problems/2019 OSS Summer - Maze Specification.pdf -------------------------------------------------------------------------------- /3 - Assignments/1 - Day 1 (Maze for SARSA and Q-learning)/Problems/assignment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | from collections import defaultdict 5 | import gym 6 | import environment 7 | 8 | env = gym.make('maze-5x5-v0') 9 | 10 | # Reference code: 
https://github.com/suhoy901/Reinforcement_Learning/blob/master/05.maze_sarsa/sarsa_basic.py 11 | 12 | # State 의 boundary 13 | STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high)) 14 | # Maze의 size (10, 10) 15 | NUM_GRID = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)) 16 | 17 | class Agent: 18 | def __init__(self, actions): 19 | self.actions = actions 20 | self.discount_factor = 0.9 # 감가율 21 | self.epsilon = 0.1 # 엡실론 22 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 23 | 24 | # 의 샘플로부터 큐함수를 업데이트 25 | def learn(self, state, action, reward, next_state, next_action): 26 | # TODO: 큐함수를 업데이트 하는 코드를 작성 27 | # self.discount_factor와 self.q_table을 이용하세요. 28 | 29 | # 구현을 완료했다면 아래 pass는 지우셔도 됩니다. 30 | pass 31 | 32 | # 입실론 탐욕 정책에 따라서 행동을 반환하는 메소드입니다. 33 | def get_action(self, state): 34 | # TODO: ε-탐욕 정책 코드를 작성 35 | # self.epsilon을 이용하세요. 36 | 37 | action = np.random.choice(self.actions) 38 | 39 | return int(action) 40 | 41 | # 범위 밖으로 나간 state를 다시 maze안으로 넣어주는 코드 42 | def state_to_bucket(state): 43 | bucket_indice = [] 44 | for i in range(len(state)): 45 | if state[i] <= STATE_BOUNDS[i][0]: 46 | bucket_index = 0 47 | elif state[i] >= STATE_BOUNDS[i][1]: 48 | bucket_index = NUM_GRID[i] - 1 49 | else: 50 | # Mapping the state bounds to the bucket array 51 | bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0] 52 | offset = (NUM_GRID[i] - 1) * STATE_BOUNDS[i][0] / bound_width 53 | scaling = (NUM_GRID[i] - 1) / bound_width 54 | bucket_index = int(round(scaling * state[i] - offset)) 55 | bucket_indice.append(bucket_index) 56 | return tuple(bucket_indice) 57 | 58 | 59 | if __name__ == "__main__": 60 | env.reset() 61 | agent = Agent(actions=list(range(env.action_space.n))) 62 | scores = [] 63 | episodes = [] 64 | 65 | for episode in range(250): 66 | state = env.reset() 67 | state = state_to_bucket(state) 68 | action = agent.get_action(state) 69 | total_reward = 0 70 | 71 | while True: 72 | env.render() 73 | 74 | next_state, reward, done, _ = env.step(action) 75 | next_state = state_to_bucket(next_state) 76 | next_action = agent.get_action(next_state) 77 | 78 | agent.learn(str(state), action, reward, str(next_state), next_action) 79 | total_reward += reward 80 | state = next_state 81 | action = next_action 82 | 83 | if done: 84 | print("Episode : %d total reward = %f . 
" % (episode, total_reward)) 85 | episodes.append(episode) 86 | scores.append(total_reward) 87 | 88 | break -------------------------------------------------------------------------------- /3 - Assignments/1 - Day 1 (Maze for SARSA and Q-learning)/Problems/environment.py: -------------------------------------------------------------------------------- 1 | import pygame 2 | import random 3 | import os 4 | import numpy as np 5 | 6 | import gym 7 | from gym import error, spaces, utils 8 | from gym.utils import seeding 9 | from gym.envs.registration import register 10 | 11 | class MazeView2D: 12 | def __init__(self, maze_name="Maze2D", maze_file_path=None, 13 | maze_size=(30, 30), screen_size=(600, 600), 14 | has_loops=False, num_portals=3, enable_render=True): 15 | pygame.init() 16 | pygame.display.set_caption(maze_name) 17 | self.clock = pygame.time.Clock() 18 | self.__game_over = False 19 | self.__enable_render = enable_render 20 | 21 | self.__maze = Maze(maze_cells=Maze.load_maze()) 22 | 23 | self.maze_size = self.__maze.maze_size 24 | if self.__enable_render is True: 25 | self.screen = pygame.display.set_mode(screen_size) 26 | self.__screen_size = tuple(map(sum, zip(screen_size, (-1, -1)))) 27 | 28 | self.__entrance = np.zeros(2, dtype=int) 29 | 30 | self.__goal = np.array(self.maze_size) - np.array((1, 1)) 31 | 32 | self.__robot = self.entrance 33 | 34 | if self.__enable_render is True: 35 | self.background = pygame.Surface(self.screen.get_size()).convert() 36 | self.background.fill((255, 255, 255)) 37 | 38 | self.maze_layer = pygame.Surface(self.screen.get_size()).convert_alpha() 39 | self.maze_layer.fill((0, 0, 0, 0,)) 40 | 41 | self.__draw_maze() 42 | self.__draw_portals() 43 | self.__draw_robot() 44 | self.__draw_entrance() 45 | self.__draw_goal() 46 | 47 | def update(self, mode="human"): 48 | try: 49 | img_output = self.__view_update(mode) 50 | self.__controller_update() 51 | except Exception as e: 52 | self.__game_over = True 53 | self.quit_game() 54 | raise e 55 | else: 56 | return img_output 57 | 58 | def quit_game(self): 59 | try: 60 | self.__game_over = True 61 | if self.__enable_render is True: 62 | pygame.display.quit() 63 | pygame.quit() 64 | except Exception: 65 | pass 66 | 67 | def move_robot(self, dir): 68 | if dir not in self.__maze.COMPASS.keys(): 69 | raise ValueError("dir cannot be %s. The only valid dirs are %s." 
70 | % (str(dir), str(self.__maze.COMPASS.keys()))) 71 | 72 | if self.__maze.is_open(self.__robot, dir): 73 | self.__draw_robot(transparency=0) 74 | 75 | self.__robot += np.array(self.__maze.COMPASS[dir]) 76 | if self.maze.is_portal(self.robot): 77 | self.__robot = np.array(self.maze.get_portal(tuple(self.robot)).teleport(tuple(self.robot))) 78 | self.__draw_robot(transparency=255) 79 | 80 | def reset_robot(self): 81 | 82 | self.__draw_robot(transparency=0) 83 | self.__robot = np.zeros(2, dtype=int) 84 | self.__draw_robot(transparency=255) 85 | 86 | def __controller_update(self): 87 | if not self.__game_over: 88 | for event in pygame.event.get(): 89 | if event.type == pygame.QUIT: 90 | self.__game_over = True 91 | self.quit_game() 92 | 93 | def __view_update(self, mode="human"): 94 | if not self.__game_over: 95 | self.__draw_entrance() 96 | self.__draw_goal() 97 | self.__draw_portals() 98 | self.__draw_robot() 99 | 100 | self.screen.blit(self.background, (0, 0)) 101 | self.screen.blit(self.maze_layer,(0, 0)) 102 | 103 | if mode == "human": 104 | pygame.display.flip() 105 | 106 | return np.flipud(np.rot90(pygame.surfarray.array3d(pygame.display.get_surface()))) 107 | 108 | def __draw_maze(self): 109 | if self.__enable_render is False: 110 | return 111 | 112 | line_colour = (0, 0, 0, 255) 113 | 114 | for y in range(self.maze.MAZE_H + 1): 115 | pygame.draw.line(self.maze_layer, line_colour, (0, y * self.CELL_H), 116 | (self.SCREEN_W, y * self.CELL_H)) 117 | 118 | for x in range(self.maze.MAZE_W + 1): 119 | pygame.draw.line(self.maze_layer, line_colour, (x * self.CELL_W, 0), 120 | (x * self.CELL_W, self.SCREEN_H)) 121 | 122 | for x in range(len(self.maze.maze_cells)): 123 | for y in range (len(self.maze.maze_cells[x])): 124 | walls_status = self.maze.get_walls_status(self.maze.maze_cells[x, y]) 125 | dirs = "" 126 | for dir, open in walls_status.items(): 127 | if open: 128 | dirs += dir 129 | self.__cover_walls(x, y, dirs) 130 | 131 | def __cover_walls(self, x, y, dirs, colour=(0, 0, 255, 15)): 132 | if self.__enable_render is False: 133 | return 134 | 135 | dx = x * self.CELL_W 136 | dy = y * self.CELL_H 137 | 138 | if not isinstance(dirs, str): 139 | raise TypeError("dirs must be a str.") 140 | 141 | for dir in dirs: 142 | if dir == "S": 143 | line_head = (dx + 1, dy + self.CELL_H) 144 | line_tail = (dx + self.CELL_W - 1, dy + self.CELL_H) 145 | elif dir == "N": 146 | line_head = (dx + 1, dy) 147 | line_tail = (dx + self.CELL_W - 1, dy) 148 | elif dir == "W": 149 | line_head = (dx, dy + 1) 150 | line_tail = (dx, dy + self.CELL_H - 1) 151 | elif dir == "E": 152 | line_head = (dx + self.CELL_W, dy + 1) 153 | line_tail = (dx + self.CELL_W, dy + self.CELL_H - 1) 154 | else: 155 | raise ValueError("The only valid directions are (N, S, E, W).") 156 | 157 | pygame.draw.line(self.maze_layer, colour, line_head, line_tail) 158 | 159 | def __draw_robot(self, colour=(0, 0, 150), transparency=255): 160 | if self.__enable_render is False: 161 | return 162 | 163 | x = int(self.__robot[0] * self.CELL_W + self.CELL_W * 0.5 + 0.5) 164 | y = int(self.__robot[1] * self.CELL_H + self.CELL_H * 0.5 + 0.5) 165 | r = int(min(self.CELL_W, self.CELL_H)/5 + 0.5) 166 | 167 | pygame.draw.circle(self.maze_layer, colour + (transparency,), (x, y), r) 168 | 169 | def __draw_entrance(self, colour=(0, 0, 150), transparency=235): 170 | self.__colour_cell(self.entrance, colour=colour, transparency=transparency) 171 | 172 | def __draw_goal(self, colour=(150, 0, 0), transparency=235): 173 | self.__colour_cell(self.goal, 
colour=colour, transparency=transparency) 174 | 175 | def __draw_portals(self, transparency=160): 176 | if self.__enable_render is False: 177 | return 178 | 179 | colour_range = np.linspace(0, 255, len(self.maze.portals), dtype=int) 180 | colour_i = 0 181 | for portal in self.maze.portals: 182 | colour = ((100 - colour_range[colour_i])% 255, colour_range[colour_i], 0) 183 | colour_i += 1 184 | for location in portal.locations: 185 | self.__colour_cell(location, colour=colour, transparency=transparency) 186 | 187 | def __colour_cell(self, cell, colour, transparency): 188 | if self.__enable_render is False: 189 | return 190 | 191 | if not (isinstance(cell, (list, tuple, np.ndarray)) and len(cell) == 2): 192 | raise TypeError("cell must a be a tuple, list, or numpy array of size 2") 193 | 194 | x = int(cell[0] * self.CELL_W + 0.5 + 1) 195 | y = int(cell[1] * self.CELL_H + 0.5 + 1) 196 | w = int(self.CELL_W + 0.5 - 1) 197 | h = int(self.CELL_H + 0.5 - 1) 198 | pygame.draw.rect(self.maze_layer, colour + (transparency,), (x, y, w, h)) 199 | 200 | @property 201 | def maze(self): 202 | return self.__maze 203 | 204 | @property 205 | def robot(self): 206 | return self.__robot 207 | 208 | @property 209 | def entrance(self): 210 | return self.__entrance 211 | 212 | @property 213 | def goal(self): 214 | return self.__goal 215 | 216 | @property 217 | def game_over(self): 218 | return self.__game_over 219 | 220 | @property 221 | def SCREEN_SIZE(self): 222 | return tuple(self.__screen_size) 223 | 224 | @property 225 | def SCREEN_W(self): 226 | return int(self.SCREEN_SIZE[0]) 227 | 228 | @property 229 | def SCREEN_H(self): 230 | return int(self.SCREEN_SIZE[1]) 231 | 232 | @property 233 | def CELL_W(self): 234 | return float(self.SCREEN_W) / float(self.maze.MAZE_W) 235 | 236 | @property 237 | def CELL_H(self): 238 | return float(self.SCREEN_H) / float(self.maze.MAZE_H) 239 | 240 | class Maze: 241 | COMPASS = { 242 | "N": (0, -1), 243 | "E": (1, 0), 244 | "S": (0, 1), 245 | "W": (-1, 0) 246 | } 247 | 248 | def __init__(self, maze_cells=None, maze_size=(10, 10), has_loops=True, num_portals=3): 249 | self.maze_cells = maze_cells 250 | self.has_loops = has_loops 251 | self.__portals_dict = dict() 252 | self.__portals = [] 253 | self.num_portals = num_portals 254 | 255 | if self.maze_cells is not None: 256 | if isinstance(self.maze_cells, (np.ndarray, np.generic)) and len(self.maze_cells.shape) == 2: 257 | self.maze_size = tuple(maze_cells.shape) 258 | else: 259 | raise ValueError("maze_cells must be a 2D NumPy array.") 260 | 261 | 262 | 263 | def make_portal(*locations): 264 | portal = Portal(*locations) 265 | self.__portals.append(portal) 266 | 267 | for location in locations: 268 | self.__portals_dict[location] = portal 269 | 270 | make_portal((6, 2), (4, 3)) 271 | make_portal((1, 7), (6, 7)) 272 | make_portal((6, 1), (0, 4)) 273 | else: 274 | if not (isinstance(maze_size, (list, tuple)) and len(maze_size) == 2): 275 | raise ValueError("maze_size must be a tuple: (width, height).") 276 | self.maze_size = maze_size 277 | 278 | self._generate_maze() 279 | 280 | @classmethod 281 | def load_maze(cls): 282 | return np.array( 283 | [[ 2, 4, 4, 4, 4, 4, 2, 1, 2, 1], 284 | [ 4, 8, 10, 1, 1, 1, 6, 8, 1, 9], 285 | [ 8, 5, 2, 2, 1, 1, 1, 4, 12, 8], 286 | [ 2, 9, 2, 3, 4, 6, 4, 8, 4, 8], 287 | [ 2, 9, 1, 4, 4, 4, 2, 8, 8, 1], 288 | [ 4, 10, 9, 4, 4, 2, 3, 8, 4, 8], 289 | [ 8, 4, 8, 2, 1, 1, 6, 8, 8, 3], 290 | [ 8, 4, 4, 4, 4, 2, 4, 2, 8, 2], 291 | [ 8, 8, 2, 1, 1, 3, 1, 6, 8, 3], 292 | [ 8, 1, 5, 4, 4, 4, 8, 1, 1, 1]] 
293 | ) 294 | 295 | def _generate_maze(self): 296 | self.maze_cells = np.zeros(self.maze_size, dtype=int) 297 | 298 | current_cell = (random.randint(0, self.MAZE_W-1), random.randint(0, self.MAZE_H-1)) 299 | num_cells_visited = 1 300 | cell_stack = [current_cell] 301 | 302 | while cell_stack: 303 | current_cell = cell_stack.pop() 304 | x0, y0 = current_cell 305 | 306 | neighbours = dict() 307 | for dir_key, dir_val in self.COMPASS.items(): 308 | x1 = x0 + dir_val[0] 309 | y1 = y0 + dir_val[1] 310 | 311 | if 0 <= x1 < self.MAZE_W and 0 <= y1 < self.MAZE_H: 312 | if self.all_walls_intact(self.maze_cells[x1, y1]): 313 | neighbours[dir_key] = (x1, y1) 314 | 315 | if neighbours: 316 | dir = random.choice(tuple(neighbours.keys())) 317 | x1, y1 = neighbours[dir] 318 | 319 | self.maze_cells[x1, y1] = self.__break_walls(self.maze_cells[x1, y1], self.__get_opposite_wall(dir)) 320 | 321 | cell_stack.append(current_cell) 322 | 323 | cell_stack.append((x1, y1)) 324 | 325 | num_cells_visited += 1 326 | 327 | if self.has_loops: 328 | self.__break_random_walls(0.2) 329 | 330 | def __break_random_walls(self, percent): 331 | num_cells = int(round(self.MAZE_H*self.MAZE_W*percent)) 332 | cell_ids = random.sample(range(self.MAZE_W*self.MAZE_H), num_cells) 333 | 334 | for cell_id in cell_ids: 335 | x = cell_id % self.MAZE_H 336 | y = int(cell_id/self.MAZE_H) 337 | 338 | dirs = random.sample(list(self.COMPASS.keys()), len(self.COMPASS)) 339 | for dir in dirs: 340 | if self.is_breakable((x, y), dir): 341 | self.maze_cells[x, y] = self.__break_walls(self.maze_cells[x, y], dir) 342 | break 343 | 344 | def __set_random_portals(self, num_portal_sets, set_size=2): 345 | num_portal_sets = int(num_portal_sets) 346 | set_size = int(set_size) 347 | 348 | max_portal_sets = int(self.MAZE_W * self.MAZE_H / set_size) 349 | num_portal_sets = min(max_portal_sets, num_portal_sets) 350 | 351 | cell_ids = random.sample(range(1, self.MAZE_W * self.MAZE_H - 1), num_portal_sets*set_size) 352 | 353 | for i in range(num_portal_sets): 354 | portal_cell_ids = random.sample(cell_ids, set_size) 355 | portal_locations = [] 356 | for portal_cell_id in portal_cell_ids: 357 | cell_ids.pop(cell_ids.index(portal_cell_id)) 358 | x = portal_cell_id % self.MAZE_H 359 | y = int(portal_cell_id / self.MAZE_H) 360 | portal_locations.append((x,y)) 361 | portal = Portal(*portal_locations) 362 | self.__portals.append(portal) 363 | 364 | for portal_location in portal_locations: 365 | self.__portals_dict[portal_location] = portal 366 | 367 | def is_open(self, cell_id, dir): 368 | x1 = cell_id[0] + self.COMPASS[dir][0] 369 | y1 = cell_id[1] + self.COMPASS[dir][1] 370 | 371 | if self.is_within_bound(x1, y1): 372 | this_wall = bool(self.get_walls_status(self.maze_cells[cell_id[0], cell_id[1]])[dir]) 373 | other_wall = bool(self.get_walls_status(self.maze_cells[x1, y1])[self.__get_opposite_wall(dir)]) 374 | return this_wall or other_wall 375 | return False 376 | 377 | def is_breakable(self, cell_id, dir): 378 | x1 = cell_id[0] + self.COMPASS[dir][0] 379 | y1 = cell_id[1] + self.COMPASS[dir][1] 380 | 381 | return not self.is_open(cell_id, dir) and self.is_within_bound(x1, y1) 382 | 383 | def is_within_bound(self, x, y): 384 | return 0 <= x < self.MAZE_W and 0 <= y < self.MAZE_H 385 | 386 | def is_portal(self, cell): 387 | return tuple(cell) in self.__portals_dict 388 | 389 | @property 390 | def portals(self): 391 | return tuple(self.__portals) 392 | 393 | def get_portal(self, cell): 394 | if cell in self.__portals_dict: 395 | return self.__portals_dict[cell] 396 
| return None 397 | 398 | @property 399 | def MAZE_W(self): 400 | return int(self.maze_size[0]) 401 | 402 | @property 403 | def MAZE_H(self): 404 | return int(self.maze_size[1]) 405 | 406 | @classmethod 407 | def get_walls_status(cls, cell): 408 | walls = { 409 | "N" : (cell & 0x1) >> 0, 410 | "E" : (cell & 0x2) >> 1, 411 | "S" : (cell & 0x4) >> 2, 412 | "W" : (cell & 0x8) >> 3, 413 | } 414 | return walls 415 | 416 | @classmethod 417 | def all_walls_intact(cls, cell): 418 | return cell & 0xF == 0 419 | 420 | @classmethod 421 | def num_walls_broken(cls, cell): 422 | walls = cls.get_walls_status(cell) 423 | num_broken = 0 424 | for wall_broken in walls.values(): 425 | num_broken += wall_broken 426 | return num_broken 427 | 428 | @classmethod 429 | def __break_walls(cls, cell, dirs): 430 | if "N" in dirs: 431 | cell |= 0x1 432 | if "E" in dirs: 433 | cell |= 0x2 434 | if "S" in dirs: 435 | cell |= 0x4 436 | if "W" in dirs: 437 | cell |= 0x8 438 | return cell 439 | 440 | @classmethod 441 | def __get_opposite_wall(cls, dirs): 442 | 443 | if not isinstance(dirs, str): 444 | raise TypeError("dirs must be a str.") 445 | 446 | opposite_dirs = "" 447 | 448 | for dir in dirs: 449 | if dir == "N": 450 | opposite_dir = "S" 451 | elif dir == "S": 452 | opposite_dir = "N" 453 | elif dir == "E": 454 | opposite_dir = "W" 455 | elif dir == "W": 456 | opposite_dir = "E" 457 | else: 458 | raise ValueError("The only valid directions are (N, S, E, W).") 459 | 460 | opposite_dirs += opposite_dir 461 | 462 | return opposite_dirs 463 | 464 | class Portal: 465 | def __init__(self, *locations): 466 | self.__locations = [] 467 | for location in locations: 468 | if isinstance(location, (tuple, list)): 469 | self.__locations.append(tuple(location)) 470 | else: 471 | raise ValueError("location must be a list or a tuple.") 472 | 473 | def teleport(self, cell): 474 | if cell in self.locations: 475 | return self.locations[(self.locations.index(cell) + 1) % len(self.locations)] 476 | return cell 477 | 478 | def get_index(self, cell): 479 | return self.locations.index(cell) 480 | 481 | @property 482 | def locations(self): 483 | return self.__locations 484 | 485 | class MazeEnv(gym.Env): 486 | metadata = { 487 | "render.modes": ["human", "rgb_array"], 488 | } 489 | 490 | ACTION = ["N", "S", "E", "W"] 491 | 492 | def __init__(self, maze_file=None, maze_size=(10, 10), mode=None, enable_render=True): 493 | self.viewer = None 494 | self.enable_render = enable_render 495 | 496 | has_loops = True 497 | num_portals = 3 498 | 499 | self.maze_view = MazeView2D(maze_name="OpenAI Gym - Maze (%d x %d)" % maze_size, 500 | maze_size=maze_size, screen_size=(640, 640), 501 | has_loops=has_loops, num_portals=num_portals, 502 | enable_render=enable_render) 503 | 504 | self.maze_size = self.maze_view.maze_size 505 | 506 | self.action_space = spaces.Discrete(2*len(self.maze_size)) 507 | 508 | low = np.zeros(len(self.maze_size), dtype=int) 509 | high = np.array(self.maze_size, dtype=int) - np.ones(len(self.maze_size), dtype=int) 510 | self.observation_space = spaces.Box(low, high, dtype=np.int64) 511 | 512 | self.state = None 513 | self.steps_beyond_done = None 514 | 515 | self.seed() 516 | self.reset() 517 | 518 | self.configure() 519 | 520 | def destroy(self): 521 | pass 522 | 523 | def __del__(self): 524 | if self.enable_render is True: 525 | self.maze_view.quit_game() 526 | 527 | def configure(self, display=None): 528 | self.display = display 529 | 530 | def seed(self, seed=None): 531 | self.np_random, seed = seeding.np_random(seed) 532 | 
return [seed] 533 | 534 | def step(self, action): 535 | if isinstance(action, int): 536 | self.maze_view.move_robot(self.ACTION[action]) 537 | else: 538 | self.maze_view.move_robot(action) 539 | 540 | if np.array_equal(self.maze_view.robot, self.maze_view.goal): 541 | reward = 1 542 | done = True 543 | else: 544 | reward = -0.1/(self.maze_size[0]*self.maze_size[1]) 545 | done = False 546 | 547 | self.state = self.maze_view.robot 548 | 549 | info = {} 550 | 551 | return self.state, reward, done, info 552 | 553 | def reset(self): 554 | self.maze_view.reset_robot() 555 | self.state = np.zeros(2) 556 | self.steps_beyond_done = None 557 | self.done = False 558 | return self.state 559 | 560 | def is_game_over(self): 561 | return self.maze_view.game_over 562 | 563 | def render(self, mode="human", close=False): 564 | if close: 565 | self.maze_view.quit_game() 566 | 567 | return self.maze_view.update(mode) 568 | 569 | register( 570 | id='maze-5x5-v0', 571 | entry_point='environment:MazeEnv', 572 | max_episode_steps=10000, 573 | nondeterministic=True, 574 | ) 575 | -------------------------------------------------------------------------------- /3 - Assignments/1 - Day 1 (Maze for SARSA and Q-learning)/Solutions/maze_q_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | from collections import defaultdict 5 | import gym 6 | import environment 7 | import time 8 | 9 | env = gym.make('maze-5x5-v0') 10 | 11 | # State 의 boundary 12 | STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high)) 13 | # Maze의 size (10, 10) 14 | NUM_GRID = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)) 15 | 16 | class Agent: 17 | def __init__(self, actions): 18 | self.actions = actions 19 | self.discount_factor = 0.9 # 감가율 20 | self.epsilon = 0.1 # 엡실론 21 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) # 큐테이블 22 | 23 | # 의 샘플로부터 큐함수를 업데이트 24 | def learn(self, state, action, reward, next_state, next_action): 25 | q_1 = self.q_table[state][action] 26 | # 벨만 최적 방정식을 사용한 큐함수의 업데이트 27 | q_2 = reward + self.discount_factor * max(self.q_table[next_state]) 28 | self.q_table[state][action] += 0.1 * (q_2 - q_1) 29 | 30 | # 입실론 탐욕 정책에 따라서 행동을 반환하는 메소드입니다. 
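    # (Added worked example for learn() above; the Q-values are hypothetical.)
    # Q-learning bootstraps from the best next action regardless of the action
    # the agent actually takes next (off-policy), with step size 0.1 and
    # discount factor 0.9 as set above:
    #   Q(s,a) <- Q(s,a) + 0.1 * (r + 0.9 * max_a' Q(s',a') - Q(s,a))
    # e.g. with Q(s,a) = 0.5, a step reward of -0.001 and max_a' Q(s',a') = 0.8:
    #   Q(s,a) <- 0.5 + 0.1 * (-0.001 + 0.9 * 0.8 - 0.5) = 0.5219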
31 | def get_action(self, state): 32 | if np.random.rand() < self.epsilon: 33 | # 무작위 행동 반환 34 | action = np.random.choice(self.actions) 35 | else: 36 | # 큐함수에 따른 행동 반환 37 | state_action = self.q_table[str(state)] 38 | action = self.arg_max(state_action) 39 | return int(action) 40 | 41 | @staticmethod 42 | def arg_max(state_action): 43 | max_index_list = [] 44 | max_value = state_action[0] 45 | for index, value in enumerate(state_action): 46 | if value > max_value: 47 | max_index_list.clear() 48 | max_value = value 49 | max_index_list.append(index) 50 | elif value == max_value: 51 | max_index_list.append(index) 52 | return random.choice(max_index_list) 53 | 54 | # 범위 밖으로 나간 state를 다시 maze안으로 넣어주는 코드 55 | def state_to_bucket(state): 56 | bucket_indice = [] 57 | for i in range(len(state)): 58 | if state[i] <= STATE_BOUNDS[i][0]: 59 | bucket_index = 0 60 | elif state[i] >= STATE_BOUNDS[i][1]: 61 | bucket_index = NUM_GRID[i] - 1 62 | else: 63 | # Mapping the state bounds to the bucket array 64 | bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0] 65 | offset = (NUM_GRID[i] - 1) * STATE_BOUNDS[i][0] / bound_width 66 | scaling = (NUM_GRID[i] - 1) / bound_width 67 | bucket_index = int(round(scaling * state[i] - offset)) 68 | bucket_indice.append(bucket_index) 69 | return tuple(bucket_indice) 70 | 71 | 72 | if __name__ == "__main__": 73 | env.reset() 74 | agent = Agent(actions=list(range(env.action_space.n))) 75 | scores = [] 76 | episodes = [] 77 | 78 | for episode in range(250): 79 | state = env.reset() 80 | state = state_to_bucket(state) 81 | action = agent.get_action(state) 82 | total_reward = 0 83 | 84 | while True: 85 | env.render() 86 | 87 | next_state, reward, done, _ = env.step(action) 88 | next_state = state_to_bucket(next_state) 89 | next_action = agent.get_action(next_state) 90 | 91 | agent.learn(str(state), action, reward, str(next_state), next_action) 92 | total_reward += reward 93 | state = next_state 94 | action = next_action 95 | 96 | if done: 97 | print("Episode : %d total reward = %f . " % (episode, total_reward)) 98 | episodes.append(episode) 99 | scores.append(total_reward) 100 | 101 | break -------------------------------------------------------------------------------- /3 - Assignments/1 - Day 1 (Maze for SARSA and Q-learning)/Solutions/maze_sarsa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | from collections import defaultdict 5 | import gym 6 | import environment 7 | import time 8 | 9 | env = gym.make('maze-5x5-v0') 10 | 11 | # State 의 boundary 12 | STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high)) 13 | # Maze의 size (10, 10) 14 | NUM_GRID = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)) 15 | 16 | class Agent: 17 | def __init__(self, actions): 18 | self.actions = actions 19 | self.discount_factor = 0.9 # 감가율 20 | self.epsilon = 0.1 # 엡실론 21 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) # 큐테이블 22 | 23 | # 의 샘플로부터 큐함수를 업데이트 24 | def learn(self, state, action, reward, next_state, next_action): 25 | current_q = self.q_table[state][action] 26 | next_state_q = self.q_table[next_state][next_action] 27 | new_q = (current_q + 0.2 * 28 | (reward + self.discount_factor * next_state_q - current_q)) 29 | self.q_table[state][action] = new_q 30 | 31 | # 입실론 탐욕 정책에 따라서 행동을 반환하는 메소드입니다. 
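    # (Added worked example for learn() above; the Q-values are hypothetical.)
    # SARSA bootstraps from the action the agent will actually take next
    # (on-policy); this solution uses step size 0.2 and discount factor 0.9:
    #   Q(s,a) <- Q(s,a) + 0.2 * (r + 0.9 * Q(s',a') - Q(s,a))
    # e.g. with Q(s,a) = 0.5, a step reward of -0.001 and Q(s',a') = 0.8,
    # where a' is chosen by the same epsilon-greedy policy:
    #   Q(s,a) <- 0.5 + 0.2 * (-0.001 + 0.9 * 0.8 - 0.5) = 0.5438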
32 | def get_action(self, state): 33 | if np.random.rand() < self.epsilon: 34 | # 무작위 행동 반환 35 | action = np.random.choice(self.actions) 36 | else: 37 | # 큐함수에 따른 행동 반환 38 | state_action = self.q_table[str(state)] 39 | action = self.arg_max(state_action) 40 | return int(action) 41 | 42 | @staticmethod 43 | def arg_max(state_action): 44 | max_index_list = [] 45 | max_value = state_action[0] 46 | for index, value in enumerate(state_action): 47 | if value > max_value: 48 | max_index_list.clear() 49 | max_value = value 50 | max_index_list.append(index) 51 | elif value == max_value: 52 | max_index_list.append(index) 53 | return random.choice(max_index_list) 54 | 55 | # 범위 밖으로 나간 state를 다시 maze안으로 넣어주는 코드 56 | def state_to_bucket(state): 57 | bucket_indice = [] 58 | for i in range(len(state)): 59 | if state[i] <= STATE_BOUNDS[i][0]: 60 | bucket_index = 0 61 | elif state[i] >= STATE_BOUNDS[i][1]: 62 | bucket_index = NUM_GRID[i] - 1 63 | else: 64 | # Mapping the state bounds to the bucket array 65 | bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0] 66 | offset = (NUM_GRID[i] - 1) * STATE_BOUNDS[i][0] / bound_width 67 | scaling = (NUM_GRID[i] - 1) / bound_width 68 | bucket_index = int(round(scaling * state[i] - offset)) 69 | bucket_indice.append(bucket_index) 70 | return tuple(bucket_indice) 71 | 72 | 73 | if __name__ == "__main__": 74 | env.reset() 75 | agent = Agent(actions=list(range(env.action_space.n))) 76 | scores = [] 77 | episodes = [] 78 | 79 | for episode in range(250): 80 | state = env.reset() 81 | state = state_to_bucket(state) 82 | action = agent.get_action(state) 83 | total_reward = 0 84 | 85 | while True: 86 | env.render() 87 | 88 | next_state, reward, done, _ = env.step(action) 89 | next_state = state_to_bucket(next_state) 90 | next_action = agent.get_action(next_state) 91 | 92 | agent.learn(str(state), action, reward, str(next_state), next_action) 93 | total_reward += reward 94 | state = next_state 95 | action = next_action 96 | 97 | if done: 98 | print("Episode : %d total reward = %f . 
" % (episode, total_reward)) 99 | episodes.append(episode) 100 | scores.append(total_reward) 101 | 102 | break -------------------------------------------------------------------------------- /3 - Assignments/2 - Day 2 (LunarLander for DQN)/Problems/2019 OSS Summer - LunarLander Specification.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/3 - Assignments/2 - Day 2 (LunarLander for DQN)/Problems/2019 OSS Summer - LunarLander Specification.pdf -------------------------------------------------------------------------------- /3 - Assignments/2 - Day 2 (LunarLander for DQN)/Problems/2019 OSS Summer - SWIG Installation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utilForever/2019-OSS-Summer-RLBasic/df3dabd9820248e69eeb4906e5cfbb9c39a0b931/3 - Assignments/2 - Day 2 (LunarLander for DQN)/Problems/2019 OSS Summer - SWIG Installation.pdf -------------------------------------------------------------------------------- /3 - Assignments/2 - Day 2 (LunarLander for DQN)/Problems/lunarlander_dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import random 5 | import numpy as np 6 | from collections import deque 7 | from keras.layers import Dense 8 | from keras.optimizers import Adam 9 | from keras.models import Sequential 10 | 11 | ''' 12 | 일단 하이퍼파라미터에 None이라고 되어있는 부분 위주로 수정해주세요. (다른 것들 잘못 건드시면 안될수도 있음) 13 | cartpole_dqn.py에 있는 예제 그대로 복사하셔도 됩니다. 14 | 하지만 이것 저것 수정해 보시면서 더 좋은 에이전트를 만들어 보는 것도 좋을 것 같습니다. 15 | ''' 16 | 17 | # 최대로 실행할 에피소드 수를 설정합니다. 18 | EPISODES = 2000 19 | 20 | # 카트폴 예제에서의 DQN 에이전트 21 | class DQNAgent: 22 | def __init__(self, state_size, action_size): 23 | ''' 24 | 구글 colab에서는 아래 render를 True로 만들면 실행이 안됩니다. 25 | ''' 26 | self.render = False 27 | 28 | ''' 29 | 저장해 놓은 신경망 모델을 가져올 지 선택합니다. (lunarlander_trainded.h5) 30 | 훈련을 중간에 중단해 놓았다가 다시 시작하려면 아래를 True로 바꾸고 실행하시면 됩니다. 31 | ''' 32 | self.load_model = False 33 | 34 | # 상태와 행동의 크기 정의 35 | self.state_size = state_size 36 | self.action_size = action_size 37 | 38 | # DQN 하이퍼파라미터 39 | ''' 40 | 일단 None이라고 되어있는 부분 위주로 수정해주세요. (다른 것들 잘못 건드시면 안될수도 있음) 41 | 아래 8개 하이퍼파라미터(maxlen 포함)는 cartpole_dqn 예제 그대로 복사하셔도 되고, 좀 수정하셔도 됩니다. 42 | ''' 43 | self.discount_factor = 0.99 44 | self.learning_rate = None 45 | self.epsilon = 1.0 46 | self.epsilon_decay = None 47 | self.epsilon_min = 0.01 48 | self.batch_size = 64 49 | self.train_start = 10000 50 | 51 | # 리플레이 메모리, 최대 크기 10000 52 | self.memory = deque(maxlen=20000) 53 | 54 | # 모델과 타깃 모델 생성 55 | ''' 56 | 아마 그냥 실행하시면 오류가 날텐데 57 | build_model을 완성하시면 오류가 사라집니다. 58 | ''' 59 | self.model = self.build_model() 60 | self.target_model = self.build_model() 61 | 62 | # 타깃 모델 초기화 63 | self.update_target_model() 64 | 65 | if self.load_model: 66 | self.model.load_weights("lunarlander_trainded.h5") 67 | 68 | # 상태가 입력, 큐함수가 출력인 인공신경망 생성 69 | def build_model(self): 70 | ''' 71 | cartpole_dqn 파일의 예제를 그대로 사용하셔도 되고, 72 | 좀 수정하셔도 됩니다. 73 | 수정하신 뒤에는 아래에 있는 pass를 지워주세요. 74 | ''' 75 | pass 76 | 77 | # 타깃 모델을 모델의 가중치로 업데이트 78 | def update_target_model(self): 79 | self.target_model.set_weights(self.model.get_weights()) 80 | 81 | # 입실론 탐욕 정책으로 행동 선택 82 | def get_action_greedy(self, state): 83 | # 어제 큐함수 구현 과제 내용이니 넘어가겠습니다. 
84 |         if np.random.rand() <= self.epsilon:
85 |             return random.randrange(self.action_size)
86 |         else:
87 |             q_value = self.model.predict(state)
88 |             return np.argmax(q_value[0])
89 |
90 |     # Store a <s, a, r, s'> sample in the replay memory
91 |     def append_sample(self, state, action, reward, next_state, done):
92 |         self.memory.append((state, action, reward, next_state, done))
93 |
94 |     # Train the model on a batch sampled at random from the replay memory
95 |     def train_model(self):
96 |         # Draw a random mini-batch of batch_size samples from the memory
97 |         mini_batch = random.sample(self.memory, self.batch_size)
98 |
99 |         states = np.zeros((self.batch_size, self.state_size))
100 |         next_states = np.zeros((self.batch_size, self.state_size))
101 |         actions, rewards, dones = [], [], []
102 |
103 |         for i in range(self.batch_size):
104 |             states[i] = mini_batch[i][0]
105 |             actions.append(mini_batch[i][1])
106 |             rewards.append(mini_batch[i][2])
107 |             next_states[i] = mini_batch[i][3]
108 |             dones.append(mini_batch[i][4])
109 |
110 |         # Q-values of the model for the current states
111 |         # Q-values of the target model for the next states
112 |         target = self.model.predict(states)
113 |         target_val = self.target_model.predict(next_states)
114 |
115 |         # Update targets based on the Bellman optimality equation
116 |         for i in range(self.batch_size):
117 |             if dones[i]:
118 |                 target[i][actions[i]] = rewards[i]
119 |             else:
120 |                 target[i][actions[i]] = rewards[i] + self.discount_factor * (
121 |                     np.amax(target_val[i]))
122 |
123 |         self.model.fit(states, target, batch_size=self.batch_size,
124 |                        epochs=1, verbose=0)
125 |
126 |
127 | if __name__ == "__main__":
128 |     # LunarLander-v2 environment
129 |     env = gym.make('LunarLander-v2')
130 |     env.seed(0)
131 |
132 |     state_size = env.observation_space.shape[0]
133 |     action_size = env.action_space.n
134 |
135 |     agent = DQNAgent(state_size, action_size)
136 |
137 |     scores, episodes = [], []
138 |
139 |     for e in range(EPISODES):
140 |         done = False
141 |         score = 0
142 |
143 |         # Reset the environment
144 |         state = env.reset()
145 |         state = np.reshape(state, [1, state_size])
146 |
147 |         while not done:
148 |             if agent.render:
149 |                 env.render()
150 |
151 |             # Select an action for the current state
152 |             action = agent.get_action_greedy(state)
153 |
154 |             # Take one timestep in the environment with the selected action
155 |             next_state, reward, done, info = env.step(action)
156 |             next_state = np.reshape(next_state, [1, state_size])
157 |
158 |             # Store the sample in the replay memory
159 |             agent.append_sample(state, action, reward, next_state, done)
160 |
161 |             # Train at every timestep once the replay memory is large enough
162 |             if len(agent.memory) >= agent.train_start:
163 |                 agent.train_model()
164 |
165 |             score += reward
166 |             state = next_state
167 |
168 |             if done:
169 |                 # At the end of each episode, update the target model with the model's weights
170 |                 agent.update_target_model()
171 |
172 |                 # Log training results every episode
173 |                 scores.append(score)
174 |                 episodes.append(e)
175 |                 pylab.plot(episodes, scores, 'b')
176 |                 pylab.savefig("lunarlander_dqn.png")
177 |                 print("episode:", e, " score:", score, " memory length:",
178 |                       len(agent.memory), " epsilon:", agent.epsilon)
179 |
180 |                 if e != 0 and e % 100 == 0:
181 |                     print("Saved!")
182 |                     agent.model.save_weights("lunarlander_trainded.h5")
183 |
184 |                 # Stop training when the mean score of the last 50 episodes exceeds 200
185 |                 if np.mean(scores[-min(50, len(scores)):]) > 200:
186 |                     print("Success!")
187 |                     agent.model.save_weights("lunarlander_trainded.h5")
188 |                     sys.exit()
189 |
190 |         # After the replay memory has been partially filled with random actions,
191 |         # gradually decay epsilon towards epsilon_min
192 |         if len(agent.memory) >= agent.train_start and agent.epsilon > agent.epsilon_min:
193 |             agent.epsilon *= agent.epsilon_decay
--------------------------------------------------------------------------------
/3 - Assignments/2 - Day 2 (LunarLander for DQN)/Solutions/lunarlander_dqn_solve.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import random
5 | import numpy as np
6 | from collections import deque
7 | from keras.layers import Dense
8 | from keras.optimizers import Adam
9 | from keras.models import Sequential
10 |
11 | EPISODES = 2000
12 |
13 | # DQN agent (based on the CartPole example)
14 | class DQNAgent:
15 |     def __init__(self, state_size, action_size):
16 |         self.render = True
17 |         self.load_model = True
18 |
19 |         # Sizes of the state and action spaces
20 |         self.state_size = state_size
21 |         self.action_size = action_size
22 |
23 |         # DQN hyperparameters
24 |         self.discount_factor = 0.99
25 |         self.learning_rate = 0.0005
26 |         self.epsilon = 1.0
27 |         self.epsilon_decay = 0.99
28 |         self.epsilon_min = 0.01
29 |         self.batch_size = 64
30 |         self.train_start = 12000
31 |
32 |         # Replay memory, max size 20000
33 |         self.memory = deque(maxlen=20000)
34 |
35 |         # Create the model and the target model
36 |         self.model = self.build_model()
37 |         self.target_model = self.build_model()
38 |
39 |         # Initialize the target model
40 |         self.update_target_model()
41 |
42 |         if self.load_model:
43 |             self.model.load_weights("lunarlander_trainded.h5")
44 |
45 |     # Build a neural network with the state as input and Q-values as output
46 |     def build_model(self):
47 |         model = Sequential()
48 |         model.add(Dense(64, input_dim=self.state_size, activation='relu',
49 |                         kernel_initializer='he_uniform'))
50 |         model.add(Dense(64, activation='relu',
51 |                         kernel_initializer='he_uniform'))
52 |         model.add(Dense(self.action_size, activation='linear',
53 |                         kernel_initializer='he_uniform'))
54 |         model.summary()
55 |         model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
56 |         return model
57 |
58 |     # Update the target model with the weights of the model
59 |     def update_target_model(self):
60 |         self.target_model.set_weights(self.model.get_weights())
61 |
62 |     # Select an action with the epsilon-greedy policy
63 |     def get_action_greedy(self, state):
64 |         if np.random.rand() <= self.epsilon:
65 |             return random.randrange(self.action_size)
66 |         else:
67 |             q_value = self.model.predict(state)
68 |             return np.argmax(q_value[0])
69 |
70 |     # Take the argmax directly, without epsilon exploration
71 |     def get_action_playing(self, state):
72 |         q_value = self.model.predict(state)
73 |         return np.argmax(q_value[0])
74 |
75 |     # Store a <s, a, r, s'> sample in the replay memory
76 |     def append_sample(self, state, action, reward, next_state, done):
77 |         self.memory.append((state, action, reward, next_state, done))
78 |
79 |     # Train the model on a batch sampled at random from the replay memory
80 |     def train_model(self):
81 |         # Draw a random mini-batch of batch_size samples from the memory
82 |         mini_batch = random.sample(self.memory, self.batch_size)
83 |
84 |         states = np.zeros((self.batch_size, self.state_size))
85 |         next_states = np.zeros((self.batch_size, self.state_size))
86 |         actions, rewards, dones = [], [], []
87 |
88 |         for i in range(self.batch_size):
89 |             states[i] = mini_batch[i][0]
90 |             actions.append(mini_batch[i][1])
91 |             rewards.append(mini_batch[i][2])
92 |             next_states[i] = mini_batch[i][3]
93 |             dones.append(mini_batch[i][4])
94 |
95 |         # Q-values of the model for the current states
96 |         # Q-values of the target model for the next states
97 |         target = self.model.predict(states)
98 |         target_val = self.target_model.predict(next_states)
99 |
100 |         # Update targets based on the Bellman optimality equation
101 |         for i in range(self.batch_size):
102 |             if dones[i]:
103 |                 target[i][actions[i]] = rewards[i]
104 |             else:
105 |                 target[i][actions[i]] = rewards[i] + self.discount_factor * (
106 |                     np.amax(target_val[i]))
107 |
108 |         self.model.fit(states, target, batch_size=self.batch_size,
109 |                        epochs=1, verbose=0)
110 |
111 |
112 | if __name__ == "__main__":
113 |     # LunarLander-v2 environment
114 |     env = gym.make('LunarLander-v2')
115 |     env.seed(0)
116 |     state_size = env.observation_space.shape[0]
117 |     action_size = env.action_space.n
118 |     # state = [
119 |     #     x position of the lander,
120 |     #     y position of the lander,
121 |     #     x velocity of the lander,
122 |     #     y velocity of the lander,
123 |     #     angle of the lander,
124 |     #     angular velocity of the lander,
125 |     #     whether the left leg is in contact with the ground,
126 |     #     whether the right leg is in contact with the ground,
127 |     # ]
128 |
129 |     agent = DQNAgent(state_size, action_size)
130 |
131 |     scores, episodes = [], []
132 |
133 |     for e in range(EPISODES):
134 |         done = False
135 |         score = 0
136 |         # Reset the environment
137 |         state = env.reset()
138 |
139 |         state = np.reshape(state, [1, state_size])
140 |
141 |         while not done:
142 |             if agent.render:
143 |                 env.render()
144 |
145 |             # Select an action for the current state
146 |             if (e >= EPISODES):
147 |                 agent.render = True
148 |                 action = agent.get_action_playing(state)
149 |             else:
150 |                 action = agent.get_action_greedy(state)
151 |
152 |             # Take one timestep in the environment with the selected action
153 |             next_state, reward, done, info = env.step(action)
154 |             next_state = np.reshape(next_state, [1, state_size])
155 |
156 |             # Store the sample in the replay memory
157 |             agent.append_sample(state, action, reward, next_state, done)
158 |             # Train at every timestep once the replay memory is large enough
159 |             if len(agent.memory) >= agent.train_start:
160 |                 agent.train_model()
161 |
162 |             score += reward
163 |
164 |             state = next_state
165 |
166 |             if done:
167 |                 # At the end of each episode, update the target model with the model's weights
168 |                 agent.update_target_model()
169 |
170 |                 # Log training results every episode
171 |
172 |                 scores.append(score)
173 |                 episodes.append(e)
174 |                 pylab.plot(episodes, scores, 'b')
175 |                 pylab.savefig("lunarlander_dqn.png")
176 |                 print("episode:", e, " score:", score, " memory length:",
177 |                       len(agent.memory), " epsilon:", agent.epsilon)
178 |
179 |                 if e != 0 and e % 100 == 0:
180 |                     agent.model.save_weights("lunarlander_trainded.h5")
181 |
182 |         if len(agent.memory) >= agent.train_start and agent.epsilon > agent.epsilon_min:
183 |             agent.epsilon *= agent.epsilon_decay
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Chris Ohk
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 2019-OSS-Summer-RLBasic
2 |
3 | 2019-OSS-Summer-RLBasic is the material (lecture notes, examples, and assignments) repository for the reinforcement learning basics course that I taught at Kookmin University in the summer of 2019. Note that the examples and assignments in this repository use [Keras](https://keras.io/).
4 |
5 | ## Related Repositories
6 |
7 | - [2020-OSS-Winter-AlphaZero](https://github.com/utilForever/2020-OSS-Winter-AlphaZero)
8 |
9 | ## Contents
10 |
11 | - Lecture
12 |   - Day 1
13 |     - What is Reinforcement Learning?
14 |     - MDP (Markov Decision Process)
15 |       - State
16 |       - Action
17 |       - Reward Function
18 |       - State Transition Probability
19 |       - Discount Rate
20 |       - Policy
21 |     - Value Function and Q-Function
22 |     - Bellman Equation
23 |       - Bellman Expectation Equation
24 |       - Bellman Optimality Equation
25 |     - Dynamic Programming
26 |       - Policy Iteration
27 |       - Value Iteration
28 |     - Policy Evaluation
29 |       - Monte-Carlo Prediction
30 |       - Temporal-Difference Prediction
31 |     - SARSA
32 |     - Q-Learning
33 |   - Day 2
34 |     - Approximation Function
35 |       - Neural Network
36 |       - Node and Activation Function
37 |       - Deep Learning
38 |     - Deep SARSA
39 |     - Policy Gradient
40 |       - Policy-based Reinforcement Learning
41 |       - REINFORCE
42 |     - DQN (Deep Q-Network)
43 | - Examples
44 |   - Day 1 (Grid World)
45 |     - Policy Iteration
46 |     - Value Iteration
47 |     - Monte-Carlo
48 |     - SARSA
49 |     - Q-Learning
50 |   - Day 2 (Grid World and CartPole-v0)
51 |     - Deep SARSA
52 |     - REINFORCE
53 |     - DQN
54 |     - Actor-Critic
55 | - Assignments
56 |   - Day 1 (Maze)
57 |     - SARSA
58 |     - Q-Learning
59 |   - Day 2 (LunarLander-v2)
60 |     - DQN
61 |
62 | ## Acknowledgement
63 |
64 | This course uses the contents of [파이썬과 케라스로 배우는 강화학습(Wikibooks, 2017)](https://wikibook.co.kr/reinforcement-learning/) for lecture notes and the [RLCode team repository](https://github.com/rlcode/reinforcement-learning-kr) for example code. Thanks to the maintainers ([이웅원](https://github.com/dnddnjs), [이영무](https://github.com/zzing0907), [양혁렬](https://github.com/Hyeokreal), [이의령](https://github.com/wooridle), [김건우](https://github.com/keon)).
65 |
66 | Also, I prepared these materials with teaching assistants [Junyeong Park](https://github.com/JYPark09) and [Hyeonsu Kim](https://github.com/git-rla). Thanks! :D
67 |
68 | ## How To Contribute
69 |
70 | Contributions are always welcome, whether by reporting issues/bugs or by forking the repository and issuing pull requests when you have completed additional work that you feel will benefit the main project. If you are interested in contributing in a more dedicated capacity, please contact me.
71 |
72 | ## Contact
73 |
74 | You can contact me via e-mail (utilForever at gmail.com). I am always happy to answer questions or help with any issues you might have. Please also share any additional work or creations with me; I love seeing what other people are making.
75 |
76 | ## License
77 |
78 |
79 |
80 | The class is licensed under the [MIT License](http://opensource.org/licenses/MIT):
81 |
82 | Copyright © 2019 [Chris Ohk](http://www.github.com/utilForever), [Junyeong Park](https://github.com/JYPark09), and [Hyeonsu Kim](https://github.com/git-rla).
83 |
84 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
85 |
86 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
87 |
88 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------