├── .gitignore ├── 1-grid-world ├── 1-policy-iteration │ ├── environment.py │ └── policy_iteration.py ├── 2-value-iteration │ ├── environment.py │ └── value_iteration.py ├── 3-monte-carlo │ ├── environment.py │ └── mc_agent.py ├── 4-sarsa │ ├── environment.py │ └── sarsa_agent.py ├── 5-q-learning │ ├── environment.py │ └── q_learning_agent.py └── img │ ├── circle.png │ ├── down.png │ ├── left.png │ ├── rectangle.png │ ├── right.png │ ├── triangle.png │ └── up.png ├── 2-cartpole ├── 1-dqn │ ├── memory.py │ ├── model.py │ ├── test.py │ └── train.py ├── 2-actor-critic │ ├── main.py │ ├── model.py │ ├── save_model │ │ └── ckpt_1157.pth │ ├── test.py │ └── train.py └── 3-multi-step │ ├── memory.py │ ├── model.py │ ├── test.py │ └── train.py ├── 3-atari └── 1-dqn │ ├── memory.py │ ├── model.py │ ├── test.py │ ├── train.py │ └── utils.py ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | logs/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | *.local 107 | -------------------------------------------------------------------------------- /1-grid-world/1-policy-iteration/environment.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import Button 3 | import time 4 | import numpy as np 5 | from PIL import ImageTk, Image 6 | 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드월드 세로 10 | WIDTH = 5 # 그리드월드 가로 11 | TRANSITION_PROB = 1 12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # 상, 하, 좌, 우 13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 좌표로 나타낸 행동 14 | REWARDS = [] 15 | 16 | 17 | class GraphicDisplay(tk.Tk): 18 | def __init__(self, agent): 19 | super(GraphicDisplay, self).__init__() 20 | self.title('Policy Iteration') 21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) 22 | self.texts = [] 23 | self.arrows = [] 24 | self.env = Env() 25 | self.agent = agent 26 | self.evaluation_count = 0 27 | self.improvement_count = 0 28 | self.is_moving = 0 29 | (self.up, self.down, self.left, self.right), self.shapes = self.load_images() 30 | self.canvas = self._build_canvas() 31 | self.text_reward(2, 2, "R : 1.0") 32 | self.text_reward(1, 2, "R : -1.0") 33 | self.text_reward(2, 1, "R : -1.0") 34 | 35 | def _build_canvas(self): 36 | canvas = tk.Canvas(self, bg='white', 37 | height=HEIGHT * UNIT, 38 | width=WIDTH * UNIT) 39 | # 버튼 초기화 40 | iteration_button = Button(self, text="Evaluate", 41 | command=self.evaluate_policy) 42 | iteration_button.configure(width=10, activebackground="#33B5E5") 43 | canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10, 44 | window=iteration_button) 45 | policy_button = Button(self, text="Improve", 46 | command=self.improve_policy) 47 | policy_button.configure(width=10, activebackground="#33B5E5") 48 | canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10, 49 | window=policy_button) 50 | policy_button = Button(self, text="move", command=self.move_by_policy) 51 | policy_button.configure(width=10, activebackground="#33B5E5") 52 | canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10, 53 | window=policy_button) 54 | policy_button = Button(self, text="reset", command=self.reset) 55 | policy_button.configure(width=10, activebackground="#33B5E5") 56 | canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10, 57 | window=policy_button) 58 | 59 | # 그리드 생성 60 | for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 61 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT 62 | canvas.create_line(x0, y0, x1, y1) 63 | for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 64 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row 65 | canvas.create_line(x0, y0, x1, y1) 66 | 67 | # 캔버스에 이미지 추가 68 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 69 | canvas.create_image(250, 150, image=self.shapes[1]) 70 | canvas.create_image(150, 250, image=self.shapes[1]) 71 | canvas.create_image(250, 250, image=self.shapes[2]) 72 | 73 | canvas.pack() 74 | 75 | return canvas 76 | 77 | def load_images(self): 78 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) 79 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) 80 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) 81 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) 82 | rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65))) 83 | triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65))) 84 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) 85 | return (up, down, left, right), (rectangle, triangle, circle) 86 | 87 | def reset(self): 88 | if self.is_moving == 0: 89 | self.evaluation_count = 0 90 | self.improvement_count = 0 91 | for i in self.texts: 92 | self.canvas.delete(i) 93 | 94 | for i in self.arrows: 95 | self.canvas.delete(i) 96 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] 97 | self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH 98 | for _ in range(HEIGHT)]) 99 | self.agent.policy_table[2][2] = [] 100 | x, y = self.canvas.coords(self.rectangle) 101 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 102 | 103 | def text_value(self, row, col, contents, font='Helvetica', size=10, 104 | style='normal', anchor="nw"): 105 | origin_x, origin_y = 85, 70 106 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 107 | font = (font, str(size), style) 108 | text = self.canvas.create_text(x, y, fill="black", text=contents, 109 | font=font, anchor=anchor) 110 | return self.texts.append(text) 111 | 112 | def text_reward(self, row, col, contents, font='Helvetica', size=10, 113 | style='normal', anchor="nw"): 114 | origin_x, origin_y = 5, 5 115 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 116 | font = (font, str(size), style) 117 | text = self.canvas.create_text(x, y, fill="black", text=contents, 118 | font=font, anchor=anchor) 119 | return self.texts.append(text) 120 | 121 | def rectangle_move(self, action): 122 | base_action = np.array([0, 0]) 123 | location = self.find_rectangle() 124 | self.render() 125 | if action == 0 and location[0] > 0: # 상 126 | base_action[1] -= UNIT 127 | elif action == 1 and location[0] < HEIGHT - 1: # 하 128 | base_action[1] += UNIT 129 | elif action == 2 and location[1] > 0: # 좌 130 | base_action[0] -= UNIT 131 | elif action == 3 and location[1] < WIDTH - 1: # 우 132 | base_action[0] += UNIT 133 | # move agent 134 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 135 | 136 | def find_rectangle(self): 137 | temp = self.canvas.coords(self.rectangle) 138 | x = (temp[0] / 100) - 0.5 139 | y = (temp[1] / 100) - 0.5 140 | return int(y), int(x) 141 | 142 | def move_by_policy(self): 143 | if self.improvement_count != 0 and self.is_moving != 1: 144 | self.is_moving = 1 145 | 146 | x, y = self.canvas.coords(self.rectangle) 147 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 148 | 149 | x, y = self.find_rectangle() 150 | while len(self.agent.policy_table[x][y]) != 0: 151 | self.after(100, 152 | self.rectangle_move(self.agent.get_action([x, y]))) 153 | x, y = self.find_rectangle() 154 | self.is_moving = 0 155 | 156 | def draw_one_arrow(self, col, row, policy): 157 | if col == 2 and row == 2: 158 | return 159 | 160 | if policy[0] > 0: # up 161 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) 162 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 163 | image=self.up)) 164 | if policy[1] > 0: # down 165 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) 166 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 167 | image=self.down)) 168 | if policy[2] > 0: # left 169 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) 170 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 171 | image=self.left)) 172 | if policy[3] > 0: # right 173 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) 174 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 175 | image=self.right)) 176 | 177 | def draw_from_policy(self, policy_table): 178 | for i in range(HEIGHT): 179 | for j in range(WIDTH): 180 | self.draw_one_arrow(i, j, policy_table[i][j]) 181 | 182 | def print_value_table(self, value_table): 183 | for i in range(WIDTH): 184 | for j in range(HEIGHT): 185 | self.text_value(i, j, value_table[i][j]) 186 | 187 | def render(self): 188 | time.sleep(0.1) 189 | self.canvas.tag_raise(self.rectangle) 190 | self.update() 191 | 192 | def evaluate_policy(self): 193 | self.evaluation_count += 1 194 | for i in self.texts: 195 | self.canvas.delete(i) 196 | self.agent.policy_evaluation() 197 | self.print_value_table(self.agent.value_table) 198 | 199 | def improve_policy(self): 200 | self.improvement_count += 1 201 | for i in self.arrows: 202 | self.canvas.delete(i) 203 | self.agent.policy_improvement() 204 | self.draw_from_policy(self.agent.policy_table) 205 | 206 | 207 | class Env: 208 | def __init__(self): 209 | self.transition_probability = TRANSITION_PROB 210 | self.width = WIDTH 211 | self.height = HEIGHT 212 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)] 213 | self.possible_actions = POSSIBLE_ACTIONS 214 | self.reward[2][2] = 1 # (2,2) 좌표 동그라미 위치에 보상 1 215 | self.reward[1][2] = -1 # (1,2) 좌표 세모 위치에 보상 -1 216 | self.reward[2][1] = -1 # (2,1) 좌표 세모 위치에 보상 -1 217 | self.all_state = [] 218 | 219 | for x in range(WIDTH): 220 | for y in range(HEIGHT): 221 | state = [x, y] 222 | self.all_state.append(state) 223 | 224 | def get_reward(self, state, action): 225 | next_state = self.state_after_action(state, action) 226 | return self.reward[next_state[0]][next_state[1]] 227 | 228 | def state_after_action(self, state, action_index): 229 | action = ACTIONS[action_index] 230 | return self.check_boundary([state[0] + action[0], state[1] + action[1]]) 231 | 232 | @staticmethod 233 | def check_boundary(state): 234 | state[0] = (0 if state[0] < 0 else WIDTH - 1 235 | if state[0] > WIDTH - 1 else state[0]) 236 | state[1] = (0 if state[1] < 0 else HEIGHT - 1 237 | if state[1] > HEIGHT - 1 else state[1]) 238 | return state 239 | 240 | def get_transition_prob(self, state, action): 241 | return self.transition_probability 242 | 243 | def get_all_states(self): 244 | return self.all_state 245 | -------------------------------------------------------------------------------- /1-grid-world/1-policy-iteration/policy_iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from environment import GraphicDisplay, Env 4 | 5 | 6 | class PolicyIteration: 7 | def __init__(self, env): 8 | # 환경에 대한 객체 선언 9 | self.env = env 10 | # 가치함수를 2차원 리스트로 초기화 11 | self.value_table = [[0.0] * env.width for _ in range(env.height)] 12 | # 상 하 좌 우 동일한 확률로 정책 초기화 13 | self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width 14 | for _ in range(env.height)] 15 | # 마침 상태의 설정 16 | self.policy_table[2][2] = [] 17 | # 감가율 18 | self.discount_factor = 0.9 19 | 20 | def policy_evaluation(self): 21 | 22 | # 다음 가치함수 초기화 23 | next_value_table = [[0.00] * self.env.width 24 | for _ in range(self.env.height)] 25 | 26 | # 모든 상태에 대해서 벨만 기대방정식을 계산 27 | for state in self.env.get_all_states(): 28 | value = 0.0 29 | # 마침 상태의 가치 함수 = 0 30 | if state == [2, 2]: 31 | next_value_table[state[0]][state[1]] = value 32 | continue 33 | 34 | # 벨만 기대 방정식 35 | for action in self.env.possible_actions: 36 | next_state = self.env.state_after_action(state, action) 37 | reward = self.env.get_reward(state, action) 38 | next_value = self.get_value(next_state) 39 | value += (self.get_policy(state)[action] * 40 | (reward + self.discount_factor * next_value)) 41 | 42 | next_value_table[state[0]][state[1]] = round(value, 2) 43 | 44 | self.value_table = next_value_table 45 | 46 | # 현재 가치 함수에 대해서 탐욕 정책 발전 47 | def policy_improvement(self): 48 | next_policy = self.policy_table 49 | for state in self.env.get_all_states(): 50 | if state == [2, 2]: 51 | continue 52 | value = -99999 53 | max_index = [] 54 | # 반환할 정책 초기화 55 | result = [0.0, 0.0, 0.0, 0.0] 56 | 57 | # 모든 행동에 대해서 [보상 + (감가율 * 다음 상태 가치함수)] 계산 58 | for index, action in enumerate(self.env.possible_actions): 59 | next_state = self.env.state_after_action(state, action) 60 | reward = self.env.get_reward(state, action) 61 | next_value = self.get_value(next_state) 62 | temp = reward + self.discount_factor * next_value 63 | 64 | # 받을 보상이 최대인 행동의 index(최대가 복수라면 모두)를 추출 65 | if temp == value: 66 | max_index.append(index) 67 | elif temp > value: 68 | value = temp 69 | max_index.clear() 70 | max_index.append(index) 71 | 72 | # 행동의 확률 계산 73 | prob = 1 / len(max_index) 74 | 75 | for index in max_index: 76 | result[index] = prob 77 | 78 | next_policy[state[0]][state[1]] = result 79 | 80 | self.policy_table = next_policy 81 | 82 | # 특정 상태에서 정책에 따른 행동을 반환 83 | def get_action(self, state): 84 | # 0 ~ 1 사이의 값을 무작위로 추출 85 | random_pick = random.randrange(100) / 100 86 | 87 | policy = self.get_policy(state) 88 | policy_sum = 0.0 89 | # 정책에 담긴 행동 중에 무작위로 한 행동을 추출 90 | for index, value in enumerate(policy): 91 | policy_sum += value 92 | if random_pick < policy_sum: 93 | return index 94 | 95 | # 상태에 따른 정책 반환 96 | def get_policy(self, state): 97 | if state == [2, 2]: 98 | return 0.0 99 | return self.policy_table[state[0]][state[1]] 100 | 101 | # 가치 함수의 값을 반환 102 | def get_value(self, state): 103 | # 소숫점 둘째 자리까지만 계산 104 | return round(self.value_table[state[0]][state[1]], 2) 105 | 106 | if __name__ == "__main__": 107 | env = Env() 108 | policy_iteration = PolicyIteration(env) 109 | grid_world = GraphicDisplay(policy_iteration) 110 | grid_world.mainloop() 111 | -------------------------------------------------------------------------------- /1-grid-world/2-value-iteration/environment.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import time 3 | import numpy as np 4 | import random 5 | from PIL import ImageTk, Image 6 | 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드월드 세로 10 | WIDTH = 5 # 그리드월드 가로 11 | TRANSITION_PROB = 1 12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # 상, 하, 좌, 우 13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 좌표로 나타낸 행동 14 | REWARDS = [] 15 | 16 | 17 | class GraphicDisplay(tk.Tk): 18 | def __init__(self, value_iteration): 19 | super(GraphicDisplay, self).__init__() 20 | self.title('Value Iteration') 21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) 22 | self.texts = [] 23 | self.arrows = [] 24 | self.env = Env() 25 | self.agent = value_iteration 26 | self.iteration_count = 0 27 | self.improvement_count = 0 28 | self.is_moving = 0 29 | (self.up, self.down, self.left, 30 | self.right), self.shapes = self.load_images() 31 | self.canvas = self._build_canvas() 32 | self.text_reward(2, 2, "R : 1.0") 33 | self.text_reward(1, 2, "R : -1.0") 34 | self.text_reward(2, 1, "R : -1.0") 35 | 36 | def _build_canvas(self): 37 | canvas = tk.Canvas(self, bg='white', 38 | height=HEIGHT * UNIT, 39 | width=WIDTH * UNIT) 40 | # 버튼 초기화 41 | iteration_button = tk.Button(self, text="Calculate", 42 | command=self.calculate_value) 43 | iteration_button.configure(width=10, activebackground="#33B5E5") 44 | canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, 45 | window=iteration_button) 46 | 47 | policy_button = tk.Button(self, text="Print Policy", 48 | command=self.print_optimal_policy) 49 | policy_button.configure(width=10, activebackground="#33B5E5") 50 | canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, 51 | window=policy_button) 52 | 53 | policy_button = tk.Button(self, text="Move", 54 | command=self.move_by_policy) 55 | policy_button.configure(width=10, activebackground="#33B5E5") 56 | canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, 57 | window=policy_button) 58 | 59 | policy_button = tk.Button(self, text="Clear", command=self.clear) 60 | policy_button.configure(width=10, activebackground="#33B5E5") 61 | canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, 62 | window=policy_button) 63 | 64 | # 그리드 생성 65 | for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 66 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT 67 | canvas.create_line(x0, y0, x1, y1) 68 | for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 69 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row 70 | canvas.create_line(x0, y0, x1, y1) 71 | 72 | # 캔버스에 이미지 추가 73 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 74 | canvas.create_image(250, 150, image=self.shapes[1]) 75 | canvas.create_image(150, 250, image=self.shapes[1]) 76 | canvas.create_image(250, 250, image=self.shapes[2]) 77 | 78 | canvas.pack() 79 | 80 | return canvas 81 | 82 | def load_images(self): 83 | PhotoImage = ImageTk.PhotoImage 84 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) 85 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) 86 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) 87 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) 88 | rectangle = PhotoImage( 89 | Image.open("../img/rectangle.png").resize((65, 65))) 90 | triangle = PhotoImage( 91 | Image.open("../img/triangle.png").resize((65, 65))) 92 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) 93 | return (up, down, left, right), (rectangle, triangle, circle) 94 | 95 | def clear(self): 96 | 97 | if self.is_moving == 0: 98 | self.iteration_count = 0 99 | self.improvement_count = 0 100 | for i in self.texts: 101 | self.canvas.delete(i) 102 | 103 | for i in self.arrows: 104 | self.canvas.delete(i) 105 | 106 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] 107 | 108 | x, y = self.canvas.coords(self.rectangle) 109 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 110 | 111 | def reset(self): 112 | self.update() 113 | time.sleep(0.5) 114 | self.canvas.delete(self.rectangle) 115 | return self.canvas.coords(self.rectangle) 116 | 117 | def text_value(self, row, col, contents, font='Helvetica', size=12, 118 | style='normal', anchor="nw"): 119 | origin_x, origin_y = 85, 70 120 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 121 | font = (font, str(size), style) 122 | text = self.canvas.create_text(x, y, fill="black", text=contents, 123 | font=font, anchor=anchor) 124 | return self.texts.append(text) 125 | 126 | def text_reward(self, row, col, contents, font='Helvetica', size=12, 127 | style='normal', anchor="nw"): 128 | origin_x, origin_y = 5, 5 129 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 130 | font = (font, str(size), style) 131 | text = self.canvas.create_text(x, y, fill="black", text=contents, 132 | font=font, anchor=anchor) 133 | return self.texts.append(text) 134 | 135 | def rectangle_move(self, action): 136 | base_action = np.array([0, 0]) 137 | location = self.find_rectangle() 138 | self.render() 139 | if action == 0 and location[0] > 0: # up 140 | base_action[1] -= UNIT 141 | elif action == 1 and location[0] < HEIGHT - 1: # down 142 | base_action[1] += UNIT 143 | elif action == 2 and location[1] > 0: # left 144 | base_action[0] -= UNIT 145 | elif action == 3 and location[1] < WIDTH - 1: # right 146 | base_action[0] += UNIT 147 | 148 | self.canvas.move(self.rectangle, base_action[0], 149 | base_action[1]) # move agent 150 | 151 | def find_rectangle(self): 152 | temp = self.canvas.coords(self.rectangle) 153 | x = (temp[0] / 100) - 0.5 154 | y = (temp[1] / 100) - 0.5 155 | return int(y), int(x) 156 | 157 | def move_by_policy(self): 158 | 159 | if self.improvement_count != 0 and self.is_moving != 1: 160 | self.is_moving = 1 161 | x, y = self.canvas.coords(self.rectangle) 162 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 163 | 164 | x, y = self.find_rectangle() 165 | while len(self.agent.get_action([x, y])) != 0: 166 | action = random.sample(self.agent.get_action([x, y]), 1)[0] 167 | self.after(100, self.rectangle_move(action)) 168 | x, y = self.find_rectangle() 169 | self.is_moving = 0 170 | 171 | def draw_one_arrow(self, col, row, action): 172 | if col == 2 and row == 2: 173 | return 174 | if action == 0: # up 175 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) 176 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 177 | image=self.up)) 178 | elif action == 1: # down 179 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) 180 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 181 | image=self.down)) 182 | elif action == 3: # right 183 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) 184 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 185 | image=self.right)) 186 | elif action == 2: # left 187 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) 188 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 189 | image=self.left)) 190 | 191 | def draw_from_values(self, state, action_list): 192 | i = state[0] 193 | j = state[1] 194 | for action in action_list: 195 | self.draw_one_arrow(i, j, action) 196 | 197 | def print_values(self, values): 198 | for i in range(WIDTH): 199 | for j in range(HEIGHT): 200 | self.text_value(i, j, values[i][j]) 201 | 202 | def render(self): 203 | time.sleep(0.1) 204 | self.canvas.tag_raise(self.rectangle) 205 | self.update() 206 | 207 | def calculate_value(self): 208 | self.iteration_count += 1 209 | for i in self.texts: 210 | self.canvas.delete(i) 211 | self.agent.value_iteration() 212 | self.print_values(self.agent.value_table) 213 | 214 | def print_optimal_policy(self): 215 | self.improvement_count += 1 216 | for i in self.arrows: 217 | self.canvas.delete(i) 218 | for state in self.env.get_all_states(): 219 | action = self.agent.get_action(state) 220 | self.draw_from_values(state, action) 221 | 222 | 223 | class Env: 224 | def __init__(self): 225 | self.transition_probability = TRANSITION_PROB 226 | self.width = WIDTH # Width of Grid World 227 | self.height = HEIGHT # Height of GridWorld 228 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)] 229 | self.possible_actions = POSSIBLE_ACTIONS 230 | self.reward[2][2] = 1 # reward 1 for circle 231 | self.reward[1][2] = -1 # reward -1 for triangle 232 | self.reward[2][1] = -1 # reward -1 for triangle 233 | self.all_state = [] 234 | 235 | for x in range(WIDTH): 236 | for y in range(HEIGHT): 237 | state = [x, y] 238 | self.all_state.append(state) 239 | 240 | def get_reward(self, state, action): 241 | next_state = self.state_after_action(state, action) 242 | return self.reward[next_state[0]][next_state[1]] 243 | 244 | def state_after_action(self, state, action_index): 245 | action = ACTIONS[action_index] 246 | return self.check_boundary([state[0] + action[0], state[1] + action[1]]) 247 | 248 | @staticmethod 249 | def check_boundary(state): 250 | state[0] = (0 if state[0] < 0 else WIDTH - 1 251 | if state[0] > WIDTH - 1 else state[0]) 252 | state[1] = (0 if state[1] < 0 else HEIGHT - 1 253 | if state[1] > HEIGHT - 1 else state[1]) 254 | return state 255 | 256 | def get_transition_prob(self, state, action): 257 | return self.transition_probability 258 | 259 | def get_all_states(self): 260 | return self.all_state 261 | -------------------------------------------------------------------------------- /1-grid-world/2-value-iteration/value_iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from environment import GraphicDisplay, Env 3 | 4 | class ValueIteration: 5 | def __init__(self, env): 6 | # 환경 객체 생성 7 | self.env = env 8 | # 가치 함수를 2차원 리스트로 초기화 9 | self.value_table = [[0.0] * env.width for _ in range(env.height)] 10 | # 감가율 11 | self.discount_factor = 0.9 12 | 13 | # 가치 이터레이션 14 | # 벨만 최적 방정식을 통해 다음 가치 함수 계산 15 | def value_iteration(self): 16 | next_value_table = [[0.0] * self.env.width for _ in 17 | range(self.env.height)] 18 | for state in self.env.get_all_states(): 19 | if state == [2, 2]: 20 | next_value_table[state[0]][state[1]] = 0.0 21 | continue 22 | # 가치 함수를 위한 빈 리스트 23 | value_list = [] 24 | 25 | # 가능한 모든 행동에 대해 계산 26 | for action in self.env.possible_actions: 27 | next_state = self.env.state_after_action(state, action) 28 | reward = self.env.get_reward(state, action) 29 | next_value = self.get_value(next_state) 30 | value_list.append((reward + self.discount_factor * next_value)) 31 | # 최댓값을 다음 가치 함수로 대입 32 | next_value_table[state[0]][state[1]] = round(max(value_list), 2) 33 | self.value_table = next_value_table 34 | 35 | # 현재 가치 함수로부터 행동을 반환 36 | def get_action(self, state): 37 | action_list = [] 38 | max_value = -99999 39 | 40 | if state == [2, 2]: 41 | return [] 42 | 43 | # 모든 행동에 대해 큐함수 (보상 + (감가율 * 다음 상태 가치함수))를 계산 44 | # 최대 큐 함수를 가진 행동(복수일 경우 여러 개)을 반환 45 | for action in self.env.possible_actions: 46 | 47 | next_state = self.env.state_after_action(state, action) 48 | reward = self.env.get_reward(state, action) 49 | next_value = self.get_value(next_state) 50 | value = (reward + self.discount_factor * next_value) 51 | 52 | if value > max_value: 53 | action_list.clear() 54 | action_list.append(action) 55 | max_value = value 56 | elif value == max_value: 57 | action_list.append(action) 58 | 59 | return action_list 60 | 61 | def get_value(self, state): 62 | return round(self.value_table[state[0]][state[1]], 2) 63 | 64 | if __name__ == "__main__": 65 | env = Env() 66 | value_iteration = ValueIteration(env) 67 | grid_world = GraphicDisplay(value_iteration) 68 | grid_world.mainloop() 69 | -------------------------------------------------------------------------------- /1-grid-world/3-monte-carlo/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드 월드 세로 10 | WIDTH = 5 # 그리드 월드 가로 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('monte carlo') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # 그리드 생성 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # 캔버스에 이미지 추가 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | canvas.pack() 43 | 44 | return canvas 45 | 46 | def load_images(self): 47 | rectangle = PhotoImage( 48 | Image.open("../img/rectangle.png").resize((65, 65))) 49 | triangle = PhotoImage( 50 | Image.open("../img/triangle.png").resize((65, 65))) 51 | circle = PhotoImage( 52 | Image.open("../img/circle.png").resize((65, 65))) 53 | 54 | return rectangle, triangle, circle 55 | 56 | @staticmethod 57 | def coords_to_state(coords): 58 | x = int((coords[0] - 50) / 100) 59 | y = int((coords[1] - 50) / 100) 60 | return [x, y] 61 | 62 | def reset(self): 63 | self.update() 64 | time.sleep(0.5) 65 | x, y = self.canvas.coords(self.rectangle) 66 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 67 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 68 | 69 | def step(self, action): 70 | state = self.canvas.coords(self.rectangle) 71 | base_action = np.array([0, 0]) 72 | self.render() 73 | 74 | if action == 0: # 상 75 | if state[1] > UNIT: 76 | base_action[1] -= UNIT 77 | elif action == 1: # 하 78 | if state[1] < (HEIGHT - 1) * UNIT: 79 | base_action[1] += UNIT 80 | elif action == 2: # 좌 81 | if state[0] > UNIT: 82 | base_action[0] -= UNIT 83 | elif action == 3: # 우 84 | if state[0] < (WIDTH - 1) * UNIT: 85 | base_action[0] += UNIT 86 | # 에이전트 이동 87 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 88 | # 에이전트(빨간 네모)를 가장 상위로 배치 89 | self.canvas.tag_raise(self.rectangle) 90 | 91 | next_state = self.canvas.coords(self.rectangle) 92 | 93 | # 보상 함수 94 | if next_state == self.canvas.coords(self.circle): 95 | reward = 100 96 | done = True 97 | elif next_state in [self.canvas.coords(self.triangle1), 98 | self.canvas.coords(self.triangle2)]: 99 | reward = -100 100 | done = True 101 | else: 102 | reward = 0 103 | done = False 104 | 105 | next_state = self.coords_to_state(next_state) 106 | 107 | return next_state, reward, done 108 | 109 | def render(self): 110 | time.sleep(0.03) 111 | self.update() 112 | -------------------------------------------------------------------------------- /1-grid-world/3-monte-carlo/mc_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict 4 | from environment import Env 5 | 6 | 7 | # 몬테카를로 에이전트 (모든 에피소드 각각의 샘플로 부터 학습) 8 | class MCAgent: 9 | def __init__(self, actions): 10 | self.width = 5 11 | self.height = 5 12 | self.actions = actions 13 | self.learning_rate = 0.01 14 | self.discount_factor = 0.9 15 | self.epsilon = 0.1 16 | self.samples = [] 17 | self.value_table = defaultdict(float) 18 | 19 | # 메모리에 샘플을 추가 20 | def save_sample(self, state, reward, done): 21 | self.samples.append([state, reward, done]) 22 | 23 | # 모든 에피소드에서 에이전트가 방문한 상태의 큐 함수를 업데이트 24 | def update(self): 25 | G_t = 0 26 | visit_state = [] 27 | for reward in reversed(self.samples): 28 | state = str(reward[0]) 29 | if state not in visit_state: 30 | visit_state.append(state) 31 | G_t = self.discount_factor * (reward[1] + G_t) 32 | value = self.value_table[state] 33 | self.value_table[state] = (value + 34 | self.learning_rate * (G_t - value)) 35 | 36 | # 큐 함수에 따라서 행동을 반환 37 | # 입실론 탐욕 정책에 따라서 행동을 반환 38 | def get_action(self, state): 39 | if np.random.rand() < self.epsilon: 40 | # 랜덤 행동 41 | action = np.random.choice(self.actions) 42 | else: 43 | # 큐 함수에 따른 행동 44 | next_state = self.possible_next_state(state) 45 | action = self.arg_max(next_state) 46 | return int(action) 47 | 48 | # 후보가 여럿이면 arg_max를 계산하고 무작위로 하나를 반환 49 | @staticmethod 50 | def arg_max(next_state): 51 | max_index_list = [] 52 | max_value = next_state[0] 53 | for index, value in enumerate(next_state): 54 | if value > max_value: 55 | max_index_list.clear() 56 | max_value = value 57 | max_index_list.append(index) 58 | elif value == max_value: 59 | max_index_list.append(index) 60 | return random.choice(max_index_list) 61 | 62 | # 가능한 다음 모든 상태들을 반환 63 | def possible_next_state(self, state): 64 | col, row = state 65 | next_state = [0.0] * 4 66 | 67 | if row != 0: 68 | next_state[0] = self.value_table[str([col, row - 1])] 69 | else: 70 | next_state[0] = self.value_table[str(state)] 71 | if row != self.height - 1: 72 | next_state[1] = self.value_table[str([col, row + 1])] 73 | else: 74 | next_state[1] = self.value_table[str(state)] 75 | if col != 0: 76 | next_state[2] = self.value_table[str([col - 1, row])] 77 | else: 78 | next_state[2] = self.value_table[str(state)] 79 | if col != self.width - 1: 80 | next_state[3] = self.value_table[str([col + 1, row])] 81 | else: 82 | next_state[3] = self.value_table[str(state)] 83 | 84 | return next_state 85 | 86 | 87 | # 메인 함수 88 | if __name__ == "__main__": 89 | env = Env() 90 | agent = MCAgent(actions=list(range(env.n_actions))) 91 | 92 | for episode in range(1000): 93 | state = env.reset() 94 | action = agent.get_action(state) 95 | 96 | while True: 97 | env.render() 98 | 99 | # 다음 상태로 이동 100 | # 보상은 숫자이고, 완료 여부는 boolean 101 | next_state, reward, done = env.step(action) 102 | agent.save_sample(next_state, reward, done) 103 | 104 | # 다음 행동 받아옴 105 | action = agent.get_action(next_state) 106 | 107 | # 에피소드가 완료됐을 때, 큐 함수 업데이트 108 | if done: 109 | print("episode : ", episode) 110 | agent.update() 111 | agent.samples.clear() 112 | break 113 | -------------------------------------------------------------------------------- /1-grid-world/4-sarsa/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 필셀 수 9 | HEIGHT = 5 # 그리드 월드 가로 10 | WIDTH = 5 # 그리드 월드 세로 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('SARSA') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # 그리드 생성 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # 캔버스에 이미지 추가 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | canvas.pack() 43 | 44 | return canvas 45 | 46 | def load_images(self): 47 | rectangle = PhotoImage( 48 | Image.open("../img/rectangle.png").resize((65, 65))) 49 | triangle = PhotoImage( 50 | Image.open("../img/triangle.png").resize((65, 65))) 51 | circle = PhotoImage( 52 | Image.open("../img/circle.png").resize((65, 65))) 53 | 54 | return rectangle, triangle, circle 55 | 56 | def text_value(self, row, col, contents, action, font='Helvetica', size=10, 57 | style='normal', anchor="nw"): 58 | if action == 0: 59 | origin_x, origin_y = 7, 42 60 | elif action == 1: 61 | origin_x, origin_y = 85, 42 62 | elif action == 2: 63 | origin_x, origin_y = 42, 5 64 | else: 65 | origin_x, origin_y = 42, 77 66 | 67 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 68 | font = (font, str(size), style) 69 | text = self.canvas.create_text(x, y, fill="black", text=contents, 70 | font=font, anchor=anchor) 71 | return self.texts.append(text) 72 | 73 | def print_value_all(self, q_table): 74 | for i in self.texts: 75 | self.canvas.delete(i) 76 | self.texts.clear() 77 | for x in range(HEIGHT): 78 | for y in range(WIDTH): 79 | for action in range(0, 4): 80 | state = [x, y] 81 | if str(state) in q_table.keys(): 82 | temp = q_table[str(state)][action] 83 | self.text_value(y, x, round(temp, 2), action) 84 | 85 | def coords_to_state(self, coords): 86 | x = int((coords[0] - 50) / 100) 87 | y = int((coords[1] - 50) / 100) 88 | return [x, y] 89 | 90 | def reset(self): 91 | self.update() 92 | time.sleep(0.5) 93 | x, y = self.canvas.coords(self.rectangle) 94 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 95 | self.render() 96 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 97 | 98 | def step(self, action): 99 | state = self.canvas.coords(self.rectangle) 100 | base_action = np.array([0, 0]) 101 | self.render() 102 | 103 | if action == 0: # 상 104 | if state[1] > UNIT: 105 | base_action[1] -= UNIT 106 | elif action == 1: # 하 107 | if state[1] < (HEIGHT - 1) * UNIT: 108 | base_action[1] += UNIT 109 | elif action == 2: # 좌 110 | if state[0] > UNIT: 111 | base_action[0] -= UNIT 112 | elif action == 3: # 우 113 | if state[0] < (WIDTH - 1) * UNIT: 114 | base_action[0] += UNIT 115 | 116 | # 에이전트 이동 117 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 118 | # 에이전트(빨간 네모)를 가장 상위로 배치 119 | self.canvas.tag_raise(self.rectangle) 120 | next_state = self.canvas.coords(self.rectangle) 121 | 122 | # 보상 함수 123 | if next_state == self.canvas.coords(self.circle): 124 | reward = 100 125 | done = True 126 | elif next_state in [self.canvas.coords(self.triangle1), 127 | self.canvas.coords(self.triangle2)]: 128 | reward = -100 129 | done = True 130 | else: 131 | reward = 0 132 | done = False 133 | 134 | next_state = self.coords_to_state(next_state) 135 | 136 | 137 | 138 | return next_state, reward, done 139 | 140 | def render(self): 141 | time.sleep(0.03) 142 | self.update() 143 | -------------------------------------------------------------------------------- /1-grid-world/4-sarsa/sarsa_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict 4 | from environment import Env 5 | 6 | 7 | class SARSAgent: 8 | def __init__(self, actions): 9 | self.actions = actions 10 | self.learning_rate = 0.01 11 | self.discount_factor = 0.9 12 | self.epsilon = 0.1 13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 14 | 15 | # 의 샘플로부터 큐함수를 업데이트 16 | def learn(self, state, action, reward, next_state, next_action): 17 | current_q = self.q_table[state][action] 18 | next_state_q = self.q_table[next_state][next_action] 19 | new_q = (current_q + self.learning_rate * 20 | (reward + self.discount_factor * next_state_q - current_q)) 21 | self.q_table[state][action] = new_q 22 | 23 | # 입실론 탐욕 정책에 따라서 행동을 반환 24 | def get_action(self, state): 25 | if np.random.rand() < self.epsilon: 26 | # 무작위 행동 반환 27 | action = np.random.choice(self.actions) 28 | else: 29 | # 큐함수에 따른 행동 반환 30 | state_action = self.q_table[state] 31 | action = self.arg_max(state_action) 32 | return action 33 | 34 | @staticmethod 35 | def arg_max(state_action): 36 | max_index_list = [] 37 | max_value = state_action[0] 38 | for index, value in enumerate(state_action): 39 | if value > max_value: 40 | max_index_list.clear() 41 | max_value = value 42 | max_index_list.append(index) 43 | elif value == max_value: 44 | max_index_list.append(index) 45 | return random.choice(max_index_list) 46 | 47 | if __name__ == "__main__": 48 | env = Env() 49 | agent = SARSAgent(actions=list(range(env.n_actions))) 50 | 51 | for episode in range(1000): 52 | # 게임 환경과 상태를 초기화 53 | state = env.reset() 54 | # 현재 상태에 대한 행동을 선택 55 | action = agent.get_action(str(state)) 56 | 57 | while True: 58 | env.render() 59 | 60 | # 행동을 위한 후 다음상태 보상 에피소드의 종료 여부를 받아옴 61 | next_state, reward, done = env.step(action) 62 | # 다음 상태에서의 다음 행동 선택 63 | next_action = agent.get_action(str(next_state)) 64 | 65 | # 로 큐함수를 업데이트 66 | agent.learn(str(state), action, reward, str(next_state), next_action) 67 | 68 | state = next_state 69 | action = next_action 70 | 71 | # 모든 큐함수를 화면에 표시 72 | env.print_value_all(agent.q_table) 73 | 74 | if done: 75 | break 76 | 77 | -------------------------------------------------------------------------------- /1-grid-world/5-q-learning/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # 픽셀 수 9 | HEIGHT = 5 # 그리드월드 세로 10 | WIDTH = 5 # 그리드월드 가로 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('Q Learning') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # 그리드 생성 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # 캔버스에 이미지 추가 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | canvas.pack() 43 | 44 | return canvas 45 | 46 | def load_images(self): 47 | rectangle = PhotoImage( 48 | Image.open("../img/rectangle.png").resize((65, 65))) 49 | triangle = PhotoImage( 50 | Image.open("../img/triangle.png").resize((65, 65))) 51 | circle = PhotoImage( 52 | Image.open("../img/circle.png").resize((65, 65))) 53 | 54 | return rectangle, triangle, circle 55 | 56 | def text_value(self, row, col, contents, action, font='Helvetica', size=10, 57 | style='normal', anchor="nw"): 58 | 59 | if action == 0: 60 | origin_x, origin_y = 7, 42 61 | elif action == 1: 62 | origin_x, origin_y = 85, 42 63 | elif action == 2: 64 | origin_x, origin_y = 42, 5 65 | else: 66 | origin_x, origin_y = 42, 77 67 | 68 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 69 | font = (font, str(size), style) 70 | text = self.canvas.create_text(x, y, fill="black", text=contents, 71 | font=font, anchor=anchor) 72 | return self.texts.append(text) 73 | 74 | def print_value_all(self, q_table): 75 | for i in self.texts: 76 | self.canvas.delete(i) 77 | self.texts.clear() 78 | for i in range(HEIGHT): 79 | for j in range(WIDTH): 80 | for action in range(0, 4): 81 | state = [i, j] 82 | if str(state) in q_table.keys(): 83 | temp = q_table[str(state)][action] 84 | self.text_value(j, i, round(temp, 2), action) 85 | 86 | def coords_to_state(self, coords): 87 | x = int((coords[0] - 50) / 100) 88 | y = int((coords[1] - 50) / 100) 89 | return [x, y] 90 | 91 | def state_to_coords(self, state): 92 | x = int(state[0] * 100 + 50) 93 | y = int(state[1] * 100 + 50) 94 | return [x, y] 95 | 96 | def reset(self): 97 | self.update() 98 | time.sleep(0.5) 99 | x, y = self.canvas.coords(self.rectangle) 100 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 101 | self.render() 102 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 103 | 104 | def step(self, action): 105 | state = self.canvas.coords(self.rectangle) 106 | base_action = np.array([0, 0]) 107 | self.render() 108 | 109 | if action == 0: # 상 110 | if state[1] > UNIT: 111 | base_action[1] -= UNIT 112 | elif action == 1: # 하 113 | if state[1] < (HEIGHT - 1) * UNIT: 114 | base_action[1] += UNIT 115 | elif action == 2: # 좌 116 | if state[0] > UNIT: 117 | base_action[0] -= UNIT 118 | elif action == 3: # 우 119 | if state[0] < (WIDTH - 1) * UNIT: 120 | base_action[0] += UNIT 121 | 122 | # 에이전트 이동 123 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 124 | # 에이전트(빨간 네모)를 가장 상위로 배치 125 | self.canvas.tag_raise(self.rectangle) 126 | next_state = self.canvas.coords(self.rectangle) 127 | 128 | # 보상 함수 129 | if next_state == self.canvas.coords(self.circle): 130 | reward = 100 131 | done = True 132 | elif next_state in [self.canvas.coords(self.triangle1), 133 | self.canvas.coords(self.triangle2)]: 134 | reward = -100 135 | done = True 136 | else: 137 | reward = 0 138 | done = False 139 | 140 | next_state = self.coords_to_state(next_state) 141 | return next_state, reward, done 142 | 143 | def render(self): 144 | time.sleep(0.03) 145 | self.update() 146 | -------------------------------------------------------------------------------- /1-grid-world/5-q-learning/q_learning_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from environment import Env 4 | from collections import defaultdict 5 | 6 | class QLearningAgent: 7 | def __init__(self, actions): 8 | # 행동 = [0, 1, 2, 3] 순서대로 상, 하, 좌, 우 9 | self.actions = actions 10 | self.learning_rate = 0.01 11 | self.discount_factor = 0.9 12 | self.epsilon = 0.9 13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 14 | 15 | # 샘플로부터 큐함수 업데이트 16 | def learn(self, state, action, reward, next_state): 17 | q_1 = self.q_table[state][action] 18 | # 벨만 최적 방정식을 사용한 큐함수의 업데이트 19 | q_2 = reward + self.discount_factor * max(self.q_table[next_state]) 20 | self.q_table[state][action] += self.learning_rate * (q_2 - q_1) 21 | 22 | # 큐함수에 의거하여 입실론 탐욕 정책에 따라서 행동을 반환 23 | def get_action(self, state): 24 | if np.random.rand() > self.epsilon: 25 | # 무작위 행동 반환 26 | action = np.random.choice(self.actions) 27 | else: 28 | # 큐함수에 따른 행동 반환 29 | state_action = self.q_table[state] 30 | action = self.arg_max(state_action) 31 | return action 32 | 33 | @staticmethod 34 | def arg_max(state_action): 35 | max_index_list = [] 36 | max_value = state_action[0] 37 | for index, value in enumerate(state_action): 38 | if value > max_value: 39 | max_index_list.clear() 40 | max_value = value 41 | max_index_list.append(index) 42 | elif value == max_value: 43 | max_index_list.append(index) 44 | return random.choice(max_index_list) 45 | 46 | if __name__ == "__main__": 47 | env = Env() 48 | agent = QLearningAgent(actions=list(range(env.n_actions))) 49 | 50 | for episode in range(1000): 51 | state = env.reset() 52 | 53 | while True: 54 | env.render() 55 | 56 | # 현재 상태에 대한 행동 선택 57 | action = agent.get_action(str(state)) 58 | # 행동을 취한 후 다음 상태, 보상 에피소드의 종료여부를 받아옴 59 | next_state, reward, done = env.step(action) 60 | 61 | # 로 큐함수를 업데이트 62 | agent.learn(str(state), action, reward, str(next_state)) 63 | state = next_state 64 | # 모든 큐함수를 화면에 표시 65 | env.print_value_all(agent.q_table) 66 | 67 | if done: 68 | break 69 | -------------------------------------------------------------------------------- /1-grid-world/img/circle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/circle.png -------------------------------------------------------------------------------- /1-grid-world/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/down.png -------------------------------------------------------------------------------- /1-grid-world/img/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/left.png -------------------------------------------------------------------------------- /1-grid-world/img/rectangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/rectangle.png -------------------------------------------------------------------------------- /1-grid-world/img/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/right.png -------------------------------------------------------------------------------- /1-grid-world/img/triangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/triangle.png -------------------------------------------------------------------------------- /1-grid-world/img/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/1-grid-world/img/up.png -------------------------------------------------------------------------------- /2-cartpole/1-dqn/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | # Taken from 5 | # https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 8 | 9 | 10 | class Memory(object): 11 | def __init__(self, capacity): 12 | self.memory = [] 13 | self.capacity = capacity 14 | self.position = 0 15 | 16 | def push(self, state, next_state, action, reward, mask): 17 | """Saves a transition.""" 18 | if len(self.memory) < self.capacity: 19 | self.memory.append(Transition(state, next_state, action, reward, mask)) 20 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 21 | self.position = (self.position + 1) % self.capacity 22 | 23 | def sample(self, batch_size): 24 | transitions = random.sample(self.memory, batch_size) 25 | batch = Transition(*zip(*transitions)) 26 | return batch 27 | 28 | def __len__(self): 29 | return len(self.memory) -------------------------------------------------------------------------------- /2-cartpole/1-dqn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class QNet(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(QNet, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, num_outputs) 14 | 15 | for m in self.modules(): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_uniform(m.weight) 18 | 19 | def forward(self, x): 20 | x = F.relu(self.fc1(x)) 21 | qvalue = self.fc2(x) 22 | return qvalue -------------------------------------------------------------------------------- /2-cartpole/1-dqn/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from model import QNet 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 12 | parser.add_argument('--load_model', type=str, default=None) 13 | parser.add_argument('--save_path', default='./save_model/', help='') 14 | args = parser.parse_args() 15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | 17 | 18 | def get_action(qvalue): 19 | _, action = torch.max(qvalue, 1) 20 | return action.numpy()[0] 21 | 22 | 23 | def main(): 24 | env = gym.make(args.env_name) 25 | env.seed(500) 26 | torch.manual_seed(500) 27 | 28 | num_inputs = env.observation_space.shape[0] 29 | num_actions = env.action_space.n 30 | print('state size:', num_inputs) 31 | print('action size:', num_actions) 32 | 33 | net = QNet(num_inputs, num_actions) 34 | net.load_state_dict(torch.load(args.save_path + 'model.pth')) 35 | 36 | net.to(device) 37 | net.eval() 38 | running_score = 0 39 | steps = 0 40 | 41 | for e in range(5): 42 | done = False 43 | 44 | score = 0 45 | state = env.reset() 46 | state = torch.Tensor(state).to(device) 47 | state = state.unsqueeze(0) 48 | 49 | while not done: 50 | env.render() 51 | 52 | steps += 1 53 | qvalue = net(state) 54 | action = get_action(qvalue) 55 | next_state, reward, done, _ = env.step(action) 56 | 57 | next_state = torch.Tensor(next_state).to(device) 58 | next_state = next_state.unsqueeze(0) 59 | 60 | score += reward 61 | state = next_state 62 | 63 | print('{} episode | score: {:.2f}'.format(e, score)) 64 | 65 | 66 | if __name__=="__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /2-cartpole/1-dqn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from model import QNet 12 | from memory import Memory 13 | from tensorboardX import SummaryWriter 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 17 | parser.add_argument('--load_model', type=str, default=None) 18 | parser.add_argument('--save_path', default='./save_model/', help='') 19 | parser.add_argument('--render', default=False, action="store_true") 20 | parser.add_argument('--gamma', default=0.99, help='') 21 | parser.add_argument('--batch_size', default=32, help='') 22 | parser.add_argument('--initial_exploration', default=10000, help='') 23 | parser.add_argument('--update_target', default=100, help='') 24 | parser.add_argument('--log_interval', default=10, help='') 25 | parser.add_argument('--goal_score', default=400, help='') 26 | parser.add_argument('--logdir', type=str, default='./logs', 27 | help='tensorboardx logs directory') 28 | args = parser.parse_args() 29 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | 31 | def train_model(net, target_net, optimizer, batch, batch_size): 32 | states = torch.stack(batch.state).to(device) 33 | next_states = torch.stack(batch.next_state).to(device) 34 | actions = torch.Tensor(batch.action).long().to(device) 35 | rewards = torch.Tensor(batch.reward).to(device) 36 | masks = torch.Tensor(batch.mask).to(device) 37 | 38 | pred = net(states).squeeze(1) 39 | next_pred = target_net(next_states).squeeze(1) 40 | 41 | one_hot_action = torch.zeros(batch_size, pred.size(-1)) 42 | one_hot_action.scatter_(1, actions.unsqueeze(1), 1) 43 | pred = torch.sum(pred.mul(one_hot_action), dim=1) 44 | 45 | target = rewards + masks * args.gamma * next_pred.max(1)[0] 46 | 47 | loss = F.mse_loss(pred, target.detach()) 48 | optimizer.zero_grad() 49 | loss.backward() 50 | optimizer.step() 51 | 52 | 53 | def get_action(epsilon, qvalue, num_actions): 54 | if np.random.rand() <= epsilon: 55 | return random.randrange(num_actions) 56 | else: 57 | _, action = torch.max(qvalue, 1) 58 | return action.numpy()[0] 59 | 60 | 61 | def update_target_model(net, target_net): 62 | target_net.load_state_dict(net.state_dict()) 63 | 64 | 65 | def main(): 66 | env = gym.make(args.env_name) 67 | env.seed(500) 68 | torch.manual_seed(500) 69 | 70 | num_inputs = env.observation_space.shape[0] 71 | num_actions = env.action_space.n 72 | print('state size:', num_inputs) 73 | print('action size:', num_actions) 74 | 75 | net = QNet(num_inputs, num_actions) 76 | target_net = QNet(num_inputs, num_actions) 77 | update_target_model(net, target_net) 78 | 79 | optimizer = optim.Adam(net.parameters(), lr=0.001) 80 | writer = SummaryWriter('logs') 81 | 82 | if not os.path.isdir(args.save_path): 83 | os.makedirs(args.save_path) 84 | 85 | net.to(device) 86 | target_net.to(device) 87 | net.train() 88 | target_net.train() 89 | memory = Memory(10000) 90 | running_score = 0 91 | epsilon = 1.0 92 | steps = 0 93 | 94 | for e in range(3000): 95 | done = False 96 | 97 | score = 0 98 | state = env.reset() 99 | state = torch.Tensor(state).to(device) 100 | state = state.unsqueeze(0) 101 | 102 | while not done: 103 | if args.render: 104 | env.render() 105 | 106 | steps += 1 107 | qvalue = net(state) 108 | action = get_action(epsilon, qvalue, num_actions) 109 | next_state, reward, done, _ = env.step(action) 110 | 111 | next_state = torch.Tensor(next_state) 112 | next_state = next_state.unsqueeze(0) 113 | 114 | mask = 0 if done else 1 115 | reward = reward if not done or score == 499 else -1 116 | memory.push(state, next_state, action, reward, mask) 117 | 118 | score += reward 119 | state = next_state 120 | 121 | if steps > args.initial_exploration: 122 | epsilon -= 0.00005 123 | epsilon = max(epsilon, 0.1) 124 | 125 | batch = memory.sample(args.batch_size) 126 | train_model(net, target_net, optimizer, batch, args.batch_size) 127 | 128 | if steps % args.update_target: 129 | update_target_model(net, target_net) 130 | 131 | score = score if score == 500.0 else score + 1 132 | running_score = 0.99 * running_score + 0.01 * score 133 | if e % args.log_interval == 0: 134 | print('{} episode | score: {:.2f} | epsilon: {:.2f}'.format( 135 | e, running_score, epsilon)) 136 | writer.add_scalar('log/score', float(score), running_score) 137 | 138 | if running_score > args.goal_score: 139 | ckpt_path = args.save_path + 'model.pth' 140 | torch.save(net.state_dict(), ckpt_path) 141 | print('running score exceeds 400 so end') 142 | break 143 | 144 | 145 | if __name__=="__main__": 146 | main() 147 | -------------------------------------------------------------------------------- /2-cartpole/2-actor-critic/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import Model 11 | from tensorboardX import SummaryWriter 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 15 | parser.add_argument('--load_model', type=str, default=None) 16 | parser.add_argument('--save_path', default='./save_model/', help='') 17 | parser.add_argument('--render', default=False, action="store_true") 18 | parser.add_argument('--gamma', default=0.99, help='') 19 | parser.add_argument('--log_interval', default=10, help='') 20 | parser.add_argument('--logdir', type=str, default='./logs', 21 | help='tensorboardx logs directory') 22 | args = parser.parse_args() 23 | 24 | 25 | def train_model(sample, policy, value): 26 | state = sample[0] 27 | action = sample[1] 28 | reward = sample[2] 29 | next_state = sample[3] 30 | mask = sample[4] 31 | 32 | _, next_value = net(next_state) 33 | pred = reward + mask * args.gamma * next_value[0] 34 | td_error = pred - value[0] 35 | 36 | log_policy = torch.log(policy[0])[action] 37 | loss1 = - log_policy * td_error.item() 38 | loss2 = F.mse_loss(value[0], pred.detach()) 39 | entropy = torch.log(policy[0]) * policy[0] 40 | loss = loss1 + loss2 - 0.1 * entropy.sum() 41 | 42 | optimizer.zero_grad() 43 | loss.backward() 44 | optimizer.step() 45 | 46 | 47 | def get_action(policy): 48 | policy = policy.data.numpy()[0] 49 | action = np.random.choice(num_actions, 1, p=policy)[0] 50 | return action 51 | 52 | 53 | if __name__=="__main__": 54 | env = gym.make(args.env_name) 55 | env.seed(500) 56 | torch.manual_seed(500) 57 | 58 | num_inputs = env.observation_space.shape[0] 59 | num_actions = env.action_space.n 60 | print('state size:', num_inputs) 61 | print('action size:', num_actions) 62 | 63 | net = Model(num_inputs, num_actions) 64 | optimizer = optim.Adam(net.parameters(), lr=0.001) 65 | writer = SummaryWriter('logs') 66 | 67 | if not os.path.isdir(args.save_path): 68 | os.makedirs(args.save_path) 69 | 70 | net.train() 71 | running_score = 0 72 | 73 | for e in range(3000): 74 | done = False 75 | score = 0 76 | 77 | state = env.reset() 78 | state = torch.Tensor(state) 79 | state = state.unsqueeze(0) 80 | 81 | while not done: 82 | if args.render: 83 | env.render() 84 | 85 | policy, value = net(state) 86 | action = get_action(policy) 87 | 88 | next_state, reward, done, _ = env.step(action) 89 | next_state = torch.Tensor(next_state) 90 | next_state = next_state.unsqueeze(0) 91 | 92 | mask = 0 if done else 1 93 | reward = reward if not done or score == 499 else -1 94 | sample = [state, action, reward, next_state, mask] 95 | train_model(sample, policy, value) 96 | 97 | score += reward 98 | state = next_state 99 | 100 | score = score if score == 500.0 else score + 1 101 | running_score = 0.99 * running_score + 0.01 * score 102 | if e % args.log_interval == 0: 103 | print('{} episode score: {:.2f}'.format(e, running_score)) 104 | writer.add_scalar('log/score', float(score), running_score) 105 | 106 | if running_score > env.spec.reward_threshold: 107 | running_score = int(running_score) 108 | ckpt_path = model_path + 'ckpt_'+ str(e)+'.pth' 109 | torch.save(net.state_dict(), ckpt_path) 110 | print('running score exceeds 485 so end') 111 | break 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /2-cartpole/2-actor-critic/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ActorCritic(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(ActorCritic, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, 1) 16 | 17 | for m in self.modules(): 18 | if isinstance(m, nn.Linear): 19 | nn.init.xavier_uniform(m.weight) 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.relu(self.fc2(x)) 24 | policy = F.softmax(self.fc_actor(x)) 25 | value = self.fc_critic(x) 26 | return policy, value -------------------------------------------------------------------------------- /2-cartpole/2-actor-critic/save_model/ckpt_1157.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reinforcement-learning-kr/reinforcement-learning-pytorch/d047f17c3f4d0715c5fae9246b8d9dc79797bdca/2-cartpole/2-actor-critic/save_model/ckpt_1157.pth -------------------------------------------------------------------------------- /2-cartpole/2-actor-critic/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from model import ActorCritic 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 12 | parser.add_argument('--load_model', type=str, default=None) 13 | parser.add_argument('--save_path', default='./save_model/', help='') 14 | args = parser.parse_args() 15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | 17 | 18 | def get_action(policy, num_actions): 19 | policy = policy.data.numpy()[0] 20 | action = np.random.choice(num_actions, 1, p=policy)[0] 21 | return action 22 | 23 | 24 | def main(): 25 | env = gym.make(args.env_name) 26 | env.seed(500) 27 | torch.manual_seed(500) 28 | 29 | num_inputs = env.observation_space.shape[0] 30 | num_actions = env.action_space.n 31 | print('state size:', num_inputs) 32 | print('action size:', num_actions) 33 | 34 | net = ActorCritic(num_inputs, num_actions) 35 | net.load_state_dict(torch.load(args.save_path + 'model.pth')) 36 | 37 | net.to(device) 38 | net.eval() 39 | running_score = 0 40 | steps = 0 41 | 42 | for e in range(5): 43 | done = False 44 | 45 | score = 0 46 | state = env.reset() 47 | state = torch.Tensor(state).to(device) 48 | state = state.unsqueeze(0) 49 | 50 | while not done: 51 | env.render() 52 | 53 | steps += 1 54 | policy, value = net(state) 55 | action = get_action(policy, num_actions) 56 | next_state, reward, done, _ = env.step(action) 57 | 58 | next_state = torch.Tensor(next_state).to(device) 59 | next_state = next_state.unsqueeze(0) 60 | 61 | score += reward 62 | state = next_state 63 | 64 | print('{} episode | score: {:.2f}'.format(e, score)) 65 | 66 | 67 | if __name__=="__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /2-cartpole/2-actor-critic/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | from model import ActorCritic 11 | from tensorboardX import SummaryWriter 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 15 | parser.add_argument('--load_model', type=str, default=None) 16 | parser.add_argument('--save_path', default='./save_model/', help='') 17 | parser.add_argument('--render', default=False, action="store_true") 18 | parser.add_argument('--gamma', default=0.99, help='') 19 | parser.add_argument('--goal_score', default=400, help='') 20 | parser.add_argument('--log_interval', default=10, help='') 21 | parser.add_argument('--logdir', type=str, default='./logs', 22 | help='tensorboardx logs directory') 23 | args = parser.parse_args() 24 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 25 | 26 | 27 | def train_model(net, optimizer, transition, policy, value): 28 | state, next_state, action, reward, mask = transition 29 | 30 | _, next_value = net(next_state) 31 | pred = reward + mask * args.gamma * next_value[0] 32 | td_error = pred - value[0] 33 | 34 | log_policy = torch.log(policy[0])[action] 35 | loss1 = - log_policy * td_error.item() 36 | loss2 = F.mse_loss(value[0], pred.detach()) 37 | entropy = torch.log(policy[0]) * policy[0] 38 | loss = loss1 + loss2 - 0.1 * entropy.sum() 39 | 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | 45 | def get_action(policy, num_actions): 46 | policy = policy.data.numpy()[0] 47 | action = np.random.choice(num_actions, 1, p=policy)[0] 48 | return action 49 | 50 | 51 | def main(): 52 | env = gym.make(args.env_name) 53 | env.seed(500) 54 | torch.manual_seed(500) 55 | 56 | num_inputs = env.observation_space.shape[0] 57 | num_actions = env.action_space.n 58 | print('state size:', num_inputs) 59 | print('action size:', num_actions) 60 | 61 | net = ActorCritic(num_inputs, num_actions) 62 | optimizer = optim.Adam(net.parameters(), lr=0.001) 63 | writer = SummaryWriter('logs') 64 | 65 | if not os.path.isdir(args.save_path): 66 | os.makedirs(args.save_path) 67 | 68 | net.to(device) 69 | net.train() 70 | running_score = 0 71 | 72 | for e in range(3000): 73 | done = False 74 | score = 0 75 | 76 | state = env.reset() 77 | state = torch.Tensor(state).to(device) 78 | state = state.unsqueeze(0) 79 | 80 | while not done: 81 | if args.render: 82 | env.render() 83 | 84 | policy, value = net(state) 85 | action = get_action(policy, num_actions) 86 | 87 | next_state, reward, done, _ = env.step(action) 88 | next_state = torch.Tensor(next_state).to(device) 89 | next_state = next_state.unsqueeze(0) 90 | 91 | mask = 0 if done else 1 92 | reward = reward if not done or score == 499 else -1 93 | transition = [state, next_state, action, reward, mask] 94 | train_model(net, optimizer, transition, policy, value) 95 | 96 | score += reward 97 | state = next_state 98 | 99 | score = score if score == 500.0 else score + 1 100 | running_score = 0.99 * running_score + 0.01 * score 101 | if e % args.log_interval == 0: 102 | print('{} episode | score: {:.2f}'.format(e, running_score)) 103 | writer.add_scalar('log/score', float(score), running_score) 104 | 105 | if running_score > args.goal_score: 106 | ckpt_path = args.save_path + 'model.pth' 107 | torch.save(net.state_dict(), ckpt_path) 108 | print('running score exceeds 400 so end') 109 | break 110 | 111 | 112 | if __name__=="__main__": 113 | main() 114 | -------------------------------------------------------------------------------- /2-cartpole/3-multi-step/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | # Taken from 5 | # https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb 6 | 7 | Transition = namedtuple('Transition', ('state', 'next_state', 'action', 'reward', 'mask')) 8 | 9 | 10 | class Memory(object): 11 | def __init__(self, capacity): 12 | self.memory = [] 13 | self.capacity = capacity 14 | self.position = 0 15 | 16 | def push(self, state, next_state, action, reward, mask): 17 | """Saves a transition.""" 18 | if len(self.memory) < self.capacity: 19 | self.memory.append(Transition(state, next_state, action, reward, mask)) 20 | self.memory[self.position] = Transition(state, next_state, action, reward, mask) 21 | self.position = (self.position + 1) % self.capacity 22 | 23 | def sample(self): 24 | transitions = self.memory 25 | batch = Transition(*zip(*transitions)) 26 | return batch 27 | 28 | def __len__(self): 29 | return len(self.memory) -------------------------------------------------------------------------------- /2-cartpole/3-multi-step/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Model(nn.Module): 7 | def __init__(self, num_inputs, num_outputs): 8 | super(Model, self).__init__() 9 | self.num_inputs = num_inputs 10 | self.num_outputs = num_outputs 11 | 12 | self.fc1 = nn.Linear(num_inputs, 128) 13 | self.fc2 = nn.Linear(128, 128) 14 | self.fc_actor = nn.Linear(128, num_outputs) 15 | self.fc_critic = nn.Linear(128, 1) 16 | 17 | def forward(self, x): 18 | x = F.relu(self.fc1(x)) 19 | x = F.relu(self.fc2(x)) 20 | policy = F.softmax(self.fc_actor(x)) 21 | value = self.fc_critic(x) 22 | return policy, value -------------------------------------------------------------------------------- /2-cartpole/3-multi-step/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from model import ActorCritic 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 12 | parser.add_argument('--load_model', type=str, default=None) 13 | parser.add_argument('--save_path', default='./save_model/', help='') 14 | args = parser.parse_args() 15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | 17 | 18 | def get_action(policy, num_actions): 19 | policy = policy.data.numpy()[0] 20 | action = np.random.choice(num_actions, 1, p=policy)[0] 21 | return action 22 | 23 | 24 | def main(): 25 | env = gym.make(args.env_name) 26 | env.seed(500) 27 | torch.manual_seed(500) 28 | 29 | num_inputs = env.observation_space.shape[0] 30 | num_actions = env.action_space.n 31 | print('state size:', num_inputs) 32 | print('action size:', num_actions) 33 | 34 | net = ActorCritic(num_inputs, num_actions) 35 | net.load_state_dict(torch.load(args.save_path + 'model.pth')) 36 | 37 | net.to(device) 38 | net.eval() 39 | running_score = 0 40 | steps = 0 41 | 42 | for e in range(5): 43 | done = False 44 | 45 | score = 0 46 | state = env.reset() 47 | state = torch.Tensor(state).to(device) 48 | state = state.unsqueeze(0) 49 | 50 | while not done: 51 | env.render() 52 | 53 | steps += 1 54 | policy, value = net(state) 55 | action = get_action(policy, num_actions) 56 | next_state, reward, done, _ = env.step(action) 57 | 58 | next_state = torch.Tensor(next_state).to(device) 59 | next_state = next_state.unsqueeze(0) 60 | 61 | score += reward 62 | state = next_state 63 | 64 | print('{} episode | score: {:.2f}'.format(e, score)) 65 | 66 | 67 | if __name__=="__main__": 68 | main() 69 | -------------------------------------------------------------------------------- /2-cartpole/3-multi-step/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | import torch.optim as optim 9 | import torch.nn.functional as F 10 | 11 | from memory import Memory 12 | from model import ActorCritic 13 | from tensorboardX import SummaryWriter 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--env_name', type=str, default="CartPole-v1", help='') 17 | parser.add_argument('--load_model', type=str, default=None) 18 | parser.add_argument('--save_path', default='./save_model/', help='') 19 | parser.add_argument('--render', default=False, action="store_true") 20 | parser.add_argument('--gamma', default=0.99, help='') 21 | parser.add_argument('--goal_score', default=400, help='') 22 | parser.add_argument('--log_interval', default=10, help='') 23 | parser.add_argument('--logdir', type=str, default='./logs', 24 | help='tensorboardx logs directory') 25 | args = parser.parse_args() 26 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 27 | 28 | 29 | def train_model(net, optimizer, batch): 30 | states = torch.stack(batch.state).to(device) 31 | next_states = torch.stack(batch.next_state).to(device) 32 | actions = torch.Tensor(batch.action).long().to(device) 33 | rewards = torch.Tensor(batch.reward).to(device) 34 | masks = torch.Tensor(batch.mask).to(device) 35 | 36 | policy, value = net(states[0]) 37 | _, last_value = net(next_states[-1]) 38 | 39 | running_returns = last_value[0] 40 | for t in reversed(range(0, len(rewards))): 41 | running_returns = rewards[t] + args.gamma * running_returns * masks[t] 42 | 43 | pred = running_returns 44 | td_error = pred - value[0] 45 | 46 | log_policy = torch.log(policy[0] + 1e-5)[actions[0]] 47 | loss1 = - log_policy * td_error.item() 48 | loss2 = F.mse_loss(value[0], pred.detach()) 49 | entropy = torch.log(policy + 1e-5) * policy 50 | loss = loss1 + loss2 - 0.01 * entropy.sum() 51 | 52 | optimizer.zero_grad() 53 | loss.backward() 54 | optimizer.step() 55 | 56 | 57 | def get_action(policy, num_actions): 58 | policy = policy.data.numpy()[0] 59 | action = np.random.choice(num_actions, 1, p=policy)[0] 60 | return action 61 | 62 | 63 | def main(): 64 | env = gym.make(args.env_name) 65 | env.seed(500) 66 | torch.manual_seed(500) 67 | 68 | num_inputs = env.observation_space.shape[0] 69 | num_actions = env.action_space.n 70 | print('state size:', num_inputs) 71 | print('action size:', num_actions) 72 | 73 | net = ActorCritic(num_inputs, num_actions) 74 | optimizer = optim.Adam(net.parameters(), lr=0.0005) 75 | writer = SummaryWriter('logs') 76 | 77 | if not os.path.isdir(args.save_path): 78 | os.makedirs(args.save_path) 79 | 80 | net.to(device) 81 | net.train() 82 | memory = Memory(100) 83 | running_score = 0 84 | 85 | for e in range(3000): 86 | done = False 87 | score = 0 88 | steps = 0 89 | 90 | state = env.reset() 91 | state = torch.Tensor(state).to(device) 92 | state = state.unsqueeze(0) 93 | 94 | while not done: 95 | if args.render: 96 | env.render() 97 | 98 | steps += 1 99 | policy, value = net(state) 100 | action = get_action(policy, num_actions) 101 | 102 | next_state, reward, done, _ = env.step(action) 103 | next_state = torch.Tensor(next_state).to(device) 104 | next_state = next_state.unsqueeze(0) 105 | 106 | mask = 0 if done else 1 107 | reward = reward if not done or score == 499 else -1 108 | 109 | memory.push(state, next_state, action, reward, mask) 110 | 111 | if len(memory) == 5 or done: 112 | batch = memory.sample() 113 | train_model(net, optimizer, batch) 114 | memory = Memory(100) 115 | 116 | score += reward 117 | state = next_state 118 | 119 | score = score if score == 500.0 else score + 1 120 | running_score = 0.99 * running_score + 0.01 * score 121 | if e % args.log_interval == 0: 122 | print('{} episode | score: {:.2f}'.format(e, running_score)) 123 | writer.add_scalar('log/score', float(score), running_score) 124 | 125 | if running_score > args.goal_score: 126 | ckpt_path = args.save_path + 'model.pth' 127 | torch.save(net.state_dict(), ckpt_path) 128 | print('running score exceeds 400 so end') 129 | break 130 | 131 | 132 | if __name__=="__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /3-atari/1-dqn/memory.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import namedtuple 3 | 4 | # Taken from 5 | # https://github.com/pytorch/tutorials/blob/master/Reinforcement%20(Q-)Learning%20with%20PyTorch.ipynb 6 | 7 | Transition = namedtuple('Transition', ('history', 'next_history', 'action', 'reward', 'mask')) 8 | 9 | 10 | class Memory(object): 11 | def __init__(self, capacity): 12 | self.memory = [] 13 | self.capacity = capacity 14 | self.position = 0 15 | 16 | def push(self, history, next_history, action, reward, mask): 17 | """Saves a transition.""" 18 | if len(self.memory) < self.capacity: 19 | self.memory.append(Transition(history, next_history, action, reward, mask)) 20 | self.memory[self.position] = Transition(history, next_history, action, reward, mask) 21 | self.position = (self.position + 1) % self.capacity 22 | 23 | def sample(self, batch_size): 24 | transitions = random.sample(self.memory, batch_size) 25 | batch = Transition(*zip(*transitions)) 26 | return batch 27 | 28 | def __len__(self): 29 | return len(self.memory) -------------------------------------------------------------------------------- /3-atari/1-dqn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class QNet(nn.Module): 7 | def __init__(self, num_outputs): 8 | self.num_outputs = num_outputs 9 | super(QNet, self).__init__() 10 | self.conv1 = nn.Conv2d(in_channels=4, 11 | out_channels=32, 12 | kernel_size=8, 13 | stride=4) 14 | 15 | self.conv2 = nn.Conv2d(in_channels=32, 16 | out_channels=64, 17 | kernel_size=4, 18 | stride=2) 19 | 20 | self.conv3 = nn.Conv2d(in_channels=64, 21 | out_channels=64, 22 | kernel_size=3, 23 | stride=1) 24 | 25 | self.fc1 = nn.Linear(7*7*64, 512) 26 | self.fc2 = nn.Linear(512, num_outputs) 27 | 28 | for m in self.modules(): 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_uniform(m.weight) 31 | elif isinstance(m, nn.Conv2d): 32 | nn.init.kaiming_normal_(m.weight, mode='fan_out', 33 | nonlinearity='relu') 34 | 35 | def forward(self, x): 36 | x = F.relu(self.conv1(x)) 37 | x = F.relu(self.conv2(x)) 38 | x = F.relu(self.conv3(x)) 39 | x = x.view(x.size(0), -1) 40 | x = F.relu(self.fc1(x)) 41 | qvalue = self.fc2(x) 42 | return qvalue -------------------------------------------------------------------------------- /3-atari/1-dqn/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from utils import pre_process, get_action 9 | from model import QNet 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--env_name', type=str, default="BreakoutDeterministic-v4", help='') 13 | parser.add_argument('--load_model', type=str, default=None) 14 | parser.add_argument('--save_path', default='./save_model/', help='') 15 | args = parser.parse_args() 16 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 17 | 18 | 19 | def main(): 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | img_shape = env.observation_space.shape 25 | num_actions = 3 26 | print('image size:', img_shape) 27 | print('action size:', num_actions) 28 | 29 | net = QNet(num_actions) 30 | net.load_state_dict(torch.load(args.save_path + 'model.pth')) 31 | 32 | net.to(device) 33 | net.eval() 34 | 35 | epsilon = 0 36 | 37 | for e in range(5): 38 | done = False 39 | 40 | score = 0 41 | state = env.reset() 42 | 43 | state = pre_process(state) 44 | state = torch.Tensor(state).to(device) 45 | history = torch.stack((state, state, state, state)) 46 | 47 | for i in range(3): 48 | action = env.action_space.sample() 49 | state, reward, done, info = env.step(action) 50 | state = pre_process(state) 51 | state = torch.Tensor(state).to(device) 52 | state = state.unsqueeze(0) 53 | history = torch.cat((state, history[:-1]), dim=0) 54 | 55 | while not done: 56 | if args.render: 57 | env.render() 58 | 59 | steps += 1 60 | qvalue = net(history.unsqueeze(0)) 61 | action = get_action(0, qvalue, num_actions) 62 | 63 | next_state, reward, done, info = env.step(action + 1) 64 | 65 | next_state = pre_process(next_state) 66 | next_state = torch.Tensor(next_state).to(device) 67 | next_state = next_state.unsqueeze(0) 68 | next_history = torch.cat((next_state, history[:-1]), dim=0) 69 | 70 | score += reward 71 | history = next_history 72 | 73 | print('{} episode | score: {:.2f}'.format(e, score)) 74 | 75 | 76 | if __name__=="__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /3-atari/1-dqn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | import torch 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | 12 | from utils import pre_process, get_action, update_target_model 13 | from model import QNet 14 | from memory import Memory 15 | from tensorboardX import SummaryWriter 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--env_name', type=str, default="BreakoutDeterministic-v4", help='') 19 | parser.add_argument('--load_model', type=str, default=None) 20 | parser.add_argument('--save_path', default='./save_model/', help='') 21 | parser.add_argument('--render', default=False, action="store_true") 22 | parser.add_argument('--gamma', default=0.99, help='') 23 | parser.add_argument('--batch_size', default=32, help='') 24 | parser.add_argument('--initial_exploration', default=1000, help='') 25 | parser.add_argument('--update_target', default=10000, help='') 26 | parser.add_argument('--log_interval', default=1, help='') 27 | parser.add_argument('--goal_score', default=300, help='') 28 | parser.add_argument('--logdir', type=str, default='./logs', 29 | help='tensorboardx logs directory') 30 | args = parser.parse_args() 31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | 33 | 34 | def train_model(net, target_net, optimizer, batch): 35 | history = torch.stack(batch.history).to(device) 36 | next_history = torch.stack(batch.next_history).to(device) 37 | actions = torch.Tensor(batch.action).long().to(device) 38 | rewards = torch.Tensor(batch.reward).to(device) 39 | masks = torch.Tensor(batch.mask).to(device) 40 | 41 | pred = net(history).squeeze(1) 42 | next_pred = target_net(next_history).squeeze(1) 43 | one_hot_action = torch.zeros(args.batch_size, pred.size(-1)) 44 | one_hot_action = one_hot_action.to(device) 45 | one_hot_action.scatter_(1, actions.unsqueeze(1), 1) 46 | pred = torch.sum(pred.mul(one_hot_action), dim=1) 47 | target = rewards + args.gamma * next_pred.max(1)[0] * masks 48 | 49 | loss = F.smooth_l1_loss(pred, target.detach(), size_average=True) 50 | optimizer.zero_grad() 51 | loss.backward() 52 | optimizer.step() 53 | return loss.cpu().data 54 | 55 | 56 | def main(): 57 | env = gym.make(args.env_name) 58 | env.seed(500) 59 | torch.manual_seed(500) 60 | 61 | img_shape = env.observation_space.shape 62 | num_actions = 3 63 | print('image size:', img_shape) 64 | print('action size:', num_actions) 65 | 66 | net = QNet(num_actions) 67 | target_net = QNet(num_actions) 68 | update_target_model(net, target_net) 69 | 70 | optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01) 71 | writer = SummaryWriter('logs') 72 | 73 | if not os.path.isdir(args.save_path): 74 | os.makedirs(args.save_path) 75 | 76 | net.to(device) 77 | target_net.to(device) 78 | net.train() 79 | target_net.train() 80 | memory = Memory(100000) 81 | running_score = 0 82 | epsilon = 1.0 83 | steps = 0 84 | 85 | for e in range(10000): 86 | done = False 87 | dead = False 88 | 89 | score = 0 90 | avg_loss = [] 91 | start_life = 5 92 | state = env.reset() 93 | 94 | state = pre_process(state) 95 | state = torch.Tensor(state).to(device) 96 | history = torch.stack((state, state, state, state)) 97 | 98 | for i in range(3): 99 | action = env.action_space.sample() 100 | state, reward, done, info = env.step(action) 101 | state = pre_process(state) 102 | state = torch.Tensor(state).to(device) 103 | state = state.unsqueeze(0) 104 | history = torch.cat((state, history[:-1]), dim=0) 105 | 106 | while not done: 107 | if args.render: 108 | env.render() 109 | 110 | steps += 1 111 | qvalue = net(history.unsqueeze(0)) 112 | action = get_action(epsilon, qvalue, num_actions) 113 | 114 | next_state, reward, done, info = env.step(action + 1) 115 | 116 | next_state = pre_process(next_state) 117 | next_state = torch.Tensor(next_state).to(device) 118 | next_state = next_state.unsqueeze(0) 119 | next_history = torch.cat((next_state, history[:-1]), dim=0) 120 | 121 | if start_life > info['ale.lives']: 122 | dead = True 123 | start_life = info['ale.lives'] 124 | 125 | score += reward 126 | reward = np.clip(reward, -1, 1) 127 | 128 | mask = 0 if dead else 1 129 | memory.push(history.cpu(), next_history.cpu(), action, reward, mask) 130 | 131 | if dead: 132 | dead = False 133 | 134 | if steps > args.initial_exploration: 135 | epsilon -= 1e-6 136 | epsilon = max(epsilon, 0.1) 137 | 138 | batch = memory.sample(args.batch_size) 139 | loss = train_model(net, target_net, optimizer, batch) 140 | 141 | if steps % args.update_target: 142 | update_target_model(net, target_net) 143 | else: 144 | loss = 0 145 | 146 | avg_loss.append(loss) 147 | history = next_history 148 | 149 | 150 | if e % args.log_interval == 0: 151 | print('{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'.format( 152 | e, score, epsilon, steps, np.mean(avg_loss))) 153 | writer.add_scalar('log/score', float(score), steps) 154 | writer.add_scalar('log/score', np.mean(avg_loss), steps) 155 | 156 | if score > args.goal_score: 157 | ckpt_path = args.save_path + 'model.pth' 158 | torch.save(net.state_dict(), ckpt_path) 159 | print('running score exceeds 400 so end') 160 | break 161 | 162 | if __name__=="__main__": 163 | main() -------------------------------------------------------------------------------- /3-atari/1-dqn/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import numpy as np 4 | from skimage.color import rgb2gray 5 | from skimage.transform import resize 6 | 7 | 8 | def pre_process(image): 9 | image = np.array(image) 10 | image = resize(image, (84, 84, 3)) 11 | image = rgb2gray(image) 12 | return image 13 | 14 | 15 | def get_action(epsilon, qvalue, num_actions): 16 | if np.random.rand() <= epsilon: 17 | return random.randrange(num_actions) 18 | else: 19 | _, action = torch.max(qvalue, 1) 20 | return action.cpu().numpy()[0] 21 | 22 | 23 | def update_target_model(net, target_net): 24 | target_net.load_state_dict(net.state_dict()) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Reinforcement Learning KR 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # reinforcement-learning-pytorch 2 | Minimal and Clean Reinforcement Learning Examples in PyTorch 3 | --------------------------------------------------------------------------------