├── .gitignore
├── 1-grid-world
├── 1-policy-iteration
│ ├── environment.py
│ └── policy_iteration.py
├── 2-value-iteration
│ ├── environment.py
│ └── value_iteration.py
├── 3-monte-carlo
│ ├── environment.py
│ └── mc_agent.py
├── 4-sarsa
│ ├── .python-version
│ ├── environment.py
│ └── sarsa_agent.py
├── 5-q-learning
│ ├── .python-version
│ ├── environment.py
│ └── q_learning_agent.py
├── 6-deep-sarsa
│ ├── deep_sarsa_agent.py
│ ├── environment.py
│ ├── save_graph
│ │ └── deep_sarsa_trained.png
│ └── save_model
│ │ └── deep_sarsa_trained.h5
├── 7-reinforce
│ ├── environment.py
│ ├── reinforce_agent.py
│ ├── save_graph
│ │ └── reinforce_trained.png
│ └── save_model
│ │ └── reinforce_trained.h5
├── README.md
├── gridworld.png
└── img
│ ├── circle.png
│ ├── down.png
│ ├── left.png
│ ├── rectangle.png
│ ├── right.png
│ ├── triangle.png
│ └── up.png
├── 2-cartpole
├── 1-dqn
│ ├── cartpole_dqn.py
│ ├── save_graph
│ │ └── cartpole_dqn.png
│ └── save_model
│ │ └── cartpole_dqn_trained.h5
├── 2-actor-critic
│ ├── cartpole_a2c.py
│ ├── save_graph
│ │ └── cartpole_a2c.png
│ └── save_model
│ │ ├── cartpole_actor_trained.h5
│ │ └── cartpole_critic_trained.h5
├── LICENSE
├── README.md
└── cartpole.png
├── 3-atari
├── 1-breakout
│ ├── breakout_a3c.py
│ ├── breakout_dqn.py
│ ├── play_a3c_model.py
│ ├── play_dqn_model.py
│ ├── save_model
│ │ ├── breakout_a3c_1_actor.h5
│ │ ├── breakout_a3c_1_critic.h5
│ │ ├── breakout_a3c_2_actor.h5
│ │ ├── breakout_a3c_2_critic.h5
│ │ ├── breakout_a3c_3_actor.h5
│ │ ├── breakout_a3c_3_critic.h5
│ │ ├── breakout_a3c_4_actor.h5
│ │ ├── breakout_a3c_4_critic.h5
│ │ ├── breakout_a3c_5_actor.h5
│ │ ├── breakout_a3c_5_critic.h5
│ │ ├── breakout_dqn.h5
│ │ ├── breakout_dqn_1.h5
│ │ ├── breakout_dqn_2.h5
│ │ ├── breakout_dqn_3.h5
│ │ ├── breakout_dqn_4.h5
│ │ └── breakout_dqn_5.h5
│ └── summary
│ │ ├── breakout_a3c
│ │ └── events.out.tfevents.1497264638
│ │ └── breakout_dqn
│ │ └── events.out.tfevents.1496968668.young-System-Product-Name
└── LICENSE
├── LICENSE
├── README.md
├── images
└── Reinforcement-Learning.png
├── requirements.txt
└── wiki
├── README.md
├── how-to-windows(english).md
├── img
├── how-to-windows.png
├── link-env-with-pychar-1.png
├── link-env-with-pychar-2.png
├── link-env-with-pychar.png
├── numpy_install.png
├── numpy_install2.png
├── numpy_install3.png
├── python3png.png
├── python_install.png
├── win_atari.png
├── win_atari.py3.png
├── win_breakout.png
├── win_breakout2.png
├── win_git.png
├── win_git2.png
├── win_gym.png
├── win_make.png
├── win_make2.png
├── win_msys2.png
├── win_msys2_2.png
├── win_msys2_3.png
├── win_msys2_4.png
├── win_msys2_5.png
├── win_openai_gym.png
├── win_openai_gym2.png
├── win_openai_gym3.png
├── win_openai_gym4.png
├── win_openai_gym5.png
├── win_pycharm_install1.png
├── win_pycharm_project.png
├── win_pycharm_project2.png
├── win_pycharm_setting2.png
├── win_pycharm_settings.png
├── win_setting.png
├── win_setting2.png
├── win_setting3.png
├── win_setting4.png
├── win_setup.py.png
├── win_setup.py2.png
└── win_xming.png
├── install_guide_osx.md
├── install_guide_ubuntu.md
├── install_guide_window.md
└── install_image
├── atari_breakout.png
├── cartpole_exam.png
├── console_hello_world.png
├── default_config.png
├── file_setting.png
├── hello_world_ubuntu.png
├── openai_github.png
├── project_interpreter.png
├── pycham_new_project.png
├── pycharm_community.png
├── pycharm_drag.png
├── pycharm_init.png
├── python3_terminal.jpg
├── python_download.png
├── python_installed.png
├── python_intalled.png
├── rl_book_hello_world.png
├── rl_book_project.png
├── rl_book_venv.png
├── rl_book_virtualenv.png
├── rlcode_book_directory.png
├── rlcode_project.png
├── run_hello_world.png
├── sh_pycharm.sh.png
└── terminal_rlcode_book.png
/.gitignore:
--------------------------------------------------------------------------------
1 | *.project
2 | *.pydevproject
3 | .idea/
4 | .DS_Store
5 | __pycache__
--------------------------------------------------------------------------------
/1-grid-world/1-policy-iteration/environment.py:
--------------------------------------------------------------------------------
1 | import tkinter as tk
2 | from tkinter import Button
3 | import time
4 | import numpy as np
5 | from PIL import ImageTk, Image
6 |
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # number of pixels per cell
9 | HEIGHT = 5 # grid world height
10 | WIDTH = 5 # grid world width
11 | TRANSITION_PROB = 1
12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right
13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions as coordinate offsets
14 | REWARDS = []
15 |
16 |
17 | class GraphicDisplay(tk.Tk):
18 | def __init__(self, agent):
19 | super(GraphicDisplay, self).__init__()
20 | self.title('Policy Iteration')
21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
22 | self.texts = []
23 | self.arrows = []
24 | self.env = Env()
25 | self.agent = agent
26 | self.evaluation_count = 0
27 | self.improvement_count = 0
28 | self.is_moving = 0
29 | (self.up, self.down, self.left, self.right), self.shapes = self.load_images()
30 | self.canvas = self._build_canvas()
31 | self.text_reward(2, 2, "R : 1.0")
32 | self.text_reward(1, 2, "R : -1.0")
33 | self.text_reward(2, 1, "R : -1.0")
34 |
35 | def _build_canvas(self):
36 | canvas = tk.Canvas(self, bg='white',
37 | height=HEIGHT * UNIT,
38 | width=WIDTH * UNIT)
39 | # create buttons
40 | iteration_button = Button(self, text="Evaluate",
41 | command=self.evaluate_policy)
42 | iteration_button.configure(width=10, activebackground="#33B5E5")
43 | canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10,
44 | window=iteration_button)
45 | policy_button = Button(self, text="Improve",
46 | command=self.improve_policy)
47 | policy_button.configure(width=10, activebackground="#33B5E5")
48 | canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10,
49 | window=policy_button)
50 | policy_button = Button(self, text="move", command=self.move_by_policy)
51 | policy_button.configure(width=10, activebackground="#33B5E5")
52 | canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10,
53 | window=policy_button)
54 | policy_button = Button(self, text="reset", command=self.reset)
55 | policy_button.configure(width=10, activebackground="#33B5E5")
56 | canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10,
57 | window=policy_button)
58 |
59 | # create grid lines
60 | for col in range(0, WIDTH * UNIT, UNIT): # vertical lines, 0 ~ WIDTH*UNIT, step UNIT
61 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
62 | canvas.create_line(x0, y0, x1, y1)
63 | for row in range(0, HEIGHT * UNIT, UNIT): # horizontal lines, 0 ~ HEIGHT*UNIT, step UNIT
64 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
65 | canvas.create_line(x0, y0, x1, y1)
66 |
67 | # add images to the canvas
68 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
69 | canvas.create_image(250, 150, image=self.shapes[1])
70 | canvas.create_image(150, 250, image=self.shapes[1])
71 | canvas.create_image(250, 250, image=self.shapes[2])
72 |
73 | canvas.pack()
74 |
75 | return canvas
76 |
77 | def load_images(self):
78 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
79 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
80 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
81 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
82 | rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65)))
83 | triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65)))
84 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
85 | return (up, down, left, right), (rectangle, triangle, circle)
86 |
87 | def reset(self):
88 | if self.is_moving == 0:
89 | self.evaluation_count = 0
90 | self.improvement_count = 0
91 | for i in self.texts:
92 | self.canvas.delete(i)
93 |
94 | for i in self.arrows:
95 | self.canvas.delete(i)
96 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
97 | self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH
98 | for _ in range(HEIGHT)])
99 | self.agent.policy_table[2][2] = []
100 | x, y = self.canvas.coords(self.rectangle)
101 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
102 |
103 | def text_value(self, row, col, contents, font='Helvetica', size=10,
104 | style='normal', anchor="nw"):
105 | origin_x, origin_y = 85, 70
106 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
107 | font = (font, str(size), style)
108 | text = self.canvas.create_text(x, y, fill="black", text=contents,
109 | font=font, anchor=anchor)
110 | return self.texts.append(text)
111 |
112 | def text_reward(self, row, col, contents, font='Helvetica', size=10,
113 | style='normal', anchor="nw"):
114 | origin_x, origin_y = 5, 5
115 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
116 | font = (font, str(size), style)
117 | text = self.canvas.create_text(x, y, fill="black", text=contents,
118 | font=font, anchor=anchor)
119 | return self.texts.append(text)
120 |
121 | def rectangle_move(self, action):
122 | base_action = np.array([0, 0])
123 | location = self.find_rectangle()
124 | self.render()
125 | if action == 0 and location[0] > 0: # up
126 | base_action[1] -= UNIT
127 | elif action == 1 and location[0] < HEIGHT - 1: # down
128 | base_action[1] += UNIT
129 | elif action == 2 and location[1] > 0: # left
130 | base_action[0] -= UNIT
131 | elif action == 3 and location[1] < WIDTH - 1: # right
132 | base_action[0] += UNIT
133 | # move agent
134 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
135 |
136 | def find_rectangle(self):
137 | temp = self.canvas.coords(self.rectangle)
138 | x = (temp[0] / 100) - 0.5
139 | y = (temp[1] / 100) - 0.5
140 | return int(y), int(x)
141 |
142 | def move_by_policy(self):
143 | if self.improvement_count != 0 and self.is_moving != 1:
144 | self.is_moving = 1
145 |
146 | x, y = self.canvas.coords(self.rectangle)
147 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
148 |
149 | x, y = self.find_rectangle()
150 | while len(self.agent.policy_table[x][y]) != 0:
151 | self.after(100,
152 | self.rectangle_move(self.agent.get_action([x, y])))
153 | x, y = self.find_rectangle()
154 | self.is_moving = 0
155 |
156 | def draw_one_arrow(self, col, row, policy):
157 | if col == 2 and row == 2:
158 | return
159 |
160 | if policy[0] > 0: # up
161 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
162 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
163 | image=self.up))
164 | if policy[1] > 0: # down
165 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
166 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
167 | image=self.down))
168 | if policy[2] > 0: # left
169 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
170 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
171 | image=self.left))
172 | if policy[3] > 0: # right
173 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
174 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
175 | image=self.right))
176 |
177 | def draw_from_policy(self, policy_table):
178 | for i in range(HEIGHT):
179 | for j in range(WIDTH):
180 | self.draw_one_arrow(i, j, policy_table[i][j])
181 |
182 | def print_value_table(self, value_table):
183 | for i in range(WIDTH):
184 | for j in range(HEIGHT):
185 | self.text_value(i, j, value_table[i][j])
186 |
187 | def render(self):
188 | time.sleep(0.1)
189 | self.canvas.tag_raise(self.rectangle)
190 | self.update()
191 |
192 | def evaluate_policy(self):
193 | self.evaluation_count += 1
194 | for i in self.texts:
195 | self.canvas.delete(i)
196 | self.agent.policy_evaluation()
197 | self.print_value_table(self.agent.value_table)
198 |
199 | def improve_policy(self):
200 | self.improvement_count += 1
201 | for i in self.arrows:
202 | self.canvas.delete(i)
203 | self.agent.policy_improvement()
204 | self.draw_from_policy(self.agent.policy_table)
205 |
206 |
207 | class Env:
208 | def __init__(self):
209 | self.transition_probability = TRANSITION_PROB
210 | self.width = WIDTH
211 | self.height = HEIGHT
212 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
213 | self.possible_actions = POSSIBLE_ACTIONS
214 | self.reward[2][2] = 1 # reward +1 at the circle, coordinate (2, 2)
215 | self.reward[1][2] = -1 # reward -1 at a triangle, coordinate (1, 2)
216 | self.reward[2][1] = -1 # reward -1 at a triangle, coordinate (2, 1)
217 | self.all_state = []
218 |
219 | for x in range(WIDTH):
220 | for y in range(HEIGHT):
221 | state = [x, y]
222 | self.all_state.append(state)
223 |
224 | def get_reward(self, state, action):
225 | next_state = self.state_after_action(state, action)
226 | return self.reward[next_state[0]][next_state[1]]
227 |
228 | def state_after_action(self, state, action_index):
229 | action = ACTIONS[action_index]
230 | return self.check_boundary([state[0] + action[0], state[1] + action[1]])
231 |
232 | @staticmethod
233 | def check_boundary(state):
234 | state[0] = (0 if state[0] < 0 else WIDTH - 1
235 | if state[0] > WIDTH - 1 else state[0])
236 | state[1] = (0 if state[1] < 0 else HEIGHT - 1
237 | if state[1] > HEIGHT - 1 else state[1])
238 | return state
239 |
240 | def get_transition_prob(self, state, action):
241 | return self.transition_probability
242 |
243 | def get_all_states(self):
244 | return self.all_state
245 |
--------------------------------------------------------------------------------
/1-grid-world/1-policy-iteration/policy_iteration.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | from environment import GraphicDisplay, Env
4 |
5 |
6 | class PolicyIteration:
7 | def __init__(self, env):
8 | # the environment object
9 | self.env = env
10 | # initialize the value function as a 2-D list
11 | self.value_table = [[0.0] * env.width for _ in range(env.height)]
12 | # initialize the policy with equal probability for up, down, left and right
13 | self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width
14 | for _ in range(env.height)]
15 | # set the terminal state
16 | self.policy_table[2][2] = []
17 | # discount factor
18 | self.discount_factor = 0.9
19 |
20 | def policy_evaluation(self):
21 |
22 | # initialize the next value table
23 | next_value_table = [[0.00] * self.env.width
24 | for _ in range(self.env.height)]
25 |
26 | # compute the Bellman expectation equation for every state
27 | for state in self.env.get_all_states():
28 | value = 0.0
29 | # the value of the terminal state is 0
30 | if state == [2, 2]:
31 | next_value_table[state[0]][state[1]] = value
32 | continue
33 |
34 | # Bellman expectation equation
35 | for action in self.env.possible_actions:
36 | next_state = self.env.state_after_action(state, action)
37 | reward = self.env.get_reward(state, action)
38 | next_value = self.get_value(next_state)
39 | value += (self.get_policy(state)[action] *
40 | (reward + self.discount_factor * next_value))
41 |
42 | next_value_table[state[0]][state[1]] = round(value, 2)
43 |
44 | self.value_table = next_value_table
45 |
46 | # greedy policy improvement with respect to the current value function
47 | def policy_improvement(self):
48 | next_policy = self.policy_table
49 | for state in self.env.get_all_states():
50 | if state == [2, 2]:
51 | continue
52 | value = -99999
53 | max_index = []
54 | # initialize the policy to return
55 | result = [0.0, 0.0, 0.0, 0.0]
56 |
57 | # for every action, compute reward + (discount factor * next state value)
58 | for index, action in enumerate(self.env.possible_actions):
59 | next_state = self.env.state_after_action(state, action)
60 | reward = self.env.get_reward(state, action)
61 | next_value = self.get_value(next_state)
62 | temp = reward + self.discount_factor * next_value
63 |
64 | # collect the indices of the actions with the maximum value (all of them if tied)
65 | if temp == value:
66 | max_index.append(index)
67 | elif temp > value:
68 | value = temp
69 | max_index.clear()
70 | max_index.append(index)
71 |
72 | # compute the probability for each best action
73 | prob = 1 / len(max_index)
74 |
75 | for index in max_index:
76 | result[index] = prob
77 |
78 | next_policy[state[0]][state[1]] = result
79 |
80 | self.policy_table = next_policy
81 |
82 | # return an action for the given state according to the policy
83 | def get_action(self, state):
84 | # draw a random value between 0 and 1
85 | random_pick = random.randrange(100) / 100
86 |
87 | policy = self.get_policy(state)
88 | policy_sum = 0.0
89 | # sample one action according to the policy probabilities
90 | for index, value in enumerate(policy):
91 | policy_sum += value
92 | if random_pick < policy_sum:
93 | return index
94 |
95 | # return the policy for the given state
96 | def get_policy(self, state):
97 | if state == [2, 2]:
98 | return 0.0
99 | return self.policy_table[state[0]][state[1]]
100 |
101 | # return the value of the given state
102 | def get_value(self, state):
103 | # keep only two decimal places
104 | return round(self.value_table[state[0]][state[1]], 2)
105 |
106 | if __name__ == "__main__":
107 | env = Env()
108 | policy_iteration = PolicyIteration(env)
109 | grid_world = GraphicDisplay(policy_iteration)
110 | grid_world.mainloop()
111 |
--------------------------------------------------------------------------------
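
A quick way to see what policy_iteration.py computes, without clicking through the tkinter GUI, is to drive the agent headlessly. The sketch below is not part of the repository; it assumes it is run from 1-grid-world/1-policy-iteration with tkinter and Pillow installed, and simply alternates evaluation and improvement sweeps before printing the value table.

from environment import Env
from policy_iteration import PolicyIteration

env = Env()
agent = PolicyIteration(env)
for _ in range(10):                 # alternate one evaluation and one improvement sweep
    agent.policy_evaluation()
    agent.policy_improvement()
for row in agent.value_table:       # 5x5 table of state values after 10 sweeps
    print(row)
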
/1-grid-world/2-value-iteration/environment.py:
--------------------------------------------------------------------------------
1 | import tkinter as tk
2 | import time
3 | import numpy as np
4 | import random
5 | from PIL import ImageTk, Image
6 |
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # number of pixels per cell
9 | HEIGHT = 5 # grid world height
10 | WIDTH = 5 # grid world width
11 | TRANSITION_PROB = 1
12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right
13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions as coordinate offsets
14 | REWARDS = []
15 |
16 |
17 | class GraphicDisplay(tk.Tk):
18 | def __init__(self, value_iteration):
19 | super(GraphicDisplay, self).__init__()
20 | self.title('Value Iteration')
21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
22 | self.texts = []
23 | self.arrows = []
24 | self.env = Env()
25 | self.agent = value_iteration
26 | self.iteration_count = 0
27 | self.improvement_count = 0
28 | self.is_moving = 0
29 | (self.up, self.down, self.left,
30 | self.right), self.shapes = self.load_images()
31 | self.canvas = self._build_canvas()
32 | self.text_reward(2, 2, "R : 1.0")
33 | self.text_reward(1, 2, "R : -1.0")
34 | self.text_reward(2, 1, "R : -1.0")
35 |
36 | def _build_canvas(self):
37 | canvas = tk.Canvas(self, bg='white',
38 | height=HEIGHT * UNIT,
39 | width=WIDTH * UNIT)
41 | # create buttons
41 | iteration_button = tk.Button(self, text="Calculate",
42 | command=self.calculate_value)
43 | iteration_button.configure(width=10, activebackground="#33B5E5")
44 | canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10,
45 | window=iteration_button)
46 |
47 | policy_button = tk.Button(self, text="Print Policy",
48 | command=self.print_optimal_policy)
49 | policy_button.configure(width=10, activebackground="#33B5E5")
50 | canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10,
51 | window=policy_button)
52 |
53 | policy_button = tk.Button(self, text="Move",
54 | command=self.move_by_policy)
55 | policy_button.configure(width=10, activebackground="#33B5E5")
56 | canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10,
57 | window=policy_button)
58 |
59 | policy_button = tk.Button(self, text="Clear", command=self.clear)
60 | policy_button.configure(width=10, activebackground="#33B5E5")
61 | canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10,
62 | window=policy_button)
63 |
64 | # create grid lines
65 | for col in range(0, WIDTH * UNIT, UNIT): # vertical lines, 0 ~ WIDTH*UNIT, step UNIT
66 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
67 | canvas.create_line(x0, y0, x1, y1)
68 | for row in range(0, HEIGHT * UNIT, UNIT): # horizontal lines, 0 ~ HEIGHT*UNIT, step UNIT
69 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
70 | canvas.create_line(x0, y0, x1, y1)
71 |
72 | # add images to the canvas
73 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
74 | canvas.create_image(250, 150, image=self.shapes[1])
75 | canvas.create_image(150, 250, image=self.shapes[1])
76 | canvas.create_image(250, 250, image=self.shapes[2])
77 |
78 | canvas.pack()
79 |
80 | return canvas
81 |
82 | def load_images(self):
83 | PhotoImage = ImageTk.PhotoImage
84 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
85 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
86 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
87 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
88 | rectangle = PhotoImage(
89 | Image.open("../img/rectangle.png").resize((65, 65)))
90 | triangle = PhotoImage(
91 | Image.open("../img/triangle.png").resize((65, 65)))
92 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
93 | return (up, down, left, right), (rectangle, triangle, circle)
94 |
95 | def clear(self):
96 |
97 | if self.is_moving == 0:
98 | self.iteration_count = 0
99 | self.improvement_count = 0
100 | for i in self.texts:
101 | self.canvas.delete(i)
102 |
103 | for i in self.arrows:
104 | self.canvas.delete(i)
105 |
106 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
107 |
108 | x, y = self.canvas.coords(self.rectangle)
109 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
110 |
111 | def reset(self):
112 | self.update()
113 | time.sleep(0.5)
114 | self.canvas.delete(self.rectangle)
115 | return self.canvas.coords(self.rectangle)
116 |
117 | def text_value(self, row, col, contents, font='Helvetica', size=12,
118 | style='normal', anchor="nw"):
119 | origin_x, origin_y = 85, 70
120 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
121 | font = (font, str(size), style)
122 | text = self.canvas.create_text(x, y, fill="black", text=contents,
123 | font=font, anchor=anchor)
124 | return self.texts.append(text)
125 |
126 | def text_reward(self, row, col, contents, font='Helvetica', size=12,
127 | style='normal', anchor="nw"):
128 | origin_x, origin_y = 5, 5
129 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
130 | font = (font, str(size), style)
131 | text = self.canvas.create_text(x, y, fill="black", text=contents,
132 | font=font, anchor=anchor)
133 | return self.texts.append(text)
134 |
135 | def rectangle_move(self, action):
136 | base_action = np.array([0, 0])
137 | location = self.find_rectangle()
138 | self.render()
139 | if action == 0 and location[0] > 0: # up
140 | base_action[1] -= UNIT
141 | elif action == 1 and location[0] < HEIGHT - 1: # down
142 | base_action[1] += UNIT
143 | elif action == 2 and location[1] > 0: # left
144 | base_action[0] -= UNIT
145 | elif action == 3 and location[1] < WIDTH - 1: # right
146 | base_action[0] += UNIT
147 |
148 | self.canvas.move(self.rectangle, base_action[0],
149 | base_action[1]) # move agent
150 |
151 | def find_rectangle(self):
152 | temp = self.canvas.coords(self.rectangle)
153 | x = (temp[0] / 100) - 0.5
154 | y = (temp[1] / 100) - 0.5
155 | return int(y), int(x)
156 |
157 | def move_by_policy(self):
158 |
159 | if self.improvement_count != 0 and self.is_moving != 1:
160 | self.is_moving = 1
161 | x, y = self.canvas.coords(self.rectangle)
162 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
163 |
164 | x, y = self.find_rectangle()
165 | while len(self.agent.get_action([x, y])) != 0:
166 | action = random.sample(self.agent.get_action([x, y]), 1)[0]
167 | self.after(100, self.rectangle_move(action))
168 | x, y = self.find_rectangle()
169 | self.is_moving = 0
170 |
171 | def draw_one_arrow(self, col, row, action):
172 | if col == 2 and row == 2:
173 | return
174 | if action == 0: # up
175 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
176 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
177 | image=self.up))
178 | elif action == 1: # down
179 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
180 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
181 | image=self.down))
182 | elif action == 3: # right
183 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
184 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
185 | image=self.right))
186 | elif action == 2: # left
187 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
188 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
189 | image=self.left))
190 |
191 | def draw_from_values(self, state, action_list):
192 | i = state[0]
193 | j = state[1]
194 | for action in action_list:
195 | self.draw_one_arrow(i, j, action)
196 |
197 | def print_values(self, values):
198 | for i in range(WIDTH):
199 | for j in range(HEIGHT):
200 | self.text_value(i, j, values[i][j])
201 |
202 | def render(self):
203 | time.sleep(0.1)
204 | self.canvas.tag_raise(self.rectangle)
205 | self.update()
206 |
207 | def calculate_value(self):
208 | self.iteration_count += 1
209 | for i in self.texts:
210 | self.canvas.delete(i)
211 | self.agent.value_iteration()
212 | self.print_values(self.agent.value_table)
213 |
214 | def print_optimal_policy(self):
215 | self.improvement_count += 1
216 | for i in self.arrows:
217 | self.canvas.delete(i)
218 | for state in self.env.get_all_states():
219 | action = self.agent.get_action(state)
220 | self.draw_from_values(state, action)
221 |
222 |
223 | class Env:
224 | def __init__(self):
225 | self.transition_probability = TRANSITION_PROB
226 | self.width = WIDTH # Width of Grid World
227 | self.height = HEIGHT # Height of GridWorld
228 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
229 | self.possible_actions = POSSIBLE_ACTIONS
230 | self.reward[2][2] = 1 # reward 1 for circle
231 | self.reward[1][2] = -1 # reward -1 for triangle
232 | self.reward[2][1] = -1 # reward -1 for triangle
233 | self.all_state = []
234 |
235 | for x in range(WIDTH):
236 | for y in range(HEIGHT):
237 | state = [x, y]
238 | self.all_state.append(state)
239 |
240 | def get_reward(self, state, action):
241 | next_state = self.state_after_action(state, action)
242 | return self.reward[next_state[0]][next_state[1]]
243 |
244 | def state_after_action(self, state, action_index):
245 | action = ACTIONS[action_index]
246 | return self.check_boundary([state[0] + action[0], state[1] + action[1]])
247 |
248 | @staticmethod
249 | def check_boundary(state):
250 | state[0] = (0 if state[0] < 0 else WIDTH - 1
251 | if state[0] > WIDTH - 1 else state[0])
252 | state[1] = (0 if state[1] < 0 else HEIGHT - 1
253 | if state[1] > HEIGHT - 1 else state[1])
254 | return state
255 |
256 | def get_transition_prob(self, state, action):
257 | return self.transition_probability
258 |
259 | def get_all_states(self):
260 | return self.all_state
261 |
--------------------------------------------------------------------------------
/1-grid-world/2-value-iteration/value_iteration.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from environment import GraphicDisplay, Env
3 |
4 | class ValueIteration:
5 | def __init__(self, env):
6 | # the environment object
7 | self.env = env
8 | # initialize the value function as a 2-D list
9 | self.value_table = [[0.0] * env.width for _ in range(env.height)]
10 | # discount factor
11 | self.discount_factor = 0.9
12 |
13 | # value iteration
14 | # compute the next value function with the Bellman optimality equation
15 | def value_iteration(self):
16 | next_value_table = [[0.0] * self.env.width for _ in
17 | range(self.env.height)]
18 | for state in self.env.get_all_states():
19 | if state == [2, 2]:
20 | next_value_table[state[0]][state[1]] = 0.0
21 | continue
22 | # list of candidate values for this state
23 | value_list = []
24 |
25 | # compute the value for every possible action
26 | for action in self.env.possible_actions:
27 | next_state = self.env.state_after_action(state, action)
28 | reward = self.env.get_reward(state, action)
29 | next_value = self.get_value(next_state)
30 | value_list.append((reward + self.discount_factor * next_value))
31 | # take the maximum as the next value
32 | next_value_table[state[0]][state[1]] = round(max(value_list), 2)
33 | self.value_table = next_value_table
34 |
35 | # return the action(s) derived from the current value function
36 | def get_action(self, state):
37 | action_list = []
38 | max_value = -99999
39 |
40 | if state == [2, 2]:
41 | return []
42 |
43 | # compute the Q value (reward + discount factor * next state value) for every action
44 | # and return the action(s) with the maximum Q value (all of them if tied)
45 | for action in self.env.possible_actions:
46 |
47 | next_state = self.env.state_after_action(state, action)
48 | reward = self.env.get_reward(state, action)
49 | next_value = self.get_value(next_state)
50 | value = (reward + self.discount_factor * next_value)
51 |
52 | if value > max_value:
53 | action_list.clear()
54 | action_list.append(action)
55 | max_value = value
56 | elif value == max_value:
57 | action_list.append(action)
58 |
59 | return action_list
60 |
61 | def get_value(self, state):
62 | return round(self.value_table[state[0]][state[1]], 2)
63 |
64 | if __name__ == "__main__":
65 | env = Env()
66 | value_iteration = ValueIteration(env)
67 | grid_world = GraphicDisplay(value_iteration)
68 | grid_world.mainloop()
69 |
--------------------------------------------------------------------------------
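
The backup that value_iteration() applies to each state is a Bellman optimality update: take the best one-step return over all actions. A minimal sketch of that single-state backup, not part of the repository and assuming the Env from 2-value-iteration/environment.py, looks like this:

from environment import Env

env = Env()
gamma = 0.9
values = [[0.0] * env.width for _ in range(env.height)]   # all zeros before the first sweep
state = [2, 1]                                             # a cell adjacent to the +1 circle at [2, 2]

candidates = []
for action in env.possible_actions:
    next_state = env.state_after_action(state, action)
    candidates.append(env.get_reward(state, action)
                      + gamma * values[next_state[0]][next_state[1]])
print(round(max(candidates), 2))   # 1.0 -- the greedy backup toward the circle
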
/1-grid-world/3-monte-carlo/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | np.random.seed(1)
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # number of pixels per cell
9 | HEIGHT = 5 # grid world height
10 | WIDTH = 5 # grid world width
11 |
12 |
13 | class Env(tk.Tk):
14 | def __init__(self):
15 | super(Env, self).__init__()
16 | self.action_space = ['u', 'd', 'l', 'r']
17 | self.n_actions = len(self.action_space)
18 | self.title('monte carlo')
19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
20 | self.shapes = self.load_images()
21 | self.canvas = self._build_canvas()
22 | self.texts = []
23 |
24 | def _build_canvas(self):
25 | canvas = tk.Canvas(self, bg='white',
26 | height=HEIGHT * UNIT,
27 | width=WIDTH * UNIT)
28 | # create grid lines
29 | for c in range(0, WIDTH * UNIT, UNIT): # vertical lines, 0 ~ WIDTH*UNIT, step UNIT
30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
31 | canvas.create_line(x0, y0, x1, y1)
32 | for r in range(0, HEIGHT * UNIT, UNIT): # horizontal lines, 0 ~ HEIGHT*UNIT, step UNIT
33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
34 | canvas.create_line(x0, y0, x1, y1)
35 |
36 | # add images to the canvas
37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2])
41 |
42 | canvas.pack()
43 |
44 | return canvas
45 |
46 | def load_images(self):
47 | rectangle = PhotoImage(
48 | Image.open("../img/rectangle.png").resize((65, 65)))
49 | triangle = PhotoImage(
50 | Image.open("../img/triangle.png").resize((65, 65)))
51 | circle = PhotoImage(
52 | Image.open("../img/circle.png").resize((65, 65)))
53 |
54 | return rectangle, triangle, circle
55 |
56 | @staticmethod
57 | def coords_to_state(coords):
58 | x = int((coords[0] - 50) / 100)
59 | y = int((coords[1] - 50) / 100)
60 | return [x, y]
61 |
62 | def reset(self):
63 | self.update()
64 | time.sleep(0.5)
65 | x, y = self.canvas.coords(self.rectangle)
66 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
67 | return self.coords_to_state(self.canvas.coords(self.rectangle))
68 |
69 | def step(self, action):
70 | state = self.canvas.coords(self.rectangle)
71 | base_action = np.array([0, 0])
72 | self.render()
73 |
74 | if action == 0: # up
75 | if state[1] > UNIT:
76 | base_action[1] -= UNIT
77 | elif action == 1: # down
78 | if state[1] < (HEIGHT - 1) * UNIT:
79 | base_action[1] += UNIT
80 | elif action == 2: # left
81 | if state[0] > UNIT:
82 | base_action[0] -= UNIT
83 | elif action == 3: # right
84 | if state[0] < (WIDTH - 1) * UNIT:
85 | base_action[0] += UNIT
86 | # move the agent
87 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
88 | # raise the agent (red rectangle) to the top of the canvas
89 | self.canvas.tag_raise(self.rectangle)
90 |
91 | next_state = self.canvas.coords(self.rectangle)
92 |
93 | # reward function
94 | if next_state == self.canvas.coords(self.circle):
95 | reward = 100
96 | done = True
97 | elif next_state in [self.canvas.coords(self.triangle1),
98 | self.canvas.coords(self.triangle2)]:
99 | reward = -100
100 | done = True
101 | else:
102 | reward = 0
103 | done = False
104 |
105 | next_state = self.coords_to_state(next_state)
106 |
107 | return next_state, reward, done
108 |
109 | def render(self):
110 | time.sleep(0.03)
111 | self.update()
112 |
--------------------------------------------------------------------------------
/1-grid-world/3-monte-carlo/mc_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import defaultdict
4 | from environment import Env
5 |
6 |
7 | # Monte Carlo agent (learns from the samples of each complete episode)
8 | class MCAgent:
9 | def __init__(self, actions):
10 | self.width = 5
11 | self.height = 5
12 | self.actions = actions
13 | self.learning_rate = 0.01
14 | self.discount_factor = 0.9
15 | self.epsilon = 0.1
16 | self.samples = []
17 | self.value_table = defaultdict(float)
18 |
19 | # append a sample to memory
20 | def save_sample(self, state, reward, done):
21 | self.samples.append([state, reward, done])
22 |
23 | # at the end of an episode, update the values of the states the agent visited
24 | def update(self):
25 | G_t = 0
26 | visit_state = []
27 | for reward in reversed(self.samples):
28 | state = str(reward[0])
29 | if state not in visit_state:
30 | visit_state.append(state)
31 | G_t = reward[1] + self.discount_factor * G_t
32 | value = self.value_table[state]
33 | self.value_table[state] = (value +
34 | self.learning_rate * (G_t - value))
35 |
36 | # choose an action based on the values of the possible next states,
37 | # following the epsilon-greedy policy
38 | def get_action(self, state):
39 | if np.random.rand() < self.epsilon:
40 | # random action
41 | action = np.random.choice(self.actions)
42 | else:
43 | # greedy action over the next-state values
44 | next_state = self.possible_next_state(state)
45 | action = self.arg_max(next_state)
46 | return int(action)
47 |
48 | # compute the arg max; if there are several candidates, return one at random
49 | @staticmethod
50 | def arg_max(next_state):
51 | max_index_list = []
52 | max_value = next_state[0]
53 | for index, value in enumerate(next_state):
54 | if value > max_value:
55 | max_index_list.clear()
56 | max_value = value
57 | max_index_list.append(index)
58 | elif value == max_value:
59 | max_index_list.append(index)
60 | return random.choice(max_index_list)
61 |
62 | # return the values of all possible next states
63 | def possible_next_state(self, state):
64 | col, row = state
65 | next_state = [0.0] * 4
66 |
67 | if row != 0:
68 | next_state[0] = self.value_table[str([col, row - 1])]
69 | else:
70 | next_state[0] = self.value_table[str(state)]
71 | if row != self.height - 1:
72 | next_state[1] = self.value_table[str([col, row + 1])]
73 | else:
74 | next_state[1] = self.value_table[str(state)]
75 | if col != 0:
76 | next_state[2] = self.value_table[str([col - 1, row])]
77 | else:
78 | next_state[2] = self.value_table[str(state)]
79 | if col != self.width - 1:
80 | next_state[3] = self.value_table[str([col + 1, row])]
81 | else:
82 | next_state[3] = self.value_table[str(state)]
83 |
84 | return next_state
85 |
86 |
87 | # main function
88 | if __name__ == "__main__":
89 | env = Env()
90 | agent = MCAgent(actions=list(range(env.n_actions)))
91 |
92 | for episode in range(1000):
93 | state = env.reset()
94 | action = agent.get_action(state)
95 |
96 | while True:
97 | env.render()
98 |
99 | # move to the next state
100 | # the reward is a number and done is a boolean
101 | next_state, reward, done = env.step(action)
102 | agent.save_sample(next_state, reward, done)
103 |
104 | # get the next action
105 | action = agent.get_action(next_state)
106 |
107 | # when the episode ends, update the value table
108 | if done:
109 | agent.update()
110 | agent.samples.clear()
111 | break
112 |
--------------------------------------------------------------------------------
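
The core of MCAgent.update() is the backward pass over the recorded episode: compute the return G_t and nudge each visited state's value toward it. A self-contained sketch of that update, not part of the repository and using a made-up three-step episode, looks like this:

gamma, alpha = 0.9, 0.01
episode = [([0, 1], 0), ([0, 2], 0), ([1, 2], -100)]   # toy (state, reward) samples
value = {}
G = 0.0
visited = set()
for state, reward in reversed(episode):   # walk the episode backwards
    G = reward + gamma * G                # return G_t = r_t+1 + gamma * G_t+1
    key = str(state)
    if key not in visited:                # update each state at most once per episode
        visited.add(key)
        v = value.get(key, 0.0)
        value[key] = v + alpha * (G - v)
print(value)
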
/1-grid-world/4-sarsa/.python-version:
--------------------------------------------------------------------------------
1 | 3.5.0
2 |
--------------------------------------------------------------------------------
/1-grid-world/4-sarsa/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | np.random.seed(1)
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # number of pixels per cell
9 | HEIGHT = 5 # grid world height
10 | WIDTH = 5 # grid world width
11 |
12 |
13 | class Env(tk.Tk):
14 | def __init__(self):
15 | super(Env, self).__init__()
16 | self.action_space = ['u', 'd', 'l', 'r']
17 | self.n_actions = len(self.action_space)
18 | self.title('SARSA')
19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
20 | self.shapes = self.load_images()
21 | self.canvas = self._build_canvas()
22 | self.texts = []
23 |
24 | def _build_canvas(self):
25 | canvas = tk.Canvas(self, bg='white',
26 | height=HEIGHT * UNIT,
27 | width=WIDTH * UNIT)
28 | # create grid lines
29 | for c in range(0, WIDTH * UNIT, UNIT): # vertical lines, 0 ~ WIDTH*UNIT, step UNIT
30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
31 | canvas.create_line(x0, y0, x1, y1)
32 | for r in range(0, HEIGHT * UNIT, UNIT): # horizontal lines, 0 ~ HEIGHT*UNIT, step UNIT
33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
34 | canvas.create_line(x0, y0, x1, y1)
35 |
36 | # add images to the canvas
37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2])
41 |
42 | canvas.pack()
43 |
44 | return canvas
45 |
46 | def load_images(self):
47 | rectangle = PhotoImage(
48 | Image.open("../img/rectangle.png").resize((65, 65)))
49 | triangle = PhotoImage(
50 | Image.open("../img/triangle.png").resize((65, 65)))
51 | circle = PhotoImage(
52 | Image.open("../img/circle.png").resize((65, 65)))
53 |
54 | return rectangle, triangle, circle
55 |
56 | def text_value(self, row, col, contents, action, font='Helvetica', size=10,
57 | style='normal', anchor="nw"):
58 | if action == 0:
59 | origin_x, origin_y = 7, 42
60 | elif action == 1:
61 | origin_x, origin_y = 85, 42
62 | elif action == 2:
63 | origin_x, origin_y = 42, 5
64 | else:
65 | origin_x, origin_y = 42, 77
66 |
67 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
68 | font = (font, str(size), style)
69 | text = self.canvas.create_text(x, y, fill="black", text=contents,
70 | font=font, anchor=anchor)
71 | return self.texts.append(text)
72 |
73 | def print_value_all(self, q_table):
74 | for i in self.texts:
75 | self.canvas.delete(i)
76 | self.texts.clear()
77 | for x in range(HEIGHT):
78 | for y in range(WIDTH):
79 | for action in range(0, 4):
80 | state = [x, y]
81 | if str(state) in q_table.keys():
82 | temp = q_table[str(state)][action]
83 | self.text_value(y, x, round(temp, 2), action)
84 |
85 | def coords_to_state(self, coords):
86 | x = int((coords[0] - 50) / 100)
87 | y = int((coords[1] - 50) / 100)
88 | return [x, y]
89 |
90 | def reset(self):
91 | self.update()
92 | time.sleep(0.5)
93 | x, y = self.canvas.coords(self.rectangle)
94 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
95 | self.render()
96 | return self.coords_to_state(self.canvas.coords(self.rectangle))
97 |
98 | def step(self, action):
99 | state = self.canvas.coords(self.rectangle)
100 | base_action = np.array([0, 0])
101 | self.render()
102 |
103 | if action == 0: # up
104 | if state[1] > UNIT:
105 | base_action[1] -= UNIT
106 | elif action == 1: # down
107 | if state[1] < (HEIGHT - 1) * UNIT:
108 | base_action[1] += UNIT
109 | elif action == 2: # left
110 | if state[0] > UNIT:
111 | base_action[0] -= UNIT
112 | elif action == 3: # right
113 | if state[0] < (WIDTH - 1) * UNIT:
114 | base_action[0] += UNIT
115 |
116 | # move the agent
117 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
118 | # raise the agent (red rectangle) to the top of the canvas
119 | self.canvas.tag_raise(self.rectangle)
120 | next_state = self.canvas.coords(self.rectangle)
121 |
122 | # reward function
123 | if next_state == self.canvas.coords(self.circle):
124 | reward = 100
125 | done = True
126 | elif next_state in [self.canvas.coords(self.triangle1),
127 | self.canvas.coords(self.triangle2)]:
128 | reward = -100
129 | done = True
130 | else:
131 | reward = 0
132 | done = False
133 |
134 | next_state = self.coords_to_state(next_state)
135 |
136 |
137 |
138 | return next_state, reward, done
139 |
140 | def render(self):
141 | time.sleep(0.03)
142 | self.update()
143 |
--------------------------------------------------------------------------------
/1-grid-world/4-sarsa/sarsa_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import defaultdict
4 | from environment import Env
5 |
6 |
7 | class SARSAgent:
8 | def __init__(self, actions):
9 | self.actions = actions
10 | self.learning_rate = 0.01
11 | self.discount_factor = 0.9
12 | self.epsilon = 0.1
13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
14 |
15 | # update the Q function from a (s, a, r, s', a') sample
16 | def learn(self, state, action, reward, next_state, next_action):
17 | current_q = self.q_table[state][action]
18 | next_state_q = self.q_table[next_state][next_action]
19 | new_q = (current_q + self.learning_rate *
20 | (reward + self.discount_factor * next_state_q - current_q))
21 | self.q_table[state][action] = new_q
22 |
23 | # return an action according to the epsilon-greedy policy
24 | def get_action(self, state):
25 | if np.random.rand() < self.epsilon:
26 | # return a random action
27 | action = np.random.choice(self.actions)
28 | else:
29 | # return the greedy action from the Q function
30 | state_action = self.q_table[state]
31 | action = self.arg_max(state_action)
32 | return action
33 |
34 | @staticmethod
35 | def arg_max(state_action):
36 | max_index_list = []
37 | max_value = state_action[0]
38 | for index, value in enumerate(state_action):
39 | if value > max_value:
40 | max_index_list.clear()
41 | max_value = value
42 | max_index_list.append(index)
43 | elif value == max_value:
44 | max_index_list.append(index)
45 | return random.choice(max_index_list)
46 |
47 | if __name__ == "__main__":
48 | env = Env()
49 | agent = SARSAgent(actions=list(range(env.n_actions)))
50 |
51 | for episode in range(1000):
52 | # reset the environment and the state
53 | state = env.reset()
54 | # choose an action for the current state
55 | action = agent.get_action(str(state))
56 |
57 | while True:
58 | env.render()
59 |
60 | # take the action, then receive the next state, the reward and whether the episode is done
61 | next_state, reward, done = env.step(action)
62 | # choose the next action in the next state
63 | next_action = agent.get_action(str(next_state))
64 |
65 | # update the Q function with the (s, a, r, s', a') sample
66 | agent.learn(str(state), action, reward, str(next_state), next_action)
67 |
68 | state = next_state
69 | action = next_action
70 |
71 | # display all Q values on the screen
72 | env.print_value_all(agent.q_table)
73 |
74 | if done:
75 | break
76 |
77 |
--------------------------------------------------------------------------------
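
SARSAgent.learn() implements the on-policy TD backup Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a)), where a' is the action actually chosen in s'. A minimal numeric sketch of one such backup, not part of the repository:

alpha, gamma = 0.01, 0.9                       # the same hyperparameters SARSAgent uses
q = {('[0, 0]', 3): 0.0, ('[1, 0]', 1): 0.5}   # toy Q-table keyed by (state, action)

s, a, r, s_next, a_next = '[0, 0]', 3, 0, '[1, 0]', 1
td_target = r + gamma * q[(s_next, a_next)]    # bootstrap from the action actually taken next
q[(s, a)] += alpha * (td_target - q[(s, a)])
print(round(q[(s, a)], 4))                     # 0.0045
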
/1-grid-world/5-q-learning/.python-version:
--------------------------------------------------------------------------------
1 | 3.5.0
2 |
--------------------------------------------------------------------------------
/1-grid-world/5-q-learning/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | np.random.seed(1)
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # number of pixels per cell
9 | HEIGHT = 5 # grid world height
10 | WIDTH = 5 # grid world width
11 |
12 |
13 | class Env(tk.Tk):
14 | def __init__(self):
15 | super(Env, self).__init__()
16 | self.action_space = ['u', 'd', 'l', 'r']
17 | self.n_actions = len(self.action_space)
18 | self.title('Q Learning')
19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
20 | self.shapes = self.load_images()
21 | self.canvas = self._build_canvas()
22 | self.texts = []
23 |
24 | def _build_canvas(self):
25 | canvas = tk.Canvas(self, bg='white',
26 | height=HEIGHT * UNIT,
27 | width=WIDTH * UNIT)
28 | # create grid lines
29 | for c in range(0, WIDTH * UNIT, UNIT): # vertical lines, 0 ~ WIDTH*UNIT, step UNIT
30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
31 | canvas.create_line(x0, y0, x1, y1)
32 | for r in range(0, HEIGHT * UNIT, UNIT): # horizontal lines, 0 ~ HEIGHT*UNIT, step UNIT
33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
34 | canvas.create_line(x0, y0, x1, y1)
35 |
36 | # add images to the canvas
37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2])
41 |
42 | canvas.pack()
43 |
44 | return canvas
45 |
46 | def load_images(self):
47 | rectangle = PhotoImage(
48 | Image.open("../img/rectangle.png").resize((65, 65)))
49 | triangle = PhotoImage(
50 | Image.open("../img/triangle.png").resize((65, 65)))
51 | circle = PhotoImage(
52 | Image.open("../img/circle.png").resize((65, 65)))
53 |
54 | return rectangle, triangle, circle
55 |
56 | def text_value(self, row, col, contents, action, font='Helvetica', size=10,
57 | style='normal', anchor="nw"):
58 |
59 | if action == 0:
60 | origin_x, origin_y = 7, 42
61 | elif action == 1:
62 | origin_x, origin_y = 85, 42
63 | elif action == 2:
64 | origin_x, origin_y = 42, 5
65 | else:
66 | origin_x, origin_y = 42, 77
67 |
68 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
69 | font = (font, str(size), style)
70 | text = self.canvas.create_text(x, y, fill="black", text=contents,
71 | font=font, anchor=anchor)
72 | return self.texts.append(text)
73 |
74 | def print_value_all(self, q_table):
75 | for i in self.texts:
76 | self.canvas.delete(i)
77 | self.texts.clear()
78 | for i in range(HEIGHT):
79 | for j in range(WIDTH):
80 | for action in range(0, 4):
81 | state = [i, j]
82 | if str(state) in q_table.keys():
83 | temp = q_table[str(state)][action]
84 | self.text_value(j, i, round(temp, 2), action)
85 |
86 | def coords_to_state(self, coords):
87 | x = int((coords[0] - 50) / 100)
88 | y = int((coords[1] - 50) / 100)
89 | return [x, y]
90 |
91 | def state_to_coords(self, state):
92 | x = int(state[0] * 100 + 50)
93 | y = int(state[1] * 100 + 50)
94 | return [x, y]
95 |
96 | def reset(self):
97 | self.update()
98 | time.sleep(0.5)
99 | x, y = self.canvas.coords(self.rectangle)
100 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
101 | self.render()
102 | return self.coords_to_state(self.canvas.coords(self.rectangle))
103 |
104 | def step(self, action):
105 | state = self.canvas.coords(self.rectangle)
106 | base_action = np.array([0, 0])
107 | self.render()
108 |
109 | if action == 0: # up
110 | if state[1] > UNIT:
111 | base_action[1] -= UNIT
112 | elif action == 1: # down
113 | if state[1] < (HEIGHT - 1) * UNIT:
114 | base_action[1] += UNIT
115 | elif action == 2: # left
116 | if state[0] > UNIT:
117 | base_action[0] -= UNIT
118 | elif action == 3: # right
119 | if state[0] < (WIDTH - 1) * UNIT:
120 | base_action[0] += UNIT
121 |
122 | # move the agent
123 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
124 | # raise the agent (red rectangle) to the top of the canvas
125 | self.canvas.tag_raise(self.rectangle)
126 | next_state = self.canvas.coords(self.rectangle)
127 |
128 | # reward function
129 | if next_state == self.canvas.coords(self.circle):
130 | reward = 100
131 | done = True
132 | elif next_state in [self.canvas.coords(self.triangle1),
133 | self.canvas.coords(self.triangle2)]:
134 | reward = -100
135 | done = True
136 | else:
137 | reward = 0
138 | done = False
139 |
140 | next_state = self.coords_to_state(next_state)
141 | return next_state, reward, done
142 |
143 | def render(self):
144 | time.sleep(0.03)
145 | self.update()
146 |
--------------------------------------------------------------------------------
/1-grid-world/5-q-learning/q_learning_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from environment import Env
4 | from collections import defaultdict
5 |
6 | class QLearningAgent:
7 | def __init__(self, actions):
8 | # actions = [0, 1, 2, 3], in order: up, down, left, right
9 | self.actions = actions
10 | self.learning_rate = 0.01
11 | self.discount_factor = 0.9
12 | self.epsilon = 0.9
13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
14 |
15 | # update the Q function from a sample
16 | def learn(self, state, action, reward, next_state):
17 | q_1 = self.q_table[state][action]
18 | # Q function update using the Bellman optimality equation
19 | q_2 = reward + self.discount_factor * max(self.q_table[next_state])
20 | self.q_table[state][action] += self.learning_rate * (q_2 - q_1)
21 |
22 | # return an action according to the epsilon-greedy policy over the Q function
23 | def get_action(self, state):
24 | if np.random.rand() < self.epsilon:
25 | # return a random action
26 | action = np.random.choice(self.actions)
27 | else:
28 | # return the greedy action from the Q function
29 | state_action = self.q_table[state]
30 | action = self.arg_max(state_action)
31 | return action
32 |
33 | @staticmethod
34 | def arg_max(state_action):
35 | max_index_list = []
36 | max_value = state_action[0]
37 | for index, value in enumerate(state_action):
38 | if value > max_value:
39 | max_index_list.clear()
40 | max_value = value
41 | max_index_list.append(index)
42 | elif value == max_value:
43 | max_index_list.append(index)
44 | return random.choice(max_index_list)
45 |
46 | if __name__ == "__main__":
47 | env = Env()
48 | agent = QLearningAgent(actions=list(range(env.n_actions)))
49 |
50 | for episode in range(1000):
51 | state = env.reset()
52 |
53 | while True:
54 | env.render()
55 |
56 | # choose an action for the current state
57 | action = agent.get_action(str(state))
58 | # take the action, then receive the next state, the reward and whether the episode is done
59 | next_state, reward, done = env.step(action)
60 |
61 | # update the Q function with the (s, a, r, s') sample
62 | agent.learn(str(state), action, reward, str(next_state))
63 | state = next_state
64 | # display all Q values on the screen
65 | env.print_value_all(agent.q_table)
66 |
67 | if done:
68 | break
69 |
--------------------------------------------------------------------------------
/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import pylab
3 | import random
4 | import numpy as np
5 | from environment import Env
6 | from keras.layers import Dense
7 | from keras.optimizers import Adam
8 | from keras.models import Sequential
9 |
10 | EPISODES = 1000
11 |
12 |
13 | # deep SARSA agent for the grid world example
14 | class DeepSARSAgent:
15 | def __init__(self):
16 | self.load_model = False
17 | # define every action available to the agent
18 | self.action_space = [0, 1, 2, 3, 4]
19 | # define the size of the state and of the action space
20 | self.action_size = len(self.action_space)
21 | self.state_size = 15
22 | self.discount_factor = 0.99
23 | self.learning_rate = 0.001
24 |
25 | self.epsilon = 1. # exploration
26 | self.epsilon_decay = .9999
27 | self.epsilon_min = 0.01
28 | self.model = self.build_model()
29 |
30 | if self.load_model:
31 | self.epsilon = 0.05
32 | self.model.load_weights('./save_model/deep_sarsa_trained.h5')
33 |
34 | # build a neural network with the state as input and Q values as output
35 | def build_model(self):
36 | model = Sequential()
37 | model.add(Dense(30, input_dim=self.state_size, activation='relu'))
38 | model.add(Dense(30, activation='relu'))
39 | model.add(Dense(self.action_size, activation='linear'))
40 | model.summary()
41 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
42 | return model
43 |
44 | # choose an action with the epsilon-greedy method
45 | def get_action(self, state):
46 | if np.random.rand() <= self.epsilon:
47 | # return a random action
48 | return random.randrange(self.action_size)
49 | else:
50 | # get the action from the model
51 | state = np.float32(state)
52 | q_values = self.model.predict(state)
53 | return np.argmax(q_values[0])
54 |
55 | def train_model(self, state, action, reward, next_state, next_action, done):
56 | if self.epsilon > self.epsilon_min:
57 | self.epsilon *= self.epsilon_decay
58 |
59 | state = np.float32(state)
60 | next_state = np.float32(next_state)
61 | target = self.model.predict(state)[0]
62 | # SARSA update rule for the Q function
63 | if done:
64 | target[action] = reward
65 | else:
66 | target[action] = (reward + self.discount_factor *
67 | self.model.predict(next_state)[0][next_action])
68 |
69 | # reshape the target for the network output
70 | target = np.reshape(target, [1, 5])
71 | # update the neural network
72 | self.model.fit(state, target, epochs=1, verbose=0)
73 |
74 |
75 | if __name__ == "__main__":
76 | # create the environment and the agent
77 | env = Env()
78 | agent = DeepSARSAgent()
79 |
80 | global_step = 0
81 | scores, episodes = [], []
82 |
83 | for e in range(EPISODES):
84 | done = False
85 | score = 0
86 | state = env.reset()
87 | state = np.reshape(state, [1, 15])
88 |
89 | while not done:
90 | # count global steps
91 | global_step += 1
92 |
93 | # choose an action for the current state
94 | action = agent.get_action(state)
95 | # take one timestep in the environment with the chosen action and collect the sample
96 | next_state, reward, done = env.step(action)
97 | next_state = np.reshape(next_state, [1, 15])
98 | next_action = agent.get_action(next_state)
99 | # train the model with the sample
100 | agent.train_model(state, action, reward, next_state, next_action,
101 | done)
102 | state = next_state
103 | score += reward
104 |
105 | state = copy.deepcopy(next_state)
106 |
107 | if done:
108 | # print the training result at the end of every episode
109 | scores.append(score)
110 | episodes.append(e)
111 | pylab.plot(episodes, scores, 'b')
112 | pylab.savefig("./save_graph/deep_sarsa_.png")
113 | print("episode:", e, " score:", score, "global_step",
114 | global_step, " epsilon:", agent.epsilon)
115 |
116 | # save the model every 100 episodes
117 | if e % 100 == 0:
118 | agent.model.save_weights("./save_model/deep_sarsa.h5")
119 |
--------------------------------------------------------------------------------
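
The repository also ships trained weights (save_model/deep_sarsa_trained.h5, listed further below). A minimal sketch for loading them and letting the trained deep SARSA policy act for one episode is shown here; it is not part of the repository, and it assumes the script is run from 1-grid-world/6-deep-sarsa with the same Keras version the code targets.

import numpy as np
from environment import Env
from deep_sarsa_agent import DeepSARSAgent

env = Env()
agent = DeepSARSAgent()
agent.model.load_weights('./save_model/deep_sarsa_trained.h5')
agent.epsilon = 0.05                      # small exploration, as the agent's own load_model branch uses

state = np.reshape(env.reset(), [1, 15])
done, score = False, 0
while not done:
    action = agent.get_action(state)
    next_state, reward, done = env.step(action)
    state = np.reshape(next_state, [1, 15])
    score += reward
print('score:', score)
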
/1-grid-world/6-deep-sarsa/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | PhotoImage = ImageTk.PhotoImage
7 | UNIT = 50 # number of pixels per cell
8 | HEIGHT = 5 # grid height
9 | WIDTH = 5 # grid width
10 |
11 | np.random.seed(1)
12 |
13 |
14 | class Env(tk.Tk):
15 | def __init__(self):
16 | super(Env, self).__init__()
17 | self.action_space = ['u', 'd', 'l', 'r']
18 | self.action_size = len(self.action_space)
19 | self.title('DeepSARSA')
20 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
21 | self.shapes = self.load_images()
22 | self.canvas = self._build_canvas()
23 | self.counter = 0
24 | self.rewards = []
25 | self.goal = []
26 | # set the obstacles
27 | self.set_reward([0, 1], -1)
28 | self.set_reward([1, 2], -1)
29 | self.set_reward([2, 3], -1)
30 | # set the goal
31 | self.set_reward([4, 4], 1)
32 |
33 | def _build_canvas(self):
34 | canvas = tk.Canvas(self, bg='white',
35 | height=HEIGHT * UNIT,
36 | width=WIDTH * UNIT)
37 | # create grid lines
38 | for c in range(0, WIDTH * UNIT, UNIT): # vertical lines, 0 ~ WIDTH*UNIT, step UNIT
39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
40 | canvas.create_line(x0, y0, x1, y1)
41 | for r in range(0, HEIGHT * UNIT, UNIT): # horizontal lines, 0 ~ HEIGHT*UNIT, step UNIT
42 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
43 | canvas.create_line(x0, y0, x1, y1)
44 |
45 | self.rewards = []
46 | self.goal = []
47 | # add images to the canvas
48 | x, y = UNIT/2, UNIT/2
49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
50 |
51 | canvas.pack()
52 |
53 | return canvas
54 |
55 | def load_images(self):
56 | rectangle = PhotoImage(
57 | Image.open("../img/rectangle.png").resize((30, 30)))
58 | triangle = PhotoImage(
59 | Image.open("../img/triangle.png").resize((30, 30)))
60 | circle = PhotoImage(
61 | Image.open("../img/circle.png").resize((30, 30)))
62 |
63 | return rectangle, triangle, circle
64 |
65 | def reset_reward(self):
66 |
67 | for reward in self.rewards:
68 | self.canvas.delete(reward['figure'])
69 |
70 | self.rewards.clear()
71 | self.goal.clear()
72 | self.set_reward([0, 1], -1)
73 | self.set_reward([1, 2], -1)
74 | self.set_reward([2, 3], -1)
75 |
76 | # goal
77 | self.set_reward([4, 4], 1)
78 |
79 | def set_reward(self, state, reward):
80 | state = [int(state[0]), int(state[1])]
81 | x = int(state[0])
82 | y = int(state[1])
83 | temp = {}
84 | if reward > 0:
85 | temp['reward'] = reward
86 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
87 | (UNIT * y) + UNIT / 2,
88 | image=self.shapes[2])
89 |
90 | self.goal.append(temp['figure'])
91 |
92 |
93 | elif reward < 0:
94 | temp['direction'] = -1
95 | temp['reward'] = reward
96 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
97 | (UNIT * y) + UNIT / 2,
98 | image=self.shapes[1])
99 |
100 | temp['coords'] = self.canvas.coords(temp['figure'])
101 | temp['state'] = state
102 | self.rewards.append(temp)
103 |
104 | # new methods
105 |
106 | def check_if_reward(self, state):
107 | check_list = dict()
108 | check_list['if_goal'] = False
109 | rewards = 0
110 |
111 | for reward in self.rewards:
112 | if reward['state'] == state:
113 | rewards += reward['reward']
114 | if reward['reward'] == 1:
115 | check_list['if_goal'] = True
116 |
117 | check_list['rewards'] = rewards
118 |
119 | return check_list
120 |
121 | def coords_to_state(self, coords):
122 | x = int((coords[0] - UNIT / 2) / UNIT)
123 | y = int((coords[1] - UNIT / 2) / UNIT)
124 | return [x, y]
125 |
126 | def reset(self):
127 | self.update()
128 | time.sleep(0.5)
129 | x, y = self.canvas.coords(self.rectangle)
130 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
131 | self.reset_reward()
132 | return self.get_state()
133 |
134 | def step(self, action):
135 | self.counter += 1
136 | self.render()
137 |
138 | if self.counter % 2 == 1:
139 | self.rewards = self.move_rewards()
140 |
141 | next_coords = self.move(self.rectangle, action)
142 | check = self.check_if_reward(self.coords_to_state(next_coords))
143 | done = check['if_goal']
144 | reward = check['rewards']
145 |
146 | self.canvas.tag_raise(self.rectangle)
147 |
148 | s_ = self.get_state()
149 |
150 | return s_, reward, done
151 |
152 | def get_state(self):
153 |
154 | location = self.coords_to_state(self.canvas.coords(self.rectangle))
155 | agent_x = location[0]
156 | agent_y = location[1]
157 |
158 | states = list()
159 |
160 | for reward in self.rewards:
161 | reward_location = reward['state']
162 | states.append(reward_location[0] - agent_x)
163 | states.append(reward_location[1] - agent_y)
164 | if reward['reward'] < 0:
165 | states.append(-1)
166 | states.append(reward['direction'])
167 | else:
168 | states.append(1)
169 |
170 | return states
171 |
172 | def move_rewards(self):
173 | new_rewards = []
174 | for temp in self.rewards:
175 | if temp['reward'] == 1:
176 | new_rewards.append(temp)
177 | continue
178 | temp['coords'] = self.move_const(temp)
179 | temp['state'] = self.coords_to_state(temp['coords'])
180 | new_rewards.append(temp)
181 | return new_rewards
182 |
183 | def move_const(self, target):
184 |
185 | s = self.canvas.coords(target['figure'])
186 |
187 | base_action = np.array([0, 0])
188 |
189 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
190 | target['direction'] = 1
191 | elif s[0] == UNIT / 2:
192 | target['direction'] = -1
193 |
194 | if target['direction'] == -1:
195 | base_action[0] += UNIT
196 | elif target['direction'] == 1:
197 | base_action[0] -= UNIT
198 |
199 | if (target['figure'] is not self.rectangle
200 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
201 | base_action = np.array([0, 0])
202 |
203 | self.canvas.move(target['figure'], base_action[0], base_action[1])
204 |
205 | s_ = self.canvas.coords(target['figure'])
206 |
207 | return s_
208 |
209 | def move(self, target, action):
210 | s = self.canvas.coords(target)
211 |
212 | base_action = np.array([0, 0])
213 |
214 | if action == 0: # up
215 | if s[1] > UNIT:
216 | base_action[1] -= UNIT
217 | elif action == 1: # down
218 | if s[1] < (HEIGHT - 1) * UNIT:
219 | base_action[1] += UNIT
220 | elif action == 2: # right
221 | if s[0] < (WIDTH - 1) * UNIT:
222 | base_action[0] += UNIT
223 | elif action == 3: # left
224 | if s[0] > UNIT:
225 | base_action[0] -= UNIT
226 |
227 | self.canvas.move(target, base_action[0], base_action[1])
228 |
229 | s_ = self.canvas.coords(target)
230 |
231 | return s_
232 |
233 | def render(self):
234 | # control the rendering speed
235 | time.sleep(0.05)
236 | self.update()
237 |
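
The Env class above exposes a small Gym-style interface: reset() repositions the rectangle and the rewards and returns a 15-value state (for each of the three triangles the x/y offset from the agent, a -1 marker, and its patrol direction; for the circle the x/y offset and a +1 marker), and step(action) returns (next_state, reward, done), where reaching the circle gives +1 and ends the episode and sharing a cell with a triangle gives -1. The sketch below is a hypothetical driver script, not part of the repository; it assumes it is run from 1-grid-world/6-deep-sarsa so the relative ../img paths inside Env resolve.

import numpy as np
from environment import Env

# random-policy rollout against the Env defined in environment.py above
env = Env()
state = np.reshape(env.reset(), [1, 15])
done, score = False, 0
while not done:
    # 0=up, 1=down, 2=right, 3=left; 4 matches no branch in move() and leaves the agent in place
    action = np.random.randint(0, 5)
    next_state, reward, done = env.step(action)
    state = np.reshape(next_state, [1, 15])
    score += reward
print("episode return:", score)
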
--------------------------------------------------------------------------------
/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning-kr/c336961d9d67ec9d53694a30c6e28f92f51ea947/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png
--------------------------------------------------------------------------------
/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning-kr/c336961d9d67ec9d53694a30c6e28f92f51ea947/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | PhotoImage = ImageTk.PhotoImage
7 | UNIT = 50 # 픽셀 수
8 | HEIGHT = 5 # 그리드월드 세로
9 | WIDTH = 5 # 그리드월드 가로
10 |
11 | np.random.seed(1)
12 |
13 |
14 | class Env(tk.Tk):
15 | def __init__(self):
16 | super(Env, self).__init__()
17 | self.action_space = ['u', 'd', 'l', 'r']
18 | self.action_size = len(self.action_space)
19 | self.title('Reinforce')
20 | self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT))
21 | self.shapes = self.load_images()
22 | self.canvas = self._build_canvas()
23 | self.counter = 0
24 | self.rewards = []
25 | self.goal = []
26 | # set up the obstacles
27 | self.set_reward([0, 1], -1)
28 | self.set_reward([1, 2], -1)
29 | self.set_reward([2, 3], -1)
30 | # set up the goal
31 | self.set_reward([4, 4], 1)
32 |
33 | def _build_canvas(self):
34 | canvas = tk.Canvas(self, bg='white',
35 | height=HEIGHT * UNIT,
36 | width=WIDTH * UNIT)
37 | # create the grid lines
38 | for c in range(0, WIDTH * UNIT, UNIT):  # vertical lines, every 50 pixels
39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
40 | canvas.create_line(x0, y0, x1, y1)
41 | for r in range(0, HEIGHT * UNIT, UNIT):  # horizontal lines, every 50 pixels
42 | x0, y0, x1, y1 = 0, r, WIDTH * UNIT, r
43 | canvas.create_line(x0, y0, x1, y1)
44 |
45 | self.rewards = []
46 | self.goal = []
47 | # add the agent image to the canvas
48 | x, y = UNIT/2, UNIT/2
49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
50 |
51 | canvas.pack()
52 |
53 | return canvas
54 |
55 | def load_images(self):
56 | rectangle = PhotoImage(
57 | Image.open("../img/rectangle.png").resize((30, 30)))
58 | triangle = PhotoImage(
59 | Image.open("../img/triangle.png").resize((30, 30)))
60 | circle = PhotoImage(
61 | Image.open("../img/circle.png").resize((30, 30)))
62 |
63 | return rectangle, triangle, circle
64 |
65 | def reset_reward(self):
66 |
67 | for reward in self.rewards:
68 | self.canvas.delete(reward['figure'])
69 |
70 | self.rewards.clear()
71 | self.goal.clear()
72 | self.set_reward([0, 1], -1)
73 | self.set_reward([1, 2], -1)
74 | self.set_reward([2, 3], -1)
75 |
76 | # goal
77 | self.set_reward([4, 4], 1)
78 |
79 | def set_reward(self, state, reward):
80 | state = [int(state[0]), int(state[1])]
81 | x = int(state[0])
82 | y = int(state[1])
83 | temp = {}
84 | if reward > 0:
85 | temp['reward'] = reward
86 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
87 | (UNIT * y) + UNIT / 2,
88 | image=self.shapes[2])
89 |
90 | self.goal.append(temp['figure'])
91 |
92 |
93 | elif reward < 0:
94 | temp['direction'] = -1
95 | temp['reward'] = reward
96 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
97 | (UNIT * y) + UNIT / 2,
98 | image=self.shapes[1])
99 |
100 | temp['coords'] = self.canvas.coords(temp['figure'])
101 | temp['state'] = state
102 | self.rewards.append(temp)
103 |
104 | def check_if_reward(self, state):
105 | check_list = dict()
106 | check_list['if_goal'] = False
107 | rewards = 0
108 |
109 | for reward in self.rewards:
110 | if reward['state'] == state:
111 | rewards += reward['reward']
112 | if reward['reward'] > 0:
113 | check_list['if_goal'] = True
114 |
115 | check_list['rewards'] = rewards
116 |
117 | return check_list
118 |
119 | def coords_to_state(self, coords):
120 | x = int((coords[0] - UNIT / 2) / UNIT)
121 | y = int((coords[1] - UNIT / 2) / UNIT)
122 | return [x, y]
123 |
124 | def reset(self):
125 | self.update()
126 | x, y = self.canvas.coords(self.rectangle)
127 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
128 | self.reset_reward()
129 | return self.get_state()
130 |
131 | def step(self, action):
132 | self.counter += 1
133 | self.render()
134 |
135 | if self.counter % 2 == 1:
136 | self.rewards = self.move_rewards()
137 |
138 | next_coords = self.move(self.rectangle, action)
139 | check = self.check_if_reward(self.coords_to_state(next_coords))
140 | done = check['if_goal']
141 | reward = check['rewards']
142 | reward -= 0.1
143 | self.canvas.tag_raise(self.rectangle)
144 |
145 | s_ = self.get_state()
146 |
147 | return s_, reward, done
148 |
149 | def get_state(self):
150 |
151 | location = self.coords_to_state(self.canvas.coords(self.rectangle))
152 | agent_x = location[0]
153 | agent_y = location[1]
154 |
155 | states = list()
156 |
157 | for reward in self.rewards:
158 | reward_location = reward['state']
159 | states.append(reward_location[0] - agent_x)
160 | states.append(reward_location[1] - agent_y)
161 | if reward['reward'] < 0:
162 | states.append(-1)
163 | states.append(reward['direction'])
164 | else:
165 | states.append(1)
166 |
167 | return states
168 |
169 | def move_rewards(self):
170 | new_rewards = []
171 | for temp in self.rewards:
172 | if temp['reward'] > 0:
173 | new_rewards.append(temp)
174 | continue
175 | temp['coords'] = self.move_const(temp)
176 | temp['state'] = self.coords_to_state(temp['coords'])
177 | new_rewards.append(temp)
178 | return new_rewards
179 |
180 | def move_const(self, target):
181 |
182 | s = self.canvas.coords(target['figure'])
183 |
184 | base_action = np.array([0, 0])
185 |
186 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
187 | target['direction'] = 1
188 | elif s[0] == UNIT / 2:
189 | target['direction'] = -1
190 |
191 | if target['direction'] == -1:
192 | base_action[0] += UNIT
193 | elif target['direction'] == 1:
194 | base_action[0] -= UNIT
195 |
196 | if (target['figure'] is not self.rectangle
197 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
198 | base_action = np.array([0, 0])
199 |
200 | self.canvas.move(target['figure'], base_action[0], base_action[1])
201 |
202 | s_ = self.canvas.coords(target['figure'])
203 |
204 | return s_
205 |
206 | def move(self, target, action):
207 | s = self.canvas.coords(target)
208 |
209 | base_action = np.array([0, 0])
210 |
211 | if action == 0: # up
212 | if s[1] > UNIT:
213 | base_action[1] -= UNIT
214 | elif action == 1: # down
215 | if s[1] < (HEIGHT - 1) * UNIT:
216 | base_action[1] += UNIT
217 | elif action == 2: # right
218 | if s[0] < (WIDTH - 1) * UNIT:
219 | base_action[0] += UNIT
220 | elif action == 3: # left
221 | if s[0] > UNIT:
222 | base_action[0] -= UNIT
223 |
224 | self.canvas.move(target, base_action[0], base_action[1])
225 |
226 | s_ = self.canvas.coords(target)
227 |
228 | return s_
229 |
230 | def render(self):
232 | # control the rendering speed
232 | time.sleep(0.07)
233 | self.update()
234 |
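
This REINFORCE environment differs from the deep-SARSA one in two details: step() subtracts 0.1 from every reward, which pushes the agent toward shorter episodes, and render() sleeps 0.07 s instead of 0.05 s per frame. The triangles patrol horizontally only: move_rewards() runs on every other call to step() (when the counter is odd), and move_const() reverses a triangle's direction at either wall and then shifts it one UNIT along x, leaving y untouched. A stand-alone sketch of that bounce rule, with hypothetical names and no canvas, reproduces the same column sequence:

UNIT, WIDTH = 50, 5  # same constants as environment.py

def bounce(x, direction):
    # x is the triangle's canvas x coordinate (cell centre);
    # direction is +1 (moving left) or -1 (moving right), as in move_const()
    if x == (WIDTH - 1) * UNIT + UNIT / 2:   # hit the right wall
        direction = 1
    elif x == UNIT / 2:                      # hit the left wall
        direction = -1
    x += UNIT if direction == -1 else -UNIT
    return x, direction

x, d = UNIT / 2, -1
cols = []
for _ in range(10):
    x, d = bounce(x, d)
    cols.append(int((x - UNIT / 2) / UNIT))
print(cols)  # [1, 2, 3, 4, 3, 2, 1, 0, 1, 2]
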
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/reinforce_agent.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import pylab
3 | import numpy as np
4 | from environment import Env
5 | from keras.layers import Dense
6 | from keras.optimizers import Adam
7 | from keras.models import Sequential
8 | from keras import backend as K
9 |
10 | EPISODES = 2500
11 |
12 | # REINFORCE agent for the grid world example
13 | class ReinforceAgent:
14 | def __init__(self):
15 | self.load_model = False
16 | # define all possible actions
17 | self.action_space = [0, 1, 2, 3, 4]
18 | # define the sizes of the state and action
19 | self.action_size = len(self.action_space)
20 | self.state_size = 15
21 | self.discount_factor = 0.99
22 | self.learning_rate = 0.001
23 |
24 | self.model = self.build_model()
25 | self.optimizer = self.optimizer()
26 | self.states, self.actions, self.rewards = [], [], []
27 |
28 | if self.load_model:
29 | self.model.load_weights('./save_model/reinforce_trained.h5')
30 |
31 | # neural network that takes the state as input and outputs the probability of each action
32 | def build_model(self):
33 | model = Sequential()
34 | model.add(Dense(24, input_dim=self.state_size, activation='relu'))
35 | model.add(Dense(24, activation='relu'))
36 | model.add(Dense(self.action_size, activation='softmax'))
37 | model.summary()
38 | return model
39 |
40 | # create the loss function and train function used to update the policy network
41 | def optimizer(self):
42 | action = K.placeholder(shape=[None, 5])
43 | discounted_rewards = K.placeholder(shape=[None, ])
44 |
45 | # compute the cross-entropy loss
46 | action_prob = K.sum(action * self.model.output, axis=1)
47 | cross_entropy = K.log(action_prob) * discounted_rewards
48 | loss = -K.sum(cross_entropy)
49 |
50 | # build the train function that updates the policy network
51 | optimizer = Adam(lr=self.learning_rate)
52 | updates = optimizer.get_updates(self.model.trainable_weights, [],
53 | loss)
54 | train = K.function([self.model.input, action, discounted_rewards], [],
55 | updates=updates)
56 |
57 | return train
58 |
59 | # select an action with the policy network
60 | def get_action(self, state):
61 | policy = self.model.predict(state)[0]
62 | return np.random.choice(self.action_size, 1, p=policy)[0]
63 |
64 | # compute the discounted returns
65 | def discount_rewards(self, rewards):
66 | discounted_rewards = np.zeros_like(rewards)
67 | running_add = 0
68 | for t in reversed(range(0, len(rewards))):
69 | running_add = running_add * self.discount_factor + rewards[t]
70 | discounted_rewards[t] = running_add
71 | return discounted_rewards
72 |
73 | # store the states, actions, and rewards of one episode
74 | def append_sample(self, state, action, reward):
75 | self.states.append(state[0])
76 | self.rewards.append(reward)
77 | act = np.zeros(self.action_size)
78 | act[action] = 1
79 | self.actions.append(act)
80 |
81 | # update the policy network
82 | def train_model(self):
83 | discounted_rewards = np.float32(self.discount_rewards(self.rewards))
84 | discounted_rewards -= np.mean(discounted_rewards)
85 | discounted_rewards /= np.std(discounted_rewards)
86 |
87 | self.optimizer([self.states, self.actions, discounted_rewards])
88 | self.states, self.actions, self.rewards = [], [], []
89 |
90 |
91 | if __name__ == "__main__":
92 | # create the environment and the agent
93 | env = Env()
94 | agent = ReinforceAgent()
95 |
96 | global_step = 0
97 | scores, episodes = [], []
98 |
99 | for e in range(EPISODES):
100 | done = False
101 | score = 0
102 | # initialize the environment
103 | state = env.reset()
104 | state = np.reshape(state, [1, 15])
105 |
106 | while not done:
107 | global_step += 1
108 | # select an action for the current state
109 | action = agent.get_action(state)
110 | # advance the environment one time step with the chosen action and collect the sample
111 | next_state, reward, done = env.step(action)
112 | next_state = np.reshape(next_state, [1, 15])
113 |
114 | agent.append_sample(state, action, reward)
115 | score += reward
116 | state = copy.deepcopy(next_state)
117 |
118 | if done:
119 | # update the policy network at the end of every episode
120 | agent.train_model()
121 | scores.append(score)
122 | episodes.append(e)
123 | score = round(score, 2)
124 | print("episode:", e, " score:", score, " time_step:",
125 | global_step)
126 |
127 | # every 100 episodes, plot the learning curve and save the model
128 | if e % 100 == 0:
129 | pylab.plot(episodes, scores, 'b')
130 | pylab.savefig("./save_graph/reinforce.png")
131 | agent.model.save_weights("./save_model/reinforce.h5")
132 |
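
optimizer() builds the training step with the Keras 1.x backend API (K.placeholder, K.function, optimizer.get_updates), which only works while Keras runs on the TensorFlow 1 graph backend. The loss it minimizes is the REINFORCE objective -sum_t log pi(a_t|s_t) * G_t, where train_model() first standardizes the discounted returns G_t (mean subtracted, divided by the standard deviation) as a crude baseline. If the code is ported to TensorFlow 2, the same update can be written with tf.GradientTape; the following is an illustrative sketch under that assumption, not the repository's code (the small epsilons are added here for numerical safety and are not in the original):

import numpy as np
import tensorflow as tf

def reinforce_update(model, optimizer, states, actions_onehot, rewards,
                     discount_factor=0.99):
    """One REINFORCE policy-gradient step (TF2 sketch of train_model + optimizer)."""
    # discounted returns, then standardized as train_model() does
    returns = np.zeros(len(rewards), dtype=np.float32)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * discount_factor + rewards[t]
        returns[t] = running_add
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    states = np.asarray(states, dtype=np.float32)
    actions_onehot = np.asarray(actions_onehot, dtype=np.float32)

    with tf.GradientTape() as tape:
        policy = model(states, training=True)                      # (T, action_size)
        action_prob = tf.reduce_sum(actions_onehot * policy, axis=1)
        loss = -tf.reduce_sum(tf.math.log(action_prob + 1e-8) * returns)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return float(loss)

With this variant, the legacy Adam(lr=self.learning_rate) call would become tf.keras.optimizers.Adam(learning_rate=self.learning_rate), and the rest of ReinforceAgent could largely stay unchanged.
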
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/save_graph/reinforce_trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning-kr/c336961d9d67ec9d53694a30c6e28f92f51ea947/1-grid-world/7-reinforce/save_graph/reinforce_trained.png
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/save_model/reinforce_trained.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning-kr/c336961d9d67ec9d53694a30c6e28f92f51ea947/1-grid-world/7-reinforce/save_model/reinforce_trained.h5
--------------------------------------------------------------------------------
/1-grid-world/README.md:
--------------------------------------------------------------------------------
1 | # Grid World with Reinforcement Learning
2 | This is a Grid World example that we made for testing simple algorithms.
3 | The game is simple: the red rectangle must reach the circle while avoiding the triangles.
4 |
5 |