├── .gitignore
├── 1-grid-world
│   ├── 1-policy-iteration
│   │   ├── environment.py
│   │   └── policy_iteration.py
│   ├── 2-value-iteration
│   │   ├── environment.py
│   │   └── value_iteration.py
│   ├── 3-monte-carlo
│   │   ├── environment.py
│   │   └── mc_agent.py
│   ├── 4-sarsa
│   │   ├── .python-version
│   │   ├── environment.py
│   │   └── sarsa_agent.py
│   ├── 5-q-learning
│   │   ├── .python-version
│   │   ├── environment.py
│   │   └── q_learning_agent.py
│   ├── 6-deep-sarsa
│   │   ├── deep_sarsa_agent.py
│   │   ├── environment.py
│   │   ├── save_graph
│   │   │   └── deep_sarsa_trained.png
│   │   └── save_model
│   │       └── deep_sarsa_trained.h5
│   ├── 7-reinforce
│   │   ├── environment.py
│   │   ├── reinforce_agent.py
│   │   ├── save_graph
│   │   │   └── reinforce_trained.png
│   │   └── save_model
│   │       └── reinforce_trained.h5
│   ├── README.md
│   ├── gridworld.png
│   ├── gridworld_changing.png
│   └── img
│       ├── circle.png
│       ├── down.png
│       ├── left.png
│       ├── rectangle.png
│       ├── right.png
│       ├── triangle.png
│       └── up.png
├── 2-cartpole
│   ├── 1-dqn
│   │   ├── SumTree.py
│   │   ├── cartpole_dqn.py
│   │   ├── cartpole_only_per.py
│   │   ├── save_graph
│   │   │   └── Cartpole_DQN.png
│   │   └── save_model
│   │       └── cartpole_dqn.h5
│   ├── 2-double-dqn
│   │   ├── cartpole_ddqn.py
│   │   ├── save_graph
│   │   │   └── cartpole_ddqn.png
│   │   └── save_model
│   │       └── cartpole_ddqn.h5
│   ├── 3-reinforce
│   │   ├── cartpole_reinforce.py
│   │   ├── save_graph
│   │   │   └── cartpole_reinforce.png
│   │   └── save_model
│   │       └── cartpole_reinforce.h5
│   ├── 4-actor-critic
│   │   ├── cartpole_a2c.py
│   │   ├── save_graph
│   │   │   └── cartpole_a2c.png
│   │   └── save_model
│   │       ├── cartpole_actor.h5
│   │       └── cartpole_critic.h5
│   ├── 5-a3c
│   │   ├── cartpole_a3c.py
│   │   └── save_model
│   │       ├── Cartpole_A3C_actor.h5
│   │       └── Cartpole_A3C_critic.h5
│   ├── LICENSE
│   ├── README.md
│   └── cartpole.png
├── 3-atari
│   ├── 1-breakout
│   │   ├── breakout_a3c.py
│   │   ├── breakout_ddqn.py
│   │   ├── breakout_dqn.py
│   │   ├── breakout_dueling_ddqn.py
│   │   ├── play_a3c_model.py
│   │   ├── play_dqn_model.py
│   │   ├── save_model
│   │   │   ├── breakout_a3c_1_actor.h5
│   │   │   ├── breakout_a3c_1_critic.h5
│   │   │   ├── breakout_a3c_2_actor.h5
│   │   │   ├── breakout_a3c_2_critic.h5
│   │   │   ├── breakout_a3c_3_actor.h5
│   │   │   ├── breakout_a3c_3_critic.h5
│   │   │   ├── breakout_a3c_4_actor.h5
│   │   │   ├── breakout_a3c_4_critic.h5
│   │   │   ├── breakout_a3c_5_actor.h5
│   │   │   ├── breakout_a3c_5_critic.h5
│   │   │   ├── breakout_dqn.h5
│   │   │   ├── breakout_dqn_1.h5
│   │   │   ├── breakout_dqn_2.h5
│   │   │   ├── breakout_dqn_3.h5
│   │   │   ├── breakout_dqn_4.h5
│   │   │   └── breakout_dqn_5.h5
│   │   └── summary
│   │       ├── breakout_a3c
│   │       │   └── events.out.tfevents.1497264638
│   │       └── breakout_dqn
│   │           └── events.out.tfevents.1496968668.young-System-Product-Name
│   ├── 2-pong
│   │   ├── README.md
│   │   ├── assets
│   │   │   ├── pg.gif
│   │   │   └── score.png
│   │   ├── pong_a3c.py
│   │   ├── pong_reinforce.py
│   │   └── save_model
│   │       └── pong_reinforce.h5
│   └── LICENSE
├── 4-gym
│   └── 1-mountaincar
│       ├── mountaincar_dqn.py
│       └── save_model
│           └── MountainCar_DQN.h5
├── LICENSE
├── README.md
├── images
│   └── Reinforcement-Learning.png
├── requirements.txt
└── wiki
    ├── how-to-windows.md
    ├── img
    │   ├── how-to-windows.png
    │   ├── link-env-with-pychar-1.png
    │   ├── link-env-with-pychar-2.png
    │   └── link-env-with-pychar.png
    ├── install_guide_osx+ubuntu.md
    └── rlcode_image
        ├── cartpole_exam.png
        ├── console_hello_world.png
        ├── default_config.png
        ├── file_setting.png
        ├── hello_world_ubuntu.png
        ├── openai_github.png
        ├── project_interpreter.png
        ├── pycham_new_project.png
        ├── pycharm_community.png
        ├── pycharm_drag.png
        ├── pycharm_init.png
        ├── python3_terminal.jpg
        ├── python_download.png
        ├── python_installed.png
        ├── python_intalled.png
        ├── rl_book_hello_world.png
        ├── rl_book_project.png
        ├── rl_book_venv.png
        ├── rl_book_virtualenv.png
        ├── rlcode_book_directory.png
        ├── rlcode_project.png
        ├── run_hello_world.png
        ├── sh_pycharm.sh.png
        └── terminal_rlcode_book.png
/.gitignore:
--------------------------------------------------------------------------------
1 | *.project
2 | *.pydevproject
3 | .idea/
4 | .DS_Store
5 | __pycache__
6 | ./Code 2. Cartpole/6. A3C/Cartpole_A3C.pgy
--------------------------------------------------------------------------------
/1-grid-world/1-policy-iteration/environment.py:
--------------------------------------------------------------------------------
1 | import tkinter as tk
2 | from tkinter import Button
3 | import time
4 | import numpy as np
5 | from PIL import ImageTk, Image
6 |
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # pixels
9 | HEIGHT = 5 # grid height
10 | WIDTH = 5 # grid width
11 | TRANSITION_PROB = 1
12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right
13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates
14 | REWARDS = []
15 |
16 |
17 | class GraphicDisplay(tk.Tk):
18 | def __init__(self, agent):
19 | super(GraphicDisplay, self).__init__()
20 | self.title('Policy Iteration')
21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
22 | self.texts = []
23 | self.arrows = []
24 | self.env = Env()
25 | self.agent = agent
26 | self.evaluation_count = 0
27 | self.improvement_count = 0
28 | self.is_moving = 0
29 | (self.up, self.down, self.left, self.right), self.shapes = self.load_images()
30 | self.canvas = self._build_canvas()
31 | self.text_reward(2, 2, "R : 1.0")
32 | self.text_reward(1, 2, "R : -1.0")
33 | self.text_reward(2, 1, "R : -1.0")
34 |
35 | def _build_canvas(self):
36 | canvas = tk.Canvas(self, bg='white',
37 | height=HEIGHT * UNIT,
38 | width=WIDTH * UNIT)
39 | # buttons
40 | iteration_button = Button(self, text="Evaluate",
41 | command=self.evaluate_policy)
42 | iteration_button.configure(width=10, activebackground="#33B5E5")
43 | canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10,
44 | window=iteration_button)
45 | policy_button = Button(self, text="Improve",
46 | command=self.improve_policy)
47 | policy_button.configure(width=10, activebackground="#33B5E5")
48 | canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10,
49 | window=policy_button)
50 | policy_button = Button(self, text="move", command=self.move_by_policy)
51 | policy_button.configure(width=10, activebackground="#33B5E5")
52 | canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10,
53 | window=policy_button)
54 | policy_button = Button(self, text="reset", command=self.reset)
55 | policy_button.configure(width=10, activebackground="#33B5E5")
56 | canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10,
57 | window=policy_button)
58 |
59 | # create grids
60 |         for col in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
61 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
62 | canvas.create_line(x0, y0, x1, y1)
63 |         for row in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
64 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
65 | canvas.create_line(x0, y0, x1, y1)
66 |
67 | # add img to canvas
68 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
69 | canvas.create_image(250, 150, image=self.shapes[1])
70 | canvas.create_image(150, 250, image=self.shapes[1])
71 | canvas.create_image(250, 250, image=self.shapes[2])
72 |
73 | # pack all
74 | canvas.pack()
75 |
76 | return canvas
77 |
78 | def load_images(self):
79 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
80 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
81 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
82 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
83 | rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65)))
84 | triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65)))
85 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
86 | return (up, down, left, right), (rectangle, triangle, circle)
87 |
88 | def reset(self):
89 | if self.is_moving == 0:
90 | self.evaluation_count = 0
91 | self.improvement_count = 0
92 | for i in self.texts:
93 | self.canvas.delete(i)
94 |
95 | for i in self.arrows:
96 | self.canvas.delete(i)
97 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
98 | self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH
99 | for _ in range(HEIGHT)])
100 | self.agent.policy_table[2][2] = []
101 | x, y = self.canvas.coords(self.rectangle)
102 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
103 |
104 | def text_value(self, row, col, contents, font='Helvetica', size=10,
105 | style='normal', anchor="nw"):
106 | origin_x, origin_y = 85, 70
107 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
108 | font = (font, str(size), style)
109 | text = self.canvas.create_text(x, y, fill="black", text=contents,
110 | font=font, anchor=anchor)
111 | return self.texts.append(text)
112 |
113 | def text_reward(self, row, col, contents, font='Helvetica', size=10,
114 | style='normal', anchor="nw"):
115 | origin_x, origin_y = 5, 5
116 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
117 | font = (font, str(size), style)
118 | text = self.canvas.create_text(x, y, fill="black", text=contents,
119 | font=font, anchor=anchor)
120 | return self.texts.append(text)
121 |
122 | def rectangle_move(self, action):
123 | base_action = np.array([0, 0])
124 | location = self.find_rectangle()
125 | self.render()
126 | if action == 0 and location[0] > 0: # up
127 | base_action[1] -= UNIT
128 | elif action == 1 and location[0] < HEIGHT - 1: # down
129 | base_action[1] += UNIT
130 | elif action == 2 and location[1] > 0: # left
131 | base_action[0] -= UNIT
132 | elif action == 3 and location[1] < WIDTH - 1: # right
133 | base_action[0] += UNIT
134 | # move agent
135 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
136 |
137 | def find_rectangle(self):
138 | temp = self.canvas.coords(self.rectangle)
139 | x = (temp[0] / 100) - 0.5
140 | y = (temp[1] / 100) - 0.5
141 | return int(y), int(x)
142 |
143 | def move_by_policy(self):
144 | if self.improvement_count != 0 and self.is_moving != 1:
145 | self.is_moving = 1
146 |
147 | x, y = self.canvas.coords(self.rectangle)
148 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
149 |
150 | x, y = self.find_rectangle()
151 | while len(self.agent.policy_table[x][y]) != 0:
152 | self.after(100,
153 | self.rectangle_move(self.agent.get_action([x, y])))
154 | x, y = self.find_rectangle()
155 | self.is_moving = 0
156 |
157 | def draw_one_arrow(self, col, row, policy):
158 | if col == 2 and row == 2:
159 | return
160 |
161 | if policy[0] > 0: # up
162 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
163 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
164 | image=self.up))
165 | if policy[1] > 0: # down
166 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
167 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
168 | image=self.down))
169 | if policy[2] > 0: # left
170 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
171 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
172 | image=self.left))
173 | if policy[3] > 0: # right
174 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
175 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
176 | image=self.right))
177 |
178 | def draw_from_policy(self, policy_table):
179 | for i in range(HEIGHT):
180 | for j in range(WIDTH):
181 | self.draw_one_arrow(i, j, policy_table[i][j])
182 |
183 | def print_value_table(self, value_table):
184 | for i in range(WIDTH):
185 | for j in range(HEIGHT):
186 | self.text_value(i, j, value_table[i][j])
187 |
188 | def render(self):
189 | time.sleep(0.1)
190 | self.canvas.tag_raise(self.rectangle)
191 | self.update()
192 |
193 | def evaluate_policy(self):
194 | self.evaluation_count += 1
195 | for i in self.texts:
196 | self.canvas.delete(i)
197 | self.agent.policy_evaluation()
198 | self.print_value_table(self.agent.value_table)
199 |
200 | def improve_policy(self):
201 | self.improvement_count += 1
202 | for i in self.arrows:
203 | self.canvas.delete(i)
204 | self.agent.policy_improvement()
205 | self.draw_from_policy(self.agent.policy_table)
206 |
207 |
208 | class Env:
209 | def __init__(self):
210 | self.transition_probability = TRANSITION_PROB
211 | self.width = WIDTH
212 | self.height = HEIGHT
213 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
214 | self.possible_actions = POSSIBLE_ACTIONS
215 | self.reward[2][2] = 1 # reward 1 for circle
216 | self.reward[1][2] = -1 # reward -1 for triangle
217 | self.reward[2][1] = -1 # reward -1 for triangle
218 | self.all_state = []
219 |
220 | for x in range(WIDTH):
221 | for y in range(HEIGHT):
222 | state = [x, y]
223 | self.all_state.append(state)
224 |
225 | def get_reward(self, state, action):
226 | next_state = self.state_after_action(state, action)
227 | return self.reward[next_state[0]][next_state[1]]
228 |
229 | def state_after_action(self, state, action_index):
230 | action = ACTIONS[action_index]
231 | return self.check_boundary([state[0] + action[0], state[1] + action[1]])
232 |
233 | @staticmethod
234 | def check_boundary(state):
235 | state[0] = (0 if state[0] < 0 else WIDTH - 1
236 | if state[0] > WIDTH - 1 else state[0])
237 | state[1] = (0 if state[1] < 0 else HEIGHT - 1
238 | if state[1] > HEIGHT - 1 else state[1])
239 | return state
240 |
241 | def get_transition_prob(self, state, action):
242 | return self.transition_probability
243 |
244 | def get_all_states(self):
245 | return self.all_state
246 |
--------------------------------------------------------------------------------
/1-grid-world/1-policy-iteration/policy_iteration.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | from environment import GraphicDisplay, Env
4 |
5 |
6 | class PolicyIteration:
7 | def __init__(self, env):
8 | self.env = env
9 | # 2-d list for the value function
10 | self.value_table = [[0.0] * env.width for _ in range(env.height)]
11 |         # 2-d list for the random policy (equal probability of up, down, left, right)
12 | self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width
13 | for _ in range(env.height)]
14 | # setting terminal state
15 | self.policy_table[2][2] = []
16 | self.discount_factor = 0.9
17 |
18 | def policy_evaluation(self):
19 | next_value_table = [[0.00] * self.env.width
20 | for _ in range(self.env.height)]
21 |
22 |         # Bellman expectation equation for every state
23 | for state in self.env.get_all_states():
24 | value = 0.0
25 | # keep the value function of terminal states as 0
26 | if state == [2, 2]:
27 | next_value_table[state[0]][state[1]] = value
28 | continue
29 |
30 | for action in self.env.possible_actions:
31 | next_state = self.env.state_after_action(state, action)
32 | reward = self.env.get_reward(state, action)
33 | next_value = self.get_value(next_state)
34 | value += (self.get_policy(state)[action] *
35 | (reward + self.discount_factor * next_value))
36 |
37 | next_value_table[state[0]][state[1]] = round(value, 2)
38 |
39 | self.value_table = next_value_table
40 |
41 | def policy_improvement(self):
42 | next_policy = self.policy_table
43 | for state in self.env.get_all_states():
44 | if state == [2, 2]:
45 | continue
46 | value = -99999
47 | max_index = []
48 | result = [0.0, 0.0, 0.0, 0.0] # initialize the policy
49 |
50 |             # for every action, calculate
51 | # [reward + (discount factor) * (next state value function)]
52 | for index, action in enumerate(self.env.possible_actions):
53 | next_state = self.env.state_after_action(state, action)
54 | reward = self.env.get_reward(state, action)
55 | next_value = self.get_value(next_state)
56 | temp = reward + self.discount_factor * next_value
57 |
58 |                 # a greedy policy normally picks a single action,
59 |                 # but here we allow multiple actions that share the maximum value
60 | if temp == value:
61 | max_index.append(index)
62 | elif temp > value:
63 | value = temp
64 | max_index.clear()
65 | max_index.append(index)
66 |
67 | # probability of action
68 | prob = 1 / len(max_index)
69 |
70 | for index in max_index:
71 | result[index] = prob
72 |
73 | next_policy[state[0]][state[1]] = result
74 |
75 | self.policy_table = next_policy
76 |
77 | # get action according to the current policy
78 | def get_action(self, state):
79 | random_pick = random.randrange(100) / 100
80 |
81 | policy = self.get_policy(state)
82 | policy_sum = 0.0
83 |         # sample an action index according to the policy probabilities
84 | for index, value in enumerate(policy):
85 | policy_sum += value
86 | if random_pick < policy_sum:
87 | return index
88 |
89 | # get policy of specific state
90 | def get_policy(self, state):
91 | if state == [2, 2]:
92 | return 0.0
93 | return self.policy_table[state[0]][state[1]]
94 |
95 | def get_value(self, state):
96 | return round(self.value_table[state[0]][state[1]], 2)
97 |
98 | if __name__ == "__main__":
99 | env = Env()
100 | policy_iteration = PolicyIteration(env)
101 | grid_world = GraphicDisplay(policy_iteration)
102 | grid_world.mainloop()
103 |
--------------------------------------------------------------------------------
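Note: the policy_evaluation method above performs one synchronous sweep of the Bellman expectation backup, V(s) <- sum_a pi(a|s) * (R(s, a) + gamma * V(s')), over all states. Below is a minimal standalone sketch of that backup on a made-up two-state MDP; the states, rewards, transitions, and variable names are illustrative assumptions, not part of the repository.

    # one policy-evaluation sweep under a uniform random policy
    gamma = 0.9
    states = ['A', 'B']
    actions = ['stay', 'move']
    policy = {s: {'stay': 0.5, 'move': 0.5} for s in states}
    reward = {('A', 'stay'): 0, ('A', 'move'): 1, ('B', 'stay'): 0, ('B', 'move'): -1}
    next_state = {('A', 'stay'): 'A', ('A', 'move'): 'B', ('B', 'stay'): 'B', ('B', 'move'): 'A'}
    value = {'A': 0.0, 'B': 0.0}

    new_value = {}
    for s in states:
        # V(s) <- sum_a pi(a|s) * (R(s, a) + gamma * V(s'))
        new_value[s] = sum(policy[s][a] * (reward[(s, a)] + gamma * value[next_state[(s, a)]])
                           for a in actions)
    value = new_value
    print(value)  # values after one synchronous backup over all states
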
/1-grid-world/2-value-iteration/environment.py:
--------------------------------------------------------------------------------
1 | import tkinter as tk
2 | import time
3 | import numpy as np
4 | import random
5 | from PIL import ImageTk, Image
6 |
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # pixels
9 | HEIGHT = 5 # grid height
10 | WIDTH = 5 # grid width
11 | TRANSITION_PROB = 1
12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right
13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates
14 | REWARDS = []
15 |
16 |
17 | class GraphicDisplay(tk.Tk):
18 | def __init__(self, value_iteration):
19 | super(GraphicDisplay, self).__init__()
20 | self.title('Value Iteration')
21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
22 | self.texts = []
23 | self.arrows = []
24 | self.env = Env()
25 | self.agent = value_iteration
26 | self.iteration_count = 0
27 | self.improvement_count = 0
28 | self.is_moving = 0
29 | (self.up, self.down, self.left,
30 | self.right), self.shapes = self.load_images()
31 | self.canvas = self._build_canvas()
32 | self.text_reward(2, 2, "R : 1.0")
33 | self.text_reward(1, 2, "R : -1.0")
34 | self.text_reward(2, 1, "R : -1.0")
35 |
36 | def _build_canvas(self):
37 | canvas = tk.Canvas(self, bg='white',
38 | height=HEIGHT * UNIT,
39 | width=WIDTH * UNIT)
40 | # buttons
41 | iteration_button = tk.Button(self, text="Calculate",
42 | command=self.calculate_value)
43 | iteration_button.configure(width=10, activebackground="#33B5E5")
44 | canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10,
45 | window=iteration_button)
46 |
47 | policy_button = tk.Button(self, text="Print Policy",
48 | command=self.print_optimal_policy)
49 | policy_button.configure(width=10, activebackground="#33B5E5")
50 | canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10,
51 | window=policy_button)
52 |
53 | policy_button = tk.Button(self, text="Move",
54 | command=self.move_by_policy)
55 | policy_button.configure(width=10, activebackground="#33B5E5")
56 | canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10,
57 | window=policy_button)
58 |
59 | policy_button = tk.Button(self, text="Clear", command=self.clear)
60 | policy_button.configure(width=10, activebackground="#33B5E5")
61 | canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10,
62 | window=policy_button)
63 |
64 | # create grids
65 |         for col in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
66 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
67 | canvas.create_line(x0, y0, x1, y1)
68 |         for row in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
69 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
70 | canvas.create_line(x0, y0, x1, y1)
71 |
72 | # add img to canvas
73 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
74 | canvas.create_image(250, 150, image=self.shapes[1])
75 | canvas.create_image(150, 250, image=self.shapes[1])
76 | canvas.create_image(250, 250, image=self.shapes[2])
77 |
78 | # pack all
79 | canvas.pack()
80 |
81 | return canvas
82 |
83 | def load_images(self):
84 | PhotoImage = ImageTk.PhotoImage
85 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
86 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
87 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
88 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
89 | rectangle = PhotoImage(
90 | Image.open("../img/rectangle.png").resize((65, 65)))
91 | triangle = PhotoImage(
92 | Image.open("../img/triangle.png").resize((65, 65)))
93 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
94 | return (up, down, left, right), (rectangle, triangle, circle)
95 |
96 | def clear(self):
97 |
98 | if self.is_moving == 0:
99 | self.iteration_count = 0
100 | self.improvement_count = 0
101 | for i in self.texts:
102 | self.canvas.delete(i)
103 |
104 | for i in self.arrows:
105 | self.canvas.delete(i)
106 |
107 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
108 |
109 | x, y = self.canvas.coords(self.rectangle)
110 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
111 |
112 | def reset(self):
113 | self.update()
114 | time.sleep(0.5)
115 | self.canvas.delete(self.rectangle)
116 | return self.canvas.coords(self.rectangle)
117 |
118 | def text_value(self, row, col, contents, font='Helvetica', size=12,
119 | style='normal', anchor="nw"):
120 | origin_x, origin_y = 85, 70
121 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
122 | font = (font, str(size), style)
123 | text = self.canvas.create_text(x, y, fill="black", text=contents,
124 | font=font, anchor=anchor)
125 | return self.texts.append(text)
126 |
127 | def text_reward(self, row, col, contents, font='Helvetica', size=12,
128 | style='normal', anchor="nw"):
129 | origin_x, origin_y = 5, 5
130 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
131 | font = (font, str(size), style)
132 | text = self.canvas.create_text(x, y, fill="black", text=contents,
133 | font=font, anchor=anchor)
134 | return self.texts.append(text)
135 |
136 | def rectangle_move(self, action):
137 | base_action = np.array([0, 0])
138 | location = self.find_rectangle()
139 | self.render()
140 | if action == 0 and location[0] > 0: # up
141 | base_action[1] -= UNIT
142 | elif action == 1 and location[0] < HEIGHT - 1: # down
143 | base_action[1] += UNIT
144 | elif action == 2 and location[1] > 0: # left
145 | base_action[0] -= UNIT
146 | elif action == 3 and location[1] < WIDTH - 1: # right
147 | base_action[0] += UNIT
148 |
149 | self.canvas.move(self.rectangle, base_action[0],
150 | base_action[1]) # move agent
151 |
152 | def find_rectangle(self):
153 | temp = self.canvas.coords(self.rectangle)
154 | x = (temp[0] / 100) - 0.5
155 | y = (temp[1] / 100) - 0.5
156 | return int(y), int(x)
157 |
158 | def move_by_policy(self):
159 |
160 | if self.improvement_count != 0 and self.is_moving != 1:
161 | self.is_moving = 1
162 | x, y = self.canvas.coords(self.rectangle)
163 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
164 |
165 | x, y = self.find_rectangle()
166 | while len(self.agent.get_action([x, y])) != 0:
167 | action = random.sample(self.agent.get_action([x, y]), 1)[0]
168 | self.after(100, self.rectangle_move(action))
169 | x, y = self.find_rectangle()
170 | self.is_moving = 0
171 |
172 | def draw_one_arrow(self, col, row, action):
173 | if col == 2 and row == 2:
174 | return
175 | if action == 0: # up
176 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
177 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
178 | image=self.up))
179 | elif action == 1: # down
180 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
181 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
182 | image=self.down))
183 | elif action == 3: # right
184 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
185 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
186 | image=self.right))
187 | elif action == 2: # left
188 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
189 | self.arrows.append(self.canvas.create_image(origin_x, origin_y,
190 | image=self.left))
191 |
192 | def draw_from_values(self, state, action_list):
193 | i = state[0]
194 | j = state[1]
195 | for action in action_list:
196 | self.draw_one_arrow(i, j, action)
197 |
198 | def print_values(self, values):
199 | for i in range(WIDTH):
200 | for j in range(HEIGHT):
201 | self.text_value(i, j, values[i][j])
202 |
203 | def render(self):
204 | time.sleep(0.1)
205 | self.canvas.tag_raise(self.rectangle)
206 | self.update()
207 |
208 | def calculate_value(self):
209 | self.iteration_count += 1
210 | for i in self.texts:
211 | self.canvas.delete(i)
212 | self.agent.value_iteration()
213 | self.print_values(self.agent.value_table)
214 |
215 | def print_optimal_policy(self):
216 | self.improvement_count += 1
217 | for i in self.arrows:
218 | self.canvas.delete(i)
219 | for state in self.env.get_all_states():
220 | action = self.agent.get_action(state)
221 | self.draw_from_values(state, action)
222 |
223 |
224 | class Env:
225 | def __init__(self):
226 | self.transition_probability = TRANSITION_PROB
227 | self.width = WIDTH # Width of Grid World
228 | self.height = HEIGHT # Height of GridWorld
229 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
230 | self.possible_actions = POSSIBLE_ACTIONS
231 | self.reward[2][2] = 1 # reward 1 for circle
232 | self.reward[1][2] = -1 # reward -1 for triangle
233 | self.reward[2][1] = -1 # reward -1 for triangle
234 | self.all_state = []
235 |
236 | for x in range(WIDTH):
237 | for y in range(HEIGHT):
238 | state = [x, y]
239 | self.all_state.append(state)
240 |
241 | def get_reward(self, state, action):
242 | next_state = self.state_after_action(state, action)
243 | return self.reward[next_state[0]][next_state[1]]
244 |
245 | def state_after_action(self, state, action_index):
246 | action = ACTIONS[action_index]
247 | return self.check_boundary([state[0] + action[0], state[1] + action[1]])
248 |
249 | @staticmethod
250 | def check_boundary(state):
251 | state[0] = (0 if state[0] < 0 else WIDTH - 1
252 | if state[0] > WIDTH - 1 else state[0])
253 | state[1] = (0 if state[1] < 0 else HEIGHT - 1
254 | if state[1] > HEIGHT - 1 else state[1])
255 | return state
256 |
257 | def get_transition_prob(self, state, action):
258 | return self.transition_probability
259 |
260 | def get_all_states(self):
261 | return self.all_state
262 |
--------------------------------------------------------------------------------
/1-grid-world/2-value-iteration/value_iteration.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from environment import GraphicDisplay, Env
3 |
4 | class ValueIteration:
5 | def __init__(self, env):
6 | self.env = env
7 | # 2-d list for the value function
8 | self.value_table = [[0.0] * env.width for _ in range(env.height)]
9 | self.discount_factor = 0.9
10 |
11 | # get next value function table from the current value function table
12 | def value_iteration(self):
13 | next_value_table = [[0.0] * self.env.width
14 | for _ in range(self.env.height)]
15 | for state in self.env.get_all_states():
16 | if state == [2, 2]:
17 | next_value_table[state[0]][state[1]] = 0.0
18 | continue
19 | value_list = []
20 |
21 | for action in self.env.possible_actions:
22 | next_state = self.env.state_after_action(state, action)
23 | reward = self.env.get_reward(state, action)
24 | next_value = self.get_value(next_state)
25 | value_list.append((reward + self.discount_factor * next_value))
26 |             # take the maximum value (this is the Bellman optimality equation)
27 | next_value_table[state[0]][state[1]] = round(max(value_list), 2)
28 | self.value_table = next_value_table
29 |
30 | # get action according to the current value function table
31 | def get_action(self, state):
32 | action_list = []
33 | max_value = -99999
34 |
35 | if state == [2, 2]:
36 | return []
37 |
38 |         # calculate the q value for every action and
39 |         # append the actions with the maximum q value to the action list
40 | for action in self.env.possible_actions:
41 |
42 | next_state = self.env.state_after_action(state, action)
43 | reward = self.env.get_reward(state, action)
44 | next_value = self.get_value(next_state)
45 | value = (reward + self.discount_factor * next_value)
46 |
47 | if value > max_value:
48 | action_list.clear()
49 | action_list.append(action)
50 | max_value = value
51 | elif value == max_value:
52 | action_list.append(action)
53 |
54 | return action_list
55 |
56 | def get_value(self, state):
57 | return round(self.value_table[state[0]][state[1]], 2)
58 |
59 | if __name__ == "__main__":
60 | env = Env()
61 | value_iteration = ValueIteration(env)
62 | grid_world = GraphicDisplay(value_iteration)
63 | grid_world.mainloop()
64 |
--------------------------------------------------------------------------------
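Note: value_iteration above replaces the expectation over the policy with a maximum over actions, i.e. the Bellman optimality backup V(s) <- max_a (R(s, a) + gamma * V(s')). A minimal sketch on the same kind of made-up two-state MDP as before (all names and numbers are illustrative only):

    # one value-iteration sweep
    gamma = 0.9
    reward = {('A', 'stay'): 0, ('A', 'move'): 1, ('B', 'stay'): 0, ('B', 'move'): -1}
    next_state = {('A', 'stay'): 'A', ('A', 'move'): 'B', ('B', 'stay'): 'B', ('B', 'move'): 'A'}
    value = {'A': 0.0, 'B': 0.0}

    # V(s) <- max_a (R(s, a) + gamma * V(s'))
    value = {s: max(reward[(s, a)] + gamma * value[next_state[(s, a)]]
                    for a in ('stay', 'move'))
             for s in ('A', 'B')}
    print(value)
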
/1-grid-world/3-monte-carlo/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | np.random.seed(1)
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # pixels
9 | HEIGHT = 5 # grid height
10 | WIDTH = 5 # grid width
11 |
12 |
13 | class Env(tk.Tk):
14 | def __init__(self):
15 | super(Env, self).__init__()
16 | self.action_space = ['u', 'd', 'l', 'r']
17 | self.n_actions = len(self.action_space)
18 | self.title('monte carlo')
19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
20 | self.shapes = self.load_images()
21 | self.canvas = self._build_canvas()
22 | self.texts = []
23 |
24 | def _build_canvas(self):
25 | canvas = tk.Canvas(self, bg='white',
26 | height=HEIGHT * UNIT,
27 | width=WIDTH * UNIT)
28 | # create grids
29 |         for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
31 | canvas.create_line(x0, y0, x1, y1)
32 |         for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
34 | canvas.create_line(x0, y0, x1, y1)
35 |
36 | # add img to canvas
37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2])
41 |
42 | # pack all
43 | canvas.pack()
44 |
45 | return canvas
46 |
47 | def load_images(self):
48 | rectangle = PhotoImage(
49 | Image.open("../img/rectangle.png").resize((65, 65)))
50 | triangle = PhotoImage(
51 | Image.open("../img/triangle.png").resize((65, 65)))
52 | circle = PhotoImage(
53 | Image.open("../img/circle.png").resize((65, 65)))
54 |
55 | return rectangle, triangle, circle
56 |
57 | @staticmethod
58 | def coords_to_state(coords):
59 | x = int((coords[0] - 50) / 100)
60 | y = int((coords[1] - 50) / 100)
61 | return [x, y]
62 |
63 | def reset(self):
64 | self.update()
65 | time.sleep(0.5)
66 | x, y = self.canvas.coords(self.rectangle)
67 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
68 | # return observation
69 | return self.coords_to_state(self.canvas.coords(self.rectangle))
70 |
71 | def step(self, action):
72 | state = self.canvas.coords(self.rectangle)
73 | base_action = np.array([0, 0])
74 | self.render()
75 |
76 | if action == 0: # up
77 | if state[1] > UNIT:
78 | base_action[1] -= UNIT
79 | elif action == 1: # down
80 | if state[1] < (HEIGHT - 1) * UNIT:
81 | base_action[1] += UNIT
82 | elif action == 2: # left
83 | if state[0] > UNIT:
84 | base_action[0] -= UNIT
85 | elif action == 3: # right
86 | if state[0] < (WIDTH - 1) * UNIT:
87 | base_action[0] += UNIT
88 | # move agent
89 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
90 | # move rectangle to top level of canvas
91 | self.canvas.tag_raise(self.rectangle)
92 |
93 | next_state = self.canvas.coords(self.rectangle)
94 |
95 | # reward function
96 | if next_state == self.canvas.coords(self.circle):
97 | reward = 100
98 | done = True
99 | elif next_state in [self.canvas.coords(self.triangle1),
100 | self.canvas.coords(self.triangle2)]:
101 | reward = -100
102 | done = True
103 | else:
104 | reward = 0
105 | done = False
106 |
107 | next_state = self.coords_to_state(next_state)
108 |
109 | return next_state, reward, done
110 |
111 | def render(self):
112 | time.sleep(0.03)
113 | self.update()
114 |
--------------------------------------------------------------------------------
/1-grid-world/3-monte-carlo/mc_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import defaultdict
4 | from environment import Env
5 |
6 |
7 | # Monte Carlo agent that learns from sampled episodes
8 | class MCAgent:
9 | def __init__(self, actions):
10 | self.width = 5
11 | self.height = 5
12 | self.actions = actions
13 | self.learning_rate = 0.01
14 | self.discount_factor = 0.9
15 | self.epsilon = 0.1
16 | self.samples = []
17 | self.value_table = defaultdict(float)
18 |
19 |     # append a sample (state, reward, done) to memory
20 | def save_sample(self, state, reward, done):
21 | self.samples.append([state, reward, done])
22 |
23 |     # at the end of every episode, update the value function of visited states
24 | def update(self):
25 | G_t = 0
26 | visit_state = []
27 | for reward in reversed(self.samples):
28 | state = str(reward[0])
29 | if state not in visit_state:
30 | visit_state.append(state)
31 | G_t = self.discount_factor * (reward[1] + G_t)
32 | value = self.value_table[state]
33 | self.value_table[state] = (value +
34 | self.learning_rate * (G_t - value))
35 |
36 |     # get an action for the state according to the value table
37 |     # the agent picks an action with an epsilon-greedy policy
38 | def get_action(self, state):
39 | if np.random.rand() < self.epsilon:
40 | # take random action
41 | action = np.random.choice(self.actions)
42 | else:
43 | # take action according to the q function table
44 | next_state = self.possible_next_state(state)
45 | action = self.arg_max(next_state)
46 | return int(action)
47 |
48 |     # compute arg max; if multiple candidates exist, pick one randomly
49 | @staticmethod
50 | def arg_max(next_state):
51 | max_index_list = []
52 | max_value = next_state[0]
53 | for index, value in enumerate(next_state):
54 | if value > max_value:
55 | max_index_list.clear()
56 | max_value = value
57 | max_index_list.append(index)
58 | elif value == max_value:
59 | max_index_list.append(index)
60 | return random.choice(max_index_list)
61 |
62 |     # get the values of the possible next states
63 | def possible_next_state(self, state):
64 | col, row = state
65 | next_state = [0.0] * 4
66 |
67 | if row != 0:
68 | next_state[0] = self.value_table[str([col, row - 1])]
69 | else:
70 | next_state[0] = self.value_table[str(state)]
71 | if row != self.height - 1:
72 | next_state[1] = self.value_table[str([col, row + 1])]
73 | else:
74 | next_state[1] = self.value_table[str(state)]
75 | if col != 0:
76 | next_state[2] = self.value_table[str([col - 1, row])]
77 | else:
78 | next_state[2] = self.value_table[str(state)]
79 | if col != self.width - 1:
80 | next_state[3] = self.value_table[str([col + 1, row])]
81 | else:
82 | next_state[3] = self.value_table[str(state)]
83 |
84 | return next_state
85 |
86 |
87 | # main loop
88 | if __name__ == "__main__":
89 | env = Env()
90 | agent = MCAgent(actions=list(range(env.n_actions)))
91 |
92 | for episode in range(1000):
93 | state = env.reset()
94 | action = agent.get_action(state)
95 |
96 | while True:
97 | env.render()
98 |
99 |             # step forward to the next state; reward is a number, done is a boolean
100 | next_state, reward, done = env.step(action)
101 | agent.save_sample(next_state, reward, done)
102 |
103 | # get next action
104 | action = agent.get_action(next_state)
105 |
106 |             # at the end of each episode, update the value table
107 | if done:
108 | print("episode : ", episode)
109 | agent.update()
110 | agent.samples.clear()
111 | break
112 |
--------------------------------------------------------------------------------
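Note: mc_agent.py above stores (state, reward, done) samples and, once an episode ends, updates the value of each visited state toward the discounted return. Below is a standalone sketch of the textbook incremental Monte Carlo update on a made-up three-step episode; it uses the standard return G_t = r + gamma * G_{t+1} and an every-visit update, which differs slightly from the first-visit filtering and discounting order coded above, so treat it as an illustration rather than the repository's exact rule.

    # incremental every-visit Monte Carlo value update
    gamma, alpha = 0.9, 0.01
    episode = [('s0', 0), ('s1', 0), ('s2', 100)]   # (state, reward) pairs, oldest first
    value = {}

    G = 0.0
    for state, reward in reversed(episode):
        G = reward + gamma * G                      # return G_t = r + gamma * G_{t+1}
        v = value.get(state, 0.0)
        value[state] = v + alpha * (G - v)          # V(s) <- V(s) + alpha * (G - V(s))
    print(value)
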
/1-grid-world/4-sarsa/.python-version:
--------------------------------------------------------------------------------
1 | 3.5.0
2 |
--------------------------------------------------------------------------------
/1-grid-world/4-sarsa/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | np.random.seed(1)
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # pixels
9 | HEIGHT = 5 # grid height
10 | WIDTH = 5 # grid width
11 |
12 |
13 | class Env(tk.Tk):
14 | def __init__(self):
15 | super(Env, self).__init__()
16 | self.action_space = ['u', 'd', 'l', 'r']
17 | self.n_actions = len(self.action_space)
18 | self.title('SARSA')
19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
20 | self.shapes = self.load_images()
21 | self.canvas = self._build_canvas()
22 | self.texts = []
23 |
24 | def _build_canvas(self):
25 | canvas = tk.Canvas(self, bg='white',
26 | height=HEIGHT * UNIT,
27 | width=WIDTH * UNIT)
28 | # create grids
29 |         for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
31 | canvas.create_line(x0, y0, x1, y1)
32 |         for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
34 | canvas.create_line(x0, y0, x1, y1)
35 |
36 | # add img to canvas
37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2])
41 |
42 | # pack all
43 | canvas.pack()
44 |
45 | return canvas
46 |
47 | def load_images(self):
48 | rectangle = PhotoImage(
49 | Image.open("../img/rectangle.png").resize((65, 65)))
50 | triangle = PhotoImage(
51 | Image.open("../img/triangle.png").resize((65, 65)))
52 | circle = PhotoImage(
53 | Image.open("../img/circle.png").resize((65, 65)))
54 |
55 | return rectangle, triangle, circle
56 |
57 | def text_value(self, row, col, contents, action, font='Helvetica', size=10,
58 | style='normal', anchor="nw"):
59 | if action == 0:
60 | origin_x, origin_y = 7, 42
61 | elif action == 1:
62 | origin_x, origin_y = 85, 42
63 | elif action == 2:
64 | origin_x, origin_y = 42, 5
65 | else:
66 | origin_x, origin_y = 42, 77
67 |
68 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
69 | font = (font, str(size), style)
70 | text = self.canvas.create_text(x, y, fill="black", text=contents,
71 | font=font, anchor=anchor)
72 | return self.texts.append(text)
73 |
74 | def print_value_all(self, q_table):
75 | for i in self.texts:
76 | self.canvas.delete(i)
77 | self.texts.clear()
78 | for x in range(HEIGHT):
79 | for y in range(WIDTH):
80 | for action in range(0, 4):
81 | state = [x, y]
82 | if str(state) in q_table.keys():
83 | temp = q_table[str(state)][action]
84 | self.text_value(y, x, round(temp, 2), action)
85 |
86 | def coords_to_state(self, coords):
87 | x = int((coords[0] - 50) / 100)
88 | y = int((coords[1] - 50) / 100)
89 | return [x, y]
90 |
91 | def reset(self):
92 | self.update()
93 | time.sleep(0.5)
94 | x, y = self.canvas.coords(self.rectangle)
95 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
96 | self.render()
97 | # return observation
98 | return self.coords_to_state(self.canvas.coords(self.rectangle))
99 |
100 | def step(self, action):
101 | state = self.canvas.coords(self.rectangle)
102 | base_action = np.array([0, 0])
103 | self.render()
104 |
105 | if action == 0: # up
106 | if state[1] > UNIT:
107 | base_action[1] -= UNIT
108 | elif action == 1: # down
109 | if state[1] < (HEIGHT - 1) * UNIT:
110 | base_action[1] += UNIT
111 | elif action == 2: # left
112 | if state[0] > UNIT:
113 | base_action[0] -= UNIT
114 | elif action == 3: # right
115 | if state[0] < (WIDTH - 1) * UNIT:
116 | base_action[0] += UNIT
117 |
118 | # move agent
119 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
120 | # move rectangle to top level of canvas
121 | self.canvas.tag_raise(self.rectangle)
122 | next_state = self.canvas.coords(self.rectangle)
123 |
124 | # reward function
125 | if next_state == self.canvas.coords(self.circle):
126 | reward = 100
127 | done = True
128 | elif next_state in [self.canvas.coords(self.triangle1),
129 | self.canvas.coords(self.triangle2)]:
130 | reward = -100
131 | done = True
132 | else:
133 | reward = 0
134 | done = False
135 |
136 | next_state = self.coords_to_state(next_state)
137 |
138 | return next_state, reward, done
139 |
140 | def render(self):
141 | time.sleep(0.03)
142 | self.update()
143 |
--------------------------------------------------------------------------------
/1-grid-world/4-sarsa/sarsa_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from collections import defaultdict
4 | from environment import Env
5 |
6 |
7 | # SARSA agent that learns from a sample at every time step
8 | class SARSAgent:
9 | def __init__(self, actions):
10 | self.actions = actions
11 | self.learning_rate = 0.01
12 | self.discount_factor = 0.9
13 | self.epsilon = 0.1
14 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
15 |
16 |     # learn a new q function with the sample <s, a, r, s', a'>
17 | def learn(self, state, action, reward, next_state, next_action):
18 | current_q = self.q_table[state][action]
19 | next_state_q = self.q_table[next_state][next_action]
20 | new_q = (current_q + self.learning_rate *
21 | (reward + self.discount_factor * next_state_q - current_q))
22 | self.q_table[state][action] = new_q
23 |
24 |     # get an action for the state according to the q function table
25 |     # the agent picks an action with an epsilon-greedy policy
26 | def get_action(self, state):
27 | if np.random.rand() < self.epsilon:
28 | # take random action
29 | action = np.random.choice(self.actions)
30 | else:
31 | # take action according to the q function table
32 | state_action = self.q_table[state]
33 | action = self.arg_max(state_action)
34 | return action
35 |
36 | @staticmethod
37 | def arg_max(state_action):
38 | max_index_list = []
39 | max_value = state_action[0]
40 | for index, value in enumerate(state_action):
41 | if value > max_value:
42 | max_index_list.clear()
43 | max_value = value
44 | max_index_list.append(index)
45 | elif value == max_value:
46 | max_index_list.append(index)
47 | return random.choice(max_index_list)
48 |
49 | if __name__ == "__main__":
50 | env = Env()
51 | agent = SARSAgent(actions=list(range(env.n_actions)))
52 |
53 | for episode in range(1000):
54 | # reset environment and initialize state
55 |
56 | state = env.reset()
57 | # get action of state from agent
58 | action = agent.get_action(str(state))
59 |
60 | while True:
61 | env.render()
62 |
63 | # take action and proceed one step in the environment
64 | next_state, reward, done = env.step(action)
65 | next_action = agent.get_action(str(next_state))
66 |
67 |             # the agent learns a new q function with the sample
68 | agent.learn(str(state), action, reward, str(next_state), next_action)
69 |
70 | state = next_state
71 | action = next_action
72 |
73 |             # print the q function of all states on the screen
74 | env.print_value_all(agent.q_table)
75 |
76 | # if episode ends, then break
77 | if done:
78 | break
79 |
80 |
--------------------------------------------------------------------------------
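Note: the learn method above applies the on-policy TD(0) update, Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a)), where a' is the action actually chosen next. A minimal sketch of a single SARSA backup on a dict-based q table; the sample transition below is made up, with the str(state) keying mimicking what the agent above uses.

    from collections import defaultdict

    alpha, gamma = 0.01, 0.9
    q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])

    # hypothetical sample <s, a, r, s', a'>
    s, a, r, s_next, a_next = '[0, 0]', 3, 100, '[1, 0]', 1
    td_target = r + gamma * q_table[s_next][a_next]        # bootstrap on the chosen next action
    q_table[s][a] += alpha * (td_target - q_table[s][a])   # move Q(s, a) toward the TD target
    print(q_table[s])
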
/1-grid-world/5-q-learning/.python-version:
--------------------------------------------------------------------------------
1 | 3.5.0
2 |
--------------------------------------------------------------------------------
/1-grid-world/5-q-learning/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | np.random.seed(1)
7 | PhotoImage = ImageTk.PhotoImage
8 | UNIT = 100 # pixels
9 | HEIGHT = 5 # grid height
10 | WIDTH = 5 # grid width
11 |
12 |
13 | class Env(tk.Tk):
14 | def __init__(self):
15 | super(Env, self).__init__()
16 | self.action_space = ['u', 'd', 'l', 'r']
17 | self.n_actions = len(self.action_space)
18 | self.title('Q Learning')
19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
20 | self.shapes = self.load_images()
21 | self.canvas = self._build_canvas()
22 | self.texts = []
23 |
24 | def _build_canvas(self):
25 | canvas = tk.Canvas(self, bg='white',
26 | height=HEIGHT * UNIT,
27 | width=WIDTH * UNIT)
28 | # create grids
29 |         for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
31 | canvas.create_line(x0, y0, x1, y1)
32 |         for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
34 | canvas.create_line(x0, y0, x1, y1)
35 |
36 | # add img to canvas
37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2])
41 |
42 | # pack all
43 | canvas.pack()
44 |
45 | return canvas
46 |
47 | def load_images(self):
48 | rectangle = PhotoImage(
49 | Image.open("../img/rectangle.png").resize((65, 65)))
50 | triangle = PhotoImage(
51 | Image.open("../img/triangle.png").resize((65, 65)))
52 | circle = PhotoImage(
53 | Image.open("../img/circle.png").resize((65, 65)))
54 |
55 | return rectangle, triangle, circle
56 |
57 | def text_value(self, row, col, contents, action, font='Helvetica', size=10,
58 | style='normal', anchor="nw"):
59 |
60 | if action == 0:
61 | origin_x, origin_y = 7, 42
62 | elif action == 1:
63 | origin_x, origin_y = 85, 42
64 | elif action == 2:
65 | origin_x, origin_y = 42, 5
66 | else:
67 | origin_x, origin_y = 42, 77
68 |
69 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
70 | font = (font, str(size), style)
71 | text = self.canvas.create_text(x, y, fill="black", text=contents,
72 | font=font, anchor=anchor)
73 | return self.texts.append(text)
74 |
75 | def print_value_all(self, q_table):
76 | for i in self.texts:
77 | self.canvas.delete(i)
78 | self.texts.clear()
79 | for i in range(HEIGHT):
80 | for j in range(WIDTH):
81 | for action in range(0, 4):
82 | state = [i, j]
83 | if str(state) in q_table.keys():
84 | temp = q_table[str(state)][action]
85 | self.text_value(j, i, round(temp, 2), action)
86 |
87 | def coords_to_state(self, coords):
88 | x = int((coords[0] - 50) / 100)
89 | y = int((coords[1] - 50) / 100)
90 | return [x, y]
91 |
92 | def state_to_coords(self, state):
93 | x = int(state[0] * 100 + 50)
94 | y = int(state[1] * 100 + 50)
95 | return [x, y]
96 |
97 | def reset(self):
98 | self.update()
99 | time.sleep(0.5)
100 | x, y = self.canvas.coords(self.rectangle)
101 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
102 | self.render()
103 | # return observation
104 | return self.coords_to_state(self.canvas.coords(self.rectangle))
105 |
106 |
107 | def step(self, action):
108 | state = self.canvas.coords(self.rectangle)
109 | base_action = np.array([0, 0])
110 | self.render()
111 |
112 | if action == 0: # up
113 | if state[1] > UNIT:
114 | base_action[1] -= UNIT
115 | elif action == 1: # down
116 | if state[1] < (HEIGHT - 1) * UNIT:
117 | base_action[1] += UNIT
118 | elif action == 2: # left
119 | if state[0] > UNIT:
120 | base_action[0] -= UNIT
121 | elif action == 3: # right
122 | if state[0] < (WIDTH - 1) * UNIT:
123 | base_action[0] += UNIT
124 |
125 | # move agent
126 | self.canvas.move(self.rectangle, base_action[0], base_action[1])
127 | # move rectangle to top level of canvas
128 | self.canvas.tag_raise(self.rectangle)
129 | next_state = self.canvas.coords(self.rectangle)
130 |
131 | # reward function
132 | if next_state == self.canvas.coords(self.circle):
133 | reward = 100
134 | done = True
135 | elif next_state in [self.canvas.coords(self.triangle1),
136 | self.canvas.coords(self.triangle2)]:
137 | reward = -100
138 | done = True
139 | else:
140 | reward = 0
141 | done = False
142 |
143 | next_state = self.coords_to_state(next_state)
144 | return next_state, reward, done
145 |
146 | def render(self):
147 | time.sleep(0.03)
148 | self.update()
149 |
--------------------------------------------------------------------------------
/1-grid-world/5-q-learning/q_learning_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | from environment import Env
4 | from collections import defaultdict
5 |
6 | class QLearningAgent:
7 | def __init__(self, actions):
8 | # actions = [0, 1, 2, 3]
9 | self.actions = actions
10 | self.learning_rate = 0.01
11 | self.discount_factor = 0.9
12 | self.epsilon = 0.1
13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
14 |
15 | # update q function with sample
16 | def learn(self, state, action, reward, next_state):
17 | current_q = self.q_table[state][action]
18 | # using Bellman Optimality Equation to update q function
19 | new_q = reward + self.discount_factor * max(self.q_table[next_state])
20 | self.q_table[state][action] += self.learning_rate * (new_q - current_q)
21 |
22 |     # get an action for the state according to the q function table
23 |     # the agent picks an action with an epsilon-greedy policy
24 | def get_action(self, state):
25 | if np.random.rand() < self.epsilon:
26 | # take random action
27 | action = np.random.choice(self.actions)
28 | else:
29 | # take action according to the q function table
30 | state_action = self.q_table[state]
31 | action = self.arg_max(state_action)
32 | return action
33 |
34 | @staticmethod
35 | def arg_max(state_action):
36 | max_index_list = []
37 | max_value = state_action[0]
38 | for index, value in enumerate(state_action):
39 | if value > max_value:
40 | max_index_list.clear()
41 | max_value = value
42 | max_index_list.append(index)
43 | elif value == max_value:
44 | max_index_list.append(index)
45 | return random.choice(max_index_list)
46 |
47 | if __name__ == "__main__":
48 | env = Env()
49 | agent = QLearningAgent(actions=list(range(env.n_actions)))
50 |
51 | for episode in range(1000):
52 | state = env.reset()
53 |
54 | while True:
55 | env.render()
56 |
57 | # take action and proceed one step in the environment
58 | action = agent.get_action(str(state))
59 | next_state, reward, done = env.step(action)
60 |
61 |             # the agent learns a new q function with the sample
62 | agent.learn(str(state), action, reward, str(next_state))
63 |
64 | state = next_state
65 | env.print_value_all(agent.q_table)
66 |
67 | # if episode ends, then break
68 | if done:
69 | break
70 |
--------------------------------------------------------------------------------
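Note: q_learning_agent.py differs from SARSA only in the TD target, which takes the maximum q value over next actions instead of the q value of the chosen next action. A minimal sketch of one off-policy backup with a made-up sample, again keying the table by str(state) as the agent above does.

    from collections import defaultdict

    alpha, gamma = 0.01, 0.9
    q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])

    # hypothetical sample <s, a, r, s'>
    s, a, r, s_next = '[0, 0]', 3, 100, '[1, 0]'
    td_target = r + gamma * max(q_table[s_next])            # bootstrap on the greedy next action
    q_table[s][a] += alpha * (td_target - q_table[s][a])
    print(q_table[s])
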
/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import pylab
3 | import random
4 | import numpy as np
5 | from environment import Env
6 | from keras.layers import Dense
7 | from keras.optimizers import Adam
8 | from keras.models import Sequential
9 |
10 | EPISODES = 1000
11 |
12 |
13 | # DeepSARSA agent for the grid world
14 | # uses a neural network as the q function approximator
15 | class DeepSARSAgent:
16 | def __init__(self):
17 | self.load_model = False
18 | # actions which agent can do
19 | self.action_space = [0, 1, 2, 3, 4]
20 | # get size of state and action
21 | self.action_size = len(self.action_space)
22 | self.state_size = 15
23 | self.discount_factor = 0.99
24 | self.learning_rate = 0.001
25 |
26 | self.epsilon = 1. # exploration
27 | self.epsilon_decay = .9999
28 | self.epsilon_min = 0.01
29 | self.model = self.build_model()
30 |
31 | if self.load_model:
32 | self.epsilon = 0.05
33 | self.model.load_weights('./save_model/deep_sarsa_trained.h5')
34 |
35 |     # approximate the Q function using a neural network:
36 |     # the state is the input and the Q value of each action is the output
37 | def build_model(self):
38 | model = Sequential()
39 | model.add(Dense(30, input_dim=self.state_size, activation='relu'))
40 | model.add(Dense(30, activation='relu'))
41 | model.add(Dense(self.action_size, activation='linear'))
42 | model.summary()
43 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
44 | return model
45 |
46 | # get action from model using epsilon-greedy policy
47 | def get_action(self, state):
48 | if np.random.rand() <= self.epsilon:
49 | # The agent acts randomly
50 | return random.randrange(self.action_size)
51 | else:
52 |             # predict the q values for the given state
53 | state = np.float32(state)
54 | q_values = self.model.predict(state)
55 | return np.argmax(q_values[0])
56 |
57 | def train_model(self, state, action, reward, next_state, next_action, done):
58 | if self.epsilon > self.epsilon_min:
59 | self.epsilon *= self.epsilon_decay
60 |
61 | state = np.float32(state)
62 | next_state = np.float32(next_state)
63 | target = self.model.predict(state)[0]
64 |         # unlike Q-learning, SARSA uses the Q value of the actually
65 |         # selected next action at s' (there is no separate target model here)
66 | if done:
67 | target[action] = reward
68 | else:
69 | target[action] = (reward + self.discount_factor *
70 | self.model.predict(next_state)[0][next_action])
71 |
72 | target = np.reshape(target, [1, 5])
73 |         # fit the model on this single sample so the predicted q value
74 |         # moves toward the target q value
75 | self.model.fit(state, target, epochs=1, verbose=0)
76 |
77 |
78 | if __name__ == "__main__":
79 | env = Env()
80 | agent = DeepSARSAgent()
81 |
82 | global_step = 0
83 | scores, episodes = [], []
84 |
85 | for e in range(EPISODES):
86 | done = False
87 | score = 0
88 | state = env.reset()
89 | state = np.reshape(state, [1, 15])
90 |
91 | while not done:
92 | # fresh env
93 | global_step += 1
94 |
95 | # get action for the current state and go one step in environment
96 | action = agent.get_action(state)
97 | next_state, reward, done = env.step(action)
98 | next_state = np.reshape(next_state, [1, 15])
99 | next_action = agent.get_action(next_state)
100 | agent.train_model(state, action, reward, next_state, next_action,
101 | done)
102 | state = next_state
103 |             # the model is trained at every time step
104 | score += reward
105 |
106 | state = copy.deepcopy(next_state)
107 |
108 | if done:
109 | scores.append(score)
110 | episodes.append(e)
111 | pylab.plot(episodes, scores, 'b')
112 | pylab.savefig("./save_graph/deep_sarsa_.png")
113 | print("episode:", e, " score:", score, "global_step",
114 | global_step, " epsilon:", agent.epsilon)
115 |
116 | if e % 100 == 0:
117 | agent.model.save_weights("./save_model/deep_sarsa.h5")
118 |
--------------------------------------------------------------------------------
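Note: train_model above builds a regression target from the network's own prediction, overwrites only the taken action's entry with r + gamma * Q(s', a'), and fits the model toward it. Below is a framework-free sketch of that target construction; the numpy vectors stand in for hypothetical model.predict outputs and all values are made up.

    import numpy as np

    gamma = 0.99
    q_s = np.array([0.2, -0.1, 0.05, 0.0, 0.3])        # pretend Q values for state s
    q_s_next = np.array([0.1, 0.4, -0.2, 0.0, 0.05])   # pretend Q values for next state s'
    action, next_action, reward, done = 2, 1, -1.0, False

    target = q_s.copy()
    # only the taken action's entry is moved toward the deep SARSA target
    target[action] = reward if done else reward + gamma * q_s_next[next_action]
    target = target.reshape(1, 5)                      # shape the network would be fit on
    print(target)
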
/1-grid-world/6-deep-sarsa/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | PhotoImage = ImageTk.PhotoImage
7 | UNIT = 50 # pixels
8 | HEIGHT = 5 # grid height
9 | WIDTH = 5 # grid width
10 |
11 | np.random.seed(1)
12 |
13 |
14 | class Env(tk.Tk):
15 | def __init__(self):
16 | super(Env, self).__init__()
17 | self.action_space = ['u', 'd', 'l', 'r']
18 | self.action_size = len(self.action_space)
19 | self.title('DeepSARSA')
20 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
21 | self.shapes = self.load_images()
22 | self.canvas = self._build_canvas()
23 | self.counter = 0
24 | self.rewards = []
25 | self.goal = []
26 | # obstacle
27 | self.set_reward([0, 1], -1)
28 | self.set_reward([1, 2], -1)
29 | self.set_reward([2, 3], -1)
30 | # #goal
31 | self.set_reward([4, 4], 1)
32 |
33 | def _build_canvas(self):
34 | canvas = tk.Canvas(self, bg='white',
35 | height=HEIGHT * UNIT,
36 | width=WIDTH * UNIT)
37 | # create grids
38 |         for c in range(0, WIDTH * UNIT, UNIT):  # 0~200 by 50
39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
40 | canvas.create_line(x0, y0, x1, y1)
41 |         for r in range(0, HEIGHT * UNIT, UNIT):  # 0~200 by 50
42 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
43 | canvas.create_line(x0, y0, x1, y1)
44 |
45 | self.rewards = []
46 | self.goal = []
47 | # add image to canvas
48 | x, y = UNIT/2, UNIT/2
49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
50 |
51 |         # pack all
52 | canvas.pack()
53 |
54 | return canvas
55 |
56 | def load_images(self):
57 | rectangle = PhotoImage(
58 | Image.open("../img/rectangle.png").resize((30, 30)))
59 | triangle = PhotoImage(
60 | Image.open("../img/triangle.png").resize((30, 30)))
61 | circle = PhotoImage(
62 | Image.open("../img/circle.png").resize((30, 30)))
63 |
64 | return rectangle, triangle, circle
65 |
66 | def reset_reward(self):
67 |
68 | for reward in self.rewards:
69 | self.canvas.delete(reward['figure'])
70 |
71 | self.rewards.clear()
72 | self.goal.clear()
73 | self.set_reward([0, 1], -1)
74 | self.set_reward([1, 2], -1)
75 | self.set_reward([2, 3], -1)
76 |
77 | # #goal
78 | self.set_reward([4, 4], 1)
79 |
80 | def set_reward(self, state, reward):
81 | state = [int(state[0]), int(state[1])]
82 | x = int(state[0])
83 | y = int(state[1])
84 | temp = {}
85 | if reward > 0:
86 | temp['reward'] = reward
87 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
88 | (UNIT * y) + UNIT / 2,
89 | image=self.shapes[2])
90 |
91 | self.goal.append(temp['figure'])
92 |
93 |
94 | elif reward < 0:
95 | temp['direction'] = -1
96 | temp['reward'] = reward
97 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
98 | (UNIT * y) + UNIT / 2,
99 | image=self.shapes[1])
100 |
101 | temp['coords'] = self.canvas.coords(temp['figure'])
102 | temp['state'] = state
103 | self.rewards.append(temp)
104 |
105 | # new methods
106 |
107 | def check_if_reward(self, state):
108 | check_list = dict()
109 | check_list['if_goal'] = False
110 | rewards = 0
111 |
112 | for reward in self.rewards:
113 | if reward['state'] == state:
114 | rewards += reward['reward']
115 | if reward['reward'] == 1:
116 | check_list['if_goal'] = True
117 |
118 | check_list['rewards'] = rewards
119 |
120 | return check_list
121 |
122 | def coords_to_state(self, coords):
123 | x = int((coords[0] - UNIT / 2) / UNIT)
124 | y = int((coords[1] - UNIT / 2) / UNIT)
125 | return [x, y]
126 |
127 | def reset(self):
128 | self.update()
129 | time.sleep(0.5)
130 | x, y = self.canvas.coords(self.rectangle)
131 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
132 | # return observation
133 | self.reset_reward()
134 | return self.get_state()
135 |
136 | def step(self, action):
137 | self.counter += 1
138 | self.render()
139 |
140 | if self.counter % 2 == 1:
141 | self.rewards = self.move_rewards()
142 |
143 | next_coords = self.move(self.rectangle, action)
144 | check = self.check_if_reward(self.coords_to_state(next_coords))
145 | done = check['if_goal']
146 | reward = check['rewards']
147 |
148 | self.canvas.tag_raise(self.rectangle)
149 |
150 | s_ = self.get_state()
151 |
152 | return s_, reward, done
153 |
154 | def get_state(self):
155 |
156 | location = self.coords_to_state(self.canvas.coords(self.rectangle))
157 | agent_x = location[0]
158 | agent_y = location[1]
159 |
160 | states = list()
161 |
162 | # locations.append(agent_x)
163 | # locations.append(agent_y)
164 |
165 | for reward in self.rewards:
166 | reward_location = reward['state']
167 | states.append(reward_location[0] - agent_x)
168 | states.append(reward_location[1] - agent_y)
169 | if reward['reward'] < 0:
170 | states.append(-1)
171 | states.append(reward['direction'])
172 | else:
173 | states.append(1)
174 |
175 | return states
176 |
177 | def move_rewards(self):
178 | new_rewards = []
179 | for temp in self.rewards:
180 | if temp['reward'] == 1:
181 | new_rewards.append(temp)
182 | continue
183 | temp['coords'] = self.move_const(temp)
184 | temp['state'] = self.coords_to_state(temp['coords'])
185 | new_rewards.append(temp)
186 | return new_rewards
187 |
188 | def move_const(self, target):
189 |
190 | s = self.canvas.coords(target['figure'])
191 |
192 | base_action = np.array([0, 0])
193 |
194 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
195 | target['direction'] = 1
196 | elif s[0] == UNIT / 2:
197 | target['direction'] = -1
198 |
199 | if target['direction'] == -1:
200 | base_action[0] += UNIT
201 | elif target['direction'] == 1:
202 | base_action[0] -= UNIT
203 |
204 | if (target['figure'] is not self.rectangle
205 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
206 | base_action = np.array([0, 0])
207 |
208 | self.canvas.move(target['figure'], base_action[0], base_action[1])
209 |
210 | s_ = self.canvas.coords(target['figure'])
211 |
212 | return s_
213 |
214 | def move(self, target, action):
215 | s = self.canvas.coords(target)
216 |
217 | base_action = np.array([0, 0])
218 |
219 | if action == 0: # up
220 | if s[1] > UNIT:
221 | base_action[1] -= UNIT
222 | elif action == 1: # down
223 | if s[1] < (HEIGHT - 1) * UNIT:
224 | base_action[1] += UNIT
225 | elif action == 2: # right
226 | if s[0] < (WIDTH - 1) * UNIT:
227 | base_action[0] += UNIT
228 | elif action == 3: # left
229 | if s[0] > UNIT:
230 | base_action[0] -= UNIT
231 |
232 | self.canvas.move(target, base_action[0], base_action[1])
233 |
234 | s_ = self.canvas.coords(target)
235 |
236 | return s_
237 |
238 | def render(self):
239 | time.sleep(0.07)
240 | self.update()
241 |
--------------------------------------------------------------------------------
/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png
--------------------------------------------------------------------------------
/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/environment.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import tkinter as tk
4 | from PIL import ImageTk, Image
5 |
6 | PhotoImage = ImageTk.PhotoImage
7 | UNIT = 50 # pixels
8 | HEIGHT = 5 # grid height
9 | WIDTH = 5 # grid width
10 |
11 | np.random.seed(1)
12 |
13 |
14 | class Env(tk.Tk):
15 | def __init__(self):
16 | super(Env, self).__init__()
17 | self.action_space = ['u', 'd', 'l', 'r']
18 | self.action_size = len(self.action_space)
19 | self.title('Reinforce')
20 |         self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT))  # width x height
21 | self.shapes = self.load_images()
22 | self.canvas = self._build_canvas()
23 | self.counter = 0
24 | self.rewards = []
25 | self.goal = []
26 | # obstacle
27 | self.set_reward([0, 1], -1)
28 | self.set_reward([1, 2], -1)
29 | self.set_reward([2, 3], -1)
30 | # #goal
31 | self.set_reward([4, 4], 1)
32 |
33 | def _build_canvas(self):
34 | canvas = tk.Canvas(self, bg='white',
35 | height=HEIGHT * UNIT,
36 | width=WIDTH * UNIT)
37 | # create grids
38 |         for c in range(0, WIDTH * UNIT, UNIT): # 0~250 by 50
39 |             x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
40 |             canvas.create_line(x0, y0, x1, y1)
41 |         for r in range(0, HEIGHT * UNIT, UNIT): # 0~250 by 50
42 |             x0, y0, x1, y1 = 0, r, WIDTH * UNIT, r
43 | canvas.create_line(x0, y0, x1, y1)
44 |
45 | self.rewards = []
46 | self.goal = []
47 | # add image to canvas
48 | x, y = UNIT/2, UNIT/2
49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
50 |
51 |         # pack all
52 | canvas.pack()
53 |
54 | return canvas
55 |
56 | def load_images(self):
57 | rectangle = PhotoImage(
58 | Image.open("../img/rectangle.png").resize((30, 30)))
59 | triangle = PhotoImage(
60 | Image.open("../img/triangle.png").resize((30, 30)))
61 | circle = PhotoImage(
62 | Image.open("../img/circle.png").resize((30, 30)))
63 |
64 | return rectangle, triangle, circle
65 |
66 | def reset_reward(self):
67 |
68 | for reward in self.rewards:
69 | self.canvas.delete(reward['figure'])
70 |
71 | self.rewards.clear()
72 | self.goal.clear()
73 | self.set_reward([0, 1], -1)
74 | self.set_reward([1, 2], -1)
75 | self.set_reward([2, 3], -1)
76 |
77 | # #goal
78 | self.set_reward([4, 4], 1)
79 |
80 | def set_reward(self, state, reward):
81 | state = [int(state[0]), int(state[1])]
82 | x = int(state[0])
83 | y = int(state[1])
84 | temp = {}
85 | if reward > 0:
86 | temp['reward'] = reward
87 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
88 | (UNIT * y) + UNIT / 2,
89 | image=self.shapes[2])
90 |
91 | self.goal.append(temp['figure'])
92 |
93 |
94 | elif reward < 0:
95 | temp['direction'] = -1
96 | temp['reward'] = reward
97 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
98 | (UNIT * y) + UNIT / 2,
99 | image=self.shapes[1])
100 |
101 | temp['coords'] = self.canvas.coords(temp['figure'])
102 | temp['state'] = state
103 | self.rewards.append(temp)
104 |
105 | # new methods
106 |
107 | def check_if_reward(self, state):
108 | check_list = dict()
109 | check_list['if_goal'] = False
110 | rewards = 0
111 |
112 | for reward in self.rewards:
113 | if reward['state'] == state:
114 | rewards += reward['reward']
115 | if reward['reward'] > 0:
116 | check_list['if_goal'] = True
117 |
118 | check_list['rewards'] = rewards
119 |
120 | return check_list
121 |
122 | def coords_to_state(self, coords):
123 | x = int((coords[0] - UNIT / 2) / UNIT)
124 | y = int((coords[1] - UNIT / 2) / UNIT)
125 | return [x, y]
126 |
127 | def reset(self):
128 | self.update()
129 | x, y = self.canvas.coords(self.rectangle)
130 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
131 | # return observation
132 | self.reset_reward()
133 | return self.get_state()
134 |
135 | def step(self, action):
136 | self.counter += 1
137 | self.render()
138 |
139 | if self.counter % 2 == 1:
140 | self.rewards = self.move_rewards()
141 |
142 | next_coords = self.move(self.rectangle, action)
143 | check = self.check_if_reward(self.coords_to_state(next_coords))
144 | done = check['if_goal']
145 | reward = check['rewards']
146 | reward -= 0.1
147 | self.canvas.tag_raise(self.rectangle)
148 |
149 | s_ = self.get_state()
150 |
151 | return s_, reward, done
152 |
153 | def get_state(self):
154 |
155 | location = self.coords_to_state(self.canvas.coords(self.rectangle))
156 | agent_x = location[0]
157 | agent_y = location[1]
158 |
159 | states = list()
160 |
161 | # locations.append(agent_x)
162 | # locations.append(agent_y)
163 |
164 | for reward in self.rewards:
165 | reward_location = reward['state']
166 | states.append(reward_location[0] - agent_x)
167 | states.append(reward_location[1] - agent_y)
168 | if reward['reward'] < 0:
169 | states.append(-1)
170 | states.append(reward['direction'])
171 | else:
172 | states.append(1)
173 |
174 | return states
175 |
176 | def move_rewards(self):
177 | new_rewards = []
178 | for temp in self.rewards:
179 | if temp['reward'] > 0:
180 | new_rewards.append(temp)
181 | continue
182 | temp['coords'] = self.move_const(temp)
183 | temp['state'] = self.coords_to_state(temp['coords'])
184 | new_rewards.append(temp)
185 | return new_rewards
186 |
187 | def move_const(self, target):
188 |
189 | s = self.canvas.coords(target['figure'])
190 |
191 | base_action = np.array([0, 0])
192 |
193 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
194 | target['direction'] = 1
195 | elif s[0] == UNIT / 2:
196 | target['direction'] = -1
197 |
198 | if target['direction'] == -1:
199 | base_action[0] += UNIT
200 | elif target['direction'] == 1:
201 | base_action[0] -= UNIT
202 |
203 | if (target['figure'] is not self.rectangle
204 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
205 | base_action = np.array([0, 0])
206 |
207 | self.canvas.move(target['figure'], base_action[0], base_action[1])
208 |
209 | s_ = self.canvas.coords(target['figure'])
210 |
211 | return s_
212 |
213 | def move(self, target, action):
214 | s = self.canvas.coords(target)
215 |
216 | base_action = np.array([0, 0])
217 |
218 | if action == 0: # up
219 | if s[1] > UNIT:
220 | base_action[1] -= UNIT
221 | elif action == 1: # down
222 | if s[1] < (HEIGHT - 1) * UNIT:
223 | base_action[1] += UNIT
224 | elif action == 2: # right
225 | if s[0] < (WIDTH - 1) * UNIT:
226 | base_action[0] += UNIT
227 | elif action == 3: # left
228 | if s[0] > UNIT:
229 | base_action[0] -= UNIT
230 |
231 | self.canvas.move(target, base_action[0], base_action[1])
232 |
233 | s_ = self.canvas.coords(target)
234 |
235 | return s_
236 |
237 | def render(self):
238 | time.sleep(0.07)
239 | self.update()
240 |
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/reinforce_agent.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import pylab
3 | import numpy as np
4 | from environment import Env
5 | from keras.layers import Dense
6 | from keras.optimizers import Adam
7 | from keras.models import Sequential
8 | from keras import backend as K
9 |
10 | EPISODES = 2500
11 |
12 |
13 | # this is REINFORCE Agent for GridWorld
14 | class ReinforceAgent:
15 | def __init__(self):
16 | self.load_model = True
17 | # actions which agent can do
18 | self.action_space = [0, 1, 2, 3, 4]
19 | # get size of state and action
20 | self.action_size = len(self.action_space)
21 | self.state_size = 15
22 | self.discount_factor = 0.99
23 | self.learning_rate = 0.001
24 |
25 | self.model = self.build_model()
26 | self.optimizer = self.optimizer()
27 | self.states, self.actions, self.rewards = [], [], []
28 |
29 | if self.load_model:
30 | self.model.load_weights('./save_model/reinforce_trained.h5')
31 |
32 | # state is input and probability of each action(policy) is output of network
33 | def build_model(self):
34 | model = Sequential()
35 | model.add(Dense(24, input_dim=self.state_size, activation='relu'))
36 | model.add(Dense(24, activation='relu'))
37 | model.add(Dense(self.action_size, activation='softmax'))
38 | model.summary()
39 | return model
40 |
41 | # create error function and training function to update policy network
42 | def optimizer(self):
43 | action = K.placeholder(shape=[None, 5])
44 | discounted_rewards = K.placeholder(shape=[None, ])
45 |
46 | # Calculate cross entropy error function
47 | action_prob = K.sum(action * self.model.output, axis=1)
48 | cross_entropy = K.log(action_prob) * discounted_rewards
49 | loss = -K.sum(cross_entropy)
50 |
51 | # create training function
52 | optimizer = Adam(lr=self.learning_rate)
53 | updates = optimizer.get_updates(self.model.trainable_weights, [],
54 | loss)
55 | train = K.function([self.model.input, action, discounted_rewards], [],
56 | updates=updates)
57 |
58 | return train
59 |
60 | # get action from policy network
61 | def get_action(self, state):
62 | policy = self.model.predict(state)[0]
63 | return np.random.choice(self.action_size, 1, p=policy)[0]
64 |
65 | # calculate discounted rewards
66 | def discount_rewards(self, rewards):
67 | discounted_rewards = np.zeros_like(rewards)
68 | running_add = 0
69 | for t in reversed(range(0, len(rewards))):
70 | running_add = running_add * self.discount_factor + rewards[t]
71 | discounted_rewards[t] = running_add
72 | return discounted_rewards
73 |
74 | # save states, actions and rewards for an episode
75 | def append_sample(self, state, action, reward):
76 | self.states.append(state[0])
77 | self.rewards.append(reward)
78 | act = np.zeros(self.action_size)
79 | act[action] = 1
80 | self.actions.append(act)
81 |
82 | # update policy neural network
83 | def train_model(self):
84 | discounted_rewards = np.float32(self.discount_rewards(self.rewards))
85 | discounted_rewards -= np.mean(discounted_rewards)
86 | discounted_rewards /= np.std(discounted_rewards)
87 |
88 | self.optimizer([self.states, self.actions, discounted_rewards])
89 | self.states, self.actions, self.rewards = [], [], []
90 |
91 |
92 | if __name__ == "__main__":
93 | env = Env()
94 | agent = ReinforceAgent()
95 |
96 | global_step = 0
97 | scores, episodes = [], []
98 |
99 | for e in range(EPISODES):
100 | done = False
101 | score = 0
102 | # fresh env
103 | state = env.reset()
104 | state = np.reshape(state, [1, 15])
105 |
106 | while not done:
107 | global_step += 1
108 | # get action for the current state and go one step in environment
109 | action = agent.get_action(state)
110 | next_state, reward, done = env.step(action)
111 | next_state = np.reshape(next_state, [1, 15])
112 |
113 | agent.append_sample(state, action, reward)
114 | score += reward
115 | state = copy.deepcopy(next_state)
116 |
117 | if done:
118 | # update policy neural network for each episode
119 | agent.train_model()
120 | scores.append(score)
121 | episodes.append(e)
122 | score = round(score, 2)
123 | print("episode:", e, " score:", score, " time_step:",
124 | global_step)
125 |
126 | if e % 100 == 0:
127 | pylab.plot(episodes, scores, 'b')
128 | pylab.savefig("./save_graph/reinforce.png")
129 | agent.model.save_weights("./save_model/reinforce.h5")
130 |
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/save_graph/reinforce_trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/7-reinforce/save_graph/reinforce_trained.png
--------------------------------------------------------------------------------
/1-grid-world/7-reinforce/save_model/reinforce_trained.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/7-reinforce/save_model/reinforce_trained.h5
--------------------------------------------------------------------------------
/1-grid-world/README.md:
--------------------------------------------------------------------------------
1 | # Grid World with Reinforcement Learning
2 | This is the Grid World example that we made for testing simple algorithms.
3 | The game is simple: the red rectangle must reach the circle while avoiding the triangles.
4 |
5 | 
6 |
7 |
8 |
9 |
10 |
11 | ## Dynamic Programming
12 | **1. Policy Iteration**
13 |
14 | **2. Value Iteration**
15 |
16 |
17 |
18 | ## Reinforcement Learning Fundamental Algorithms
19 | **3. Monte-Carlo**
20 |
21 | **4. SARSA**
22 |
23 | **5. Q-Learning**
24 |
25 |
26 |
27 | ## Further Reinforcement Learning Algorithms
28 | >We have changed the Grid World so that the obstacles move. To solve this problem, we have to use a function approximator.
29 | We used a neural network as the function approximator (a minimal sketch of such a network is shown at the bottom of this page).
30 |
31 | 
32 |
33 |
34 |
35 | **6. Deep SARSA**
36 |
37 | **7. Policy Gradient**
38 |
39 |
40 |
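41 | A minimal sketch of such a function approximator (assuming Keras 2.x, as used by the agents in `6-deep-sarsa` and `7-reinforce`): a small fully connected network that maps the 15-dimensional state (the relative positions of the moving obstacles and the goal) to one output per action. The layer sizes mirror the agents in this repo; the optimizer settings below are only illustrative.
42 | 
43 | ```python
44 | from keras.models import Sequential
45 | from keras.layers import Dense
46 | 
47 | # 15-dimensional state in, one value per action out (5 actions in this Grid World)
48 | model = Sequential()
49 | model.add(Dense(24, input_dim=15, activation='relu'))
50 | model.add(Dense(24, activation='relu'))
51 | model.add(Dense(5, activation='linear'))  # use 'softmax' here for the policy-gradient agent
52 | model.compile(loss='mse', optimizer='adam')
53 | ```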
--------------------------------------------------------------------------------
/1-grid-world/gridworld.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/gridworld.png
--------------------------------------------------------------------------------
/1-grid-world/gridworld_changing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/gridworld_changing.png
--------------------------------------------------------------------------------
/1-grid-world/img/circle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/circle.png
--------------------------------------------------------------------------------
/1-grid-world/img/down.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/down.png
--------------------------------------------------------------------------------
/1-grid-world/img/left.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/left.png
--------------------------------------------------------------------------------
/1-grid-world/img/rectangle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/rectangle.png
--------------------------------------------------------------------------------
/1-grid-world/img/right.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/right.png
--------------------------------------------------------------------------------
/1-grid-world/img/triangle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/triangle.png
--------------------------------------------------------------------------------
/1-grid-world/img/up.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/up.png
--------------------------------------------------------------------------------
/2-cartpole/1-dqn/SumTree.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 |
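# A SumTree keeps priorities in a binary tree whose internal nodes store the sum of their
# children: total() returns the root sum in O(1), and get(s) walks down to the leaf whose
# cumulative-priority segment contains s in O(log n). It is used by cartpole_only_per.py
# to sample transitions in proportion to their priority.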
4 | class SumTree:
5 | write = 0
6 |
7 | def __init__(self, capacity):
8 | self.capacity = capacity
9 | self.tree = numpy.zeros(2 * capacity - 1)
10 | self.data = numpy.zeros(capacity, dtype=object)
11 |
12 | def _propagate(self, idx, change):
13 | parent = (idx - 1) // 2
14 |
15 | self.tree[parent] += change
16 |
17 | if parent != 0:
18 | self._propagate(parent, change)
19 |
20 | def _retrieve(self, idx, s):
21 | left = 2 * idx + 1
22 | right = left + 1
23 |
24 | if left >= len(self.tree):
25 | return idx
26 |
27 | if s <= self.tree[left]:
28 | return self._retrieve(left, s)
29 | else:
30 | return self._retrieve(right, s - self.tree[left])
31 |
32 | def total(self):
33 | return self.tree[0]
34 |
35 | def add(self, p, data):
36 | idx = self.write + self.capacity - 1
37 |
38 | self.data[self.write] = data
39 | self.update(idx, p)
40 |
41 | self.write += 1
42 | if self.write >= self.capacity:
43 | self.write = 0
44 |
45 | def update(self, idx, p):
46 | change = p - self.tree[idx]
47 |
48 | self.tree[idx] = p
49 | self._propagate(idx, change)
50 |
51 | def get(self, s):
52 | idx = self._retrieve(0, s)
53 | dataIdx = idx - self.capacity + 1
54 |
55 | return (idx, self.tree[idx], self.data[dataIdx])
56 |
--------------------------------------------------------------------------------
/2-cartpole/1-dqn/cartpole_dqn.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import random
5 | import numpy as np
6 | from collections import deque
7 | from keras.layers import Dense
8 | from keras.optimizers import Adam
9 | from keras.models import Sequential
10 |
11 | EPISODES = 300
12 |
13 |
14 | # DQN Agent for the Cartpole
15 | # it uses Neural Network to approximate q function
16 | # and replay memory & target q network
17 | class DQNAgent:
18 | def __init__(self, state_size, action_size):
19 | # if you want to see Cartpole learning, then change to True
20 | self.render = False
21 | self.load_model = False
22 |
23 | # get size of state and action
24 | self.state_size = state_size
25 | self.action_size = action_size
26 |
27 | # These are hyper parameters for the DQN
28 | self.discount_factor = 0.99
29 | self.learning_rate = 0.001
30 | self.epsilon = 1.0
31 | self.epsilon_decay = 0.999
32 | self.epsilon_min = 0.01
33 | self.batch_size = 64
34 | self.train_start = 1000
35 | # create replay memory using deque
36 | self.memory = deque(maxlen=2000)
37 |
38 | # create main model and target model
39 | self.model = self.build_model()
40 | self.target_model = self.build_model()
41 |
42 | # initialize target model
43 | self.update_target_model()
44 |
45 | if self.load_model:
46 | self.model.load_weights("./save_model/cartpole_dqn.h5")
47 |
48 | # approximate Q function using Neural Network
49 | # state is input and Q Value of each action is output of network
50 | def build_model(self):
51 | model = Sequential()
52 | model.add(Dense(24, input_dim=self.state_size, activation='relu',
53 | kernel_initializer='he_uniform'))
54 | model.add(Dense(24, activation='relu',
55 | kernel_initializer='he_uniform'))
56 | model.add(Dense(self.action_size, activation='linear',
57 | kernel_initializer='he_uniform'))
58 | model.summary()
59 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
60 | return model
61 |
62 | # after some time interval update the target model to be same with model
63 | def update_target_model(self):
64 | self.target_model.set_weights(self.model.get_weights())
65 |
66 | # get action from model using epsilon-greedy policy
67 | def get_action(self, state):
68 | if np.random.rand() <= self.epsilon:
69 | return random.randrange(self.action_size)
70 | else:
71 | q_value = self.model.predict(state)
72 | return np.argmax(q_value[0])
73 |
74 | # save sample to the replay memory
75 | def append_sample(self, state, action, reward, next_state, done):
76 | self.memory.append((state, action, reward, next_state, done))
77 | if self.epsilon > self.epsilon_min:
78 | self.epsilon *= self.epsilon_decay
79 |
80 | # pick samples randomly from replay memory (with batch_size)
81 | def train_model(self):
82 | if len(self.memory) < self.train_start:
83 | return
84 | batch_size = min(self.batch_size, len(self.memory))
85 | mini_batch = random.sample(self.memory, batch_size)
86 |
87 | update_input = np.zeros((batch_size, self.state_size))
88 | update_target = np.zeros((batch_size, self.state_size))
89 | action, reward, done = [], [], []
90 |
91 | for i in range(self.batch_size):
92 | update_input[i] = mini_batch[i][0]
93 | action.append(mini_batch[i][1])
94 | reward.append(mini_batch[i][2])
95 | update_target[i] = mini_batch[i][3]
96 | done.append(mini_batch[i][4])
97 |
98 | target = self.model.predict(update_input)
99 | target_val = self.target_model.predict(update_target)
100 |
101 | for i in range(self.batch_size):
102 | # Q Learning: get maximum Q value at s' from target model
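                # i.e. target = r + gamma * max_a' Q_target(s', a'), or just r when the episode ends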
103 | if done[i]:
104 | target[i][action[i]] = reward[i]
105 | else:
106 | target[i][action[i]] = reward[i] + self.discount_factor * (
107 | np.amax(target_val[i]))
108 |
109 | # and do the model fit!
110 | self.model.fit(update_input, target, batch_size=self.batch_size,
111 | epochs=1, verbose=0)
112 |
113 |
114 | if __name__ == "__main__":
115 | # In case of CartPole-v1, maximum length of episode is 500
116 | env = gym.make('CartPole-v1')
117 | # get size of state and action from environment
118 | state_size = env.observation_space.shape[0]
119 | action_size = env.action_space.n
120 |
121 | agent = DQNAgent(state_size, action_size)
122 |
123 | scores, episodes = [], []
124 |
125 | for e in range(EPISODES):
126 | done = False
127 | score = 0
128 | state = env.reset()
129 | state = np.reshape(state, [1, state_size])
130 |
131 | while not done:
132 | if agent.render:
133 | env.render()
134 |
135 | # get action for the current state and go one step in environment
136 | action = agent.get_action(state)
137 | next_state, reward, done, info = env.step(action)
138 | next_state = np.reshape(next_state, [1, state_size])
139 |             # if an action makes the episode end, then give a penalty of -100
140 | reward = reward if not done or score == 499 else -100
141 |
142 | # save the sample to the replay memory
143 | agent.append_sample(state, action, reward, next_state, done)
144 | # every time step do the training
145 | agent.train_model()
146 | score += reward
147 | state = next_state
148 |
149 | if done:
150 | # every episode update the target model to be same with model
151 | agent.update_target_model()
152 |
153 | # every episode, plot the play time
154 | score = score if score == 500 else score + 100
155 | scores.append(score)
156 | episodes.append(e)
157 | pylab.plot(episodes, scores, 'b')
158 | pylab.savefig("./save_graph/cartpole_dqn.png")
159 | print("episode:", e, " score:", score, " memory length:",
160 | len(agent.memory), " epsilon:", agent.epsilon)
161 |
162 |             # if the mean of scores of the last 10 episodes is bigger than 490
163 | # stop training
164 | if np.mean(scores[-min(10, len(scores)):]) > 490:
165 | sys.exit()
166 |
167 | # save the model
168 | if e % 50 == 0:
169 | agent.model.save_weights("./save_model/cartpole_dqn.h5")
170 |
--------------------------------------------------------------------------------
/2-cartpole/1-dqn/cartpole_only_per.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import random
5 | import numpy as np
6 | from SumTree import SumTree
7 | from collections import deque
8 | from keras.layers import Dense
9 | from keras.optimizers import Adam
10 | from keras.models import Sequential
11 |
12 | EPISODES = 300
13 |
14 |
15 | # DQN agent for the CartPole example
16 | class DQNAgent:
17 | def __init__(self, state_size, action_size):
18 | self.render = False
19 | self.load_model = False
20 |
21 |         # define the size of state and action
22 | self.state_size = state_size
23 | self.action_size = action_size
24 |
25 |         # DQN hyperparameters
26 | self.discount_factor = 0.99
27 | self.learning_rate = 0.001
28 | self.epsilon = 1.0
29 | self.epsilon_decay = 0.999
30 | self.epsilon_min = 0.01
31 | self.batch_size = 64
32 | self.train_start = 2000
33 | self.memory_size = 2000
34 |
35 |         # replay memory, maximum size 2000
36 | self.memory = Memory(self.memory_size)
37 |
38 |         # create main model and target model
39 | self.model = self.build_model()
40 | self.target_model = self.build_model()
41 |
42 |         # initialize the target model
43 | self.update_target_model()
44 |
45 | if self.load_model:
46 | self.model.load_weights("./save_model/cartpole_dqn_trained.h5")
47 |
48 |     # build a neural network: state is the input, Q-values are the output
49 | def build_model(self):
50 | model = Sequential()
51 | model.add(Dense(24, input_dim=self.state_size, activation='relu',
52 | kernel_initializer='he_uniform'))
53 | model.add(Dense(24, activation='relu',
54 | kernel_initializer='he_uniform'))
55 | model.add(Dense(self.action_size, activation='linear',
56 | kernel_initializer='he_uniform'))
57 | model.summary()
58 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
59 | return model
60 |
61 |     # update the target model with the weights of the model
62 | def update_target_model(self):
63 | self.target_model.set_weights(self.model.get_weights())
64 |
65 |     # select an action with an epsilon-greedy policy
66 | def get_action(self, state):
67 | if np.random.rand() <= self.epsilon:
68 | return random.randrange(self.action_size)
69 | else:
70 | q_value = self.model.predict(state)
71 | return np.argmax(q_value[0])
72 |
73 |     # save the sample to the replay memory
74 | def append_sample(self, state, action, reward, next_state, done):
75 | if self.epsilon == 1:
76 | done = True
77 |
78 |         # compute the TD-error and store it in memory together with the sample
79 | target = self.model.predict([state])
80 | old_val = target[0][action]
81 | target_val = self.target_model.predict([next_state])
82 | if done:
83 | target[0][action] = reward
84 | else:
85 | target[0][action] = reward + self.discount_factor * (
86 | np.amax(target_val[0]))
87 | error = abs(old_val - target[0][action])
88 |
89 | self.memory.add(error, (state, action, reward, next_state, done))
90 |
91 |     # train the model with a batch randomly sampled from the replay memory
92 | def train_model(self):
93 | if self.epsilon > self.epsilon_min:
94 | self.epsilon *= self.epsilon_decay
95 |
96 |         # sample a batch of transitions from memory
97 | mini_batch = self.memory.sample(self.batch_size)
98 |
99 | errors = np.zeros(self.batch_size)
100 | states = np.zeros((self.batch_size, self.state_size))
101 | next_states = np.zeros((self.batch_size, self.state_size))
102 | actions, rewards, dones = [], [], []
103 |
104 | for i in range(self.batch_size):
105 | states[i] = mini_batch[i][1][0]
106 | actions.append(mini_batch[i][1][1])
107 | rewards.append(mini_batch[i][1][2])
108 | next_states[i] = mini_batch[i][1][3]
109 | dones.append(mini_batch[i][1][4])
110 |
111 |         # Q-values of the model for the current states
112 |         # Q-values of the target model for the next states
113 | target = self.model.predict(states)
114 | target_val = self.target_model.predict(next_states)
115 |
116 |         # update target using the Bellman optimality equation
117 | for i in range(self.batch_size):
118 | old_val = target[i][actions[i]]
119 | if dones[i]:
120 | target[i][actions[i]] = rewards[i]
121 | else:
122 | target[i][actions[i]] = rewards[i] + self.discount_factor * (
123 | np.amax(target_val[i]))
124 |             # store the TD-error
125 | errors[i] = abs(old_val - target[i][actions[i]])
126 |
127 |         # update the priorities with the TD-errors
128 | for i in range(self.batch_size):
129 | idx = mini_batch[i][0]
130 | self.memory.update(idx, errors[i])
131 |
132 | self.model.fit(states, target, batch_size=self.batch_size,
133 | epochs=1, verbose=0)
134 |
135 |
136 | class Memory: # stored as ( s, a, r, s_ ) in SumTree
137 |     e = 0.01  # small constant added to the error so no transition has zero priority
138 |     a = 0.6   # exponent controlling how strongly the TD-error is prioritized
139 |
140 | def __init__(self, capacity):
141 | self.tree = SumTree(capacity)
142 |
143 | def _getPriority(self, error):
144 | return (error + self.e) ** self.a
145 |
146 | def add(self, error, sample):
147 | p = self._getPriority(error)
148 | self.tree.add(p, sample)
149 |
150 | def sample(self, n):
151 | batch = []
152 | segment = self.tree.total() / n
153 |
154 | for i in range(n):
155 | a = segment * i
156 | b = segment * (i + 1)
157 |
158 | s = random.uniform(a, b)
159 | (idx, p, data) = self.tree.get(s)
160 | batch.append((idx, data))
161 |
162 | return batch
163 |
164 | def update(self, idx, error):
165 | p = self._getPriority(error)
166 | self.tree.update(idx, p)
167 |
168 |
169 | if __name__ == "__main__":
170 |     # CartPole-v1 environment, maximum episode length is 500 time steps
171 | env = gym.make('CartPole-v1')
172 | state_size = env.observation_space.shape[0]
173 | action_size = env.action_space.n
174 |
175 |     # create the DQN agent
176 | agent = DQNAgent(state_size, action_size)
177 |
178 | scores, episodes = [], []
179 |
180 | step = 0
181 | for e in range(EPISODES):
182 | done = False
183 | score = 0
184 |         # reset env
185 | state = env.reset()
186 | state = np.reshape(state, [1, state_size])
187 |
188 | while not done:
189 | if agent.render:
190 | env.render()
191 | step += 1
192 |             # select an action for the current state
193 | action = agent.get_action(state)
194 |             # take one time step in the environment with the selected action
195 | next_state, reward, done, info = env.step(action)
196 | next_state = np.reshape(next_state, [1, state_size])
197 |             # give a -10 penalty if the episode ends before reaching 500 steps
198 | r = reward if not done or score+reward == 500 else -10
199 |             # save the sample to the replay memory
200 | agent.append_sample(state, action, r, next_state, done)
201 |             # train every time step once train_start steps have been collected
202 | if step >= agent.train_start:
203 | agent.train_model()
204 |
205 | score += reward
206 | state = next_state
207 |
208 | if done:
209 |             # every episode, update the target model with the weights of the model
210 | agent.update_target_model()
211 |
212 | # score = score if score == 500 else score + 100
213 |             # print the training result every episode
214 | scores.append(score)
215 | episodes.append(e)
216 | pylab.plot(episodes, scores, 'b')
217 | pylab.savefig("./save_graph/cartpole_dqn.png")
218 | print("episode:", e, " score:", score, " memory length:",
219 | step if step <= agent.memory_size else agent.memory_size, " epsilon:", agent.epsilon)
220 |
221 |             # stop training if the mean score of the last 10 episodes is greater than 490
222 | if np.mean(scores[-min(10, len(scores)):]) > 490:
223 | agent.model.save_weights("./save_model/cartpole_dqn.h5")
224 | sys.exit()
225 |
--------------------------------------------------------------------------------
/2-cartpole/1-dqn/save_graph/Cartpole_DQN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
--------------------------------------------------------------------------------
/2-cartpole/1-dqn/save_model/cartpole_dqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/1-dqn/save_model/cartpole_dqn.h5
--------------------------------------------------------------------------------
/2-cartpole/2-double-dqn/cartpole_ddqn.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import random
5 | import numpy as np
6 | from collections import deque
7 | from keras.layers import Dense
8 | from keras.optimizers import Adam
9 | from keras.models import Sequential
10 |
11 | EPISODES = 300
12 |
13 |
14 | # Double DQN Agent for the Cartpole
15 | # it uses Neural Network to approximate q function
16 | # and replay memory & target q network
17 | class DoubleDQNAgent:
18 | def __init__(self, state_size, action_size):
19 | # if you want to see Cartpole learning, then change to True
20 | self.render = False
21 | self.load_model = False
22 | # get size of state and action
23 | self.state_size = state_size
24 | self.action_size = action_size
25 |
26 |         # these are hyper parameters for the Double DQN
27 | self.discount_factor = 0.99
28 | self.learning_rate = 0.001
29 | self.epsilon = 1.0
30 | self.epsilon_decay = 0.999
31 | self.epsilon_min = 0.01
32 | self.batch_size = 64
33 | self.train_start = 1000
34 | # create replay memory using deque
35 | self.memory = deque(maxlen=2000)
36 |
37 | # create main model and target model
38 | self.model = self.build_model()
39 | self.target_model = self.build_model()
40 |
41 | # initialize target model
42 | self.update_target_model()
43 |
44 | if self.load_model:
45 | self.model.load_weights("./save_model/cartpole_ddqn.h5")
46 |
47 | # approximate Q function using Neural Network
48 | # state is input and Q Value of each action is output of network
49 | def build_model(self):
50 | model = Sequential()
51 | model.add(Dense(24, input_dim=self.state_size, activation='relu',
52 | kernel_initializer='he_uniform'))
53 | model.add(Dense(24, activation='relu',
54 | kernel_initializer='he_uniform'))
55 | model.add(Dense(self.action_size, activation='linear',
56 | kernel_initializer='he_uniform'))
57 | model.summary()
58 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
59 | return model
60 |
61 | # after some time interval update the target model to be same with model
62 | def update_target_model(self):
63 | self.target_model.set_weights(self.model.get_weights())
64 |
65 | # get action from model using epsilon-greedy policy
66 | def get_action(self, state):
67 | if np.random.rand() <= self.epsilon:
68 | return random.randrange(self.action_size)
69 | else:
70 | q_value = self.model.predict(state)
71 | return np.argmax(q_value[0])
72 |
73 | # save sample to the replay memory
74 | def append_sample(self, state, action, reward, next_state, done):
75 | self.memory.append((state, action, reward, next_state, done))
76 | if self.epsilon > self.epsilon_min:
77 | self.epsilon *= self.epsilon_decay
78 |
79 | # pick samples randomly from replay memory (with batch_size)
80 | def train_model(self):
81 | if len(self.memory) < self.train_start:
82 | return
83 | batch_size = min(self.batch_size, len(self.memory))
84 | mini_batch = random.sample(self.memory, batch_size)
85 |
86 | update_input = np.zeros((batch_size, self.state_size))
87 | update_target = np.zeros((batch_size, self.state_size))
88 | action, reward, done = [], [], []
89 |
90 | for i in range(batch_size):
91 | update_input[i] = mini_batch[i][0]
92 | action.append(mini_batch[i][1])
93 | reward.append(mini_batch[i][2])
94 | update_target[i] = mini_batch[i][3]
95 | done.append(mini_batch[i][4])
96 |
97 | target = self.model.predict(update_input)
98 | target_next = self.model.predict(update_target)
99 | target_val = self.target_model.predict(update_target)
100 |
101 | for i in range(self.batch_size):
102 | # like Q Learning, get maximum Q value at s'
103 | # But from target model
104 | if done[i]:
105 | target[i][action[i]] = reward[i]
106 | else:
107 | # the key point of Double DQN
108 | # selection of action is from model
109 | # update is from target model
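                # i.e. target = r + gamma * Q_target(s', argmax_a' Q_online(s', a'))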
110 | a = np.argmax(target_next[i])
111 | target[i][action[i]] = reward[i] + self.discount_factor * (
112 | target_val[i][a])
113 |
114 | # make minibatch which includes target q value and predicted q value
115 | # and do the model fit!
116 | self.model.fit(update_input, target, batch_size=self.batch_size,
117 | epochs=1, verbose=0)
118 |
119 |
120 | if __name__ == "__main__":
121 |     # In case of CartPole-v1, you can play for up to 500 time steps
122 | env = gym.make('CartPole-v1')
123 | # get size of state and action from environment
124 | state_size = env.observation_space.shape[0]
125 | action_size = env.action_space.n
126 |
127 | agent = DoubleDQNAgent(state_size, action_size)
128 |
129 | scores, episodes = [], []
130 |
131 | for e in range(EPISODES):
132 | done = False
133 | score = 0
134 | state = env.reset()
135 | state = np.reshape(state, [1, state_size])
136 |
137 | while not done:
138 | if agent.render:
139 | env.render()
140 |
141 | # get action for the current state and go one step in environment
142 | action = agent.get_action(state)
143 | next_state, reward, done, info = env.step(action)
144 | next_state = np.reshape(next_state, [1, state_size])
145 |             # if an action makes the episode end, then give a penalty of -100
146 | reward = reward if not done or score == 499 else -100
147 |
148 | # save the sample to the replay memory
149 | agent.append_sample(state, action, reward, next_state, done)
150 | # every time step do the training
151 | agent.train_model()
152 | score += reward
153 | state = next_state
154 |
155 | if done:
156 | # every episode update the target model to be same with model
157 | agent.update_target_model()
158 |
159 | # every episode, plot the play time
160 | score = score if score == 500 else score + 100
161 | scores.append(score)
162 | episodes.append(e)
163 | pylab.plot(episodes, scores, 'b')
164 | pylab.savefig("./save_graph/cartpole_ddqn.png")
165 | print("episode:", e, " score:", score, " memory length:",
166 | len(agent.memory), " epsilon:", agent.epsilon)
167 |
168 |             # if the mean of scores of the last 10 episodes is bigger than 490
169 | # stop training
170 | if np.mean(scores[-min(10, len(scores)):]) > 490:
171 | sys.exit()
172 |
173 | # save the model
174 | if e % 50 == 0:
175 | agent.model.save_weights("./save_model/cartpole_ddqn.h5")
176 |
--------------------------------------------------------------------------------
/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png
--------------------------------------------------------------------------------
/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5
--------------------------------------------------------------------------------
/2-cartpole/3-reinforce/cartpole_reinforce.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import numpy as np
5 | from keras.layers import Dense
6 | from keras.models import Sequential
7 | from keras.optimizers import Adam
8 |
9 | EPISODES = 1000
10 |
11 |
12 | # This is Policy Gradient agent for the Cartpole
13 | # In this example, we use REINFORCE algorithm which uses monte-carlo update rule
14 | class REINFORCEAgent:
15 | def __init__(self, state_size, action_size):
16 | # if you want to see Cartpole learning, then change to True
17 | self.render = False
18 | self.load_model = False
19 | # get size of state and action
20 | self.state_size = state_size
21 | self.action_size = action_size
22 |
23 | # These are hyper parameters for the Policy Gradient
24 | self.discount_factor = 0.99
25 | self.learning_rate = 0.001
26 | self.hidden1, self.hidden2 = 24, 24
27 |
28 | # create model for policy network
29 | self.model = self.build_model()
30 |
31 | # lists for the states, actions and rewards
32 | self.states, self.actions, self.rewards = [], [], []
33 |
34 | if self.load_model:
35 | self.model.load_weights("./save_model/cartpole_reinforce.h5")
36 |
37 | # approximate policy using Neural Network
38 | # state is input and probability of each action is output of network
39 | def build_model(self):
40 | model = Sequential()
41 | model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
42 | model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform'))
43 | model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
44 | model.summary()
45 |         # Using categorical crossentropy as a loss is a trick to easily
46 |         # implement the policy gradient. Categorical cross entropy is defined as
47 |         # H(p, q) = -sum(p_i * log(q_i)). For the action taken, a, you set
48 |         # p_a = advantage. q_a is the output of the policy network, which is
49 |         # the probability of taking the action a, i.e. policy(s, a). All other p_i
50 |         # are zero, thus H(p, q) = -A * log(policy(s, a)); minimizing it maximizes A * log(policy(s, a)).
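        # For example, if the taken action's advantage is 2.0 and the network assigns it
        # probability 0.7, this sample contributes -2.0 * log(0.7) ≈ 0.71 to the loss.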
51 | model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate))
52 | return model
53 |
54 | # using the output of policy network, pick action stochastically
55 | def get_action(self, state):
56 | policy = self.model.predict(state, batch_size=1).flatten()
57 | return np.random.choice(self.action_size, 1, p=policy)[0]
58 |
59 | # In Policy Gradient, Q function is not available.
60 | # Instead agent uses sample returns for evaluating policy
61 | def discount_rewards(self, rewards):
62 | discounted_rewards = np.zeros_like(rewards)
63 | running_add = 0
64 | for t in reversed(range(0, len(rewards))):
65 | running_add = running_add * self.discount_factor + rewards[t]
66 | discounted_rewards[t] = running_add
67 | return discounted_rewards
68 |
69 |     # save the state, action and reward of each step
70 | def append_sample(self, state, action, reward):
71 | self.states.append(state)
72 | self.rewards.append(reward)
73 | self.actions.append(action)
74 |
75 | # update policy network every episode
76 | def train_model(self):
77 | episode_length = len(self.states)
78 |
79 | discounted_rewards = self.discount_rewards(self.rewards)
80 | discounted_rewards -= np.mean(discounted_rewards)
81 | discounted_rewards /= np.std(discounted_rewards)
82 |
83 | update_inputs = np.zeros((episode_length, self.state_size))
84 | advantages = np.zeros((episode_length, self.action_size))
85 |
86 | for i in range(episode_length):
87 | update_inputs[i] = self.states[i]
88 | advantages[i][self.actions[i]] = discounted_rewards[i]
89 |
90 | self.model.fit(update_inputs, advantages, epochs=1, verbose=0)
91 | self.states, self.actions, self.rewards = [], [], []
92 |
93 | if __name__ == "__main__":
94 |     # In case of CartPole-v1, you can play for up to 500 time steps
95 | env = gym.make('CartPole-v1')
96 | # get size of state and action from environment
97 | state_size = env.observation_space.shape[0]
98 | action_size = env.action_space.n
99 |
100 | # make REINFORCE agent
101 | agent = REINFORCEAgent(state_size, action_size)
102 |
103 | scores, episodes = [], []
104 |
105 | for e in range(EPISODES):
106 | done = False
107 | score = 0
108 | state = env.reset()
109 | state = np.reshape(state, [1, state_size])
110 |
111 | while not done:
112 | if agent.render:
113 | env.render()
114 |
115 | # get action for the current state and go one step in environment
116 | action = agent.get_action(state)
117 | next_state, reward, done, info = env.step(action)
118 | next_state = np.reshape(next_state, [1, state_size])
119 | reward = reward if not done or score == 499 else -100
120 |
121 | # save the sample to the memory
122 | agent.append_sample(state, action, reward)
123 |
124 | score += reward
125 | state = next_state
126 |
127 | if done:
128 | # every episode, agent learns from sample returns
129 | agent.train_model()
130 |
131 | # every episode, plot the play time
132 | score = score if score == 500 else score + 100
133 | scores.append(score)
134 | episodes.append(e)
135 | pylab.plot(episodes, scores, 'b')
136 | pylab.savefig("./save_graph/cartpole_reinforce.png")
137 | print("episode:", e, " score:", score)
138 |
139 |             # if the mean of scores of the last 10 episodes is bigger than 490
140 | # stop training
141 | if np.mean(scores[-min(10, len(scores)):]) > 490:
142 | sys.exit()
143 |
144 | # save the model
145 | if e % 50 == 0:
146 | agent.model.save_weights("./save_model/cartpole_reinforce.h5")
147 |
--------------------------------------------------------------------------------
/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png
--------------------------------------------------------------------------------
/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5
--------------------------------------------------------------------------------
/2-cartpole/4-actor-critic/cartpole_a2c.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import numpy as np
5 | from keras.layers import Dense
6 | from keras.models import Sequential
7 | from keras.optimizers import Adam
8 |
9 | EPISODES = 1000
10 |
11 |
12 | # A2C(Advantage Actor-Critic) agent for the Cartpole
13 | class A2CAgent:
14 | def __init__(self, state_size, action_size):
15 | # if you want to see Cartpole learning, then change to True
16 | self.render = False
17 | self.load_model = False
18 | # get size of state and action
19 | self.state_size = state_size
20 | self.action_size = action_size
21 | self.value_size = 1
22 |
23 | # These are hyper parameters for the Policy Gradient
24 | self.discount_factor = 0.99
25 | self.actor_lr = 0.001
26 | self.critic_lr = 0.005
27 |
28 | # create model for policy network
29 | self.actor = self.build_actor()
30 | self.critic = self.build_critic()
31 |
32 | if self.load_model:
33 | self.actor.load_weights("./save_model/cartpole_actor.h5")
34 | self.critic.load_weights("./save_model/cartpole_critic.h5")
35 |
36 | # approximate policy and value using Neural Network
37 | # actor: state is input and probability of each action is output of model
38 | def build_actor(self):
39 | actor = Sequential()
40 | actor.add(Dense(24, input_dim=self.state_size, activation='relu',
41 | kernel_initializer='he_uniform'))
42 | actor.add(Dense(self.action_size, activation='softmax',
43 | kernel_initializer='he_uniform'))
44 | actor.summary()
45 | # See note regarding crossentropy in cartpole_reinforce.py
46 | actor.compile(loss='categorical_crossentropy',
47 | optimizer=Adam(lr=self.actor_lr))
48 | return actor
49 |
50 | # critic: state is input and value of state is output of model
51 | def build_critic(self):
52 | critic = Sequential()
53 | critic.add(Dense(24, input_dim=self.state_size, activation='relu',
54 | kernel_initializer='he_uniform'))
55 | critic.add(Dense(self.value_size, activation='linear',
56 | kernel_initializer='he_uniform'))
57 | critic.summary()
58 | critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
59 | return critic
60 |
61 | # using the output of policy network, pick action stochastically
62 | def get_action(self, state):
63 | policy = self.actor.predict(state, batch_size=1).flatten()
64 | return np.random.choice(self.action_size, 1, p=policy)[0]
65 |
66 |     # update the actor and critic networks every time step
67 | def train_model(self, state, action, reward, next_state, done):
68 | target = np.zeros((1, self.value_size))
69 | advantages = np.zeros((1, self.action_size))
70 |
71 | value = self.critic.predict(state)[0]
72 | next_value = self.critic.predict(next_state)[0]
73 |
74 | if done:
75 | advantages[0][action] = reward - value
76 | target[0][0] = reward
77 | else:
78 | advantages[0][action] = reward + self.discount_factor * (next_value) - value
79 | target[0][0] = reward + self.discount_factor * next_value
80 |
81 | self.actor.fit(state, advantages, epochs=1, verbose=0)
82 | self.critic.fit(state, target, epochs=1, verbose=0)
83 |
84 |
85 | if __name__ == "__main__":
86 | # In case of CartPole-v1, maximum length of episode is 500
87 | env = gym.make('CartPole-v1')
88 | # get size of state and action from environment
89 | state_size = env.observation_space.shape[0]
90 | action_size = env.action_space.n
91 |
92 | # make A2C agent
93 | agent = A2CAgent(state_size, action_size)
94 |
95 | scores, episodes = [], []
96 |
97 | for e in range(EPISODES):
98 | done = False
99 | score = 0
100 | state = env.reset()
101 | state = np.reshape(state, [1, state_size])
102 |
103 | while not done:
104 | if agent.render:
105 | env.render()
106 |
107 | action = agent.get_action(state)
108 | next_state, reward, done, info = env.step(action)
109 | next_state = np.reshape(next_state, [1, state_size])
110 |             # if an action makes the episode end, then give a penalty of -100
111 | reward = reward if not done or score == 499 else -100
112 |
113 | agent.train_model(state, action, reward, next_state, done)
114 |
115 | score += reward
116 | state = next_state
117 |
118 | if done:
119 | # every episode, plot the play time
120 | score = score if score == 500.0 else score + 100
121 | scores.append(score)
122 | episodes.append(e)
123 | pylab.plot(episodes, scores, 'b')
124 | pylab.savefig("./save_graph/cartpole_a2c.png")
125 | print("episode:", e, " score:", score)
126 |
127 |             # if the mean of scores of the last 10 episodes is bigger than 490
128 | # stop training
129 | if np.mean(scores[-min(10, len(scores)):]) > 490:
130 | sys.exit()
131 |
132 | # save the model
133 | if e % 50 == 0:
134 | agent.actor.save_weights("./save_model/cartpole_actor.h5")
135 | agent.critic.save_weights("./save_model/cartpole_critic.h5")
136 |
--------------------------------------------------------------------------------
/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png
--------------------------------------------------------------------------------
/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5
--------------------------------------------------------------------------------
/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5
--------------------------------------------------------------------------------
/2-cartpole/5-a3c/cartpole_a3c.py:
--------------------------------------------------------------------------------
1 | import threading
2 | import numpy as np
3 | import tensorflow as tf
4 | import pylab
5 | import time
6 | import gym
7 | from keras.layers import Dense, Input
8 | from keras.models import Model
9 | from keras.optimizers import Adam
10 | from keras import backend as K
11 |
12 |
13 | # global variables for threading
14 | episode = 0
15 | scores = []
16 |
17 | EPISODES = 2000
18 |
19 | # This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole
20 | # In this example, we use A3C algorithm
21 | class A3CAgent:
22 | def __init__(self, state_size, action_size, env_name):
23 | # get size of state and action
24 | self.state_size = state_size
25 | self.action_size = action_size
26 |
27 | # get gym environment name
28 | self.env_name = env_name
29 |
30 | # these are hyper parameters for the A3C
31 | self.actor_lr = 0.001
32 | self.critic_lr = 0.001
33 | self.discount_factor = .99
34 | self.hidden1, self.hidden2 = 24, 24
35 | self.threads = 8
36 |
37 | # create model for actor and critic network
38 | self.actor, self.critic = self.build_model()
39 |
40 | # method for training actor and critic network
41 | self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]
42 |
43 | self.sess = tf.InteractiveSession()
44 | K.set_session(self.sess)
45 | self.sess.run(tf.global_variables_initializer())
46 |
47 | # approximate policy and value using a neural network
48 | # actor -> state is input and the probability of each action is the output of the network
49 | # critic -> state is input and the value of the state is the output of the network
50 | # actor and critic networks share the first hidden layer
51 | def build_model(self):
52 | state = Input(batch_shape=(None, self.state_size))
53 | shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state)
54 |
55 | actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared)
56 | action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)
57 |
58 | value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared)
59 | state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)
60 |
61 | actor = Model(inputs=state, outputs=action_prob)
62 | critic = Model(inputs=state, outputs=state_value)
63 |
64 | actor._make_predict_function()
65 | critic._make_predict_function()
66 |
67 | actor.summary()
68 | critic.summary()
69 |
70 | return actor, critic
71 |
72 | # make loss function for the policy gradient
73 | # [log(action probability) * advantages] is the objective used for backprop
74 | # we add the entropy of the action probability to the loss to encourage exploration
75 | def actor_optimizer(self):
76 | action = K.placeholder(shape=(None, self.action_size))
77 | advantages = K.placeholder(shape=(None, ))
78 |
79 | policy = self.actor.output
80 |
81 | good_prob = K.sum(action * policy, axis=1)
82 | eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
83 | loss = -K.sum(eligibility)
84 |
85 | entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
86 |
87 | actor_loss = loss + 0.01*entropy
88 |
89 | optimizer = Adam(lr=self.actor_lr)
90 | updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
91 | train = K.function([self.actor.input, action, advantages], [], updates=updates)
92 | return train
93 |
94 | # make loss function for Value approximation
95 | def critic_optimizer(self):
96 | discounted_reward = K.placeholder(shape=(None, ))
97 |
98 | value = self.critic.output
99 |
100 | loss = K.mean(K.square(discounted_reward - value))
101 |
102 | optimizer = Adam(lr=self.critic_lr)
103 | updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
104 | train = K.function([self.critic.input, discounted_reward], [], updates=updates)
105 | return train
106 |
107 | # make local agents and start training
108 | def train(self):
109 | # self.load_model('./save_model/cartpole_a3c')
110 | agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,
111 | self.action_size, self.state_size) for i in range(self.threads)]
112 |
113 | for agent in agents:
114 | agent.start()
115 |
116 | while True:
117 | time.sleep(20)
118 |
119 | plot = scores[:]
120 | pylab.plot(range(len(plot)), plot, 'b')
121 | pylab.savefig("./save_graph/cartpole_a3c.png")
122 |
123 | self.save_model('./save_model/cartpole_a3c')
124 |
125 | def save_model(self, name):
126 | self.actor.save_weights(name + "_actor.h5")
127 | self.critic.save_weights(name + "_critic.h5")
128 |
129 | def load_model(self, name):
130 | self.actor.load_weights(name + "_actor.h5")
131 | self.critic.load_weights(name + "_critic.h5")
132 |
133 | # This is the local Agent class for threading
134 | class Agent(threading.Thread):
135 | def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):
136 | threading.Thread.__init__(self)
137 |
138 | self.states = []
139 | self.rewards = []
140 | self.actions = []
141 |
142 | self.index = index
143 | self.actor = actor
144 | self.critic = critic
145 | self.optimizer = optimizer
146 | self.env_name = env_name
147 | self.discount_factor = discount_factor
148 | self.action_size = action_size
149 | self.state_size = state_size
150 |
151 | # Thread interacting with the environment
152 | def run(self):
153 | global episode
154 | env = gym.make(self.env_name)
155 | while episode < EPISODES:
156 | state = env.reset()
157 | score = 0
158 | while True:
159 | action = self.get_action(state)
160 | next_state, reward, done, _ = env.step(action)
161 | score += reward
162 |
163 | self.memory(state, action, reward)
164 |
165 | state = next_state
166 |
167 | if done:
168 | episode += 1
169 | print("episode: ", episode, "/ score : ", score)
170 | scores.append(score)
171 | self.train_episode(score != 500)
172 | break
173 |
174 | # In policy gradient methods, the Q function is not available.
175 | # Instead, the agent uses sampled returns to evaluate the policy
176 | def discount_rewards(self, rewards, done=True):
177 | discounted_rewards = np.zeros_like(rewards)
178 | running_add = 0
179 | if not done:
180 | running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0]
181 | for t in reversed(range(0, len(rewards))):
182 | running_add = running_add * self.discount_factor + rewards[t]
183 | discounted_rewards[t] = running_add
184 | return discounted_rewards
185 |
186 | # save <state, action, reward> of each step
187 | # this is used for calculating the discounted rewards
188 | def memory(self, state, action, reward):
189 | self.states.append(state)
190 | act = np.zeros(self.action_size)
191 | act[action] = 1
192 | self.actions.append(act)
193 | self.rewards.append(reward)
194 |
195 | # update policy network and value network every episode
196 | def train_episode(self, done):
197 | discounted_rewards = self.discount_rewards(self.rewards, done)
198 |
199 | values = self.critic.predict(np.array(self.states))
200 | values = np.reshape(values, len(values))
201 |
202 | advantages = discounted_rewards - values
203 |
204 | self.optimizer[0]([self.states, self.actions, advantages])
205 | self.optimizer[1]([self.states, discounted_rewards])
206 | self.states, self.actions, self.rewards = [], [], []
207 |
208 | def get_action(self, state):
209 | policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
210 | return np.random.choice(self.action_size, 1, p=policy)[0]
211 |
212 |
213 | if __name__ == "__main__":
214 | env_name = 'CartPole-v1'
215 | env = gym.make(env_name)
216 |
217 | state_size = env.observation_space.shape[0]
218 | action_size = env.action_space.n
219 |
220 | env.close()
221 |
222 | global_agent = A3CAgent(state_size, action_size, env_name)
223 | global_agent.train()
224 |
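For reference, the update in train_episode above reduces to computing discounted returns (bootstrapped from the critic when the episode is cut off) and subtracting the critic's value estimates to obtain advantages. A minimal NumPy sketch of that calculation, using made-up numbers purely for illustration (not taken from a real run):

import numpy as np

rewards = [1.0, 1.0, 1.0, 1.0]            # rewards collected during one rollout
values = np.array([3.2, 2.9, 2.4, 1.6])   # hypothetical critic estimates V(s_t)
bootstrap = 1.0                            # critic estimate V(s_T) if the episode was cut off
gamma = 0.99

def discounted_returns(rewards, gamma, bootstrap=0.0):
    # work backwards: R_t = r_t + gamma * R_{t+1}, seeded with the bootstrap value
    returns = np.zeros(len(rewards))
    running = bootstrap
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

returns = discounted_returns(rewards, gamma, bootstrap)
advantages = returns - values   # the actor update is weighted by these advantages
print(returns, advantages)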
--------------------------------------------------------------------------------
/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5
--------------------------------------------------------------------------------
/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5
--------------------------------------------------------------------------------
/2-cartpole/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Keon Kim
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/2-cartpole/README.md:
--------------------------------------------------------------------------------
1 | # OpenAI gym Cartpole
2 |
3 |
4 | Various reinforcement learning algorithms for the CartPole example.
5 | 
6 |
7 |
8 |
9 | This is the training curve of the DQN algorithm
10 |
11 | 
12 |
13 |
14 | This is the training curve of the Double DQN algorithm
15 |
16 | 
17 |
18 |
19 | This is the training curve of the Policy Gradient algorithm
20 | 
21 |
22 |
23 | This is the training curve of the Actor-Critic algorithm
24 | 
--------------------------------------------------------------------------------
/2-cartpole/cartpole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/cartpole.png
--------------------------------------------------------------------------------
/3-atari/1-breakout/breakout_ddqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | import tensorflow as tf
5 | from collections import deque
6 | from skimage.color import rgb2gray
7 | from skimage.transform import resize
8 | from keras.models import Sequential
9 | from keras.optimizers import RMSprop
10 | from keras.layers import Dense, Flatten
11 | from keras.layers.convolutional import Conv2D
12 | from keras import backend as K
13 |
14 | EPISODES = 50000
15 |
16 |
17 | class DDQNAgent:
18 | def __init__(self, action_size):
19 | self.render = False
20 | self.load_model = False
21 | # environment settings
22 | self.state_size = (84, 84, 4)
23 | self.action_size = action_size
24 | # parameters about epsilon
25 | self.epsilon = 1.
26 | self.epsilon_start, self.epsilon_end = 1.0, 0.1
27 | self.exploration_steps = 1000000.
28 | self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
29 | / self.exploration_steps
30 | # parameters about training
31 | self.batch_size = 32
32 | self.train_start = 50000
33 | self.update_target_rate = 10000
34 | self.discount_factor = 0.99
35 | self.memory = deque(maxlen=400000)
36 | self.no_op_steps = 30
37 | # build
38 | self.model = self.build_model()
39 | self.target_model = self.build_model()
40 | self.update_target_model()
41 |
42 | self.optimizer = self.optimizer()
43 |
44 | self.sess = tf.InteractiveSession()
45 | K.set_session(self.sess)
46 |
47 | self.avg_q_max, self.avg_loss = 0, 0
48 | self.summary_placeholders, self.update_ops, self.summary_op = \
49 | self.setup_summary()
50 | self.summary_writer = tf.summary.FileWriter(
51 | 'summary/breakout_ddqn', self.sess.graph)
52 | self.sess.run(tf.global_variables_initializer())
53 |
54 | if self.load_model:
55 | self.model.load_weights("./save_model/breakout_ddqn.h5")
56 |
57 | # if the error is in [-1, 1], the cost is quadratic in the error,
58 | # but outside that interval the cost is linear in the error (Huber-style loss)
59 | def optimizer(self):
60 | a = K.placeholder(shape=(None, ), dtype='int32')
61 | y = K.placeholder(shape=(None, ), dtype='float32')
62 |
63 | py_x = self.model.output
64 |
65 | a_one_hot = K.one_hot(a, self.action_size)
66 | q_value = K.sum(py_x * a_one_hot, axis=1)
67 | error = K.abs(y - q_value)
68 |
69 | quadratic_part = K.clip(error, 0.0, 1.0)
70 | linear_part = error - quadratic_part
71 | loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)
72 |
73 | optimizer = RMSprop(lr=0.00025, epsilon=0.01)
74 | updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
75 | train = K.function([self.model.input, a, y], [loss], updates=updates)
76 |
77 | return train
78 |
79 | # approximate the Q function using a Convolutional Neural Network
80 | # state is input and the Q value of each action is the output of the network
81 | def build_model(self):
82 | model = Sequential()
83 | model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
84 | input_shape=self.state_size))
85 | model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
86 | model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
87 | model.add(Flatten())
88 | model.add(Dense(512, activation='relu'))
89 | model.add(Dense(self.action_size))
90 | model.summary()
91 |
92 | return model
93 |
94 | # after some time interval, update the target model to match the model
95 | def update_target_model(self):
96 | self.target_model.set_weights(self.model.get_weights())
97 |
98 | # get action from model using epsilon-greedy policy
99 | def get_action(self, history):
100 | history = np.float32(history / 255.0)
101 | if np.random.rand() <= self.epsilon:
102 | return random.randrange(self.action_size)
103 | else:
104 | q_value = self.model.predict(history)
105 | return np.argmax(q_value[0])
106 |
107 | # save sample to the replay memory
108 | def replay_memory(self, history, action, reward, next_history, dead):
109 | self.memory.append((history, action, reward, next_history, dead))
110 |
111 | # pick samples randomly from replay memory (with batch_size)
112 | def train_replay(self):
113 | if len(self.memory) < self.train_start:
114 | return
115 | if self.epsilon > self.epsilon_end:
116 | self.epsilon -= self.epsilon_decay_step
117 |
118 | mini_batch = random.sample(self.memory, self.batch_size)
119 |
120 | history = np.zeros((self.batch_size, self.state_size[0],
121 | self.state_size[1], self.state_size[2]))
122 | next_history = np.zeros((self.batch_size, self.state_size[0],
123 | self.state_size[1], self.state_size[2]))
124 | target = np.zeros((self.batch_size, ))
125 | action, reward, dead = [], [], []
126 |
127 | for i in range(self.batch_size):
128 | history[i] = np.float32(mini_batch[i][0] / 255.)
129 | next_history[i] = np.float32(mini_batch[i][3] / 255.)
130 | action.append(mini_batch[i][1])
131 | reward.append(mini_batch[i][2])
132 | dead.append(mini_batch[i][4])
133 |
134 | value = self.model.predict(next_history)
135 | target_value = self.target_model.predict(next_history)
136 |
137 | # like Q Learning, get maximum Q value at s'
138 | # But from target model
139 | for i in range(self.batch_size):
140 | if dead[i]:
141 | target[i] = reward[i]
142 | else:
143 | # the key point of Double DQN:
144 | # the action is selected with the online model,
145 | # but its value is taken from the target model
146 | target[i] = reward[i] + self.discount_factor * \
147 | target_value[i][np.argmax(value[i])]
148 |
149 | loss = self.optimizer([history, action, target])
150 | self.avg_loss += loss[0]
151 |
152 | # make summary operators for tensorboard
153 | def setup_summary(self):
154 | episode_total_reward = tf.Variable(0.)
155 | episode_avg_max_q = tf.Variable(0.)
156 | episode_duration = tf.Variable(0.)
157 | episode_avg_loss = tf.Variable(0.)
158 |
159 | tf.summary.scalar('Total Reward/Episode', episode_total_reward)
160 | tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
161 | tf.summary.scalar('Duration/Episode', episode_duration)
162 | tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
163 |
164 | summary_vars = [episode_total_reward, episode_avg_max_q,
165 | episode_duration, episode_avg_loss]
166 | summary_placeholders = [tf.placeholder(tf.float32) for _ in
167 | range(len(summary_vars))]
168 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
169 | range(len(summary_vars))]
170 | summary_op = tf.summary.merge_all()
171 | return summary_placeholders, update_ops, summary_op
172 |
173 |
174 | # 210*160*3(color) --> 84*84(mono)
175 | # float --> integer (to reduce the size of replay memory)
176 | def pre_processing(observe):
177 | processed_observe = np.uint8(
178 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
179 | return processed_observe
180 |
181 |
182 | if __name__ == "__main__":
183 | # In case of BreakoutDeterministic-v4, always skip 4 frames
184 | # the Deterministic-v4 version has 4 actions; the agent only uses 3 of them
185 | env = gym.make('BreakoutDeterministic-v4')
186 | agent = DDQNAgent(action_size=3)
187 |
188 | scores, episodes, global_step = [], [], 0
189 |
190 | for e in range(EPISODES):
191 | done = False
192 | dead = False
193 | # 1 episode = 5 lives
194 | step, score, start_life = 0, 0, 5
195 | observe = env.reset()
196 |
197 | # this is one of DeepMind's ideas:
198 | # do nothing for a random number of steps at the start of an episode to avoid sub-optimal start states
199 | for _ in range(random.randint(1, agent.no_op_steps)):
200 | observe, _, _, _ = env.step(1)
201 |
202 | # At start of episode, there is no preceding frame.
203 | # So just copy initial states to make history
204 | state = pre_processing(observe)
205 | history = np.stack((state, state, state, state), axis=2)
206 | history = np.reshape([history], (1, 84, 84, 4))
207 |
208 | while not done:
209 | if agent.render:
210 | env.render()
211 | global_step += 1
212 | step += 1
213 |
214 | # get action for the current history and go one step in environment
215 | action = agent.get_action(history)
216 | # change action to real_action
217 | if action == 0: real_action = 1
218 | elif action == 1: real_action = 2
219 | else: real_action = 3
220 |
221 | observe, reward, done, info = env.step(real_action)
222 | # pre-process the observation --> history
223 | next_state = pre_processing(observe)
224 | next_state = np.reshape([next_state], (1, 84, 84, 1))
225 | next_history = np.append(next_state, history[:, :, :, :3], axis=3)
226 |
227 | agent.avg_q_max += np.amax(
228 | agent.model.predict(np.float32(history / 255.))[0])
229 |
230 | # if the agent misses the ball, the agent is dead --> but the episode is not over
231 | if start_life > info['ale.lives']:
232 | dead = True
233 | start_life = info['ale.lives']
234 |
235 | reward = np.clip(reward, -1., 1.)
236 |
237 | # save the sample to the replay memory
238 | agent.replay_memory(history, action, reward, next_history, dead)
239 | # train the model on a random mini-batch from the replay memory
240 | agent.train_replay()
241 | # update the target model with model
242 | if global_step % agent.update_target_rate == 0:
243 | agent.update_target_model()
244 |
245 | score += reward
246 |
247 | # if the agent is dead, keep the current history instead of rolling in the next frame
248 | if dead:
249 | dead = False
250 | else:
251 | history = next_history
252 |
253 | # if done, record summary statistics for the episode
254 | if done:
255 | if global_step > agent.train_start:
256 | stats = [score, agent.avg_q_max / float(step), step,
257 | agent.avg_loss / float(step)]
258 | for i in range(len(stats)):
259 | agent.sess.run(agent.update_ops[i], feed_dict={
260 | agent.summary_placeholders[i]: float(stats[i])
261 | })
262 | summary_str = agent.sess.run(agent.summary_op)
263 | agent.summary_writer.add_summary(summary_str, e + 1)
264 |
265 | print("episode:", e, " score:", score, " memory length:",
266 | len(agent.memory), " epsilon:", agent.epsilon,
267 | " global_step:", global_step, " average_q:",
268 | agent.avg_q_max/float(step), " average loss:",
269 | agent.avg_loss/float(step))
270 |
271 | agent.avg_q_max, agent.avg_loss = 0, 0
272 |
273 | if e % 1000 == 0:
274 | agent.model.save_weights("./save_model/breakout_ddqn.h5")
275 |
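The Double DQN target used in train_replay above is easier to see in isolation: the online network selects the greedy action at the next state, while the target network evaluates it. A small NumPy sketch with made-up Q-values (not from a trained model):

import numpy as np

gamma = 0.99
reward = np.array([0.0, 1.0])
dead = np.array([False, True])
# hypothetical Q-values for two transitions and three actions
q_online_next = np.array([[0.2, 0.7, 0.1],     # online model Q(s', .) -> selects the action
                          [0.5, 0.4, 0.3]])
q_target_next = np.array([[0.25, 0.6, 0.15],   # target model Q(s', .) -> evaluates it
                          [0.45, 0.5, 0.35]])

best_action = np.argmax(q_online_next, axis=1)        # selection: online network
bootstrap = q_target_next[np.arange(2), best_action]  # evaluation: target network
target = reward + (1.0 - dead) * gamma * bootstrap    # terminal (dead) transitions keep only the reward
print(target)  # [0.594 1.   ]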
--------------------------------------------------------------------------------
/3-atari/1-breakout/breakout_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | import tensorflow as tf
5 | from collections import deque
6 | from skimage.color import rgb2gray
7 | from skimage.transform import resize
8 | from keras.models import Sequential
9 | from keras.optimizers import RMSprop
10 | from keras.layers import Dense, Flatten
11 | from keras.layers.convolutional import Conv2D
12 | from keras import backend as K
13 |
14 | EPISODES = 50000
15 |
16 |
17 | class DQNAgent:
18 | def __init__(self, action_size):
19 | self.render = False
20 | self.load_model = False
21 | # environment settings
22 | self.state_size = (84, 84, 4)
23 | self.action_size = action_size
24 | # parameters about epsilon
25 | self.epsilon = 1.
26 | self.epsilon_start, self.epsilon_end = 1.0, 0.1
27 | self.exploration_steps = 1000000.
28 | self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
29 | / self.exploration_steps
30 | # parameters about training
31 | self.batch_size = 32
32 | self.train_start = 50000
33 | self.update_target_rate = 10000
34 | self.discount_factor = 0.99
35 | self.memory = deque(maxlen=400000)
36 | self.no_op_steps = 30
37 | # build model
38 | self.model = self.build_model()
39 | self.target_model = self.build_model()
40 | self.update_target_model()
41 |
42 | self.optimizer = self.optimizer()
43 |
44 | self.sess = tf.InteractiveSession()
45 | K.set_session(self.sess)
46 |
47 | self.avg_q_max, self.avg_loss = 0, 0
48 | self.summary_placeholders, self.update_ops, self.summary_op = \
49 | self.setup_summary()
50 | self.summary_writer = tf.summary.FileWriter(
51 | 'summary/breakout_dqn', self.sess.graph)
52 | self.sess.run(tf.global_variables_initializer())
53 |
54 | if self.load_model:
55 | self.model.load_weights("./save_model/breakout_dqn.h5")
56 |
57 | # if the error is in [-1, 1], the cost is quadratic in the error,
58 | # but outside that interval the cost is linear in the error (Huber-style loss)
59 | def optimizer(self):
60 | a = K.placeholder(shape=(None,), dtype='int32')
61 | y = K.placeholder(shape=(None,), dtype='float32')
62 |
63 | py_x = self.model.output
64 |
65 | a_one_hot = K.one_hot(a, self.action_size)
66 | q_value = K.sum(py_x * a_one_hot, axis=1)
67 | error = K.abs(y - q_value)
68 |
69 | quadratic_part = K.clip(error, 0.0, 1.0)
70 | linear_part = error - quadratic_part
71 | loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)
72 |
73 | optimizer = RMSprop(lr=0.00025, epsilon=0.01)
74 | updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
75 | train = K.function([self.model.input, a, y], [loss], updates=updates)
76 |
77 | return train
78 |
79 | # approximate the Q function using a Convolutional Neural Network
80 | # state is input and the Q value of each action is the output of the network
81 | def build_model(self):
82 | model = Sequential()
83 | model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
84 | input_shape=self.state_size))
85 | model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
86 | model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
87 | model.add(Flatten())
88 | model.add(Dense(512, activation='relu'))
89 | model.add(Dense(self.action_size))
90 | model.summary()
91 | return model
92 |
93 | # after some time interval, update the target model to match the model
94 | def update_target_model(self):
95 | self.target_model.set_weights(self.model.get_weights())
96 |
97 | # get action from model using epsilon-greedy policy
98 | def get_action(self, history):
99 | history = np.float32(history / 255.0)
100 | if np.random.rand() <= self.epsilon:
101 | return random.randrange(self.action_size)
102 | else:
103 | q_value = self.model.predict(history)
104 | return np.argmax(q_value[0])
105 |
106 | # save sample to the replay memory
107 | def replay_memory(self, history, action, reward, next_history, dead):
108 | self.memory.append((history, action, reward, next_history, dead))
109 |
110 | # pick samples randomly from replay memory (with batch_size)
111 | def train_replay(self):
112 | if len(self.memory) < self.train_start:
113 | return
114 | if self.epsilon > self.epsilon_end:
115 | self.epsilon -= self.epsilon_decay_step
116 |
117 | mini_batch = random.sample(self.memory, self.batch_size)
118 |
119 | history = np.zeros((self.batch_size, self.state_size[0],
120 | self.state_size[1], self.state_size[2]))
121 | next_history = np.zeros((self.batch_size, self.state_size[0],
122 | self.state_size[1], self.state_size[2]))
123 | target = np.zeros((self.batch_size,))
124 | action, reward, dead = [], [], []
125 |
126 | for i in range(self.batch_size):
127 | history[i] = np.float32(mini_batch[i][0] / 255.)
128 | next_history[i] = np.float32(mini_batch[i][3] / 255.)
129 | action.append(mini_batch[i][1])
130 | reward.append(mini_batch[i][2])
131 | dead.append(mini_batch[i][4])
132 |
133 | target_value = self.target_model.predict(next_history)
134 |
135 | # like Q Learning, get maximum Q value at s'
136 | # But from target model
137 | for i in range(self.batch_size):
138 | if dead[i]:
139 | target[i] = reward[i]
140 | else:
141 | target[i] = reward[i] + self.discount_factor * \
142 | np.amax(target_value[i])
143 |
144 | loss = self.optimizer([history, action, target])
145 | self.avg_loss += loss[0]
146 |
147 | def save_model(self, name):
148 | self.model.save_weights(name)
149 |
150 | # make summary operators for tensorboard
151 | def setup_summary(self):
152 | episode_total_reward = tf.Variable(0.)
153 | episode_avg_max_q = tf.Variable(0.)
154 | episode_duration = tf.Variable(0.)
155 | episode_avg_loss = tf.Variable(0.)
156 |
157 | tf.summary.scalar('Total Reward/Episode', episode_total_reward)
158 | tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
159 | tf.summary.scalar('Duration/Episode', episode_duration)
160 | tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
161 |
162 | summary_vars = [episode_total_reward, episode_avg_max_q,
163 | episode_duration, episode_avg_loss]
164 | summary_placeholders = [tf.placeholder(tf.float32) for _ in
165 | range(len(summary_vars))]
166 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
167 | range(len(summary_vars))]
168 | summary_op = tf.summary.merge_all()
169 | return summary_placeholders, update_ops, summary_op
170 |
171 |
172 | # 210*160*3(color) --> 84*84(mono)
173 | # float --> integer (to reduce the size of replay memory)
174 | def pre_processing(observe):
175 | processed_observe = np.uint8(
176 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
177 | return processed_observe
178 |
179 |
180 | if __name__ == "__main__":
181 | # In case of BreakoutDeterministic-v4, always skip 4 frames
182 | # the Deterministic-v4 version has 4 actions; the agent only uses 3 of them
183 | env = gym.make('BreakoutDeterministic-v4')
184 | agent = DQNAgent(action_size=3)
185 |
186 | scores, episodes, global_step = [], [], 0
187 |
188 | for e in range(EPISODES):
189 | done = False
190 | dead = False
191 | # 1 episode = 5 lives
192 | step, score, start_life = 0, 0, 5
193 | observe = env.reset()
194 |
195 | # this is one of DeepMind's ideas:
196 | # do nothing for a random number of steps at the start of an episode to avoid sub-optimal start states
197 | for _ in range(random.randint(1, agent.no_op_steps)):
198 | observe, _, _, _ = env.step(1)
199 |
200 | # At start of episode, there is no preceding frame
201 | # So just copy initial states to make history
202 | state = pre_processing(observe)
203 | history = np.stack((state, state, state, state), axis=2)
204 | history = np.reshape([history], (1, 84, 84, 4))
205 |
206 | while not done:
207 | if agent.render:
208 | env.render()
209 | global_step += 1
210 | step += 1
211 |
212 | # get action for the current history and go one step in environment
213 | action = agent.get_action(history)
214 | # change action to real_action
215 | if action == 0:
216 | real_action = 1
217 | elif action == 1:
218 | real_action = 2
219 | else:
220 | real_action = 3
221 |
222 | observe, reward, done, info = env.step(real_action)
223 | # pre-process the observation --> history
224 | next_state = pre_processing(observe)
225 | next_state = np.reshape([next_state], (1, 84, 84, 1))
226 | next_history = np.append(next_state, history[:, :, :, :3], axis=3)
227 |
228 | agent.avg_q_max += np.amax(
229 | agent.model.predict(np.float32(history / 255.))[0])
230 |
231 | # if the agent misses the ball, the agent is dead --> but the episode is not over
232 | if start_life > info['ale.lives']:
233 | dead = True
234 | start_life = info['ale.lives']
235 |
236 | reward = np.clip(reward, -1., 1.)
237 |
238 | # save the sample to the replay memory
239 | agent.replay_memory(history, action, reward, next_history, dead)
240 | # train the model on a random mini-batch from the replay memory
241 | agent.train_replay()
242 | # update the target model with model
243 | if global_step % agent.update_target_rate == 0:
244 | agent.update_target_model()
245 |
246 | score += reward
247 |
248 | # if the agent is dead, keep the current history instead of rolling in the next frame
249 | if dead:
250 | dead = False
251 | else:
252 | history = next_history
253 |
254 | # if done, record summary statistics for the episode
255 | if done:
256 | if global_step > agent.train_start:
257 | stats = [score, agent.avg_q_max / float(step), step,
258 | agent.avg_loss / float(step)]
259 | for i in range(len(stats)):
260 | agent.sess.run(agent.update_ops[i], feed_dict={
261 | agent.summary_placeholders[i]: float(stats[i])
262 | })
263 | summary_str = agent.sess.run(agent.summary_op)
264 | agent.summary_writer.add_summary(summary_str, e + 1)
265 |
266 | print("episode:", e, " score:", score, " memory length:",
267 | len(agent.memory), " epsilon:", agent.epsilon,
268 | " global_step:", global_step, " average_q:",
269 | agent.avg_q_max / float(step), " average loss:",
270 | agent.avg_loss / float(step))
271 |
272 | agent.avg_q_max, agent.avg_loss = 0, 0
273 |
274 | if e % 1000 == 0:
275 | agent.model.save_weights("./save_model/breakout_dqn.h5")
276 |
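The custom loss built in optimizer() above is the clipped, Huber-style error from the DQN paper: quadratic for errors inside [-1, 1], linear outside. A standalone NumPy sketch of the same formula, with illustrative values only:

import numpy as np

def clipped_loss(y_true, q_value):
    # quadratic inside [-1, 1], linear outside, as in optimizer() above
    error = np.abs(y_true - q_value)
    quadratic_part = np.clip(error, 0.0, 1.0)
    linear_part = error - quadratic_part
    return np.mean(0.5 * np.square(quadratic_part) + linear_part)

# a small error is penalised quadratically, a large error only linearly
print(clipped_loss(np.array([1.0, 1.0]), np.array([0.5, 4.0])))  # 1.3125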
--------------------------------------------------------------------------------
/3-atari/1-breakout/breakout_dueling_ddqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | import tensorflow as tf
5 | from collections import deque
6 | from skimage.color import rgb2gray
7 | from skimage.transform import resize
8 | from keras.models import Model
9 | from keras.optimizers import RMSprop
10 | from keras.layers import Input, Dense, Flatten, Lambda, merge
11 | from keras.layers.convolutional import Conv2D
12 | from keras import backend as K
13 |
14 | EPISODES = 50000
15 |
16 |
17 | class DuelingDDQNAgent:
18 | def __init__(self, action_size):
19 | self.render = False
20 | self.load_model = False
21 | # environment settings
22 | self.state_size = (84, 84, 4)
23 | self.action_size = action_size
24 | # parameters about epsilon
25 | self.epsilon = 1.
26 | self.epsilon_start, self.epsilon_end = 1.0, 0.1
27 | self.exploration_steps = 1000000.
28 | self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
29 | / self.exploration_steps
30 | # parameters about training
31 | self.batch_size = 32
32 | self.train_start = 50000
33 | self.update_target_rate = 10000
34 | self.discount_factor = 0.99
35 | self.memory = deque(maxlen=400000)
36 | self.no_op_steps = 30
37 | # build
38 | self.model = self.build_model()
39 | self.target_model = self.build_model()
40 | self.update_target_model()
41 |
42 | self.optimizer = self.optimizer()
43 |
44 | self.sess = tf.InteractiveSession()
45 | K.set_session(self.sess)
46 |
47 | self.avg_q_max, self.avg_loss = 0, 0
48 | self.summary_placeholders, self.update_ops, self.summary_op = \
49 | self.setup_summary()
50 | self.summary_writer = tf.summary.FileWriter(
51 | 'summary/breakout_dueling_ddqn', self.sess.graph)
52 | self.sess.run(tf.global_variables_initializer())
53 |
54 | if self.load_model:
55 | self.model.load_weights("./save_model/breakout_dueling_ddqn.h5")
56 |
57 | # if the error is in [-1, 1], the cost is quadratic in the error,
58 | # but outside that interval the cost is linear in the error (Huber-style loss)
59 | def optimizer(self):
60 | a = K.placeholder(shape=(None, ), dtype='int32')
61 | y = K.placeholder(shape=(None, ), dtype='float32')
62 |
63 | py_x = self.model.output
64 |
65 | a_one_hot = K.one_hot(a, self.action_size)
66 | q_value = K.sum(py_x * a_one_hot, axis=1)
67 | error = K.abs(y - q_value)
68 |
69 | quadratic_part = K.clip(error, 0.0, 1.0)
70 | linear_part = error - quadratic_part
71 | loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)
72 |
73 | optimizer = RMSprop(lr=0.00025, epsilon=0.01)
74 | updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
75 | train = K.function([self.model.input, a, y], [loss], updates=updates)
76 |
77 | return train
78 |
79 | # approximate the Q function using a Convolutional Neural Network
80 | # state is input and the Q value of each action is the output of the network
81 | # the dueling network's Q value is the state value plus the mean-centred advantages
82 | def build_model(self):
83 | input = Input(shape=self.state_size)
84 | shared = Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input)
85 | shared = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(shared)
86 | shared = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(shared)
87 | flatten = Flatten()(shared)
88 |
89 | # network separate state value and advantages
90 | advantage_fc = Dense(512, activation='relu')(flatten)
91 | advantage = Dense(self.action_size)(advantage_fc)
92 | advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], axis=1, keepdims=True),
93 | output_shape=(self.action_size,))(advantage)
94 |
95 | value_fc = Dense(512, activation='relu')(flatten)
96 | value = Dense(1)(value_fc)
97 | value = Lambda(lambda s: K.expand_dims(s[:, 0], -1),
98 | output_shape=(self.action_size,))(value)
99 |
100 | # network merged and make Q Value
101 | q_value = merge([value, advantage], mode='sum')
102 | model = Model(inputs=input, outputs=q_value)
103 | model.summary()
104 |
105 | return model
106 |
107 | # after some time interval, update the target model to match the model
108 | def update_target_model(self):
109 | self.target_model.set_weights(self.model.get_weights())
110 |
111 | # get action from model using epsilon-greedy policy
112 | def get_action(self, history):
113 | history = np.float32(history / 255.0)
114 | if np.random.rand() <= self.epsilon:
115 | return random.randrange(self.action_size)
116 | else:
117 | q_value = self.model.predict(history)
118 | return np.argmax(q_value[0])
119 |
120 | # save sample to the replay memory
121 | def replay_memory(self, history, action, reward, next_history, dead):
122 | self.memory.append((history, action, reward, next_history, dead))
123 |
124 | # pick samples randomly from replay memory (with batch_size)
125 | def train_replay(self):
126 | if len(self.memory) < self.train_start:
127 | return
128 | if self.epsilon > self.epsilon_end:
129 | self.epsilon -= self.epsilon_decay_step
130 |
131 | mini_batch = random.sample(self.memory, self.batch_size)
132 |
133 | history = np.zeros((self.batch_size, self.state_size[0],
134 | self.state_size[1], self.state_size[2]))
135 | next_history = np.zeros((self.batch_size, self.state_size[0],
136 | self.state_size[1], self.state_size[2]))
137 | target = np.zeros((self.batch_size, ))
138 | action, reward, dead = [], [], []
139 |
140 | for i in range(self.batch_size):
141 | history[i] = np.float32(mini_batch[i][0] / 255.)
142 | next_history[i] = np.float32(mini_batch[i][3] / 255.)
143 | action.append(mini_batch[i][1])
144 | reward.append(mini_batch[i][2])
145 | dead.append(mini_batch[i][4])
146 |
147 | value = self.model.predict(next_history)  # Double DQN: select the next action with the online model
148 | target_value = self.target_model.predict(next_history)
149 |
150 | # like Q Learning, get maximum Q value at s'
151 | # But from target model
152 | for i in range(self.batch_size):
153 | if dead[i]:
154 | target[i] = reward[i]
155 | else:
156 | # the key point of Double DQN:
157 | # the action is selected with the online model,
158 | # but its value is taken from the target model
159 | target[i] = reward[i] + self.discount_factor * \
160 | target_value[i][np.argmax(value[i])]
161 |
162 | loss = self.optimizer([history, action, target])
163 | self.avg_loss += loss[0]
164 |
165 | def setup_summary(self):
166 | episode_total_reward = tf.Variable(0.)
167 | episode_avg_max_q = tf.Variable(0.)
168 | episode_duration = tf.Variable(0.)
169 | episode_avg_loss = tf.Variable(0.)
170 |
171 | tf.summary.scalar('Total Reward/Episode', episode_total_reward)
172 | tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
173 | tf.summary.scalar('Duration/Episode', episode_duration)
174 | tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
175 |
176 | summary_vars = [episode_total_reward, episode_avg_max_q,
177 | episode_duration, episode_avg_loss]
178 | summary_placeholders = [tf.placeholder(tf.float32) for _ in
179 | range(len(summary_vars))]
180 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in
181 | range(len(summary_vars))]
182 | summary_op = tf.summary.merge_all()
183 | return summary_placeholders, update_ops, summary_op
184 |
185 |
186 | # 210*160*3(color) --> 84*84(mono)
187 | # float --> integer (to reduce the size of replay memory)
188 | def pre_processing(observe):
189 | processed_observe = np.uint8(
190 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
191 | return processed_observe
192 |
193 |
194 | if __name__ == "__main__":
195 | # In case of BreakoutDeterministic-v4, always skip 4 frames
196 | # the Deterministic-v4 version has 4 actions; the agent only uses 3 of them
197 | env = gym.make('BreakoutDeterministic-v4')
198 | agent = DuelingDDQNAgent(action_size=3)
199 |
200 | scores, episodes, global_step = [], [], 0
201 |
202 | for e in range(EPISODES):
203 | done = False
204 | dead = False
205 | # 1 episode = 5 lives
206 | step, score, start_life = 0, 0, 5
207 | observe = env.reset()
208 |
209 | # this is one of DeepMind's ideas:
210 | # do nothing for a random number of steps at the start of an episode to avoid sub-optimal start states
211 | for _ in range(random.randint(1, agent.no_op_steps)):
212 | observe, _, _, _ = env.step(1)
213 |
214 | # At start of episode, there is no preceding frame.
215 | # So just copy initial states to make history
216 | state = pre_processing(observe)
217 | history = np.stack((state, state, state, state), axis=2)
218 | history = np.reshape([history], (1, 84, 84, 4))
219 |
220 | while not done:
221 | if agent.render:
222 | env.render()
223 | global_step += 1
224 | step += 1
225 |
226 | # get action for the current history and go one step in environment
227 | action = agent.get_action(history)
228 | # change action to real_action
229 | if action == 0: real_action = 1
230 | elif action == 1: real_action = 2
231 | else: real_action = 3
232 |
233 | observe, reward, done, info = env.step(real_action)
234 | # pre-process the observation --> history
235 | next_state = pre_processing(observe)
236 | next_state = np.reshape([next_state], (1, 84, 84, 1))
237 | next_history = np.append(next_state, history[:, :, :, :3], axis=3)
238 |
239 | agent.avg_q_max += np.amax(
240 | agent.model.predict(np.float32(history / 255.))[0])
241 |
242 | # if the agent misses the ball, the agent is dead --> but the episode is not over
243 | if start_life > info['ale.lives']:
244 | dead = True
245 | start_life = info['ale.lives']
246 |
247 | reward = np.clip(reward, -1., 1.)
248 |
249 | # save the sample to the replay memory
250 | agent.replay_memory(history, action, reward, next_history, dead)
251 | # train the model on a random mini-batch from the replay memory
252 | agent.train_replay()
253 | # update the target model with model
254 | if global_step % agent.update_target_rate == 0:
255 | agent.update_target_model()
256 |
257 | score += reward
258 |
259 | # if the agent is dead, keep the current history instead of rolling in the next frame
260 | if dead:
261 | dead = False
262 | else:
263 | history = next_history
264 |
265 | # if done, record summary statistics for the episode
266 | if done:
267 | if global_step > agent.train_start:
268 | stats = [score, agent.avg_q_max / float(step), step,
269 | agent.avg_loss / float(step)]
270 | for i in range(len(stats)):
271 | agent.sess.run(agent.update_ops[i], feed_dict={
272 | agent.summary_placeholders[i]: float(stats[i])
273 | })
274 | summary_str = agent.sess.run(agent.summary_op)
275 | agent.summary_writer.add_summary(summary_str, e + 1)
276 |
277 | print("episode:", e, " score:", score, " memory length:",
278 | len(agent.memory), " epsilon:", agent.epsilon,
279 | " global_step:", global_step, " average_q:",
280 | agent.avg_q_max/float(step), " average loss:",
281 | agent.avg_loss/float(step))
282 |
283 | agent.avg_q_max, agent.avg_loss = 0, 0
284 |
285 | if e % 1000 == 0:
286 | agent.model.save_weights("./save_model/breakout_dueling_ddqn.h5")
287 |
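The two Lambda layers in build_model above implement the dueling aggregation Q(s, a) = V(s) + (A(s, a) - mean over actions of A(s, a)); centring the advantages per state makes the value/advantage split identifiable. A tiny NumPy sketch with hypothetical stream outputs:

import numpy as np

# hypothetical outputs of the two streams for a batch of 2 states and 3 actions
value = np.array([[1.0], [0.5]])               # V(s), shape (batch, 1)
advantage = np.array([[0.2, -0.1, 0.5],        # A(s, a), shape (batch, actions)
                      [0.0,  0.3, -0.3]])

q = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q)
# [[1.   0.7  1.3 ]
#  [0.5  0.8  0.2 ]]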
--------------------------------------------------------------------------------
/3-atari/1-breakout/play_a3c_model.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | from skimage.color import rgb2gray
5 | from skimage.transform import resize
6 | from keras.models import Model
7 | from keras.layers import Dense, Flatten, Input
8 | from keras.layers.convolutional import Conv2D
9 |
10 | global episode
11 | episode = 0
12 | EPISODES = 8000000
13 | env_name = "BreakoutDeterministic-v4"
14 |
15 | class TestAgent:
16 | def __init__(self, action_size):
17 | self.state_size = (84, 84, 4)
18 | self.action_size = action_size
19 |
20 | self.discount_factor = 0.99
21 | self.no_op_steps = 30
22 |
23 | self.actor, self.critic = self.build_model()
24 |
25 | def build_model(self):
26 | input = Input(shape=self.state_size)
27 | conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input)
28 | conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv)
29 | conv = Flatten()(conv)
30 | fc = Dense(256, activation='relu')(conv)
31 | policy = Dense(self.action_size, activation='softmax')(fc)
32 | value = Dense(1, activation='linear')(fc)
33 |
34 | actor = Model(inputs=input, outputs=policy)
35 | critic = Model(inputs=input, outputs=value)
36 |
37 | actor.summary()
38 | critic.summary()
39 |
40 | return actor, critic
41 |
42 | def get_action(self, history):
43 | history = np.float32(history / 255.)
44 | policy = self.actor.predict(history)[0]
45 |
46 | action_index = np.argmax(policy)
47 | return action_index
48 |
49 | def load_model(self, name):
50 | self.actor.load_weights(name)
51 |
52 | def pre_processing(next_observe, observe):
53 | processed_observe = np.maximum(next_observe, observe)
54 | processed_observe = np.uint8(
55 | resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255)
56 | return processed_observe
57 |
58 |
59 | if __name__ == "__main__":
60 | env = gym.make(env_name)
61 | agent = TestAgent(action_size=3)
62 | agent.load_model("save_model/breakout_a3c_5_actor.h5")
63 |
64 | step = 0
65 |
66 | while episode < EPISODES:
67 | done = False
68 | dead = False
69 |
70 | score, start_life = 0, 5
71 | observe = env.reset()
72 | next_observe = observe
73 |
74 | for _ in range(random.randint(1, 20)):
75 | observe = next_observe
76 | next_observe, _, _, _ = env.step(1)
77 |
78 | state = pre_processing(next_observe, observe)
79 | history = np.stack((state, state, state, state), axis=2)
80 | history = np.reshape([history], (1, 84, 84, 4))
81 |
82 | while not done:
83 | env.render()
84 | step += 1
85 | observe = next_observe
86 |
87 | action = agent.get_action(history)
88 |
89 | if action == 1:
90 | fake_action = 2
91 | elif action == 2:
92 | fake_action = 3
93 | else:
94 | fake_action = 1
95 |
96 | if dead:
97 | fake_action = 1
98 | dead = False
99 |
100 | next_observe, reward, done, info = env.step(fake_action)
101 |
102 | next_state = pre_processing(next_observe, observe)
103 | next_state = np.reshape([next_state], (1, 84, 84, 1))
104 | next_history = np.append(next_state, history[:, :, :, :3], axis=3)
105 |
106 | if start_life > info['ale.lives']:
107 | dead = True
108 | reward = -1
109 | start_life = info['ale.lives']
110 |
111 | score += reward
112 |
113 | # if agent is dead, then reset the history
114 | if dead:
115 | history = np.stack(
116 | (next_state, next_state, next_state, next_state), axis=2)
117 | history = np.reshape([history], (1, 84, 84, 4))
118 | else:
119 | history = next_history
120 |
121 | # if done, print the score for the episode
122 | if done:
123 | episode += 1
124 | print("episode:", episode, " score:", score, " step:", step)
125 | step = 0
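The play scripts keep a rolling history of the four most recent frames, with the newest frame in channel 0 (next_history = np.append(next_state, history[:, :, :, :3], axis=3)). A tiny NumPy sketch of that rolling update, using small dummy frames instead of 84x84 images:

import numpy as np

frame_shape = (1, 4, 4, 1)   # stand-in for the real (1, 84, 84, 1) frames
history = np.concatenate([np.full(frame_shape, t) for t in (3, 2, 1, 0)], axis=3)
print(history[0, 0, 0, :])   # [3 2 1 0] -> newest frame first, oldest last

next_state = np.full(frame_shape, 4)
# the newest frame goes in front and the oldest (channel 3) is dropped
history = np.append(next_state, history[:, :, :, :3], axis=3)
print(history[0, 0, 0, :])   # [4 3 2 1]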
--------------------------------------------------------------------------------
/3-atari/1-breakout/play_dqn_model.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import random
3 | import numpy as np
4 | import tensorflow as tf
5 | from skimage.color import rgb2gray
6 | from skimage.transform import resize
7 | from keras.models import Sequential
8 | from keras.layers import Dense, Flatten
9 | from keras.layers.convolutional import Conv2D
10 | from keras import backend as K
11 |
12 | EPISODES = 50000
13 |
14 |
15 | class TestAgent:
16 | def __init__(self, action_size):
17 | self.state_size = (84, 84, 4)
18 | self.action_size = action_size
19 | self.no_op_steps = 20
20 |
21 | self.model = self.build_model()
22 |
23 | self.sess = tf.InteractiveSession()
24 | K.set_session(self.sess)
25 |
26 | self.avg_q_max, self.avg_loss = 0, 0
27 | self.sess.run(tf.global_variables_initializer())
28 |
29 | def build_model(self):
30 | model = Sequential()
31 | model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
32 | input_shape=self.state_size))
33 | model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
34 | model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
35 | model.add(Flatten())
36 | model.add(Dense(512, activation='relu'))
37 | model.add(Dense(self.action_size))
38 | model.summary()
39 |
40 | return model
41 |
42 | def get_action(self, history):
43 | if np.random.random() < 0.01:
44 | return random.randrange(3)
45 | history = np.float32(history / 255.0)
46 | q_value = self.model.predict(history)
47 | return np.argmax(q_value[0])
48 |
49 | def load_model(self, filename):
50 | self.model.load_weights(filename)
51 |
52 | def pre_processing(observe):
53 | processed_observe = np.uint8(
54 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255)
55 | return processed_observe
56 |
57 |
58 | if __name__ == "__main__":
59 | env = gym.make('BreakoutDeterministic-v4')
60 | agent = TestAgent(action_size=3)
61 | agent.load_model("./save_model/breakout_dqn_5.h5")
62 |
63 | for e in range(EPISODES):
64 | done = False
65 | dead = False
66 |
67 | step, score, start_life = 0, 0, 5
68 | observe = env.reset()
69 |
70 | for _ in range(random.randint(1, agent.no_op_steps)):
71 | observe, _, _, _ = env.step(1)
72 |
73 | state = pre_processing(observe)
74 | history = np.stack((state, state, state, state), axis=2)
75 | history = np.reshape([history], (1, 84, 84, 4))
76 |
77 | while not done:
78 | env.render()
79 | step += 1
80 |
81 | action = agent.get_action(history)
82 |
83 | if action == 0:
84 | real_action = 1
85 | elif action == 1:
86 | real_action = 2
87 | else:
88 | real_action = 3
89 |
90 | if dead:
91 | real_action = 1
92 | dead = False
93 |
94 | observe, reward, done, info = env.step(real_action)
95 |
96 | next_state = pre_processing(observe)
97 | next_state = np.reshape([next_state], (1, 84, 84, 1))
98 | next_history = np.append(next_state, history[:, :, :, :3], axis=3)
99 |
100 | if start_life > info['ale.lives']:
101 | dead = True
102 | start_life = info['ale.lives']
103 |
104 | score += reward
105 |
106 | history = next_history
107 |
108 | if done:
109 | print("episode:", e, " score:", score)
110 |
111 |
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_dqn.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_dqn_1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_1.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_dqn_2.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_2.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_dqn_3.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_3.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_dqn_4.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_4.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/save_model/breakout_dqn_5.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_5.h5
--------------------------------------------------------------------------------
/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638
--------------------------------------------------------------------------------
/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name
--------------------------------------------------------------------------------
/3-atari/2-pong/README.md:
--------------------------------------------------------------------------------
1 | # Policy Gradient
2 |
3 | A minimal implementation of the stochastic policy gradient (REINFORCE) algorithm in Keras
4 |
5 | ## Pong Agent
6 |
7 | 
8 |
9 |
10 | This PG agent seems to get more frequent wins after about 8000 episodes. Below is the score graph.
11 |
12 |
13 | 
14 |
--------------------------------------------------------------------------------
/3-atari/2-pong/assets/pg.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/assets/pg.gif
--------------------------------------------------------------------------------
/3-atari/2-pong/assets/score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/assets/score.png
--------------------------------------------------------------------------------
/3-atari/2-pong/pong_a3c.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/pong_a3c.py
--------------------------------------------------------------------------------
/3-atari/2-pong/pong_reinforce.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from keras.models import Sequential
4 | from keras.layers import Dense, Reshape, Flatten
5 | from keras.optimizers import Adam
6 | from keras.layers.convolutional import Convolution2D
7 |
8 |
9 | class PGAgent:
10 | def __init__(self, state_size, action_size):
11 | self.state_size = state_size
12 | self.action_size = action_size
13 | self.gamma = 0.99
14 | self.learning_rate = 0.001
15 | self.states = []
16 | self.gradients = []
17 | self.rewards = []
18 | self.probs = []
19 | self.model = self._build_model()
20 | self.model.summary()
21 |
22 | def _build_model(self):
23 | model = Sequential()
24 | model.add(Reshape((1, 80, 80), input_shape=(self.state_size,)))
25 | model.add(Convolution2D(32, 6, 6, subsample=(3, 3), border_mode='same',
26 | activation='relu', init='he_uniform'))
27 | model.add(Flatten())
28 | model.add(Dense(64, activation='relu', init='he_uniform'))
29 | model.add(Dense(32, activation='relu', init='he_uniform'))
30 | model.add(Dense(self.action_size, activation='softmax'))
31 | opt = Adam(lr=self.learning_rate)
32 | # See note regarding crossentropy in cartpole_reinforce.py
33 | model.compile(loss='categorical_crossentropy', optimizer=opt)
34 | return model
35 |
36 | def remember(self, state, action, prob, reward):
37 | y = np.zeros([self.action_size])
38 | y[action] = 1
39 | self.gradients.append(np.array(y).astype('float32') - prob)
40 | self.states.append(state)
41 | self.rewards.append(reward)
42 |
43 | def act(self, state):
44 | state = state.reshape([1, state.shape[0]])
45 | aprob = self.model.predict(state, batch_size=1).flatten()
46 | self.probs.append(aprob)
47 | prob = aprob / np.sum(aprob)
48 | action = np.random.choice(self.action_size, 1, p=prob)[0]
49 | return action, prob
50 |
51 | def discount_rewards(self, rewards):
52 | discounted_rewards = np.zeros_like(rewards)
53 | running_add = 0
54 | for t in reversed(range(0, rewards.size)):
55 | if rewards[t] != 0:
56 | running_add = 0
57 | running_add = running_add * self.gamma + rewards[t]
58 | discounted_rewards[t] = running_add
59 | return discounted_rewards
60 |
61 | def train(self):
62 | gradients = np.vstack(self.gradients)
63 | rewards = np.vstack(self.rewards)
64 | rewards = self.discount_rewards(rewards)
65 | rewards = rewards / np.std(rewards - np.mean(rewards))
66 | gradients *= rewards
67 | X = np.squeeze(np.vstack([self.states]))
68 | Y = self.probs + self.learning_rate * np.squeeze(np.vstack([gradients]))
69 | self.model.train_on_batch(X, Y)
70 | self.states, self.probs, self.gradients, self.rewards = [], [], [], []
71 |
72 | def load(self, name):
73 | self.model.load_weights(name)
74 |
75 | def save(self, name):
76 | self.model.save_weights(name)
77 |
78 | def preprocess(I):
79 | I = I[35:195]
80 | I = I[::2, ::2, 0]
81 | I[I == 144] = 0
82 | I[I == 109] = 0
83 | I[I != 0] = 1
84 | return I.astype(np.float).ravel()
85 |
86 | if __name__ == "__main__":
87 | env = gym.make("Pong-v0")
88 | state = env.reset()
89 | prev_x = None
90 | score = 0
91 | episode = 0
92 |
93 | state_size = 80 * 80
94 | action_size = env.action_space.n
95 | agent = PGAgent(state_size, action_size)
96 | agent.load('./save_model/pong_reinforce.h5')
97 | while True:
98 | env.render()
99 |
100 | cur_x = preprocess(state)
101 | x = cur_x - prev_x if prev_x is not None else np.zeros(state_size)
102 | prev_x = cur_x
103 |
104 | action, prob = agent.act(x)
105 | state, reward, done, info = env.step(action)
106 | score += reward
107 | agent.remember(x, action, prob, reward)
108 |
109 | if done:
110 | episode += 1
111 | agent.train()
112 | print('Episode: %d - Score: %f.' % (episode, score))
113 | score = 0
114 | state = env.reset()
115 | prev_x = None
116 | if episode > 1 and episode % 10 == 0:
117 | agent.save('./save_model/pong_reinforce.h5')
118 |
--------------------------------------------------------------------------------
/3-atari/2-pong/save_model/pong_reinforce.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/save_model/pong_reinforce.h5
--------------------------------------------------------------------------------
/3-atari/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Keon Kim
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/4-gym/1-mountaincar/mountaincar_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import pylab
3 | import random
4 | import numpy as np
5 | from collections import deque
6 | from keras.layers import Dense
7 | from keras.optimizers import Adam
8 | from keras.models import Sequential
9 |
10 | EPISODES = 4000
11 |
12 |
13 | class DQNAgent:
14 | def __init__(self, state_size, action_size):
15 | # Set to "True" to watch the agent while it learns
16 | self.render = True
17 |
18 | # Get the sizes of the state and action and use them to build the model
19 | self.state_size = state_size
20 | self.action_size = action_size
21 |
22 | # Hyperparameters for the MountainCar DQN
23 | # The replay memory is created with a deque
24 | self.discount_factor = 0.99
25 | self.learning_rate = 0.001
26 | self.epsilon = 1.0
27 | self.epsilon_min = 0.005
28 | self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
29 | self.batch_size = 64
30 | self.train_start = 1000
31 | self.memory = deque(maxlen=10000)
32 |
33 | # Create the model to train and the target model
34 | self.model = self.build_model()
35 | self.target_model = self.build_model()
36 | # Copy the training model into the target model --> initializes the target model (the weights must match at the start)
37 | self.update_target_model()
38 |
39 | # Approximate the Q function with a deep neural network
40 | # The model takes the state as input and outputs the Q value of each action
41 | def build_model(self):
42 | model = Sequential()
43 | model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
44 | model.add(Dense(16, activation='relu', kernel_initializer='he_uniform'))
45 | model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
46 | model.summary()
47 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
48 | return model
49 |
50 | # At fixed intervals, update the target model with the model currently being trained
51 | def update_target_model(self):
52 | self.target_model.set_weights(self.model.get_weights())
53 |
54 | # Actions are chosen with an epsilon-greedy policy over the current network
55 | def get_action(self, state):
56 | if np.random.rand() <= self.epsilon:
57 | return random.randrange(self.action_size)
58 | else:
59 | q_value = self.model.predict(state)
60 | return np.argmax(q_value[0])
61 |
62 | # Save the sample <s, a, r, s', done> in the replay memory
63 | def replay_memory(self, state, action, reward, next_state, done):
64 | if action == 2:
65 | action = 1
66 | self.memory.append((state, action, reward, next_state, done))
67 | if self.epsilon > self.epsilon_min:
68 | self.epsilon -= self.epsilon_decay
69 | # print(len(self.memory))
70 |
71 | # Sample a random batch of batch_size transitions from the replay memory and train on it
72 | def train_replay(self):
73 | if len(self.memory) < self.train_start:
74 | return
75 | batch_size = min(self.batch_size, len(self.memory))
76 | mini_batch = random.sample(self.memory, batch_size)
77 |
78 | update_input = np.zeros((batch_size, self.state_size))
79 | update_target = np.zeros((batch_size, self.action_size))
80 |
81 | for i in range(batch_size):
82 | state, action, reward, next_state, done = mini_batch[i]
83 | target = self.model.predict(state)[0]
84 |
85 | # As in Q-learning, take the maximum Q value at s', but from the target model
86 | if done:
87 | target[action] = reward
88 | else:
89 | target[action] = reward + self.discount_factor * \
90 | np.amax(self.target_model.predict(next_state)[0])
91 | update_input[i] = state
92 | update_target[i] = target
93 |
94 | # Build a minibatch of targets (the labels) and current inputs, and update the model on it in a single call
95 | self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
96 |
97 | # Load a saved model
98 | def load_model(self, name):
99 | self.model.load_weights(name)
100 |
101 | # Save the trained model
102 | def save_model(self, name):
103 | self.model.save_weights(name)
104 |
105 |
106 | if __name__ == "__main__":
107 | # Create the MountainCar-v0 environment
108 | env = gym.make('MountainCar-v0')
109 | # Get the sizes of the state and action spaces from the environment
110 | state_size = env.observation_space.shape[0]
111 | #action_size = env.action_space.n
112 | action_size = 2
113 | # Create the DQN agent
114 | agent = DQNAgent(state_size, action_size)
115 | agent.load_model("./save_model/MountainCar_DQN.h5")
116 | scores, episodes = [], []
117 |
118 | for e in range(EPISODES):
119 | done = False
120 | score = 0
121 | state = env.reset()
122 | state = np.reshape(state, [1, state_size])
123 | print(state)
124 |
125 | # MountainCar-v0 actions: 0 (push left), 1 (no push), 2 (push right); fake_action maps the agent's two actions onto 0 and 2 so the "do nothing" action is never taken
126 | fake_action = 0
127 |
128 | # Counter used to repeat the same action for 4 steps
129 | action_count = 0
130 |
131 | while not done:
132 | if agent.render:
133 | env.render()
134 |
135 | # Select an action for the current state (a new action is chosen every 4 steps)
136 | action_count = action_count + 1
137 |
138 | if action_count == 4:
139 | action = agent.get_action(state)
140 | action_count = 0
141 |
142 | if action == 0:
143 | fake_action = 0
144 | elif action == 1:
145 | fake_action = 2
146 |
147 | # Take one step in the environment with the selected action
148 | next_state, reward, done, info = env.step(fake_action)
149 | next_state = np.reshape(next_state, [1, state_size])
150 | # Penalize the action that ended the episode with -100 (disabled below)
151 | #reward = reward if not done else -100
152 |
153 | # Save the sample <s, a, r, s', done> in the replay memory
154 | agent.replay_memory(state, fake_action, reward, next_state, done)
155 | # Train at every timestep
156 | agent.train_replay()
157 | score += reward
158 | state = next_state
159 |
160 | if done:
161 | env.reset()
162 | # Copy the model being trained into the target model after every episode
163 | agent.update_target_model()
164 |
165 | # Record and plot the score for every episode
166 | scores.append(score)
167 | episodes.append(e)
168 | #pylab.plot(episodes, scores, 'b')
169 | #pylab.savefig("./save_graph/MountainCar_DQN.png")
170 | print("episode:", e, " score:", score, " memory length:", len(agent.memory),
171 | " epsilon:", agent.epsilon)
172 |
173 | # Save the model every 50 episodes
174 | if e % 50 == 0:
175 | agent.save_model("./save_model/MountainCar_DQN.h5")
176 |
--------------------------------------------------------------------------------
/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 RLCode
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | --------------------------------------------------------------------------------
4 |
5 | > Minimal and clean examples of reinforcement learning algorithms presented by [RLCode](https://rlcode.github.io) team. [[한국어]](https://github.com/rlcode/reinforcement-learning-kr)
6 | >
7 | > Maintainers - [Woongwon](https://github.com/dnddnjs), [Youngmoo](https://github.com/zzing0907), [Hyeokreal](https://github.com/Hyeokreal), [Uiryeong](https://github.com/wooridle), [Keon](https://github.com/keon)
8 |
9 | From the basics to deep reinforcement learning, this repo provides easy-to-read code examples. One file for each algorithm.
10 | Please feel free to create a [Pull Request](https://github.com/rlcode/reinforcement-learning/pulls), or open an [issue](https://github.com/rlcode/reinforcement-learning/issues)!
11 |
12 | ## Dependencies
13 | 1. Python 3.5
14 | 2. Tensorflow 1.0.0
15 | 3. Keras
16 | 4. numpy
17 | 5. pandas
18 | 6. matplotlib
19 | 7. pillow
20 | 8. scikit-image
21 | 9. h5py
22 |
23 | ### Install Requirements
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 |
28 | ## Table of Contents
29 |
30 | **Grid World** - Mastering the basics of reinforcement learning in the simplified world called "Grid World"
31 |
32 | - [Policy Iteration](./1-grid-world/1-policy-iteration)
33 | - [Value Iteration](./1-grid-world/2-value-iteration)
34 | - [Monte Carlo](./1-grid-world/3-monte-carlo)
35 | - [SARSA](./1-grid-world/4-sarsa)
36 | - [Q-Learning](./1-grid-world/5-q-learning)
37 | - [Deep SARSA](./1-grid-world/6-deep-sarsa)
38 | - [REINFORCE](./1-grid-world/7-reinforce)
39 |
40 | **CartPole** - Applying deep reinforcement learning to the classic CartPole game.
41 |
42 | - [Deep Q Network](./2-cartpole/1-dqn)
43 | - [Double Deep Q Network](./2-cartpole/2-double-dqn)
44 | - [Policy Gradient](./2-cartpole/3-reinforce)
45 | - [Actor Critic (A2C)](./2-cartpole/4-actor-critic)
46 | - [Asynchronous Advantage Actor Critic (A3C)](./2-cartpole/5-a3c)
47 |
48 | **Atari** - Mastering Atari games with Deep Reinforcement Learning
49 |
50 | - **Breakout** - [DQN](./3-atari/1-breakout/breakout_dqn.py), [DDQN](./3-atari/1-breakout/breakout_ddqn.py), [Dueling DDQN](./3-atari/1-breakout/breakout_dueling_ddqn.py), [A3C](./3-atari/1-breakout/breakout_a3c.py)
51 | - **Pong** - [Policy Gradient](./3-atari/2-pong/pong_reinforce.py)
52 |
53 | **OpenAI GYM** - [WIP]
54 |
55 | - Mountain Car - [DQN](./4-gym/1-mountaincar)
56 |
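57 | ## Running an Example
58 |
59 | Each example is a single self-contained script, so the usual way to run one is to `cd` into its folder and run the script with Python (each script saves its models and graphs relative to its own folder). The snippet below is a minimal, equivalent sketch from Python; the CartPole DQN script is used as the example, and any other script in the table above works the same way:
60 |
61 | ```python
62 | # Launch one of the example scripts (equivalent to running it directly).
63 | import os
64 | import runpy
65 |
66 | os.chdir("2-cartpole/1-dqn")   # the scripts save models/graphs relative to their own folder
67 | runpy.run_path("cartpole_dqn.py", run_name="__main__")
68 | ```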
--------------------------------------------------------------------------------
/images/Reinforcement-Learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/images/Reinforcement-Learning.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Keras==2.0.3
2 | numpy==1.12.1
3 | pandas==0.19.2
4 | matplotlib==2.0.0
5 | tensorflow==1.0.0
6 | Pillow==4.1.0
7 | gym==0.8.1
8 | h5py==2.7.0
9 | scikit-image==0.13.0
10 |
--------------------------------------------------------------------------------
/wiki/how-to-windows.md:
--------------------------------------------------------------------------------
1 | # How to run the examples on Windows, step by step
2 | Machine learning examples have traditionally targeted Linux or macOS environments.
3 |
4 | In this wiki you will learn how to configure your **Windows** environment so you can run the examples.
5 | 
6 |
7 | # Go for it.
8 | The setup below is recommended for the best compatibility.
9 |
10 | ## Python Interpreter:
11 | - Download & install [Anaconda](https://www.continuum.io/downloads); pick the Python 3.6, 64-bit installer.
12 | Test the installation from the Windows console:
13 |
14 | ```
15 | python --version
16 | Python 3.6.0 :: Anaconda custom (64-bit)
17 | ```
18 | ## Set virtual env to run examples
19 | ```
20 | # Create the env; you can pick any Python version, but use 3.5 to run this repo
21 | conda create --name rl python=3.5
22 |
23 | # Activate env
24 | activate rl
25 |
26 | # Install TensorFlow, the easy way
27 | conda install -c conda-forge tensorflow
28 | conda install -c anaconda scipy=0.19.0
29 |
30 | mkdir examples
31 | cd examples
32 | git clone https://github.com/rlcode/reinforcement-learning
33 | cd reinforcement-learning
34 |
35 | # Install Requirements
36 | pip install -r requirements.txt
37 |
38 | # Check
39 | conda list
40 |
41 | # Test the code
42 | cd "Code 1. Grid World\1. Policy Iteration"
43 | python run.py
44 | ```
45 |
46 | # Next Steps.
47 | - You will want an IDE to manage the Python scripts easily. Download & install [PyCharm Community](https://www.jetbrains.com/pycharm/download/#section=windows); it's free.
48 |
49 | ## Linking PyCharm with Anaconda Env.
50 | - Open the project in the PyCharm IDE: File > Open > pick the folder (c:\examples\reinforcement-learning)
51 | - File > Settings > Project Interpreter > Add Local
52 | 
53 |
54 | - Note: pick the Python interpreter of the conda environment, i.e. the one located in c:\Anaconda3\envs\rl
55 | 
56 |
57 | - If everything is OK:
58 | 
59 |
60 | - Now run the examples and play with them.
61 |
62 |
63 |
64 |
65 |
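66 | ## Sanity check (optional)
67 |
68 | To double-check that the `rl` environment has everything the examples need, run a short Python snippet inside the activated env (the file name `check_env.py` is just a suggestion; the printed versions should roughly match requirements.txt):
69 |
70 | ```
71 | # check_env.py -- run with "python check_env.py" inside the activated "rl" env
72 | import tensorflow as tf
73 | import keras
74 | import gym
75 | import numpy as np
76 |
77 | print("tensorflow", tf.__version__)  # requirements.txt pins 1.0.0
78 | print("keras", keras.__version__)    # requirements.txt pins 2.0.3
79 | print("gym", gym.__version__)        # requirements.txt pins 0.8.1
80 | print("numpy", np.__version__)       # requirements.txt pins 1.12.1
81 | ```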
--------------------------------------------------------------------------------
/wiki/img/how-to-windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/how-to-windows.png
--------------------------------------------------------------------------------
/wiki/img/link-env-with-pychar-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/link-env-with-pychar-1.png
--------------------------------------------------------------------------------
/wiki/img/link-env-with-pychar-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/link-env-with-pychar-2.png
--------------------------------------------------------------------------------
/wiki/img/link-env-with-pychar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/link-env-with-pychar.png
--------------------------------------------------------------------------------
/wiki/install_guide_osx+ubuntu.md:
--------------------------------------------------------------------------------
1 | ## Development Environment Setup 1: Linux (Ubuntu)
2 |
3 | Linux is the best-known open-source operating system. Because all of its source code is public, there are many distributions, and among them Ubuntu has the largest user base. The Ubuntu Foundation releases a new version every six months; this book uses Ubuntu 14.04, released in the first half of 2014, and the rest of this setup guide assumes Ubuntu 14.04 is already installed.
4 |
5 |
6 |
7 | ### 2.1.1 Checking the Python Version on Ubuntu
8 |
9 | One advantage of Linux is that Python comes preinstalled. Python has 2.X and 3.X versions; this book uses `Python 3.5`. Press `Ctrl+Alt+T` on the desktop to open a terminal,
10 | then type the following command and press Enter to check which Python version is installed.
11 |
12 | ```shell
13 | $ python -V
14 | ```
15 |
16 | Ubuntu 14.04 ships with both `Python 2.7` and `Python 3.5` preinstalled.
17 |
18 |
19 |
20 | ### 2.1.2 Installing and Configuring PyCharm Community
21 |
22 | We are going to build reinforcement learning agents and train them in simulated environments. For that we need an environment for writing and editing
23 | code, called an IDE (Integrated Development Environment). There are many IDEs, but this book uses PyCharm as its Python IDE.
24 |
25 | PyCharm can be installed from its official homepage[[1\]](#_ftn1), where Windows, Linux, and macOS versions are available. PyCharm comes in a paid `PyCharm Professional Edition` and a free `PyCharm Community Edition`. Since we will use `PyCharm Community` to develop the agents, the installation instructions below cover the Community edition.
26 |
27 |
28 |
29 | **Installation proceeds in the following order.**
30 |
31 | 1. Download PyCharm Community from the official PyCharm homepage.
32 |
33 |
34 |
35 | Link: [https://www.jetbrains.com/pycharm/download/#section=linux](https://www.jetbrains.com/pycharm/download/#section=linux)
36 |
37 |
38 |
39 |
40 |
41 | 2. Go to the download directory and extract the archive with the following command.
42 |
43 | ```shell
44 | $tar xfz pycharm-community-2016.3.2.tar.gz
45 | ```
46 |
47 |
48 |
49 |
50 | 3. After extracting, move into the bin folder:
51 |
52 | ```shell
53 | $cd ~/pycharm-community-2016.3.2/bin
54 | ```
55 |
56 |
57 | 4. Launch PyCharm with the following command.
58 |
59 | ```shell
60 | $sh pycharm.sh
61 | ```
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 | 5. Running the command starts the installation.
70 |
71 |
72 |
73 | 6. When the installation finishes, you will see the initial configuration screen shown below.
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 | Under IDE theme, IntelliJ has a white background and Darcula a dark one. This
82 | book uses the IntelliJ theme.
83 |
84 |
85 |
86 | 7. This is the screen after the initial setup is complete. Let's create a project from here.
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 | 8. This screen sets the project path and interpreter. Create a PycharmProjects folder in your home directory and create the project under it. You may name the project anything you like; here we create it as “rlcode_book” and configure the interpreter. The interpreter is the language the project will use; set it to Python 3.5.
95 |
96 |
97 |
98 |
99 |
100 |
101 | 9. Once the rlcode_book project has been created, you will see the screen below.
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 | 10. To check that PyCharm is installed correctly, create a Python script file. To run the simplest possible example, `"Hello World"`, create a file named hello_world.py as shown below.
112 |
113 |
114 |
115 |
116 |
117 | 11. Right-click the newly created file and several options appear. Click “Run 'hello_world'” to run hello_world.py.
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 | 12. Enter the following code in hello_world.py.
126 | ```python
127 | print("hello world")
128 | ```
129 |
130 |
131 |
132 |
133 | 13. Running hello_world.py prints “hello world” in the run window as shown below, confirming that PyCharm is installed correctly.
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 | ### Using Virtualenv (virtual environments)
143 |
144 | That covers the basic PyCharm setup. When you work on several projects on one computer, each project may need a different development environment, and juggling conflicting environments is a real inconvenience. Keeping a separate environment per project is therefore a big advantage, and VirtualEnv provides exactly that: with VirtualEnv you can create a virtual development environment dedicated to this book's project.
145 |
146 | Since PyCharm supports VirtualEnv, we will show how to use VirtualEnv from PyCharm. There are several ways to install and use VirtualEnv, but with PyCharm installed as above you can manage it through a GUI (graphical user interface). PyCharm also lets you manage the external Python libraries installed in the virtual environment.
147 |
148 | **Using VirtualEnv in PyCharm works as follows.**
149 |
150 | 1. In the “File” menu, click “Settings”.
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 | 2. In the list on the left side of Settings, click Project Interpreter under “Project: <project name>”. Then, on the right side of the Project Interpreter tab, click “Create VirtualEnv”.
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 | 3. Enter a name for the virtual environment; a directory such as /home/brian/rlcode_book is created along with the virtual environment.
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 | 4. If the terminal prompt shows (rlcode_book) as below, the virtual environment named rlcode_book has been created. We will use this as the virtual environment for this book.
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 | ### 2.1.3 Installing and Testing OpenAI Gym
183 |
184 | OpenAI is a company founded in late 2015 whose goal is to share AI technology openly with the world, build safer artificial intelligence, and bring AI into more fields. OpenAI Gym is an environment built by OpenAI in which you can test a variety of AI algorithms.
185 |
186 | All of the OpenAI Gym code is available on OpenAI's GitHub[[2\]](#_ftn2).
187 |
188 |
189 |
190 |
191 |
192 | Installing OpenAI Gym is described on its official homepage. To install Gym you first need Git, a version control tool used whenever version management is needed in a development process. OpenAI Gym is open source and published on GitHub, a platform that acts as a remote repository for version-controlled source code.
193 |
194 | Install Git with the following command.
195 |
196 | ```shell
197 | $ sudo apt-get install git
198 | ```
199 |
200 |
201 |
202 | After installing Git, install OpenAI Gym. In a terminal, move to the directory where you want to install Gym and run the following commands.
203 |
204 | ```shell
205 | $ git clone https://github.com/openai/gym
206 | $ cd gym
207 | $ pip3 install -e .
208 | ```
209 |
210 |
211 |
212 | OpenAI Gym can be installed in several configurations; `pip install -e .` installs only the core components. To use all of Gym's environments later, including the Atari games, run the following instead of `pip install -e .`:
213 |
214 | ```shell
215 | $ pip3 install -e .[all]
216 | ```
217 |
218 |
219 |
220 | To verify that OpenAI Gym installed correctly, run a simple example. The simplest Gym example is CartPole: a pole is attached to a cart, and the goal is to move the cart so that its momentum keeps the pole upright. For this test we will not train anything; we only check that Gym runs, with the cart taking random actions.
221 |
222 | Create a file called `CartPole.py` and enter the code in Listing 2.1.
223 |
224 | ```python
225 | import gym
226 | env = gym.make('CartPole-v0')
227 | env.reset()
228 | for _ in range(1000):
229 | env.render()
230 | env.step(env.action_space.sample()) # take a random action
231 | ```
232 |
233 | Listing 2.1 Running the CartPole example
234 |
235 |
236 | Running this code opens a CartPole window in which the cart simply takes random actions without learning anything. OpenAI Gym provides many problems like this, and you can apply your own learning algorithms to them, share your algorithms on the OpenAI Gym site, and compare results.
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 | ## 2.2 Development Environment Setup 2: macOS
245 |
246 | macOS ships with Python 2.7, so you need to install Python 3.5 separately.
247 |
248 | ### 2.2.1 Installing and Configuring Python 3.5
249 |
250 | Open the Python download page[[3\]](#_ftn3) and you will see the following screen.
251 |
252 |
253 |
254 |
255 |
256 | 1. On the screen above, download the file that matches your macOS version. Run the downloaded file and follow the installer to complete the installation.
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 | 2. To confirm that Python installed correctly, open a terminal. If typing the `python3` command produces output like the screen below, the installation succeeded.
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 | ### 2.2.2 Installing and Configuring PyCharm Community
273 |
274 | Install and configure PyCharm in the following order.
275 |
276 | 1. Go to the PyCharm homepage and download the Community edition.
277 |
278 | 2. Run the downloaded file and drag the PyCharm CE icon on the left onto the folder icon on the right, as in the picture below, to finish the installation.
279 |
280 |
281 |
282 | 3. The first time PyCharm runs, a configuration screen appears where the IDE theme lets you change the colors and style of the IDE. The default is the IntelliJ theme we saw in the Ubuntu setup, and this book uses the default.
283 |
284 |
285 | 4. After completing the initial setup, click the Create New Project button.
286 |
287 |
288 |
289 | 5. Clicking the Create New Project button brings up the screen shown below. Location sets the path where the project will be created and the project folder name; you may choose any name and path you like.
290 |
291 | Interpreter chooses which Python interpreter the project will use. As on Ubuntu, we will create a virtual environment with VirtualEnv and use it as the interpreter. Click the Create VirtualEnv button.
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 | 6. The picture below shows the VirtualEnv creation screen. Set Name and Location as you like, and for Base Interpreter select the newly installed Python 3.5 as shown above. Click OK to create the VirtualEnv.
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 | 7. Back on the New Project screen, select the VirtualEnv you just created as the Interpreter, then click Create to finish creating the project.
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 | 8. After the project is created you will see a workspace like the one below. Right-click the top-level folder and then
316 |
317 | click New -> Python File to create a new Python file.
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 | 9. To check that PyCharm is installed correctly, run the hello world example. The steps are the same as on Ubuntu, so they are omitted here.
326 |
327 |
328 |
329 | ### 2.2.3 Installing and Testing OpenAI Gym
330 |
331 | Installing OpenAI Gym and running CartPole is identical to the Ubuntu setup, so it is omitted here.
332 |
333 |
334 |
335 | ------
336 |
337 | [[1\]](#_ftnref1) https://www.jetbrains.com/pycharm/
338 |
339 | [[2\]](#_ftnref2) https://github.com/openai/gym
340 |
341 | [[3\]](#_ftnref3) https://www.python.org/downloads/release/python-350/
342 |
--------------------------------------------------------------------------------
/wiki/rlcode_image/cartpole_exam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/cartpole_exam.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/console_hello_world.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/console_hello_world.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/default_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/default_config.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/file_setting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/file_setting.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/hello_world_ubuntu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/hello_world_ubuntu.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/openai_github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/openai_github.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/project_interpreter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/project_interpreter.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/pycham_new_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycham_new_project.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/pycharm_community.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycharm_community.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/pycharm_drag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycharm_drag.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/pycharm_init.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycharm_init.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/python3_terminal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python3_terminal.jpg
--------------------------------------------------------------------------------
/wiki/rlcode_image/python_download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python_download.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/python_installed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python_installed.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/python_intalled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python_intalled.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/rl_book_hello_world.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_hello_world.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/rl_book_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_project.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/rl_book_venv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_venv.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/rl_book_virtualenv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_virtualenv.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/rlcode_book_directory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rlcode_book_directory.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/rlcode_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rlcode_project.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/run_hello_world.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/run_hello_world.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/sh_pycharm.sh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/sh_pycharm.sh.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/terminal_rlcode_book.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/terminal_rlcode_book.png
--------------------------------------------------------------------------------