├── .gitignore
├── 1-grid-world
├── 1-policy-iteration
│ ├── environment.py
│ └── policy_iteration.py
├── 2-value-iteration
│ ├── environment.py
│ └── value_iteration.py
├── 3-monte-carlo
│ ├── environment.py
│ └── mc_agent.py
├── 4-sarsa
│ ├── .python-version
│ ├── environment.py
│ └── sarsa_agent.py
├── 5-q-learning
│ ├── .python-version
│ ├── environment.py
│ └── q_learning_agent.py
├── 6-deep-sarsa
│ ├── deep_sarsa_agent.py
│ ├── environment.py
│ ├── save_graph
│ │ └── deep_sarsa_trained.png
│ └── save_model
│ │ └── deep_sarsa_trained.h5
├── 7-reinforce
│ ├── environment.py
│ ├── reinforce_agent.py
│ ├── save_graph
│ │ └── reinforce_trained.png
│ └── save_model
│ │ └── reinforce_trained.h5
├── README.md
├── gridworld.png
├── gridworld_changing.png
└── img
│ ├── circle.png
│ ├── down.png
│ ├── left.png
│ ├── rectangle.png
│ ├── right.png
│ ├── triangle.png
│ └── up.png
├── 2-cartpole
├── 1-dqn
│ ├── SumTree.py
│ ├── cartpole_dqn.py
│ ├── cartpole_only_per.py
│ ├── save_graph
│ │ └── Cartpole_DQN.png
│ └── save_model
│ │ └── cartpole_dqn.h5
├── 2-double-dqn
│ ├── cartpole_ddqn.py
│ ├── save_graph
│ │ └── cartpole_ddqn.png
│ └── save_model
│ │ └── cartpole_ddqn.h5
├── 3-reinforce
│ ├── cartpole_reinforce.py
│ ├── save_graph
│ │ └── cartpole_reinforce.png
│ └── save_model
│ │ └── cartpole_reinforce.h5
├── 4-actor-critic
│ ├── cartpole_a2c.py
│ ├── save_graph
│ │ └── cartpole_a2c.png
│ └── save_model
│ │ ├── cartpole_actor.h5
│ │ └── cartpole_critic.h5
├── 5-a3c
│ ├── cartpole_a3c.py
│ └── save_model
│ │ ├── Cartpole_A3C_actor.h5
│ │ └── Cartpole_A3C_critic.h5
├── LICENSE
├── README.md
└── cartpole.png
├── 3-atari
├── 1-breakout
│ ├── breakout_a3c.py
│ ├── breakout_ddqn.py
│ ├── breakout_dqn.py
│ ├── breakout_dueling_ddqn.py
│ ├── play_a3c_model.py
│ ├── play_dqn_model.py
│ ├── save_model
│ │ ├── breakout_a3c_1_actor.h5
│ │ ├── breakout_a3c_1_critic.h5
│ │ ├── breakout_a3c_2_actor.h5
│ │ ├── breakout_a3c_2_critic.h5
│ │ ├── breakout_a3c_3_actor.h5
│ │ ├── breakout_a3c_3_critic.h5
│ │ ├── breakout_a3c_4_actor.h5
│ │ ├── breakout_a3c_4_critic.h5
│ │ ├── breakout_a3c_5_actor.h5
│ │ ├── breakout_a3c_5_critic.h5
│ │ ├── breakout_dqn.h5
│ │ ├── breakout_dqn_1.h5
│ │ ├── breakout_dqn_2.h5
│ │ ├── breakout_dqn_3.h5
│ │ ├── breakout_dqn_4.h5
│ │ └── breakout_dqn_5.h5
│ └── summary
│ │ ├── breakout_a3c
│ │ └── events.out.tfevents.1497264638
│ │ └── breakout_dqn
│ │ └── events.out.tfevents.1496968668.young-System-Product-Name
├── 2-pong
│ ├── README.md
│ ├── assets
│ │ ├── pg.gif
│ │ └── score.png
│ ├── pong_a3c.py
│ ├── pong_reinforce.py
│ └── save_model
│ │ └── pong_reinforce.h5
└── LICENSE
├── 4-gym
└── 1-mountaincar
│ ├── mountaincar_dqn.py
│ └── save_model
│ └── MountainCar_DQN.h5
├── LICENSE
├── README.md
├── images
└── Reinforcement-Learning.png
├── requirements.txt
└── wiki
├── how-to-windows.md
├── img
├── how-to-windows.png
├── link-env-with-pychar-1.png
├── link-env-with-pychar-2.png
└── link-env-with-pychar.png
├── install_guide_osx+ubuntu.md
└── rlcode_image
├── cartpole_exam.png
├── console_hello_world.png
├── default_config.png
├── file_setting.png
├── hello_world_ubuntu.png
├── openai_github.png
├── project_interpreter.png
├── pycham_new_project.png
├── pycharm_community.png
├── pycharm_drag.png
├── pycharm_init.png
├── python3_terminal.jpg
├── python_download.png
├── python_installed.png
├── python_intalled.png
├── rl_book_hello_world.png
├── rl_book_project.png
├── rl_book_venv.png
├── rl_book_virtualenv.png
├── rlcode_book_directory.png
├── rlcode_project.png
├── run_hello_world.png
├── sh_pycharm.sh.png └── terminal_rlcode_book.png /.gitignore: -------------------------------------------------------------------------------- 1 | *.project 2 | *.pydevproject 3 | .idea/ 4 | .DS_Store 5 | __pycache__ 6 | ./Code 2. Cartpole/6. A3C/Cartpole_A3C.pgy -------------------------------------------------------------------------------- /1-grid-world/1-policy-iteration/environment.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import Button 3 | import time 4 | import numpy as np 5 | from PIL import ImageTk, Image 6 | 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # pixels 9 | HEIGHT = 5 # grid height 10 | WIDTH = 5 # grid width 11 | TRANSITION_PROB = 1 12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right 13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates 14 | REWARDS = [] 15 | 16 | 17 | class GraphicDisplay(tk.Tk): 18 | def __init__(self, agent): 19 | super(GraphicDisplay, self).__init__() 20 | self.title('Policy Iteration') 21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) 22 | self.texts = [] 23 | self.arrows = [] 24 | self.env = Env() 25 | self.agent = agent 26 | self.evaluation_count = 0 27 | self.improvement_count = 0 28 | self.is_moving = 0 29 | (self.up, self.down, self.left, self.right), self.shapes = self.load_images() 30 | self.canvas = self._build_canvas() 31 | self.text_reward(2, 2, "R : 1.0") 32 | self.text_reward(1, 2, "R : -1.0") 33 | self.text_reward(2, 1, "R : -1.0") 34 | 35 | def _build_canvas(self): 36 | canvas = tk.Canvas(self, bg='white', 37 | height=HEIGHT * UNIT, 38 | width=WIDTH * UNIT) 39 | # buttons 40 | iteration_button = Button(self, text="Evaluate", 41 | command=self.evaluate_policy) 42 | iteration_button.configure(width=10, activebackground="#33B5E5") 43 | canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10, 44 | window=iteration_button) 45 | policy_button = Button(self, text="Improve", 46 | command=self.improve_policy) 47 | policy_button.configure(width=10, activebackground="#33B5E5") 48 | canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10, 49 | window=policy_button) 50 | policy_button = Button(self, text="move", command=self.move_by_policy) 51 | policy_button.configure(width=10, activebackground="#33B5E5") 52 | canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10, 53 | window=policy_button) 54 | policy_button = Button(self, text="reset", command=self.reset) 55 | policy_button.configure(width=10, activebackground="#33B5E5") 56 | canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10, 57 | window=policy_button) 58 | 59 | # create grids 60 | for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 61 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT 62 | canvas.create_line(x0, y0, x1, y1) 63 | for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 64 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row 65 | canvas.create_line(x0, y0, x1, y1) 66 | 67 | # add img to canvas 68 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 69 | canvas.create_image(250, 150, image=self.shapes[1]) 70 | canvas.create_image(150, 250, image=self.shapes[1]) 71 | canvas.create_image(250, 250, image=self.shapes[2]) 72 | 73 | # pack all 74 | canvas.pack() 75 | 76 | return canvas 77 | 78 | def load_images(self): 79 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) 80 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) 81 | left = 
PhotoImage(Image.open("../img/left.png").resize((13, 13))) 82 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) 83 | rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65))) 84 | triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65))) 85 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) 86 | return (up, down, left, right), (rectangle, triangle, circle) 87 | 88 | def reset(self): 89 | if self.is_moving == 0: 90 | self.evaluation_count = 0 91 | self.improvement_count = 0 92 | for i in self.texts: 93 | self.canvas.delete(i) 94 | 95 | for i in self.arrows: 96 | self.canvas.delete(i) 97 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] 98 | self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH 99 | for _ in range(HEIGHT)]) 100 | self.agent.policy_table[2][2] = [] 101 | x, y = self.canvas.coords(self.rectangle) 102 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 103 | 104 | def text_value(self, row, col, contents, font='Helvetica', size=10, 105 | style='normal', anchor="nw"): 106 | origin_x, origin_y = 85, 70 107 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 108 | font = (font, str(size), style) 109 | text = self.canvas.create_text(x, y, fill="black", text=contents, 110 | font=font, anchor=anchor) 111 | return self.texts.append(text) 112 | 113 | def text_reward(self, row, col, contents, font='Helvetica', size=10, 114 | style='normal', anchor="nw"): 115 | origin_x, origin_y = 5, 5 116 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 117 | font = (font, str(size), style) 118 | text = self.canvas.create_text(x, y, fill="black", text=contents, 119 | font=font, anchor=anchor) 120 | return self.texts.append(text) 121 | 122 | def rectangle_move(self, action): 123 | base_action = np.array([0, 0]) 124 | location = self.find_rectangle() 125 | self.render() 126 | if action == 0 and location[0] > 0: # up 127 | base_action[1] -= UNIT 128 | elif action == 1 and location[0] < HEIGHT - 1: # down 129 | base_action[1] += UNIT 130 | elif action == 2 and location[1] > 0: # left 131 | base_action[0] -= UNIT 132 | elif action == 3 and location[1] < WIDTH - 1: # right 133 | base_action[0] += UNIT 134 | # move agent 135 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 136 | 137 | def find_rectangle(self): 138 | temp = self.canvas.coords(self.rectangle) 139 | x = (temp[0] / 100) - 0.5 140 | y = (temp[1] / 100) - 0.5 141 | return int(y), int(x) 142 | 143 | def move_by_policy(self): 144 | if self.improvement_count != 0 and self.is_moving != 1: 145 | self.is_moving = 1 146 | 147 | x, y = self.canvas.coords(self.rectangle) 148 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 149 | 150 | x, y = self.find_rectangle() 151 | while len(self.agent.policy_table[x][y]) != 0: 152 | self.after(100, 153 | self.rectangle_move(self.agent.get_action([x, y]))) 154 | x, y = self.find_rectangle() 155 | self.is_moving = 0 156 | 157 | def draw_one_arrow(self, col, row, policy): 158 | if col == 2 and row == 2: 159 | return 160 | 161 | if policy[0] > 0: # up 162 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) 163 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 164 | image=self.up)) 165 | if policy[1] > 0: # down 166 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) 167 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 168 | image=self.down)) 169 | if policy[2] > 0: # left 170 | origin_x, origin_y = 10 + (UNIT * row), 50 + 
(UNIT * col) 171 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 172 | image=self.left)) 173 | if policy[3] > 0: # right 174 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) 175 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 176 | image=self.right)) 177 | 178 | def draw_from_policy(self, policy_table): 179 | for i in range(HEIGHT): 180 | for j in range(WIDTH): 181 | self.draw_one_arrow(i, j, policy_table[i][j]) 182 | 183 | def print_value_table(self, value_table): 184 | for i in range(WIDTH): 185 | for j in range(HEIGHT): 186 | self.text_value(i, j, value_table[i][j]) 187 | 188 | def render(self): 189 | time.sleep(0.1) 190 | self.canvas.tag_raise(self.rectangle) 191 | self.update() 192 | 193 | def evaluate_policy(self): 194 | self.evaluation_count += 1 195 | for i in self.texts: 196 | self.canvas.delete(i) 197 | self.agent.policy_evaluation() 198 | self.print_value_table(self.agent.value_table) 199 | 200 | def improve_policy(self): 201 | self.improvement_count += 1 202 | for i in self.arrows: 203 | self.canvas.delete(i) 204 | self.agent.policy_improvement() 205 | self.draw_from_policy(self.agent.policy_table) 206 | 207 | 208 | class Env: 209 | def __init__(self): 210 | self.transition_probability = TRANSITION_PROB 211 | self.width = WIDTH 212 | self.height = HEIGHT 213 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)] 214 | self.possible_actions = POSSIBLE_ACTIONS 215 | self.reward[2][2] = 1 # reward 1 for circle 216 | self.reward[1][2] = -1 # reward -1 for triangle 217 | self.reward[2][1] = -1 # reward -1 for triangle 218 | self.all_state = [] 219 | 220 | for x in range(WIDTH): 221 | for y in range(HEIGHT): 222 | state = [x, y] 223 | self.all_state.append(state) 224 | 225 | def get_reward(self, state, action): 226 | next_state = self.state_after_action(state, action) 227 | return self.reward[next_state[0]][next_state[1]] 228 | 229 | def state_after_action(self, state, action_index): 230 | action = ACTIONS[action_index] 231 | return self.check_boundary([state[0] + action[0], state[1] + action[1]]) 232 | 233 | @staticmethod 234 | def check_boundary(state): 235 | state[0] = (0 if state[0] < 0 else WIDTH - 1 236 | if state[0] > WIDTH - 1 else state[0]) 237 | state[1] = (0 if state[1] < 0 else HEIGHT - 1 238 | if state[1] > HEIGHT - 1 else state[1]) 239 | return state 240 | 241 | def get_transition_prob(self, state, action): 242 | return self.transition_probability 243 | 244 | def get_all_states(self): 245 | return self.all_state 246 | -------------------------------------------------------------------------------- /1-grid-world/1-policy-iteration/policy_iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | from environment import GraphicDisplay, Env 4 | 5 | 6 | class PolicyIteration: 7 | def __init__(self, env): 8 | self.env = env 9 | # 2-d list for the value function 10 | self.value_table = [[0.0] * env.width for _ in range(env.height)] 11 | # list of random policy (same probability of up, down, left, right) 12 | self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width 13 | for _ in range(env.height)] 14 | # setting terminal state 15 | self.policy_table[2][2] = [] 16 | self.discount_factor = 0.9 17 | 18 | def policy_evaluation(self): 19 | next_value_table = [[0.00] * self.env.width 20 | for _ in range(self.env.height)] 21 | 22 | # Bellman Expectation Equation for the every states 23 | for state in self.env.get_all_states(): 24 | value = 0.0 25 | # 
keep the value function of terminal states as 0 26 | if state == [2, 2]: 27 | next_value_table[state[0]][state[1]] = value 28 | continue 29 | 30 | for action in self.env.possible_actions: 31 | next_state = self.env.state_after_action(state, action) 32 | reward = self.env.get_reward(state, action) 33 | next_value = self.get_value(next_state) 34 | value += (self.get_policy(state)[action] * 35 | (reward + self.discount_factor * next_value)) 36 | 37 | next_value_table[state[0]][state[1]] = round(value, 2) 38 | 39 | self.value_table = next_value_table 40 | 41 | def policy_improvement(self): 42 | next_policy = self.policy_table 43 | for state in self.env.get_all_states(): 44 | if state == [2, 2]: 45 | continue 46 | value = -99999 47 | max_index = [] 48 | result = [0.0, 0.0, 0.0, 0.0] # initialize the policy 49 | 50 | # for every actions, calculate 51 | # [reward + (discount factor) * (next state value function)] 52 | for index, action in enumerate(self.env.possible_actions): 53 | next_state = self.env.state_after_action(state, action) 54 | reward = self.env.get_reward(state, action) 55 | next_value = self.get_value(next_state) 56 | temp = reward + self.discount_factor * next_value 57 | 58 | # We normally can't pick multiple actions in greedy policy. 59 | # but here we allow multiple actions with same max values 60 | if temp == value: 61 | max_index.append(index) 62 | elif temp > value: 63 | value = temp 64 | max_index.clear() 65 | max_index.append(index) 66 | 67 | # probability of action 68 | prob = 1 / len(max_index) 69 | 70 | for index in max_index: 71 | result[index] = prob 72 | 73 | next_policy[state[0]][state[1]] = result 74 | 75 | self.policy_table = next_policy 76 | 77 | # get action according to the current policy 78 | def get_action(self, state): 79 | random_pick = random.randrange(100) / 100 80 | 81 | policy = self.get_policy(state) 82 | policy_sum = 0.0 83 | # return the action in the index 84 | for index, value in enumerate(policy): 85 | policy_sum += value 86 | if random_pick < policy_sum: 87 | return index 88 | 89 | # get policy of specific state 90 | def get_policy(self, state): 91 | if state == [2, 2]: 92 | return 0.0 93 | return self.policy_table[state[0]][state[1]] 94 | 95 | def get_value(self, state): 96 | return round(self.value_table[state[0]][state[1]], 2) 97 | 98 | if __name__ == "__main__": 99 | env = Env() 100 | policy_iteration = PolicyIteration(env) 101 | grid_world = GraphicDisplay(policy_iteration) 102 | grid_world.mainloop() 103 | -------------------------------------------------------------------------------- /1-grid-world/2-value-iteration/environment.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | import time 3 | import numpy as np 4 | import random 5 | from PIL import ImageTk, Image 6 | 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # pixels 9 | HEIGHT = 5 # grid height 10 | WIDTH = 5 # grid width 11 | TRANSITION_PROB = 1 12 | POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right 13 | ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates 14 | REWARDS = [] 15 | 16 | 17 | class GraphicDisplay(tk.Tk): 18 | def __init__(self, value_iteration): 19 | super(GraphicDisplay, self).__init__() 20 | self.title('Value Iteration') 21 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) 22 | self.texts = [] 23 | self.arrows = [] 24 | self.env = Env() 25 | self.agent = value_iteration 26 | self.iteration_count = 0 27 | self.improvement_count = 0 28 | self.is_moving = 0 29 | 
(self.up, self.down, self.left, 30 | self.right), self.shapes = self.load_images() 31 | self.canvas = self._build_canvas() 32 | self.text_reward(2, 2, "R : 1.0") 33 | self.text_reward(1, 2, "R : -1.0") 34 | self.text_reward(2, 1, "R : -1.0") 35 | 36 | def _build_canvas(self): 37 | canvas = tk.Canvas(self, bg='white', 38 | height=HEIGHT * UNIT, 39 | width=WIDTH * UNIT) 40 | # buttons 41 | iteration_button = tk.Button(self, text="Calculate", 42 | command=self.calculate_value) 43 | iteration_button.configure(width=10, activebackground="#33B5E5") 44 | canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, 45 | window=iteration_button) 46 | 47 | policy_button = tk.Button(self, text="Print Policy", 48 | command=self.print_optimal_policy) 49 | policy_button.configure(width=10, activebackground="#33B5E5") 50 | canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, 51 | window=policy_button) 52 | 53 | policy_button = tk.Button(self, text="Move", 54 | command=self.move_by_policy) 55 | policy_button.configure(width=10, activebackground="#33B5E5") 56 | canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, 57 | window=policy_button) 58 | 59 | policy_button = tk.Button(self, text="Clear", command=self.clear) 60 | policy_button.configure(width=10, activebackground="#33B5E5") 61 | canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, 62 | window=policy_button) 63 | 64 | # create grids 65 | for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 66 | x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT 67 | canvas.create_line(x0, y0, x1, y1) 68 | for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 69 | x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row 70 | canvas.create_line(x0, y0, x1, y1) 71 | 72 | # add img to canvas 73 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 74 | canvas.create_image(250, 150, image=self.shapes[1]) 75 | canvas.create_image(150, 250, image=self.shapes[1]) 76 | canvas.create_image(250, 250, image=self.shapes[2]) 77 | 78 | # pack all 79 | canvas.pack() 80 | 81 | return canvas 82 | 83 | def load_images(self): 84 | PhotoImage = ImageTk.PhotoImage 85 | up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) 86 | right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) 87 | left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) 88 | down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) 89 | rectangle = PhotoImage( 90 | Image.open("../img/rectangle.png").resize((65, 65))) 91 | triangle = PhotoImage( 92 | Image.open("../img/triangle.png").resize((65, 65))) 93 | circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) 94 | return (up, down, left, right), (rectangle, triangle, circle) 95 | 96 | def clear(self): 97 | 98 | if self.is_moving == 0: 99 | self.iteration_count = 0 100 | self.improvement_count = 0 101 | for i in self.texts: 102 | self.canvas.delete(i) 103 | 104 | for i in self.arrows: 105 | self.canvas.delete(i) 106 | 107 | self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] 108 | 109 | x, y = self.canvas.coords(self.rectangle) 110 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 111 | 112 | def reset(self): 113 | self.update() 114 | time.sleep(0.5) 115 | self.canvas.delete(self.rectangle) 116 | return self.canvas.coords(self.rectangle) 117 | 118 | def text_value(self, row, col, contents, font='Helvetica', size=12, 119 | style='normal', anchor="nw"): 120 | origin_x, origin_y = 85, 70 121 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 122 | font = 
(font, str(size), style) 123 | text = self.canvas.create_text(x, y, fill="black", text=contents, 124 | font=font, anchor=anchor) 125 | return self.texts.append(text) 126 | 127 | def text_reward(self, row, col, contents, font='Helvetica', size=12, 128 | style='normal', anchor="nw"): 129 | origin_x, origin_y = 5, 5 130 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 131 | font = (font, str(size), style) 132 | text = self.canvas.create_text(x, y, fill="black", text=contents, 133 | font=font, anchor=anchor) 134 | return self.texts.append(text) 135 | 136 | def rectangle_move(self, action): 137 | base_action = np.array([0, 0]) 138 | location = self.find_rectangle() 139 | self.render() 140 | if action == 0 and location[0] > 0: # up 141 | base_action[1] -= UNIT 142 | elif action == 1 and location[0] < HEIGHT - 1: # down 143 | base_action[1] += UNIT 144 | elif action == 2 and location[1] > 0: # left 145 | base_action[0] -= UNIT 146 | elif action == 3 and location[1] < WIDTH - 1: # right 147 | base_action[0] += UNIT 148 | 149 | self.canvas.move(self.rectangle, base_action[0], 150 | base_action[1]) # move agent 151 | 152 | def find_rectangle(self): 153 | temp = self.canvas.coords(self.rectangle) 154 | x = (temp[0] / 100) - 0.5 155 | y = (temp[1] / 100) - 0.5 156 | return int(y), int(x) 157 | 158 | def move_by_policy(self): 159 | 160 | if self.improvement_count != 0 and self.is_moving != 1: 161 | self.is_moving = 1 162 | x, y = self.canvas.coords(self.rectangle) 163 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 164 | 165 | x, y = self.find_rectangle() 166 | while len(self.agent.get_action([x, y])) != 0: 167 | action = random.sample(self.agent.get_action([x, y]), 1)[0] 168 | self.after(100, self.rectangle_move(action)) 169 | x, y = self.find_rectangle() 170 | self.is_moving = 0 171 | 172 | def draw_one_arrow(self, col, row, action): 173 | if col == 2 and row == 2: 174 | return 175 | if action == 0: # up 176 | origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) 177 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 178 | image=self.up)) 179 | elif action == 1: # down 180 | origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) 181 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 182 | image=self.down)) 183 | elif action == 3: # right 184 | origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) 185 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 186 | image=self.right)) 187 | elif action == 2: # left 188 | origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) 189 | self.arrows.append(self.canvas.create_image(origin_x, origin_y, 190 | image=self.left)) 191 | 192 | def draw_from_values(self, state, action_list): 193 | i = state[0] 194 | j = state[1] 195 | for action in action_list: 196 | self.draw_one_arrow(i, j, action) 197 | 198 | def print_values(self, values): 199 | for i in range(WIDTH): 200 | for j in range(HEIGHT): 201 | self.text_value(i, j, values[i][j]) 202 | 203 | def render(self): 204 | time.sleep(0.1) 205 | self.canvas.tag_raise(self.rectangle) 206 | self.update() 207 | 208 | def calculate_value(self): 209 | self.iteration_count += 1 210 | for i in self.texts: 211 | self.canvas.delete(i) 212 | self.agent.value_iteration() 213 | self.print_values(self.agent.value_table) 214 | 215 | def print_optimal_policy(self): 216 | self.improvement_count += 1 217 | for i in self.arrows: 218 | self.canvas.delete(i) 219 | for state in self.env.get_all_states(): 220 | action = self.agent.get_action(state) 221 | 
self.draw_from_values(state, action) 222 | 223 | 224 | class Env: 225 | def __init__(self): 226 | self.transition_probability = TRANSITION_PROB 227 | self.width = WIDTH # Width of Grid World 228 | self.height = HEIGHT # Height of GridWorld 229 | self.reward = [[0] * WIDTH for _ in range(HEIGHT)] 230 | self.possible_actions = POSSIBLE_ACTIONS 231 | self.reward[2][2] = 1 # reward 1 for circle 232 | self.reward[1][2] = -1 # reward -1 for triangle 233 | self.reward[2][1] = -1 # reward -1 for triangle 234 | self.all_state = [] 235 | 236 | for x in range(WIDTH): 237 | for y in range(HEIGHT): 238 | state = [x, y] 239 | self.all_state.append(state) 240 | 241 | def get_reward(self, state, action): 242 | next_state = self.state_after_action(state, action) 243 | return self.reward[next_state[0]][next_state[1]] 244 | 245 | def state_after_action(self, state, action_index): 246 | action = ACTIONS[action_index] 247 | return self.check_boundary([state[0] + action[0], state[1] + action[1]]) 248 | 249 | @staticmethod 250 | def check_boundary(state): 251 | state[0] = (0 if state[0] < 0 else WIDTH - 1 252 | if state[0] > WIDTH - 1 else state[0]) 253 | state[1] = (0 if state[1] < 0 else HEIGHT - 1 254 | if state[1] > HEIGHT - 1 else state[1]) 255 | return state 256 | 257 | def get_transition_prob(self, state, action): 258 | return self.transition_probability 259 | 260 | def get_all_states(self): 261 | return self.all_state 262 | -------------------------------------------------------------------------------- /1-grid-world/2-value-iteration/value_iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from environment import GraphicDisplay, Env 3 | 4 | class ValueIteration: 5 | def __init__(self, env): 6 | self.env = env 7 | # 2-d list for the value function 8 | self.value_table = [[0.0] * env.width for _ in range(env.height)] 9 | self.discount_factor = 0.9 10 | 11 | # get next value function table from the current value function table 12 | def value_iteration(self): 13 | next_value_table = [[0.0] * self.env.width 14 | for _ in range(self.env.height)] 15 | for state in self.env.get_all_states(): 16 | if state == [2, 2]: 17 | next_value_table[state[0]][state[1]] = 0.0 18 | continue 19 | value_list = [] 20 | 21 | for action in self.env.possible_actions: 22 | next_state = self.env.state_after_action(state, action) 23 | reward = self.env.get_reward(state, action) 24 | next_value = self.get_value(next_state) 25 | value_list.append((reward + self.discount_factor * next_value)) 26 | # return the maximum value(it is the optimality equation!!) 
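# Note on the next line (annotation in the file's own names, stated roughly):
# this is the Bellman optimality backup,
#     V_new(s) = max over actions a of [ get_reward(s, a)
#                + discount_factor * get_value(state_after_action(s, a)) ]
# value_list holds one candidate value per action, and the maximum is stored
# (rounded to 2 decimals) as the new value of the state.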
27 | next_value_table[state[0]][state[1]] = round(max(value_list), 2) 28 | self.value_table = next_value_table 29 | 30 | # get action according to the current value function table 31 | def get_action(self, state): 32 | action_list = [] 33 | max_value = -99999 34 | 35 | if state == [2, 2]: 36 | return [] 37 | 38 | # calculating q values for the all actions and 39 | # append the action to action list which has maximum q value 40 | for action in self.env.possible_actions: 41 | 42 | next_state = self.env.state_after_action(state, action) 43 | reward = self.env.get_reward(state, action) 44 | next_value = self.get_value(next_state) 45 | value = (reward + self.discount_factor * next_value) 46 | 47 | if value > max_value: 48 | action_list.clear() 49 | action_list.append(action) 50 | max_value = value 51 | elif value == max_value: 52 | action_list.append(action) 53 | 54 | return action_list 55 | 56 | def get_value(self, state): 57 | return round(self.value_table[state[0]][state[1]], 2) 58 | 59 | if __name__ == "__main__": 60 | env = Env() 61 | value_iteration = ValueIteration(env) 62 | grid_world = GraphicDisplay(value_iteration) 63 | grid_world.mainloop() 64 | -------------------------------------------------------------------------------- /1-grid-world/3-monte-carlo/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # pixels 9 | HEIGHT = 5 # grid height 10 | WIDTH = 5 # grid width 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('monte carlo') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # create grids 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # add img to canvas 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | # pack all 43 | canvas.pack() 44 | 45 | return canvas 46 | 47 | def load_images(self): 48 | rectangle = PhotoImage( 49 | Image.open("../img/rectangle.png").resize((65, 65))) 50 | triangle = PhotoImage( 51 | Image.open("../img/triangle.png").resize((65, 65))) 52 | circle = PhotoImage( 53 | Image.open("../img/circle.png").resize((65, 65))) 54 | 55 | return rectangle, triangle, circle 56 | 57 | @staticmethod 58 | def coords_to_state(coords): 59 | x = int((coords[0] - 50) / 100) 60 | y = int((coords[1] - 50) / 100) 61 | return [x, y] 62 | 63 | def reset(self): 64 | self.update() 65 | time.sleep(0.5) 66 | x, y = self.canvas.coords(self.rectangle) 67 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 68 | # return observation 69 | return 
self.coords_to_state(self.canvas.coords(self.rectangle)) 70 | 71 | def step(self, action): 72 | state = self.canvas.coords(self.rectangle) 73 | base_action = np.array([0, 0]) 74 | self.render() 75 | 76 | if action == 0: # up 77 | if state[1] > UNIT: 78 | base_action[1] -= UNIT 79 | elif action == 1: # down 80 | if state[1] < (HEIGHT - 1) * UNIT: 81 | base_action[1] += UNIT 82 | elif action == 2: # left 83 | if state[0] > UNIT: 84 | base_action[0] -= UNIT 85 | elif action == 3: # right 86 | if state[0] < (WIDTH - 1) * UNIT: 87 | base_action[0] += UNIT 88 | # move agent 89 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 90 | # move rectangle to top level of canvas 91 | self.canvas.tag_raise(self.rectangle) 92 | 93 | next_state = self.canvas.coords(self.rectangle) 94 | 95 | # reward function 96 | if next_state == self.canvas.coords(self.circle): 97 | reward = 100 98 | done = True 99 | elif next_state in [self.canvas.coords(self.triangle1), 100 | self.canvas.coords(self.triangle2)]: 101 | reward = -100 102 | done = True 103 | else: 104 | reward = 0 105 | done = False 106 | 107 | next_state = self.coords_to_state(next_state) 108 | 109 | return next_state, reward, done 110 | 111 | def render(self): 112 | time.sleep(0.03) 113 | self.update() 114 | -------------------------------------------------------------------------------- /1-grid-world/3-monte-carlo/mc_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict 4 | from environment import Env 5 | 6 | 7 | # Monte Carlo Agent which learns every episodes from the sample 8 | class MCAgent: 9 | def __init__(self, actions): 10 | self.width = 5 11 | self.height = 5 12 | self.actions = actions 13 | self.learning_rate = 0.01 14 | self.discount_factor = 0.9 15 | self.epsilon = 0.1 16 | self.samples = [] 17 | self.value_table = defaultdict(float) 18 | 19 | # append sample to memory(state, reward, done) 20 | def save_sample(self, state, reward, done): 21 | self.samples.append([state, reward, done]) 22 | 23 | # for every episode, agent updates q function of visited states 24 | def update(self): 25 | G_t = 0 26 | visit_state = [] 27 | for reward in reversed(self.samples): 28 | state = str(reward[0]) 29 | if state not in visit_state: 30 | visit_state.append(state) 31 | G_t = self.discount_factor * (reward[1] + G_t) 32 | value = self.value_table[state] 33 | self.value_table[state] = (value + 34 | self.learning_rate * (G_t - value)) 35 | 36 | # get action for the state according to the q function table 37 | # agent pick action of epsilon-greedy policy 38 | def get_action(self, state): 39 | if np.random.rand() < self.epsilon: 40 | # take random action 41 | action = np.random.choice(self.actions) 42 | else: 43 | # take action according to the q function table 44 | next_state = self.possible_next_state(state) 45 | action = self.arg_max(next_state) 46 | return int(action) 47 | 48 | # compute arg_max if multiple candidates exit, pick one randomly 49 | @staticmethod 50 | def arg_max(next_state): 51 | max_index_list = [] 52 | max_value = next_state[0] 53 | for index, value in enumerate(next_state): 54 | if value > max_value: 55 | max_index_list.clear() 56 | max_value = value 57 | max_index_list.append(index) 58 | elif value == max_value: 59 | max_index_list.append(index) 60 | return random.choice(max_index_list) 61 | 62 | # get the possible next states 63 | def possible_next_state(self, state): 64 | col, row = state 65 | next_state = 
[0.0] * 4 66 | 67 | if row != 0: 68 | next_state[0] = self.value_table[str([col, row - 1])] 69 | else: 70 | next_state[0] = self.value_table[str(state)] 71 | if row != self.height - 1: 72 | next_state[1] = self.value_table[str([col, row + 1])] 73 | else: 74 | next_state[1] = self.value_table[str(state)] 75 | if col != 0: 76 | next_state[2] = self.value_table[str([col - 1, row])] 77 | else: 78 | next_state[2] = self.value_table[str(state)] 79 | if col != self.width - 1: 80 | next_state[3] = self.value_table[str([col + 1, row])] 81 | else: 82 | next_state[3] = self.value_table[str(state)] 83 | 84 | return next_state 85 | 86 | 87 | # main loop 88 | if __name__ == "__main__": 89 | env = Env() 90 | agent = MCAgent(actions=list(range(env.n_actions))) 91 | 92 | for episode in range(1000): 93 | state = env.reset() 94 | action = agent.get_action(state) 95 | 96 | while True: 97 | env.render() 98 | 99 | # forward to next state. reward is number and done is boolean 100 | next_state, reward, done = env.step(action) 101 | agent.save_sample(next_state, reward, done) 102 | 103 | # get next action 104 | action = agent.get_action(next_state) 105 | 106 | # at the end of each episode, update the q function table 107 | if done: 108 | print("episode : ", episode) 109 | agent.update() 110 | agent.samples.clear() 111 | break 112 | -------------------------------------------------------------------------------- /1-grid-world/4-sarsa/.python-version: -------------------------------------------------------------------------------- 1 | 3.5.0 2 | -------------------------------------------------------------------------------- /1-grid-world/4-sarsa/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # pixels 9 | HEIGHT = 5 # grid height 10 | WIDTH = 5 # grid width 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('SARSA') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # create grids 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # add img to canvas 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | # pack all 43 | canvas.pack() 44 | 45 | return canvas 46 | 47 | def load_images(self): 48 | rectangle = PhotoImage( 49 | Image.open("../img/rectangle.png").resize((65, 65))) 50 | triangle = PhotoImage( 51 | Image.open("../img/triangle.png").resize((65, 65))) 52 | circle = PhotoImage( 53 | Image.open("../img/circle.png").resize((65, 65))) 54 | 55 | return rectangle, triangle, circle 56 | 57 | def text_value(self, row, col, 
contents, action, font='Helvetica', size=10, 58 | style='normal', anchor="nw"): 59 | if action == 0: 60 | origin_x, origin_y = 7, 42 61 | elif action == 1: 62 | origin_x, origin_y = 85, 42 63 | elif action == 2: 64 | origin_x, origin_y = 42, 5 65 | else: 66 | origin_x, origin_y = 42, 77 67 | 68 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 69 | font = (font, str(size), style) 70 | text = self.canvas.create_text(x, y, fill="black", text=contents, 71 | font=font, anchor=anchor) 72 | return self.texts.append(text) 73 | 74 | def print_value_all(self, q_table): 75 | for i in self.texts: 76 | self.canvas.delete(i) 77 | self.texts.clear() 78 | for x in range(HEIGHT): 79 | for y in range(WIDTH): 80 | for action in range(0, 4): 81 | state = [x, y] 82 | if str(state) in q_table.keys(): 83 | temp = q_table[str(state)][action] 84 | self.text_value(y, x, round(temp, 2), action) 85 | 86 | def coords_to_state(self, coords): 87 | x = int((coords[0] - 50) / 100) 88 | y = int((coords[1] - 50) / 100) 89 | return [x, y] 90 | 91 | def reset(self): 92 | self.update() 93 | time.sleep(0.5) 94 | x, y = self.canvas.coords(self.rectangle) 95 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 96 | self.render() 97 | # return observation 98 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 99 | 100 | def step(self, action): 101 | state = self.canvas.coords(self.rectangle) 102 | base_action = np.array([0, 0]) 103 | self.render() 104 | 105 | if action == 0: # up 106 | if state[1] > UNIT: 107 | base_action[1] -= UNIT 108 | elif action == 1: # down 109 | if state[1] < (HEIGHT - 1) * UNIT: 110 | base_action[1] += UNIT 111 | elif action == 2: # left 112 | if state[0] > UNIT: 113 | base_action[0] -= UNIT 114 | elif action == 3: # right 115 | if state[0] < (WIDTH - 1) * UNIT: 116 | base_action[0] += UNIT 117 | 118 | # move agent 119 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 120 | # move rectangle to top level of canvas 121 | self.canvas.tag_raise(self.rectangle) 122 | next_state = self.canvas.coords(self.rectangle) 123 | 124 | # reward function 125 | if next_state == self.canvas.coords(self.circle): 126 | reward = 100 127 | done = True 128 | elif next_state in [self.canvas.coords(self.triangle1), 129 | self.canvas.coords(self.triangle2)]: 130 | reward = -100 131 | done = True 132 | else: 133 | reward = 0 134 | done = False 135 | 136 | next_state = self.coords_to_state(next_state) 137 | 138 | return next_state, reward, done 139 | 140 | def render(self): 141 | time.sleep(0.03) 142 | self.update() 143 | -------------------------------------------------------------------------------- /1-grid-world/4-sarsa/sarsa_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from collections import defaultdict 4 | from environment import Env 5 | 6 | 7 | # SARSA agent learns every time step from the sample 8 | class SARSAgent: 9 | def __init__(self, actions): 10 | self.actions = actions 11 | self.learning_rate = 0.01 12 | self.discount_factor = 0.9 13 | self.epsilon = 0.1 14 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 15 | 16 | # with sample , learns new q function 17 | def learn(self, state, action, reward, next_state, next_action): 18 | current_q = self.q_table[state][action] 19 | next_state_q = self.q_table[next_state][next_action] 20 | new_q = (current_q + self.learning_rate * 21 | (reward + self.discount_factor * next_state_q - current_q)) 22 | self.q_table[state][action] = 
new_q 23 | 24 | # get action for the state according to the q function table 25 | # agent pick action of epsilon-greedy policy 26 | def get_action(self, state): 27 | if np.random.rand() < self.epsilon: 28 | # take random action 29 | action = np.random.choice(self.actions) 30 | else: 31 | # take action according to the q function table 32 | state_action = self.q_table[state] 33 | action = self.arg_max(state_action) 34 | return action 35 | 36 | @staticmethod 37 | def arg_max(state_action): 38 | max_index_list = [] 39 | max_value = state_action[0] 40 | for index, value in enumerate(state_action): 41 | if value > max_value: 42 | max_index_list.clear() 43 | max_value = value 44 | max_index_list.append(index) 45 | elif value == max_value: 46 | max_index_list.append(index) 47 | return random.choice(max_index_list) 48 | 49 | if __name__ == "__main__": 50 | env = Env() 51 | agent = SARSAgent(actions=list(range(env.n_actions))) 52 | 53 | for episode in range(1000): 54 | # reset environment and initialize state 55 | 56 | state = env.reset() 57 | # get action of state from agent 58 | action = agent.get_action(str(state)) 59 | 60 | while True: 61 | env.render() 62 | 63 | # take action and proceed one step in the environment 64 | next_state, reward, done = env.step(action) 65 | next_action = agent.get_action(str(next_state)) 66 | 67 | # with sample , agent learns new q function 68 | agent.learn(str(state), action, reward, str(next_state), next_action) 69 | 70 | state = next_state 71 | action = next_action 72 | 73 | # print q function of all states at screen 74 | env.print_value_all(agent.q_table) 75 | 76 | # if episode ends, then break 77 | if done: 78 | break 79 | 80 | -------------------------------------------------------------------------------- /1-grid-world/5-q-learning/.python-version: -------------------------------------------------------------------------------- 1 | 3.5.0 2 | -------------------------------------------------------------------------------- /1-grid-world/5-q-learning/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | np.random.seed(1) 7 | PhotoImage = ImageTk.PhotoImage 8 | UNIT = 100 # pixels 9 | HEIGHT = 5 # grid height 10 | WIDTH = 5 # grid width 11 | 12 | 13 | class Env(tk.Tk): 14 | def __init__(self): 15 | super(Env, self).__init__() 16 | self.action_space = ['u', 'd', 'l', 'r'] 17 | self.n_actions = len(self.action_space) 18 | self.title('Q Learning') 19 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 20 | self.shapes = self.load_images() 21 | self.canvas = self._build_canvas() 22 | self.texts = [] 23 | 24 | def _build_canvas(self): 25 | canvas = tk.Canvas(self, bg='white', 26 | height=HEIGHT * UNIT, 27 | width=WIDTH * UNIT) 28 | # create grids 29 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 30 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 31 | canvas.create_line(x0, y0, x1, y1) 32 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 33 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 34 | canvas.create_line(x0, y0, x1, y1) 35 | 36 | # add img to canvas 37 | self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) 38 | self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) 39 | self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) 40 | self.circle = canvas.create_image(250, 250, image=self.shapes[2]) 41 | 42 | # pack all 43 | canvas.pack() 44 | 45 | return canvas 46 
| 47 | def load_images(self): 48 | rectangle = PhotoImage( 49 | Image.open("../img/rectangle.png").resize((65, 65))) 50 | triangle = PhotoImage( 51 | Image.open("../img/triangle.png").resize((65, 65))) 52 | circle = PhotoImage( 53 | Image.open("../img/circle.png").resize((65, 65))) 54 | 55 | return rectangle, triangle, circle 56 | 57 | def text_value(self, row, col, contents, action, font='Helvetica', size=10, 58 | style='normal', anchor="nw"): 59 | 60 | if action == 0: 61 | origin_x, origin_y = 7, 42 62 | elif action == 1: 63 | origin_x, origin_y = 85, 42 64 | elif action == 2: 65 | origin_x, origin_y = 42, 5 66 | else: 67 | origin_x, origin_y = 42, 77 68 | 69 | x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) 70 | font = (font, str(size), style) 71 | text = self.canvas.create_text(x, y, fill="black", text=contents, 72 | font=font, anchor=anchor) 73 | return self.texts.append(text) 74 | 75 | def print_value_all(self, q_table): 76 | for i in self.texts: 77 | self.canvas.delete(i) 78 | self.texts.clear() 79 | for i in range(HEIGHT): 80 | for j in range(WIDTH): 81 | for action in range(0, 4): 82 | state = [i, j] 83 | if str(state) in q_table.keys(): 84 | temp = q_table[str(state)][action] 85 | self.text_value(j, i, round(temp, 2), action) 86 | 87 | def coords_to_state(self, coords): 88 | x = int((coords[0] - 50) / 100) 89 | y = int((coords[1] - 50) / 100) 90 | return [x, y] 91 | 92 | def state_to_coords(self, state): 93 | x = int(state[0] * 100 + 50) 94 | y = int(state[1] * 100 + 50) 95 | return [x, y] 96 | 97 | def reset(self): 98 | self.update() 99 | time.sleep(0.5) 100 | x, y = self.canvas.coords(self.rectangle) 101 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 102 | self.render() 103 | # return observation 104 | return self.coords_to_state(self.canvas.coords(self.rectangle)) 105 | 106 | 107 | def step(self, action): 108 | state = self.canvas.coords(self.rectangle) 109 | base_action = np.array([0, 0]) 110 | self.render() 111 | 112 | if action == 0: # up 113 | if state[1] > UNIT: 114 | base_action[1] -= UNIT 115 | elif action == 1: # down 116 | if state[1] < (HEIGHT - 1) * UNIT: 117 | base_action[1] += UNIT 118 | elif action == 2: # left 119 | if state[0] > UNIT: 120 | base_action[0] -= UNIT 121 | elif action == 3: # right 122 | if state[0] < (WIDTH - 1) * UNIT: 123 | base_action[0] += UNIT 124 | 125 | # move agent 126 | self.canvas.move(self.rectangle, base_action[0], base_action[1]) 127 | # move rectangle to top level of canvas 128 | self.canvas.tag_raise(self.rectangle) 129 | next_state = self.canvas.coords(self.rectangle) 130 | 131 | # reward function 132 | if next_state == self.canvas.coords(self.circle): 133 | reward = 100 134 | done = True 135 | elif next_state in [self.canvas.coords(self.triangle1), 136 | self.canvas.coords(self.triangle2)]: 137 | reward = -100 138 | done = True 139 | else: 140 | reward = 0 141 | done = False 142 | 143 | next_state = self.coords_to_state(next_state) 144 | return next_state, reward, done 145 | 146 | def render(self): 147 | time.sleep(0.03) 148 | self.update() 149 | -------------------------------------------------------------------------------- /1-grid-world/5-q-learning/q_learning_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | from environment import Env 4 | from collections import defaultdict 5 | 6 | class QLearningAgent: 7 | def __init__(self, actions): 8 | # actions = [0, 1, 2, 3] 9 | self.actions = actions 10 | self.learning_rate = 
0.01 11 | self.discount_factor = 0.9 12 | self.epsilon = 0.1 13 | self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) 14 | 15 | # update q function with sample 16 | def learn(self, state, action, reward, next_state): 17 | current_q = self.q_table[state][action] 18 | # using Bellman Optimality Equation to update q function 19 | new_q = reward + self.discount_factor * max(self.q_table[next_state]) 20 | self.q_table[state][action] += self.learning_rate * (new_q - current_q) 21 | 22 | # get action for the state according to the q function table 23 | # agent pick action of epsilon-greedy policy 24 | def get_action(self, state): 25 | if np.random.rand() < self.epsilon: 26 | # take random action 27 | action = np.random.choice(self.actions) 28 | else: 29 | # take action according to the q function table 30 | state_action = self.q_table[state] 31 | action = self.arg_max(state_action) 32 | return action 33 | 34 | @staticmethod 35 | def arg_max(state_action): 36 | max_index_list = [] 37 | max_value = state_action[0] 38 | for index, value in enumerate(state_action): 39 | if value > max_value: 40 | max_index_list.clear() 41 | max_value = value 42 | max_index_list.append(index) 43 | elif value == max_value: 44 | max_index_list.append(index) 45 | return random.choice(max_index_list) 46 | 47 | if __name__ == "__main__": 48 | env = Env() 49 | agent = QLearningAgent(actions=list(range(env.n_actions))) 50 | 51 | for episode in range(1000): 52 | state = env.reset() 53 | 54 | while True: 55 | env.render() 56 | 57 | # take action and proceed one step in the environment 58 | action = agent.get_action(str(state)) 59 | next_state, reward, done = env.step(action) 60 | 61 | # with sample , agent learns new q function 62 | agent.learn(str(state), action, reward, str(next_state)) 63 | 64 | state = next_state 65 | env.print_value_all(agent.q_table) 66 | 67 | # if episode ends, then break 68 | if done: 69 | break 70 | -------------------------------------------------------------------------------- /1-grid-world/6-deep-sarsa/deep_sarsa_agent.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pylab 3 | import random 4 | import numpy as np 5 | from environment import Env 6 | from keras.layers import Dense 7 | from keras.optimizers import Adam 8 | from keras.models import Sequential 9 | 10 | EPISODES = 1000 11 | 12 | 13 | # this is DeepSARSA Agent for the GridWorld 14 | # Utilize Neural Network as q function approximator 15 | class DeepSARSAgent: 16 | def __init__(self): 17 | self.load_model = False 18 | # actions which agent can do 19 | self.action_space = [0, 1, 2, 3, 4] 20 | # get size of state and action 21 | self.action_size = len(self.action_space) 22 | self.state_size = 15 23 | self.discount_factor = 0.99 24 | self.learning_rate = 0.001 25 | 26 | self.epsilon = 1. 
# exploration 27 | self.epsilon_decay = .9999 28 | self.epsilon_min = 0.01 29 | self.model = self.build_model() 30 | 31 | if self.load_model: 32 | self.epsilon = 0.05 33 | self.model.load_weights('./save_model/deep_sarsa_trained.h5') 34 | 35 | # approximate Q function using Neural Network 36 | # state is input and Q Value of each action is output of network 37 | def build_model(self): 38 | model = Sequential() 39 | model.add(Dense(30, input_dim=self.state_size, activation='relu')) 40 | model.add(Dense(30, activation='relu')) 41 | model.add(Dense(self.action_size, activation='linear')) 42 | model.summary() 43 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 44 | return model 45 | 46 | # get action from model using epsilon-greedy policy 47 | def get_action(self, state): 48 | if np.random.rand() <= self.epsilon: 49 | # The agent acts randomly 50 | return random.randrange(self.action_size) 51 | else: 52 | # Predict the reward value based on the given state 53 | state = np.float32(state) 54 | q_values = self.model.predict(state) 55 | return np.argmax(q_values[0]) 56 | 57 | def train_model(self, state, action, reward, next_state, next_action, done): 58 | if self.epsilon > self.epsilon_min: 59 | self.epsilon *= self.epsilon_decay 60 | 61 | state = np.float32(state) 62 | next_state = np.float32(next_state) 63 | target = self.model.predict(state)[0] 64 | # like Q Learning, get maximum Q value at s' 65 | # But from target model 66 | if done: 67 | target[action] = reward 68 | else: 69 | target[action] = (reward + self.discount_factor * 70 | self.model.predict(next_state)[0][next_action]) 71 | 72 | target = np.reshape(target, [1, 5]) 73 | # make minibatch which includes target q value and predicted q value 74 | # and do the model fit! 75 | self.model.fit(state, target, epochs=1, verbose=0) 76 | 77 | 78 | if __name__ == "__main__": 79 | env = Env() 80 | agent = DeepSARSAgent() 81 | 82 | global_step = 0 83 | scores, episodes = [], [] 84 | 85 | for e in range(EPISODES): 86 | done = False 87 | score = 0 88 | state = env.reset() 89 | state = np.reshape(state, [1, 15]) 90 | 91 | while not done: 92 | # fresh env 93 | global_step += 1 94 | 95 | # get action for the current state and go one step in environment 96 | action = agent.get_action(state) 97 | next_state, reward, done = env.step(action) 98 | next_state = np.reshape(next_state, [1, 15]) 99 | next_action = agent.get_action(next_state) 100 | agent.train_model(state, action, reward, next_state, next_action, 101 | done) 102 | state = next_state 103 | # every time step we do training 104 | score += reward 105 | 106 | state = copy.deepcopy(next_state) 107 | 108 | if done: 109 | scores.append(score) 110 | episodes.append(e) 111 | pylab.plot(episodes, scores, 'b') 112 | pylab.savefig("./save_graph/deep_sarsa_.png") 113 | print("episode:", e, " score:", score, "global_step", 114 | global_step, " epsilon:", agent.epsilon) 115 | 116 | if e % 100 == 0: 117 | agent.model.save_weights("./save_model/deep_sarsa.h5") 118 | -------------------------------------------------------------------------------- /1-grid-world/6-deep-sarsa/environment.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | PhotoImage = ImageTk.PhotoImage 7 | UNIT = 50 # pixels 8 | HEIGHT = 5 # grid height 9 | WIDTH = 5 # grid width 10 | 11 | np.random.seed(1) 12 | 13 | 14 | class Env(tk.Tk): 15 | def __init__(self): 16 | super(Env, 
self).__init__() 17 | self.action_space = ['u', 'd', 'l', 'r'] 18 | self.action_size = len(self.action_space) 19 | self.title('DeepSARSA') 20 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 21 | self.shapes = self.load_images() 22 | self.canvas = self._build_canvas() 23 | self.counter = 0 24 | self.rewards = [] 25 | self.goal = [] 26 | # obstacle 27 | self.set_reward([0, 1], -1) 28 | self.set_reward([1, 2], -1) 29 | self.set_reward([2, 3], -1) 30 | # #goal 31 | self.set_reward([4, 4], 1) 32 | 33 | def _build_canvas(self): 34 | canvas = tk.Canvas(self, bg='white', 35 | height=HEIGHT * UNIT, 36 | width=WIDTH * UNIT) 37 | # create grids 38 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 40 | canvas.create_line(x0, y0, x1, y1) 41 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 42 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 43 | canvas.create_line(x0, y0, x1, y1) 44 | 45 | self.rewards = [] 46 | self.goal = [] 47 | # add image to canvas 48 | x, y = UNIT/2, UNIT/2 49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) 50 | 51 | # pack all` 52 | canvas.pack() 53 | 54 | return canvas 55 | 56 | def load_images(self): 57 | rectangle = PhotoImage( 58 | Image.open("../img/rectangle.png").resize((30, 30))) 59 | triangle = PhotoImage( 60 | Image.open("../img/triangle.png").resize((30, 30))) 61 | circle = PhotoImage( 62 | Image.open("../img/circle.png").resize((30, 30))) 63 | 64 | return rectangle, triangle, circle 65 | 66 | def reset_reward(self): 67 | 68 | for reward in self.rewards: 69 | self.canvas.delete(reward['figure']) 70 | 71 | self.rewards.clear() 72 | self.goal.clear() 73 | self.set_reward([0, 1], -1) 74 | self.set_reward([1, 2], -1) 75 | self.set_reward([2, 3], -1) 76 | 77 | # #goal 78 | self.set_reward([4, 4], 1) 79 | 80 | def set_reward(self, state, reward): 81 | state = [int(state[0]), int(state[1])] 82 | x = int(state[0]) 83 | y = int(state[1]) 84 | temp = {} 85 | if reward > 0: 86 | temp['reward'] = reward 87 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 88 | (UNIT * y) + UNIT / 2, 89 | image=self.shapes[2]) 90 | 91 | self.goal.append(temp['figure']) 92 | 93 | 94 | elif reward < 0: 95 | temp['direction'] = -1 96 | temp['reward'] = reward 97 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 98 | (UNIT * y) + UNIT / 2, 99 | image=self.shapes[1]) 100 | 101 | temp['coords'] = self.canvas.coords(temp['figure']) 102 | temp['state'] = state 103 | self.rewards.append(temp) 104 | 105 | # new methods 106 | 107 | def check_if_reward(self, state): 108 | check_list = dict() 109 | check_list['if_goal'] = False 110 | rewards = 0 111 | 112 | for reward in self.rewards: 113 | if reward['state'] == state: 114 | rewards += reward['reward'] 115 | if reward['reward'] == 1: 116 | check_list['if_goal'] = True 117 | 118 | check_list['rewards'] = rewards 119 | 120 | return check_list 121 | 122 | def coords_to_state(self, coords): 123 | x = int((coords[0] - UNIT / 2) / UNIT) 124 | y = int((coords[1] - UNIT / 2) / UNIT) 125 | return [x, y] 126 | 127 | def reset(self): 128 | self.update() 129 | time.sleep(0.5) 130 | x, y = self.canvas.coords(self.rectangle) 131 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 132 | # return observation 133 | self.reset_reward() 134 | return self.get_state() 135 | 136 | def step(self, action): 137 | self.counter += 1 138 | self.render() 139 | 140 | if self.counter % 2 == 1: 141 | self.rewards = self.move_rewards() 142 | 143 | next_coords = 
self.move(self.rectangle, action) 144 | check = self.check_if_reward(self.coords_to_state(next_coords)) 145 | done = check['if_goal'] 146 | reward = check['rewards'] 147 | 148 | self.canvas.tag_raise(self.rectangle) 149 | 150 | s_ = self.get_state() 151 | 152 | return s_, reward, done 153 | 154 | def get_state(self): 155 | 156 | location = self.coords_to_state(self.canvas.coords(self.rectangle)) 157 | agent_x = location[0] 158 | agent_y = location[1] 159 | 160 | states = list() 161 | 162 | # locations.append(agent_x) 163 | # locations.append(agent_y) 164 | 165 | for reward in self.rewards: 166 | reward_location = reward['state'] 167 | states.append(reward_location[0] - agent_x) 168 | states.append(reward_location[1] - agent_y) 169 | if reward['reward'] < 0: 170 | states.append(-1) 171 | states.append(reward['direction']) 172 | else: 173 | states.append(1) 174 | 175 | return states 176 | 177 | def move_rewards(self): 178 | new_rewards = [] 179 | for temp in self.rewards: 180 | if temp['reward'] == 1: 181 | new_rewards.append(temp) 182 | continue 183 | temp['coords'] = self.move_const(temp) 184 | temp['state'] = self.coords_to_state(temp['coords']) 185 | new_rewards.append(temp) 186 | return new_rewards 187 | 188 | def move_const(self, target): 189 | 190 | s = self.canvas.coords(target['figure']) 191 | 192 | base_action = np.array([0, 0]) 193 | 194 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: 195 | target['direction'] = 1 196 | elif s[0] == UNIT / 2: 197 | target['direction'] = -1 198 | 199 | if target['direction'] == -1: 200 | base_action[0] += UNIT 201 | elif target['direction'] == 1: 202 | base_action[0] -= UNIT 203 | 204 | if (target['figure'] is not self.rectangle 205 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): 206 | base_action = np.array([0, 0]) 207 | 208 | self.canvas.move(target['figure'], base_action[0], base_action[1]) 209 | 210 | s_ = self.canvas.coords(target['figure']) 211 | 212 | return s_ 213 | 214 | def move(self, target, action): 215 | s = self.canvas.coords(target) 216 | 217 | base_action = np.array([0, 0]) 218 | 219 | if action == 0: # up 220 | if s[1] > UNIT: 221 | base_action[1] -= UNIT 222 | elif action == 1: # down 223 | if s[1] < (HEIGHT - 1) * UNIT: 224 | base_action[1] += UNIT 225 | elif action == 2: # right 226 | if s[0] < (WIDTH - 1) * UNIT: 227 | base_action[0] += UNIT 228 | elif action == 3: # left 229 | if s[0] > UNIT: 230 | base_action[0] -= UNIT 231 | 232 | self.canvas.move(target, base_action[0], base_action[1]) 233 | 234 | s_ = self.canvas.coords(target) 235 | 236 | return s_ 237 | 238 | def render(self): 239 | time.sleep(0.07) 240 | self.update() 241 | -------------------------------------------------------------------------------- /1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png -------------------------------------------------------------------------------- /1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 -------------------------------------------------------------------------------- /1-grid-world/7-reinforce/environment.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tkinter as tk 4 | from PIL import ImageTk, Image 5 | 6 | PhotoImage = ImageTk.PhotoImage 7 | UNIT = 50 # pixels 8 | HEIGHT = 5 # grid height 9 | WIDTH = 5 # grid width 10 | 11 | np.random.seed(1) 12 | 13 | 14 | class Env(tk.Tk): 15 | def __init__(self): 16 | super(Env, self).__init__() 17 | self.action_space = ['u', 'd', 'l', 'r'] 18 | self.action_size = len(self.action_space) 19 | self.title('Reinforce') 20 | self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) 21 | self.shapes = self.load_images() 22 | self.canvas = self._build_canvas() 23 | self.counter = 0 24 | self.rewards = [] 25 | self.goal = [] 26 | # obstacle 27 | self.set_reward([0, 1], -1) 28 | self.set_reward([1, 2], -1) 29 | self.set_reward([2, 3], -1) 30 | # #goal 31 | self.set_reward([4, 4], 1) 32 | 33 | def _build_canvas(self): 34 | canvas = tk.Canvas(self, bg='white', 35 | height=HEIGHT * UNIT, 36 | width=WIDTH * UNIT) 37 | # create grids 38 | for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 39 | x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT 40 | canvas.create_line(x0, y0, x1, y1) 41 | for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 42 | x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r 43 | canvas.create_line(x0, y0, x1, y1) 44 | 45 | self.rewards = [] 46 | self.goal = [] 47 | # add image to canvas 48 | x, y = UNIT/2, UNIT/2 49 | self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) 50 | 51 | # pack all` 52 | canvas.pack() 53 | 54 | return canvas 55 | 56 | def load_images(self): 57 | rectangle = PhotoImage( 58 | Image.open("../img/rectangle.png").resize((30, 30))) 59 | triangle = PhotoImage( 60 | Image.open("../img/triangle.png").resize((30, 30))) 61 | circle = PhotoImage( 62 | Image.open("../img/circle.png").resize((30, 30))) 63 | 64 | return rectangle, triangle, circle 65 | 66 | def reset_reward(self): 67 | 68 | for reward in self.rewards: 69 | self.canvas.delete(reward['figure']) 70 | 71 | self.rewards.clear() 72 | self.goal.clear() 73 | self.set_reward([0, 1], -1) 74 | self.set_reward([1, 2], -1) 75 | self.set_reward([2, 3], -1) 76 | 77 | # #goal 78 | self.set_reward([4, 4], 1) 79 | 80 | def set_reward(self, state, reward): 81 | state = [int(state[0]), int(state[1])] 82 | x = int(state[0]) 83 | y = int(state[1]) 84 | temp = {} 85 | if reward > 0: 86 | temp['reward'] = reward 87 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 88 | (UNIT * y) + UNIT / 2, 89 | image=self.shapes[2]) 90 | 91 | self.goal.append(temp['figure']) 92 | 93 | 94 | elif reward < 0: 95 | temp['direction'] = -1 96 | temp['reward'] = reward 97 | temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, 98 | (UNIT * y) + UNIT / 2, 99 | image=self.shapes[1]) 100 | 101 | temp['coords'] = self.canvas.coords(temp['figure']) 102 | temp['state'] = state 103 | self.rewards.append(temp) 104 | 105 | # new methods 106 | 107 | def check_if_reward(self, state): 108 | check_list = dict() 109 | check_list['if_goal'] = False 110 | rewards = 0 111 | 112 | for reward in self.rewards: 113 | if reward['state'] == state: 114 | rewards += reward['reward'] 115 | if reward['reward'] > 0: 116 | check_list['if_goal'] = True 117 | 118 | check_list['rewards'] = rewards 119 | 120 | return check_list 121 | 122 | def coords_to_state(self, coords): 123 | x = int((coords[0] - UNIT / 2) / UNIT) 124 | y = int((coords[1] - UNIT / 2) / UNIT) 125 | return [x, y] 126 | 127 | def reset(self): 128 | self.update() 129 | x, 
y = self.canvas.coords(self.rectangle) 130 | self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) 131 | # return observation 132 | self.reset_reward() 133 | return self.get_state() 134 | 135 | def step(self, action): 136 | self.counter += 1 137 | self.render() 138 | 139 | if self.counter % 2 == 1: 140 | self.rewards = self.move_rewards() 141 | 142 | next_coords = self.move(self.rectangle, action) 143 | check = self.check_if_reward(self.coords_to_state(next_coords)) 144 | done = check['if_goal'] 145 | reward = check['rewards'] 146 | reward -= 0.1 147 | self.canvas.tag_raise(self.rectangle) 148 | 149 | s_ = self.get_state() 150 | 151 | return s_, reward, done 152 | 153 | def get_state(self): 154 | 155 | location = self.coords_to_state(self.canvas.coords(self.rectangle)) 156 | agent_x = location[0] 157 | agent_y = location[1] 158 | 159 | states = list() 160 | 161 | # locations.append(agent_x) 162 | # locations.append(agent_y) 163 | 164 | for reward in self.rewards: 165 | reward_location = reward['state'] 166 | states.append(reward_location[0] - agent_x) 167 | states.append(reward_location[1] - agent_y) 168 | if reward['reward'] < 0: 169 | states.append(-1) 170 | states.append(reward['direction']) 171 | else: 172 | states.append(1) 173 | 174 | return states 175 | 176 | def move_rewards(self): 177 | new_rewards = [] 178 | for temp in self.rewards: 179 | if temp['reward'] > 0: 180 | new_rewards.append(temp) 181 | continue 182 | temp['coords'] = self.move_const(temp) 183 | temp['state'] = self.coords_to_state(temp['coords']) 184 | new_rewards.append(temp) 185 | return new_rewards 186 | 187 | def move_const(self, target): 188 | 189 | s = self.canvas.coords(target['figure']) 190 | 191 | base_action = np.array([0, 0]) 192 | 193 | if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: 194 | target['direction'] = 1 195 | elif s[0] == UNIT / 2: 196 | target['direction'] = -1 197 | 198 | if target['direction'] == -1: 199 | base_action[0] += UNIT 200 | elif target['direction'] == 1: 201 | base_action[0] -= UNIT 202 | 203 | if (target['figure'] is not self.rectangle 204 | and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): 205 | base_action = np.array([0, 0]) 206 | 207 | self.canvas.move(target['figure'], base_action[0], base_action[1]) 208 | 209 | s_ = self.canvas.coords(target['figure']) 210 | 211 | return s_ 212 | 213 | def move(self, target, action): 214 | s = self.canvas.coords(target) 215 | 216 | base_action = np.array([0, 0]) 217 | 218 | if action == 0: # up 219 | if s[1] > UNIT: 220 | base_action[1] -= UNIT 221 | elif action == 1: # down 222 | if s[1] < (HEIGHT - 1) * UNIT: 223 | base_action[1] += UNIT 224 | elif action == 2: # right 225 | if s[0] < (WIDTH - 1) * UNIT: 226 | base_action[0] += UNIT 227 | elif action == 3: # left 228 | if s[0] > UNIT: 229 | base_action[0] -= UNIT 230 | 231 | self.canvas.move(target, base_action[0], base_action[1]) 232 | 233 | s_ = self.canvas.coords(target) 234 | 235 | return s_ 236 | 237 | def render(self): 238 | time.sleep(0.07) 239 | self.update() 240 | -------------------------------------------------------------------------------- /1-grid-world/7-reinforce/reinforce_agent.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import pylab 3 | import numpy as np 4 | from environment import Env 5 | from keras.layers import Dense 6 | from keras.optimizers import Adam 7 | from keras.models import Sequential 8 | from keras import backend as K 9 | 10 | EPISODES = 2500 11 | 12 | 13 | # this is REINFORCE Agent for GridWorld 
14 | class ReinforceAgent: 15 | def __init__(self): 16 | self.load_model = True 17 | # actions which agent can do 18 | self.action_space = [0, 1, 2, 3, 4] 19 | # get size of state and action 20 | self.action_size = len(self.action_space) 21 | self.state_size = 15 22 | self.discount_factor = 0.99 23 | self.learning_rate = 0.001 24 | 25 | self.model = self.build_model() 26 | self.optimizer = self.optimizer() 27 | self.states, self.actions, self.rewards = [], [], [] 28 | 29 | if self.load_model: 30 | self.model.load_weights('./save_model/reinforce_trained.h5') 31 | 32 | # state is input and probability of each action(policy) is output of network 33 | def build_model(self): 34 | model = Sequential() 35 | model.add(Dense(24, input_dim=self.state_size, activation='relu')) 36 | model.add(Dense(24, activation='relu')) 37 | model.add(Dense(self.action_size, activation='softmax')) 38 | model.summary() 39 | return model 40 | 41 | # create error function and training function to update policy network 42 | def optimizer(self): 43 | action = K.placeholder(shape=[None, 5]) 44 | discounted_rewards = K.placeholder(shape=[None, ]) 45 | 46 | # Calculate cross entropy error function 47 | action_prob = K.sum(action * self.model.output, axis=1) 48 | cross_entropy = K.log(action_prob) * discounted_rewards 49 | loss = -K.sum(cross_entropy) 50 | 51 | # create training function 52 | optimizer = Adam(lr=self.learning_rate) 53 | updates = optimizer.get_updates(self.model.trainable_weights, [], 54 | loss) 55 | train = K.function([self.model.input, action, discounted_rewards], [], 56 | updates=updates) 57 | 58 | return train 59 | 60 | # get action from policy network 61 | def get_action(self, state): 62 | policy = self.model.predict(state)[0] 63 | return np.random.choice(self.action_size, 1, p=policy)[0] 64 | 65 | # calculate discounted rewards 66 | def discount_rewards(self, rewards): 67 | discounted_rewards = np.zeros_like(rewards) 68 | running_add = 0 69 | for t in reversed(range(0, len(rewards))): 70 | running_add = running_add * self.discount_factor + rewards[t] 71 | discounted_rewards[t] = running_add 72 | return discounted_rewards 73 | 74 | # save states, actions and rewards for an episode 75 | def append_sample(self, state, action, reward): 76 | self.states.append(state[0]) 77 | self.rewards.append(reward) 78 | act = np.zeros(self.action_size) 79 | act[action] = 1 80 | self.actions.append(act) 81 | 82 | # update policy neural network 83 | def train_model(self): 84 | discounted_rewards = np.float32(self.discount_rewards(self.rewards)) 85 | discounted_rewards -= np.mean(discounted_rewards) 86 | discounted_rewards /= np.std(discounted_rewards) 87 | 88 | self.optimizer([self.states, self.actions, discounted_rewards]) 89 | self.states, self.actions, self.rewards = [], [], [] 90 | 91 | 92 | if __name__ == "__main__": 93 | env = Env() 94 | agent = ReinforceAgent() 95 | 96 | global_step = 0 97 | scores, episodes = [], [] 98 | 99 | for e in range(EPISODES): 100 | done = False 101 | score = 0 102 | # fresh env 103 | state = env.reset() 104 | state = np.reshape(state, [1, 15]) 105 | 106 | while not done: 107 | global_step += 1 108 | # get action for the current state and go one step in environment 109 | action = agent.get_action(state) 110 | next_state, reward, done = env.step(action) 111 | next_state = np.reshape(next_state, [1, 15]) 112 | 113 | agent.append_sample(state, action, reward) 114 | score += reward 115 | state = copy.deepcopy(next_state) 116 | 117 | if done: 118 | # update policy neural network for each 
episode 119 | agent.train_model() 120 | scores.append(score) 121 | episodes.append(e) 122 | score = round(score, 2) 123 | print("episode:", e, " score:", score, " time_step:", 124 | global_step) 125 | 126 | if e % 100 == 0: 127 | pylab.plot(episodes, scores, 'b') 128 | pylab.savefig("./save_graph/reinforce.png") 129 | agent.model.save_weights("./save_model/reinforce.h5") 130 | -------------------------------------------------------------------------------- /1-grid-world/7-reinforce/save_graph/reinforce_trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/7-reinforce/save_graph/reinforce_trained.png -------------------------------------------------------------------------------- /1-grid-world/7-reinforce/save_model/reinforce_trained.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 -------------------------------------------------------------------------------- /1-grid-world/README.md: -------------------------------------------------------------------------------- 1 | # Grid World with Reinforcement Learning 2 | This is a Grid World example that we made for testing simple algorithms. 3 | The game is simple: the red rectangle must reach the circle while avoiding the triangles. 4 | 5 |

6 | 7 |
8 | 9 | 10 | 11 | ## Dynamic Programming 12 | **1. Policy Iteration** 13 | 14 | **2. Value Iteration** 15 | 16 |
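As a rough illustration of what these dynamic-programming examples compute, the sketch below runs a value-iteration sweep over a small grid (policy iteration instead alternates policy evaluation and policy improvement). This is a minimal sketch only: the grid layout, reward placement, and function names are illustrative and are not the ones used in `1-policy-iteration` / `2-value-iteration`.

```python
import numpy as np

# Minimal value-iteration sketch on a 5x5 grid (illustrative layout, not the repo's Env).
GRID = 5
GAMMA = 0.9
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
REWARD = np.zeros((GRID, GRID))
REWARD[4, 4] = 1.0   # goal (not treated as terminal, to keep the sketch short)
REWARD[1, 2] = -1.0  # an obstacle cell

def value_iteration(sweeps=50):
    value = np.zeros((GRID, GRID))
    for _ in range(sweeps):
        new_value = np.zeros_like(value)
        for x in range(GRID):
            for y in range(GRID):
                q_values = []
                for dx, dy in ACTIONS:
                    # moves off the grid keep the agent in place
                    nx = min(max(x + dx, 0), GRID - 1)
                    ny = min(max(y + dy, 0), GRID - 1)
                    q_values.append(REWARD[nx, ny] + GAMMA * value[nx, ny])
                new_value[x, y] = max(q_values)  # Bellman optimality backup
        value = new_value
    return value

print(np.round(value_iteration(), 2))
```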
17 | 18 | ## Fundamental Reinforcement Learning Algorithms 19 | **3. Monte-Carlo** 20 | 21 | **4. SARSA** 22 | 23 | **5. Q-Learning** 24 | 25 |
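The main difference between SARSA (4) and Q-Learning (5) is the bootstrap target in the one-step update; the tabular sketch below shows the two rules side by side. Names such as `Q`, `alpha`, and `gamma` are illustrative and not taken from the agents in this folder.

```python
from collections import defaultdict

# Tabular one-step updates. Q maps state -> {action: value};
# alpha is the learning rate, gamma the discount factor.
def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.01, gamma=0.9):
    # on-policy: bootstrap with the action actually chosen at s'
    Q[s][a] += alpha * (r + gamma * Q[s_next][a_next] - Q[s][a])

def q_learning_update(Q, s, a, r, s_next, alpha=0.01, gamma=0.9):
    # off-policy: bootstrap with the greedy (maximum) value at s'
    Q[s][a] += alpha * (r + gamma * max(Q[s_next].values()) - Q[s][a])

# tiny usage example with tuple states and two actions
Q = defaultdict(lambda: {0: 0.0, 1: 0.0})
sarsa_update(Q, s=(0, 0), a=1, r=0.0, s_next=(0, 1), a_next=0)
q_learning_update(Q, s=(0, 1), a=0, r=1.0, s_next=(1, 1))
```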
26 | 27 | ## Further Reinforcement Learning Algorithms 28 | >We have changed the Grid World so that the obstacles move. To solve this problem, we have to use a function approximator; 29 | we used a neural network as the function approximator (see the sketch below). 30 | 31 |
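The deep SARSA agent in `6-deep-sarsa/deep_sarsa_agent.py` (shown above) builds exactly this kind of approximator: a small Keras MLP mapping the 15-dimensional state to one Q value per action. Reduced to just the network, it looks roughly like the sketch below; `build_q_network` is an illustrative name, while the layer sizes and optimizer mirror that agent's `build_model`.

```python
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Q-function approximator: state vector in, one Q value per action out.
def build_q_network(state_size=15, action_size=5, learning_rate=0.001):
    model = Sequential()
    model.add(Dense(30, input_dim=state_size, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    # mean squared error between predicted and target Q values
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate))
    return model
```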

32 | 33 |
34 | 35 | **6. DQN** 36 | 37 | **7. Policy Gradient** 38 | 39 | 40 | -------------------------------------------------------------------------------- /1-grid-world/gridworld.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/gridworld.png -------------------------------------------------------------------------------- /1-grid-world/gridworld_changing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/gridworld_changing.png -------------------------------------------------------------------------------- /1-grid-world/img/circle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/circle.png -------------------------------------------------------------------------------- /1-grid-world/img/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/down.png -------------------------------------------------------------------------------- /1-grid-world/img/left.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/left.png -------------------------------------------------------------------------------- /1-grid-world/img/rectangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/rectangle.png -------------------------------------------------------------------------------- /1-grid-world/img/right.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/right.png -------------------------------------------------------------------------------- /1-grid-world/img/triangle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/triangle.png -------------------------------------------------------------------------------- /1-grid-world/img/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/1-grid-world/img/up.png -------------------------------------------------------------------------------- /2-cartpole/1-dqn/SumTree.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | class SumTree: 5 | write = 0 6 | 7 | def __init__(self, capacity): 8 | self.capacity = capacity 9 | self.tree = numpy.zeros(2 * capacity - 1) 10 | self.data = numpy.zeros(capacity, dtype=object) 11 | 12 | def _propagate(self, idx, change): 13 | parent = (idx - 1) // 2 14 | 15 | self.tree[parent] += change 16 | 17 | if parent != 0: 
18 | self._propagate(parent, change) 19 | 20 | def _retrieve(self, idx, s): 21 | left = 2 * idx + 1 22 | right = left + 1 23 | 24 | if left >= len(self.tree): 25 | return idx 26 | 27 | if s <= self.tree[left]: 28 | return self._retrieve(left, s) 29 | else: 30 | return self._retrieve(right, s - self.tree[left]) 31 | 32 | def total(self): 33 | return self.tree[0] 34 | 35 | def add(self, p, data): 36 | idx = self.write + self.capacity - 1 37 | 38 | self.data[self.write] = data 39 | self.update(idx, p) 40 | 41 | self.write += 1 42 | if self.write >= self.capacity: 43 | self.write = 0 44 | 45 | def update(self, idx, p): 46 | change = p - self.tree[idx] 47 | 48 | self.tree[idx] = p 49 | self._propagate(idx, change) 50 | 51 | def get(self, s): 52 | idx = self._retrieve(0, s) 53 | dataIdx = idx - self.capacity + 1 54 | 55 | return (idx, self.tree[idx], self.data[dataIdx]) 56 | -------------------------------------------------------------------------------- /2-cartpole/1-dqn/cartpole_dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import random 5 | import numpy as np 6 | from collections import deque 7 | from keras.layers import Dense 8 | from keras.optimizers import Adam 9 | from keras.models import Sequential 10 | 11 | EPISODES = 300 12 | 13 | 14 | # DQN Agent for the Cartpole 15 | # it uses Neural Network to approximate q function 16 | # and replay memory & target q network 17 | class DQNAgent: 18 | def __init__(self, state_size, action_size): 19 | # if you want to see Cartpole learning, then change to True 20 | self.render = False 21 | self.load_model = False 22 | 23 | # get size of state and action 24 | self.state_size = state_size 25 | self.action_size = action_size 26 | 27 | # These are hyper parameters for the DQN 28 | self.discount_factor = 0.99 29 | self.learning_rate = 0.001 30 | self.epsilon = 1.0 31 | self.epsilon_decay = 0.999 32 | self.epsilon_min = 0.01 33 | self.batch_size = 64 34 | self.train_start = 1000 35 | # create replay memory using deque 36 | self.memory = deque(maxlen=2000) 37 | 38 | # create main model and target model 39 | self.model = self.build_model() 40 | self.target_model = self.build_model() 41 | 42 | # initialize target model 43 | self.update_target_model() 44 | 45 | if self.load_model: 46 | self.model.load_weights("./save_model/cartpole_dqn.h5") 47 | 48 | # approximate Q function using Neural Network 49 | # state is input and Q Value of each action is output of network 50 | def build_model(self): 51 | model = Sequential() 52 | model.add(Dense(24, input_dim=self.state_size, activation='relu', 53 | kernel_initializer='he_uniform')) 54 | model.add(Dense(24, activation='relu', 55 | kernel_initializer='he_uniform')) 56 | model.add(Dense(self.action_size, activation='linear', 57 | kernel_initializer='he_uniform')) 58 | model.summary() 59 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 60 | return model 61 | 62 | # after some time interval update the target model to be same with model 63 | def update_target_model(self): 64 | self.target_model.set_weights(self.model.get_weights()) 65 | 66 | # get action from model using epsilon-greedy policy 67 | def get_action(self, state): 68 | if np.random.rand() <= self.epsilon: 69 | return random.randrange(self.action_size) 70 | else: 71 | q_value = self.model.predict(state) 72 | return np.argmax(q_value[0]) 73 | 74 | # save sample to the replay memory 75 | def append_sample(self, state, action, reward, next_state, done): 
76 | self.memory.append((state, action, reward, next_state, done)) 77 | if self.epsilon > self.epsilon_min: 78 | self.epsilon *= self.epsilon_decay 79 | 80 | # pick samples randomly from replay memory (with batch_size) 81 | def train_model(self): 82 | if len(self.memory) < self.train_start: 83 | return 84 | batch_size = min(self.batch_size, len(self.memory)) 85 | mini_batch = random.sample(self.memory, batch_size) 86 | 87 | update_input = np.zeros((batch_size, self.state_size)) 88 | update_target = np.zeros((batch_size, self.state_size)) 89 | action, reward, done = [], [], [] 90 | 91 | for i in range(self.batch_size): 92 | update_input[i] = mini_batch[i][0] 93 | action.append(mini_batch[i][1]) 94 | reward.append(mini_batch[i][2]) 95 | update_target[i] = mini_batch[i][3] 96 | done.append(mini_batch[i][4]) 97 | 98 | target = self.model.predict(update_input) 99 | target_val = self.target_model.predict(update_target) 100 | 101 | for i in range(self.batch_size): 102 | # Q Learning: get maximum Q value at s' from target model 103 | if done[i]: 104 | target[i][action[i]] = reward[i] 105 | else: 106 | target[i][action[i]] = reward[i] + self.discount_factor * ( 107 | np.amax(target_val[i])) 108 | 109 | # and do the model fit! 110 | self.model.fit(update_input, target, batch_size=self.batch_size, 111 | epochs=1, verbose=0) 112 | 113 | 114 | if __name__ == "__main__": 115 | # In case of CartPole-v1, maximum length of episode is 500 116 | env = gym.make('CartPole-v1') 117 | # get size of state and action from environment 118 | state_size = env.observation_space.shape[0] 119 | action_size = env.action_space.n 120 | 121 | agent = DQNAgent(state_size, action_size) 122 | 123 | scores, episodes = [], [] 124 | 125 | for e in range(EPISODES): 126 | done = False 127 | score = 0 128 | state = env.reset() 129 | state = np.reshape(state, [1, state_size]) 130 | 131 | while not done: 132 | if agent.render: 133 | env.render() 134 | 135 | # get action for the current state and go one step in environment 136 | action = agent.get_action(state) 137 | next_state, reward, done, info = env.step(action) 138 | next_state = np.reshape(next_state, [1, state_size]) 139 | # if an action make the episode end, then gives penalty of -100 140 | reward = reward if not done or score == 499 else -100 141 | 142 | # save the sample to the replay memory 143 | agent.append_sample(state, action, reward, next_state, done) 144 | # every time step do the training 145 | agent.train_model() 146 | score += reward 147 | state = next_state 148 | 149 | if done: 150 | # every episode update the target model to be same with model 151 | agent.update_target_model() 152 | 153 | # every episode, plot the play time 154 | score = score if score == 500 else score + 100 155 | scores.append(score) 156 | episodes.append(e) 157 | pylab.plot(episodes, scores, 'b') 158 | pylab.savefig("./save_graph/cartpole_dqn.png") 159 | print("episode:", e, " score:", score, " memory length:", 160 | len(agent.memory), " epsilon:", agent.epsilon) 161 | 162 | # if the mean of scores of last 10 episode is bigger than 490 163 | # stop training 164 | if np.mean(scores[-min(10, len(scores)):]) > 490: 165 | sys.exit() 166 | 167 | # save the model 168 | if e % 50 == 0: 169 | agent.model.save_weights("./save_model/cartpole_dqn.h5") 170 | -------------------------------------------------------------------------------- /2-cartpole/1-dqn/cartpole_only_per.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | 
import random 5 | import numpy as np 6 | from SumTree import SumTree 7 | from collections import deque 8 | from keras.layers import Dense 9 | from keras.optimizers import Adam 10 | from keras.models import Sequential 11 | 12 | EPISODES = 300 13 | 14 | 15 | # DQN agent for the CartPole example 16 | class DQNAgent: 17 | def __init__(self, state_size, action_size): 18 | self.render = False 19 | self.load_model = False 20 | 21 | # define the size of state and action 22 | self.state_size = state_size 23 | self.action_size = action_size 24 | 25 | # DQN hyperparameters 26 | self.discount_factor = 0.99 27 | self.learning_rate = 0.001 28 | self.epsilon = 1.0 29 | self.epsilon_decay = 0.999 30 | self.epsilon_min = 0.01 31 | self.batch_size = 64 32 | self.train_start = 2000 33 | self.memory_size = 2000 34 | 35 | # replay memory, maximum size 2000 36 | self.memory = Memory(self.memory_size) 37 | 38 | # create main model and target model 39 | self.model = self.build_model() 40 | self.target_model = self.build_model() 41 | 42 | # initialize target model 43 | self.update_target_model() 44 | 45 | if self.load_model: 46 | self.model.load_weights("./save_model/cartpole_dqn_trained.h5") 47 | 48 | # neural network with the state as input and Q values as output 49 | def build_model(self): 50 | model = Sequential() 51 | model.add(Dense(24, input_dim=self.state_size, activation='relu', 52 | kernel_initializer='he_uniform')) 53 | model.add(Dense(24, activation='relu', 54 | kernel_initializer='he_uniform')) 55 | model.add(Dense(self.action_size, activation='linear', 56 | kernel_initializer='he_uniform')) 57 | model.summary() 58 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 59 | return model 60 | 61 | # update the target model with the weights of the model 62 | def update_target_model(self): 63 | self.target_model.set_weights(self.model.get_weights()) 64 | 65 | # choose an action with the epsilon-greedy policy 66 | def get_action(self, state): 67 | if np.random.rand() <= self.epsilon: 68 | return random.randrange(self.action_size) 69 | else: 70 | q_value = self.model.predict(state) 71 | return np.argmax(q_value[0]) 72 | 73 | # save the sample to the replay memory 74 | def append_sample(self, state, action, reward, next_state, done): 75 | if self.epsilon == 1: 76 | done = True 77 | 78 | # compute the TD-error and store it in the memory as well 79 | target = self.model.predict([state]) 80 | old_val = target[0][action] 81 | target_val = self.target_model.predict([next_state]) 82 | if done: 83 | target[0][action] = reward 84 | else: 85 | target[0][action] = reward + self.discount_factor * ( 86 | np.amax(target_val[0])) 87 | error = abs(old_val - target[0][action]) 88 | 89 | self.memory.add(error, (state, action, reward, next_state, done)) 90 | 91 | # train the model with batches sampled from the replay memory 92 | def train_model(self): 93 | if self.epsilon > self.epsilon_min: 94 | self.epsilon *= self.epsilon_decay 95 | 96 | # sample batch_size samples from the memory 97 | mini_batch = self.memory.sample(self.batch_size) 98 | 99 | errors = np.zeros(self.batch_size) 100 | states = np.zeros((self.batch_size, self.state_size)) 101 | next_states = np.zeros((self.batch_size, self.state_size)) 102 | actions, rewards, dones = [], [], [] 103 | 104 | for i in range(self.batch_size): 105 | states[i] = mini_batch[i][1][0] 106 | actions.append(mini_batch[i][1][1]) 107 | rewards.append(mini_batch[i][1][2]) 108 | next_states[i] = mini_batch[i][1][3] 109 | dones.append(mini_batch[i][1][4]) 110 | 111 | # Q values of the model for the current states 112 | # Q values of the target model for the next states 113 | target = self.model.predict(states) 114 | target_val = self.target_model.predict(next_states) 115 | 116 | # update target using the Bellman optimality equation 117 | for i in range(self.batch_size): 118 | old_val = target[i][actions[i]] 119 | if dones[i]: 120 | 
target[i][actions[i]] = rewards[i] 121 | else: 122 | target[i][actions[i]] = rewards[i] + self.discount_factor * ( 123 | np.amax(target_val[i])) 124 | # store the TD-error 125 | errors[i] = abs(old_val - target[i][actions[i]]) 126 | 127 | # update the priorities with the TD-errors 128 | for i in range(self.batch_size): 129 | idx = mini_batch[i][0] 130 | self.memory.update(idx, errors[i]) 131 | 132 | self.model.fit(states, target, batch_size=self.batch_size, 133 | epochs=1, verbose=0) 134 | 135 | 136 | class Memory: # stored as ( s, a, r, s_ ) in SumTree 137 | e = 0.01 138 | a = 0.6 139 | 140 | def __init__(self, capacity): 141 | self.tree = SumTree(capacity) 142 | 143 | def _getPriority(self, error): 144 | return (error + self.e) ** self.a 145 | 146 | def add(self, error, sample): 147 | p = self._getPriority(error) 148 | self.tree.add(p, sample) 149 | 150 | def sample(self, n): 151 | batch = [] 152 | segment = self.tree.total() / n 153 | 154 | for i in range(n): 155 | a = segment * i 156 | b = segment * (i + 1) 157 | 158 | s = random.uniform(a, b) 159 | (idx, p, data) = self.tree.get(s) 160 | batch.append((idx, data)) 161 | 162 | return batch 163 | 164 | def update(self, idx, error): 165 | p = self._getPriority(error) 166 | self.tree.update(idx, p) 167 | 168 | 169 | if __name__ == "__main__": 170 | # CartPole-v1 environment; the maximum number of time steps is 500 171 | env = gym.make('CartPole-v1') 172 | state_size = env.observation_space.shape[0] 173 | action_size = env.action_space.n 174 | 175 | # create the DQN agent 176 | agent = DQNAgent(state_size, action_size) 177 | 178 | scores, episodes = [], [] 179 | 180 | step = 0 181 | for e in range(EPISODES): 182 | done = False 183 | score = 0 184 | # reset env 185 | state = env.reset() 186 | state = np.reshape(state, [1, state_size]) 187 | 188 | while not done: 189 | if agent.render: 190 | env.render() 191 | step += 1 192 | # choose an action for the current state 193 | action = agent.get_action(state) 194 | # take one time step in the environment with the chosen action 195 | next_state, reward, done, info = env.step(action) 196 | next_state = np.reshape(next_state, [1, state_size]) 197 | # give a penalty if the episode ends in the middle (here -10) 198 | r = reward if not done or score+reward == 500 else -10 199 | # save the sample to the replay memory 200 | agent.append_sample(state, action, r, next_state, done) 201 | # train every time step 202 | if step >= agent.train_start: 203 | agent.train_model() 204 | 205 | score += reward 206 | state = next_state 207 | 208 | if done: 209 | # every episode, update the target model with the model's weights 210 | agent.update_target_model() 211 | 212 | # score = score if score == 500 else score + 100 213 | # print the training result every episode 214 | scores.append(score) 215 | episodes.append(e) 216 | pylab.plot(episodes, scores, 'b') 217 | pylab.savefig("./save_graph/cartpole_dqn.png") 218 | print("episode:", e, " score:", score, " memory length:", 219 | step if step <= agent.memory_size else agent.memory_size, " epsilon:", agent.epsilon) 220 | 221 | # stop training if the mean score of the last 10 episodes is greater than 490 222 | if np.mean(scores[-min(10, len(scores)):]) > 490: 223 | agent.model.save_weights("./save_model/cartpole_dqn.h5") 224 | sys.exit() 225 | -------------------------------------------------------------------------------- /2-cartpole/1-dqn/save_graph/Cartpole_DQN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/1-dqn/save_graph/Cartpole_DQN.png -------------------------------------------------------------------------------- /2-cartpole/1-dqn/save_model/cartpole_dqn.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/1-dqn/save_model/cartpole_dqn.h5 -------------------------------------------------------------------------------- /2-cartpole/2-double-dqn/cartpole_ddqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import random 5 | import numpy as np 6 | from collections import deque 7 | from keras.layers import Dense 8 | from keras.optimizers import Adam 9 | from keras.models import Sequential 10 | 11 | EPISODES = 300 12 | 13 | 14 | # Double DQN Agent for the Cartpole 15 | # it uses Neural Network to approximate q function 16 | # and replay memory & target q network 17 | class DoubleDQNAgent: 18 | def __init__(self, state_size, action_size): 19 | # if you want to see Cartpole learning, then change to True 20 | self.render = False 21 | self.load_model = False 22 | # get size of state and action 23 | self.state_size = state_size 24 | self.action_size = action_size 25 | 26 | # these is hyper parameters for the Double DQN 27 | self.discount_factor = 0.99 28 | self.learning_rate = 0.001 29 | self.epsilon = 1.0 30 | self.epsilon_decay = 0.999 31 | self.epsilon_min = 0.01 32 | self.batch_size = 64 33 | self.train_start = 1000 34 | # create replay memory using deque 35 | self.memory = deque(maxlen=2000) 36 | 37 | # create main model and target model 38 | self.model = self.build_model() 39 | self.target_model = self.build_model() 40 | 41 | # initialize target model 42 | self.update_target_model() 43 | 44 | if self.load_model: 45 | self.model.load_weights("./save_model/cartpole_ddqn.h5") 46 | 47 | # approximate Q function using Neural Network 48 | # state is input and Q Value of each action is output of network 49 | def build_model(self): 50 | model = Sequential() 51 | model.add(Dense(24, input_dim=self.state_size, activation='relu', 52 | kernel_initializer='he_uniform')) 53 | model.add(Dense(24, activation='relu', 54 | kernel_initializer='he_uniform')) 55 | model.add(Dense(self.action_size, activation='linear', 56 | kernel_initializer='he_uniform')) 57 | model.summary() 58 | model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) 59 | return model 60 | 61 | # after some time interval update the target model to be same with model 62 | def update_target_model(self): 63 | self.target_model.set_weights(self.model.get_weights()) 64 | 65 | # get action from model using epsilon-greedy policy 66 | def get_action(self, state): 67 | if np.random.rand() <= self.epsilon: 68 | return random.randrange(self.action_size) 69 | else: 70 | q_value = self.model.predict(state) 71 | return np.argmax(q_value[0]) 72 | 73 | # save sample to the replay memory 74 | def append_sample(self, state, action, reward, next_state, done): 75 | self.memory.append((state, action, reward, next_state, done)) 76 | if self.epsilon > self.epsilon_min: 77 | self.epsilon *= self.epsilon_decay 78 | 79 | # pick samples randomly from replay memory (with batch_size) 80 | def train_model(self): 81 | if len(self.memory) < self.train_start: 82 | return 83 | batch_size = min(self.batch_size, len(self.memory)) 84 | mini_batch = random.sample(self.memory, batch_size) 85 | 86 | update_input = np.zeros((batch_size, self.state_size)) 87 | update_target = np.zeros((batch_size, self.state_size)) 88 | action, reward, done = [], [], [] 89 | 90 | for i in range(batch_size): 91 | 
update_input[i] = mini_batch[i][0] 92 | action.append(mini_batch[i][1]) 93 | reward.append(mini_batch[i][2]) 94 | update_target[i] = mini_batch[i][3] 95 | done.append(mini_batch[i][4]) 96 | 97 | target = self.model.predict(update_input) 98 | target_next = self.model.predict(update_target) 99 | target_val = self.target_model.predict(update_target) 100 | 101 | for i in range(self.batch_size): 102 | # like Q Learning, get maximum Q value at s' 103 | # But from target model 104 | if done[i]: 105 | target[i][action[i]] = reward[i] 106 | else: 107 | # the key point of Double DQN 108 | # selection of action is from model 109 | # update is from target model 110 | a = np.argmax(target_next[i]) 111 | target[i][action[i]] = reward[i] + self.discount_factor * ( 112 | target_val[i][a]) 113 | 114 | # make minibatch which includes target q value and predicted q value 115 | # and do the model fit! 116 | self.model.fit(update_input, target, batch_size=self.batch_size, 117 | epochs=1, verbose=0) 118 | 119 | 120 | if __name__ == "__main__": 121 | # In case of CartPole-v1, you can play until 500 time step 122 | env = gym.make('CartPole-v1') 123 | # get size of state and action from environment 124 | state_size = env.observation_space.shape[0] 125 | action_size = env.action_space.n 126 | 127 | agent = DoubleDQNAgent(state_size, action_size) 128 | 129 | scores, episodes = [], [] 130 | 131 | for e in range(EPISODES): 132 | done = False 133 | score = 0 134 | state = env.reset() 135 | state = np.reshape(state, [1, state_size]) 136 | 137 | while not done: 138 | if agent.render: 139 | env.render() 140 | 141 | # get action for the current state and go one step in environment 142 | action = agent.get_action(state) 143 | next_state, reward, done, info = env.step(action) 144 | next_state = np.reshape(next_state, [1, state_size]) 145 | # if an action make the episode end, then gives penalty of -100 146 | reward = reward if not done or score == 499 else -100 147 | 148 | # save the sample to the replay memory 149 | agent.append_sample(state, action, reward, next_state, done) 150 | # every time step do the training 151 | agent.train_model() 152 | score += reward 153 | state = next_state 154 | 155 | if done: 156 | # every episode update the target model to be same with model 157 | agent.update_target_model() 158 | 159 | # every episode, plot the play time 160 | score = score if score == 500 else score + 100 161 | scores.append(score) 162 | episodes.append(e) 163 | pylab.plot(episodes, scores, 'b') 164 | pylab.savefig("./save_graph/cartpole_ddqn.png") 165 | print("episode:", e, " score:", score, " memory length:", 166 | len(agent.memory), " epsilon:", agent.epsilon) 167 | 168 | # if the mean of scores of last 10 episode is bigger than 490 169 | # stop training 170 | if np.mean(scores[-min(10, len(scores)):]) > 490: 171 | sys.exit() 172 | 173 | # save the model 174 | if e % 50 == 0: 175 | agent.model.save_weights("./save_model/cartpole_ddqn.h5") 176 | -------------------------------------------------------------------------------- /2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png -------------------------------------------------------------------------------- /2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 -------------------------------------------------------------------------------- /2-cartpole/3-reinforce/cartpole_reinforce.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import numpy as np 5 | from keras.layers import Dense 6 | from keras.models import Sequential 7 | from keras.optimizers import Adam 8 | 9 | EPISODES = 1000 10 | 11 | 12 | # This is Policy Gradient agent for the Cartpole 13 | # In this example, we use REINFORCE algorithm which uses monte-carlo update rule 14 | class REINFORCEAgent: 15 | def __init__(self, state_size, action_size): 16 | # if you want to see Cartpole learning, then change to True 17 | self.render = False 18 | self.load_model = False 19 | # get size of state and action 20 | self.state_size = state_size 21 | self.action_size = action_size 22 | 23 | # These are hyper parameters for the Policy Gradient 24 | self.discount_factor = 0.99 25 | self.learning_rate = 0.001 26 | self.hidden1, self.hidden2 = 24, 24 27 | 28 | # create model for policy network 29 | self.model = self.build_model() 30 | 31 | # lists for the states, actions and rewards 32 | self.states, self.actions, self.rewards = [], [], [] 33 | 34 | if self.load_model: 35 | self.model.load_weights("./save_model/cartpole_reinforce.h5") 36 | 37 | # approximate policy using Neural Network 38 | # state is input and probability of each action is output of network 39 | def build_model(self): 40 | model = Sequential() 41 | model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) 42 | model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')) 43 | model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) 44 | model.summary() 45 | # Using categorical crossentropy as a loss is a trick to easily 46 | # implement the policy gradient. Categorical cross entropy is defined 47 | # H(p, q) = sum(p_i * log(q_i)). For the action taken, a, you set 48 | # p_a = advantage. q_a is the output of the policy network, which is 49 | # the probability of taking the action a, i.e. policy(s, a). 50 | # All other p_i are zero, thus we have H(p, q) = A * log(policy(s, a)) 51 | model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate)) 52 | return model 53 | 54 | # using the output of policy network, pick action stochastically 55 | def get_action(self, state): 56 | policy = self.model.predict(state, batch_size=1).flatten() 57 | return np.random.choice(self.action_size, 1, p=policy)[0] 58 | 59 | # In Policy Gradient, Q function is not available. 
60 | # Instead agent uses sample returns for evaluating policy 61 | def discount_rewards(self, rewards): 62 | discounted_rewards = np.zeros_like(rewards) 63 | running_add = 0 64 | for t in reversed(range(0, len(rewards))): 65 | running_add = running_add * self.discount_factor + rewards[t] 66 | discounted_rewards[t] = running_add 67 | return discounted_rewards 68 | 69 | # save of each step 70 | def append_sample(self, state, action, reward): 71 | self.states.append(state) 72 | self.rewards.append(reward) 73 | self.actions.append(action) 74 | 75 | # update policy network every episode 76 | def train_model(self): 77 | episode_length = len(self.states) 78 | 79 | discounted_rewards = self.discount_rewards(self.rewards) 80 | discounted_rewards -= np.mean(discounted_rewards) 81 | discounted_rewards /= np.std(discounted_rewards) 82 | 83 | update_inputs = np.zeros((episode_length, self.state_size)) 84 | advantages = np.zeros((episode_length, self.action_size)) 85 | 86 | for i in range(episode_length): 87 | update_inputs[i] = self.states[i] 88 | advantages[i][self.actions[i]] = discounted_rewards[i] 89 | 90 | self.model.fit(update_inputs, advantages, epochs=1, verbose=0) 91 | self.states, self.actions, self.rewards = [], [], [] 92 | 93 | if __name__ == "__main__": 94 | # In case of CartPole-v1, you can play until 500 time step 95 | env = gym.make('CartPole-v1') 96 | # get size of state and action from environment 97 | state_size = env.observation_space.shape[0] 98 | action_size = env.action_space.n 99 | 100 | # make REINFORCE agent 101 | agent = REINFORCEAgent(state_size, action_size) 102 | 103 | scores, episodes = [], [] 104 | 105 | for e in range(EPISODES): 106 | done = False 107 | score = 0 108 | state = env.reset() 109 | state = np.reshape(state, [1, state_size]) 110 | 111 | while not done: 112 | if agent.render: 113 | env.render() 114 | 115 | # get action for the current state and go one step in environment 116 | action = agent.get_action(state) 117 | next_state, reward, done, info = env.step(action) 118 | next_state = np.reshape(next_state, [1, state_size]) 119 | reward = reward if not done or score == 499 else -100 120 | 121 | # save the sample to the memory 122 | agent.append_sample(state, action, reward) 123 | 124 | score += reward 125 | state = next_state 126 | 127 | if done: 128 | # every episode, agent learns from sample returns 129 | agent.train_model() 130 | 131 | # every episode, plot the play time 132 | score = score if score == 500 else score + 100 133 | scores.append(score) 134 | episodes.append(e) 135 | pylab.plot(episodes, scores, 'b') 136 | pylab.savefig("./save_graph/cartpole_reinforce.png") 137 | print("episode:", e, " score:", score) 138 | 139 | # if the mean of scores of last 10 episode is bigger than 490 140 | # stop training 141 | if np.mean(scores[-min(10, len(scores)):]) > 490: 142 | sys.exit() 143 | 144 | # save the model 145 | if e % 50 == 0: 146 | agent.model.save_weights("./save_model/cartpole_reinforce.h5") 147 | -------------------------------------------------------------------------------- /2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png -------------------------------------------------------------------------------- /2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 -------------------------------------------------------------------------------- /2-cartpole/4-actor-critic/cartpole_a2c.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import numpy as np 5 | from keras.layers import Dense 6 | from keras.models import Sequential 7 | from keras.optimizers import Adam 8 | 9 | EPISODES = 1000 10 | 11 | 12 | # A2C(Advantage Actor-Critic) agent for the Cartpole 13 | class A2CAgent: 14 | def __init__(self, state_size, action_size): 15 | # if you want to see Cartpole learning, then change to True 16 | self.render = False 17 | self.load_model = False 18 | # get size of state and action 19 | self.state_size = state_size 20 | self.action_size = action_size 21 | self.value_size = 1 22 | 23 | # These are hyper parameters for the Policy Gradient 24 | self.discount_factor = 0.99 25 | self.actor_lr = 0.001 26 | self.critic_lr = 0.005 27 | 28 | # create model for policy network 29 | self.actor = self.build_actor() 30 | self.critic = self.build_critic() 31 | 32 | if self.load_model: 33 | self.actor.load_weights("./save_model/cartpole_actor.h5") 34 | self.critic.load_weights("./save_model/cartpole_critic.h5") 35 | 36 | # approximate policy and value using Neural Network 37 | # actor: state is input and probability of each action is output of model 38 | def build_actor(self): 39 | actor = Sequential() 40 | actor.add(Dense(24, input_dim=self.state_size, activation='relu', 41 | kernel_initializer='he_uniform')) 42 | actor.add(Dense(self.action_size, activation='softmax', 43 | kernel_initializer='he_uniform')) 44 | actor.summary() 45 | # See note regarding crossentropy in cartpole_reinforce.py 46 | actor.compile(loss='categorical_crossentropy', 47 | optimizer=Adam(lr=self.actor_lr)) 48 | return actor 49 | 50 | # critic: state is input and value of state is output of model 51 | def build_critic(self): 52 | critic = Sequential() 53 | critic.add(Dense(24, input_dim=self.state_size, activation='relu', 54 | kernel_initializer='he_uniform')) 55 | critic.add(Dense(self.value_size, activation='linear', 56 | kernel_initializer='he_uniform')) 57 | critic.summary() 58 | critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr)) 59 | return critic 60 | 61 | # using the output of policy network, pick action stochastically 62 | def get_action(self, state): 63 | policy = self.actor.predict(state, batch_size=1).flatten() 64 | return np.random.choice(self.action_size, 1, p=policy)[0] 65 | 66 | # update policy network every episode 67 | def train_model(self, state, action, reward, next_state, done): 68 | target = np.zeros((1, self.value_size)) 69 | advantages = np.zeros((1, self.action_size)) 70 | 71 | value = self.critic.predict(state)[0] 72 | next_value = self.critic.predict(next_state)[0] 73 | 74 | if done: 75 | advantages[0][action] = reward - value 76 | target[0][0] = reward 77 | else: 78 | advantages[0][action] = reward + self.discount_factor * (next_value) - value 79 | target[0][0] = reward + self.discount_factor * next_value 80 | 81 | self.actor.fit(state, advantages, epochs=1, verbose=0) 82 | self.critic.fit(state, target, epochs=1, verbose=0) 83 | 84 | 85 | if __name__ == "__main__": 86 | # In case of CartPole-v1, maximum length of episode is 500 87 | env = 
gym.make('CartPole-v1') 88 | # get size of state and action from environment 89 | state_size = env.observation_space.shape[0] 90 | action_size = env.action_space.n 91 | 92 | # make A2C agent 93 | agent = A2CAgent(state_size, action_size) 94 | 95 | scores, episodes = [], [] 96 | 97 | for e in range(EPISODES): 98 | done = False 99 | score = 0 100 | state = env.reset() 101 | state = np.reshape(state, [1, state_size]) 102 | 103 | while not done: 104 | if agent.render: 105 | env.render() 106 | 107 | action = agent.get_action(state) 108 | next_state, reward, done, info = env.step(action) 109 | next_state = np.reshape(next_state, [1, state_size]) 110 | # if an action make the episode end, then gives penalty of -100 111 | reward = reward if not done or score == 499 else -100 112 | 113 | agent.train_model(state, action, reward, next_state, done) 114 | 115 | score += reward 116 | state = next_state 117 | 118 | if done: 119 | # every episode, plot the play time 120 | score = score if score == 500.0 else score + 100 121 | scores.append(score) 122 | episodes.append(e) 123 | pylab.plot(episodes, scores, 'b') 124 | pylab.savefig("./save_graph/cartpole_a2c.png") 125 | print("episode:", e, " score:", score) 126 | 127 | # if the mean of scores of last 10 episode is bigger than 490 128 | # stop training 129 | if np.mean(scores[-min(10, len(scores)):]) > 490: 130 | sys.exit() 131 | 132 | # save the model 133 | if e % 50 == 0: 134 | agent.actor.save_weights("./save_model/cartpole_actor.h5") 135 | agent.critic.save_weights("./save_model/cartpole_critic.h5") 136 | -------------------------------------------------------------------------------- /2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png -------------------------------------------------------------------------------- /2-cartpole/4-actor-critic/save_model/cartpole_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 -------------------------------------------------------------------------------- /2-cartpole/4-actor-critic/save_model/cartpole_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 -------------------------------------------------------------------------------- /2-cartpole/5-a3c/cartpole_a3c.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | import tensorflow as tf 4 | import pylab 5 | import time 6 | import gym 7 | from keras.layers import Dense, Input 8 | from keras.models import Model 9 | from keras.optimizers import Adam 10 | from keras import backend as K 11 | 12 | 13 | # global variables for threading 14 | episode = 0 15 | scores = [] 16 | 17 | EPISODES = 2000 18 | 19 | # This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole 20 | # In this example, we use A3C algorithm 21 | class A3CAgent: 22 | def __init__(self, state_size, action_size, env_name): 23 | # get size of state and action 24 | 
self.state_size = state_size 25 | self.action_size = action_size 26 | 27 | # get gym environment name 28 | self.env_name = env_name 29 | 30 | # these are hyper parameters for the A3C 31 | self.actor_lr = 0.001 32 | self.critic_lr = 0.001 33 | self.discount_factor = .99 34 | self.hidden1, self.hidden2 = 24, 24 35 | self.threads = 8 36 | 37 | # create model for actor and critic network 38 | self.actor, self.critic = self.build_model() 39 | 40 | # method for training actor and critic network 41 | self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] 42 | 43 | self.sess = tf.InteractiveSession() 44 | K.set_session(self.sess) 45 | self.sess.run(tf.global_variables_initializer()) 46 | 47 | # approximate policy and value using Neural Network 48 | # actor -> state is input and probability of each action is output of network 49 | # critic -> state is input and value of state is output of network 50 | # actor and critic network share first hidden layer 51 | def build_model(self): 52 | state = Input(batch_shape=(None, self.state_size)) 53 | shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state) 54 | 55 | actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared) 56 | action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden) 57 | 58 | value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared) 59 | state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden) 60 | 61 | actor = Model(inputs=state, outputs=action_prob) 62 | critic = Model(inputs=state, outputs=state_value) 63 | 64 | actor._make_predict_function() 65 | critic._make_predict_function() 66 | 67 | actor.summary() 68 | critic.summary() 69 | 70 | return actor, critic 71 | 72 | # make loss function for Policy Gradient 73 | # [log(action probability) * advantages] will be input for the back prop 74 | # we add entropy of action probability to loss 75 | def actor_optimizer(self): 76 | action = K.placeholder(shape=(None, self.action_size)) 77 | advantages = K.placeholder(shape=(None, )) 78 | 79 | policy = self.actor.output 80 | 81 | good_prob = K.sum(action * policy, axis=1) 82 | eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages) 83 | loss = -K.sum(eligibility) 84 | 85 | entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) 86 | 87 | actor_loss = loss + 0.01*entropy 88 | 89 | optimizer = Adam(lr=self.actor_lr) 90 | updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss) 91 | train = K.function([self.actor.input, action, advantages], [], updates=updates) 92 | return train 93 | 94 | # make loss function for Value approximation 95 | def critic_optimizer(self): 96 | discounted_reward = K.placeholder(shape=(None, )) 97 | 98 | value = self.critic.output 99 | 100 | loss = K.mean(K.square(discounted_reward - value)) 101 | 102 | optimizer = Adam(lr=self.critic_lr) 103 | updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) 104 | train = K.function([self.critic.input, discounted_reward], [], updates=updates) 105 | return train 106 | 107 | # make agents(local) and start training 108 | def train(self): 109 | # self.load_model('./save_model/cartpole_a3c.h5') 110 | agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor, 111 | self.action_size, self.state_size) for i in range(self.threads)] 112 | 113 | for agent in agents: 114 | 
agent.start() 115 | 116 | while True: 117 | time.sleep(20) 118 | 119 | plot = scores[:] 120 | pylab.plot(range(len(plot)), plot, 'b') 121 | pylab.savefig("./save_graph/cartpole_a3c.png") 122 | 123 | self.save_model('./save_model/cartpole_a3c.h5') 124 | 125 | def save_model(self, name): 126 | self.actor.save_weights(name + "_actor.h5") 127 | self.critic.save_weights(name + "_critic.h5") 128 | 129 | def load_model(self, name): 130 | self.actor.load_weights(name + "_actor.h5") 131 | self.critic.load_weights(name + "_critic.h5") 132 | 133 | # This is Agent(local) class for threading 134 | class Agent(threading.Thread): 135 | def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size): 136 | threading.Thread.__init__(self) 137 | 138 | self.states = [] 139 | self.rewards = [] 140 | self.actions = [] 141 | 142 | self.index = index 143 | self.actor = actor 144 | self.critic = critic 145 | self.optimizer = optimizer 146 | self.env_name = env_name 147 | self.discount_factor = discount_factor 148 | self.action_size = action_size 149 | self.state_size = state_size 150 | 151 | # Thread interactive with environment 152 | def run(self): 153 | global episode 154 | env = gym.make(self.env_name) 155 | while episode < EPISODES: 156 | state = env.reset() 157 | score = 0 158 | while True: 159 | action = self.get_action(state) 160 | next_state, reward, done, _ = env.step(action) 161 | score += reward 162 | 163 | self.memory(state, action, reward) 164 | 165 | state = next_state 166 | 167 | if done: 168 | episode += 1 169 | print("episode: ", episode, "/ score : ", score) 170 | scores.append(score) 171 | self.train_episode(score != 500) 172 | break 173 | 174 | # In Policy Gradient, Q function is not available. 175 | # Instead agent uses sample returns for evaluating policy 176 | def discount_rewards(self, rewards, done=True): 177 | discounted_rewards = np.zeros_like(rewards) 178 | running_add = 0 179 | if not done: 180 | running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0] 181 | for t in reversed(range(0, len(rewards))): 182 | running_add = running_add * self.discount_factor + rewards[t] 183 | discounted_rewards[t] = running_add 184 | return discounted_rewards 185 | 186 | # save of each step 187 | # this is used for calculating discounted rewards 188 | def memory(self, state, action, reward): 189 | self.states.append(state) 190 | act = np.zeros(self.action_size) 191 | act[action] = 1 192 | self.actions.append(act) 193 | self.rewards.append(reward) 194 | 195 | # update policy network and value network every episode 196 | def train_episode(self, done): 197 | discounted_rewards = self.discount_rewards(self.rewards, done) 198 | 199 | values = self.critic.predict(np.array(self.states)) 200 | values = np.reshape(values, len(values)) 201 | 202 | advantages = discounted_rewards - values 203 | 204 | self.optimizer[0]([self.states, self.actions, advantages]) 205 | self.optimizer[1]([self.states, discounted_rewards]) 206 | self.states, self.actions, self.rewards = [], [], [] 207 | 208 | def get_action(self, state): 209 | policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0] 210 | return np.random.choice(self.action_size, 1, p=policy)[0] 211 | 212 | 213 | if __name__ == "__main__": 214 | env_name = 'CartPole-v1' 215 | env = gym.make(env_name) 216 | 217 | state_size = env.observation_space.shape[0] 218 | action_size = env.action_space.n 219 | 220 | env.close() 221 | 222 | global_agent = A3CAgent(state_size, action_size, env_name) 
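# A3CAgent.train() (called on the next line) spawns self.threads local Agent
# threads, each with its own CartPole-v1 environment; they all share the single
# global actor/critic and the compiled optimizer functions, so every thread's
# episode-end update is applied asynchronously to the same weights. The main
# thread then just sleeps in a loop, plotting the shared `scores` list and
# saving the weights every 20 seconds (note that save_model() appends
# "_actor.h5"/"_critic.h5" to the name it is given).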
223 | global_agent.train() 224 | -------------------------------------------------------------------------------- /2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 -------------------------------------------------------------------------------- /2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 -------------------------------------------------------------------------------- /2-cartpole/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Keon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /2-cartpole/README.md: -------------------------------------------------------------------------------- 1 | # OpenAI gym Cartpole 2 | 3 | 4 | Various reinforcement learning algorithms for Cartpole example. 5 |

6 | 7 | 8 |
9 | This is the training graph of the DQN algorithm 10 | 11 |

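For reference, the DQN target these agents regress onto is the reward plus the discounted maximum Q-value of the next state, taken from a separate, periodically synced target network (the same computation appears in `breakout_dqn.py` under `3-atari`). A minimal NumPy sketch; the function and variable names here are illustrative, not taken from the scripts:

```python
import numpy as np

def dqn_targets(rewards, next_q_target, dones, gamma=0.99):
    """One-step DQN targets: r + gamma * max_a' Q_target(s', a'), or just r if terminal."""
    max_next_q = np.amax(next_q_target, axis=1)          # max over actions, per sample
    return rewards + gamma * (1.0 - dones) * max_next_q

# toy batch of two transitions (the second one terminal)
print(dqn_targets(np.array([1.0, 1.0]),
                  np.array([[0.5, 2.0], [1.5, 0.2]]),
                  np.array([0.0, 1.0])))                 # -> [2.98, 1.0]
```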
12 | 13 |
14 | This is the training graph of the Double DQN algorithm 15 | 16 |

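Double DQN changes only that target computation: the online model selects the greedy action at the next state and the target model evaluates it, which reduces the over-estimation of Q-values (this is the "key point of Double DQN" noted in `breakout_ddqn.py`). A minimal sketch with illustrative names:

```python
import numpy as np

def double_dqn_targets(rewards, next_q_online, next_q_target, dones, gamma=0.99):
    """r + gamma * Q_target(s', argmax_a' Q_online(s', a')), or just r if terminal."""
    best_actions = np.argmax(next_q_online, axis=1)                   # select with the online net
    chosen_q = next_q_target[np.arange(len(rewards)), best_actions]   # evaluate with the target net
    return rewards + gamma * (1.0 - dones) * chosen_q
```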
17 | 18 |
19 | This is the training graph of the Policy Gradient (REINFORCE) algorithm 20 |

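REINFORCE keeps no value network: after each episode it computes discounted returns from the sampled rewards and scales the log-probability of each taken action by its (typically normalized) return, as `cartpole_reinforce.py` and `pong_reinforce.py` do. A minimal sketch of the return computation, with illustrative names:

```python
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """G_t = r_t + gamma * G_{t+1}, computed backwards over one episode."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

g = discounted_returns([1.0, 1.0, 1.0])
g = (g - g.mean()) / g.std()          # normalization commonly applied before the update
# the policy loss is then  -sum_t log pi(a_t | s_t) * g[t]
```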
21 | 22 |
23 | This is the training graph of the Actor-Critic (A2C) algorithm 24 |

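Actor-Critic (A2C) replaces that raw return with an advantage: the critic predicts V(s), the advantage is the discounted return minus this baseline, the actor is trained on log pi(a|s) * advantage and the critic on the squared error of the return, as in `cartpole_a2c.py` and `cartpole_a3c.py`. A minimal sketch (names are illustrative):

```python
import numpy as np

def a2c_signals(returns, values):
    """Advantages for the actor and a squared-error loss for the critic."""
    returns = np.asarray(returns, dtype=float)
    values = np.asarray(values, dtype=float)
    advantages = returns - values                    # actor: -log pi(a|s) * advantage
    critic_loss = np.mean((returns - values) ** 2)   # critic: regress V(s) onto the return
    return advantages, critic_loss

adv, loss = a2c_signals([2.97, 1.99, 1.0], [2.5, 1.8, 0.9])
```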
-------------------------------------------------------------------------------- /2-cartpole/cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/2-cartpole/cartpole.png -------------------------------------------------------------------------------- /3-atari/1-breakout/breakout_ddqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | import tensorflow as tf 5 | from collections import deque 6 | from skimage.color import rgb2gray 7 | from skimage.transform import resize 8 | from keras.models import Sequential 9 | from keras.optimizers import RMSprop 10 | from keras.layers import Dense, Flatten 11 | from keras.layers.convolutional import Conv2D 12 | from keras import backend as K 13 | 14 | EPISODES = 50000 15 | 16 | 17 | class DDQNAgent: 18 | def __init__(self, action_size): 19 | self.render = False 20 | self.load_model = False 21 | # environment settings 22 | self.state_size = (84, 84, 4) 23 | self.action_size = action_size 24 | # parameters about epsilon 25 | self.epsilon = 1. 26 | self.epsilon_start, self.epsilon_end = 1.0, 0.1 27 | self.exploration_steps = 1000000. 28 | self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ 29 | / self.exploration_steps 30 | # parameters about training 31 | self.batch_size = 32 32 | self.train_start = 50000 33 | self.update_target_rate = 10000 34 | self.discount_factor = 0.99 35 | self.memory = deque(maxlen=400000) 36 | self.no_op_steps = 30 37 | # build 38 | self.model = self.build_model() 39 | self.target_model = self.build_model() 40 | self.update_target_model() 41 | 42 | self.optimizer = self.optimizer() 43 | 44 | self.sess = tf.InteractiveSession() 45 | K.set_session(self.sess) 46 | 47 | self.avg_q_max, self.avg_loss = 0, 0 48 | self.summary_placeholders, self.update_ops, self.summary_op = \ 49 | self.setup_summary() 50 | self.summary_writer = tf.summary.FileWriter( 51 | 'summary/breakout_ddqn', self.sess.graph) 52 | self.sess.run(tf.global_variables_initializer()) 53 | 54 | if self.load_model: 55 | self.model.load_weights("./save_model/breakout_ddqn.h5") 56 | 57 | # if the error is in [-1, 1], then the cost is quadratic to the error 58 | # But outside the interval, the cost is linear to the error 59 | def optimizer(self): 60 | a = K.placeholder(shape=(None, ), dtype='int32') 61 | y = K.placeholder(shape=(None, ), dtype='float32') 62 | 63 | py_x = self.model.output 64 | 65 | a_one_hot = K.one_hot(a, self.action_size) 66 | q_value = K.sum(py_x * a_one_hot, axis=1) 67 | error = K.abs(y - q_value) 68 | 69 | quadratic_part = K.clip(error, 0.0, 1.0) 70 | linear_part = error - quadratic_part 71 | loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) 72 | 73 | optimizer = RMSprop(lr=0.00025, epsilon=0.01) 74 | updates = optimizer.get_updates(self.model.trainable_weights, [], loss) 75 | train = K.function([self.model.input, a, y], [loss], updates=updates) 76 | 77 | return train 78 | 79 | # approximate Q function using Convolution Neural Network 80 | # state is input and Q Value of each action is output of network 81 | def build_model(self): 82 | model = Sequential() 83 | model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', 84 | input_shape=self.state_size)) 85 | model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) 86 | model.add(Conv2D(64, (3, 3), strides=(1, 1), 
activation='relu')) 87 | model.add(Flatten()) 88 | model.add(Dense(512, activation='relu')) 89 | model.add(Dense(self.action_size)) 90 | model.summary() 91 | 92 | return model 93 | 94 | # after some time interval update the target model to be same with model 95 | def update_target_model(self): 96 | self.target_model.set_weights(self.model.get_weights()) 97 | 98 | # get action from model using epsilon-greedy policy 99 | def get_action(self, history): 100 | history = np.float32(history / 255.0) 101 | if np.random.rand() <= self.epsilon: 102 | return random.randrange(self.action_size) 103 | else: 104 | q_value = self.model.predict(history) 105 | return np.argmax(q_value[0]) 106 | 107 | # save sample to the replay memory 108 | def replay_memory(self, history, action, reward, next_history, dead): 109 | self.memory.append((history, action, reward, next_history, dead)) 110 | 111 | # pick samples randomly from replay memory (with batch_size) 112 | def train_replay(self): 113 | if len(self.memory) < self.train_start: 114 | return 115 | if self.epsilon > self.epsilon_end: 116 | self.epsilon -= self.epsilon_decay_step 117 | 118 | mini_batch = random.sample(self.memory, self.batch_size) 119 | 120 | history = np.zeros((self.batch_size, self.state_size[0], 121 | self.state_size[1], self.state_size[2])) 122 | next_history = np.zeros((self.batch_size, self.state_size[0], 123 | self.state_size[1], self.state_size[2])) 124 | target = np.zeros((self.batch_size, )) 125 | action, reward, dead = [], [], [] 126 | 127 | for i in range(self.batch_size): 128 | history[i] = np.float32(mini_batch[i][0] / 255.) 129 | next_history[i] = np.float32(mini_batch[i][3] / 255.) 130 | action.append(mini_batch[i][1]) 131 | reward.append(mini_batch[i][2]) 132 | dead.append(mini_batch[i][4]) 133 | 134 | value = self.model.predict(next_history) 135 | target_value = self.target_model.predict(next_history) 136 | 137 | # like Q Learning, get maximum Q value at s' 138 | # But from target model 139 | for i in range(self.batch_size): 140 | if dead[i]: 141 | target[i] = reward[i] 142 | else: 143 | # the key point of Double DQN 144 | # selection of action is from model 145 | # update is from target model 146 | target[i] = reward[i] + self.discount_factor * \ 147 | target_value[i][np.argmax(value[i])] 148 | 149 | loss = self.optimizer([history, action, target]) 150 | self.avg_loss += loss[0] 151 | 152 | # make summary operators for tensorboard 153 | def setup_summary(self): 154 | episode_total_reward = tf.Variable(0.) 155 | episode_avg_max_q = tf.Variable(0.) 156 | episode_duration = tf.Variable(0.) 157 | episode_avg_loss = tf.Variable(0.) 
158 | 159 | tf.summary.scalar('Total Reward/Episode', episode_total_reward) 160 | tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) 161 | tf.summary.scalar('Duration/Episode', episode_duration) 162 | tf.summary.scalar('Average Loss/Episode', episode_avg_loss) 163 | 164 | summary_vars = [episode_total_reward, episode_avg_max_q, 165 | episode_duration, episode_avg_loss] 166 | summary_placeholders = [tf.placeholder(tf.float32) for _ in 167 | range(len(summary_vars))] 168 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in 169 | range(len(summary_vars))] 170 | summary_op = tf.summary.merge_all() 171 | return summary_placeholders, update_ops, summary_op 172 | 173 | 174 | # 210*160*3(color) --> 84*84(mono) 175 | # float --> integer (to reduce the size of replay memory) 176 | def pre_processing(observe): 177 | processed_observe = np.uint8( 178 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255) 179 | return processed_observe 180 | 181 | 182 | if __name__ == "__main__": 183 | # In case of BreakoutDeterministic-v4, always skip 4 frames 184 | # Deterministic-v4 version use 4 actions 185 | env = gym.make('BreakoutDeterministic-v4') 186 | agent = DDQNAgent(action_size=3) 187 | 188 | scores, episodes, global_step = [], [], 0 189 | 190 | for e in range(EPISODES): 191 | done = False 192 | dead = False 193 | # 1 episode = 5 lives 194 | step, score, start_life = 0, 0, 5 195 | observe = env.reset() 196 | 197 | # this is one of DeepMind's idea. 198 | # just do nothing at the start of episode to avoid sub-optimal 199 | for _ in range(random.randint(1, agent.no_op_steps)): 200 | observe, _, _, _ = env.step(1) 201 | 202 | # At start of episode, there is no preceding frame. 203 | # So just copy initial states to make history 204 | state = pre_processing(observe) 205 | history = np.stack((state, state, state, state), axis=2) 206 | history = np.reshape([history], (1, 84, 84, 4)) 207 | 208 | while not done: 209 | if agent.render: 210 | env.render() 211 | global_step += 1 212 | step += 1 213 | 214 | # get action for the current history and go one step in environment 215 | action = agent.get_action(history) 216 | # change action to real_action 217 | if action == 0: real_action = 1 218 | elif action == 1: real_action = 2 219 | else: real_action = 3 220 | 221 | observe, reward, done, info = env.step(real_action) 222 | # pre-process the observation --> history 223 | next_state = pre_processing(observe) 224 | next_state = np.reshape([next_state], (1, 84, 84, 1)) 225 | next_history = np.append(next_state, history[:, :, :, :3], axis=3) 226 | 227 | agent.avg_q_max += np.amax( 228 | agent.model.predict(np.float32(history / 255.))[0]) 229 | 230 | # if the agent missed ball, agent is dead --> episode is not over 231 | if start_life > info['ale.lives']: 232 | dead = True 233 | start_life = info['ale.lives'] 234 | 235 | reward = np.clip(reward, -1., 1.) 
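# (Reward clipping to [-1, 1] is the standard DeepMind trick: it keeps the
#  scale of the TD error comparable across games and matches the clipped,
#  Huber-style loss defined in optimizer() above.)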
236 | 237 | # save the sample to the replay memory 238 | agent.replay_memory(history, action, reward, next_history, dead) 239 | # every some time interval, train model 240 | agent.train_replay() 241 | # update the target model with model 242 | if global_step % agent.update_target_rate == 0: 243 | agent.update_target_model() 244 | 245 | score += reward 246 | 247 | # if agent is dead, then reset the history 248 | if dead: 249 | dead = False 250 | else: 251 | history = next_history 252 | 253 | # if done, plot the score over episodes 254 | if done: 255 | if global_step > agent.train_start: 256 | stats = [score, agent.avg_q_max / float(step), step, 257 | agent.avg_loss / float(step)] 258 | for i in range(len(stats)): 259 | agent.sess.run(agent.update_ops[i], feed_dict={ 260 | agent.summary_placeholders[i]: float(stats[i]) 261 | }) 262 | summary_str = agent.sess.run(agent.summary_op) 263 | agent.summary_writer.add_summary(summary_str, e + 1) 264 | 265 | print("episode:", e, " score:", score, " memory length:", 266 | len(agent.memory), " epsilon:", agent.epsilon, 267 | " global_step:", global_step, " average_q:", 268 | agent.avg_q_max/float(step), " average loss:", 269 | agent.avg_loss/float(step)) 270 | 271 | agent.avg_q_max, agent.avg_loss = 0, 0 272 | 273 | if e % 1000 == 0: 274 | agent.model.save_weights("./save_model/breakout_ddqn.h5") 275 | -------------------------------------------------------------------------------- /3-atari/1-breakout/breakout_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | import tensorflow as tf 5 | from collections import deque 6 | from skimage.color import rgb2gray 7 | from skimage.transform import resize 8 | from keras.models import Sequential 9 | from keras.optimizers import RMSprop 10 | from keras.layers import Dense, Flatten 11 | from keras.layers.convolutional import Conv2D 12 | from keras import backend as K 13 | 14 | EPISODES = 50000 15 | 16 | 17 | class DQNAgent: 18 | def __init__(self, action_size): 19 | self.render = False 20 | self.load_model = False 21 | # environment settings 22 | self.state_size = (84, 84, 4) 23 | self.action_size = action_size 24 | # parameters about epsilon 25 | self.epsilon = 1. 26 | self.epsilon_start, self.epsilon_end = 1.0, 0.1 27 | self.exploration_steps = 1000000. 
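# epsilon is annealed linearly from epsilon_start (1.0) to epsilon_end (0.1):
# train_replay() subtracts epsilon_decay_step once per training step, so the
# anneal takes about exploration_steps (1,000,000) steps once training starts.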
28 | self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ 29 | / self.exploration_steps 30 | # parameters about training 31 | self.batch_size = 32 32 | self.train_start = 50000 33 | self.update_target_rate = 10000 34 | self.discount_factor = 0.99 35 | self.memory = deque(maxlen=400000) 36 | self.no_op_steps = 30 37 | # build model 38 | self.model = self.build_model() 39 | self.target_model = self.build_model() 40 | self.update_target_model() 41 | 42 | self.optimizer = self.optimizer() 43 | 44 | self.sess = tf.InteractiveSession() 45 | K.set_session(self.sess) 46 | 47 | self.avg_q_max, self.avg_loss = 0, 0 48 | self.summary_placeholders, self.update_ops, self.summary_op = \ 49 | self.setup_summary() 50 | self.summary_writer = tf.summary.FileWriter( 51 | 'summary/breakout_dqn', self.sess.graph) 52 | self.sess.run(tf.global_variables_initializer()) 53 | 54 | if self.load_model: 55 | self.model.load_weights("./save_model/breakout_dqn.h5") 56 | 57 | # if the error is in [-1, 1], then the cost is quadratic to the error 58 | # But outside the interval, the cost is linear to the error 59 | def optimizer(self): 60 | a = K.placeholder(shape=(None,), dtype='int32') 61 | y = K.placeholder(shape=(None,), dtype='float32') 62 | 63 | py_x = self.model.output 64 | 65 | a_one_hot = K.one_hot(a, self.action_size) 66 | q_value = K.sum(py_x * a_one_hot, axis=1) 67 | error = K.abs(y - q_value) 68 | 69 | quadratic_part = K.clip(error, 0.0, 1.0) 70 | linear_part = error - quadratic_part 71 | loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) 72 | 73 | optimizer = RMSprop(lr=0.00025, epsilon=0.01) 74 | updates = optimizer.get_updates(self.model.trainable_weights, [], loss) 75 | train = K.function([self.model.input, a, y], [loss], updates=updates) 76 | 77 | return train 78 | 79 | # approximate Q function using Convolution Neural Network 80 | # state is input and Q Value of each action is output of network 81 | def build_model(self): 82 | model = Sequential() 83 | model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', 84 | input_shape=self.state_size)) 85 | model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) 86 | model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) 87 | model.add(Flatten()) 88 | model.add(Dense(512, activation='relu')) 89 | model.add(Dense(self.action_size)) 90 | model.summary() 91 | return model 92 | 93 | # after some time interval update the target model to be same with model 94 | def update_target_model(self): 95 | self.target_model.set_weights(self.model.get_weights()) 96 | 97 | # get action from model using epsilon-greedy policy 98 | def get_action(self, history): 99 | history = np.float32(history / 255.0) 100 | if np.random.rand() <= self.epsilon: 101 | return random.randrange(self.action_size) 102 | else: 103 | q_value = self.model.predict(history) 104 | return np.argmax(q_value[0]) 105 | 106 | # save sample to the replay memory 107 | def replay_memory(self, history, action, reward, next_history, dead): 108 | self.memory.append((history, action, reward, next_history, dead)) 109 | 110 | # pick samples randomly from replay memory (with batch_size) 111 | def train_replay(self): 112 | if len(self.memory) < self.train_start: 113 | return 114 | if self.epsilon > self.epsilon_end: 115 | self.epsilon -= self.epsilon_decay_step 116 | 117 | mini_batch = random.sample(self.memory, self.batch_size) 118 | 119 | history = np.zeros((self.batch_size, self.state_size[0], 120 | self.state_size[1], self.state_size[2])) 121 | next_history = 
np.zeros((self.batch_size, self.state_size[0], 122 | self.state_size[1], self.state_size[2])) 123 | target = np.zeros((self.batch_size,)) 124 | action, reward, dead = [], [], [] 125 | 126 | for i in range(self.batch_size): 127 | history[i] = np.float32(mini_batch[i][0] / 255.) 128 | next_history[i] = np.float32(mini_batch[i][3] / 255.) 129 | action.append(mini_batch[i][1]) 130 | reward.append(mini_batch[i][2]) 131 | dead.append(mini_batch[i][4]) 132 | 133 | target_value = self.target_model.predict(next_history) 134 | 135 | # like Q Learning, get maximum Q value at s' 136 | # But from target model 137 | for i in range(self.batch_size): 138 | if dead[i]: 139 | target[i] = reward[i] 140 | else: 141 | target[i] = reward[i] + self.discount_factor * \ 142 | np.amax(target_value[i]) 143 | 144 | loss = self.optimizer([history, action, target]) 145 | self.avg_loss += loss[0] 146 | 147 | def save_model(self, name): 148 | self.model.save_weights(name) 149 | 150 | # make summary operators for tensorboard 151 | def setup_summary(self): 152 | episode_total_reward = tf.Variable(0.) 153 | episode_avg_max_q = tf.Variable(0.) 154 | episode_duration = tf.Variable(0.) 155 | episode_avg_loss = tf.Variable(0.) 156 | 157 | tf.summary.scalar('Total Reward/Episode', episode_total_reward) 158 | tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) 159 | tf.summary.scalar('Duration/Episode', episode_duration) 160 | tf.summary.scalar('Average Loss/Episode', episode_avg_loss) 161 | 162 | summary_vars = [episode_total_reward, episode_avg_max_q, 163 | episode_duration, episode_avg_loss] 164 | summary_placeholders = [tf.placeholder(tf.float32) for _ in 165 | range(len(summary_vars))] 166 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in 167 | range(len(summary_vars))] 168 | summary_op = tf.summary.merge_all() 169 | return summary_placeholders, update_ops, summary_op 170 | 171 | 172 | # 210*160*3(color) --> 84*84(mono) 173 | # float --> integer (to reduce the size of replay memory) 174 | def pre_processing(observe): 175 | processed_observe = np.uint8( 176 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255) 177 | return processed_observe 178 | 179 | 180 | if __name__ == "__main__": 181 | # In case of BreakoutDeterministic-v3, always skip 4 frames 182 | # Deterministic-v4 version use 4 actions 183 | env = gym.make('BreakoutDeterministic-v4') 184 | agent = DQNAgent(action_size=3) 185 | 186 | scores, episodes, global_step = [], [], 0 187 | 188 | for e in range(EPISODES): 189 | done = False 190 | dead = False 191 | # 1 episode = 5 lives 192 | step, score, start_life = 0, 0, 5 193 | observe = env.reset() 194 | 195 | # this is one of DeepMind's idea. 
196 | # just do nothing at the start of episode to avoid sub-optimal 197 | for _ in range(random.randint(1, agent.no_op_steps)): 198 | observe, _, _, _ = env.step(1) 199 | 200 | # At start of episode, there is no preceding frame 201 | # So just copy initial states to make history 202 | state = pre_processing(observe) 203 | history = np.stack((state, state, state, state), axis=2) 204 | history = np.reshape([history], (1, 84, 84, 4)) 205 | 206 | while not done: 207 | if agent.render: 208 | env.render() 209 | global_step += 1 210 | step += 1 211 | 212 | # get action for the current history and go one step in environment 213 | action = agent.get_action(history) 214 | # change action to real_action 215 | if action == 0: 216 | real_action = 1 217 | elif action == 1: 218 | real_action = 2 219 | else: 220 | real_action = 3 221 | 222 | observe, reward, done, info = env.step(real_action) 223 | # pre-process the observation --> history 224 | next_state = pre_processing(observe) 225 | next_state = np.reshape([next_state], (1, 84, 84, 1)) 226 | next_history = np.append(next_state, history[:, :, :, :3], axis=3) 227 | 228 | agent.avg_q_max += np.amax( 229 | agent.model.predict(np.float32(history / 255.))[0]) 230 | 231 | # if the agent missed ball, agent is dead --> episode is not over 232 | if start_life > info['ale.lives']: 233 | dead = True 234 | start_life = info['ale.lives'] 235 | 236 | reward = np.clip(reward, -1., 1.) 237 | 238 | # save the sample to the replay memory 239 | agent.replay_memory(history, action, reward, next_history, dead) 240 | # every some time interval, train model 241 | agent.train_replay() 242 | # update the target model with model 243 | if global_step % agent.update_target_rate == 0: 244 | agent.update_target_model() 245 | 246 | score += reward 247 | 248 | # if agent is dead, then reset the history 249 | if dead: 250 | dead = False 251 | else: 252 | history = next_history 253 | 254 | # if done, plot the score over episodes 255 | if done: 256 | if global_step > agent.train_start: 257 | stats = [score, agent.avg_q_max / float(step), step, 258 | agent.avg_loss / float(step)] 259 | for i in range(len(stats)): 260 | agent.sess.run(agent.update_ops[i], feed_dict={ 261 | agent.summary_placeholders[i]: float(stats[i]) 262 | }) 263 | summary_str = agent.sess.run(agent.summary_op) 264 | agent.summary_writer.add_summary(summary_str, e + 1) 265 | 266 | print("episode:", e, " score:", score, " memory length:", 267 | len(agent.memory), " epsilon:", agent.epsilon, 268 | " global_step:", global_step, " average_q:", 269 | agent.avg_q_max / float(step), " average loss:", 270 | agent.avg_loss / float(step)) 271 | 272 | agent.avg_q_max, agent.avg_loss = 0, 0 273 | 274 | if e % 1000 == 0: 275 | agent.model.save_weights("./save_model/breakout_dqn.h5") 276 | -------------------------------------------------------------------------------- /3-atari/1-breakout/breakout_dueling_ddqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | import tensorflow as tf 5 | from collections import deque 6 | from skimage.color import rgb2gray 7 | from skimage.transform import resize 8 | from keras.models import Model 9 | from keras.optimizers import RMSprop 10 | from keras.layers import Input, Dense, Flatten, Lambda, merge 11 | from keras.layers.convolutional import Conv2D 12 | from keras import backend as K 13 | 14 | EPISODES = 50000 15 | 16 | 17 | class DuelingDDQNAgent: 18 | def __init__(self, action_size): 19 | self.render 
= False 20 | self.load_model = False 21 | # environment settings 22 | self.state_size = (84, 84, 4) 23 | self.action_size = action_size 24 | # parameters about epsilon 25 | self.epsilon = 1. 26 | self.epsilon_start, self.epsilon_end = 1.0, 0.1 27 | self.exploration_steps = 1000000. 28 | self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ 29 | / self.exploration_steps 30 | # parameters about training 31 | self.batch_size = 32 32 | self.train_start = 50000 33 | self.update_target_rate = 10000 34 | self.discount_factor = 0.99 35 | self.memory = deque(maxlen=400000) 36 | self.no_op_steps = 30 37 | # build 38 | self.model = self.build_model() 39 | self.target_model = self.build_model() 40 | self.update_target_model() 41 | 42 | self.optimizer = self.optimizer() 43 | 44 | self.sess = tf.InteractiveSession() 45 | K.set_session(self.sess) 46 | 47 | self.avg_q_max, self.avg_loss = 0, 0 48 | self.summary_placeholders, self.update_ops, self.summary_op = \ 49 | self.setup_summary() 50 | self.summary_writer = tf.summary.FileWriter( 51 | 'summary/breakout_dueling_ddqn', self.sess.graph) 52 | self.sess.run(tf.global_variables_initializer()) 53 | 54 | if self.load_model: 55 | self.model.load_weights("./save_model/breakout_dueling_ddqb.h5") 56 | 57 | # if the error is in [-1, 1], then the cost is quadratic to the error 58 | # But outside the interval, the cost is linear to the error 59 | def optimizer(self): 60 | a = K.placeholder(shape=(None, ), dtype='int32') 61 | y = K.placeholder(shape=(None, ), dtype='float32') 62 | 63 | py_x = self.model.output 64 | 65 | a_one_hot = K.one_hot(a, self.action_size) 66 | q_value = K.sum(py_x * a_one_hot, axis=1) 67 | error = K.abs(y - q_value) 68 | 69 | quadratic_part = K.clip(error, 0.0, 1.0) 70 | linear_part = error - quadratic_part 71 | loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) 72 | 73 | optimizer = RMSprop(lr=0.00025, epsilon=0.01) 74 | updates = optimizer.get_updates(self.model.trainable_weights, [], loss) 75 | train = K.function([self.model.input, a, y], [loss], updates=updates) 76 | 77 | return train 78 | 79 | # approximate Q function using Convolution Neural Network 80 | # state is input and Q Value of each action is output of network 81 | # dueling network's Q Value is sum of advantages and state value 82 | def build_model(self): 83 | input = Input(shape=self.state_size) 84 | shared = Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input) 85 | shared = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(shared) 86 | shared = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(shared) 87 | flatten = Flatten()(shared) 88 | 89 | # network separate state value and advantages 90 | advantage_fc = Dense(512, activation='relu')(flatten) 91 | advantage = Dense(self.action_size)(advantage_fc) 92 | advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), 93 | output_shape=(self.action_size,))(advantage) 94 | 95 | value_fc = Dense(512, activation='relu')(flatten) 96 | value = Dense(1)(value_fc) 97 | value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), 98 | output_shape=(self.action_size,))(value) 99 | 100 | # network merged and make Q Value 101 | q_value = merge([value, advantage], mode='sum') 102 | model = Model(inputs=input, outputs=q_value) 103 | model.summary() 104 | 105 | return model 106 | 107 | # after some time interval update the target model to be same with model 108 | def update_target_model(self): 109 | self.target_model.set_weights(self.model.get_weights()) 110 | 111 | # get action from model using 
epsilon-greedy policy 112 | def get_action(self, history): 113 | history = np.float32(history / 255.0) 114 | if np.random.rand() <= self.epsilon: 115 | return random.randrange(self.action_size) 116 | else: 117 | q_value = self.model.predict(history) 118 | return np.argmax(q_value[0]) 119 | 120 | # save sample to the replay memory 121 | def replay_memory(self, history, action, reward, next_history, dead): 122 | self.memory.append((history, action, reward, next_history, dead)) 123 | 124 | # pick samples randomly from replay memory (with batch_size) 125 | def train_replay(self): 126 | if len(self.memory) < self.train_start: 127 | return 128 | if self.epsilon > self.epsilon_end: 129 | self.epsilon -= self.epsilon_decay_step 130 | 131 | mini_batch = random.sample(self.memory, self.batch_size) 132 | 133 | history = np.zeros((self.batch_size, self.state_size[0], 134 | self.state_size[1], self.state_size[2])) 135 | next_history = np.zeros((self.batch_size, self.state_size[0], 136 | self.state_size[1], self.state_size[2])) 137 | target = np.zeros((self.batch_size, )) 138 | action, reward, dead = [], [], [] 139 | 140 | for i in range(self.batch_size): 141 | history[i] = np.float32(mini_batch[i][0] / 255.) 142 | next_history[i] = np.float32(mini_batch[i][3] / 255.) 143 | action.append(mini_batch[i][1]) 144 | reward.append(mini_batch[i][2]) 145 | dead.append(mini_batch[i][4]) 146 | 147 | value = self.model.predict(history) 148 | target_value = self.target_model.predict(next_history) 149 | 150 | # like Q Learning, get maximum Q value at s' 151 | # But from target model 152 | for i in range(self.batch_size): 153 | if dead[i]: 154 | target[i] = reward[i] 155 | else: 156 | # the key point of Double DQN 157 | # selection of action is from model 158 | # update is from target model 159 | target[i] = reward[i] + self.discount_factor * \ 160 | target_value[i][np.argmax(value[i])] 161 | 162 | loss = self.optimizer([history, action, target]) 163 | self.avg_loss += loss[0] 164 | 165 | def setup_summary(self): 166 | episode_total_reward = tf.Variable(0.) 167 | episode_avg_max_q = tf.Variable(0.) 168 | episode_duration = tf.Variable(0.) 169 | episode_avg_loss = tf.Variable(0.) 
170 | 171 | tf.summary.scalar('Total Reward/Episode', episode_total_reward) 172 | tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) 173 | tf.summary.scalar('Duration/Episode', episode_duration) 174 | tf.summary.scalar('Average Loss/Episode', episode_avg_loss) 175 | 176 | summary_vars = [episode_total_reward, episode_avg_max_q, 177 | episode_duration, episode_avg_loss] 178 | summary_placeholders = [tf.placeholder(tf.float32) for _ in 179 | range(len(summary_vars))] 180 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in 181 | range(len(summary_vars))] 182 | summary_op = tf.summary.merge_all() 183 | return summary_placeholders, update_ops, summary_op 184 | 185 | 186 | # 210*160*3(color) --> 84*84(mono) 187 | # float --> integer (to reduce the size of replay memory) 188 | def pre_processing(observe): 189 | processed_observe = np.uint8( 190 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255) 191 | return processed_observe 192 | 193 | 194 | if __name__ == "__main__": 195 | # In case of BreakoutDeterministic-v3, always skip 4 frames 196 | # Deterministic-v4 version use 4 actions 197 | env = gym.make('BreakoutDeterministic-v4') 198 | agent = DuelingDDQNAgent(action_size=3) 199 | 200 | scores, episodes, global_step = [], [], 0 201 | 202 | for e in range(EPISODES): 203 | done = False 204 | dead = False 205 | # 1 episode = 5 lives 206 | step, score, start_life = 0, 0, 5 207 | observe = env.reset() 208 | 209 | # this is one of DeepMind's idea. 210 | # just do nothing at the start of episode to avoid sub-optimal 211 | for _ in range(random.randint(1, agent.no_op_steps)): 212 | observe, _, _, _ = env.step(1) 213 | 214 | # At start of episode, there is no preceding frame. 215 | # So just copy initial states to make history 216 | state = pre_processing(observe) 217 | history = np.stack((state, state, state, state), axis=2) 218 | history = np.reshape([history], (1, 84, 84, 4)) 219 | 220 | while not done: 221 | if agent.render: 222 | env.render() 223 | global_step += 1 224 | step += 1 225 | 226 | # get action for the current history and go one step in environment 227 | action = agent.get_action(history) 228 | # change action to real_action 229 | if action == 0: real_action = 1 230 | elif action == 1: real_action = 2 231 | else: real_action = 3 232 | 233 | observe, reward, done, info = env.step(real_action) 234 | # pre-process the observation --> history 235 | next_state = pre_processing(observe) 236 | next_state = np.reshape([next_state], (1, 84, 84, 1)) 237 | next_history = np.append(next_state, history[:, :, :, :3], axis=3) 238 | 239 | agent.avg_q_max += np.amax( 240 | agent.model.predict(np.float32(history / 255.))[0]) 241 | 242 | # if the agent missed ball, agent is dead --> episode is not over 243 | if start_life > info['ale.lives']: 244 | dead = True 245 | start_life = info['ale.lives'] 246 | 247 | reward = np.clip(reward, -1., 1.) 
248 | 249 | # save the sample to the replay memory 250 | agent.replay_memory(history, action, reward, next_history, dead) 251 | # every some time interval, train model 252 | agent.train_replay() 253 | # update the target model with model 254 | if global_step % agent.update_target_rate == 0: 255 | agent.update_target_model() 256 | 257 | score += reward 258 | 259 | # if agent is dead, then reset the history 260 | if dead: 261 | dead = False 262 | else: 263 | history = next_history 264 | 265 | # if done, plot the score over episodes 266 | if done: 267 | if global_step > agent.train_start: 268 | stats = [score, agent.avg_q_max / float(step), step, 269 | agent.avg_loss / float(step)] 270 | for i in range(len(stats)): 271 | agent.sess.run(agent.update_ops[i], feed_dict={ 272 | agent.summary_placeholders[i]: float(stats[i]) 273 | }) 274 | summary_str = agent.sess.run(agent.summary_op) 275 | agent.summary_writer.add_summary(summary_str, e + 1) 276 | 277 | print("episode:", e, " score:", score, " memory length:", 278 | len(agent.memory), " epsilon:", agent.epsilon, 279 | " global_step:", global_step, " average_q:", 280 | agent.avg_q_max/float(step), " average loss:", 281 | agent.avg_loss/float(step)) 282 | 283 | agent.avg_q_max, agent.avg_loss = 0, 0 284 | 285 | if e % 1000 == 0: 286 | agent.model.save_weights("./save_model/breakout_dueling_ddqn.h5") 287 | -------------------------------------------------------------------------------- /3-atari/1-breakout/play_a3c_model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | from skimage.color import rgb2gray 5 | from skimage.transform import resize 6 | from keras.models import Model 7 | from keras.layers import Dense, Flatten, Input 8 | from keras.layers.convolutional import Conv2D 9 | 10 | global episode 11 | episode = 0 12 | EPISODES = 8000000 13 | env_name = "BreakoutDeterministic-v4" 14 | 15 | class TestAgent: 16 | def __init__(self, action_size): 17 | self.state_size = (84, 84, 4) 18 | self.action_size = action_size 19 | 20 | self.discount_factor = 0.99 21 | self.no_op_steps = 30 22 | 23 | self.actor, self.critic = self.build_model() 24 | 25 | def build_model(self): 26 | input = Input(shape=self.state_size) 27 | conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) 28 | conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) 29 | conv = Flatten()(conv) 30 | fc = Dense(256, activation='relu')(conv) 31 | policy = Dense(self.action_size, activation='softmax')(fc) 32 | value = Dense(1, activation='linear')(fc) 33 | 34 | actor = Model(inputs=input, outputs=policy) 35 | critic = Model(inputs=input, outputs=value) 36 | 37 | actor.summary() 38 | critic.summary() 39 | 40 | return actor, critic 41 | 42 | def get_action(self, history): 43 | history = np.float32(history / 255.) 
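# the stacked history comes in as uint8 in [0, 255] (see pre_processing below);
# it is scaled to [0, 1] float32 here before the forward pass.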
44 | policy = self.actor.predict(history)[0] 45 | 46 | action_index = np.argmax(policy) 47 | return action_index 48 | 49 | def load_model(self, name): 50 | self.actor.load_weights(name) 51 | 52 | def pre_processing(next_observe, observe): 53 | processed_observe = np.maximum(next_observe, observe) 54 | processed_observe = np.uint8( 55 | resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) 56 | return processed_observe 57 | 58 | 59 | if __name__ == "__main__": 60 | env = gym.make(env_name) 61 | agent = TestAgent(action_size=3) 62 | agent.load_model("save_model/breakout_a3c_5_actor.h5") 63 | 64 | step = 0 65 | 66 | while episode < EPISODES: 67 | done = False 68 | dead = False 69 | 70 | score, start_life = 0, 5 71 | observe = env.reset() 72 | next_observe = observe 73 | 74 | for _ in range(random.randint(1, 20)): 75 | observe = next_observe 76 | next_observe, _, _, _ = env.step(1) 77 | 78 | state = pre_processing(next_observe, observe) 79 | history = np.stack((state, state, state, state), axis=2) 80 | history = np.reshape([history], (1, 84, 84, 4)) 81 | 82 | while not done: 83 | env.render() 84 | step += 1 85 | observe = next_observe 86 | 87 | action = agent.get_action(history) 88 | 89 | if action == 1: 90 | fake_action = 2 91 | elif action == 2: 92 | fake_action = 3 93 | else: 94 | fake_action = 1 95 | 96 | if dead: 97 | fake_action = 1 98 | dead = False 99 | 100 | next_observe, reward, done, info = env.step(fake_action) 101 | 102 | next_state = pre_processing(next_observe, observe) 103 | next_state = np.reshape([next_state], (1, 84, 84, 1)) 104 | next_history = np.append(next_state, history[:, :, :, :3], axis=3) 105 | 106 | if start_life > info['ale.lives']: 107 | dead = True 108 | reward = -1 109 | start_life = info['ale.lives'] 110 | 111 | score += reward 112 | 113 | # if agent is dead, then reset the history 114 | if dead: 115 | history = np.stack( 116 | (next_state, next_state, next_state, next_state), axis=2) 117 | history = np.reshape([history], (1, 84, 84, 4)) 118 | else: 119 | history = next_history 120 | 121 | # if done, plot the score over episodes 122 | if done: 123 | episode += 1 124 | print("episode:", episode, " score:", score, " step:", step) 125 | step = 0 -------------------------------------------------------------------------------- /3-atari/1-breakout/play_dqn_model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import numpy as np 4 | import tensorflow as tf 5 | from skimage.color import rgb2gray 6 | from skimage.transform import resize 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Flatten 9 | from keras.layers.convolutional import Conv2D 10 | from keras import backend as K 11 | 12 | EPISODES = 50000 13 | 14 | 15 | class TestAgent: 16 | def __init__(self, action_size): 17 | self.state_size = (84, 84, 4) 18 | self.action_size = action_size 19 | self.no_op_steps = 20 20 | 21 | self.model = self.build_model() 22 | 23 | self.sess = tf.InteractiveSession() 24 | K.set_session(self.sess) 25 | 26 | self.avg_q_max, self.avg_loss = 0, 0 27 | self.sess.run(tf.global_variables_initializer()) 28 | 29 | def build_model(self): 30 | model = Sequential() 31 | model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', 32 | input_shape=self.state_size)) 33 | model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) 34 | model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) 35 | model.add(Flatten()) 36 | model.add(Dense(512, activation='relu')) 37 | 
model.add(Dense(self.action_size)) 38 | model.summary() 39 | 40 | return model 41 | 42 | def get_action(self, history): 43 | if np.random.random() < 0.01: 44 | return random.randrange(3) 45 | history = np.float32(history / 255.0) 46 | q_value = self.model.predict(history) 47 | return np.argmax(q_value[0]) 48 | 49 | def load_model(self, filename): 50 | self.model.load_weights(filename) 51 | 52 | def pre_processing(observe): 53 | processed_observe = np.uint8( 54 | resize(rgb2gray(observe), (84, 84), mode='constant') * 255) 55 | return processed_observe 56 | 57 | 58 | if __name__ == "__main__": 59 | env = gym.make('BreakoutDeterministic-v4') 60 | agent = TestAgent(action_size=3) 61 | agent.load_model("./save_model/breakout_dqn_5.h5") 62 | 63 | for e in range(EPISODES): 64 | done = False 65 | dead = False 66 | 67 | step, score, start_life = 0, 0, 5 68 | observe = env.reset() 69 | 70 | for _ in range(random.randint(1, agent.no_op_steps)): 71 | observe, _, _, _ = env.step(1) 72 | 73 | state = pre_processing(observe) 74 | history = np.stack((state, state, state, state), axis=2) 75 | history = np.reshape([history], (1, 84, 84, 4)) 76 | 77 | while not done: 78 | env.render() 79 | step += 1 80 | 81 | action = agent.get_action(history) 82 | 83 | if action == 0: 84 | real_action = 1 85 | elif action == 1: 86 | real_action = 2 87 | else: 88 | real_action = 3 89 | 90 | if dead: 91 | real_action = 1 92 | dead = False 93 | 94 | observe, reward, done, info = env.step(real_action) 95 | 96 | next_state = pre_processing(observe) 97 | next_state = np.reshape([next_state], (1, 84, 84, 1)) 98 | next_history = np.append(next_state, history[:, :, :, :3], axis=3) 99 | 100 | if start_life > info['ale.lives']: 101 | dead = True 102 | start_life = info['ale.lives'] 103 | 104 | score += reward 105 | 106 | history = next_history 107 | 108 | if done: 109 | print("episode:", e, " score:", score) 110 | 111 | -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 -------------------------------------------------------------------------------- 
/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_dqn.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_dqn_1.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_1.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_dqn_2.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_2.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_dqn_3.h5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_3.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_dqn_4.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_4.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/save_model/breakout_dqn_5.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/save_model/breakout_dqn_5.h5 -------------------------------------------------------------------------------- /3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 -------------------------------------------------------------------------------- /3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name -------------------------------------------------------------------------------- /3-atari/2-pong/README.md: -------------------------------------------------------------------------------- 1 | # Policy Gradient 2 | 3 | Minimal implementation of Stochastic Policy Gradient Algorithm in Keras 4 | 5 | ## Pong Agent 6 | 7 | ![pg](assetsg.gif) 8 | 9 | 10 | This PG agent seems to get more frequent wins after about 8000 episodes. Below is the score graph. 
11 | 12 | 13 | ![score](assetscore.png) 14 | -------------------------------------------------------------------------------- /3-atari/2-pong/assets/pg.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/assets/pg.gif -------------------------------------------------------------------------------- /3-atari/2-pong/assets/score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/assets/score.png -------------------------------------------------------------------------------- /3-atari/2-pong/pong_a3c.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/pong_a3c.py -------------------------------------------------------------------------------- /3-atari/2-pong/pong_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from keras.models import Sequential 4 | from keras.layers import Dense, Reshape, Flatten 5 | from keras.optimizers import Adam 6 | from keras.layers.convolutional import Convolution2D 7 | 8 | 9 | class PGAgent: 10 | def __init__(self, state_size, action_size): 11 | self.state_size = state_size 12 | self.action_size = action_size 13 | self.gamma = 0.99 14 | self.learning_rate = 0.001 15 | self.states = [] 16 | self.gradients = [] 17 | self.rewards = [] 18 | self.probs = [] 19 | self.model = self._build_model() 20 | self.model.summary() 21 | 22 | def _build_model(self): 23 | model = Sequential() 24 | model.add(Reshape((1, 80, 80), input_shape=(self.state_size,))) 25 | model.add(Convolution2D(32, 6, 6, subsample=(3, 3), border_mode='same', 26 | activation='relu', init='he_uniform')) 27 | model.add(Flatten()) 28 | model.add(Dense(64, activation='relu', init='he_uniform')) 29 | model.add(Dense(32, activation='relu', init='he_uniform')) 30 | model.add(Dense(self.action_size, activation='softmax')) 31 | opt = Adam(lr=self.learning_rate) 32 | # See note regarding crossentropy in cartpole_reinforce.py 33 | model.compile(loss='categorical_crossentropy', optimizer=opt) 34 | return model 35 | 36 | def remember(self, state, action, prob, reward): 37 | y = np.zeros([self.action_size]) 38 | y[action] = 1 39 | self.gradients.append(np.array(y).astype('float32') - prob) 40 | self.states.append(state) 41 | self.rewards.append(reward) 42 | 43 | def act(self, state): 44 | state = state.reshape([1, state.shape[0]]) 45 | aprob = self.model.predict(state, batch_size=1).flatten() 46 | self.probs.append(aprob) 47 | prob = aprob / np.sum(aprob) 48 | action = np.random.choice(self.action_size, 1, p=prob)[0] 49 | return action, prob 50 | 51 | def discount_rewards(self, rewards): 52 | discounted_rewards = np.zeros_like(rewards) 53 | running_add = 0 54 | for t in reversed(range(0, rewards.size)): 55 | if rewards[t] != 0: 56 | running_add = 0 57 | running_add = running_add * self.gamma + rewards[t] 58 | discounted_rewards[t] = running_add 59 | return discounted_rewards 60 | 61 | def train(self): 62 | gradients = np.vstack(self.gradients) 63 | rewards = np.vstack(self.rewards) 64 | rewards = self.discount_rewards(rewards) 65 | rewards = rewards / np.std(rewards - 
np.mean(rewards)) 66 | gradients *= rewards 67 | X = np.squeeze(np.vstack([self.states])) 68 | Y = self.probs + self.learning_rate * np.squeeze(np.vstack([gradients])) 69 | self.model.train_on_batch(X, Y) 70 | self.states, self.probs, self.gradients, self.rewards = [], [], [], [] 71 | 72 | def load(self, name): 73 | self.model.load_weights(name) 74 | 75 | def save(self, name): 76 | self.model.save_weights(name) 77 | 78 | def preprocess(I): 79 | I = I[35:195] 80 | I = I[::2, ::2, 0] 81 | I[I == 144] = 0 82 | I[I == 109] = 0 83 | I[I != 0] = 1 84 | return I.astype(np.float).ravel() 85 | 86 | if __name__ == "__main__": 87 | env = gym.make("Pong-v0") 88 | state = env.reset() 89 | prev_x = None 90 | score = 0 91 | episode = 0 92 | 93 | state_size = 80 * 80 94 | action_size = env.action_space.n 95 | agent = PGAgent(state_size, action_size) 96 | agent.load('./save_model/pong_reinforce.h5') 97 | while True: 98 | env.render() 99 | 100 | cur_x = preprocess(state) 101 | x = cur_x - prev_x if prev_x is not None else np.zeros(state_size) 102 | prev_x = cur_x 103 | 104 | action, prob = agent.act(x) 105 | state, reward, done, info = env.step(action) 106 | score += reward 107 | agent.remember(x, action, prob, reward) 108 | 109 | if done: 110 | episode += 1 111 | agent.train() 112 | print('Episode: %d - Score: %f.' % (episode, score)) 113 | score = 0 114 | state = env.reset() 115 | prev_x = None 116 | if episode > 1 and episode % 10 == 0: 117 | agent.save('./save_model/pong_reinforce.h5') 118 | -------------------------------------------------------------------------------- /3-atari/2-pong/save_model/pong_reinforce.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/3-atari/2-pong/save_model/pong_reinforce.h5 -------------------------------------------------------------------------------- /3-atari/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Keon Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/4-gym/1-mountaincar/mountaincar_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import pylab
3 | import random
4 | import numpy as np
5 | from collections import deque
6 | from keras.layers import Dense
7 | from keras.optimizers import Adam
8 | from keras.models import Sequential
9 | 
10 | EPISODES = 4000
11 | 
12 | 
13 | class DQNAgent:
14 |     def __init__(self, state_size, action_size):
15 |         # Set to True to watch the agent while it trains
16 |         self.render = True
17 | 
18 |         # Sizes of the state and action spaces, used to build the model
19 |         self.state_size = state_size
20 |         self.action_size = action_size
21 | 
22 |         # Hyperparameters for DQN training;
23 |         # the replay memory is a deque with a fixed maximum length
24 |         self.discount_factor = 0.99
25 |         self.learning_rate = 0.001
26 |         self.epsilon = 1.0
27 |         self.epsilon_min = 0.005
28 |         self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
29 |         self.batch_size = 64
30 |         self.train_start = 1000
31 |         self.memory = deque(maxlen=10000)
32 | 
33 |         # Create the model to train and the target model
34 |         self.model = self.build_model()
35 |         self.target_model = self.build_model()
36 |         # Initialize the target model by copying the trained model's weights (they must start equal)
37 |         self.update_target_model()
38 | 
39 |     # Approximate the Q function with a deep neural network:
40 |     # the state is the input, the Q value of each action is the output
41 |     def build_model(self):
42 |         model = Sequential()
43 |         model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
44 |         model.add(Dense(16, activation='relu', kernel_initializer='he_uniform'))
45 |         model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
46 |         model.summary()
47 |         model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
48 |         return model
49 | 
50 |     # At fixed intervals, update the target model with the weights of the model being trained
51 |     def update_target_model(self):
52 |         self.target_model.set_weights(self.model.get_weights())
53 | 
54 |     # Select actions with an epsilon-greedy policy over the current network
55 |     def get_action(self, state):
56 |         if np.random.rand() <= self.epsilon:
57 |             return random.randrange(self.action_size)
58 |         else:
59 |             q_value = self.model.predict(state)
60 |             return np.argmax(q_value[0])
61 | 
62 |     # Save the (state, action, reward, next_state, done) sample to the replay memory; env action 2 is stored as index 1
63 |     def replay_memory(self, state, action, reward, next_state, done):
64 |         if action == 2:
65 |             action = 1
66 |         self.memory.append((state, action, reward, next_state, done))
67 |         if self.epsilon > self.epsilon_min:
68 |             self.epsilon -= self.epsilon_decay
69 |         # print(len(self.memory))
70 | 
71 |     # Sample a random batch from the replay memory and train on it
72 |     def train_replay(self):
73 |         if len(self.memory) < self.train_start:
74 |             return
75 |         batch_size = min(self.batch_size, len(self.memory))
76 |         mini_batch = random.sample(self.memory, batch_size)
77 | 
78 |         update_input = np.zeros((batch_size, self.state_size))
79 |         update_target = np.zeros((batch_size, self.action_size))
80 | 
81 |         for i in range(batch_size):
82 |             state, action, reward, next_state, done = mini_batch[i]
83 |             target = self.model.predict(state)[0]
84 | 
85 |             # As in Q-learning, take the maximum Q value at s', but from the target model
86 |             if done:
87 |                 target[action] = reward
88 |             else:
89 |                 target[action] = reward + self.discount_factor * \
90 |                     np.amax(self.target_model.predict(next_state)[0])
91 |             update_input[i] = state
92 |             update_target[i] = target
93 | 
94 |         # Build the minibatch of targets (the "labels") and current states, then update the model in one call
95 |         self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
96 | 
97 |     # Load a saved model
98 |     def load_model(self, name):
99 |         self.model.load_weights(name)
100 | 
101 |     # Save the trained model
102 |     def save_model(self, name):
103 |         self.model.save_weights(name)
104 | 
105 | 
106 | if __name__ == "__main__":
107 |     # MountainCar-v0 episodes are capped at 200 timesteps
108 |     env = gym.make('MountainCar-v0')
109 |     # Get the sizes of the state and action spaces from the environment
110 |     state_size = env.observation_space.shape[0]
111 |     # action_size = env.action_space.n
112 |     action_size = 2  # use only 2 of the 3 env actions (the no-op is skipped)
113 |     # Create the DQN agent
114 |     agent = DQNAgent(state_size, action_size)
115 |     agent.load_model("./save_model/MountainCar_DQN.h5")
116 |     scores, episodes = [], []
117 | 
118 |     for e in range(EPISODES):
119 |         done = False
120 |         score = 0
121 |         state = env.reset()
122 |         state = np.reshape(state, [1, state_size])
123 |         print(state)
124 | 
125 |         # Env actions: 0 = push left, 1 = no-op, 2 = push right; fake_action maps the agent's choice onto 0 or 2
126 |         fake_action = 0
127 |         action = 0  # initialize so the timesteps before the first decision are defined
128 |         # Counter used to repeat the same action for 4 timesteps
129 |         action_count = 0
130 | 
131 |         while not done:
132 |             if agent.render:
133 |                 env.render()
134 | 
135 |             # Every 4 timesteps, choose a new action for the current state
136 |             action_count = action_count + 1
137 | 
138 |             if action_count == 4:
139 |                 action = agent.get_action(state)
140 |                 action_count = 0
141 | 
142 |             if action == 0:
143 |                 fake_action = 0
144 |             elif action == 1:
145 |                 fake_action = 2
146 | 
147 |             # Take one step in the environment with the selected action
148 |             next_state, reward, done, info = env.step(fake_action)
149 |             next_state = np.reshape(next_state, [1, state_size])
150 |             # Give a -100 penalty for the action that ended the episode (disabled)
151 |             # reward = reward if not done else -100
152 | 
153 |             # Save the (s, a, r, s', done) sample to the replay memory
154 |             agent.replay_memory(state, fake_action, reward, next_state, done)
155 |             # Train at every timestep
156 |             agent.train_replay()
157 |             score += reward
158 |             state = next_state
159 | 
160 |             if done:
161 |                 env.reset()
162 |                 # At the end of every episode, copy the trained model into the target model
163 |                 agent.update_target_model()
164 | 
165 |                 # Record the score of each episode (plotting to a graph is disabled)
166 |                 scores.append(score)
167 |                 episodes.append(e)
168 |                 # pylab.plot(episodes, scores, 'b')
169 |                 # pylab.savefig("./save_graph/MountainCar_DQN.png")
170 |                 print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
171 |                       "  epsilon:", agent.epsilon)
172 | 
173 |         # Save the model every 50 episodes
174 |         if e % 50 == 0:
175 |             agent.save_model("./save_model/MountainCar_DQN.h5")
176 | 
--------------------------------------------------------------------------------
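Two implementation details in mountaincar_dqn.py are easy to miss: the network only ever sees two actions (MountainCar's no-op action 1 is skipped, and env action 2 is stored as index 1), and each chosen action is held for four consecutive timesteps. The sketch below isolates just that wrapper logic around a random policy; it assumes the gym 0.8-era 4-tuple `step()` API pinned in requirements.txt, and the names `ACTION_MAP` and `ACTION_REPEAT` are illustrative rather than taken from the script.

```python
import random

import gym

# Map the agent's 2 outputs onto MountainCar's env actions, skipping the no-op:
# index 0 -> env action 0 (push left), index 1 -> env action 2 (push right).
ACTION_MAP = {0: 0, 1: 2}   # illustrative name; mirrors the fake_action logic above
ACTION_REPEAT = 4           # hold each decision for 4 timesteps, like the training loop

env = gym.make('MountainCar-v0')
state = env.reset()
done = False
net_action = random.randrange(2)   # stand-in for agent.get_action(state)
steps_since_decision = 0

while not done:
    if steps_since_decision == ACTION_REPEAT:
        net_action = random.randrange(2)   # re-decide every ACTION_REPEAT steps
        steps_since_decision = 0
    steps_since_decision += 1
    state, reward, done, info = env.step(ACTION_MAP[net_action])

env.close()
```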
/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 RLCode
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
![Reinforcement Learning](/images/Reinforcement-Learning.png)
2 | 
3 | --------------------------------------------------------------------------------
4 | 
5 | > Minimal and clean examples of reinforcement learning algorithms presented by the [RLCode](https://rlcode.github.io) team. [[Korean]](https://github.com/rlcode/reinforcement-learning-kr)
6 | >
7 | > Maintainers - [Woongwon](https://github.com/dnddnjs), [Youngmoo](https://github.com/zzing0907), [Hyeokreal](https://github.com/Hyeokreal), [Uiryeong](https://github.com/wooridle), [Keon](https://github.com/keon)
8 | 
9 | From the basics to deep reinforcement learning, this repo provides easy-to-read code examples, one file per algorithm.
10 | Please feel free to create a [Pull Request](https://github.com/rlcode/reinforcement-learning/pulls), or open an [issue](https://github.com/rlcode/reinforcement-learning/issues)!
11 | 
12 | ## Dependencies
13 | 1. Python 3.5
14 | 2. Tensorflow 1.0.0
15 | 3. Keras
16 | 4. numpy
17 | 5. pandas
18 | 6. matplotlib
19 | 7. pillow
20 | 8. scikit-image
21 | 9. h5py
22 | 
23 | ### Install Requirements
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 
28 | ## Table of Contents
29 | 
30 | **Grid World** - Mastering the basics of reinforcement learning in the simplified world called "Grid World"
31 | 
32 | - [Policy Iteration](./1-grid-world/1-policy-iteration)
33 | - [Value Iteration](./1-grid-world/2-value-iteration)
34 | - [Monte Carlo](./1-grid-world/3-monte-carlo)
35 | - [SARSA](./1-grid-world/4-sarsa)
36 | - [Q-Learning](./1-grid-world/5-q-learning)
37 | - [Deep SARSA](./1-grid-world/6-deep-sarsa)
38 | - [REINFORCE](./1-grid-world/7-reinforce)
39 | 
40 | **CartPole** - Applying deep reinforcement learning to the basic CartPole game.
41 | 
42 | - [Deep Q Network](./2-cartpole/1-dqn)
43 | - [Double Deep Q Network](./2-cartpole/2-double-dqn)
44 | - [Policy Gradient](./2-cartpole/3-reinforce)
45 | - [Actor Critic (A2C)](./2-cartpole/4-actor-critic)
46 | - [Asynchronous Advantage Actor Critic (A3C)](./2-cartpole/5-a3c)
47 | 
48 | **Atari** - Mastering Atari games with Deep Reinforcement Learning
49 | 
50 | - **Breakout** - [DQN](./3-atari/1-breakout/breakout_dqn.py), [DDQN](./3-atari/1-breakout/breakout_ddqn.py), [Dueling DDQN](./3-atari/1-breakout/breakout_dueling_ddqn.py), [A3C](./3-atari/1-breakout/breakout_a3c.py)
51 | - **Pong** - [Policy Gradient](./3-atari/2-pong/pong_reinforce.py)
52 | 
53 | **OpenAI GYM** - [WIP]
54 | 
55 | - Mountain Car - [DQN](./4-gym/1-mountaincar)
56 | 
--------------------------------------------------------------------------------
/images/Reinforcement-Learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/images/Reinforcement-Learning.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Keras==2.0.3
2 | numpy==1.12.1
3 | pandas==0.19.2
4 | matplotlib==2.0.0
5 | tensorflow==1.0.0
6 | Pillow==4.1.0
7 | gym==0.8.1
8 | h5py==2.7.0
9 | scikit-image==0.13.0
--------------------------------------------------------------------------------
/wiki/how-to-windows.md:
--------------------------------------------------------------------------------
1 | # How to run the examples on Windows, step by step
2 | Machine learning projects have traditionally been easiest to run on Linux or macOS.
3 | 
4 | In this wiki you will learn how to configure your **Windows** environment so you can run the examples.
5 | ![](img/how-to-windows.png)
6 | 
7 | # Go for it
8 | The steps below use Anaconda, which is the most reliable route on Windows.
9 | 
10 | ## Python Interpreter:
11 | - Download & install [Anaconda](https://www.continuum.io/downloads); pick the Python 3.6, 64-bit installer.
12 | Test the installation from the Windows console:
13 | 
14 | ```
15 | python --version
16 | Python 3.6.0 :: Anaconda custom (64-bit)
17 | ```
18 | ## Set up a virtual env to run the examples
19 | ```
20 | # Create the env; conda accepts any Python version, but this repo targets 3.5
21 | conda create --name rl python=3.5
22 | 
23 | # Activate the env
24 | activate rl
25 | 
26 | # Install TensorFlow, the easy way
27 | conda install -c conda-forge tensorflow
28 | conda install -c anaconda scipy=0.19.0
29 | 
30 | mkdir examples
31 | cd examples
32 | git clone https://github.com/rlcode/reinforcement-learning
33 | cd reinforcement-learning
34 | 
35 | # Install the requirements
36 | pip install -r requirements.txt
37 | 
38 | # Check the installed packages
39 | conda list
40 | 
41 | # Test the code
42 | cd 1-grid-world\1-policy-iteration
43 | python policy_iteration.py
44 | ```
45 | 
46 | # Next steps
47 | - You need an IDE to manage the Python scripts comfortably. Download & install [PyCharm Community](https://www.jetbrains.com/pycharm/download/#section=windows); it's free.
48 | 
49 | ## Linking PyCharm with the Anaconda env
50 | - Open the project in PyCharm: File > Open > pick the folder (c:\examples\reinforcement-learning)
51 | - File > Settings > Project Interpreter > Add Local
52 | ![](img/link-env-with-pychar.png)
53 | 
54 | - Note: pick the Python interpreter of the env you created, e.g. located in c:\Anaconda3\envs\rl
55 | ![](img/link-env-with-pychar-1.png)
56 | 
57 | - If everything is set up correctly, you should see:
58 | ![](img/link-env-with-pychar-2.png)
59 | 
60 | - Now open any of the samples and run it.
61 | 
--------------------------------------------------------------------------------
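The Windows guide above finishes its check with `conda list`. A quicker end-to-end check is a short import script run inside the `rl` env; the version numbers in the comments come from requirements.txt, and `check_env.py` is only an illustrative file name, not a file in this repository.

```python
# check_env.py -- illustrative name, not a file in this repository.
# Run inside the "rl" env:  python check_env.py
import gym          # importing it at all proves the gym install worked
import keras
import numpy
import tensorflow

# Expected versions per requirements.txt
print("tensorflow:", tensorflow.__version__)   # 1.0.0
print("keras:", keras.__version__)             # 2.0.3
print("numpy:", numpy.__version__)             # 1.12.1
```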
/wiki/img/how-to-windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/how-to-windows.png
--------------------------------------------------------------------------------
/wiki/img/link-env-with-pychar-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/link-env-with-pychar-1.png
--------------------------------------------------------------------------------
/wiki/img/link-env-with-pychar-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/link-env-with-pychar-2.png
--------------------------------------------------------------------------------
/wiki/img/link-env-with-pychar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/img/link-env-with-pychar.png
--------------------------------------------------------------------------------
/wiki/install_guide_osx+ubuntu.md:
--------------------------------------------------------------------------------
1 | ## Development Environment Setup 1: Linux (Ubuntu)
2 | 
3 | Linux is the best-known open-source operating system. Because all of its source code is public there are many distributions, and Ubuntu is the one with the largest user base. A new Ubuntu version is released every six months; this book uses Ubuntu 14.04, released in the first half of 2014. The rest of this guide assumes Ubuntu 14.04 is already installed.
4 | 
5 | ### 2.1.1 Checking the Python version on Ubuntu
6 | 
7 | One advantage of Linux is that Python comes preinstalled. Python has a 2.x line and a 3.x line; this book uses `Python 3.5`. Press `Ctrl+Alt+T` to open a terminal,
8 | then type the following command and press Enter to see which Python version is installed.
9 | 
10 | ```shell
11 | $ python -V
12 | ```
13 | 
14 | Ubuntu 14.04 ships with both `Python 2.7` and `Python 3` preinstalled.
15 | 
16 | ### 2.1.2 Installing and configuring PyCharm Community
17 | 
18 | We are going to build reinforcement learning agents and train them in simulated environments, so we need an environment for writing and editing code, that is, an IDE (Integrated Development Environment). There are many IDEs; this book uses PyCharm as its Python IDE.
19 | 
20 | PyCharm can be downloaded from its official homepage[[1\]](#_ftn1), which offers Windows, Linux, and macOS builds. PyCharm comes in the paid `PyCharm Professional Edition` and the free `PyCharm Community Edition`. We will develop the agents with `PyCharm Community`, so the steps below cover the Community edition.
21 | 
22 | **Installation proceeds in the following order.**
23 | 
24 | 1. Download PyCharm Community from the official homepage.
25 | 
26 | Link: [https://www.jetbrains.com/pycharm/download/#section=linux](https://www.jetbrains.com/pycharm/download/#section=linux)
27 | 
28 | 2. Go to the download directory and extract the archive:
29 | 
30 | ```shell
31 | $ tar xfz pycharm-community-2016.3.2.tar.gz
32 | ```
33 | 
34 | 3. Move into the bin folder of the extracted directory:
35 | 
36 | ```shell
37 | $ cd ~/pycharm-community-2016.3.2/bin
38 | ```
39 | 
40 | 4. Launch PyCharm with the following command:
41 | 
42 | ```shell
43 | $ sh pycharm.sh
44 | ```
45 | 
46 | 5. Once the command runs, setup begins.
47 | 
48 | 6. When setup finishes you will see the initial configuration screen. In the IDE theme item, Intellij is the light theme and Darcula is the dark theme; this book uses the Intellij theme.
49 | 
50 | 7. After the initial configuration is done, create a new project.
51 | 
52 | 8. Next you set the project path and interpreter. Create a PycharmProjects folder in your home directory and create the project under it, with any name you like; here the project is named "rlcode_book". The Interpreter is the Python this project will use, so set it to Python 3.5.
53 | 
54 | 9. Once the rlcode_book project is created, the main editor window appears.
55 | 
56 | 10. To check that PyCharm is installed correctly, create a Python script for the simplest example, `"Hello World"`: add a file named hello_world.py.
57 | 
58 | 11. Right-click the new file and choose "Run 'hello_world'" to run hello_world.py.
59 | 
60 | 12. Put the following code into hello_world.py.
61 | 
62 | ```python
63 | print("hello world")
64 | ```
65 | 
66 | 13. When you run hello_world.py, "hello world" appears in the run window, confirming that PyCharm is installed correctly.
67 | 
68 | ### How to use Virtualenv (virtual environments) :happy:
69 | 
70 | That covers the basic PyCharm setup. When several projects live on one machine, each may need a different development environment, and mixing them is a constant source of trouble. Keeping a separate environment per project is therefore well worth it, and that is what VirtualEnv does: it lets you create a virtual environment dedicated to the projects in this book.
71 | 
72 | PyCharm supports VirtualEnv, so we will use it through PyCharm. There are several ways to install and use VirtualEnv, but PyCharm provides a GUI for it and also makes it easy to manage the external Python libraries installed in the environment.
73 | 
74 | **Using VirtualEnv from PyCharm works as follows.**
75 | 
76 | 1. Click "Settings" in the "File" menu.
77 | 
78 | 2. In the list on the left side of Settings, click Project Interpreter under "Project: <project name>". Then, on the right side of the Project Interpreter tab, click "Create VirtualEnv".
79 | 
80 | 3. Enter a name for the virtual environment; a directory such as /home/brian/rlcode_book is created and the environment with it.
81 | 
82 | 4. If the terminal prompt shows (rlcode_book), a virtual environment named rlcode_book now exists. We will use it as the environment for this book.
83 | 
84 | ### 2.1.3 Installing and testing OpenAI Gym
85 | 
86 | OpenAI is a company founded in late 2015 with the goal of opening AI technology to the world, making AI safer, and bringing it to more fields. OpenAI Gym is an environment toolkit built by OpenAI in which you can test all sorts of learning algorithms.
87 | 
88 | All of the OpenAI Gym code is published on OpenAI's GitHub[[2\]](#_ftn2).
89 | 
90 | Installing OpenAI Gym is described on its homepage. Before installing Gym you need Git, a version-control tool used to track changes during development. OpenAI Gym is open source and hosted on GitHub, a platform that acts as a remote repository for version-controlled code.
91 | 
92 | Install git with the following command.
93 | 
94 | ```shell
95 | $ sudo apt-get install git
96 | ```
97 | 
98 | After installing git, install OpenAI Gym. In a terminal, move to the directory where you want Gym and run the following commands.
99 | 
100 | ```shell
101 | $ git clone https://github.com/openai/gym
102 | $ cd gym
103 | $ pip3 install -e .
104 | ```
105 | 
106 | OpenAI Gym can be installed with different option sets; `pip install -e .` installs only the core packages. To use every Gym environment later, including the Atari games, run the following instead of `pip install -e .`:
107 | 
108 | ```shell
109 | $ pip3 install -e .[all]
110 | ```
111 | 
112 | To confirm that OpenAI Gym installed correctly, run a simple example. The simplest one is CartPole: a pole is attached to a cart, and the goal is to move the cart so that the pole stays upright. Here we only check that Gym runs, without giving the cart any meaningful control input.
113 | 
114 | Create a file named `CartPole.py` and enter the code in listing 2.1.
115 | 
116 | ```python
117 | import gym
118 | env = gym.make('CartPole-v0')
119 | env.reset()
120 | for _ in range(1000):
121 |     env.render()
122 |     env.step(env.action_space.sample()) # take a random action
123 | ```
124 | 
125 | Listing 2.1 CartPole example code
126 | 
127 | Running this code opens a CartPole window in which the cart just takes random actions. OpenAI Gym provides many problems like this, and you can apply your own learning algorithms to them and share your algorithms and results on the OpenAI Gym site.
128 | 
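Listing 2.1 only renders random actions. Once you plug a learning algorithm into Gym you also need the values `env.step()` returns, so the slightly fuller loop below, which is not part of the book's listing and assumes the classic 4-tuple `step()` API of the gym version pinned in requirements.txt, shows where the observation, reward, and done flag come from.

```python
import gym

# A slightly fuller version of listing 2.1: read what env.step() returns
# (observation, reward, done flag, info dict) and reset when an episode ends.
env = gym.make('CartPole-v0')
state = env.reset()
for _ in range(1000):
    env.render()
    action = env.action_space.sample()            # still a random action
    state, reward, done, info = env.step(action)  # gym's classic 4-tuple API
    if done:                                      # pole fell or step limit reached
        state = env.reset()
env.close()
```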
129 | ## 2.2 Development Environment Setup 2: macOS
130 | 
131 | macOS comes with Python 2.7 preinstalled, so Python 3.5 has to be installed separately.
132 | 
133 | ### 2.2.1 Installing Python 3.5
134 | 
135 | Open the Python download page[[3\]](#_ftn3) to reach the Python 3.5 release page.
136 | 
137 | 1. Choose the installer that matches your macOS version and download it. Run the downloaded file and follow the prompts to complete the installation.
138 | 
139 | 2. To check that Python was installed correctly, open a terminal and type `python3`. If the interpreter starts, the installation succeeded.
140 | 
141 | ### 2.2.2 Installing and configuring PyCharm Community
142 | 
143 | PyCharm is installed and configured in the following order.
144 | 
145 | 1. Go to the PyCharm homepage and download the Community edition.
146 | 
147 | 2. Run the downloaded file and drag the PyCharm CE icon on the left onto the folder icon on the right; that completes the installation.
148 | 
149 | 3. The first time PyCharm starts, a configuration screen appears where the IDE theme option sets the IDE's colors and style. Default is the Intellij theme shown in the Ubuntu setup, and this book uses Default.
150 | 
151 | 4. Finish the initial setup and click the Create New Project button.
152 | 
153 | 5. Clicking Create New Project opens the new-project screen. Location sets the path and folder name of the project; choose any name and location you like.
154 | 
155 |    Interpreter selects which Python interpreter the project will use. As on Ubuntu, we will create a virtual environment with VirtualEnv and use it as the interpreter, so click the Create VirtualEnv button.
156 | 
157 | 6. On the VirtualEnv creation screen, choose any Name and Location, and for Base Interpreter select the newly installed python3.5. Click OK to create the VirtualEnv.
158 | 
159 | 7. Back on the New Project screen, select the VirtualEnv you just created as the Interpreter and click Create to finish creating the project.
160 | 
161 | 8. After the project is created, the workspace appears. Right-click the top-level folder and click New -> Python File to create a new Python file.
162 | 
163 | 9. To confirm that PyCharm works, run the hello world example. It is the same as on Ubuntu, so it is not repeated here.
164 | 
165 | ### 2.2.3 Installing and testing OpenAI Gym
166 | 
167 | Installing OpenAI Gym and running the CartPole test are the same as on Ubuntu, so they are omitted here.
168 | 
169 | ------
170 | 
171 | [[1\]](#_ftnref1) https://www.jetbrains.com/pycharm/
172 | 
173 | [[2\]](#_ftnref2) https://github.com/openai/gym
174 | 
175 | [[3\]](#_ftnref3) https://www.python.org/downloads/release/python-350/
176 | 
--------------------------------------------------------------------------------
/wiki/rlcode_image/cartpole_exam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/cartpole_exam.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/console_hello_world.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/console_hello_world.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/default_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/default_config.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/file_setting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/file_setting.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/hello_world_ubuntu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/hello_world_ubuntu.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/openai_github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/openai_github.png
--------------------------------------------------------------------------------
/wiki/rlcode_image/project_interpreter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/project_interpreter.png -------------------------------------------------------------------------------- /wiki/rlcode_image/pycham_new_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycham_new_project.png -------------------------------------------------------------------------------- /wiki/rlcode_image/pycharm_community.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycharm_community.png -------------------------------------------------------------------------------- /wiki/rlcode_image/pycharm_drag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycharm_drag.png -------------------------------------------------------------------------------- /wiki/rlcode_image/pycharm_init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/pycharm_init.png -------------------------------------------------------------------------------- /wiki/rlcode_image/python3_terminal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python3_terminal.jpg -------------------------------------------------------------------------------- /wiki/rlcode_image/python_download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python_download.png -------------------------------------------------------------------------------- /wiki/rlcode_image/python_installed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python_installed.png -------------------------------------------------------------------------------- /wiki/rlcode_image/python_intalled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/python_intalled.png -------------------------------------------------------------------------------- /wiki/rlcode_image/rl_book_hello_world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_hello_world.png -------------------------------------------------------------------------------- /wiki/rlcode_image/rl_book_project.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_project.png -------------------------------------------------------------------------------- /wiki/rlcode_image/rl_book_venv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_venv.png -------------------------------------------------------------------------------- /wiki/rlcode_image/rl_book_virtualenv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rl_book_virtualenv.png -------------------------------------------------------------------------------- /wiki/rlcode_image/rlcode_book_directory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rlcode_book_directory.png -------------------------------------------------------------------------------- /wiki/rlcode_image/rlcode_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/rlcode_project.png -------------------------------------------------------------------------------- /wiki/rlcode_image/run_hello_world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/run_hello_world.png -------------------------------------------------------------------------------- /wiki/rlcode_image/sh_pycharm.sh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/sh_pycharm.sh.png -------------------------------------------------------------------------------- /wiki/rlcode_image/terminal_rlcode_book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlcode/reinforcement-learning/2fe6984da684c3f64a8d09d1718dbac9330aecea/wiki/rlcode_image/terminal_rlcode_book.png --------------------------------------------------------------------------------