├── README.md
└── content
    ├── 1_treasure_on_right
    │   └── treasure_on_right.py
    ├── 2_Q-learning-maze
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 3_Sarsa_maze
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 4_Sarsa_lambda_maze
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 5.1_double_DQN
    │   ├── RL_brain.py
    │   └── run_Pendulum.py
    ├── 5.2_Prioritized_Replay_DQN
    │   ├── Figure_1.png
    │   ├── RL_brain.py
    │   └── run_MountainCar.py
    ├── 5.3_Dueling_DQN
    │   ├── RL_brain.py
    │   ├── action15.png
    │   └── run_Pendulum.py
    ├── 5_Deep_Q_Network
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 7_Policy_gradient_softmax
    │   ├── RL_brain.py
    │   ├── run_CartPole.py
    │   └── run_MountainCar.py
    └── 8_Actor_Critic_Advantage
        ├── AC_CartPole.py
        └── AC_continue_Pendulum.py
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement-learning-with-PyTorch
2 | Reinforcement learning with PyTorch, inspired by [MorvanZhou](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow), with the framework changed from TensorFlow to PyTorch.
3 | 
--------------------------------------------------------------------------------
/content/1_treasure_on_right/treasure_on_right.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | np.random.seed(2)
6 | 
7 | N_STATES = 6
8 | ACTIONS = ['left', 'right']
9 | MAX_EPISODES = 13
10 | FRESH_TIME = 0.3
11 | EPSILON = 0.9
12 | ALPHA = 0.1
13 | GAMMA = 0.9
14 | 
15 | def update_env(S, episode, step_counter):
16 |     env_list = ['-']*(N_STATES-1) + ['T']
17 |     if S == 'terminal':
18 |         interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
19 |         print('\r{}'.format(interaction),end='')
20 |         time.sleep(2)
21 |         print('\r ', end='')
22 |     else:
23 |         env_list[S] = 'o'
24 |         interaction = ''.join(env_list)
25 |         print('\r{}'.format(interaction), end='')
26 |         time.sleep(FRESH_TIME)
27 | 
28 | def get_env_feedback(S,A):
29 |     if A == 'right':
30 |         if S == N_STATES - 2:
31 |             S_ = 'terminal'
32 |             R = 1
33 |         else:
34 |             S_ = S + 1
35 |             R = 0
36 |     else:
37 |         R = 0
38 |         if S == 0:
39 |             S_ = S
40 |         else:
41 |             S_ = S - 1
42 |     return S_, R
43 | 
44 | def build_q_table(n_states, actions):
45 |     table = pd.DataFrame(np.zeros((n_states, len(actions))),columns=actions)
46 |     return table
47 | 
48 | def choose_action(state, q_table):
49 |     state_actions = q_table.iloc[state,:]
50 |     if (np.random.uniform()>EPSILON) or ((state_actions == 0).all()):
51 |         action_name = np.random.choice(ACTIONS)
52 |     else:
53 |         action_name = state_actions.idxmax()
54 |     return action_name
55 | 
56 | def rl():
57 |     q_table = build_q_table(N_STATES, ACTIONS)
58 |     for episode in range(MAX_EPISODES):
59 |         # print("episode: ", episode)
60 |         # print("q_table: ", q_table)
61 |         step_counter = 0
62 |         S = 0
63 |         is_terminated = False
64 |         update_env(S, episode, step_counter)
65 |         while not is_terminated:
66 |             A = choose_action(S, q_table)
67 |             S_, R = get_env_feedback(S, A)
68 |             q_predict = q_table.loc[S,A]
69 |             if S_ != 'terminal':
70 |                 q_target = R + GAMMA*q_table.iloc[S_,:].max()
71 |             else:
72 |                 q_target = R
73 |                 is_terminated = True
74 | 
75 |             q_table.loc[S,A] += ALPHA*(q_target - q_predict)
76 |             S = S_
77 | 
78 |             update_env(S, episode, step_counter+1)
79 |             step_counter += 1
80 |     return q_table
81 | 
82 | if __name__ == "__main__":
83 |     q_table = rl()
84 |     print('\r\nQ-table:\n')
85 |     print(q_table)
--------------------------------------------------------------------------------
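The script above learns a tabular Q-function for a one-dimensional corridor in which only stepping right from the second-to-last state pays reward 1, so after the 13 training episodes the greedy action should generally favour 'right'. A quick, hypothetical sanity check on the DataFrame returned by rl() (not part of the repository):

# Hypothetical check, assuming treasure_on_right.py has just been run in the same session
greedy_policy = q_table.idxmax(axis=1)   # best action per state according to the learned values
print(greedy_policy)                     # expected to lean towards 'right' once values have propagated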
/content/2_Q-learning-maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class QLearningTable: 5 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 6 | self.actions = actions 7 | self.lr = learning_rate 8 | self.gamma = reward_decay 9 | self.epsilon = e_greedy 10 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 11 | 12 | def choose_action(self, observation): 13 | self.check_state_exist(observation) 14 | if np.random.uniform() < self.epsilon: 15 | state_action = self.q_table.loc[observation,:] 16 | action = np.random.choice(state_action[state_action==np.max(state_action)].index) 17 | else: 18 | action = np.random.choice(self.actions) 19 | return action 20 | 21 | def learn(self, s, a, r, s_): 22 | self.check_state_exist(s_) 23 | q_predict = self.q_table.loc[s, a] 24 | if s_ != 'terminal': 25 | q_target = r + self.gamma*self.q_table.loc[s_, :].max() 26 | else: 27 | q_target = r 28 | self.q_table.loc[s, a] += self.lr*(q_target-q_predict) 29 | 30 | def check_state_exist(self, state): 31 | if state not in self.q_table.index: 32 | self.q_table = self.q_table.append(pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state)) -------------------------------------------------------------------------------- /content/2_Q-learning-maze/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | 10 | UNIT = 40 # pixels 11 | MAZE_H = 8 # grid height 12 | MAZE_W = 8 # grid width 13 | 14 | 15 | class Maze(tk.Tk, object): 16 | def __init__(self): 17 | super(Maze, self).__init__() 18 | self.action_space = ['u', 'd', 'l', 'r'] 19 | self.n_actions = len(self.action_space) 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | self.hell2 = self.canvas.create_rectangle( 49 | hell2_center[0] - 15, hell2_center[1] - 15, 50 | hell2_center[0] + 15, hell2_center[1] + 15, 51 | fill='black') 52 | 53 | # hell 54 | hell3_center = origin + np.array([UNIT * 2, UNIT * 6]) 55 | self.hell3 = self.canvas.create_rectangle( 56 | hell3_center[0] - 15, hell3_center[1] - 15, 57 | hell3_center[0] + 15, hell3_center[1] + 15, 58 | fill='black') 59 | 60 | # hell 61 | hell4_center = origin + np.array([UNIT * 6, UNIT * 2]) 62 | self.hell4 = self.canvas.create_rectangle( 63 | hell4_center[0] - 15, hell4_center[1] - 15, 64 | hell4_center[0] + 15, hell4_center[1] + 15, 65 | fill='black') 66 | 67 | # hell 68 | hell5_center = origin + 
np.array([UNIT * 4, UNIT * 4]) 69 | self.hell5 = self.canvas.create_rectangle( 70 | hell5_center[0] - 15, hell5_center[1] - 15, 71 | hell5_center[0] + 15, hell5_center[1] + 15, 72 | fill='black') 73 | 74 | # hell 75 | hell6_center = origin + np.array([UNIT * 4, UNIT * 1]) 76 | self.hell6 = self.canvas.create_rectangle( 77 | hell6_center[0] - 15, hell6_center[1] - 15, 78 | hell6_center[0] + 15, hell6_center[1] + 15, 79 | fill='black') 80 | 81 | # hell 82 | hell7_center = origin + np.array([UNIT * 1, UNIT * 3]) 83 | self.hell7 = self.canvas.create_rectangle( 84 | hell7_center[0] - 15, hell7_center[1] - 15, 85 | hell7_center[0] + 15, hell7_center[1] + 15, 86 | fill='black') 87 | 88 | # hell 89 | hell8_center = origin + np.array([UNIT * 2, UNIT * 4]) 90 | self.hell8 = self.canvas.create_rectangle( 91 | hell8_center[0] - 15, hell8_center[1] - 15, 92 | hell8_center[0] + 15, hell8_center[1] + 15, 93 | fill='black') 94 | 95 | # hell 96 | hell9_center = origin + np.array([UNIT * 3, UNIT * 2]) 97 | self.hell9 = self.canvas.create_rectangle( 98 | hell9_center[0] - 15, hell9_center[1] - 15, 99 | hell9_center[0] + 15, hell9_center[1] + 15, 100 | fill='black') 101 | 102 | 103 | 104 | 105 | # create oval 106 | oval_center = origin + UNIT * 3 107 | self.oval = self.canvas.create_oval( 108 | oval_center[0] - 15, oval_center[1] - 15, 109 | oval_center[0] + 15, oval_center[1] + 15, 110 | fill='yellow') 111 | 112 | # create red rect 113 | self.rect = self.canvas.create_rectangle( 114 | origin[0] - 15, origin[1] - 15, 115 | origin[0] + 15, origin[1] + 15, 116 | fill='red') 117 | 118 | # pack all 119 | self.canvas.pack() 120 | 121 | def reset(self): 122 | self.update() 123 | time.sleep(0.5) 124 | self.canvas.delete(self.rect) 125 | origin = np.array([20, 20]) 126 | self.rect = self.canvas.create_rectangle( 127 | origin[0] - 15, origin[1] - 15, 128 | origin[0] + 15, origin[1] + 15, 129 | fill='red') 130 | # return observation 131 | return self.canvas.coords(self.rect) 132 | 133 | def step(self, action): 134 | s = self.canvas.coords(self.rect) 135 | base_action = np.array([0, 0]) 136 | if action == 0: # up 137 | if s[1] > UNIT: 138 | base_action[1] -= UNIT 139 | elif action == 1: # down 140 | if s[1] < (MAZE_H - 1) * UNIT: 141 | base_action[1] += UNIT 142 | elif action == 2: # right 143 | if s[0] < (MAZE_W - 1) * UNIT: 144 | base_action[0] += UNIT 145 | elif action == 3: # left 146 | if s[0] > UNIT: 147 | base_action[0] -= UNIT 148 | 149 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 150 | 151 | s_ = self.canvas.coords(self.rect) # next state 152 | 153 | # reward function 154 | if s_ == self.canvas.coords(self.oval): 155 | reward = 1 156 | done = True 157 | s_ = 'terminal' 158 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2), self.canvas.coords(self.hell3), 159 | self.canvas.coords(self.hell4),self.canvas.coords(self.hell5),self.canvas.coords(self.hell6),self.canvas.coords(self.hell7), 160 | self.canvas.coords(self.hell8),self.canvas.coords(self.hell9)]: 161 | reward = -1 162 | done = True 163 | s_ = 'terminal' 164 | else: 165 | reward = 0 166 | done = False 167 | 168 | return s_, reward, done 169 | 170 | def render(self): 171 | time.sleep(0.1) 172 | self.update() 173 | 174 | 175 | def update(): 176 | for t in range(10): 177 | s = env.reset() 178 | while True: 179 | env.render() 180 | a = 1 181 | s, r, done = env.step(a) 182 | if done: 183 | break 184 | 185 | if __name__ == '__main__': 186 | env = Maze() 187 | env.after(100, update) 188 | env.mainloop() 
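Note on the tabular agents: check_state_exist in RL_brain.py above (and in the Sarsa and Sarsa(lambda) agents later) grows the Q-table with DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0, so these examples fail on current pandas. A minimal sketch of an equivalent row insertion using pd.concat (a hypothetical replacement, not code from the repository):

import numpy as np
import pandas as pd

actions = [0, 1, 2, 3]
q_table = pd.DataFrame(columns=actions, dtype=np.float64)

state = str([5.0, 5.0, 35.0, 35.0])   # example observation string, as produced by str(env.reset())
if state not in q_table.index:
    # build the all-zero row exactly as check_state_exist does, then concat instead of append
    new_row = pd.Series([0] * len(actions), index=q_table.columns, name=state)
    q_table = pd.concat([q_table, new_row.to_frame().T])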
-------------------------------------------------------------------------------- /content/2_Q-learning-maze/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import QLearningTable 3 | 4 | def update(): 5 | for episode in range(150): 6 | observation = env.reset() 7 | print(episode) 8 | while True: 9 | env.render() 10 | action = RL.choose_action(str(observation)) 11 | # print("observation: {}".format(observation)) 12 | observation_, reward, done = env.step(action) 13 | RL.learn(str(observation), action, reward, str(observation_)) 14 | # print(RL.q_table) 15 | observation = observation_ 16 | if done: 17 | break 18 | print('game over') 19 | env.destroy() 20 | 21 | if __name__ == '__main__': 22 | env = Maze() 23 | # print("env.n_actions: {}".format(env.n_actions)) 24 | RL = QLearningTable(actions=list(range(env.n_actions))) 25 | 26 | env.after(100, update) 27 | env.mainloop() -------------------------------------------------------------------------------- /content/3_Sarsa_maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class RL(object): 5 | def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 6 | self.actions = action_space 7 | self.lr = learning_rate 8 | self.gamma = reward_decay 9 | self.epsilon = e_greedy 10 | 11 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 12 | 13 | def check_state_exist(self, state): 14 | if state not in self.q_table.index: 15 | self.q_table = self.q_table.append(pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state)) 16 | 17 | def choose_action(self, observation): 18 | self.check_state_exist(observation) 19 | if np.random.rand() < self.epsilon: 20 | state_action = self.q_table.loc[observation, :] 21 | action = np.random.choice(state_action[state_action == np.max(state_action)].index) 22 | else: 23 | action = np.random.choice(self.actions) 24 | return action 25 | 26 | def learn(self, *args): 27 | pass 28 | 29 | 30 | class QLearningTable(RL): 31 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 32 | super(QLearningTable,self).__init__(actions, learning_rate, reward_decay, e_greedy) 33 | 34 | def learn(self, s, a, r, s_): 35 | self.check_state_exist(s_) 36 | q_predict = self.q_table.loc[s,a] 37 | if s_ != 'terminal': 38 | q_target = r + self.gamma*self.q_table.loc[s_,:].max() 39 | else: 40 | q_target = r 41 | self.q_table.loc[s,a] += self.lr*(q_target-q_predict) 42 | 43 | 44 | class SarsaTable(RL): 45 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 46 | super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) 47 | 48 | def learn(self, s, a, r, s_, a_): 49 | self.check_state_exist(s_) 50 | q_predict = self.q_table.loc[s,a] 51 | if s_ != "terminal": 52 | q_target = r + self.gamma * self.q_table.loc[s_, a_] 53 | else: 54 | q_target = r 55 | self.q_table.loc[s,a] += self.lr * (q_target-q_predict) -------------------------------------------------------------------------------- /content/3_Sarsa_maze/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | 10 | UNIT = 40 # pixels 11 | MAZE_H = 8 # grid height 12 | MAZE_W = 8 # grid 
width 13 | 14 | 15 | class Maze(tk.Tk, object): 16 | def __init__(self): 17 | super(Maze, self).__init__() 18 | self.action_space = ['u', 'd', 'l', 'r'] 19 | self.n_actions = len(self.action_space) 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | self.hell2 = self.canvas.create_rectangle( 49 | hell2_center[0] - 15, hell2_center[1] - 15, 50 | hell2_center[0] + 15, hell2_center[1] + 15, 51 | fill='black') 52 | 53 | # hell 54 | hell3_center = origin + np.array([UNIT * 2, UNIT * 6]) 55 | self.hell3 = self.canvas.create_rectangle( 56 | hell3_center[0] - 15, hell3_center[1] - 15, 57 | hell3_center[0] + 15, hell3_center[1] + 15, 58 | fill='black') 59 | 60 | # hell 61 | hell4_center = origin + np.array([UNIT * 6, UNIT * 2]) 62 | self.hell4 = self.canvas.create_rectangle( 63 | hell4_center[0] - 15, hell4_center[1] - 15, 64 | hell4_center[0] + 15, hell4_center[1] + 15, 65 | fill='black') 66 | 67 | # hell 68 | hell5_center = origin + np.array([UNIT * 4, UNIT * 4]) 69 | self.hell5 = self.canvas.create_rectangle( 70 | hell5_center[0] - 15, hell5_center[1] - 15, 71 | hell5_center[0] + 15, hell5_center[1] + 15, 72 | fill='black') 73 | 74 | # hell 75 | hell6_center = origin + np.array([UNIT * 4, UNIT * 1]) 76 | self.hell6 = self.canvas.create_rectangle( 77 | hell6_center[0] - 15, hell6_center[1] - 15, 78 | hell6_center[0] + 15, hell6_center[1] + 15, 79 | fill='black') 80 | 81 | # hell 82 | hell7_center = origin + np.array([UNIT * 1, UNIT * 3]) 83 | self.hell7 = self.canvas.create_rectangle( 84 | hell7_center[0] - 15, hell7_center[1] - 15, 85 | hell7_center[0] + 15, hell7_center[1] + 15, 86 | fill='black') 87 | 88 | # hell 89 | hell8_center = origin + np.array([UNIT * 2, UNIT * 4]) 90 | self.hell8 = self.canvas.create_rectangle( 91 | hell8_center[0] - 15, hell8_center[1] - 15, 92 | hell8_center[0] + 15, hell8_center[1] + 15, 93 | fill='black') 94 | 95 | # hell 96 | hell9_center = origin + np.array([UNIT * 3, UNIT * 2]) 97 | self.hell9 = self.canvas.create_rectangle( 98 | hell9_center[0] - 15, hell9_center[1] - 15, 99 | hell9_center[0] + 15, hell9_center[1] + 15, 100 | fill='black') 101 | 102 | 103 | 104 | 105 | # create oval 106 | oval_center = origin + UNIT * 3 107 | self.oval = self.canvas.create_oval( 108 | oval_center[0] - 15, oval_center[1] - 15, 109 | oval_center[0] + 15, oval_center[1] + 15, 110 | fill='yellow') 111 | 112 | # create red rect 113 | self.rect = self.canvas.create_rectangle( 114 | origin[0] - 15, origin[1] - 15, 115 | origin[0] + 15, origin[1] + 15, 116 | fill='red') 117 | 118 | # pack all 119 | self.canvas.pack() 120 | 121 | def reset(self): 122 | self.update() 123 | time.sleep(0.5) 124 | self.canvas.delete(self.rect) 125 | origin = 
np.array([20, 20]) 126 | self.rect = self.canvas.create_rectangle( 127 | origin[0] - 15, origin[1] - 15, 128 | origin[0] + 15, origin[1] + 15, 129 | fill='red') 130 | # return observation 131 | return self.canvas.coords(self.rect) 132 | 133 | def step(self, action): 134 | s = self.canvas.coords(self.rect) 135 | base_action = np.array([0, 0]) 136 | if action == 0: # up 137 | if s[1] > UNIT: 138 | base_action[1] -= UNIT 139 | elif action == 1: # down 140 | if s[1] < (MAZE_H - 1) * UNIT: 141 | base_action[1] += UNIT 142 | elif action == 2: # right 143 | if s[0] < (MAZE_W - 1) * UNIT: 144 | base_action[0] += UNIT 145 | elif action == 3: # left 146 | if s[0] > UNIT: 147 | base_action[0] -= UNIT 148 | 149 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 150 | 151 | s_ = self.canvas.coords(self.rect) # next state 152 | 153 | # reward function 154 | if s_ == self.canvas.coords(self.oval): 155 | reward = 1 156 | done = True 157 | s_ = 'terminal' 158 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2), self.canvas.coords(self.hell3), 159 | self.canvas.coords(self.hell4),self.canvas.coords(self.hell5),self.canvas.coords(self.hell6),self.canvas.coords(self.hell7), 160 | self.canvas.coords(self.hell8),self.canvas.coords(self.hell9)]: 161 | reward = -1 162 | done = True 163 | s_ = 'terminal' 164 | else: 165 | reward = 0 166 | done = False 167 | 168 | return s_, reward, done 169 | 170 | def render(self): 171 | time.sleep(0.1) 172 | self.update() 173 | 174 | 175 | def update(): 176 | for t in range(10): 177 | s = env.reset() 178 | while True: 179 | env.render() 180 | a = 1 181 | s, r, done = env.step(a) 182 | if done: 183 | break 184 | 185 | if __name__ == '__main__': 186 | env = Maze() 187 | env.after(100, update) 188 | env.mainloop() -------------------------------------------------------------------------------- /content/3_Sarsa_maze/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import SarsaTable,QLearningTable 3 | 4 | def update(): 5 | for episode in range(300): 6 | observation = env.reset() 7 | action = RL.choose_action(str(observation)) 8 | print(episode) 9 | while True: 10 | env.render() 11 | observation_,reward,done = env.step(action) 12 | action_ = RL.choose_action(str(observation_)) 13 | RL.learn(str(observation),action,reward,str(observation_)) 14 | observation = observation_ 15 | action = action_ 16 | 17 | if done: 18 | break 19 | 20 | print('game over') 21 | env.destroy() 22 | 23 | if __name__ == '__main__': 24 | env = Maze() 25 | # RL = SarsaTable(actions=list(range(env.n_actions))) 26 | RL = QLearningTable(actions=list(range(env.n_actions))) 27 | env.after(100, update) 28 | env.mainloop() -------------------------------------------------------------------------------- /content/4_Sarsa_lambda_maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class RL(object): 5 | def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 6 | self.actions = action_space 7 | self.lr = learning_rate 8 | self.gamma = reward_decay 9 | self.epsilon = e_greedy 10 | 11 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 12 | 13 | def check_state_exist(self, state): 14 | if state not in self.q_table.index: 15 | self.q_table = self.q_table.append(pd.Series([0]*len(self.actions), index=self.q_table.columns, name=state)) 16 | 17 
| def choose_action(self, observation): 18 | self.check_state_exist(observation) 19 | if np.random.rand() < self.epsilon: 20 | state_action = self.q_table.loc[observation, :] 21 | # print("state_action: {}".format(state_action)) 22 | # print("state_action == np.max(state_action): {}".format(state_action == np.max(state_action))) 23 | print(state_action[state_action==np.max(state_action)]) 24 | action = np.random.choice(state_action[state_action==np.max(state_action)].index) 25 | else: 26 | action = np.random.choice(self.actions) 27 | return action 28 | 29 | def learn(self, *args): 30 | pass 31 | 32 | class SarsaLambdaTable(RL): 33 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9): 34 | super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) 35 | self.lambda_ = trace_decay 36 | self.eligibility_trace = self.q_table.copy() 37 | 38 | def check_state_exist(self, state): 39 | if state not in self.q_table.index: 40 | to_be_append = pd.Series([0]*len(self.actions), index=self.q_table.columns, name=state) 41 | self.q_table = self.q_table.append(to_be_append) 42 | self.eligibility_trace = self.eligibility_trace.append(to_be_append) 43 | 44 | def learn(self, s, a, r, s_, a_): 45 | self.check_state_exist(s_) 46 | q_predict = self.q_table.loc[s,a] 47 | if s_ != 'terminal': 48 | q_target = r + self.gamma * self.q_table.loc[s_, a_] 49 | else: 50 | q_target = r 51 | 52 | error = q_target - q_predict 53 | 54 | self.eligibility_trace.loc[s,:] *= 0 55 | self.eligibility_trace.loc[s,a] = 1 56 | 57 | self.q_table += self.lr * error * self.eligibility_trace 58 | 59 | self.eligibility_trace *= self.gamma*self.lambda_ -------------------------------------------------------------------------------- /content/4_Sarsa_lambda_maze/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | 10 | UNIT = 40 # pixels 11 | MAZE_H = 8 # grid height 12 | MAZE_W = 8 # grid width 13 | 14 | 15 | class Maze(tk.Tk, object): 16 | def __init__(self): 17 | super(Maze, self).__init__() 18 | self.action_space = ['u', 'd', 'l', 'r'] 19 | self.n_actions = len(self.action_space) 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | self.hell2 = self.canvas.create_rectangle( 49 | hell2_center[0] - 15, hell2_center[1] - 15, 50 | hell2_center[0] + 15, hell2_center[1] + 15, 51 | fill='black') 52 | 53 | # hell 54 | hell3_center = origin + np.array([UNIT * 2, UNIT * 6]) 55 | self.hell3 = self.canvas.create_rectangle( 56 | 
hell3_center[0] - 15, hell3_center[1] - 15, 57 | hell3_center[0] + 15, hell3_center[1] + 15, 58 | fill='black') 59 | 60 | # hell 61 | hell4_center = origin + np.array([UNIT * 6, UNIT * 2]) 62 | self.hell4 = self.canvas.create_rectangle( 63 | hell4_center[0] - 15, hell4_center[1] - 15, 64 | hell4_center[0] + 15, hell4_center[1] + 15, 65 | fill='black') 66 | 67 | # hell 68 | hell5_center = origin + np.array([UNIT * 4, UNIT * 4]) 69 | self.hell5 = self.canvas.create_rectangle( 70 | hell5_center[0] - 15, hell5_center[1] - 15, 71 | hell5_center[0] + 15, hell5_center[1] + 15, 72 | fill='black') 73 | 74 | # hell 75 | hell6_center = origin + np.array([UNIT * 4, UNIT * 1]) 76 | self.hell6 = self.canvas.create_rectangle( 77 | hell6_center[0] - 15, hell6_center[1] - 15, 78 | hell6_center[0] + 15, hell6_center[1] + 15, 79 | fill='black') 80 | 81 | # hell 82 | hell7_center = origin + np.array([UNIT * 1, UNIT * 3]) 83 | self.hell7 = self.canvas.create_rectangle( 84 | hell7_center[0] - 15, hell7_center[1] - 15, 85 | hell7_center[0] + 15, hell7_center[1] + 15, 86 | fill='black') 87 | 88 | # hell 89 | hell8_center = origin + np.array([UNIT * 2, UNIT * 4]) 90 | self.hell8 = self.canvas.create_rectangle( 91 | hell8_center[0] - 15, hell8_center[1] - 15, 92 | hell8_center[0] + 15, hell8_center[1] + 15, 93 | fill='black') 94 | 95 | # hell 96 | hell9_center = origin + np.array([UNIT * 3, UNIT * 2]) 97 | self.hell9 = self.canvas.create_rectangle( 98 | hell9_center[0] - 15, hell9_center[1] - 15, 99 | hell9_center[0] + 15, hell9_center[1] + 15, 100 | fill='black') 101 | 102 | 103 | 104 | 105 | # create oval 106 | oval_center = origin + UNIT * 3 107 | self.oval = self.canvas.create_oval( 108 | oval_center[0] - 15, oval_center[1] - 15, 109 | oval_center[0] + 15, oval_center[1] + 15, 110 | fill='yellow') 111 | 112 | # create red rect 113 | self.rect = self.canvas.create_rectangle( 114 | origin[0] - 15, origin[1] - 15, 115 | origin[0] + 15, origin[1] + 15, 116 | fill='red') 117 | 118 | # pack all 119 | self.canvas.pack() 120 | 121 | def reset(self): 122 | self.update() 123 | time.sleep(0.5) 124 | self.canvas.delete(self.rect) 125 | origin = np.array([20, 20]) 126 | self.rect = self.canvas.create_rectangle( 127 | origin[0] - 15, origin[1] - 15, 128 | origin[0] + 15, origin[1] + 15, 129 | fill='red') 130 | # return observation 131 | return self.canvas.coords(self.rect) 132 | 133 | def step(self, action): 134 | s = self.canvas.coords(self.rect) 135 | base_action = np.array([0, 0]) 136 | if action == 0: # up 137 | if s[1] > UNIT: 138 | base_action[1] -= UNIT 139 | elif action == 1: # down 140 | if s[1] < (MAZE_H - 1) * UNIT: 141 | base_action[1] += UNIT 142 | elif action == 2: # right 143 | if s[0] < (MAZE_W - 1) * UNIT: 144 | base_action[0] += UNIT 145 | elif action == 3: # left 146 | if s[0] > UNIT: 147 | base_action[0] -= UNIT 148 | 149 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 150 | 151 | s_ = self.canvas.coords(self.rect) # next state 152 | 153 | # reward function 154 | if s_ == self.canvas.coords(self.oval): 155 | print("bingo") 156 | reward = 1 157 | done = True 158 | s_ = 'terminal' 159 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2), self.canvas.coords(self.hell3), 160 | self.canvas.coords(self.hell4),self.canvas.coords(self.hell5),self.canvas.coords(self.hell6),self.canvas.coords(self.hell7), 161 | self.canvas.coords(self.hell8),self.canvas.coords(self.hell9)]: 162 | reward = -1 163 | done = True 164 | s_ = 'terminal' 165 | else: 166 | reward = -0.1 
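# note: unlike the Q-learning and Sarsa mazes above (reward 0 per step), this maze charges -0.1 per non-terminal step, nudging the Sarsa(lambda) agent toward shorter paths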
167 | done = False 168 | 169 | return s_, reward, done 170 | 171 | def render(self): 172 | time.sleep(0.1) 173 | self.update() 174 | 175 | 176 | def update(): 177 | for t in range(10): 178 | s = env.reset() 179 | while True: 180 | env.render() 181 | a = 1 182 | s, r, done = env.step(a) 183 | if done: 184 | break 185 | 186 | if __name__ == '__main__': 187 | env = Maze() 188 | env.after(100, update) 189 | env.mainloop() -------------------------------------------------------------------------------- /content/4_Sarsa_lambda_maze/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import SarsaLambdaTable 3 | 4 | def update(): 5 | for episode in range(500): 6 | # print(episode) 7 | observation = env.reset() 8 | action = RL.choose_action(str(observation)) 9 | RL.eligibility_trace *= 0 10 | 11 | step = 0 12 | while True: 13 | step += 1 14 | # print("step: ", step, "action: ", action) 15 | env.render() 16 | observation_, reward, done = env.step(action) 17 | action_ = RL.choose_action(str(observation_)) 18 | RL.learn(str(observation), action, reward, str(observation_), action_) 19 | observation = observation_ 20 | action = action_ 21 | 22 | if done: 23 | break 24 | 25 | print('game over') 26 | env.destroy() 27 | 28 | if __name__ == '__main__': 29 | env = Maze() 30 | RL = SarsaLambdaTable(actions=list(range(env.n_actions))) 31 | env.after(100, update) 32 | env.mainloop() 33 | -------------------------------------------------------------------------------- /content/5.1_double_DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | 10 | class Net(nn.Module): 11 | def __init__(self, n_feature, n_hidden, n_output): 12 | super(Net, self).__init__() 13 | self.el = nn.Linear(n_feature, n_hidden) 14 | self.q = nn.Linear(n_hidden, n_output) 15 | 16 | def forward(self, x): 17 | x = self.el(x) 18 | x = F.relu(x) 19 | x = self.q(x) 20 | return x 21 | 22 | class DoubleDQN(): 23 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.005, reward_decay=0.9, e_greedy=0.9, 24 | replace_target_iter=200, memory_size=3000, batch_size=32, e_greedy_increment=None, double_q=True): 25 | self.n_actions = n_actions 26 | self.n_hidden = n_hidden 27 | self.n_features = n_features 28 | self.lr = learning_rate 29 | self.gamma = reward_decay 30 | self.epsilon_max = e_greedy 31 | self.replace_target_iter = replace_target_iter 32 | self.memory_size = memory_size 33 | self.batch_size = batch_size 34 | self.epsilon_increment = e_greedy_increment 35 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 36 | self.double_q = double_q 37 | 38 | self.learn_step_counter = 0 39 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 40 | self._build_net() 41 | self.cost_his = [] 42 | 43 | def _build_net(self): 44 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions) 45 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions) 46 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 47 | self.loss_func = nn.MSELoss() 48 | 49 | def store_transition(self, s, a, r, s_): 50 | if not hasattr(self, 'memory_counter'): 51 | self.memory_counter = 0 52 | transition = np.hstack((s, [a, r], s_)) 53 | index = self.memory_counter % self.memory_size 54 | 
self.memory[index, :] = transition 55 | self.memory_counter += 1 56 | 57 | def choose_action(self, observation): 58 | observation = torch.Tensor(observation[np.newaxis, :]) 59 | actions_value = self.q_eval(observation) 60 | action = torch.max(actions_value, dim=1)[1] # record action value it get 61 | if not hasattr(self, 'q'): 62 | self.q = [] 63 | self.running_q = 0 64 | self.running_q = self.running_q*0.99 + 0.01 * torch.max(actions_value, dim=1)[0] 65 | self.q.append(self.running_q) 66 | 67 | if np.random.uniform() > self.epsilon: # randomly choose action 68 | action = np.random.randint(0, self.n_actions) 69 | return action 70 | 71 | def learn(self): 72 | if self.learn_step_counter % self.replace_target_iter == 0: 73 | self.q_target.load_state_dict(self.q_eval.state_dict()) 74 | print("\ntarget params replaced\n") 75 | 76 | if self.memory_counter > self.memory_size: 77 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 78 | else: 79 | sample_index = np.random.choice(self.memory_counter, size=self.batch_size) 80 | 81 | batch_memory = self.memory[sample_index, :] 82 | 83 | # q_eval4next is the output of the q_eval network when input s_(t+1) 84 | # q_next is the output of the q_target network when input s_(s+1) 85 | # we use q_eval4next to get which action was choosed by eval network in s_(t+1) 86 | # then we get the Q_value corresponding to that action output by target network 87 | q_next, q_eval4next = self.q_target(torch.Tensor(batch_memory[:,-self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:,-self.n_features:])) 88 | q_eval = self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 89 | 90 | # used for calculating y, we need to copy for q_eval because this operation could keep the Q_value that has not been selected unchanged, 91 | # so when we do q_target - q_eval, these Q_value become zero and wouldn't affect the calculation of the loss 92 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 93 | 94 | batch_index = np.arange(self.batch_size, dtype=np.int32) 95 | eval_act_index = batch_memory[:, self.n_features].astype(int) 96 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 97 | 98 | if self.double_q: 99 | max_act4next = torch.max(q_eval4next, dim=1)[1] 100 | selected_q_next = q_next[batch_index, max_act4next] 101 | else: 102 | selected_q_next = torch.max(q_next, dim=1)[0] 103 | 104 | q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next 105 | 106 | loss = self.loss_func(q_eval, q_target) 107 | self.optimizer.zero_grad() 108 | loss.backward() 109 | self.optimizer.step() 110 | 111 | self.cost_his.append(loss) 112 | 113 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 114 | self.learn_step_counter += 1 115 | 116 | 117 | -------------------------------------------------------------------------------- /content/5.1_double_DQN/run_Pendulum.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DoubleDQN 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | env = gym.make("Pendulum-v0") 7 | env = env.unwrapped 8 | env.seed(1) 9 | MEMORY_SIZE = 3000 10 | ACTION_SPACE = 11 11 | 12 | naturel_DQN = DoubleDQN(n_actions=ACTION_SPACE,n_features=3,memory_size=MEMORY_SIZE,e_greedy_increment=0.001,double_q=False) 13 | double_DQN = DoubleDQN(n_actions=ACTION_SPACE,n_features=3,memory_size=MEMORY_SIZE,e_greedy_increment=0.001,double_q=True) 14 | 15 | def train(RL): 16 | total_steps = 0 
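# the loop below first fills the replay memory (learning starts once total_steps exceeds MEMORY_SIZE) and stops 20000 steps after that point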
17 | observation = env.reset() 18 | while True: 19 | if total_steps-MEMORY_SIZE > 8000: env.render() # show the game when trained for some time 20 | action = RL.choose_action(observation) 21 | f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions 22 | observation_, reward, done, info = env.step(np.array([f_action])) 23 | 24 | reward /= 10 #normalize to a range of (-1,0). r = 0 when get upright 25 | # the Q target at upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0 26 | # so when Q at this state is greater than 0, the agent overestimates the Q. Please refer to the final result. 27 | 28 | RL.store_transition(observation, action, reward, observation_) 29 | 30 | if total_steps > MEMORY_SIZE: 31 | RL.learn() 32 | 33 | if total_steps - MEMORY_SIZE > 20000: 34 | break 35 | 36 | observation = observation_ 37 | total_steps += 1 38 | return RL.q 39 | 40 | q_natural = train(naturel_DQN) 41 | q_double = train(double_DQN) 42 | 43 | plt.plot(np.array(q_natural), c='r', label='natural') 44 | plt.plot(np.array(q_double), c='b', label='double') 45 | plt.legend(loc='best') 46 | plt.ylabel('Q eval') 47 | plt.xlabel('training steps') 48 | plt.grid() 49 | plt.show() -------------------------------------------------------------------------------- /content/5.2_Prioritized_Replay_DQN/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClownW/Reinforcement-learning-with-PyTorch/b1b690a986372e8726df62b86a74baae1e02c88d/content/5.2_Prioritized_Replay_DQN/Figure_1.png -------------------------------------------------------------------------------- /content/5.2_Prioritized_Replay_DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | 10 | class SumTree(object): 11 | 12 | data_pointer = 0 13 | 14 | def __init__(self, capacity): 15 | self.capacity = capacity # for all priority values 16 | self.tree = np.zeros(2 * capacity - 1) 17 | self.data = np.zeros(capacity, dtype=object) # for all transitions 18 | 19 | def add(self, p, data): 20 | tree_idx = self.data_pointer + self.capacity - 1 21 | self.data[self.data_pointer] = data # store transition in self.data 22 | self.update(tree_idx, p) # add p to the tree 23 | self.data_pointer += 1 24 | if self.data_pointer >= self.capacity: 25 | self.data_pointer = 0 26 | 27 | def update(self, tree_idx, p): 28 | change = p - self.tree[tree_idx] 29 | self.tree[tree_idx] = p 30 | while tree_idx != 0: 31 | tree_idx = (tree_idx - 1) // 2 32 | self.tree[tree_idx] += change 33 | 34 | def get_leaf(self, v): 35 | parent_idx = 0 36 | while True: 37 | cl_idx = 2 * parent_idx + 1 # left kid of the parent node 38 | cr_idx = cl_idx + 1 39 | if cl_idx >= len(self.tree): # kid node is out of the tree, so parent is the leaf node 40 | leaf_idx = parent_idx 41 | break 42 | else: # downward search, always search for a higher priority node 43 | if v <= self.tree[cl_idx]: 44 | parent_idx = cl_idx 45 | else: 46 | v -= self.tree[cl_idx] 47 | parent_idx = cr_idx 48 | 49 | data_idx = leaf_idx - self.capacity + 1 50 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 51 | 52 | @property 53 | def total_p(self): 54 | return self.tree[0] 55 | 56 | 57 | class Memory(object): # stored as (s, a, r, s_) in SumTree 58 | epsilon = 0.01 # 
small amount to avoid zero priority 59 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 60 | beta = 0.4 # importance-sampling, from initial value increasing to 1 61 | beta_increment_per_sampling = 0.001 62 | abs_err_upper = 1. # clipped abs error 63 | 64 | def __init__(self, capacity): 65 | self.tree = SumTree(capacity) 66 | 67 | def store(self, transition): 68 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 69 | if max_p == 0: 70 | max_p = self.abs_err_upper 71 | self.tree.add(max_p, transition) # set the max of p for new p 72 | 73 | def sample(self, n): 74 | b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty((n, 1)) 75 | pri_seg = self.tree.total_p / n 76 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max=1 77 | 78 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculation ISweight 79 | for i in range(n): 80 | a, b = pri_seg * i, pri_seg * (i+1) 81 | v = np.random.uniform(a, b) 82 | idx, p, data = self.tree.get_leaf(v) 83 | prob = p / self.tree.total_p 84 | ISWeights[i,0] = np.power(prob/min_prob, -self.beta) 85 | b_idx[i], b_memory[i,:] = idx, data 86 | return b_idx, b_memory, ISWeights 87 | 88 | def batch_update(self, tree_idx, abs_errors): 89 | abs_errors += self.epsilon # convert to abs and avoid 0 90 | clipped_errors = np.minimum(abs_errors.data, self.abs_err_upper) 91 | ps = np.power(clipped_errors, self.alpha) 92 | for ti, p in zip(tree_idx, ps): 93 | self.tree.update(ti, p) 94 | 95 | 96 | class Net(nn.Module): 97 | def __init__(self, n_feature, n_hidden, n_output): 98 | super(Net, self).__init__() 99 | self.el = nn.Linear(n_feature, n_hidden) 100 | self.q = nn.Linear(n_hidden, n_output) 101 | 102 | def forward(self, x): 103 | x = self.el(x) 104 | x = F.relu(x) 105 | x = self.q(x) 106 | return x 107 | 108 | 109 | class DQNPrioritizedReplay: 110 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.005, reward_decay=0.9, e_greedy=0.9, replace_target_iter=500, 111 | memory_size=10000, batch_size=32, e_greedy_increment=None, output_graph=False, prioritized=True): 112 | self.n_actions = n_actions 113 | self.n_features = n_features 114 | self.n_hidden = n_hidden 115 | self.lr = learning_rate 116 | self.gamma = reward_decay 117 | self.epsilon_max = e_greedy 118 | self.replace_target_iter = replace_target_iter 119 | self.memory_size = memory_size 120 | self.batch_size = batch_size 121 | self.epsilon_increment = e_greedy_increment 122 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 123 | 124 | self.prioritized = prioritized 125 | self.learn_step_counter = 0 126 | self._build_net() 127 | 128 | if self.prioritized: 129 | self.memory = Memory(capacity=memory_size) 130 | else: 131 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 132 | 133 | self.cost_his = [] 134 | 135 | def _build_net(self): 136 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions) 137 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions) 138 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 139 | 140 | def store_transition(self, s, a, r, s_): 141 | if self.prioritized: # prioritized replay 142 | transition = np.hstack((s, [a, r], s_)) 143 | self.memory.store(transition) # have high priority for newly arrived transition 144 | else: # random replay 145 | if not hasattr(self, 'memory_counter'): 146 | self.memory_counter = 0 147 | transition = np.hstack((s, [a, 
r], s_)) 148 | index = self.memory_counter % self.memory_size 149 | self.memory[index, :] = transition 150 | self.memory_counter += 1 151 | 152 | def choose_action(self, observation): 153 | observation = torch.Tensor(observation[np.newaxis, :]) 154 | if np.random.uniform() < self.epsilon: 155 | actions_value = self.q_eval(observation) 156 | action = int(torch.max(actions_value, dim=1)[1]) 157 | else: 158 | action = np.random.randint(0, self.n_actions) 159 | return action 160 | 161 | 162 | def learn(self): 163 | if self.learn_step_counter % self.replace_target_iter == 0: 164 | self.q_target.load_state_dict(self.q_eval.state_dict()) 165 | # print("target params replaced\n") 166 | 167 | if self.prioritized: 168 | tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size) 169 | else: 170 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 171 | batch_memory = self.memory[sample_index, :] 172 | 173 | q_next, q_eval = self.q_target(torch.Tensor(batch_memory[:, -self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 174 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 175 | 176 | batch_index = np.arange(self.batch_size, dtype=np.int32) 177 | eval_act_index = batch_memory[:, self.n_features].astype(int) 178 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 179 | q_target[batch_index, eval_act_index] = reward + self.gamma*torch.max(q_next, 1)[0] 180 | 181 | if self.prioritized: 182 | self.abs_errors = torch.sum(torch.abs(q_target-q_eval), dim=1) 183 | # print("ISWeights shape: ", ISWeights.shape, 'q shape: ', ((q_target-q_eval)**2), 'q: ', (q_target-q_eval)) 184 | loss = torch.mean(torch.mean(torch.Tensor(ISWeights) * (q_target-q_eval)**2, dim=1)) 185 | self.memory.batch_update(tree_idx, self.abs_errors) 186 | else: 187 | self.loss_func = nn.MSELoss() 188 | loss = self.loss_func(q_eval, q_target) 189 | 190 | # print("loss: ", loss, self.prioritized) 191 | 192 | self.optimizer.zero_grad() 193 | loss.backward() 194 | self.optimizer.step() 195 | 196 | # increase epsilon 197 | self.cost_his.append(loss) 198 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 199 | self.learn_step_counter += 1 200 | -------------------------------------------------------------------------------- /content/5.2_Prioritized_Replay_DQN/run_MountainCar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DQNPrioritizedReplay 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | env = gym.make("MountainCar-v0") 7 | env = env.unwrapped 8 | env.seed(21) 9 | MEMORY_SIZE = 10000 10 | 11 | RL_natural = DQNPrioritizedReplay(n_actions=3, n_features=2, memory_size=MEMORY_SIZE, e_greedy_increment=0.00005, prioritized=False) 12 | RL_prio = DQNPrioritizedReplay(n_actions=3, n_features=2, memory_size=MEMORY_SIZE, e_greedy_increment=0.00005, prioritized=True) 13 | 14 | def train(RL): 15 | total_steps = 0 16 | steps = [] 17 | episodes = [] 18 | for i_episode in range(20): 19 | observation = env.reset() 20 | while True: 21 | # print("episode: {} | total_steps: {}".format(i_episode, total_steps)) 22 | # if total_steps - MEMORY_SIZE > 8000: env.render() 23 | action = RL.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | if done: reward = 10 26 | RL.store_transition(observation, action, reward, observation_) 27 | if total_steps > MEMORY_SIZE: 28 | RL.learn() 29 | if done: 30 | 
print('episode ', i_episode, ' finished') 31 | steps.append(total_steps) 32 | episodes.append(i_episode) 33 | break 34 | observation = observation_ 35 | total_steps += 1 36 | print("steps for {}th episode: {}".format(i_episode, total_steps)) 37 | return np.vstack((episodes, steps)) 38 | 39 | his_natural = train(RL_natural) 40 | his_prio = train(RL_prio) 41 | 42 | plt.plot(his_natural[0,:], his_natural[1,:]-his_natural[0,:], c='b', label='natural DQN') 43 | plt.plot(his_prio[0,:], his_prio[1,:]-his_prio[0,:], c='r', label='DQN with prioritized replay') 44 | plt.legend(loc='best') 45 | plt.ylabel('total training time') 46 | plt.xlabel('episode') 47 | plt.grid() 48 | plt.show() -------------------------------------------------------------------------------- /content/5.3_Dueling_DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | np.random.seed(1) 7 | torch.manual_seed(1) 8 | 9 | class Net(nn.Module): 10 | def __init__(self, n_feature, n_hidden, n_output, dueling=False): 11 | super(Net, self).__init__() 12 | self.dueling = dueling 13 | self.l1 = nn.Linear(n_feature, n_hidden) 14 | if self.dueling: 15 | self.values = nn.Linear(n_hidden, 1) 16 | self.advantages = nn.Linear(n_hidden, n_output) 17 | else: 18 | self.q = nn.Linear(n_hidden, n_output) 19 | 20 | def forward(self, x): 21 | x = self.l1(x) 22 | x = F.relu(x) 23 | if self.dueling: 24 | value = self.values(x) 25 | advantages = self.advantages(x) 26 | out = value + (advantages-torch.mean(advantages, dim=1, keepdim=True)) 27 | else: 28 | out = self.q(x) 29 | return out 30 | 31 | 32 | class DuelingDQN: 33 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.001, reward_decay=0.9, e_greedy=0.9, 34 | replace_target_iter=200, memory_size=500, batch_size=32, e_greedy_increment=None, dueling=True): 35 | self.n_actions = n_actions 36 | self.n_features = n_features 37 | self.n_hidden = n_hidden 38 | self.lr = learning_rate 39 | self.gamma = reward_decay 40 | self.epsilon_max = e_greedy 41 | self.replace_target_iter = replace_target_iter 42 | self.memory_size = memory_size 43 | self.batch_size = batch_size 44 | self.epsilon_increment = e_greedy_increment 45 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 46 | self.dueling = dueling 47 | 48 | self.learn_step_counter = 0 49 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 50 | self._build_net() 51 | self.cost_his = [] 52 | 53 | def _build_net(self): 54 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions, self.dueling) 55 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions, self.dueling) 56 | 57 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 58 | self.loss_func = nn.MSELoss() 59 | 60 | def store_transition(self, s, a, r, s_): 61 | if not hasattr(self, 'memory_counter'): 62 | self.memory_counter = 0 63 | transition = np.hstack((s, [a, r], s_)) 64 | index = self.memory_counter % self.memory_size 65 | self.memory[index, :] = transition 66 | self.memory_counter += 1 67 | 68 | def choose_action(self, observation): 69 | observation = torch.Tensor(observation[np.newaxis, :]) 70 | if np.random.uniform() < self.epsilon: 71 | actions_value = self.q_eval(observation) 72 | action = torch.max(actions_value, dim=1)[1] 73 | else: 74 | action = np.random.randint(0, self.n_actions) 75 | return action 76 | 77 | def learn(self): 78 | if 
self.learn_step_counter % self.replace_target_iter == 0: 79 | self.q_target.load_state_dict(self.q_eval.state_dict()) 80 | 81 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 82 | batch_memory = self.memory[sample_index, :] 83 | 84 | q_next, q_eval = self.q_target(torch.Tensor(batch_memory[:, -self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 85 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 86 | 87 | batch_index = np.arange(self.batch_size, dtype=np.int32) 88 | eval_act_index = batch_memory[:, self.n_features].astype(int) 89 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 90 | 91 | q_target[batch_index, eval_act_index] = reward + self.gamma*torch.max(q_next, dim=1)[0] 92 | 93 | loss = self.loss_func(q_eval, q_target) 94 | self.optimizer.zero_grad() 95 | loss.backward() 96 | self.optimizer.step() 97 | 98 | self.cost_his.append(loss) 99 | 100 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 101 | self.learn_step_counter += 1 -------------------------------------------------------------------------------- /content/5.3_Dueling_DQN/action15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClownW/Reinforcement-learning-with-PyTorch/b1b690a986372e8726df62b86a74baae1e02c88d/content/5.3_Dueling_DQN/action15.png -------------------------------------------------------------------------------- /content/5.3_Dueling_DQN/run_Pendulum.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DuelingDQN 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import torch 6 | 7 | env = gym.make('Pendulum-v0') 8 | env = env.unwrapped 9 | env.seed(1) 10 | MEMORY_SIZE = 3000 11 | ACTION_SPACE = 25 12 | 13 | natural_DQN = DuelingDQN(n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, e_greedy_increment=0.001, dueling=False) 14 | dueling_DQN = DuelingDQN(n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, e_greedy_increment=0.001, dueling=True) 15 | 16 | def train(RL): 17 | acc_r = [0] 18 | total_steps = 0 19 | observation = env.reset() 20 | while True: 21 | # if total_steps-MEMORY_SIZE > 9000: env.render() 22 | 23 | action = RL.choose_action(observation) 24 | 25 | f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) 26 | observation_, reward, done, info = env.step(np.array([f_action])) 27 | 28 | reward /= 10 # normalize to a range of (-1, 0) 29 | acc_r.append(reward + acc_r[-1]) # accumulated reward 30 | 31 | RL.store_transition(observation, action, reward, observation_) 32 | 33 | if total_steps > MEMORY_SIZE: 34 | RL.learn() 35 | 36 | if total_steps-MEMORY_SIZE > 15000: 37 | break 38 | 39 | observation = observation_ 40 | total_steps += 1 41 | return RL.cost_his, acc_r 42 | 43 | c_natural, r_natural = train(natural_DQN) 44 | print("start training dueling DQN! 
") 45 | c_dueling, r_dueling = train(dueling_DQN) 46 | 47 | plt.figure(1) 48 | plt.plot(np.array(c_natural), c='r', label='natural') 49 | plt.plot(np.array(c_dueling), c='b', label='dueling') 50 | plt.legend(loc='best') 51 | plt.ylabel('cost') 52 | plt.xlabel('training steps') 53 | plt.grid() 54 | 55 | plt.figure(2) 56 | plt.plot(np.array(r_natural), c='r', label='natural') 57 | plt.plot(np.array(r_dueling), c='b', label='dueling') 58 | plt.legend(loc='best') 59 | plt.ylabel('accumulated reward') 60 | plt.xlabel('training steps') 61 | plt.grid() 62 | 63 | plt.show() -------------------------------------------------------------------------------- /content/5_Deep_Q_Network/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import matplotlib.pyplot as plt 7 | 8 | import copy 9 | 10 | np.random.seed(1) 11 | torch.manual_seed(1) 12 | 13 | # define the network architecture 14 | class Net(nn.Module): 15 | def __init__(self, n_feature, n_hidden, n_output): 16 | super(Net, self).__init__() 17 | self.el = nn.Linear(n_feature, n_hidden) 18 | self.q = nn.Linear(n_hidden, n_output) 19 | 20 | def forward(self, x): 21 | x = self.el(x) 22 | x = F.relu(x) 23 | x = self.q(x) 24 | return x 25 | 26 | 27 | class DeepQNetwork(): 28 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, 29 | replace_target_iter=200, memory_size=500, batch_size=32, e_greedy_increment=None, 30 | ): 31 | self.n_actions = n_actions 32 | self.n_features = n_features 33 | self.n_hidden = n_hidden 34 | self.lr = learning_rate 35 | self.gamma = reward_decay 36 | self.epsilon_max = e_greedy 37 | self.replace_target_iter = replace_target_iter 38 | self.memory_size = memory_size 39 | self.batch_size = batch_size 40 | self.epsilon_increment = e_greedy_increment 41 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 42 | 43 | # total learning step 44 | self.learn_step_counter = 0 45 | 46 | # initialize zero memory [s, a, r, s_] 47 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 48 | 49 | self.loss_func = nn.MSELoss() 50 | self.cost_his = [] 51 | 52 | self._build_net() 53 | 54 | 55 | def _build_net(self): 56 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions) 57 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions) 58 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 59 | 60 | def store_transition(self, s, a, r, s_): 61 | if not hasattr(self, 'memory_counter'): 62 | self.memory_counter = 0 63 | transition = np.hstack((s, [a, r], s_)) 64 | # replace the old memory with new memory 65 | index = self.memory_counter % self.memory_size 66 | self.memory[index, :] = transition 67 | self.memory_counter += 1 68 | 69 | def choose_action(self, observation): 70 | observation = torch.Tensor(observation[np.newaxis, :]) 71 | if np.random.uniform() < self.epsilon: 72 | actions_value = self.q_eval(observation) 73 | 74 | action = np.argmax(actions_value.data.numpy()) 75 | else: 76 | action = np.random.randint(0, self.n_actions) 77 | return action 78 | 79 | def learn(self): 80 | # check to replace target parameters 81 | if self.learn_step_counter % self.replace_target_iter == 0: 82 | self.q_target.load_state_dict(self.q_eval.state_dict()) 83 | print("\ntarget params replaced\n") 84 | 85 | # sample batch memory from all memory 86 | if 
self.memory_counter > self.memory_size: 87 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 88 | else: 89 | sample_index = np.random.choice(self.memory_counter, size=self.batch_size) 90 | batch_memory = self.memory[sample_index, :] 91 | 92 | # q_next is used for getting which action would be choosed by target network in state s_(t+1) 93 | q_next, q_eval = self.q_target(torch.Tensor(batch_memory[:, -self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 94 | # used for calculating y, we need to copy for q_eval because this operation could keep the Q_value that has not been selected unchanged, 95 | # so when we do q_target - q_eval, these Q_value become zero and wouldn't affect the calculation of the loss 96 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 97 | 98 | batch_index = np.arange(self.batch_size, dtype=np.int32) 99 | eval_act_index = batch_memory[:, self.n_features].astype(int) 100 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 101 | q_target[batch_index, eval_act_index] = reward + self.gamma*torch.max(q_next, 1)[0] 102 | 103 | loss = self.loss_func(q_eval, q_target) 104 | self.optimizer.zero_grad() 105 | loss.backward() 106 | self.optimizer.step() 107 | 108 | # increase epsilon 109 | self.cost_his.append(loss) 110 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 111 | self.learn_step_counter += 1 112 | 113 | def plot_cost(self): 114 | plt.plot(np.arange(len(self.cost_his)), self.cost_his) 115 | plt.ylabel('Cost') 116 | plt.xlabel('training steps') 117 | plt.show() -------------------------------------------------------------------------------- /content/5_Deep_Q_Network/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | UNIT = 40 # pixels 10 | MAZE_H = 4 # grid height 11 | MAZE_W = 4 # grid width 12 | 13 | 14 | class Maze(tk.Tk, object): 15 | def __init__(self): 16 | super(Maze, self).__init__() 17 | self.action_space = ['u', 'd', 'l', 'r'] 18 | self.n_actions = len(self.action_space) 19 | self.n_features = 2 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | # hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | # self.hell2 = self.canvas.create_rectangle( 49 | # hell2_center[0] - 15, hell2_center[1] - 15, 50 | # hell2_center[0] + 15, hell2_center[1] + 15, 51 | # fill='black') 52 | 53 | # create oval 54 | oval_center = origin + UNIT * 2 55 | self.oval = self.canvas.create_oval( 56 | oval_center[0] - 15, oval_center[1] - 15, 57 | oval_center[0] + 15, 
oval_center[1] + 15, 58 | fill='yellow') 59 | 60 | # create red rect 61 | self.rect = self.canvas.create_rectangle( 62 | origin[0] - 15, origin[1] - 15, 63 | origin[0] + 15, origin[1] + 15, 64 | fill='red') 65 | 66 | # pack all 67 | self.canvas.pack() 68 | 69 | def reset(self): 70 | self.update() 71 | time.sleep(0.1) 72 | self.canvas.delete(self.rect) 73 | origin = np.array([20, 20]) 74 | self.rect = self.canvas.create_rectangle( 75 | origin[0] - 15, origin[1] - 15, 76 | origin[0] + 15, origin[1] + 15, 77 | fill='red') 78 | # return observation 79 | return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) 80 | 81 | def step(self, action): 82 | s = self.canvas.coords(self.rect) 83 | base_action = np.array([0, 0]) 84 | if action == 0: # up 85 | if s[1] > UNIT: 86 | base_action[1] -= UNIT 87 | elif action == 1: # down 88 | if s[1] < (MAZE_H - 1) * UNIT: 89 | base_action[1] += UNIT 90 | elif action == 2: # right 91 | if s[0] < (MAZE_W - 1) * UNIT: 92 | base_action[0] += UNIT 93 | elif action == 3: # left 94 | if s[0] > UNIT: 95 | base_action[0] -= UNIT 96 | 97 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 98 | 99 | next_coords = self.canvas.coords(self.rect) # next state 100 | 101 | # reward function 102 | if next_coords == self.canvas.coords(self.oval): 103 | reward = 1 104 | done = True 105 | elif next_coords in [self.canvas.coords(self.hell1)]: 106 | reward = -1 107 | done = True 108 | else: 109 | reward = 0 110 | done = False 111 | s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) 112 | return s_, reward, done 113 | 114 | def render(self): 115 | # time.sleep(0.01) 116 | self.update() -------------------------------------------------------------------------------- /content/5_Deep_Q_Network/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import DeepQNetwork 3 | 4 | def run_maze(): 5 | step = 0 6 | for episode in range(300): 7 | print("episode: {}".format(episode)) 8 | observation = env.reset() 9 | while True: 10 | print("step: {}".format(step)) 11 | env.render() 12 | action = RL.choose_action(observation) 13 | observation_, reward, done = env.step(action) 14 | RL.store_transition(observation, action, reward, observation_) 15 | if (step>200) and (step%5==0): 16 | RL.learn() 17 | observation = observation_ 18 | if done: 19 | break 20 | step += 1 21 | print('game over') 22 | env.destroy() 23 | 24 | if __name__ == '__main__': 25 | env = Maze() 26 | RL = DeepQNetwork(env.n_actions, env.n_features, 27 | learning_rate=0.01, 28 | reward_decay=0.9, 29 | e_greedy=0.9, 30 | replace_target_iter=200, 31 | memory_size=2000 32 | ) 33 | env.after(100, run_maze) 34 | env.mainloop() 35 | RL.plot_cost() -------------------------------------------------------------------------------- /content/7_Policy_gradient_softmax/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | np.random.seed(1) 7 | torch.manual_seed(1) 8 | 9 | 10 | class Net(nn.Module): 11 | def __init__(self, n_feature, n_hidden, n_output): 12 | super(Net, self).__init__() 13 | self.layer = nn.Linear(n_feature, n_hidden) 14 | self.all_act = nn.Linear(n_hidden, n_output) 15 | 16 | def forward(self, x): 17 | x = self.layer(x) 18 | x = torch.tanh(x) 19 | x = self.all_act(x) 20 | return x 
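# ---------------------------------------------------------------------------
# Illustrative aside (not part of the original repository file): the
# PolicyGradient class below relies on the identity that, for a softmax policy,
# F.cross_entropy(logits, action) equals -log pi(action | s). Scaling that
# negative log-probability by the discounted, normalized return vt gives the
# REINFORCE loss used in PolicyGradient.learn(). The small sanity check below
# is a sketch with made-up numbers; it reuses the torch / F imports at the top
# of this file and runs only when the module is executed directly.
if __name__ == "__main__":
    demo_logits = torch.tensor([[1.0, 2.0, 0.5]])  # fake action scores for one state
    demo_action = torch.tensor([1])                # index of the action that was taken
    ce = F.cross_entropy(demo_logits, demo_action, reduction='none')
    manual = -torch.log(F.softmax(demo_logits, dim=1))[0, demo_action]
    print(ce, manual)                              # both tensors hold the same value
# ---------------------------------------------------------------------------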
21 | 22 | 23 | 24 | 25 | class PolicyGradient: 26 | def __init__(self, n_actions, n_features, n_hidden=10, learning_rate=0.01, reward_decay=0.95): 27 | self.n_actions = n_actions 28 | self.n_features = n_features 29 | self.n_hidden = n_hidden 30 | self.lr = learning_rate 31 | self.gamma = reward_decay 32 | 33 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] 34 | 35 | self._build_net() 36 | 37 | def _build_net(self): 38 | self.net = Net(self.n_features, self.n_hidden, self.n_actions) 39 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr) 40 | 41 | def choose_action(self, observation): 42 | observation = torch.Tensor(observation[np.newaxis, :]) 43 | prob_weights = self.net(observation) 44 | prob = F.softmax(prob_weights, dim=1) 45 | action = np.random.choice(range(prob_weights.shape[1]), p=prob.data.numpy().ravel()) 46 | return action 47 | 48 | def store_transition(self, s, a, r): 49 | self.ep_obs.append(s) 50 | self.ep_as.append(a) 51 | self.ep_rs.append(r) 52 | 53 | def learn(self): 54 | # discount and normalize episode reward 55 | discounted_ep_rs_norm = self._discount_and_norm_rewards() 56 | obs = torch.Tensor(np.vstack(self.ep_obs)) 57 | acts = torch.Tensor(np.array(self.ep_as)) 58 | vt = torch.Tensor(discounted_ep_rs_norm) 59 | 60 | all_act = self.net(obs) 61 | 62 | # cross_entropy combines nn.LogSoftmax() and nn.NLLLoss() in a single call 63 | neg_log_prob = F.cross_entropy(all_act, acts.long(), reduction='none') 64 | loss = torch.mean(neg_log_prob * vt) 65 | 66 | self.optimizer.zero_grad() 67 | loss.backward() 68 | self.optimizer.step() 69 | 70 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] 71 | return discounted_ep_rs_norm 72 | 73 | def _discount_and_norm_rewards(self): 74 | discounted_ep_rs = np.zeros_like(self.ep_rs) 75 | running_add = 0 76 | for t in reversed(range(len(self.ep_rs))): 77 | running_add = running_add*self.gamma + self.ep_rs[t] 78 | discounted_ep_rs[t] = running_add 79 | 80 | discounted_ep_rs -= np.mean(discounted_ep_rs) 81 | discounted_ep_rs /= np.std(discounted_ep_rs) 82 | return discounted_ep_rs 83 | -------------------------------------------------------------------------------- /content/7_Policy_gradient_softmax/run_CartPole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import PolicyGradient 3 | import matplotlib.pyplot as plt 4 | 5 | DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater than this threshold 6 | RENDER = False # rendering wastes time 7 | 8 | env = gym.make('CartPole-v0') 9 | env.seed(1) 10 | env = env.unwrapped 11 | 12 | print(env.action_space) 13 | print(env.observation_space) 14 | print(env.observation_space.high) 15 | print(env.observation_space.low) 16 | 17 | RL = PolicyGradient(n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.99) 18 | 19 | for i_episode in range(3000): 20 | observation = env.reset() 21 | 22 | while True: 23 | if RENDER: env.render() 24 | 25 | action = RL.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | RL.store_transition(observation, action, reward) 28 | 29 | if done: 30 | ep_rs_sum = sum(RL.ep_rs) 31 | 32 | if 'running_reward' not in globals(): 33 | running_reward = ep_rs_sum 34 | else: 35 | running_reward = running_reward*0.99 + ep_rs_sum*0.01 36 | 37 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 38 | print("episode:", i_episode, " reward:", int(running_reward)) 39 | 40 | vt
= RL.learn() 41 | 42 | if i_episode == 0: 43 | plt.plot(vt) 44 | plt.xlabel('episode steps') 45 | plt.ylabel('normalized state-action value') 46 | plt.show() 47 | break 48 | 49 | observation = observation_ 50 | -------------------------------------------------------------------------------- /content/7_Policy_gradient_softmax/run_MountainCar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import PolicyGradient 3 | import matplotlib.pyplot as plt 4 | 5 | DISPLAY_REWARD_THRESHOLD = -2000 6 | RENDER = False 7 | 8 | env = gym.make('MountainCar-v0') 9 | env.seed(1) 10 | env = env.unwrapped 11 | 12 | print(env.action_space) 13 | print(env.observation_space) 14 | print(env.observation_space.high) 15 | print(env.observation_space.low) 16 | 17 | RL = PolicyGradient(n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.995) 18 | 19 | for i_episode in range(1000): 20 | observation = env.reset() 21 | 22 | while True: 23 | if RENDER: env.render() 24 | 25 | action = RL.choose_action(observation) 26 | 27 | observation_, reward, done, info = env.step(action) 28 | 29 | RL.store_transition(observation, action, reward) 30 | 31 | if done: 32 | # calculate running reward 33 | ep_rs_sum = sum(RL.ep_rs) 34 | if 'running_reward' not in globals(): 35 | running_reward = ep_rs_sum 36 | else: 37 | running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 38 | 39 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 40 | 41 | print('episode:', i_episode, " reward:", int(running_reward)) 42 | 43 | vt = RL.learn() 44 | 45 | if i_episode == 30: 46 | plt.plot(vt) 47 | plt.xlabel('episode steps') 48 | plt.ylabel('normalized state-action value') 49 | plt.show() 50 | 51 | break 52 | 53 | observation = observation_ -------------------------------------------------------------------------------- /content/8_Actor_Critic_Advantage/AC_CartPole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | np.random.seed(1) 9 | torch.manual_seed(1) 10 | 11 | 12 | MAX_EPISODE = 3000 13 | DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater than this threshold 14 | MAX_EP_STEPS = 1000 # maximum time steps in one episode 15 | RENDER = False # rendering wastes time 16 | GAMMA = 0.9 # reward discount in TD error 17 | LR_A = 0.001 # learning rate for actor 18 | LR_C = 0.01 # learning rate for critic 19 | 20 | 21 | env = gym.make('CartPole-v0') 22 | env.seed(1) # reproducible 23 | env = env.unwrapped 24 | 25 | 26 | N_F = env.observation_space.shape[0] 27 | N_A = env.action_space.n 28 | 29 | 30 | class Net(nn.Module): 31 | def __init__(self, n_feature, n_hidden, n_output, activate=False): 32 | super(Net, self).__init__() 33 | self.l1 = nn.Linear(n_feature, n_hidden) 34 | self.acts_prob = nn.Linear(n_hidden, n_output) 35 | self.activate = activate 36 | 37 | 38 | def forward(self, x): 39 | x = self.l1(x) 40 | x = F.relu(x) 41 | x = self.acts_prob(x) 42 | if self.activate: 43 | x = F.softmax(x, dim=1) 44 | return x 45 | 46 | 47 | class Actor(object): 48 | def __init__(self, n_features, n_actions, n_hidden=20, lr=0.001): 49 | self.n_features = n_features 50 | self.n_actions = n_actions 51 | self.n_hidden = n_hidden 52 | self.lr = lr 53 | 54 | self._build_net() 55 | 56 | 57 | def _build_net(self): 58 | self.actor_net = Net(self.n_features,
self.n_hidden, self.n_actions, activate=True) 59 | self.optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=self.lr) 60 | 61 | 62 | def choose_action(self, s): 63 | s = torch.Tensor(s[np.newaxis, :]) 64 | probs = self.actor_net(s) 65 | return np.random.choice(np.arange(probs.shape[1]), p=probs.data.numpy().ravel()) 66 | 67 | 68 | def learn(self, s, a, td): 69 | s = torch.Tensor(s[np.newaxis, :]) 70 | acts_prob = self.actor_net(s) 71 | log_prob = torch.log(acts_prob[0, a]) 72 | exp_v = torch.mean(log_prob * td) 73 | 74 | loss = -exp_v 75 | self.optimizer.zero_grad() 76 | loss.backward(retain_graph=True) 77 | self.optimizer.step() 78 | 79 | return exp_v 80 | 81 | 82 | class Critic(object): 83 | def __init__(self, n_features, lr=0.01): 84 | self.n_features = n_features 85 | self.lr = lr 86 | 87 | self._build_net() 88 | 89 | 90 | def _build_net(self): 91 | self.critic_net = Net(self.n_features, 20, 1) 92 | self.optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=self.lr) 93 | 94 | 95 | def learn(self, s, r, s_): 96 | s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 97 | v, v_ = self.critic_net(s), self.critic_net(s_) 98 | td_error = r + GAMMA * v_ - v 99 | loss = td_error ** 2 100 | 101 | self.optimizer.zero_grad() 102 | loss.backward(retain_graph=True) 103 | self.optimizer.step() 104 | 105 | return td_error 106 | 107 | 108 | 109 | actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) 110 | critic = Critic(n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor 111 | 112 | for i_episode in range(MAX_EPISODE): 113 | s = env.reset() 114 | t = 0 115 | track_r = [] 116 | 117 | while True: 118 | if RENDER: env.render() 119 | 120 | a = actor.choose_action(s) 121 | 122 | s_, r, done, info = env.step(a) 123 | 124 | if done: r = -20 125 | 126 | track_r.append(r) 127 | 128 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 129 | actor.learn(s, a, td_error) # true_gradient = grad[logPi(s, a) * td_error] 130 | 131 | s = s_ 132 | t += 1 133 | 134 | if done or t>=MAX_EP_STEPS: 135 | ep_rs_sum = sum(track_r) 136 | 137 | if 'running_reward' not in globals(): 138 | running_reward = ep_rs_sum 139 | else: 140 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 141 | 142 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 143 | print("episode: ", i_episode, " reward:", int(running_reward)) 144 | break -------------------------------------------------------------------------------- /content/8_Actor_Critic_Advantage/AC_continue_Pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) # reproducible 9 | 10 | 11 | class Actor_Net(nn.Module): 12 | def __init__(self, n_features, n_hidden, n_outputs): 13 | super(Actor_Net, self).__init__() 14 | self.l1 = nn.Linear(n_features, n_hidden) 15 | self.mu = nn.Linear(n_hidden, n_outputs) 16 | self.sigma = nn.Linear(n_hidden, n_outputs) 17 | 18 | 19 | def forward(self, x): 20 | x = self.l1(x) 21 | x = F.relu(x) 22 | mu = self.mu(x) 23 | mu = torch.tanh(mu) 24 | sigma = self.sigma(x) 25 | sigma = F.softplus(sigma) 26 | 27 | return mu, sigma 28 | 29 | 30 | class Actor(object): 31 | def __init__(self, n_features, action_bound, n_hidden=30, lr=0.0001): 32 | self.n_features = n_features 33 | self.action_bound = action_bound 34 | self.n_hidden = 
n_hidden 35 | self.lr = lr 36 | 37 | self._build_net() 38 | 39 | 40 | def _build_net(self): 41 | self.actor_net = Actor_Net(self.n_features, self.n_hidden, 1) 42 | self.optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=self.lr) 43 | 44 | 45 | def normal_dist(self, s): 46 | s = torch.Tensor(s[np.newaxis, :]) 47 | mu, sigma = self.actor_net(s) 48 | mu, sigma = (mu*2).squeeze(), (sigma+0.1).squeeze() 49 | normal_dist = torch.distributions.Normal(mu, sigma) # build a normal distribution with mean=mu and std=sigma 50 | return normal_dist 51 | 52 | 53 | def choose_action(self, s): 54 | normal_dist = self.normal_dist(s) 55 | self.action = torch.clamp(normal_dist.sample(), self.action_bound[0], self.action_bound[1]) # sample an action according to the distribution 56 | return self.action 57 | 58 | 59 | def learn(self, s, a, td): 60 | normal_dist = self.normal_dist(s) 61 | log_prob = normal_dist.log_prob(a) # log_prob is the log-probability of action a under normal_dist 62 | exp_v = log_prob * td.float() # advantage (TD_error) guided loss 63 | exp_v += 0.01*normal_dist.entropy() # add an entropy bonus to encourage exploration 64 | loss = -exp_v # max(v) = min(-v) 65 | 66 | self.optimizer.zero_grad() 67 | loss.backward() 68 | self.optimizer.step() 69 | 70 | return exp_v 71 | 72 | 73 | class Critic_Net(nn.Module): 74 | def __init__(self, n_features, n_hidden, n_outputs): 75 | super(Critic_Net, self).__init__() 76 | self.l1 = nn.Linear(n_features, n_hidden) 77 | self.v = nn.Linear(n_hidden, n_outputs) 78 | 79 | 80 | def forward(self, x): 81 | x = self.l1(x) 82 | x = F.relu(x) 83 | x = self.v(x) 84 | return x 85 | 86 | 87 | class Critic(object): 88 | def __init__(self, n_features, n_hidden=30, n_output=1, lr=0.01): 89 | self.n_features = n_features 90 | self.n_hidden = n_hidden 91 | self.n_output = n_output 92 | self.lr = lr 93 | 94 | self._build_net() 95 | 96 | 97 | def _build_net(self): 98 | self.critic_net = Critic_Net(self.n_features, self.n_hidden, self.n_output) 99 | self.optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=self.lr) 100 | 101 | 102 | def learn(self, s, r, s_): 103 | s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 104 | v, v_ = self.critic_net(s), self.critic_net(s_) 105 | td_error = torch.mean(r + GAMMA * v_.double() - v.double()) 106 | loss = td_error ** 2 107 | 108 | self.optimizer.zero_grad() 109 | loss.backward(retain_graph=True) 110 | self.optimizer.step() 111 | 112 | return td_error 113 | 114 | 115 | MAX_EPISODE = 1000 116 | MAX_EP_STEPS = 200 117 | DISPLAY_REWARD_THRESHOLD = -100 118 | RENDER = False 119 | GAMMA = 0.9 120 | LR_A = 0.001 121 | LR_C = 0.01 122 | 123 | 124 | env = gym.make('Pendulum-v0') 125 | env.seed(1) 126 | env = env.unwrapped 127 | 128 | 129 | N_S = env.observation_space.shape[0] 130 | A_BOUND = env.action_space.high 131 | 132 | 133 | actor = Actor(n_features=N_S, lr=LR_A, action_bound=[float(-A_BOUND), float(A_BOUND)]) 134 | critic = Critic(n_features=N_S, lr=LR_C) 135 | 136 | 137 | for i_episode in range(MAX_EPISODE): 138 | s = env.reset() 139 | t = 0 140 | ep_rs = [] 141 | while True: 142 | if RENDER: env.render() 143 | a = actor.choose_action(s) 144 | 145 | s_, r, done, info = env.step(a) 146 | r /= 10 147 | 148 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 149 | actor.learn(s, a, td_error) # gradient = grad[logPi(s, a) * td_error] 150 | 151 | s = s_ 152 | t += 1 153 | ep_rs.append(r) 154 | if t > MAX_EP_STEPS: 155 | ep_rs_sum = sum(ep_rs) 156 | if
'running_reward' not in globals(): 157 | running_reward = ep_rs_sum 158 | else: 159 | running_reward = running_reward * 0.9 + ep_rs_sum * 0.1 160 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 161 | print('episode: ', i_episode, ' reward:', int(running_reward)) 162 | break --------------------------------------------------------------------------------
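A note on the actor-critic examples above: both AC_CartPole.py and AC_continue_Pendulum.py compute the TD error delta = r + GAMMA * V(s_) - V(s) in the critic, minimize delta squared as the critic loss, and reuse the same delta (hence the retain_graph=True calls) as the advantage that scales the actor's -log pi(a|s) objective. The sketch below is a minimal, self-contained restatement of that one-step update for a discrete policy; it is not part of the repository, the network and variable names are illustrative, and it differs from the scripts above in that it detaches V(s_) in the TD target and detaches delta before the actor loss, so each optimizer touches only its own network and no retained graph is needed.

import torch
import torch.nn as nn

GAMMA = 0.9  # reward discount, matching the scripts above

# illustrative value and policy networks (sizes chosen for CartPole: 4 features, 2 actions)
critic_net = nn.Sequential(nn.Linear(4, 20), nn.ReLU(), nn.Linear(20, 1))
actor_net = nn.Sequential(nn.Linear(4, 20), nn.ReLU(), nn.Linear(20, 2))
critic_opt = torch.optim.Adam(critic_net.parameters(), lr=0.01)
actor_opt = torch.optim.Adam(actor_net.parameters(), lr=0.001)

def ac_update(s, a, r, s_):
    """One advantage actor-critic step; s and s_ are 1-D numpy observations, a is an int action."""
    s, s_ = torch.Tensor(s[None, :]), torch.Tensor(s_[None, :])

    # critic: minimize the squared TD error delta = r + GAMMA * V(s_) - V(s)
    td_error = r + GAMMA * critic_net(s_).detach() - critic_net(s)
    critic_loss = td_error.pow(2).mean()
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # actor: maximize log pi(a|s) * delta, with delta treated as a constant advantage
    log_prob = torch.log_softmax(actor_net(s), dim=1)[0, a]
    actor_loss = -(log_prob * td_error.detach()).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

With the conventions of the scripts above, ac_update(s, a, r, s_) would be called once per environment transition inside the episode loop, in place of the separate critic.learn / actor.learn calls.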