├── README.md
└── content
    ├── 1_treasure_on_right
    │   └── treasure_on_right.py
    ├── 2_Q-learning-maze
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 3_Sarsa_maze
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 4_Sarsa_lambda_maze
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 5.1_double_DQN
    │   ├── RL_brain.py
    │   └── run_Pendulum.py
    ├── 5.2_Prioritized_Replay_DQN
    │   ├── Figure_1.png
    │   ├── RL_brain.py
    │   └── run_MountainCar.py
    ├── 5.3_Dueling_DQN
    │   ├── RL_brain.py
    │   ├── action15.png
    │   └── run_Pendulum.py
    ├── 5_Deep_Q_Network
    │   ├── RL_brain.py
    │   ├── maze_env.py
    │   └── run_this.py
    ├── 7_Policy_gradient_softmax
    │   ├── RL_brain.py
    │   ├── run_CartPole.py
    │   └── run_MountainCar.py
    └── 8_Actor_Critic_Advantage
        ├── AC_CartPole.py
        └── AC_continue_Pendulum.py
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement-learning-with-PyTorch
2 | Reinforcement learning with PyTorch, inspired by [MorvanZhou](https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow), with the framework changed from TensorFlow to PyTorch.
3 | 
--------------------------------------------------------------------------------
/content/1_treasure_on_right/treasure_on_right.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | import pandas as pd
4 | 
5 | np.random.seed(2)
6 | 
7 | N_STATES = 6
8 | ACTIONS = ['left', 'right']
9 | MAX_EPISODES = 13
10 | FRESH_TIME = 0.3
11 | EPSILON = 0.9
12 | ALPHA = 0.1
13 | GAMMA = 0.9
14 | 
15 | def update_env(S, episode, step_counter):
16 |     env_list = ['-']*(N_STATES-1) + ['T']
17 |     if S == 'terminal':
18 |         interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
19 |         print('\r{}'.format(interaction),end='')
20 |         time.sleep(2)
21 |         print('\r ', end='')
22 |     else:
23 |         env_list[S] = 'o'
24 |         interaction = ''.join(env_list)
25 |         print('\r{}'.format(interaction), end='')
26 |         time.sleep(FRESH_TIME)
27 | 
28 | def get_env_feedback(S,A):
29 |     if A == 'right':
30 |         if S == N_STATES - 2:
31 |             S_ = 'terminal'
32 |             R = 1
33 |         else:
34 |             S_ = S + 1
35 |             R = 0
36 |     else:
37 |         R = 0
38 |         if S == 0:
39 |             S_ = S
40 |         else:
41 |             S_ = S - 1
42 |     return S_, R
43 | 
44 | def build_q_table(n_states, actions):
45 |     table = pd.DataFrame(np.zeros((n_states, len(actions))),columns=actions)
46 |     return table
47 | 
48 | def choose_action(state, q_table):
49 |     state_actions = q_table.iloc[state,:]
50 |     if (np.random.uniform()>EPSILON) or ((state_actions == 0).all()):
51 |         action_name = np.random.choice(ACTIONS)
52 |     else:
53 |         action_name = state_actions.idxmax()
54 |     return action_name
55 | 
56 | def rl():
57 |     q_table = build_q_table(N_STATES, ACTIONS)
58 |     for episode in range(MAX_EPISODES):
59 |         # print("episode: ", episode)
60 |         # print("q_table: ", q_table)
61 |         step_counter = 0
62 |         S = 0
63 |         is_terminated = False
64 |         update_env(S, episode, step_counter)
65 |         while not is_terminated:
66 |             A = choose_action(S, q_table)
67 |             S_, R = get_env_feedback(S, A)
68 |             q_predict = q_table.loc[S,A]
69 |             if S_ != 'terminal':
70 |                 q_target = R + GAMMA*q_table.iloc[S_,:].max()
71 |             else:
72 |                 q_target = R
73 |                 is_terminated = True
74 | 
75 |             q_table.loc[S,A] += ALPHA*(q_target - q_predict)
76 |             S = S_
77 | 
78 |             update_env(S, episode, step_counter+1)
79 |             step_counter += 1
80 |     return q_table
81 | 
82 | if __name__ == "__main__":
83 |     q_table = rl()
84 |     print('\r\nQ-table:\n')
85 |     print(q_table)
--------------------------------------------------------------------------------
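The script above learns a tabular Q-function for a one-dimensional corridor in which only stepping right from the second-to-last state pays reward 1, so after the 13 training episodes the greedy action should generally favour 'right'. A quick, hypothetical sanity check on the DataFrame returned by rl() (not part of the repository):

# Hypothetical check, assuming treasure_on_right.py has just been run in the same session
greedy_policy = q_table.idxmax(axis=1)   # best action per state according to the learned values
print(greedy_policy)                     # expected to lean towards 'right' once values have propagated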
/content/2_Q-learning-maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class QLearningTable: 5 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 6 | self.actions = actions 7 | self.lr = learning_rate 8 | self.gamma = reward_decay 9 | self.epsilon = e_greedy 10 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 11 | 12 | def choose_action(self, observation): 13 | self.check_state_exist(observation) 14 | if np.random.uniform() < self.epsilon: 15 | state_action = self.q_table.loc[observation,:] 16 | action = np.random.choice(state_action[state_action==np.max(state_action)].index) 17 | else: 18 | action = np.random.choice(self.actions) 19 | return action 20 | 21 | def learn(self, s, a, r, s_): 22 | self.check_state_exist(s_) 23 | q_predict = self.q_table.loc[s, a] 24 | if s_ != 'terminal': 25 | q_target = r + self.gamma*self.q_table.loc[s_, :].max() 26 | else: 27 | q_target = r 28 | self.q_table.loc[s, a] += self.lr*(q_target-q_predict) 29 | 30 | def check_state_exist(self, state): 31 | if state not in self.q_table.index: 32 | self.q_table = self.q_table.append(pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state)) -------------------------------------------------------------------------------- /content/2_Q-learning-maze/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | 10 | UNIT = 40 # pixels 11 | MAZE_H = 8 # grid height 12 | MAZE_W = 8 # grid width 13 | 14 | 15 | class Maze(tk.Tk, object): 16 | def __init__(self): 17 | super(Maze, self).__init__() 18 | self.action_space = ['u', 'd', 'l', 'r'] 19 | self.n_actions = len(self.action_space) 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | self.hell2 = self.canvas.create_rectangle( 49 | hell2_center[0] - 15, hell2_center[1] - 15, 50 | hell2_center[0] + 15, hell2_center[1] + 15, 51 | fill='black') 52 | 53 | # hell 54 | hell3_center = origin + np.array([UNIT * 2, UNIT * 6]) 55 | self.hell3 = self.canvas.create_rectangle( 56 | hell3_center[0] - 15, hell3_center[1] - 15, 57 | hell3_center[0] + 15, hell3_center[1] + 15, 58 | fill='black') 59 | 60 | # hell 61 | hell4_center = origin + np.array([UNIT * 6, UNIT * 2]) 62 | self.hell4 = self.canvas.create_rectangle( 63 | hell4_center[0] - 15, hell4_center[1] - 15, 64 | hell4_center[0] + 15, hell4_center[1] + 15, 65 | fill='black') 66 | 67 | # hell 68 | hell5_center = origin + 
np.array([UNIT * 4, UNIT * 4]) 69 | self.hell5 = self.canvas.create_rectangle( 70 | hell5_center[0] - 15, hell5_center[1] - 15, 71 | hell5_center[0] + 15, hell5_center[1] + 15, 72 | fill='black') 73 | 74 | # hell 75 | hell6_center = origin + np.array([UNIT * 4, UNIT * 1]) 76 | self.hell6 = self.canvas.create_rectangle( 77 | hell6_center[0] - 15, hell6_center[1] - 15, 78 | hell6_center[0] + 15, hell6_center[1] + 15, 79 | fill='black') 80 | 81 | # hell 82 | hell7_center = origin + np.array([UNIT * 1, UNIT * 3]) 83 | self.hell7 = self.canvas.create_rectangle( 84 | hell7_center[0] - 15, hell7_center[1] - 15, 85 | hell7_center[0] + 15, hell7_center[1] + 15, 86 | fill='black') 87 | 88 | # hell 89 | hell8_center = origin + np.array([UNIT * 2, UNIT * 4]) 90 | self.hell8 = self.canvas.create_rectangle( 91 | hell8_center[0] - 15, hell8_center[1] - 15, 92 | hell8_center[0] + 15, hell8_center[1] + 15, 93 | fill='black') 94 | 95 | # hell 96 | hell9_center = origin + np.array([UNIT * 3, UNIT * 2]) 97 | self.hell9 = self.canvas.create_rectangle( 98 | hell9_center[0] - 15, hell9_center[1] - 15, 99 | hell9_center[0] + 15, hell9_center[1] + 15, 100 | fill='black') 101 | 102 | 103 | 104 | 105 | # create oval 106 | oval_center = origin + UNIT * 3 107 | self.oval = self.canvas.create_oval( 108 | oval_center[0] - 15, oval_center[1] - 15, 109 | oval_center[0] + 15, oval_center[1] + 15, 110 | fill='yellow') 111 | 112 | # create red rect 113 | self.rect = self.canvas.create_rectangle( 114 | origin[0] - 15, origin[1] - 15, 115 | origin[0] + 15, origin[1] + 15, 116 | fill='red') 117 | 118 | # pack all 119 | self.canvas.pack() 120 | 121 | def reset(self): 122 | self.update() 123 | time.sleep(0.5) 124 | self.canvas.delete(self.rect) 125 | origin = np.array([20, 20]) 126 | self.rect = self.canvas.create_rectangle( 127 | origin[0] - 15, origin[1] - 15, 128 | origin[0] + 15, origin[1] + 15, 129 | fill='red') 130 | # return observation 131 | return self.canvas.coords(self.rect) 132 | 133 | def step(self, action): 134 | s = self.canvas.coords(self.rect) 135 | base_action = np.array([0, 0]) 136 | if action == 0: # up 137 | if s[1] > UNIT: 138 | base_action[1] -= UNIT 139 | elif action == 1: # down 140 | if s[1] < (MAZE_H - 1) * UNIT: 141 | base_action[1] += UNIT 142 | elif action == 2: # right 143 | if s[0] < (MAZE_W - 1) * UNIT: 144 | base_action[0] += UNIT 145 | elif action == 3: # left 146 | if s[0] > UNIT: 147 | base_action[0] -= UNIT 148 | 149 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 150 | 151 | s_ = self.canvas.coords(self.rect) # next state 152 | 153 | # reward function 154 | if s_ == self.canvas.coords(self.oval): 155 | reward = 1 156 | done = True 157 | s_ = 'terminal' 158 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2), self.canvas.coords(self.hell3), 159 | self.canvas.coords(self.hell4),self.canvas.coords(self.hell5),self.canvas.coords(self.hell6),self.canvas.coords(self.hell7), 160 | self.canvas.coords(self.hell8),self.canvas.coords(self.hell9)]: 161 | reward = -1 162 | done = True 163 | s_ = 'terminal' 164 | else: 165 | reward = 0 166 | done = False 167 | 168 | return s_, reward, done 169 | 170 | def render(self): 171 | time.sleep(0.1) 172 | self.update() 173 | 174 | 175 | def update(): 176 | for t in range(10): 177 | s = env.reset() 178 | while True: 179 | env.render() 180 | a = 1 181 | s, r, done = env.step(a) 182 | if done: 183 | break 184 | 185 | if __name__ == '__main__': 186 | env = Maze() 187 | env.after(100, update) 188 | env.mainloop() 
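Note on the tabular agents: check_state_exist in RL_brain.py above (and in the Sarsa and Sarsa(lambda) agents later) grows the Q-table with DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0, so these examples fail on current pandas. A minimal sketch of an equivalent row insertion using pd.concat (a hypothetical replacement, not code from the repository):

import numpy as np
import pandas as pd

actions = [0, 1, 2, 3]
q_table = pd.DataFrame(columns=actions, dtype=np.float64)

state = str([5.0, 5.0, 35.0, 35.0])   # example observation string, as produced by str(env.reset())
if state not in q_table.index:
    # build the all-zero row exactly as check_state_exist does, then concat instead of append
    new_row = pd.Series([0] * len(actions), index=q_table.columns, name=state)
    q_table = pd.concat([q_table, new_row.to_frame().T])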
-------------------------------------------------------------------------------- /content/2_Q-learning-maze/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import QLearningTable 3 | 4 | def update(): 5 | for episode in range(150): 6 | observation = env.reset() 7 | print(episode) 8 | while True: 9 | env.render() 10 | action = RL.choose_action(str(observation)) 11 | # print("observation: {}".format(observation)) 12 | observation_, reward, done = env.step(action) 13 | RL.learn(str(observation), action, reward, str(observation_)) 14 | # print(RL.q_table) 15 | observation = observation_ 16 | if done: 17 | break 18 | print('game over') 19 | env.destroy() 20 | 21 | if __name__ == '__main__': 22 | env = Maze() 23 | # print("env.n_actions: {}".format(env.n_actions)) 24 | RL = QLearningTable(actions=list(range(env.n_actions))) 25 | 26 | env.after(100, update) 27 | env.mainloop() -------------------------------------------------------------------------------- /content/3_Sarsa_maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class RL(object): 5 | def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 6 | self.actions = action_space 7 | self.lr = learning_rate 8 | self.gamma = reward_decay 9 | self.epsilon = e_greedy 10 | 11 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 12 | 13 | def check_state_exist(self, state): 14 | if state not in self.q_table.index: 15 | self.q_table = self.q_table.append(pd.Series([0]*len(self.actions),index=self.q_table.columns,name=state)) 16 | 17 | def choose_action(self, observation): 18 | self.check_state_exist(observation) 19 | if np.random.rand() < self.epsilon: 20 | state_action = self.q_table.loc[observation, :] 21 | action = np.random.choice(state_action[state_action == np.max(state_action)].index) 22 | else: 23 | action = np.random.choice(self.actions) 24 | return action 25 | 26 | def learn(self, *args): 27 | pass 28 | 29 | 30 | class QLearningTable(RL): 31 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 32 | super(QLearningTable,self).__init__(actions, learning_rate, reward_decay, e_greedy) 33 | 34 | def learn(self, s, a, r, s_): 35 | self.check_state_exist(s_) 36 | q_predict = self.q_table.loc[s,a] 37 | if s_ != 'terminal': 38 | q_target = r + self.gamma*self.q_table.loc[s_,:].max() 39 | else: 40 | q_target = r 41 | self.q_table.loc[s,a] += self.lr*(q_target-q_predict) 42 | 43 | 44 | class SarsaTable(RL): 45 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 46 | super(SarsaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) 47 | 48 | def learn(self, s, a, r, s_, a_): 49 | self.check_state_exist(s_) 50 | q_predict = self.q_table.loc[s,a] 51 | if s_ != "terminal": 52 | q_target = r + self.gamma * self.q_table.loc[s_, a_] 53 | else: 54 | q_target = r 55 | self.q_table.loc[s,a] += self.lr * (q_target-q_predict) -------------------------------------------------------------------------------- /content/3_Sarsa_maze/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | 10 | UNIT = 40 # pixels 11 | MAZE_H = 8 # grid height 12 | MAZE_W = 8 # grid 
width 13 | 14 | 15 | class Maze(tk.Tk, object): 16 | def __init__(self): 17 | super(Maze, self).__init__() 18 | self.action_space = ['u', 'd', 'l', 'r'] 19 | self.n_actions = len(self.action_space) 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | self.hell2 = self.canvas.create_rectangle( 49 | hell2_center[0] - 15, hell2_center[1] - 15, 50 | hell2_center[0] + 15, hell2_center[1] + 15, 51 | fill='black') 52 | 53 | # hell 54 | hell3_center = origin + np.array([UNIT * 2, UNIT * 6]) 55 | self.hell3 = self.canvas.create_rectangle( 56 | hell3_center[0] - 15, hell3_center[1] - 15, 57 | hell3_center[0] + 15, hell3_center[1] + 15, 58 | fill='black') 59 | 60 | # hell 61 | hell4_center = origin + np.array([UNIT * 6, UNIT * 2]) 62 | self.hell4 = self.canvas.create_rectangle( 63 | hell4_center[0] - 15, hell4_center[1] - 15, 64 | hell4_center[0] + 15, hell4_center[1] + 15, 65 | fill='black') 66 | 67 | # hell 68 | hell5_center = origin + np.array([UNIT * 4, UNIT * 4]) 69 | self.hell5 = self.canvas.create_rectangle( 70 | hell5_center[0] - 15, hell5_center[1] - 15, 71 | hell5_center[0] + 15, hell5_center[1] + 15, 72 | fill='black') 73 | 74 | # hell 75 | hell6_center = origin + np.array([UNIT * 4, UNIT * 1]) 76 | self.hell6 = self.canvas.create_rectangle( 77 | hell6_center[0] - 15, hell6_center[1] - 15, 78 | hell6_center[0] + 15, hell6_center[1] + 15, 79 | fill='black') 80 | 81 | # hell 82 | hell7_center = origin + np.array([UNIT * 1, UNIT * 3]) 83 | self.hell7 = self.canvas.create_rectangle( 84 | hell7_center[0] - 15, hell7_center[1] - 15, 85 | hell7_center[0] + 15, hell7_center[1] + 15, 86 | fill='black') 87 | 88 | # hell 89 | hell8_center = origin + np.array([UNIT * 2, UNIT * 4]) 90 | self.hell8 = self.canvas.create_rectangle( 91 | hell8_center[0] - 15, hell8_center[1] - 15, 92 | hell8_center[0] + 15, hell8_center[1] + 15, 93 | fill='black') 94 | 95 | # hell 96 | hell9_center = origin + np.array([UNIT * 3, UNIT * 2]) 97 | self.hell9 = self.canvas.create_rectangle( 98 | hell9_center[0] - 15, hell9_center[1] - 15, 99 | hell9_center[0] + 15, hell9_center[1] + 15, 100 | fill='black') 101 | 102 | 103 | 104 | 105 | # create oval 106 | oval_center = origin + UNIT * 3 107 | self.oval = self.canvas.create_oval( 108 | oval_center[0] - 15, oval_center[1] - 15, 109 | oval_center[0] + 15, oval_center[1] + 15, 110 | fill='yellow') 111 | 112 | # create red rect 113 | self.rect = self.canvas.create_rectangle( 114 | origin[0] - 15, origin[1] - 15, 115 | origin[0] + 15, origin[1] + 15, 116 | fill='red') 117 | 118 | # pack all 119 | self.canvas.pack() 120 | 121 | def reset(self): 122 | self.update() 123 | time.sleep(0.5) 124 | self.canvas.delete(self.rect) 125 | origin = 
np.array([20, 20]) 126 | self.rect = self.canvas.create_rectangle( 127 | origin[0] - 15, origin[1] - 15, 128 | origin[0] + 15, origin[1] + 15, 129 | fill='red') 130 | # return observation 131 | return self.canvas.coords(self.rect) 132 | 133 | def step(self, action): 134 | s = self.canvas.coords(self.rect) 135 | base_action = np.array([0, 0]) 136 | if action == 0: # up 137 | if s[1] > UNIT: 138 | base_action[1] -= UNIT 139 | elif action == 1: # down 140 | if s[1] < (MAZE_H - 1) * UNIT: 141 | base_action[1] += UNIT 142 | elif action == 2: # right 143 | if s[0] < (MAZE_W - 1) * UNIT: 144 | base_action[0] += UNIT 145 | elif action == 3: # left 146 | if s[0] > UNIT: 147 | base_action[0] -= UNIT 148 | 149 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 150 | 151 | s_ = self.canvas.coords(self.rect) # next state 152 | 153 | # reward function 154 | if s_ == self.canvas.coords(self.oval): 155 | reward = 1 156 | done = True 157 | s_ = 'terminal' 158 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2), self.canvas.coords(self.hell3), 159 | self.canvas.coords(self.hell4),self.canvas.coords(self.hell5),self.canvas.coords(self.hell6),self.canvas.coords(self.hell7), 160 | self.canvas.coords(self.hell8),self.canvas.coords(self.hell9)]: 161 | reward = -1 162 | done = True 163 | s_ = 'terminal' 164 | else: 165 | reward = 0 166 | done = False 167 | 168 | return s_, reward, done 169 | 170 | def render(self): 171 | time.sleep(0.1) 172 | self.update() 173 | 174 | 175 | def update(): 176 | for t in range(10): 177 | s = env.reset() 178 | while True: 179 | env.render() 180 | a = 1 181 | s, r, done = env.step(a) 182 | if done: 183 | break 184 | 185 | if __name__ == '__main__': 186 | env = Maze() 187 | env.after(100, update) 188 | env.mainloop() -------------------------------------------------------------------------------- /content/3_Sarsa_maze/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import SarsaTable,QLearningTable 3 | 4 | def update(): 5 | for episode in range(300): 6 | observation = env.reset() 7 | action = RL.choose_action(str(observation)) 8 | print(episode) 9 | while True: 10 | env.render() 11 | observation_,reward,done = env.step(action) 12 | action_ = RL.choose_action(str(observation_)) 13 | RL.learn(str(observation),action,reward,str(observation_)) 14 | observation = observation_ 15 | action = action_ 16 | 17 | if done: 18 | break 19 | 20 | print('game over') 21 | env.destroy() 22 | 23 | if __name__ == '__main__': 24 | env = Maze() 25 | # RL = SarsaTable(actions=list(range(env.n_actions))) 26 | RL = QLearningTable(actions=list(range(env.n_actions))) 27 | env.after(100, update) 28 | env.mainloop() -------------------------------------------------------------------------------- /content/4_Sarsa_lambda_maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | class RL(object): 5 | def __init__(self, action_space, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 6 | self.actions = action_space 7 | self.lr = learning_rate 8 | self.gamma = reward_decay 9 | self.epsilon = e_greedy 10 | 11 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 12 | 13 | def check_state_exist(self, state): 14 | if state not in self.q_table.index: 15 | self.q_table = self.q_table.append(pd.Series([0]*len(self.actions), index=self.q_table.columns, name=state)) 16 | 17 
| def choose_action(self, observation): 18 | self.check_state_exist(observation) 19 | if np.random.rand() < self.epsilon: 20 | state_action = self.q_table.loc[observation, :] 21 | # print("state_action: {}".format(state_action)) 22 | # print("state_action == np.max(state_action): {}".format(state_action == np.max(state_action))) 23 | print(state_action[state_action==np.max(state_action)]) 24 | action = np.random.choice(state_action[state_action==np.max(state_action)].index) 25 | else: 26 | action = np.random.choice(self.actions) 27 | return action 28 | 29 | def learn(self, *args): 30 | pass 31 | 32 | class SarsaLambdaTable(RL): 33 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, trace_decay=0.9): 34 | super(SarsaLambdaTable, self).__init__(actions, learning_rate, reward_decay, e_greedy) 35 | self.lambda_ = trace_decay 36 | self.eligibility_trace = self.q_table.copy() 37 | 38 | def check_state_exist(self, state): 39 | if state not in self.q_table.index: 40 | to_be_append = pd.Series([0]*len(self.actions), index=self.q_table.columns, name=state) 41 | self.q_table = self.q_table.append(to_be_append) 42 | self.eligibility_trace = self.eligibility_trace.append(to_be_append) 43 | 44 | def learn(self, s, a, r, s_, a_): 45 | self.check_state_exist(s_) 46 | q_predict = self.q_table.loc[s,a] 47 | if s_ != 'terminal': 48 | q_target = r + self.gamma * self.q_table.loc[s_, a_] 49 | else: 50 | q_target = r 51 | 52 | error = q_target - q_predict 53 | 54 | self.eligibility_trace.loc[s,:] *= 0 55 | self.eligibility_trace.loc[s,a] = 1 56 | 57 | self.q_table += self.lr * error * self.eligibility_trace 58 | 59 | self.eligibility_trace *= self.gamma*self.lambda_ -------------------------------------------------------------------------------- /content/4_Sarsa_lambda_maze/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | 10 | UNIT = 40 # pixels 11 | MAZE_H = 8 # grid height 12 | MAZE_W = 8 # grid width 13 | 14 | 15 | class Maze(tk.Tk, object): 16 | def __init__(self): 17 | super(Maze, self).__init__() 18 | self.action_space = ['u', 'd', 'l', 'r'] 19 | self.n_actions = len(self.action_space) 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | self.hell2 = self.canvas.create_rectangle( 49 | hell2_center[0] - 15, hell2_center[1] - 15, 50 | hell2_center[0] + 15, hell2_center[1] + 15, 51 | fill='black') 52 | 53 | # hell 54 | hell3_center = origin + np.array([UNIT * 2, UNIT * 6]) 55 | self.hell3 = self.canvas.create_rectangle( 56 | 
hell3_center[0] - 15, hell3_center[1] - 15, 57 | hell3_center[0] + 15, hell3_center[1] + 15, 58 | fill='black') 59 | 60 | # hell 61 | hell4_center = origin + np.array([UNIT * 6, UNIT * 2]) 62 | self.hell4 = self.canvas.create_rectangle( 63 | hell4_center[0] - 15, hell4_center[1] - 15, 64 | hell4_center[0] + 15, hell4_center[1] + 15, 65 | fill='black') 66 | 67 | # hell 68 | hell5_center = origin + np.array([UNIT * 4, UNIT * 4]) 69 | self.hell5 = self.canvas.create_rectangle( 70 | hell5_center[0] - 15, hell5_center[1] - 15, 71 | hell5_center[0] + 15, hell5_center[1] + 15, 72 | fill='black') 73 | 74 | # hell 75 | hell6_center = origin + np.array([UNIT * 4, UNIT * 1]) 76 | self.hell6 = self.canvas.create_rectangle( 77 | hell6_center[0] - 15, hell6_center[1] - 15, 78 | hell6_center[0] + 15, hell6_center[1] + 15, 79 | fill='black') 80 | 81 | # hell 82 | hell7_center = origin + np.array([UNIT * 1, UNIT * 3]) 83 | self.hell7 = self.canvas.create_rectangle( 84 | hell7_center[0] - 15, hell7_center[1] - 15, 85 | hell7_center[0] + 15, hell7_center[1] + 15, 86 | fill='black') 87 | 88 | # hell 89 | hell8_center = origin + np.array([UNIT * 2, UNIT * 4]) 90 | self.hell8 = self.canvas.create_rectangle( 91 | hell8_center[0] - 15, hell8_center[1] - 15, 92 | hell8_center[0] + 15, hell8_center[1] + 15, 93 | fill='black') 94 | 95 | # hell 96 | hell9_center = origin + np.array([UNIT * 3, UNIT * 2]) 97 | self.hell9 = self.canvas.create_rectangle( 98 | hell9_center[0] - 15, hell9_center[1] - 15, 99 | hell9_center[0] + 15, hell9_center[1] + 15, 100 | fill='black') 101 | 102 | 103 | 104 | 105 | # create oval 106 | oval_center = origin + UNIT * 3 107 | self.oval = self.canvas.create_oval( 108 | oval_center[0] - 15, oval_center[1] - 15, 109 | oval_center[0] + 15, oval_center[1] + 15, 110 | fill='yellow') 111 | 112 | # create red rect 113 | self.rect = self.canvas.create_rectangle( 114 | origin[0] - 15, origin[1] - 15, 115 | origin[0] + 15, origin[1] + 15, 116 | fill='red') 117 | 118 | # pack all 119 | self.canvas.pack() 120 | 121 | def reset(self): 122 | self.update() 123 | time.sleep(0.5) 124 | self.canvas.delete(self.rect) 125 | origin = np.array([20, 20]) 126 | self.rect = self.canvas.create_rectangle( 127 | origin[0] - 15, origin[1] - 15, 128 | origin[0] + 15, origin[1] + 15, 129 | fill='red') 130 | # return observation 131 | return self.canvas.coords(self.rect) 132 | 133 | def step(self, action): 134 | s = self.canvas.coords(self.rect) 135 | base_action = np.array([0, 0]) 136 | if action == 0: # up 137 | if s[1] > UNIT: 138 | base_action[1] -= UNIT 139 | elif action == 1: # down 140 | if s[1] < (MAZE_H - 1) * UNIT: 141 | base_action[1] += UNIT 142 | elif action == 2: # right 143 | if s[0] < (MAZE_W - 1) * UNIT: 144 | base_action[0] += UNIT 145 | elif action == 3: # left 146 | if s[0] > UNIT: 147 | base_action[0] -= UNIT 148 | 149 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 150 | 151 | s_ = self.canvas.coords(self.rect) # next state 152 | 153 | # reward function 154 | if s_ == self.canvas.coords(self.oval): 155 | print("bingo") 156 | reward = 1 157 | done = True 158 | s_ = 'terminal' 159 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2), self.canvas.coords(self.hell3), 160 | self.canvas.coords(self.hell4),self.canvas.coords(self.hell5),self.canvas.coords(self.hell6),self.canvas.coords(self.hell7), 161 | self.canvas.coords(self.hell8),self.canvas.coords(self.hell9)]: 162 | reward = -1 163 | done = True 164 | s_ = 'terminal' 165 | else: 166 | reward = -0.1 
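# note: unlike the Q-learning and Sarsa mazes above (reward 0 per step), this maze charges -0.1 per non-terminal step, nudging the Sarsa(lambda) agent toward shorter paths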
167 | done = False 168 | 169 | return s_, reward, done 170 | 171 | def render(self): 172 | time.sleep(0.1) 173 | self.update() 174 | 175 | 176 | def update(): 177 | for t in range(10): 178 | s = env.reset() 179 | while True: 180 | env.render() 181 | a = 1 182 | s, r, done = env.step(a) 183 | if done: 184 | break 185 | 186 | if __name__ == '__main__': 187 | env = Maze() 188 | env.after(100, update) 189 | env.mainloop() -------------------------------------------------------------------------------- /content/4_Sarsa_lambda_maze/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import SarsaLambdaTable 3 | 4 | def update(): 5 | for episode in range(500): 6 | # print(episode) 7 | observation = env.reset() 8 | action = RL.choose_action(str(observation)) 9 | RL.eligibility_trace *= 0 10 | 11 | step = 0 12 | while True: 13 | step += 1 14 | # print("step: ", step, "action: ", action) 15 | env.render() 16 | observation_, reward, done = env.step(action) 17 | action_ = RL.choose_action(str(observation_)) 18 | RL.learn(str(observation), action, reward, str(observation_), action_) 19 | observation = observation_ 20 | action = action_ 21 | 22 | if done: 23 | break 24 | 25 | print('game over') 26 | env.destroy() 27 | 28 | if __name__ == '__main__': 29 | env = Maze() 30 | RL = SarsaLambdaTable(actions=list(range(env.n_actions))) 31 | env.after(100, update) 32 | env.mainloop() 33 | -------------------------------------------------------------------------------- /content/5.1_double_DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | 10 | class Net(nn.Module): 11 | def __init__(self, n_feature, n_hidden, n_output): 12 | super(Net, self).__init__() 13 | self.el = nn.Linear(n_feature, n_hidden) 14 | self.q = nn.Linear(n_hidden, n_output) 15 | 16 | def forward(self, x): 17 | x = self.el(x) 18 | x = F.relu(x) 19 | x = self.q(x) 20 | return x 21 | 22 | class DoubleDQN(): 23 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.005, reward_decay=0.9, e_greedy=0.9, 24 | replace_target_iter=200, memory_size=3000, batch_size=32, e_greedy_increment=None, double_q=True): 25 | self.n_actions = n_actions 26 | self.n_hidden = n_hidden 27 | self.n_features = n_features 28 | self.lr = learning_rate 29 | self.gamma = reward_decay 30 | self.epsilon_max = e_greedy 31 | self.replace_target_iter = replace_target_iter 32 | self.memory_size = memory_size 33 | self.batch_size = batch_size 34 | self.epsilon_increment = e_greedy_increment 35 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 36 | self.double_q = double_q 37 | 38 | self.learn_step_counter = 0 39 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 40 | self._build_net() 41 | self.cost_his = [] 42 | 43 | def _build_net(self): 44 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions) 45 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions) 46 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 47 | self.loss_func = nn.MSELoss() 48 | 49 | def store_transition(self, s, a, r, s_): 50 | if not hasattr(self, 'memory_counter'): 51 | self.memory_counter = 0 52 | transition = np.hstack((s, [a, r], s_)) 53 | index = self.memory_counter % self.memory_size 54 | 
self.memory[index, :] = transition 55 | self.memory_counter += 1 56 | 57 | def choose_action(self, observation): 58 | observation = torch.Tensor(observation[np.newaxis, :]) 59 | actions_value = self.q_eval(observation) 60 | action = torch.max(actions_value, dim=1)[1] # record action value it get 61 | if not hasattr(self, 'q'): 62 | self.q = [] 63 | self.running_q = 0 64 | self.running_q = self.running_q*0.99 + 0.01 * torch.max(actions_value, dim=1)[0] 65 | self.q.append(self.running_q) 66 | 67 | if np.random.uniform() > self.epsilon: # randomly choose action 68 | action = np.random.randint(0, self.n_actions) 69 | return action 70 | 71 | def learn(self): 72 | if self.learn_step_counter % self.replace_target_iter == 0: 73 | self.q_target.load_state_dict(self.q_eval.state_dict()) 74 | print("\ntarget params replaced\n") 75 | 76 | if self.memory_counter > self.memory_size: 77 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 78 | else: 79 | sample_index = np.random.choice(self.memory_counter, size=self.batch_size) 80 | 81 | batch_memory = self.memory[sample_index, :] 82 | 83 | # q_eval4next is the output of the q_eval network when input s_(t+1) 84 | # q_next is the output of the q_target network when input s_(s+1) 85 | # we use q_eval4next to get which action was choosed by eval network in s_(t+1) 86 | # then we get the Q_value corresponding to that action output by target network 87 | q_next, q_eval4next = self.q_target(torch.Tensor(batch_memory[:,-self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:,-self.n_features:])) 88 | q_eval = self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 89 | 90 | # used for calculating y, we need to copy for q_eval because this operation could keep the Q_value that has not been selected unchanged, 91 | # so when we do q_target - q_eval, these Q_value become zero and wouldn't affect the calculation of the loss 92 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 93 | 94 | batch_index = np.arange(self.batch_size, dtype=np.int32) 95 | eval_act_index = batch_memory[:, self.n_features].astype(int) 96 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 97 | 98 | if self.double_q: 99 | max_act4next = torch.max(q_eval4next, dim=1)[1] 100 | selected_q_next = q_next[batch_index, max_act4next] 101 | else: 102 | selected_q_next = torch.max(q_next, dim=1)[0] 103 | 104 | q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next 105 | 106 | loss = self.loss_func(q_eval, q_target) 107 | self.optimizer.zero_grad() 108 | loss.backward() 109 | self.optimizer.step() 110 | 111 | self.cost_his.append(loss) 112 | 113 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 114 | self.learn_step_counter += 1 115 | 116 | 117 | -------------------------------------------------------------------------------- /content/5.1_double_DQN/run_Pendulum.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DoubleDQN 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | env = gym.make("Pendulum-v0") 7 | env = env.unwrapped 8 | env.seed(1) 9 | MEMORY_SIZE = 3000 10 | ACTION_SPACE = 11 11 | 12 | naturel_DQN = DoubleDQN(n_actions=ACTION_SPACE,n_features=3,memory_size=MEMORY_SIZE,e_greedy_increment=0.001,double_q=False) 13 | double_DQN = DoubleDQN(n_actions=ACTION_SPACE,n_features=3,memory_size=MEMORY_SIZE,e_greedy_increment=0.001,double_q=True) 14 | 15 | def train(RL): 16 | total_steps = 0 
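# the loop below first fills the replay memory (learning starts once total_steps exceeds MEMORY_SIZE) and stops 20000 steps after that point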
17 | observation = env.reset() 18 | while True: 19 | if total_steps-MEMORY_SIZE > 8000: env.render() # show the game when trained for some time 20 | action = RL.choose_action(observation) 21 | f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) # convert to [-2 ~ 2] float actions 22 | observation_, reward, done, info = env.step(np.array([f_action])) 23 | 24 | reward /= 10 #normalize to a range of (-1,0). r = 0 when get upright 25 | # the Q target at upright state will be 0, because Q_target = r + gamma * Qmax(s', a') = 0 + gamma * 0 26 | # so when Q at this state is greater than 0, the agent overestimates the Q. Please refer to the final result. 27 | 28 | RL.store_transition(observation, action, reward, observation_) 29 | 30 | if total_steps > MEMORY_SIZE: 31 | RL.learn() 32 | 33 | if total_steps - MEMORY_SIZE > 20000: 34 | break 35 | 36 | observation = observation_ 37 | total_steps += 1 38 | return RL.q 39 | 40 | q_natural = train(naturel_DQN) 41 | q_double = train(double_DQN) 42 | 43 | plt.plot(np.array(q_natural), c='r', label='natural') 44 | plt.plot(np.array(q_double), c='b', label='double') 45 | plt.legend(loc='best') 46 | plt.ylabel('Q eval') 47 | plt.xlabel('training steps') 48 | plt.grid() 49 | plt.show() -------------------------------------------------------------------------------- /content/5.2_Prioritized_Replay_DQN/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClownW/Reinforcement-learning-with-PyTorch/b1b690a986372e8726df62b86a74baae1e02c88d/content/5.2_Prioritized_Replay_DQN/Figure_1.png -------------------------------------------------------------------------------- /content/5.2_Prioritized_Replay_DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | 10 | class SumTree(object): 11 | 12 | data_pointer = 0 13 | 14 | def __init__(self, capacity): 15 | self.capacity = capacity # for all priority values 16 | self.tree = np.zeros(2 * capacity - 1) 17 | self.data = np.zeros(capacity, dtype=object) # for all transitions 18 | 19 | def add(self, p, data): 20 | tree_idx = self.data_pointer + self.capacity - 1 21 | self.data[self.data_pointer] = data # store transition in self.data 22 | self.update(tree_idx, p) # add p to the tree 23 | self.data_pointer += 1 24 | if self.data_pointer >= self.capacity: 25 | self.data_pointer = 0 26 | 27 | def update(self, tree_idx, p): 28 | change = p - self.tree[tree_idx] 29 | self.tree[tree_idx] = p 30 | while tree_idx != 0: 31 | tree_idx = (tree_idx - 1) // 2 32 | self.tree[tree_idx] += change 33 | 34 | def get_leaf(self, v): 35 | parent_idx = 0 36 | while True: 37 | cl_idx = 2 * parent_idx + 1 # left kid of the parent node 38 | cr_idx = cl_idx + 1 39 | if cl_idx >= len(self.tree): # kid node is out of the tree, so parent is the leaf node 40 | leaf_idx = parent_idx 41 | break 42 | else: # downward search, always search for a higher priority node 43 | if v <= self.tree[cl_idx]: 44 | parent_idx = cl_idx 45 | else: 46 | v -= self.tree[cl_idx] 47 | parent_idx = cr_idx 48 | 49 | data_idx = leaf_idx - self.capacity + 1 50 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 51 | 52 | @property 53 | def total_p(self): 54 | return self.tree[0] 55 | 56 | 57 | class Memory(object): # stored as (s, a, r, s_) in SumTree 58 | epsilon = 0.01 # 
small amount to avoid zero priority 59 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 60 | beta = 0.4 # importance-sampling, from initial value increasing to 1 61 | beta_increment_per_sampling = 0.001 62 | abs_err_upper = 1. # clipped abs error 63 | 64 | def __init__(self, capacity): 65 | self.tree = SumTree(capacity) 66 | 67 | def store(self, transition): 68 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 69 | if max_p == 0: 70 | max_p = self.abs_err_upper 71 | self.tree.add(max_p, transition) # set the max of p for new p 72 | 73 | def sample(self, n): 74 | b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty((n, 1)) 75 | pri_seg = self.tree.total_p / n 76 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max=1 77 | 78 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculation ISweight 79 | for i in range(n): 80 | a, b = pri_seg * i, pri_seg * (i+1) 81 | v = np.random.uniform(a, b) 82 | idx, p, data = self.tree.get_leaf(v) 83 | prob = p / self.tree.total_p 84 | ISWeights[i,0] = np.power(prob/min_prob, -self.beta) 85 | b_idx[i], b_memory[i,:] = idx, data 86 | return b_idx, b_memory, ISWeights 87 | 88 | def batch_update(self, tree_idx, abs_errors): 89 | abs_errors += self.epsilon # convert to abs and avoid 0 90 | clipped_errors = np.minimum(abs_errors.data, self.abs_err_upper) 91 | ps = np.power(clipped_errors, self.alpha) 92 | for ti, p in zip(tree_idx, ps): 93 | self.tree.update(ti, p) 94 | 95 | 96 | class Net(nn.Module): 97 | def __init__(self, n_feature, n_hidden, n_output): 98 | super(Net, self).__init__() 99 | self.el = nn.Linear(n_feature, n_hidden) 100 | self.q = nn.Linear(n_hidden, n_output) 101 | 102 | def forward(self, x): 103 | x = self.el(x) 104 | x = F.relu(x) 105 | x = self.q(x) 106 | return x 107 | 108 | 109 | class DQNPrioritizedReplay: 110 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.005, reward_decay=0.9, e_greedy=0.9, replace_target_iter=500, 111 | memory_size=10000, batch_size=32, e_greedy_increment=None, output_graph=False, prioritized=True): 112 | self.n_actions = n_actions 113 | self.n_features = n_features 114 | self.n_hidden = n_hidden 115 | self.lr = learning_rate 116 | self.gamma = reward_decay 117 | self.epsilon_max = e_greedy 118 | self.replace_target_iter = replace_target_iter 119 | self.memory_size = memory_size 120 | self.batch_size = batch_size 121 | self.epsilon_increment = e_greedy_increment 122 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 123 | 124 | self.prioritized = prioritized 125 | self.learn_step_counter = 0 126 | self._build_net() 127 | 128 | if self.prioritized: 129 | self.memory = Memory(capacity=memory_size) 130 | else: 131 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 132 | 133 | self.cost_his = [] 134 | 135 | def _build_net(self): 136 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions) 137 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions) 138 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 139 | 140 | def store_transition(self, s, a, r, s_): 141 | if self.prioritized: # prioritized replay 142 | transition = np.hstack((s, [a, r], s_)) 143 | self.memory.store(transition) # have high priority for newly arrived transition 144 | else: # random replay 145 | if not hasattr(self, 'memory_counter'): 146 | self.memory_counter = 0 147 | transition = np.hstack((s, [a, 
r], s_)) 148 | index = self.memory_counter % self.memory_size 149 | self.memory[index, :] = transition 150 | self.memory_counter += 1 151 | 152 | def choose_action(self, observation): 153 | observation = torch.Tensor(observation[np.newaxis, :]) 154 | if np.random.uniform() < self.epsilon: 155 | actions_value = self.q_eval(observation) 156 | action = int(torch.max(actions_value, dim=1)[1]) 157 | else: 158 | action = np.random.randint(0, self.n_actions) 159 | return action 160 | 161 | 162 | def learn(self): 163 | if self.learn_step_counter % self.replace_target_iter == 0: 164 | self.q_target.load_state_dict(self.q_eval.state_dict()) 165 | # print("target params replaced\n") 166 | 167 | if self.prioritized: 168 | tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size) 169 | else: 170 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 171 | batch_memory = self.memory[sample_index, :] 172 | 173 | q_next, q_eval = self.q_target(torch.Tensor(batch_memory[:, -self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 174 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 175 | 176 | batch_index = np.arange(self.batch_size, dtype=np.int32) 177 | eval_act_index = batch_memory[:, self.n_features].astype(int) 178 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 179 | q_target[batch_index, eval_act_index] = reward + self.gamma*torch.max(q_next, 1)[0] 180 | 181 | if self.prioritized: 182 | self.abs_errors = torch.sum(torch.abs(q_target-q_eval), dim=1) 183 | # print("ISWeights shape: ", ISWeights.shape, 'q shape: ', ((q_target-q_eval)**2), 'q: ', (q_target-q_eval)) 184 | loss = torch.mean(torch.mean(torch.Tensor(ISWeights) * (q_target-q_eval)**2, dim=1)) 185 | self.memory.batch_update(tree_idx, self.abs_errors) 186 | else: 187 | self.loss_func = nn.MSELoss() 188 | loss = self.loss_func(q_eval, q_target) 189 | 190 | # print("loss: ", loss, self.prioritized) 191 | 192 | self.optimizer.zero_grad() 193 | loss.backward() 194 | self.optimizer.step() 195 | 196 | # increase epsilon 197 | self.cost_his.append(loss) 198 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 199 | self.learn_step_counter += 1 200 | -------------------------------------------------------------------------------- /content/5.2_Prioritized_Replay_DQN/run_MountainCar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DQNPrioritizedReplay 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | env = gym.make("MountainCar-v0") 7 | env = env.unwrapped 8 | env.seed(21) 9 | MEMORY_SIZE = 10000 10 | 11 | RL_natural = DQNPrioritizedReplay(n_actions=3, n_features=2, memory_size=MEMORY_SIZE, e_greedy_increment=0.00005, prioritized=False) 12 | RL_prio = DQNPrioritizedReplay(n_actions=3, n_features=2, memory_size=MEMORY_SIZE, e_greedy_increment=0.00005, prioritized=True) 13 | 14 | def train(RL): 15 | total_steps = 0 16 | steps = [] 17 | episodes = [] 18 | for i_episode in range(20): 19 | observation = env.reset() 20 | while True: 21 | # print("episode: {} | total_steps: {}".format(i_episode, total_steps)) 22 | # if total_steps - MEMORY_SIZE > 8000: env.render() 23 | action = RL.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | if done: reward = 10 26 | RL.store_transition(observation, action, reward, observation_) 27 | if total_steps > MEMORY_SIZE: 28 | RL.learn() 29 | if done: 30 | 
print('episode ', i_episode, ' finished') 31 | steps.append(total_steps) 32 | episodes.append(i_episode) 33 | break 34 | observation = observation_ 35 | total_steps += 1 36 | print("steps for {}th episode: {}".format(i_episode, total_steps)) 37 | return np.vstack((episodes, steps)) 38 | 39 | his_natural = train(RL_natural) 40 | his_prio = train(RL_prio) 41 | 42 | plt.plot(his_natural[0,:], his_natural[1,:]-his_natural[0,:], c='b', label='natural DQN') 43 | plt.plot(his_prio[0,:], his_prio[1,:]-his_prio[0,:], c='r', label='DQN with prioritized replay') 44 | plt.legend(loc='best') 45 | plt.ylabel('total training time') 46 | plt.xlabel('episode') 47 | plt.grid() 48 | plt.show() -------------------------------------------------------------------------------- /content/5.3_Dueling_DQN/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | np.random.seed(1) 7 | torch.manual_seed(1) 8 | 9 | class Net(nn.Module): 10 | def __init__(self, n_feature, n_hidden, n_output, dueling=False): 11 | super(Net, self).__init__() 12 | self.dueling = dueling 13 | self.l1 = nn.Linear(n_feature, n_hidden) 14 | if self.dueling: 15 | self.values = nn.Linear(n_hidden, 1) 16 | self.advantages = nn.Linear(n_hidden, n_output) 17 | else: 18 | self.q = nn.Linear(n_hidden, n_output) 19 | 20 | def forward(self, x): 21 | x = self.l1(x) 22 | x = F.relu(x) 23 | if self.dueling: 24 | value = self.values(x) 25 | advantages = self.advantages(x) 26 | out = value + (advantages-torch.mean(advantages, dim=1, keepdim=True)) 27 | else: 28 | out = self.q(x) 29 | return out 30 | 31 | 32 | class DuelingDQN: 33 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.001, reward_decay=0.9, e_greedy=0.9, 34 | replace_target_iter=200, memory_size=500, batch_size=32, e_greedy_increment=None, dueling=True): 35 | self.n_actions = n_actions 36 | self.n_features = n_features 37 | self.n_hidden = n_hidden 38 | self.lr = learning_rate 39 | self.gamma = reward_decay 40 | self.epsilon_max = e_greedy 41 | self.replace_target_iter = replace_target_iter 42 | self.memory_size = memory_size 43 | self.batch_size = batch_size 44 | self.epsilon_increment = e_greedy_increment 45 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 46 | self.dueling = dueling 47 | 48 | self.learn_step_counter = 0 49 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 50 | self._build_net() 51 | self.cost_his = [] 52 | 53 | def _build_net(self): 54 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions, self.dueling) 55 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions, self.dueling) 56 | 57 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 58 | self.loss_func = nn.MSELoss() 59 | 60 | def store_transition(self, s, a, r, s_): 61 | if not hasattr(self, 'memory_counter'): 62 | self.memory_counter = 0 63 | transition = np.hstack((s, [a, r], s_)) 64 | index = self.memory_counter % self.memory_size 65 | self.memory[index, :] = transition 66 | self.memory_counter += 1 67 | 68 | def choose_action(self, observation): 69 | observation = torch.Tensor(observation[np.newaxis, :]) 70 | if np.random.uniform() < self.epsilon: 71 | actions_value = self.q_eval(observation) 72 | action = torch.max(actions_value, dim=1)[1] 73 | else: 74 | action = np.random.randint(0, self.n_actions) 75 | return action 76 | 77 | def learn(self): 78 | if 
self.learn_step_counter % self.replace_target_iter == 0: 79 | self.q_target.load_state_dict(self.q_eval.state_dict()) 80 | 81 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 82 | batch_memory = self.memory[sample_index, :] 83 | 84 | q_next, q_eval = self.q_target(torch.Tensor(batch_memory[:, -self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 85 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 86 | 87 | batch_index = np.arange(self.batch_size, dtype=np.int32) 88 | eval_act_index = batch_memory[:, self.n_features].astype(int) 89 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 90 | 91 | q_target[batch_index, eval_act_index] = reward + self.gamma*torch.max(q_next, dim=1)[0] 92 | 93 | loss = self.loss_func(q_eval, q_target) 94 | self.optimizer.zero_grad() 95 | loss.backward() 96 | self.optimizer.step() 97 | 98 | self.cost_his.append(loss) 99 | 100 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 101 | self.learn_step_counter += 1 -------------------------------------------------------------------------------- /content/5.3_Dueling_DQN/action15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClownW/Reinforcement-learning-with-PyTorch/b1b690a986372e8726df62b86a74baae1e02c88d/content/5.3_Dueling_DQN/action15.png -------------------------------------------------------------------------------- /content/5.3_Dueling_DQN/run_Pendulum.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import DuelingDQN 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import torch 6 | 7 | env = gym.make('Pendulum-v0') 8 | env = env.unwrapped 9 | env.seed(1) 10 | MEMORY_SIZE = 3000 11 | ACTION_SPACE = 25 12 | 13 | natural_DQN = DuelingDQN(n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, e_greedy_increment=0.001, dueling=False) 14 | dueling_DQN = DuelingDQN(n_actions=ACTION_SPACE, n_features=3, memory_size=MEMORY_SIZE, e_greedy_increment=0.001, dueling=True) 15 | 16 | def train(RL): 17 | acc_r = [0] 18 | total_steps = 0 19 | observation = env.reset() 20 | while True: 21 | # if total_steps-MEMORY_SIZE > 9000: env.render() 22 | 23 | action = RL.choose_action(observation) 24 | 25 | f_action = (action-(ACTION_SPACE-1)/2)/((ACTION_SPACE-1)/4) 26 | observation_, reward, done, info = env.step(np.array([f_action])) 27 | 28 | reward /= 10 # normalize to a range of (-1, 0) 29 | acc_r.append(reward + acc_r[-1]) # accumulated reward 30 | 31 | RL.store_transition(observation, action, reward, observation_) 32 | 33 | if total_steps > MEMORY_SIZE: 34 | RL.learn() 35 | 36 | if total_steps-MEMORY_SIZE > 15000: 37 | break 38 | 39 | observation = observation_ 40 | total_steps += 1 41 | return RL.cost_his, acc_r 42 | 43 | c_natural, r_natural = train(natural_DQN) 44 | print("start training dueling DQN! 
") 45 | c_dueling, r_dueling = train(dueling_DQN) 46 | 47 | plt.figure(1) 48 | plt.plot(np.array(c_natural), c='r', label='natural') 49 | plt.plot(np.array(c_dueling), c='b', label='dueling') 50 | plt.legend(loc='best') 51 | plt.ylabel('cost') 52 | plt.xlabel('training steps') 53 | plt.grid() 54 | 55 | plt.figure(2) 56 | plt.plot(np.array(r_natural), c='r', label='natural') 57 | plt.plot(np.array(r_dueling), c='b', label='dueling') 58 | plt.legend(loc='best') 59 | plt.ylabel('accumulated reward') 60 | plt.xlabel('training steps') 61 | plt.grid() 62 | 63 | plt.show() -------------------------------------------------------------------------------- /content/5_Deep_Q_Network/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import matplotlib.pyplot as plt 7 | 8 | import copy 9 | 10 | np.random.seed(1) 11 | torch.manual_seed(1) 12 | 13 | # define the network architecture 14 | class Net(nn.Module): 15 | def __init__(self, n_feature, n_hidden, n_output): 16 | super(Net, self).__init__() 17 | self.el = nn.Linear(n_feature, n_hidden) 18 | self.q = nn.Linear(n_hidden, n_output) 19 | 20 | def forward(self, x): 21 | x = self.el(x) 22 | x = F.relu(x) 23 | x = self.q(x) 24 | return x 25 | 26 | 27 | class DeepQNetwork(): 28 | def __init__(self, n_actions, n_features, n_hidden=20, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9, 29 | replace_target_iter=200, memory_size=500, batch_size=32, e_greedy_increment=None, 30 | ): 31 | self.n_actions = n_actions 32 | self.n_features = n_features 33 | self.n_hidden = n_hidden 34 | self.lr = learning_rate 35 | self.gamma = reward_decay 36 | self.epsilon_max = e_greedy 37 | self.replace_target_iter = replace_target_iter 38 | self.memory_size = memory_size 39 | self.batch_size = batch_size 40 | self.epsilon_increment = e_greedy_increment 41 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 42 | 43 | # total learning step 44 | self.learn_step_counter = 0 45 | 46 | # initialize zero memory [s, a, r, s_] 47 | self.memory = np.zeros((self.memory_size, n_features*2+2)) 48 | 49 | self.loss_func = nn.MSELoss() 50 | self.cost_his = [] 51 | 52 | self._build_net() 53 | 54 | 55 | def _build_net(self): 56 | self.q_eval = Net(self.n_features, self.n_hidden, self.n_actions) 57 | self.q_target = Net(self.n_features, self.n_hidden, self.n_actions) 58 | self.optimizer = torch.optim.RMSprop(self.q_eval.parameters(), lr=self.lr) 59 | 60 | def store_transition(self, s, a, r, s_): 61 | if not hasattr(self, 'memory_counter'): 62 | self.memory_counter = 0 63 | transition = np.hstack((s, [a, r], s_)) 64 | # replace the old memory with new memory 65 | index = self.memory_counter % self.memory_size 66 | self.memory[index, :] = transition 67 | self.memory_counter += 1 68 | 69 | def choose_action(self, observation): 70 | observation = torch.Tensor(observation[np.newaxis, :]) 71 | if np.random.uniform() < self.epsilon: 72 | actions_value = self.q_eval(observation) 73 | 74 | action = np.argmax(actions_value.data.numpy()) 75 | else: 76 | action = np.random.randint(0, self.n_actions) 77 | return action 78 | 79 | def learn(self): 80 | # check to replace target parameters 81 | if self.learn_step_counter % self.replace_target_iter == 0: 82 | self.q_target.load_state_dict(self.q_eval.state_dict()) 83 | print("\ntarget params replaced\n") 84 | 85 | # sample batch memory from all memory 86 | if 
self.memory_counter > self.memory_size: 87 | sample_index = np.random.choice(self.memory_size, size=self.batch_size) 88 | else: 89 | sample_index = np.random.choice(self.memory_counter, size=self.batch_size) 90 | batch_memory = self.memory[sample_index, :] 91 | 92 | # q_next is used for getting which action would be choosed by target network in state s_(t+1) 93 | q_next, q_eval = self.q_target(torch.Tensor(batch_memory[:, -self.n_features:])), self.q_eval(torch.Tensor(batch_memory[:, :self.n_features])) 94 | # used for calculating y, we need to copy for q_eval because this operation could keep the Q_value that has not been selected unchanged, 95 | # so when we do q_target - q_eval, these Q_value become zero and wouldn't affect the calculation of the loss 96 | q_target = torch.Tensor(q_eval.data.numpy().copy()) 97 | 98 | batch_index = np.arange(self.batch_size, dtype=np.int32) 99 | eval_act_index = batch_memory[:, self.n_features].astype(int) 100 | reward = torch.Tensor(batch_memory[:, self.n_features+1]) 101 | q_target[batch_index, eval_act_index] = reward + self.gamma*torch.max(q_next, 1)[0] 102 | 103 | loss = self.loss_func(q_eval, q_target) 104 | self.optimizer.zero_grad() 105 | loss.backward() 106 | self.optimizer.step() 107 | 108 | # increase epsilon 109 | self.cost_his.append(loss) 110 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 111 | self.learn_step_counter += 1 112 | 113 | def plot_cost(self): 114 | plt.plot(np.arange(len(self.cost_his)), self.cost_his) 115 | plt.ylabel('Cost') 116 | plt.xlabel('training steps') 117 | plt.show() -------------------------------------------------------------------------------- /content/5_Deep_Q_Network/maze_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import sys 4 | if sys.version_info.major == 2: 5 | import Tkinter as tk 6 | else: 7 | import tkinter as tk 8 | 9 | UNIT = 40 # pixels 10 | MAZE_H = 4 # grid height 11 | MAZE_W = 4 # grid width 12 | 13 | 14 | class Maze(tk.Tk, object): 15 | def __init__(self): 16 | super(Maze, self).__init__() 17 | self.action_space = ['u', 'd', 'l', 'r'] 18 | self.n_actions = len(self.action_space) 19 | self.n_features = 2 20 | self.title('maze') 21 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 22 | self._build_maze() 23 | 24 | def _build_maze(self): 25 | self.canvas = tk.Canvas(self, bg='white', 26 | height=MAZE_H * UNIT, 27 | width=MAZE_W * UNIT) 28 | 29 | # create grids 30 | for c in range(0, MAZE_W * UNIT, UNIT): 31 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 32 | self.canvas.create_line(x0, y0, x1, y1) 33 | for r in range(0, MAZE_H * UNIT, UNIT): 34 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 35 | self.canvas.create_line(x0, y0, x1, y1) 36 | 37 | # create origin 38 | origin = np.array([20, 20]) 39 | 40 | # hell 41 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 42 | self.hell1 = self.canvas.create_rectangle( 43 | hell1_center[0] - 15, hell1_center[1] - 15, 44 | hell1_center[0] + 15, hell1_center[1] + 15, 45 | fill='black') 46 | # hell 47 | # hell2_center = origin + np.array([UNIT, UNIT * 2]) 48 | # self.hell2 = self.canvas.create_rectangle( 49 | # hell2_center[0] - 15, hell2_center[1] - 15, 50 | # hell2_center[0] + 15, hell2_center[1] + 15, 51 | # fill='black') 52 | 53 | # create oval 54 | oval_center = origin + UNIT * 2 55 | self.oval = self.canvas.create_oval( 56 | oval_center[0] - 15, oval_center[1] - 15, 57 | oval_center[0] + 15, 
oval_center[1] + 15, 58 | fill='yellow') 59 | 60 | # create red rect 61 | self.rect = self.canvas.create_rectangle( 62 | origin[0] - 15, origin[1] - 15, 63 | origin[0] + 15, origin[1] + 15, 64 | fill='red') 65 | 66 | # pack all 67 | self.canvas.pack() 68 | 69 | def reset(self): 70 | self.update() 71 | time.sleep(0.1) 72 | self.canvas.delete(self.rect) 73 | origin = np.array([20, 20]) 74 | self.rect = self.canvas.create_rectangle( 75 | origin[0] - 15, origin[1] - 15, 76 | origin[0] + 15, origin[1] + 15, 77 | fill='red') 78 | # return observation 79 | return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) 80 | 81 | def step(self, action): 82 | s = self.canvas.coords(self.rect) 83 | base_action = np.array([0, 0]) 84 | if action == 0: # up 85 | if s[1] > UNIT: 86 | base_action[1] -= UNIT 87 | elif action == 1: # down 88 | if s[1] < (MAZE_H - 1) * UNIT: 89 | base_action[1] += UNIT 90 | elif action == 2: # right 91 | if s[0] < (MAZE_W - 1) * UNIT: 92 | base_action[0] += UNIT 93 | elif action == 3: # left 94 | if s[0] > UNIT: 95 | base_action[0] -= UNIT 96 | 97 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 98 | 99 | next_coords = self.canvas.coords(self.rect) # next state 100 | 101 | # reward function 102 | if next_coords == self.canvas.coords(self.oval): 103 | reward = 1 104 | done = True 105 | elif next_coords in [self.canvas.coords(self.hell1)]: 106 | reward = -1 107 | done = True 108 | else: 109 | reward = 0 110 | done = False 111 | s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2]))/(MAZE_H*UNIT) 112 | return s_, reward, done 113 | 114 | def render(self): 115 | # time.sleep(0.01) 116 | self.update() -------------------------------------------------------------------------------- /content/5_Deep_Q_Network/run_this.py: -------------------------------------------------------------------------------- 1 | from maze_env import Maze 2 | from RL_brain import DeepQNetwork 3 | 4 | def run_maze(): 5 | step = 0 6 | for episode in range(300): 7 | print("episode: {}".format(episode)) 8 | observation = env.reset() 9 | while True: 10 | print("step: {}".format(step)) 11 | env.render() 12 | action = RL.choose_action(observation) 13 | observation_, reward, done = env.step(action) 14 | RL.store_transition(observation, action, reward, observation_) 15 | if (step>200) and (step%5==0): 16 | RL.learn() 17 | observation = observation_ 18 | if done: 19 | break 20 | step += 1 21 | print('game over') 22 | env.destroy() 23 | 24 | if __name__ == '__main__': 25 | env = Maze() 26 | RL = DeepQNetwork(env.n_actions, env.n_features, 27 | learning_rate=0.01, 28 | reward_decay=0.9, 29 | e_greedy=0.9, 30 | replace_target_iter=200, 31 | memory_size=2000 32 | ) 33 | env.after(100, run_maze) 34 | env.mainloop() 35 | RL.plot_cost() -------------------------------------------------------------------------------- /content/7_Policy_gradient_softmax/RL_brain.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | np.random.seed(1) 7 | torch.manual_seed(1) 8 | 9 | 10 | class Net(nn.Module): 11 | def __init__(self, n_feature, n_hidden, n_output): 12 | super(Net, self).__init__() 13 | self.layer = nn.Linear(n_feature, n_hidden) 14 | self.all_act = nn.Linear(n_hidden, n_output) 15 | 16 | def forward(self, x): 17 | x = self.layer(x) 18 | x = torch.tanh(x) 19 | x = self.all_act(x) 20 | return x 
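# ---------------------------------------------------------------------------
# Illustrative aside (not part of the original repository file): the
# PolicyGradient class below relies on the identity that, for a softmax policy,
# F.cross_entropy(logits, action) equals -log pi(action | s). Scaling that
# negative log-probability by the discounted, normalized return vt gives the
# REINFORCE loss used in PolicyGradient.learn(). The small sanity check below
# is a sketch with made-up numbers; it reuses the torch / F imports at the top
# of this file and runs only when the module is executed directly.
if __name__ == "__main__":
    demo_logits = torch.tensor([[1.0, 2.0, 0.5]])  # fake action scores for one state
    demo_action = torch.tensor([1])                # index of the action that was taken
    ce = F.cross_entropy(demo_logits, demo_action, reduction='none')
    manual = -torch.log(F.softmax(demo_logits, dim=1))[0, demo_action]
    print(ce, manual)                              # both tensors hold the same value
# ---------------------------------------------------------------------------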
21 | 22 | 23 | 24 | 25 | class PolicyGradient: 26 | def __init__(self, n_actions, n_features, n_hidden=10, learning_rate=0.01, reward_decay=0.95): 27 | self.n_actions = n_actions 28 | self.n_features = n_features 29 | self.n_hidden = n_hidden 30 | self.lr = learning_rate 31 | self.gamma = reward_decay 32 | 33 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] 34 | 35 | self._build_net() 36 | 37 | def _build_net(self): 38 | self.net = Net(self.n_features, self.n_hidden, self.n_actions) 39 | self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr) 40 | 41 | def choose_action(self, observation): 42 | observation = torch.Tensor(observation[np.newaxis, :]) 43 | prob_weights = self.net(observation) 44 | prob = F.softmax(prob_weights, dim=1) 45 | action = np.random.choice(range(prob_weights.shape[1]), p=prob.data.numpy().ravel()) 46 | return action 47 | 48 | def store_transition(self, s, a, r): 49 | self.ep_obs.append(s) 50 | self.ep_as.append(a) 51 | self.ep_rs.append(r) 52 | 53 | def learn(self): 54 | # discount and normalize episode reward 55 | discounted_ep_rs_norm = self._discount_and_norm_rewards() 56 | obs = torch.Tensor(np.vstack(self.ep_obs)) 57 | acts = torch.Tensor(np.array(self.ep_as)) 58 | vt = torch.Tensor(discounted_ep_rs_norm) 59 | 60 | all_act = self.net(obs) 61 | 62 | # cross_entropy combines nn.LogSoftmax() and nn.NLLLoss() in a single call 63 | neg_log_prob = F.cross_entropy(all_act, acts.long(), reduction='none') 64 | loss = torch.mean(neg_log_prob * vt) 65 | 66 | self.optimizer.zero_grad() 67 | loss.backward() 68 | self.optimizer.step() 69 | 70 | self.ep_obs, self.ep_as, self.ep_rs = [], [], [] 71 | return discounted_ep_rs_norm 72 | 73 | def _discount_and_norm_rewards(self): 74 | discounted_ep_rs = np.zeros_like(self.ep_rs) 75 | running_add = 0 76 | for t in reversed(range(len(self.ep_rs))): 77 | running_add = running_add*self.gamma + self.ep_rs[t] 78 | discounted_ep_rs[t] = running_add 79 | 80 | discounted_ep_rs -= np.mean(discounted_ep_rs) 81 | discounted_ep_rs /= np.std(discounted_ep_rs) 82 | return discounted_ep_rs 83 | -------------------------------------------------------------------------------- /content/7_Policy_gradient_softmax/run_CartPole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import PolicyGradient 3 | import matplotlib.pyplot as plt 4 | 5 | DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater than this threshold 6 | RENDER = False # rendering wastes time 7 | 8 | env = gym.make('CartPole-v0') 9 | env.seed(1) 10 | env = env.unwrapped 11 | 12 | print(env.action_space) 13 | print(env.observation_space) 14 | print(env.observation_space.high) 15 | print(env.observation_space.low) 16 | 17 | RL = PolicyGradient(n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.99) 18 | 19 | for i_episode in range(3000): 20 | observation = env.reset() 21 | 22 | while True: 23 | if RENDER: env.render() 24 | 25 | action = RL.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | RL.store_transition(observation, action, reward) 28 | 29 | if done: 30 | ep_rs_sum = sum(RL.ep_rs) 31 | 32 | if 'running_reward' not in globals(): 33 | running_reward = ep_rs_sum 34 | else: 35 | running_reward = running_reward*0.99 + ep_rs_sum*0.01 36 | 37 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 38 | print("episode:", i_episode, " reward:", int(running_reward)) 39 | 40 | vt
= RL.learn() 41 | 42 | if i_episode == 0: 43 | plt.plot(vt) 44 | plt.xlabel('episode steps') 45 | plt.ylabel('normalized state-action value') 46 | plt.show() 47 | break 48 | 49 | observation = observation_ 50 | -------------------------------------------------------------------------------- /content/7_Policy_gradient_softmax/run_MountainCar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from RL_brain import PolicyGradient 3 | import matplotlib.pyplot as plt 4 | 5 | DISPLAY_REWARD_THRESHOLD = -2000 6 | RENDER = False 7 | 8 | env = gym.make('MountainCar-v0') 9 | env.seed(1) 10 | env = env.unwrapped 11 | 12 | print(env.action_space) 13 | print(env.observation_space) 14 | print(env.observation_space.high) 15 | print(env.observation_space.low) 16 | 17 | RL = PolicyGradient(n_actions=env.action_space.n, n_features=env.observation_space.shape[0], learning_rate=0.02, reward_decay=0.995) 18 | 19 | for i_episode in range(1000): 20 | observation = env.reset() 21 | 22 | while True: 23 | if RENDER: env.render() 24 | 25 | action = RL.choose_action(observation) 26 | 27 | observation_, reward, done, info = env.step(action) 28 | 29 | RL.store_transition(observation, action, reward) 30 | 31 | if done: 32 | # calculate running reward 33 | ep_rs_sum = sum(RL.ep_rs) 34 | if 'running_reward' not in globals(): 35 | running_reward = ep_rs_sum 36 | else: 37 | running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 38 | 39 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 40 | 41 | print('episode:', i_episode, " reward:", int(running_reward)) 42 | 43 | vt = RL.learn() 44 | 45 | if i_episode == 30: 46 | plt.plot(vt) 47 | plt.xlabel('episode steps') 48 | plt.ylabel('normalized state-action value') 49 | plt.show() 50 | 51 | break 52 | 53 | observation = observation_ -------------------------------------------------------------------------------- /content/8_Actor_Critic_Advantage/AC_CartPole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | np.random.seed(1) 9 | torch.manual_seed(1) 10 | 11 | 12 | MAX_EPISODE = 3000 13 | DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater than this threshold 14 | MAX_EP_STEPS = 1000 # maximum time steps in one episode 15 | RENDER = False # rendering wastes time 16 | GAMMA = 0.9 # reward discount in TD error 17 | LR_A = 0.001 # learning rate for actor 18 | LR_C = 0.01 # learning rate for critic 19 | 20 | 21 | env = gym.make('CartPole-v0') 22 | env.seed(1) # reproducible 23 | env = env.unwrapped 24 | 25 | 26 | N_F = env.observation_space.shape[0] 27 | N_A = env.action_space.n 28 | 29 | 30 | class Net(nn.Module): 31 | def __init__(self, n_feature, n_hidden, n_output, activate=False): 32 | super(Net, self).__init__() 33 | self.l1 = nn.Linear(n_feature, n_hidden) 34 | self.acts_prob = nn.Linear(n_hidden, n_output) 35 | self.activate = activate 36 | 37 | 38 | def forward(self, x): 39 | x = self.l1(x) 40 | x = F.relu(x) 41 | x = self.acts_prob(x) 42 | if self.activate: 43 | x = F.softmax(x, dim=1) 44 | return x 45 | 46 | 47 | class Actor(object): 48 | def __init__(self, n_features, n_actions, n_hidden=20, lr=0.001): 49 | self.n_features = n_features 50 | self.n_actions = n_actions 51 | self.n_hidden = n_hidden 52 | self.lr = lr 53 | 54 | self._build_net() 55 | 56 | 57 | def _build_net(self): 58 | self.actor_net = Net(self.n_features,
self.n_hidden, self.n_actions, activate=True) 59 | self.optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=self.lr) 60 | 61 | 62 | def choose_action(self, s): 63 | s = torch.Tensor(s[np.newaxis, :]) 64 | probs = self.actor_net(s) 65 | return np.random.choice(np.arange(probs.shape[1]), p=probs.data.numpy().ravel()) 66 | 67 | 68 | def learn(self, s, a, td): 69 | s = torch.Tensor(s[np.newaxis, :]) 70 | acts_prob = self.actor_net(s) 71 | log_prob = torch.log(acts_prob[0, a]) 72 | exp_v = torch.mean(log_prob * td) 73 | 74 | loss = -exp_v 75 | self.optimizer.zero_grad() 76 | loss.backward(retain_graph=True) 77 | self.optimizer.step() 78 | 79 | return exp_v 80 | 81 | 82 | class Critic(object): 83 | def __init__(self, n_features, lr=0.01): 84 | self.n_features = n_features 85 | self.lr = lr 86 | 87 | self._build_net() 88 | 89 | 90 | def _build_net(self): 91 | self.critic_net = Net(self.n_features, 20, 1) 92 | self.optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=self.lr) 93 | 94 | 95 | def learn(self, s, r, s_): 96 | s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 97 | v, v_ = self.critic_net(s), self.critic_net(s_) 98 | td_error = r + GAMMA * v_ - v 99 | loss = td_error ** 2 100 | 101 | self.optimizer.zero_grad() 102 | loss.backward(retain_graph=True) 103 | self.optimizer.step() 104 | 105 | return td_error 106 | 107 | 108 | 109 | actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) 110 | critic = Critic(n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor 111 | 112 | for i_episode in range(MAX_EPISODE): 113 | s = env.reset() 114 | t = 0 115 | track_r = [] 116 | 117 | while True: 118 | if RENDER: env.render() 119 | 120 | a = actor.choose_action(s) 121 | 122 | s_, r, done, info = env.step(a) 123 | 124 | if done: r = -20 125 | 126 | track_r.append(r) 127 | 128 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 129 | actor.learn(s, a, td_error) # true_gradient = grad[logPi(s, a) * td_error] 130 | 131 | s = s_ 132 | t += 1 133 | 134 | if done or t>=MAX_EP_STEPS: 135 | ep_rs_sum = sum(track_r) 136 | 137 | if 'running_reward' not in globals(): 138 | running_reward = ep_rs_sum 139 | else: 140 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 141 | 142 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 143 | print("episode: ", i_episode, " reward:", int(running_reward)) 144 | break -------------------------------------------------------------------------------- /content/8_Actor_Critic_Advantage/AC_continue_Pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) # reproducible 9 | 10 | 11 | class Actor_Net(nn.Module): 12 | def __init__(self, n_features, n_hidden, n_outputs): 13 | super(Actor_Net, self).__init__() 14 | self.l1 = nn.Linear(n_features, n_hidden) 15 | self.mu = nn.Linear(n_hidden, n_outputs) 16 | self.sigma = nn.Linear(n_hidden, n_outputs) 17 | 18 | 19 | def forward(self, x): 20 | x = self.l1(x) 21 | x = F.relu(x) 22 | mu = self.mu(x) 23 | mu = torch.tanh(mu) 24 | sigma = self.sigma(x) 25 | sigma = F.softplus(sigma) 26 | 27 | return mu, sigma 28 | 29 | 30 | class Actor(object): 31 | def __init__(self, n_features, action_bound, n_hidden=30, lr=0.0001): 32 | self.n_features = n_features 33 | self.action_bound = action_bound 34 | self.n_hidden = 
n_hidden 35 | self.lr = lr 36 | 37 | self._build_net() 38 | 39 | 40 | def _build_net(self): 41 | self.actor_net = Actor_Net(self.n_features, self.n_hidden, 1) 42 | self.optimizer = torch.optim.Adam(self.actor_net.parameters(), lr=self.lr) 43 | 44 | 45 | def normal_dist(self, s): 46 | s = torch.Tensor(s[np.newaxis, :]) 47 | mu, sigma = self.actor_net(s) 48 | mu, sigma = (mu*2).squeeze(), (sigma+0.1).squeeze() 49 | normal_dist = torch.distributions.Normal(mu, sigma) # build a normal distribution with mean=mu and std=sigma 50 | return normal_dist 51 | 52 | 53 | def choose_action(self, s): 54 | normal_dist = self.normal_dist(s) 55 | self.action = torch.clamp(normal_dist.sample(), self.action_bound[0], self.action_bound[1]) # sample an action according to the distribution 56 | return self.action 57 | 58 | 59 | def learn(self, s, a, td): 60 | normal_dist = self.normal_dist(s) 61 | log_prob = normal_dist.log_prob(a) # log_prob is the log-probability of action a under normal_dist 62 | exp_v = log_prob * td.float() # advantage (TD_error) guided loss 63 | exp_v += 0.01*normal_dist.entropy() # add an entropy bonus to encourage exploration 64 | loss = -exp_v # max(v) = min(-v) 65 | 66 | self.optimizer.zero_grad() 67 | loss.backward() 68 | self.optimizer.step() 69 | 70 | return exp_v 71 | 72 | 73 | class Critic_Net(nn.Module): 74 | def __init__(self, n_features, n_hidden, n_outputs): 75 | super(Critic_Net, self).__init__() 76 | self.l1 = nn.Linear(n_features, n_hidden) 77 | self.v = nn.Linear(n_hidden, n_outputs) 78 | 79 | 80 | def forward(self, x): 81 | x = self.l1(x) 82 | x = F.relu(x) 83 | x = self.v(x) 84 | return x 85 | 86 | 87 | class Critic(object): 88 | def __init__(self, n_features, n_hidden=30, n_output=1, lr=0.01): 89 | self.n_features = n_features 90 | self.n_hidden = n_hidden 91 | self.n_output = n_output 92 | self.lr = lr 93 | 94 | self._build_net() 95 | 96 | 97 | def _build_net(self): 98 | self.critic_net = Critic_Net(self.n_features, self.n_hidden, self.n_output) 99 | self.optimizer = torch.optim.Adam(self.critic_net.parameters(), lr=self.lr) 100 | 101 | 102 | def learn(self, s, r, s_): 103 | s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 104 | v, v_ = self.critic_net(s), self.critic_net(s_) 105 | td_error = torch.mean(r + GAMMA * v_.double() - v.double()) 106 | loss = td_error ** 2 107 | 108 | self.optimizer.zero_grad() 109 | loss.backward(retain_graph=True) 110 | self.optimizer.step() 111 | 112 | return td_error 113 | 114 | 115 | MAX_EPISODE = 1000 116 | MAX_EP_STEPS = 200 117 | DISPLAY_REWARD_THRESHOLD = -100 118 | RENDER = False 119 | GAMMA = 0.9 120 | LR_A = 0.001 121 | LR_C = 0.01 122 | 123 | 124 | env = gym.make('Pendulum-v0') 125 | env.seed(1) 126 | env = env.unwrapped 127 | 128 | 129 | N_S = env.observation_space.shape[0] 130 | A_BOUND = env.action_space.high 131 | 132 | 133 | actor = Actor(n_features=N_S, lr=LR_A, action_bound=[float(-A_BOUND), float(A_BOUND)]) 134 | critic = Critic(n_features=N_S, lr=LR_C) 135 | 136 | 137 | for i_episode in range(MAX_EPISODE): 138 | s = env.reset() 139 | t = 0 140 | ep_rs = [] 141 | while True: 142 | if RENDER: env.render() 143 | a = actor.choose_action(s) 144 | 145 | s_, r, done, info = env.step(a) 146 | r /= 10 147 | 148 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 149 | actor.learn(s, a, td_error) # gradient = grad[logPi(s, a) * td_error] 150 | 151 | s = s_ 152 | t += 1 153 | ep_rs.append(r) 154 | if t > MAX_EP_STEPS: 155 | ep_rs_sum = sum(ep_rs) 156 | if
'running_reward' not in globals(): 157 | running_reward = ep_rs_sum 158 | else: 159 | running_reward = running_reward * 0.9 + ep_rs_sum * 0.1 160 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True 161 | print('episode: ', i_episode, ' reward:', int(running_reward)) 162 | break --------------------------------------------------------------------------------
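A note on the actor-critic examples above: both AC_CartPole.py and AC_continue_Pendulum.py compute the TD error delta = r + GAMMA * V(s_) - V(s) in the critic, minimize delta squared as the critic loss, and reuse the same delta (hence the retain_graph=True calls) as the advantage that scales the actor's -log pi(a|s) objective. The sketch below is a minimal, self-contained restatement of that one-step update for a discrete policy; it is not part of the repository, the network and variable names are illustrative, and it differs from the scripts above in that it detaches V(s_) in the TD target and detaches delta before the actor loss, so each optimizer touches only its own network and no retained graph is needed.

import torch
import torch.nn as nn

GAMMA = 0.9  # reward discount, matching the scripts above

# illustrative value and policy networks (sizes chosen for CartPole: 4 features, 2 actions)
critic_net = nn.Sequential(nn.Linear(4, 20), nn.ReLU(), nn.Linear(20, 1))
actor_net = nn.Sequential(nn.Linear(4, 20), nn.ReLU(), nn.Linear(20, 2))
critic_opt = torch.optim.Adam(critic_net.parameters(), lr=0.01)
actor_opt = torch.optim.Adam(actor_net.parameters(), lr=0.001)

def ac_update(s, a, r, s_):
    """One advantage actor-critic step; s and s_ are 1-D numpy observations, a is an int action."""
    s, s_ = torch.Tensor(s[None, :]), torch.Tensor(s_[None, :])

    # critic: minimize the squared TD error delta = r + GAMMA * V(s_) - V(s)
    td_error = r + GAMMA * critic_net(s_).detach() - critic_net(s)
    critic_loss = td_error.pow(2).mean()
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # actor: maximize log pi(a|s) * delta, with delta treated as a constant advantage
    log_prob = torch.log_softmax(actor_net(s), dim=1)[0, a]
    actor_loss = -(log_prob * td_error.detach()).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

With the conventions of the scripts above, ac_update(s, a, r, s_) would be called once per environment transition inside the episode loop, in place of the separate critic.learn / actor.learn calls.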