├── Char00 Conventional Algorithms
│   ├── Q-learning.py
│   ├── Sarsa.py
│   └── gridworld.py
├── Char01 DQN
│   ├── DQN.py
│   ├── DQN
│   │   └── pic
│   │       ├── finish_episode.jpg
│   │       ├── readme.md
│   │       └── value_loss.jpg
│   ├── DQN_CartPole-v0.py
│   ├── DQN_MountainCar-v0.py
│   ├── DQN_mountain_car_v1.py
│   ├── naiveDQN.py
│   └── readme.md
├── Char02 Policy Gradient
│   ├── PolicyGradient.py
│   ├── REINFORCE.py
│   ├── REINFORCE_with_Baseline.py
│   ├── Run_Model.py
│   ├── naive-policy-gradient.py
│   └── pytorch_MountainCar-v0.py
├── Char03 Actor-Critic
│   ├── AC_CartPole-v0.py
│   └── AC_MountainCar-v0.py
├── Char04 A2C
│   ├── A2C.py
│   └── multiprocessing_env.py
├── Char05 DDPG
│   ├── DDPG.py
│   ├── DDPG_exp.jpg
│   └── README.md
├── Char07 PPO
│   ├── PPO2.py
│   ├── PPO_CartPole_v0.py
│   ├── PPO_MountainCar-v0.py
│   ├── PPO_pendulum.py
│   └── readme.md
├── Char08 ACER
│   └── readme.md
├── Char09 SAC
│   ├── SAC.py
│   ├── SAC_BipedalWalker-v2.py
│   ├── SAC_dual_Q_net.py
│   ├── SAC_ep_r_curve.png
│   └── test_agent.py
├── Char10 TD3
│   ├── Episode_reward_TD3_BipedakWalker.png
│   ├── TD3.py
│   ├── TD3_BipedalWalker-v2.py
│   ├── TD3_Pendulum-v0.png
│   ├── expTD3.pyPendulum-v0.
│   │   ├── actor.pth
│   │   ├── actor_target.pth
│   │   ├── critic_1.pth
│   │   ├── critic_1_target.pth
│   │   ├── critic_2.pth
│   │   └── critic_2_target.pth
│   └── expTD3_BipedalWalker-v2.pyBipedalWalker-v2.
│       ├── actor.pth
│       ├── actor_target.pth
│       ├── critic_1.pth
│       ├── critic_1_target.pth
│       ├── critic_2.pth
│       └── critic_2_target.pth
├── LICENSE
├── More
│   ├── Application in real world
│   │   └── README.md
│   ├── MARL
│   │   └── README.md
│   ├── plot.py
│   └── readme.md
├── figures
│   └── test.png
├── readme.md
└── requirements.txt
/Char00 Conventional Algorithms/Q-learning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import time
5 | 
6 | ALPHA = 0.1
7 | GAMMA = 0.95
8 | EPSILION = 0.9
9 | N_STATE = 20
10 | ACTIONS = ['left', 'right']
11 | MAX_EPISODES = 200
12 | FRESH_TIME = 0.1
13 | 
14 | def build_q_table(n_state, actions):
15 |     q_table = pd.DataFrame(
16 |         np.zeros((n_state, len(actions))),
17 |         np.arange(n_state),
18 |         actions
19 |     )
20 |     return q_table
21 | 
22 | def choose_action(state, q_table):
23 |     #epslion - greedy policy
24 |     state_action = q_table.loc[state,:]
25 |     if np.random.uniform()>EPSILION or (state_action==0).all():
26 |         action_name = np.random.choice(ACTIONS)
27 |     else:
28 |         action_name = state_action.idxmax()
29 |     return action_name
30 | 
31 | def get_env_feedback(state, action):
32 |     if action=='right':
33 |         if state == N_STATE-2:
34 |             next_state = 'terminal'
35 |             reward = 1
36 |         else:
37 |             next_state = state+1
38 |             reward = -0.5
39 |     else:
40 |         if state == 0:
41 |             next_state = 0
42 | 
43 |         else:
44 |             next_state = state-1
45 |         reward = -0.5
46 |     return next_state, reward
47 | 
48 | def update_env(state,episode, step_counter):
49 |     env = ['-'] *(N_STATE-1)+['T']
50 |     if state =='terminal':
51 |         print("Episode {}, the total step is {}".format(episode+1, step_counter))
52 |         final_env = ['-'] *(N_STATE-1)+['T']
53 |         return True, step_counter
54 |     else:
55 |         env[state]='*'
56 |         env = ''.join(env)
57 |         print(env)
58 |         time.sleep(FRESH_TIME)
59 |         return False, step_counter
60 | 
61 | 
62 | def q_learning():
63 |     q_table = build_q_table(N_STATE, ACTIONS)
64 |     step_counter_times = []
65 |     for episode in range(MAX_EPISODES):
66 |         state = 0
67 |         is_terminal = False
68 |         step_counter = 0
69 |         update_env(state, episode, step_counter)
70 |         while not is_terminal:
71 |             action = choose_action(state,q_table)
72 |             next_state, reward = get_env_feedback(state, action)
73 |             next_q =
q_table.loc[state, action] 74 | if next_state == 'terminal': 75 | is_terminal = True 76 | q_target = reward 77 | else: 78 | delta = reward + GAMMA*q_table.iloc[next_state,:].max()-q_table.loc[state, action] 79 | q_table.loc[state, action] += ALPHA*delta 80 | state = next_state 81 | is_terminal,steps = update_env(state, episode, step_counter+1) 82 | step_counter+=1 83 | if is_terminal: 84 | step_counter_times.append(steps) 85 | 86 | return q_table, step_counter_times 87 | 88 | def main(): 89 | q_table, step_counter_times= q_learning() 90 | print("Q table\n{}\n".format(q_table)) 91 | print('end') 92 | 93 | plt.plot(step_counter_times,'g-') 94 | plt.ylabel("steps") 95 | plt.show() 96 | print("The step_counter_times is {}".format(step_counter_times)) 97 | 98 | main() 99 | -------------------------------------------------------------------------------- /Char00 Conventional Algorithms/Sarsa.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import time 6 | 7 | ALPHA = 0.1 8 | GAMMA = 0.95 9 | EPSILION = 0.9 10 | N_STATE = 6 11 | ACTIONS = ['left', 'right'] 12 | MAX_EPISODES = 200 13 | FRESH_TIME = 0.1 14 | 15 | def build_q_table(n_state, actions): 16 | q_table = pd.DataFrame( 17 | np.zeros((n_state, len(actions))), 18 | np.arange(n_state), 19 | actions 20 | ) 21 | return q_table 22 | 23 | def choose_action(state, q_table): 24 | #epslion - greedy policy 25 | state_action = q_table.loc[state,:] 26 | if np.random.uniform()>EPSILION or (state_action==0).all(): 27 | action_name = np.random.choice(ACTIONS) 28 | else: 29 | action_name = state_action.idxmax() 30 | return action_name 31 | 32 | def get_env_feedback(state, action): 33 | if action=='right': 34 | if state == N_STATE-2: 35 | next_state = 'terminal' 36 | reward = 1 37 | else: 38 | next_state = state+1 39 | reward = -0.5 40 | else: 41 | if state == 0: 42 | next_state = 0 43 | 44 | else: 45 | next_state = state-1 46 | reward = -0.5 47 | return next_state, reward 48 | 49 | def update_env(state,episode, step_counter): 50 | env = ['-'] *(N_STATE-1)+['T'] 51 | if state =='terminal': 52 | print("Episode {}, the total step is {}".format(episode+1, step_counter)) 53 | final_env = ['-'] *(N_STATE-1)+['T'] 54 | return True, step_counter 55 | else: 56 | env[state]='*' 57 | env = ''.join(env) 58 | print(env) 59 | time.sleep(FRESH_TIME) 60 | return False, step_counter 61 | 62 | 63 | def sarsa_learning(): 64 | q_table = build_q_table(N_STATE, ACTIONS) 65 | step_counter_times = [] 66 | for episode in range(MAX_EPISODES): 67 | state = 0 68 | is_terminal = False 69 | step_counter = 0 70 | update_env(state, episode, step_counter) 71 | while not is_terminal: 72 | action = choose_action(state,q_table) 73 | next_state, reward = get_env_feedback(state, action) 74 | if next_state != 'terminal': 75 | next_action = choose_action(next_state, q_table) #sarsa update method 76 | else: 77 | next_action = action 78 | next_q = q_table.loc[state, action] 79 | 80 | if next_state == 'terminal': 81 | is_terminal = True 82 | q_target = reward 83 | else: 84 | delta = reward + GAMMA*q_table.loc[next_state,next_action]-q_table.loc[state, action] 85 | q_table.loc[state, action] += ALPHA*delta 86 | state = next_state 87 | is_terminal,steps = update_env(state, episode, step_counter+1) 88 | step_counter+=1 89 | if is_terminal: 90 | step_counter_times.append(steps) 91 | 92 | return q_table, step_counter_times 93 | 94 | def main(): 95 | q_table, step_counter_times= 
sarsa_learning() 96 | print("Q table\n{}\n".format(q_table)) 97 | print('end') 98 | 99 | plt.plot(step_counter_times,'g-') 100 | plt.ylabel("steps") 101 | plt.show() 102 | print("The step_counter_times is {}".format(step_counter_times)) 103 | 104 | main() 105 | -------------------------------------------------------------------------------- /Char00 Conventional Algorithms/gridworld.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | class GridWorld: 5 | 6 | def __init__(self, tot_row, tot_col): 7 | self.action_space_size = 4 8 | self.world_row = tot_row 9 | self.world_col = tot_col 10 | #The world is a matrix of size row x col x 2 11 | #The first layer contains the obstacles 12 | #The second layer contains the rewards 13 | #self.world_matrix = np.zeros((tot_row, tot_col, 2)) 14 | self.transition_matrix = np.ones((self.action_space_size, self.action_space_size))/ self.action_space_size 15 | #self.transition_array = np.ones(self.action_space_size) / self.action_space_size 16 | self.reward_matrix = np.zeros((tot_row, tot_col)) 17 | self.state_matrix = np.zeros((tot_row, tot_col)) 18 | self.position = [np.random.randint(tot_row), np.random.randint(tot_col)] 19 | 20 | #def setTransitionArray(self, transition_array): 21 | #if(transition_array.shape != self.transition_array): 22 | #raise ValueError('The shape of the two matrices must be the same.') 23 | #self.transition_array = transition_array 24 | 25 | def setTransitionMatrix(self, transition_matrix): 26 | '''Set the reward matrix. 27 | 28 | The transition matrix here is intended as a matrix which has a line 29 | for each action and the element of the row are the probabilities to 30 | executes each action when a command is given. For example: 31 | [[0.55, 0.25, 0.10, 0.10] 32 | [0.25, 0.25, 0.25, 0.25] 33 | [0.30, 0.20, 0.40, 0.10] 34 | [0.10, 0.20, 0.10, 0.60]] 35 | 36 | This matrix defines the transition rules for all the 4 possible actions. 37 | The first row corresponds to the probabilities of executing each one of 38 | the 4 actions when the policy orders to the robot to go UP. In this case 39 | the transition model says that with a probability of 0.55 the robot will 40 | go UP, with a probaiblity of 0.25 RIGHT, 0.10 DOWN and 0.10 LEFT. 41 | ''' 42 | if(transition_matrix.shape != self.transition_matrix.shape): 43 | raise ValueError('The shape of the two matrices must be the same.') 44 | self.transition_matrix = transition_matrix 45 | 46 | def setRewardMatrix(self, reward_matrix): 47 | '''Set the reward matrix. 48 | 49 | ''' 50 | if(reward_matrix.shape != self.reward_matrix.shape): 51 | raise ValueError('The shape of the matrix does not match with the shape of the world.') 52 | self.reward_matrix = reward_matrix 53 | 54 | def setStateMatrix(self, state_matrix): 55 | '''Set the obstacles in the world. 56 | 57 | The input to the function is a matrix with the 58 | same size of the world 59 | -1 for states which are not walkable. 
60 | +1 for terminal states 61 | 0 for all the walkable states (non terminal) 62 | The following matrix represents the 4x3 world 63 | used in the series "dissecting reinforcement learning" 64 | [[0, 0, 0, +1] 65 | [0, -1, 0, +1] 66 | [0, 0, 0, 0]] 67 | ''' 68 | if(state_matrix.shape != self.state_matrix.shape): 69 | raise ValueError('The shape of the matrix does not match with the shape of the world.') 70 | self.state_matrix = state_matrix 71 | 72 | def setPosition(self, index_row=None, index_col=None): 73 | ''' Set the position of the robot in a specific state. 74 | 75 | ''' 76 | if(index_row is None or index_col is None): self.position = [np.random.randint(tot_row), np.random.randint(tot_col)] 77 | else: self.position = [index_row, index_col] 78 | 79 | def render(self): 80 | ''' Print the current world in the terminal. 81 | 82 | O represents the robot position 83 | - respresent empty states. 84 | # represents obstacles 85 | * represents terminal states 86 | ''' 87 | graph = "" 88 | for row in range(self.world_row): 89 | row_string = "" 90 | for col in range(self.world_col): 91 | if(self.position == [row, col]): row_string += u" \u25CB " # u" \u25CC " 92 | else: 93 | if(self.state_matrix[row, col] == 0): row_string += ' - ' 94 | elif(self.state_matrix[row, col] == -1): row_string += ' # ' 95 | elif(self.state_matrix[row, col] == +1): row_string += ' * ' 96 | row_string += '\n' 97 | graph += row_string 98 | print(graph) 99 | 100 | def reset(self, exploring_starts=False): 101 | ''' Set the position of the robot in the bottom left corner. 102 | 103 | It returns the first observation 104 | ''' 105 | if exploring_starts: 106 | while(True): 107 | row = np.random.randint(0, self.world_row) 108 | col = np.random.randint(0, self.world_col) 109 | if(self.state_matrix[row, col] == 0): break 110 | self.position = [row, col] 111 | else: 112 | self.position = [self.world_row-1, 0] 113 | #reward = self.reward_matrix[self.position[0], self.position[1]] 114 | return self.position 115 | 116 | def step(self, action): 117 | ''' One step in the world. 118 | 119 | [observation, reward, done = env.step(action)] 120 | The robot moves one step in the world based on the action given. 
121 | The action can be 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT 122 | @return observation the position of the robot after the step 123 | @return reward the reward associated with the next state 124 | @return done True if the state is terminal 125 | ''' 126 | if(action >= self.action_space_size): 127 | raise ValueError('The action is not included in the action space.') 128 | 129 | #Based on the current action and the probability derived 130 | #from the trasition model it chooses a new actio to perform 131 | action = np.random.choice(4, 1, p=self.transition_matrix[int(action),:]) 132 | #action = self.transition_model(action) 133 | 134 | #Generating a new position based on the current position and action 135 | if(action == 0): new_position = [self.position[0]-1, self.position[1]] #UP 136 | elif(action == 1): new_position = [self.position[0], self.position[1]+1] #RIGHT 137 | elif(action == 2): new_position = [self.position[0]+1, self.position[1]] #DOWN 138 | elif(action == 3): new_position = [self.position[0], self.position[1]-1] #LEFT 139 | else: raise ValueError('The action is not included in the action space.') 140 | 141 | #Check if the new position is a valid position 142 | #print(self.state_matrix) 143 | if (new_position[0]>=0 and new_position[0]=0 and new_position[1]= MEMORY_CAPACITY: 128 | dqn.learn() 129 | if done: 130 | print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3))) 131 | if done: 132 | break 133 | state = next_state 134 | r = copy.copy(reward) 135 | reward_list.append(r) 136 | ax.set_xlim(0,300) 137 | #ax.cla() 138 | ax.plot(reward_list, 'g-', label='total_loss') 139 | plt.pause(0.001) 140 | 141 | 142 | if __name__ == '__main__': 143 | main() 144 | -------------------------------------------------------------------------------- /Char01 DQN/DQN/pic/finish_episode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char01 DQN/DQN/pic/finish_episode.jpg -------------------------------------------------------------------------------- /Char01 DQN/DQN/pic/readme.md: -------------------------------------------------------------------------------- 1 | readme 2 | -------------------------------------------------------------------------------- /Char01 DQN/DQN/pic/value_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char01 DQN/DQN/pic/value_loss.jpg -------------------------------------------------------------------------------- /Char01 DQN/DQN_CartPole-v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Hyper-parameters 20 | seed = 1 21 | render = False 22 | num_episodes = 2000 23 | env = gym.make('CartPole-v0').unwrapped 24 | num_state = env.observation_space.shape[0] 25 | num_action = 
env.action_space.n 26 | torch.manual_seed(seed) 27 | env.seed(seed) 28 | 29 | Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state']) 30 | 31 | class Net(nn.Module): 32 | def __init__(self): 33 | super(Net, self).__init__() 34 | self.fc1 = nn.Linear(num_state, 100) 35 | self.fc2 = nn.Linear(100, num_action) 36 | 37 | def forward(self, x): 38 | x = F.relu(self.fc1(x)) 39 | action_value = self.fc2(x) 40 | return action_value 41 | 42 | class DQN(): 43 | 44 | capacity = 8000 45 | learning_rate = 1e-3 46 | memory_count = 0 47 | batch_size = 256 48 | gamma = 0.995 49 | update_count = 0 50 | 51 | def __init__(self): 52 | super(DQN, self).__init__() 53 | self.target_net, self.act_net = Net(), Net() 54 | self.memory = [None]*self.capacity 55 | self.optimizer = optim.Adam(self.act_net.parameters(), self.learning_rate) 56 | self.loss_func = nn.MSELoss() 57 | self.writer = SummaryWriter('./DQN/logs') 58 | 59 | 60 | def select_action(self,state): 61 | state = torch.tensor(state, dtype=torch.float).unsqueeze(0) 62 | value = self.act_net(state) 63 | action_max_value, index = torch.max(value, 1) 64 | action = index.item() 65 | if np.random.rand(1) >= 0.9: # epslion greedy 66 | action = np.random.choice(range(num_action), 1).item() 67 | return action 68 | 69 | def store_transition(self,transition): 70 | index = self.memory_count % self.capacity 71 | self.memory[index] = transition 72 | self.memory_count += 1 73 | return self.memory_count >= self.capacity 74 | 75 | def update(self): 76 | if self.memory_count >= self.capacity: 77 | state = torch.tensor([t.state for t in self.memory]).float() 78 | action = torch.LongTensor([t.action for t in self.memory]).view(-1,1).long() 79 | reward = torch.tensor([t.reward for t in self.memory]).float() 80 | next_state = torch.tensor([t.next_state for t in self.memory]).float() 81 | 82 | reward = (reward - reward.mean()) / (reward.std() + 1e-7) 83 | with torch.no_grad(): 84 | target_v = reward + self.gamma * self.target_net(next_state).max(1)[0] 85 | 86 | #Update... 
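# The TD targets above were computed once for the whole replay buffer with the frozen
# target network: target_v = reward + gamma * max_a' Q_target(next_state, a'), under no_grad.
# The loop below then visits the buffer in shuffled minibatches: gather(1, action) picks
# Q(s, a) of the acting network for the stored actions, the MSE loss pulls those values
# toward the detached targets, and every 100 updates the target network is re-synchronized
# from the acting network.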
87 | for index in BatchSampler(SubsetRandomSampler(range(len(self.memory))), batch_size=self.batch_size, drop_last=False): 88 | v = (self.act_net(state).gather(1, action))[index] 89 | loss = self.loss_func(target_v[index].unsqueeze(1), (self.act_net(state).gather(1, action))[index]) 90 | self.optimizer.zero_grad() 91 | loss.backward() 92 | self.optimizer.step() 93 | self.writer.add_scalar('loss/value_loss', loss, self.update_count) 94 | self.update_count +=1 95 | if self.update_count % 100 ==0: 96 | self.target_net.load_state_dict(self.act_net.state_dict()) 97 | else: 98 | print("Memory Buff is too less") 99 | def main(): 100 | 101 | agent = DQN() 102 | for i_ep in range(num_episodes): 103 | state = env.reset() 104 | if render: env.render() 105 | for t in range(10000): 106 | action = agent.select_action(state) 107 | next_state, reward, done, info = env.step(action) 108 | if render: env.render() 109 | transition = Transition(state, action, reward, next_state) 110 | agent.store_transition(transition) 111 | state = next_state 112 | if done or t >=9999: 113 | agent.writer.add_scalar('live/finish_step', t+1, global_step=i_ep) 114 | agent.update() 115 | if i_ep % 10 == 0: 116 | print("episodes {}, step is {} ".format(i_ep, t)) 117 | break 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /Char01 DQN/DQN_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Hyper-parameters 20 | seed = 1 21 | render = False 22 | num_episodes = 400000 23 | env = gym.make('MountainCar-v0').unwrapped 24 | num_state = env.observation_space.shape[0] 25 | num_action = env.action_space.n 26 | torch.manual_seed(seed) 27 | env.seed(seed) 28 | 29 | Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state']) 30 | 31 | class Net(nn.Module): 32 | def __init__(self): 33 | super(Net, self).__init__() 34 | self.fc1 = nn.Linear(num_state, 100) 35 | self.fc2 = nn.Linear(100, num_action) 36 | 37 | def forward(self, x): 38 | x = F.relu(self.fc1(x)) 39 | action_prob = self.fc2(x) 40 | return action_prob 41 | 42 | class DQN(): 43 | 44 | capacity = 8000 45 | learning_rate = 1e-3 46 | memory_count = 0 47 | batch_size = 256 48 | gamma = 0.995 49 | update_count = 0 50 | 51 | def __init__(self): 52 | super(DQN, self).__init__() 53 | self.target_net, self.act_net = Net(), Net() 54 | self.memory = [None]*self.capacity 55 | self.optimizer = optim.Adam(self.act_net.parameters(), self.learning_rate) 56 | self.loss_func = nn.MSELoss() 57 | self.writer = SummaryWriter('./DQN/logs') 58 | 59 | 60 | def select_action(self,state): 61 | state = torch.tensor(state, dtype=torch.float).unsqueeze(0) 62 | value = self.act_net(state) 63 | action_max_value, index = torch.max(value, 1) 64 | action = index.item() 65 | if np.random.rand(1) >= 0.9: # epslion greedy 66 | action = np.random.choice(range(num_action), 1).item() 67 | return action 68 | 69 | def store_transition(self,transition): 70 | index 
= self.memory_count % self.capacity 71 | self.memory[index] = transition 72 | self.memory_count += 1 73 | return self.memory_count >= self.capacity 74 | 75 | def update(self): 76 | if self.memory_count >= self.capacity: 77 | state = torch.tensor([t.state for t in self.memory]).float() 78 | action = torch.LongTensor([t.action for t in self.memory]).view(-1,1).long() 79 | reward = torch.tensor([t.reward for t in self.memory]).float() 80 | next_state = torch.tensor([t.next_state for t in self.memory]).float() 81 | 82 | reward = (reward - reward.mean()) / (reward.std() + 1e-7) 83 | with torch.no_grad(): 84 | target_v = reward + self.gamma * self.target_net(next_state).max(1)[0] 85 | 86 | #Update... 87 | for index in BatchSampler(SubsetRandomSampler(range(len(self.memory))), batch_size=self.batch_size, drop_last=False): 88 | v = (self.act_net(state).gather(1, action))[index] 89 | loss = self.loss_func(target_v[index].unsqueeze(1), (self.act_net(state).gather(1, action))[index]) 90 | self.optimizer.zero_grad() 91 | loss.backward() 92 | self.optimizer.step() 93 | self.writer.add_scalar('loss/value_loss', loss, self.update_count) 94 | self.update_count +=1 95 | if self.update_count % 100 ==0: 96 | self.target_net.load_state_dict(self.act_net.state_dict()) 97 | else: 98 | print("Memory Buff is too less") 99 | def main(): 100 | 101 | agent = DQN() 102 | for i_ep in range(num_episodes): 103 | state = env.reset() 104 | if render: env.render() 105 | for t in range(10000): 106 | action = agent.select_action(state) 107 | next_state, reward, done, info = env.step(action) 108 | if render: env.render() 109 | transition = Transition(state, action, reward, next_state) 110 | agent.store_transition(transition) 111 | state = next_state 112 | if done or t >=9999: 113 | agent.writer.add_scalar('live/finish_step', t+1, global_step=i_ep) 114 | agent.update() 115 | if i_ep % 10 == 0: 116 | print("episodes {}, step is {} ".format(i_ep, t)) 117 | break 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /Char01 DQN/DQN_mountain_car_v1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch import optim 6 | import matplotlib.pyplot as plt 7 | import gym 8 | 9 | 10 | #hyper parameters 11 | EPSILON = 0.9 12 | GAMMA = 0.9 13 | LR = 0.01 14 | MEMORY_CAPACITY = 2000 15 | Q_NETWORK_ITERATION = 100 16 | BATCH_SIZE = 32 17 | 18 | EPISODES = 400 19 | env = gym.make('MountainCar-v0') 20 | env = env.unwrapped 21 | NUM_STATES = env.observation_space.shape[0] # 2 22 | NUM_ACTIONS = env.action_space.n 23 | 24 | 25 | class Net(nn.Module): 26 | def __init__(self): 27 | super(Net, self).__init__() 28 | 29 | self.fc1 = nn.Linear(NUM_STATES, 30) 30 | self.fc1.weight.data.normal_(0, 0.1) 31 | self.fc2 = nn.Linear(30, NUM_ACTIONS) 32 | self.fc2.weight.data.normal_(0, 0.1) 33 | 34 | 35 | def forward(self, x): 36 | x = self.fc1(x) 37 | x = F.relu(x) 38 | x = self.fc2(x) 39 | 40 | return x 41 | 42 | class Dqn(): 43 | def __init__(self): 44 | self.eval_net, self.target_net = Net(), Net() 45 | self.memory = np.zeros((MEMORY_CAPACITY, NUM_STATES *2 +2)) 46 | # state, action ,reward and next state 47 | self.memory_counter = 0 48 | self.learn_counter = 0 49 | self.optimizer = optim.Adam(self.eval_net.parameters(), LR) 50 | self.loss = nn.MSELoss() 51 | 52 | self.fig, self.ax = plt.subplots() 53 | 54 | def 
store_trans(self, state, action, reward, next_state): 55 | if self.memory_counter % 500 ==0: 56 | print("The experience pool collects {} time experience".format(self.memory_counter)) 57 | index = self.memory_counter % MEMORY_CAPACITY 58 | trans = np.hstack((state, [action], [reward], next_state)) 59 | self.memory[index,] = trans 60 | self.memory_counter += 1 61 | 62 | def choose_action(self, state): 63 | # notation that the function return the action's index nor the real action 64 | # EPSILON 65 | state = torch.unsqueeze(torch.FloatTensor(state) ,0) 66 | if np.random.randn() <= EPSILON: 67 | action_value = self.eval_net.forward(state) 68 | action = torch.max(action_value, 1)[1].data.numpy() # get action whose q is max 69 | action = action[0] #get the action index 70 | else: 71 | action = np.random.randint(0,NUM_ACTIONS) 72 | return action 73 | 74 | def plot(self, ax, x): 75 | ax.cla() 76 | ax.set_xlabel("episode") 77 | ax.set_ylabel("total reward") 78 | ax.plot(x, 'b-') 79 | plt.pause(0.000000000000001) 80 | 81 | def learn(self): 82 | # learn 100 times then the target network update 83 | if self.learn_counter % Q_NETWORK_ITERATION ==0: 84 | self.target_net.load_state_dict(self.eval_net.state_dict()) 85 | self.learn_counter+=1 86 | 87 | sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE) 88 | batch_memory = self.memory[sample_index, :] 89 | batch_state = torch.FloatTensor(batch_memory[:, :NUM_STATES]) 90 | #note that the action must be a int 91 | batch_action = torch.LongTensor(batch_memory[:, NUM_STATES:NUM_STATES+1].astype(int)) 92 | batch_reward = torch.FloatTensor(batch_memory[:, NUM_STATES+1: NUM_STATES+2]) 93 | batch_next_state = torch.FloatTensor(batch_memory[:, -NUM_STATES:]) 94 | 95 | q_eval = self.eval_net(batch_state).gather(1, batch_action) 96 | q_next = self.target_net(batch_next_state).detach() 97 | q_target = batch_reward + GAMMA*q_next.max(1)[0].view(BATCH_SIZE, 1) 98 | 99 | loss = self.loss(q_eval, q_target) 100 | self.optimizer.zero_grad() 101 | loss.backward() 102 | self.optimizer.step() 103 | 104 | 105 | 106 | def main(): 107 | net = Dqn() 108 | print("The DQN is collecting experience...") 109 | step_counter_list = [] 110 | for episode in range(EPISODES): 111 | state = env.reset() 112 | step_counter = 0 113 | while True: 114 | step_counter +=1 115 | env.render() 116 | action = net.choose_action(state) 117 | next_state, reward, done, info = env.step(action) 118 | reward = reward * 100 if reward >0 else reward * 5 119 | net.store_trans(state, action, reward, next_state) 120 | 121 | if net.memory_counter >= MEMORY_CAPACITY: 122 | net.learn() 123 | if done: 124 | print("episode {}, the reward is {}".format(episode, round(reward, 3))) 125 | if done: 126 | step_counter_list.append(step_counter) 127 | net.plot(net.ax, step_counter_list) 128 | break 129 | 130 | state = next_state 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /Char01 DQN/naiveDQN.py: -------------------------------------------------------------------------------- 1 | # Nota that this network won't work because the reward is always 1 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import gym 8 | 9 | # hyper-parameters 10 | BATCH_SIZE = 128 11 | LR = 0.01 12 | GAMMA = 0.90 13 | EPISILO = 0.9 14 | MEMORY_CAPACITY = 20000 15 | Q_NETWORK_ITERATION = 100 16 | 17 | env = gym.make("CartPole-v0") 18 | env = env.unwrapped 19 | NUM_ACTIONS = env.action_space.n 20 
| NUM_STATES = env.observation_space.shape[0] 21 | ENV_A_SHAPE = 0 if isinstance(env.action_space.sample(), int) else env.action_space.sample.shape 22 | class Net(nn.Module): 23 | """docstring for Net""" 24 | def __init__(self): 25 | super(Net, self).__init__() 26 | self.fc1 = nn.Linear(NUM_STATES, 50) 27 | self.fc1.weight.data.normal_(0,0.1) 28 | self.fc2 = nn.Linear(50,30) 29 | self.fc2.weight.data.normal_(0,0.1) 30 | self.out = nn.Linear(30,NUM_ACTIONS) 31 | self.out.weight.data.normal_(0,0.1) 32 | 33 | def forward(self,x): 34 | x = self.fc1(x) 35 | x = F.relu(x) 36 | x = self.fc2(x) 37 | x = F.relu(x) 38 | action_prob = self.out(x) 39 | return action_prob 40 | 41 | class DQN(): 42 | """docstring for DQN""" 43 | def __init__(self): 44 | super(DQN, self).__init__() 45 | self.eval_net, self.target_net = Net(), Net() 46 | 47 | self.learn_step_counter = 0 48 | self.memory_counter = 0 49 | self.memory = np.zeros((MEMORY_CAPACITY, NUM_STATES * 2 + 2)) 50 | # why the NUM_STATE*2 +2 51 | # When we store the memory, we put the state, action, reward and next_state in the memory 52 | # here reward and action is a number, state is a ndarray 53 | self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR) 54 | self.loss_func = nn.MSELoss() 55 | 56 | def choose_action(self, state): 57 | state = torch.unsqueeze(torch.FloatTensor(state), 0) # get a 1D array 58 | if np.random.randn() <= EPISILO:# greedy policy 59 | action_value = self.eval_net.forward(state) 60 | action = torch.max(action_value, 1)[1].data.numpy() 61 | action = action[0] if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE) 62 | else: # random policy 63 | action = np.random.randint(0,NUM_ACTIONS) 64 | action = action if ENV_A_SHAPE ==0 else action.reshape(ENV_A_SHAPE) 65 | return action 66 | 67 | 68 | def store_transition(self, state, action, reward, next_state): 69 | transition = np.hstack((state, [action, reward], next_state)) 70 | index = self.memory_counter % MEMORY_CAPACITY 71 | self.memory[index, :] = transition 72 | self.memory_counter += 1 73 | 74 | 75 | def learn(self): 76 | 77 | #update the parameters 78 | if self.learn_step_counter % Q_NETWORK_ITERATION ==0: 79 | self.target_net.load_state_dict(self.eval_net.state_dict()) 80 | self.learn_step_counter+=1 81 | 82 | #sample batch from memory 83 | sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE) 84 | batch_memory = self.memory[sample_index, :] 85 | batch_state = torch.FloatTensor(batch_memory[:, :NUM_STATES]) 86 | batch_action = torch.LongTensor(batch_memory[:, NUM_STATES:NUM_STATES+1].astype(int)) 87 | batch_reward = torch.FloatTensor(batch_memory[:, NUM_STATES+1:NUM_STATES+2]) 88 | batch_next_state = torch.FloatTensor(batch_memory[:,-NUM_STATES:]) 89 | 90 | #q_eval 91 | q_eval = self.eval_net(batch_state).gather(1, batch_action) 92 | q_next = self.target_net(batch_next_state).detach() 93 | q_target = batch_reward + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1) 94 | loss = self.loss_func(q_eval, q_target) 95 | 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | 100 | def main(): 101 | dqn = DQN() 102 | episodes = 400 103 | print("Collecting Experience....") 104 | for i in range(episodes): 105 | state = env.reset() 106 | ep_reward = 0 107 | while True: 108 | env.render() 109 | action = dqn.choose_action(state) 110 | next_state, reward, done, info = env.step(action) 111 | 112 | dqn.store_transition(state, action, reward, next_state) 113 | ep_reward += reward 114 | 115 | if dqn.memory_counter >= MEMORY_CAPACITY: 116 | dqn.learn() 
117 |                 if done:
118 |                     print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3)))
119 |             if done:
120 |                 break
121 |             state = next_state
122 | 
123 | if __name__ == '__main__':
124 |     main()
125 | 
--------------------------------------------------------------------------------
/Char01 DQN/readme.md:
--------------------------------------------------------------------------------
1 | # Requirements:
2 | 
3 | - tensorflow 1.10
4 | - pytorch 0.4.1
5 | - tensorboardX
6 | - gym
7 | 
8 | ## Tips for the MountainCar-v0 env:
9 | 
10 | The reward in MountainCar-v0 is very sparse: the agent gets no informative signal until the car actually reaches the top of the mountain. If no sample reaches the top during training, the network basically cannot learn. You can therefore reshape the reward, for example by making it positively correlated with the car's current position. A more advanced approach is inverse reinforcement learning (e.g. using a GAN).
11 | 
12 | ![value_loss](DQN/pic/value_loss.jpg)
13 | ![step](DQN/pic/finish_episode.jpg)
14 | Above is the value loss for DQN. The loss grows to about 1e13, yet the network still works well: as training goes on, target_net and act_net become very different, so the computed loss gets very large. The earlier loss was small because the sparse reward produced only small updates to the two networks.
15 | 
--------------------------------------------------------------------------------
/Char02 Policy Gradient/PolicyGradient.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import gym
4 | import numpy as np
5 | from itertools import count
6 | 
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | import torch.optim as optim
11 | import matplotlib.pyplot as plt
12 | from torch.distributions import Categorical
13 | 
14 | 
15 | 
16 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
17 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
18 |                     help='discount factor (default: 0.99)')
19 | parser.add_argument('--seed', type=int, default=543, metavar='N',
20 |                     help='random seed (default: 543)')
21 | parser.add_argument('--render', action='store_true',
22 |                     help='render the environment')
23 | parser.add_argument('--log-interval', type=int, default=10, metavar='N',
24 |                     help='interval between training status logs (default: 10)')
25 | args = parser.parse_args()
26 | 
27 | 
28 | env = gym.make('CartPole-v0')
29 | env.seed(args.seed)
30 | torch.manual_seed(args.seed)
31 | 
32 | 
33 | class Policy(nn.Module):
34 |     def __init__(self):
35 |         super(Policy, self).__init__()
36 |         self.affine1 = nn.Linear(4, 128)
37 |         self.affine2 = nn.Linear(128, 2)
38 | 
39 |         self.saved_log_probs = []
40 |         self.rewards = []
41 | 
42 |     def forward(self, x):
43 |         x = F.relu(self.affine1(x))
44 |         action_scores = self.affine2(x)
45 |         return F.softmax(action_scores, dim=1)
46 | 
47 | 
48 | policy = Policy()
49 | optimizer = optim.Adam(policy.parameters(), lr=1e-2)
50 | eps = np.finfo(np.float32).eps.item()
51 | 
52 | 
53 | def select_action(state):
54 |     state = torch.from_numpy(state).float().unsqueeze(0)
55 |     probs = policy(state)
56 |     m = Categorical(probs)
57 |     action = m.sample()
58 |     policy.saved_log_probs.append(m.log_prob(action))
59 |     return action.item()
60 | 
61 | 
62 | def finish_episode():
63 |     R = 0
64 |     policy_loss = []
65 |     rewards = []
66 |     for r in policy.rewards[::-1]:
67 |         R
= r + args.gamma * R 68 | rewards.insert(0, R) 69 | rewards = torch.tensor(rewards) 70 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 71 | for log_prob, reward in zip(policy.saved_log_probs, rewards): 72 | policy_loss.append(-log_prob * reward) 73 | optimizer.zero_grad() 74 | policy_loss = torch.cat(policy_loss).sum() 75 | policy_loss.backward() 76 | optimizer.step() 77 | del policy.rewards[:] 78 | del policy.saved_log_probs[:] 79 | 80 | 81 | def main(): 82 | running_reward = 10 83 | for i_episode in count(1): 84 | state = env.reset() 85 | for t in range(10000): # Don't infinite loop while learning 86 | action = select_action(state) 87 | state, reward, done, _ = env.step(action) 88 | if args.render: 89 | env.render() 90 | policy.rewards.append(reward) 91 | if done: 92 | break 93 | 94 | running_reward = running_reward * 0.99 + t * 0.01 95 | finish_episode() 96 | if i_episode % args.log_interval == 0: 97 | print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( 98 | i_episode, t, running_reward)) 99 | if running_reward > env.spec.reward_threshold: 100 | print("Solved! Running reward is now {} and " 101 | "the last episode runs to {} time steps!".format(running_reward, t)) 102 | break 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/REINFORCE.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | from itertools import count 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | from torch.distributions import Categorical 11 | 12 | 13 | parser = argparse.ArgumentParser(description='PyTorch REINFORCE example') 14 | parser.add_argument('--gamma', type=float, default=0.99, metavar='G', 15 | help='discount factor (default: 0.99)') 16 | parser.add_argument('--seed', type=int, default=543, metavar='N', 17 | help='random seed (default: 543)') 18 | parser.add_argument('--render', action='store_true', 19 | help='render the environment') 20 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 21 | help='interval between training status logs (default: 10)') 22 | args = parser.parse_args() 23 | 24 | 25 | env = gym.make('CartPole-v0') 26 | env.seed(args.seed) 27 | torch.manual_seed(args.seed) 28 | 29 | 30 | class Policy(nn.Module): 31 | def __init__(self): 32 | super(Policy, self).__init__() 33 | self.affine1 = nn.Linear(4, 128) 34 | self.affine2 = nn.Linear(128, 2) 35 | 36 | self.saved_log_probs = [] 37 | self.rewards = [] 38 | 39 | def forward(self, x): 40 | x = F.relu(self.affine1(x)) 41 | action_scores = self.affine2(x) 42 | return F.softmax(action_scores, dim=1) 43 | 44 | 45 | policy = Policy() 46 | optimizer = optim.Adam(policy.parameters(), lr=1e-2) 47 | eps = np.finfo(np.float32).eps.item() 48 | 49 | 50 | def select_action(state): 51 | state = torch.from_numpy(state).float().unsqueeze(0) 52 | probs = policy(state) 53 | m = Categorical(probs) 54 | action = m.sample() 55 | policy.saved_log_probs.append(m.log_prob(action)) 56 | return action.item() 57 | 58 | 59 | def finish_episode(): 60 | R = 0 61 | policy_loss = [] 62 | rewards = [] 63 | for r in policy.rewards[::-1]: 64 | R = r + args.gamma * R 65 | rewards.insert(0, R) 66 | rewards = torch.tensor(rewards) 67 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 68 | for log_prob, reward in 
zip(policy.saved_log_probs, rewards): 69 | policy_loss.append(-log_prob * reward) 70 | optimizer.zero_grad() 71 | policy_loss = torch.cat(policy_loss).sum() 72 | policy_loss.backward() 73 | optimizer.step() 74 | del policy.rewards[:] 75 | del policy.saved_log_probs[:] 76 | 77 | 78 | def main(): 79 | running_reward = 10 80 | for i_episode in count(1): 81 | state = env.reset() 82 | for t in range(10000): # Don't infinite loop while learning 83 | action = select_action(state) 84 | state, reward, done, _ = env.step(action) 85 | if args.render: 86 | env.render() 87 | policy.rewards.append(reward) 88 | if done: 89 | break 90 | 91 | running_reward = running_reward * 0.99 + t * 0.01 92 | finish_episode() 93 | if i_episode % args.log_interval == 0: 94 | print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format( 95 | i_episode, t, running_reward)) 96 | if running_reward > env.spec.reward_threshold: 97 | print("Solved! Running reward is now {} and " 98 | "the last episode runs to {} time steps!".format(running_reward, t)) 99 | break 100 | 101 | 102 | if __name__ == '__main__': 103 | main() 104 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/REINFORCE_with_Baseline.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import gym 7 | import torch 8 | from torch.distributions import Categorical 9 | import torch.optim as optim 10 | from copy import deepcopy 11 | import argparse 12 | import matplotlib.pyplot as plt 13 | from tensorboardX import SummaryWriter 14 | from torch.nn.utils import clip_grad_norm_ 15 | render = False 16 | 17 | #parser = argparse.ArgumentParser(description='PyTorch REINFORCE example with baseline') 18 | # parser.add_argument('--render', action='store_true', default=True, 19 | # help='render the environment') 20 | #args = parser.parse_args() 21 | 22 | #神经网络的输出: Net1输入state,输出action_prob; Net2 输入state,输出各个action_reward 23 | class Policy(nn.Module): 24 | def __init__(self,n_states, n_hidden, n_output): 25 | super(Policy, self).__init__() 26 | self.linear1 = nn.Linear(n_states, n_hidden) 27 | self.linear2 = nn.Linear(n_hidden, n_output) 28 | 29 | #这是policy的参数 30 | self.reward = [] 31 | self.log_act_probs = [] 32 | self.Gt = [] 33 | self.sigma = [] 34 | #这是state_action_func的参数 35 | # self.Reward = [] 36 | # self.s_value = [] 37 | 38 | def forward(self, x): 39 | x = F.relu(self.linear1(x)) 40 | output = F.softmax(self.linear2(x), dim= 1) 41 | # self.act_probs.append(action_probs) 42 | return output 43 | 44 | 45 | 46 | env = gym.make('CartPole-v0') 47 | # writer = SummaryWriter('./yingyingying') 48 | # state = env.reset() 49 | n_states = env.observation_space.shape[0] 50 | n_actions = env.action_space.n 51 | 52 | policy = Policy(n_states, 128, n_actions) 53 | s_value_func = Policy(n_states, 128, 1) 54 | 55 | 56 | alpha_theta = 1e-3 57 | optimizer_theta = optim.Adam(policy.parameters(), lr=alpha_theta) 58 | # alpha_w = 1e-3 #初始化 59 | # optimizer_w = optim.Adam(policy.parameters(), lr=alpha_w) 60 | gamma = 0.99 61 | 62 | 63 | 64 | seed = 1 65 | env.seed(seed) 66 | torch.manual_seed(seed) 67 | live_time = [] 68 | 69 | def loop_episode(): 70 | 71 | state = env.reset() 72 | if render: env.render() 73 | policy_loss = [] 74 | s_value = [] 75 | state_sequence = [] 76 | log_act_prob = [] 77 | for t in range(1000): 78 | state = torch.from_numpy(state).unsqueeze(0).float() # 在第0维增加一个维度,将数据组织成[N , .....] 
形式 79 | state_sequence.append(deepcopy(state)) 80 | action_probs = policy(state) 81 | m = Categorical(action_probs) 82 | action = m.sample() 83 | m_log_prob = m.log_prob(action) 84 | log_act_prob.append(m_log_prob) 85 | # policy.log_act_probs.append(m_log_prob) 86 | action = action.item() 87 | next_state, re, done, _ = env.step(action) 88 | if render: env.render() 89 | policy.reward.append(re) 90 | if done: 91 | live_time.append(t) 92 | break 93 | state = next_state 94 | 95 | R = 0 96 | Gt = [] 97 | 98 | # get Gt value 99 | for r in policy.reward[::-1]: 100 | R = r + gamma * R 101 | Gt.insert(0, R) 102 | # s_value_func.sigma.insert(0,sigma) 103 | # policy.Gt.insert(0,R) 104 | 105 | 106 | # update step by step 107 | for i in range(len(Gt)): 108 | 109 | 110 | 111 | G = Gt[i] 112 | V = s_value_func(state_sequence[i]) 113 | delta = G - V 114 | 115 | # update value network 116 | alpha_w = 1e-3 # 初始化 117 | 118 | optimizer_w = optim.Adam(policy.parameters(), lr=alpha_w) 119 | optimizer_w.zero_grad() 120 | policy_loss_w =-delta 121 | policy_loss_w.backward(retain_graph = True) 122 | clip_grad_norm_(policy_loss_w, 0.1) 123 | optimizer_w.step() 124 | 125 | # update policy network 126 | optimizer_theta.zero_grad() 127 | policy_loss_theta = - log_act_prob[i] * delta 128 | policy_loss_theta.backward(retain_graph = True) 129 | clip_grad_norm_(policy_loss_theta, 0.1) 130 | optimizer_theta.step() 131 | 132 | del policy.log_act_probs[:] 133 | del policy.reward[:] 134 | 135 | 136 | def plot(live_time): 137 | plt.ion() 138 | plt.grid() 139 | plt.plot(live_time, 'g-') 140 | plt.xlabel('running step') 141 | plt.ylabel('live time') 142 | plt.pause(0.000001) 143 | 144 | 145 | 146 | if __name__ == '__main__': 147 | 148 | #生成若干episode 149 | # graph_data = torch.autograd.Variable(torch.ones(1,4)) 150 | # writer.add_graph(policy, (graph_data, )) 151 | for i_episode in range(1000): 152 | loop_episode() 153 | plot(live_time) 154 | #policy.plot(live_time) 155 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/Run_Model.py: -------------------------------------------------------------------------------- 1 | # MountainCar V0 2 | # 3 | import numpy as np 4 | import gym 5 | import matplotlib.pyplot as plt 6 | from itertools import count 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | from torch.optim import adam 12 | from torch.distributions import Categorical 13 | 14 | env = gym.make('MountainCar-v0') 15 | env = env.unwrapped 16 | env.seed(1) 17 | 18 | torch.manual_seed(1) 19 | plt.ion() 20 | 21 | 22 | #Hyperparameters 23 | learning_rate = 0.02 24 | gamma = 0.995 25 | episodes = 1000 26 | 27 | eps = np.finfo(np.float32).eps.item() 28 | 29 | action_space = env.action_space.n 30 | state_space = env.observation_space.shape[0] 31 | 32 | class Policy(nn.Module): 33 | def __init__(self): 34 | super(Policy, self).__init__() 35 | 36 | self.fc1 = nn.Linear(state_space, 20) 37 | #self.fc2 = nn.Linear(128,64) 38 | self.fc3 = nn.Linear(20, action_space) 39 | 40 | self.gamma = gamma 41 | self.saved_log_probs = [] 42 | self.rewards = [] 43 | 44 | def forward(self, x): 45 | 46 | x = F.relu(self.fc1(x)) 47 | #x = F.relu(self.fc2(x)) 48 | x = F.softmax(self.fc3(x), dim=1) 49 | 50 | return x 51 | 52 | policy = torch.load('policyNet.pkl') 53 | 54 | def plot(steps): 55 | ax = plt.subplot(111) 56 | ax.cla() 57 | ax.set_title('Training') 58 | ax.set_xlabel('Episode') 59 | ax.set_ylabel('Run Time') 60 | ax.plot(steps) 61 | RunTime = len(steps) 
62 | path = './PG_MountainCar-v0/'+'RunTime'+str(RunTime)+'.jpg' 63 | if len(steps) % 100 == 0: 64 | #plt.savefig(path) 65 | pass 66 | plt.pause(0.0000001) 67 | 68 | def selct_action(state): 69 | state = torch.from_numpy(state).float().unsqueeze(0) 70 | probs = policy(state) 71 | c = Categorical(probs) 72 | action = c.sample() 73 | 74 | 75 | #policy.saved_log_probs.append(c.log_prob(action)) 76 | action = action.item() 77 | return action 78 | 79 | def run_Model(): 80 | running_reward = 0 81 | steps = [] 82 | for episode in count(60000): 83 | state = env.reset() 84 | 85 | for t in range(10000): 86 | action = selct_action(state) 87 | state, reward ,done, info = env.step(action) 88 | env.render() 89 | #policy.rewards.append(reward) 90 | 91 | if done: 92 | print("Episode {}, live time = {}".format(episode, t)) 93 | steps.append(t) 94 | plot(steps) 95 | break 96 | if episode % 50 == 0: 97 | pass 98 | #torch.save(policy, 'policyNet.pkl') 99 | 100 | running_reward = running_reward * policy.gamma - t*0.01 101 | #finish_episode() 102 | 103 | if __name__ == '__main__': 104 | run_Model() -------------------------------------------------------------------------------- /Char02 Policy Gradient/naive-policy-gradient.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import matplotlib.pyplot as plt 4 | from itertools import count 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from torch.autograd import Variable 10 | from torch.distributions import Categorical 11 | 12 | 13 | #Hyperparameters 14 | learning_rate = 0.01 15 | gamma = 0.98 16 | 17 | num_episode = 5000 18 | batch_size = 32 19 | 20 | 21 | env = gym.make('CartPole-v0') 22 | state_space = env.observation_space.shape[0] 23 | action_space = env.action_space.n 24 | 25 | def plot_durations(episode_durations): 26 | plt.ion() 27 | plt.figure(2) 28 | plt.clf() 29 | duration_t = torch.FloatTensor(episode_durations) 30 | plt.title('Training') 31 | plt.xlabel('Episodes') 32 | plt.ylabel('Duration') 33 | plt.plot(duration_t.numpy()) 34 | 35 | if len(duration_t) >= 100: 36 | means = duration_t.unfold(0,100,1).mean(1).view(-1) 37 | means = torch.cat((torch.zeros(99), means)) 38 | plt.plot(means.numpy()) 39 | 40 | plt.pause(0.00001) 41 | 42 | class Policy(nn.Module): 43 | 44 | def __init__(self): 45 | super(Policy, self).__init__() 46 | 47 | self.state_space = state_space 48 | self.action_space = action_space 49 | 50 | self.fc1 = nn.Linear(self.state_space, 128) 51 | self.fc2 = nn.Linear(128, self.action_space) 52 | 53 | def forward(self, x): 54 | x = self.fc1(x) 55 | #x = F.dropout(x, 0.5) 56 | x = F.relu(x) 57 | x = F.softmax(self.fc2(x), dim=-1) 58 | 59 | return x 60 | 61 | policy = Policy() 62 | optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate) 63 | 64 | 65 | 66 | def train(): 67 | 68 | episode_durations = [] 69 | #Batch_history 70 | state_pool = [] 71 | action_pool = [] 72 | reward_pool = [] 73 | steps = 0 74 | 75 | for episode in range(num_episode): 76 | state = env.reset() 77 | state = torch.from_numpy(state).float() 78 | state = Variable(state) 79 | 80 | env.render() 81 | 82 | for t in count(): 83 | probs = policy(state) 84 | c = Categorical(probs) 85 | action = c.sample() 86 | 87 | action = action.data.numpy().astype('int32') 88 | next_state, reward, done, info = env.step(action) 89 | reward = 0 if done else reward # correct the reward 90 | env.render() 91 | 92 | state_pool.append(state) 93 | 
action_pool.append(float(action)) 94 | reward_pool.append(reward) 95 | 96 | state = next_state 97 | state = torch.from_numpy(state).float() 98 | state = Variable(state) 99 | 100 | steps += 1 101 | 102 | if done: 103 | episode_durations.append(t+1) 104 | plot_durations(episode_durations) 105 | break 106 | 107 | # update policy 108 | if episode >0 and episode % batch_size == 0: 109 | 110 | r = 0 111 | ''' 112 | for i in reversed(range(steps)): 113 | if reward_pool[i] == 0: 114 | running_add = 0 115 | else: 116 | running_add = running_add * gamma +reward_pool[i] 117 | reward_pool[i] = running_add 118 | ''' 119 | for i in reversed(range(steps)): 120 | if reward_pool[i] == 0: 121 | r = 0 122 | else: 123 | r = r * gamma + reward_pool[i] 124 | reward_pool[i] = r 125 | 126 | #Normalize reward 127 | reward_mean = np.mean(reward_pool) 128 | reward_std = np.std(reward_pool) 129 | reward_pool = (reward_pool-reward_mean)/reward_std 130 | 131 | #gradiend desent 132 | optimizer.zero_grad() 133 | 134 | for i in range(steps): 135 | state = state_pool[i] 136 | action = Variable(torch.FloatTensor([action_pool[i]])) 137 | reward = reward_pool[i] 138 | 139 | probs = policy(state) 140 | c = Categorical(probs) 141 | 142 | loss = -c.log_prob(action) * reward 143 | loss.backward() 144 | 145 | optimizer.step() 146 | 147 | # clear the batch pool 148 | state_pool = [] 149 | action_pool = [] 150 | reward_pool = [] 151 | steps = 0 152 | 153 | train() 154 | -------------------------------------------------------------------------------- /Char02 Policy Gradient/pytorch_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | # MountainCar V0 2 | 3 | import numpy as np 4 | import gym 5 | import matplotlib.pyplot as plt 6 | from itertools import count 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | from torch.optim import adam 12 | from torch.distributions import Categorical 13 | 14 | env = gym.make('MountainCar-v0') 15 | env = env.unwrapped 16 | env.seed(1) 17 | 18 | torch.manual_seed(1) 19 | plt.ion() 20 | 21 | 22 | #Hyperparameters 23 | learning_rate = 0.02 24 | gamma = 0.995 25 | episodes = 1000 26 | 27 | eps = np.finfo(np.float32).eps.item() 28 | 29 | action_space = env.action_space.n 30 | state_space = env.observation_space.shape[0] 31 | 32 | 33 | class Policy(nn.Module): 34 | def __init__(self): 35 | super(Policy, self).__init__() 36 | 37 | self.fc1 = nn.Linear(state_space, 20) 38 | #self.fc2 = nn.Linear(128,64) 39 | self.fc3 = nn.Linear(20, action_space) 40 | 41 | self.gamma = gamma 42 | self.saved_log_probs = [] 43 | self.rewards = [] 44 | 45 | def forward(self, x): 46 | 47 | x = F.relu(self.fc1(x)) 48 | #x = F.relu(self.fc2(x)) 49 | x = F.softmax(self.fc3(x), dim=1) 50 | 51 | return x 52 | 53 | policy = Policy() 54 | optimizer = adam.Adam(policy.parameters(), lr=learning_rate) 55 | 56 | def selct_action(state): 57 | state = torch.from_numpy(state).float().unsqueeze(0) 58 | probs = policy(state) 59 | c = Categorical(probs) 60 | action = c.sample() 61 | 62 | 63 | policy.saved_log_probs.append(c.log_prob(action)) 64 | action = action.item() 65 | return action 66 | 67 | def finish_episode(): 68 | R = 0 69 | policy_loss = [] 70 | rewards = [] 71 | 72 | for r in policy.rewards[::-1]: 73 | R = r + policy.gamma * R 74 | rewards.insert(0, R) 75 | 76 | # Formalize reward 77 | rewards = torch.tensor(rewards) 78 | rewards = (rewards - rewards.mean())/(rewards.std() + eps) 79 | 80 | # get loss 81 | for reward, log_prob in zip(rewards, 
policy.saved_log_probs): 82 | policy_loss.append(-log_prob * reward) 83 | 84 | optimizer.zero_grad() 85 | policy_loss = torch.cat(policy_loss).sum() 86 | policy_loss.backward() 87 | optimizer.step() 88 | 89 | 90 | 91 | del policy.rewards[:] 92 | del policy.saved_log_probs[:] 93 | 94 | def plot(steps): 95 | ax = plt.subplot(111) 96 | ax.cla() 97 | ax.set_title('Training') 98 | ax.set_xlabel('Episode') 99 | ax.set_ylabel('Run Time') 100 | ax.plot(steps) 101 | RunTime = len(steps) 102 | path = './PG_MountainCar-v0/'+'RunTime'+str(RunTime)+'.jpg' 103 | if len(steps) % 100 == 0: 104 | plt.savefig(path) 105 | plt.pause(0.0000001) 106 | 107 | 108 | 109 | def main(): 110 | 111 | running_reward = 0 112 | steps = [] 113 | for episode in count(60000): 114 | state = env.reset() 115 | 116 | for t in range(10000): 117 | action = selct_action(state) 118 | state, reward ,done, info = env.step(action) 119 | env.render() 120 | policy.rewards.append(reward) 121 | 122 | if done: 123 | print("Episode {}, live time = {}".format(episode, t)) 124 | steps.append(t) 125 | plot(steps) 126 | break 127 | if episode % 50 == 0: 128 | torch.save(policy, 'policyNet.pkl') 129 | 130 | running_reward = running_reward * policy.gamma - t*0.01 131 | finish_episode() 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /Char03 Actor-Critic/AC_CartPole-v0.py: -------------------------------------------------------------------------------- 1 | import gym, os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from itertools import count 5 | from collections import namedtuple 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | from torch.distributions import Categorical 12 | 13 | #Parameters 14 | env = gym.make('CartPole-v0') 15 | env = env.unwrapped 16 | 17 | env.seed(1) 18 | torch.manual_seed(1) 19 | 20 | state_space = env.observation_space.shape[0] 21 | action_space = env.action_space.n 22 | 23 | 24 | #Hyperparameters 25 | learning_rate = 0.01 26 | gamma = 0.99 27 | episodes = 20000 28 | render = False 29 | eps = np.finfo(np.float32).eps.item() 30 | SavedAction = namedtuple('SavedAction', ['log_prob', 'value']) 31 | 32 | class Policy(nn.Module): 33 | def __init__(self): 34 | super(Policy, self).__init__() 35 | self.fc1 = nn.Linear(state_space, 32) 36 | 37 | self.action_head = nn.Linear(32, action_space) 38 | self.value_head = nn.Linear(32, 1) # Scalar Value 39 | 40 | self.save_actions = [] 41 | self.rewards = [] 42 | os.makedirs('./AC_CartPole-v0', exist_ok=True) 43 | 44 | def forward(self, x): 45 | x = F.relu(self.fc1(x)) 46 | action_score = self.action_head(x) 47 | state_value = self.value_head(x) 48 | 49 | return F.softmax(action_score, dim=-1), state_value 50 | 51 | model = Policy() 52 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 53 | 54 | def plot(steps): 55 | ax = plt.subplot(111) 56 | ax.cla() 57 | ax.grid() 58 | ax.set_title('Training') 59 | ax.set_xlabel('Episode') 60 | ax.set_ylabel('Run Time') 61 | ax.plot(steps) 62 | RunTime = len(steps) 63 | 64 | path = './AC_CartPole-v0/' + 'RunTime' + str(RunTime) + '.jpg' 65 | if len(steps) % 200 == 0: 66 | plt.savefig(path) 67 | plt.pause(0.0000001) 68 | 69 | def select_action(state): 70 | state = torch.from_numpy(state).float() 71 | probs, state_value = model(state) 72 | m = Categorical(probs) 73 | action = m.sample() 74 | model.save_actions.append(SavedAction(m.log_prob(action), state_value)) 75 
| 76 | return action.item() 77 | 78 | 79 | def finish_episode(): 80 | R = 0 81 | save_actions = model.save_actions 82 | policy_loss = [] 83 | value_loss = [] 84 | rewards = [] 85 | 86 | for r in model.rewards[::-1]: 87 | R = r + gamma * R 88 | rewards.insert(0, R) 89 | 90 | rewards = torch.tensor(rewards) 91 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 92 | 93 | for (log_prob , value), r in zip(save_actions, rewards): 94 | reward = r - value.item() 95 | policy_loss.append(-log_prob * reward) 96 | value_loss.append(F.smooth_l1_loss(value, torch.tensor([r]))) 97 | 98 | optimizer.zero_grad() 99 | loss = torch.stack(policy_loss).sum() + torch.stack(value_loss).sum() 100 | loss.backward() 101 | optimizer.step() 102 | 103 | del model.rewards[:] 104 | del model.save_actions[:] 105 | 106 | def main(): 107 | running_reward = 10 108 | live_time = [] 109 | for i_episode in count(episodes): 110 | state = env.reset() 111 | for t in count(): 112 | action = select_action(state) 113 | state, reward, done, info = env.step(action) 114 | if render: env.render() 115 | model.rewards.append(reward) 116 | 117 | if done or t >= 1000: 118 | break 119 | running_reward = running_reward * 0.99 + t * 0.01 120 | live_time.append(t) 121 | plot(live_time) 122 | if i_episode % 100 == 0: 123 | modelPath = './AC_CartPole_Model/ModelTraing'+str(i_episode)+'Times.pkl' 124 | torch.save(model, modelPath) 125 | finish_episode() 126 | 127 | if __name__ == '__main__': 128 | main() 129 | -------------------------------------------------------------------------------- /Char03 Actor-Critic/AC_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | import gym, os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from itertools import count 5 | from collections import namedtuple 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | from torch.optim import Adam 11 | from torch.distributions import Categorical 12 | from torch.nn.functional import smooth_l1_loss 13 | 14 | #Hyperparameters 15 | LEARNING_RATE = 0.01 16 | GAMMA = 0.995 17 | NUM_EPISODES = 50000 18 | RENDER = False 19 | #env info 20 | env = gym.make('MountainCar-v0') 21 | env = env.unwrapped 22 | 23 | env.seed(1) 24 | torch.manual_seed(1) 25 | 26 | num_state = env.observation_space.shape[0] 27 | num_action = env.action_space.n 28 | eps = np.finfo(np.float32).eps.item() 29 | plt.ion() 30 | saveAction = namedtuple('SavedActions',['probs', 'action_values']) 31 | 32 | class Module(nn.Module): 33 | def __init__(self): 34 | super(Module, self).__init__() 35 | self.fc1 = nn.Linear(num_state, 128) 36 | #self.fc2 = nn.Linear(64, 128) 37 | 38 | self.action_head = nn.Linear(128, num_action) 39 | self.value_head = nn.Linear(128, 1) 40 | self.policy_action_value = [] 41 | self.rewards = [] 42 | 43 | self.gamma = GAMMA 44 | os.makedirs('/AC_MountainCar-v0_Model/', exist_ok=True) 45 | 46 | 47 | def forward(self, x): 48 | x = F.relu(self.fc1(x)) 49 | #x = F.relu(self.fc2(x)) 50 | 51 | probs = F.softmax(self.action_head(x)) 52 | value = self.value_head(x) 53 | return probs, value 54 | 55 | policy = Module() 56 | optimizer = Adam(policy.parameters(), lr=LEARNING_RATE) 57 | 58 | def plot(steps): 59 | ax = plt.subplot(111) 60 | ax.cla() 61 | ax.grid() 62 | ax.set_title('Training') 63 | ax.set_xlabel('Episode') 64 | ax.set_ylabel('Run Time') 65 | ax.plot(steps) 66 | RunTime = len(steps) 67 | path = './AC_MountainCar-v0/' + 'RunTime' + str(RunTime) + '.jpg' 68 | if len(steps) % 200 
== 0: 69 | plt.savefig(path) 70 | plt.pause(0.0000001) 71 | 72 | 73 | def select_action(state): 74 | state = torch.from_numpy(state).float() 75 | probs, value = policy(state) 76 | c = Categorical(probs) 77 | action = c.sample() 78 | log_prob = c.log_prob(action) 79 | 80 | 81 | policy.policy_action_value.append(saveAction(log_prob, value)) 82 | action = action.item() 83 | return action 84 | 85 | 86 | def finish_episode(): 87 | rewards = [] 88 | saveActions = policy.policy_action_value 89 | policy_loss = [] 90 | value_loss = [] 91 | R = 0 92 | 93 | for r in policy.rewards[::-1]: 94 | R = r + policy.gamma * R 95 | rewards.insert(0, R) 96 | 97 | # Normalize the reward 98 | rewards = torch.tensor(rewards) 99 | rewards = (rewards - rewards.mean()) / (rewards.std() + eps) 100 | 101 | #Figure out loss 102 | for (log_prob, value), r in zip(saveActions, rewards): 103 | reward = r - value.item() 104 | policy_loss.append(-log_prob * reward) 105 | value_loss.append(smooth_l1_loss(value, torch.tensor([r]) )) 106 | 107 | optimizer.zero_grad() 108 | loss = torch.stack(policy_loss).sum() + torch.stack(policy_loss).sum() 109 | loss.backward() 110 | optimizer.step() 111 | 112 | del policy.rewards[:] 113 | del policy.policy_action_value[:] 114 | 115 | 116 | def main(): 117 | run_steps = [] 118 | for i_episode in range(NUM_EPISODES): 119 | state = env.reset() 120 | if RENDER: env.render() 121 | 122 | for t in count(): 123 | action = select_action(state) 124 | state , reward, done, _ = env.step(action) 125 | reward = state[0] + reward 126 | if RENDER: env.render() 127 | policy.rewards.append(reward) 128 | 129 | if done: 130 | run_steps.append(t) 131 | print("Epiosde {} , run step is {} ".format(i_episode+1 , t+1)) 132 | break 133 | 134 | finish_episode() 135 | plot(run_steps) 136 | 137 | if i_episode % 100 == 0 and i_episode !=0: 138 | 139 | modelPath = './AC_MountainCar-v0_Model/ModelTraing' + str(i_episode) + 'Times.pkl' 140 | torch.save(policy, modelPath) 141 | 142 | 143 | 144 | if __name__ == '__main__': 145 | main() 146 | -------------------------------------------------------------------------------- /Char04 A2C/A2C.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import random 4 | 5 | import gym 6 | import numpy as np 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import torch.nn.functional as F 12 | from torch.distributions import Categorical 13 | 14 | import matplotlib.pyplot as plt 15 | 16 | use_cuda = torch.cuda.is_available() 17 | device = torch.device("cuda" if use_cuda else "cpu") 18 | 19 | from multiprocessing_env import SubprocVecEnv 20 | 21 | num_envs = 8 22 | env_name = "CartPole-v0" 23 | 24 | def make_env(): 25 | def _thunk(): 26 | env = gym.make(env_name) 27 | return env 28 | return _thunk 29 | 30 | plt.ion() 31 | envs = [make_env() for i in range(num_envs)] 32 | envs = SubprocVecEnv(envs) # 8 env 33 | 34 | env = gym.make(env_name) # a single env 35 | 36 | class ActorCritic(nn.Module): 37 | def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0): 38 | super(ActorCritic, self).__init__() 39 | 40 | self.critic = nn.Sequential( 41 | nn.Linear(num_inputs, hidden_size), 42 | nn.ReLU(), 43 | nn.Linear(hidden_size, 1) 44 | ) 45 | 46 | self.actor = nn.Sequential( 47 | nn.Linear(num_inputs, hidden_size), 48 | nn.ReLU(), 49 | nn.Linear(hidden_size, num_outputs), 50 | nn.Softmax(dim=1), 51 | ) 52 | 53 | def forward(self, x): 54 | value = self.critic(x) 55 | probs = self.actor(x) 56 | dist = 
Categorical(probs) 57 | return dist, value 58 | 59 | 60 | def test_env(vis=False): 61 | state = env.reset() 62 | if vis: env.render() 63 | done = False 64 | total_reward = 0 65 | while not done: 66 | state = torch.FloatTensor(state).unsqueeze(0).to(device) 67 | dist, _ = model(state) 68 | next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0]) 69 | state = next_state 70 | if vis: env.render() 71 | total_reward += reward 72 | return total_reward 73 | 74 | 75 | def compute_returns(next_value, rewards, masks, gamma=0.99): 76 | R = next_value 77 | returns = [] 78 | for step in reversed(range(len(rewards))): 79 | R = rewards[step] + gamma * R * masks[step] 80 | returns.insert(0, R) 81 | return returns 82 | 83 | def plot(frame_idx, rewards): 84 | plt.plot(rewards,'b-') 85 | plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) 86 | plt.pause(0.0001) 87 | 88 | 89 | num_inputs = envs.observation_space.shape[0] 90 | num_outputs = envs.action_space.n 91 | 92 | #Hyper params: 93 | hidden_size = 256 94 | lr = 1e-3 95 | num_steps = 5 96 | 97 | model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device) 98 | optimizer = optim.Adam(model.parameters()) 99 | 100 | 101 | max_frames = 20000 102 | frame_idx = 0 103 | test_rewards = [] 104 | 105 | 106 | state = envs.reset() 107 | 108 | while frame_idx < max_frames: 109 | 110 | log_probs = [] 111 | values = [] 112 | rewards = [] 113 | masks = [] 114 | entropy = 0 115 | 116 | # rollout trajectory 117 | for _ in range(num_steps): 118 | state = torch.FloatTensor(state).to(device) 119 | dist, value = model(state) 120 | 121 | action = dist.sample() 122 | next_state, reward, done, _ = envs.step(action.cpu().numpy()) 123 | 124 | log_prob = dist.log_prob(action) 125 | entropy += dist.entropy().mean() 126 | 127 | log_probs.append(log_prob) 128 | values.append(value) 129 | rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) 130 | masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) 131 | 132 | state = next_state 133 | frame_idx += 1 134 | 135 | if frame_idx % 100 == 0: 136 | test_rewards.append(np.mean([test_env() for _ in range(10)])) 137 | plot(frame_idx, test_rewards) 138 | 139 | next_state = torch.FloatTensor(next_state).to(device) 140 | _, next_value = model(next_state) 141 | returns = compute_returns(next_value, rewards, masks) 142 | 143 | log_probs = torch.cat(log_probs) 144 | returns = torch.cat(returns).detach() 145 | values = torch.cat(values) 146 | 147 | advantage = returns - values 148 | 149 | actor_loss = -(log_probs * advantage.detach()).mean() 150 | critic_loss = advantage.pow(2).mean() 151 | 152 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy 153 | 154 | optimizer.zero_grad() 155 | loss.backward() 156 | optimizer.step() 157 | 158 | #test_env(True) 159 | -------------------------------------------------------------------------------- /Char04 A2C/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | #This code is from openai baseline 2 | #https://github.com/openai/baselines/tree/master/baselines/common/vec_env 3 | 4 | import numpy as np 5 | from multiprocessing import Process, Pipe 6 | 7 | def worker(remote, parent_remote, env_fn_wrapper): 8 | parent_remote.close() 9 | env = env_fn_wrapper.x() 10 | while True: 11 | cmd, data = remote.recv() 12 | if cmd == 'step': 13 | ob, reward, done, info = env.step(data) 14 | if done: 15 | ob = env.reset() 16 | remote.send((ob, reward, done, info)) 17 | elif cmd == 'reset': 18 | ob = env.reset() 19 | 
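# Each worker process owns one environment and services commands arriving on its end
# of a Pipe: 'step' (which auto-resets the env when done), 'reset', 'reset_task',
# 'close', and 'get_spaces'. The parent-side SubprocVecEnv defined below batches these
# calls, so the A2C rollout receives stacked observations, rewards and done flags
# from num_envs environments at once.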
remote.send(ob) 20 | elif cmd == 'reset_task': 21 | ob = env.reset_task() 22 | remote.send(ob) 23 | elif cmd == 'close': 24 | remote.close() 25 | break 26 | elif cmd == 'get_spaces': 27 | remote.send((env.observation_space, env.action_space)) 28 | else: 29 | raise NotImplementedError 30 | 31 | class VecEnv(object): 32 | """ 33 | An abstract asynchronous, vectorized environment. 34 | """ 35 | def __init__(self, num_envs, observation_space, action_space): 36 | self.num_envs = num_envs 37 | self.observation_space = observation_space 38 | self.action_space = action_space 39 | 40 | def reset(self): 41 | """ 42 | Reset all the environments and return an array of 43 | observations, or a tuple of observation arrays. 44 | If step_async is still doing work, that work will 45 | be cancelled and step_wait() should not be called 46 | until step_async() is invoked again. 47 | """ 48 | pass 49 | 50 | def step_async(self, actions): 51 | """ 52 | Tell all the environments to start taking a step 53 | with the given actions. 54 | Call step_wait() to get the results of the step. 55 | You should not call this if a step_async run is 56 | already pending. 57 | """ 58 | pass 59 | 60 | def step_wait(self): 61 | """ 62 | Wait for the step taken with step_async(). 63 | Returns (obs, rews, dones, infos): 64 | - obs: an array of observations, or a tuple of 65 | arrays of observations. 66 | - rews: an array of rewards 67 | - dones: an array of "episode done" booleans 68 | - infos: a sequence of info objects 69 | """ 70 | pass 71 | 72 | def close(self): 73 | """ 74 | Clean up the environments' resources. 75 | """ 76 | pass 77 | 78 | def step(self, actions): 79 | self.step_async(actions) 80 | return self.step_wait() 81 | 82 | 83 | class CloudpickleWrapper(object): 84 | """ 85 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 86 | """ 87 | def __init__(self, x): 88 | self.x = x 89 | def __getstate__(self): 90 | import cloudpickle 91 | return cloudpickle.dumps(self.x) 92 | def __setstate__(self, ob): 93 | import pickle 94 | self.x = pickle.loads(ob) 95 | 96 | 97 | class SubprocVecEnv(VecEnv): 98 | def __init__(self, env_fns, spaces=None): 99 | """ 100 | envs: list of gym environments to run in subprocesses 101 | """ 102 | self.waiting = False 103 | self.closed = False 104 | nenvs = len(env_fns) 105 | self.nenvs = nenvs 106 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 107 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 108 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 109 | for p in self.ps: 110 | p.daemon = True # if the main process crashes, we should not cause things to hang 111 | p.start() 112 | for remote in self.work_remotes: 113 | remote.close() 114 | 115 | self.remotes[0].send(('get_spaces', None)) 116 | observation_space, action_space = self.remotes[0].recv() 117 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 118 | 119 | def step_async(self, actions): 120 | for remote, action in zip(self.remotes, actions): 121 | remote.send(('step', action)) 122 | self.waiting = True 123 | 124 | def step_wait(self): 125 | results = [remote.recv() for remote in self.remotes] 126 | self.waiting = False 127 | obs, rews, dones, infos = zip(*results) 128 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 129 | 130 | def reset(self): 131 | for remote in self.remotes: 132 | remote.send(('reset', None)) 133 | return np.stack([remote.recv() for remote in 
self.remotes]) 134 | 135 | def reset_task(self): 136 | for remote in self.remotes: 137 | remote.send(('reset_task', None)) 138 | return np.stack([remote.recv() for remote in self.remotes]) 139 | 140 | def close(self): 141 | if self.closed: 142 | return 143 | if self.waiting: 144 | for remote in self.remotes: 145 | remote.recv() 146 | for remote in self.remotes: 147 | remote.send(('close', None)) 148 | for p in self.ps: 149 | p.join() 150 | self.closed = True 151 | 152 | def __len__(self): 153 | return self.nenvs 154 | -------------------------------------------------------------------------------- /Char05 DDPG/DDPG.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import count 3 | 4 | import os, sys, random 5 | import numpy as np 6 | 7 | import gym 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.distributions import Normal 13 | from tensorboardX import SummaryWriter 14 | 15 | ''' 16 | Implementation of Deep Deterministic Policy Gradients (DDPG) with pytorch 17 | riginal paper: https://arxiv.org/abs/1509.02971 18 | Not the author's implementation ! 19 | ''' 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--mode', default='train', type=str) # mode = 'train' or 'test' 23 | # OpenAI gym environment name, # ['BipedalWalker-v2', 'Pendulum-v0'] or any continuous environment 24 | # Note that DDPG is feasible about hyper-parameters. 25 | # You should fine-tuning if you change to another environment. 26 | parser.add_argument("--env_name", default="Pendulum-v0") 27 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 28 | parser.add_argument('--target_update_interval', default=1, type=int) 29 | parser.add_argument('--test_iteration', default=10, type=int) 30 | 31 | parser.add_argument('--learning_rate', default=1e-4, type=float) 32 | parser.add_argument('--gamma', default=0.99, type=int) # discounted factor 33 | parser.add_argument('--capacity', default=1000000, type=int) # replay buffer size 34 | parser.add_argument('--batch_size', default=100, type=int) # mini batch size 35 | parser.add_argument('--seed', default=False, type=bool) 36 | parser.add_argument('--random_seed', default=9527, type=int) 37 | # optional parameters 38 | 39 | parser.add_argument('--sample_frequency', default=2000, type=int) 40 | parser.add_argument('--render', default=False, type=bool) # show UI or not 41 | parser.add_argument('--log_interval', default=50, type=int) # 42 | parser.add_argument('--load', default=False, type=bool) # load model 43 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 44 | parser.add_argument('--exploration_noise', default=0.1, type=float) 45 | parser.add_argument('--max_episode', default=100000, type=int) # num of games 46 | parser.add_argument('--print_log', default=5, type=int) 47 | parser.add_argument('--update_iteration', default=200, type=int) 48 | args = parser.parse_args() 49 | 50 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 51 | script_name = os.path.basename(__file__) 52 | env = gym.make(args.env_name) 53 | 54 | if args.seed: 55 | env.seed(args.random_seed) 56 | torch.manual_seed(args.random_seed) 57 | np.random.seed(args.random_seed) 58 | 59 | state_dim = env.observation_space.shape[0] 60 | action_dim = env.action_space.shape[0] 61 | max_action = float(env.action_space.high[0]) 62 | min_Val = 
torch.tensor(1e-7).float().to(device) # min value 63 | 64 | directory = './exp' + script_name + args.env_name +'./' 65 | 66 | class Replay_buffer(): 67 | ''' 68 | Code based on: 69 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 70 | Expects tuples of (state, next_state, action, reward, done) 71 | ''' 72 | def __init__(self, max_size=args.capacity): 73 | self.storage = [] 74 | self.max_size = max_size 75 | self.ptr = 0 76 | 77 | def push(self, data): 78 | if len(self.storage) == self.max_size: 79 | self.storage[int(self.ptr)] = data 80 | self.ptr = (self.ptr + 1) % self.max_size 81 | else: 82 | self.storage.append(data) 83 | 84 | def sample(self, batch_size): 85 | ind = np.random.randint(0, len(self.storage), size=batch_size) 86 | x, y, u, r, d = [], [], [], [], [] 87 | 88 | for i in ind: 89 | X, Y, U, R, D = self.storage[i] 90 | x.append(np.array(X, copy=False)) 91 | y.append(np.array(Y, copy=False)) 92 | u.append(np.array(U, copy=False)) 93 | r.append(np.array(R, copy=False)) 94 | d.append(np.array(D, copy=False)) 95 | 96 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 97 | 98 | 99 | class Actor(nn.Module): 100 | def __init__(self, state_dim, action_dim, max_action): 101 | super(Actor, self).__init__() 102 | 103 | self.l1 = nn.Linear(state_dim, 400) 104 | self.l2 = nn.Linear(400, 300) 105 | self.l3 = nn.Linear(300, action_dim) 106 | 107 | self.max_action = max_action 108 | 109 | def forward(self, x): 110 | x = F.relu(self.l1(x)) 111 | x = F.relu(self.l2(x)) 112 | x = self.max_action * torch.tanh(self.l3(x)) 113 | return x 114 | 115 | 116 | class Critic(nn.Module): 117 | def __init__(self, state_dim, action_dim): 118 | super(Critic, self).__init__() 119 | 120 | self.l1 = nn.Linear(state_dim + action_dim, 400) 121 | self.l2 = nn.Linear(400 , 300) 122 | self.l3 = nn.Linear(300, 1) 123 | 124 | def forward(self, x, u): 125 | x = F.relu(self.l1(torch.cat([x, u], 1))) 126 | x = F.relu(self.l2(x)) 127 | x = self.l3(x) 128 | return x 129 | 130 | 131 | class DDPG(object): 132 | def __init__(self, state_dim, action_dim, max_action): 133 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 134 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 135 | self.actor_target.load_state_dict(self.actor.state_dict()) 136 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4) 137 | 138 | self.critic = Critic(state_dim, action_dim).to(device) 139 | self.critic_target = Critic(state_dim, action_dim).to(device) 140 | self.critic_target.load_state_dict(self.critic.state_dict()) 141 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3) 142 | self.replay_buffer = Replay_buffer() 143 | self.writer = SummaryWriter(directory) 144 | 145 | self.num_critic_update_iteration = 0 146 | self.num_actor_update_iteration = 0 147 | self.num_training = 0 148 | 149 | def select_action(self, state): 150 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 151 | return self.actor(state).cpu().data.numpy().flatten() 152 | 153 | def update(self): 154 | 155 | for it in range(args.update_iteration): 156 | # Sample replay buffer 157 | x, y, u, r, d = self.replay_buffer.sample(args.batch_size) 158 | state = torch.FloatTensor(x).to(device) 159 | action = torch.FloatTensor(u).to(device) 160 | next_state = torch.FloatTensor(y).to(device) 161 | done = torch.FloatTensor(1-d).to(device) 162 | reward = torch.FloatTensor(r).to(device) 163 | 164 | # Compute the target Q value 165 | 
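# Bellman backup using the slow-moving target networks (standard DDPG):
#   y = r + gamma * (1 - done) * Q_target(s', mu_target(s'))
# `done` was already flipped to (1 - d) when the batch was built above, so terminal
# transitions contribute only their immediate reward.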
target_Q = self.critic_target(next_state, self.actor_target(next_state)) 166 | target_Q = reward + (done * args.gamma * target_Q).detach() 167 | 168 | # Get current Q estimate 169 | current_Q = self.critic(state, action) 170 | 171 | # Compute critic loss 172 | critic_loss = F.mse_loss(current_Q, target_Q) 173 | self.writer.add_scalar('Loss/critic_loss', critic_loss, global_step=self.num_critic_update_iteration) 174 | # Optimize the critic 175 | self.critic_optimizer.zero_grad() 176 | critic_loss.backward() 177 | self.critic_optimizer.step() 178 | 179 | # Compute actor loss 180 | actor_loss = -self.critic(state, self.actor(state)).mean() 181 | self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration) 182 | 183 | # Optimize the actor 184 | self.actor_optimizer.zero_grad() 185 | actor_loss.backward() 186 | self.actor_optimizer.step() 187 | 188 | # Update the frozen target models 189 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 190 | target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) 191 | 192 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 193 | target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) 194 | 195 | self.num_actor_update_iteration += 1 196 | self.num_critic_update_iteration += 1 197 | 198 | def save(self): 199 | torch.save(self.actor.state_dict(), directory + 'actor.pth') 200 | torch.save(self.critic.state_dict(), directory + 'critic.pth') 201 | # print("====================================") 202 | # print("Model has been saved...") 203 | # print("====================================") 204 | 205 | def load(self): 206 | self.actor.load_state_dict(torch.load(directory + 'actor.pth')) 207 | self.critic.load_state_dict(torch.load(directory + 'critic.pth')) 208 | print("====================================") 209 | print("model has been loaded...") 210 | print("====================================") 211 | 212 | def main(): 213 | agent = DDPG(state_dim, action_dim, max_action) 214 | ep_r = 0 215 | if args.mode == 'test': 216 | agent.load() 217 | for i in range(args.test_iteration): 218 | state = env.reset() 219 | for t in count(): 220 | action = agent.select_action(state) 221 | next_state, reward, done, info = env.step(np.float32(action)) 222 | ep_r += reward 223 | env.render() 224 | if done or t >= args.max_length_of_trajectory: 225 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 226 | ep_r = 0 227 | break 228 | state = next_state 229 | 230 | elif args.mode == 'train': 231 | if args.load: agent.load() 232 | total_step = 0 233 | for i in range(args.max_episode): 234 | total_reward = 0 235 | step =0 236 | state = env.reset() 237 | for t in count(): 238 | action = agent.select_action(state) 239 | action = (action + np.random.normal(0, args.exploration_noise, size=env.action_space.shape[0])).clip( 240 | env.action_space.low, env.action_space.high) 241 | 242 | next_state, reward, done, info = env.step(action) 243 | if args.render and i >= args.render_interval : env.render() 244 | agent.replay_buffer.push((state, next_state, action, reward, np.float(done))) 245 | 246 | state = next_state 247 | if done: 248 | break 249 | step += 1 250 | total_reward += reward 251 | total_step += step+1 252 | print("Total T:{} Episode: \t{} Total Reward: \t{:0.2f}".format(total_step, i, total_reward)) 253 | agent.update() 254 | # "Total T: %d Episode Num: %d Episode T: %d Reward: %f 255 | 
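# Checkpoint the actor/critic weights every `log_interval` episodes; agent.update()
# above has already run `update_iteration` gradient steps on the replay buffer
# gathered so far. Note that np.float, used when pushing `done` into the buffer,
# is deprecated since NumPy 1.20 and removed in 1.24; plain float(done) is the
# drop-in replacement.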
256 | if i % args.log_interval == 0: 257 | agent.save() 258 | else: 259 | raise NameError("mode wrong!!!") 260 | 261 | if __name__ == '__main__': 262 | main() 263 | -------------------------------------------------------------------------------- /Char05 DDPG/DDPG_exp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char05 DDPG/DDPG_exp.jpg -------------------------------------------------------------------------------- /Char05 DDPG/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | - Original paper: https://arxiv.org/abs/1509.02971 3 | - OPENAI Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 4 | 5 | **Note that DDPG is feasible about hyper-parameters. You should fine-tuning if you change to another environment.** 6 | 7 | Episode reward in Pendulum-v0: 8 | 9 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char05%20DDPG/DDPG_exp.jpg) 10 | 11 | -------------------------------------------------------------------------------- /Char07 PPO/PPO2.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import pickle 4 | from collections import namedtuple 5 | 6 | import os 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | 18 | # Parameters 19 | parser = argparse.ArgumentParser(description='Solve the Pendulum-v0 with PPO') 20 | parser.add_argument( 21 | '--gamma', type=float, default=0.9, metavar='G', help='discount factor (default: 0.9)') 22 | parser.add_argument('--seed', type=int, default=0, metavar='N', help='random seed (default: 0)') 23 | parser.add_argument('--render', action='store_true', default=True, help='render the environment') 24 | parser.add_argument( 25 | '--log-interval', 26 | type=int, 27 | default=10, 28 | metavar='N', 29 | help='interval between training status logs (default: 10)') 30 | args = parser.parse_args() 31 | 32 | env = gym.make('Pendulum-v0').unwrapped 33 | num_state = env.observation_space.shape[0] 34 | num_action = env.action_space.shape[0] 35 | torch.manual_seed(args.seed) 36 | env.seed(args.seed) 37 | 38 | Transition = namedtuple('Transition',['state', 'aciton', 'reward', 'a_log_prob', 'next_state']) 39 | TrainRecord = namedtuple('TrainRecord',['episode', 'reward']) 40 | 41 | class Actor(nn.Module): 42 | def __init__(self): 43 | super(Actor, self).__init__() 44 | self.fc1 = nn.Linear(num_state, 64) 45 | self.fc2 = nn.Linear(64,8) 46 | self.mu_head = nn.Linear(8, 1) 47 | self.sigma_head = nn.Linear(8, 1) 48 | 49 | def forward(self, x): 50 | x = F.leaky_relu(self.fc1(x)) 51 | x = F.leaky_relu(self.fc2(x)) 52 | 53 | mu = self.mu_head(x) 54 | sigma = self.sigma_head(x) 55 | 56 | return mu, sigma 57 | 58 | class Critic(nn.Module): 59 | def __init__(self): 60 | super(Critic, self).__init__() 61 | self.fc1 = nn.Linear(num_state, 64) 62 | self.fc2 = nn.Linear(64, 8) 63 | self.state_value= nn.Linear(8, 1) 64 | 65 | def forward(self, x): 66 | x = F.leaky_relu(self.fc1(x)) 67 | x = F.leaky_relu(self.fc2(x)) 68 | value = self.state_value(x) 69 | return value 70 | 71 | 
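# PPO agent for the continuous Pendulum task. The actor outputs (mu, sigma) of a
# Gaussian policy and the critic a state value; update() optimizes the clipped
# surrogate objective
#   L_clip = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ],  r_t = pi_new / pi_old
# with eps = clip_param, plus a smooth-L1 value loss, for ppo_epoch passes over the
# buffer once buffer_capacity transitions have been collected.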
class PPO(): 72 | clip_param = 0.2 73 | max_grad_norm = 0.5 74 | ppo_epoch = 10 75 | buffer_capacity = 1000 76 | batch_size = 8 77 | 78 | def __init__(self): 79 | super(PPO, self).__init__() 80 | self.actor_net = Actor().float() 81 | self.critic_net = Critic().float() 82 | self.buffer = [] 83 | self.counter = 0 84 | self.training_step = 0 85 | 86 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 87 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 4e-3) 88 | if not os.path.exists('../param'): 89 | os.makedirs('../param/net_param') 90 | os.makedirs('../param/img') 91 | 92 | def select_action(self, state): 93 | state = torch.from_numpy(state).float().unsqueeze(0) 94 | with torch.no_grad(): 95 | mu, sigma = self.actor_net(state) 96 | dist = Normal(mu, sigma) 97 | action = dist.sample() 98 | action_log_prob = dist.log_prob(action) 99 | action = action.clamp(-2, 2) 100 | return action.item(), action_log_prob.item() 101 | 102 | 103 | def get_value(self, state): 104 | state = torch.from_numpy(state) 105 | with torch.no_grad(): 106 | value = self.critic_net(state) 107 | return value.item() 108 | 109 | def save_param(self): 110 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net'+str(time.time())[:10],+'.pkl') 111 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net'+str(time.time())[:10],+'.pkl') 112 | 113 | def store_transition(self, transition): 114 | self.buffer.append(transition) 115 | self.counter+=1 116 | return counter % self.buffer_capacity == 0 117 | 118 | def update(self): 119 | self.training_step +=1 120 | 121 | state = torch.tensor([t.state for t in self.buffer ], dtype=torch.float) 122 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.float).view(-1, 1) 123 | reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 124 | next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 125 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 126 | 127 | reward = (reward - reward.mean())/(reward.std() + 1e-10) 128 | with torch.no_grad(): 129 | target_v = reward + args.gamma * self.critic_net(next_state) 130 | 131 | advantage = (target_v - self.critic_net(state)).detach() 132 | for _ in range(self.ppo_epoch): # iteration ppo_epoch 133 | for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity), self.batch_size, True)): 134 | # epoch iteration, PPO core!!! 
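# Re-evaluate the stored states under the current policy and form the probability
# ratio r_t = exp(log pi_new(a|s) - log pi_old(a|s)) for the clipped surrogate.
# The enclosing BatchSampler(...) call nests its arguments one level too deep;
# the intended construction (a sketch, matching PPO_pendulum.py in this folder) is
#   BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False)
# since SubsetRandomSampler only takes the index range, while batch_size and
# drop_last belong to BatchSampler.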
135 | mu, sigma = self.actor_net(state[index]) 136 | n = Normal(mu, sigma) 137 | action_log_prob = n.log_prob(action[index]) 138 | ratio = torch.exp(action_log_prob - old_action_log_prob) 139 | 140 | L1 = ratio * advantage[index] 141 | L2 = torch.clamp(ratio, 1-self.clip_param, 1+self.clip_param) * advantage[index] 142 | action_loss = -torch.min(L1, L2).mean() # MAX->MIN desent 143 | self.actor_optimizer.zero_grad() 144 | action_loss.backward() 145 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 146 | self.actor_optimizer.step() 147 | 148 | value_loss = F.smooth_l1_loss(self.critic_net(state[index]), target_v[index]) 149 | self.critic_net_optimizer.zero_grad() 150 | value_loss.backward() 151 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 152 | self.critic_net_optimizer.step() 153 | 154 | del self.buffer[:] 155 | 156 | def main(): 157 | 158 | agent = PPO() 159 | 160 | training_records = [] 161 | running_reward = -1000 162 | 163 | for i_epoch in range(1000): 164 | score = 0 165 | state = env.reset() 166 | if args.render: env.render() 167 | for t in range(200): 168 | action, action_log_prob = agent.select_action(state) 169 | next_state, reward, done, info = env.step(action) 170 | trans = Transition(state, action, reward, action_log_prob, next_state) 171 | if args.render: env.render() 172 | if agent.store_transition(trans): 173 | agent.update() 174 | score += reward 175 | state = next_state 176 | 177 | running_reward = running_reward * 0.9 + score * 0.1 178 | training_records.append(TrainingRecord(i_epoch, running_reward)) 179 | if i_epoch % 10 ==0: 180 | print("Epoch {}, Moving average score is: {:.2f} ".format(i_epoch, running_reward)) 181 | if running_reward > -200: 182 | print("Solved! Moving average score is now {}!".format(running_reward)) 183 | env.close() 184 | agent.save_param() 185 | break 186 | 187 | if __name__ == '__main__': 188 | main() 189 | -------------------------------------------------------------------------------- /Char07 PPO/PPO_CartPole_v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Parameters 20 | gamma = 0.99 21 | render = False 22 | seed = 1 23 | log_interval = 10 24 | 25 | env = gym.make('CartPole-v0').unwrapped 26 | num_state = env.observation_space.shape[0] 27 | num_action = env.action_space.n 28 | torch.manual_seed(seed) 29 | env.seed(seed) 30 | Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state']) 31 | 32 | class Actor(nn.Module): 33 | def __init__(self): 34 | super(Actor, self).__init__() 35 | self.fc1 = nn.Linear(num_state, 100) 36 | self.action_head = nn.Linear(100, num_action) 37 | 38 | def forward(self, x): 39 | x = F.relu(self.fc1(x)) 40 | action_prob = F.softmax(self.action_head(x), dim=1) 41 | return action_prob 42 | 43 | 44 | class Critic(nn.Module): 45 | def __init__(self): 46 | super(Critic, self).__init__() 47 | self.fc1 = nn.Linear(num_state, 100) 48 | self.state_value = nn.Linear(100, 1) 49 | 50 | def 
forward(self, x): 51 | x = F.relu(self.fc1(x)) 52 | value = self.state_value(x) 53 | return value 54 | 55 | 56 | class PPO(): 57 | clip_param = 0.2 58 | max_grad_norm = 0.5 59 | ppo_update_time = 10 60 | buffer_capacity = 1000 61 | batch_size = 32 62 | 63 | def __init__(self): 64 | super(PPO, self).__init__() 65 | self.actor_net = Actor() 66 | self.critic_net = Critic() 67 | self.buffer = [] 68 | self.counter = 0 69 | self.training_step = 0 70 | self.writer = SummaryWriter('../exp') 71 | 72 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 73 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3) 74 | if not os.path.exists('../param'): 75 | os.makedirs('../param/net_param') 76 | os.makedirs('../param/img') 77 | 78 | def select_action(self, state): 79 | state = torch.from_numpy(state).float().unsqueeze(0) 80 | with torch.no_grad(): 81 | action_prob = self.actor_net(state) 82 | c = Categorical(action_prob) 83 | action = c.sample() 84 | return action.item(), action_prob[:,action.item()].item() 85 | 86 | def get_value(self, state): 87 | state = torch.from_numpy(state) 88 | with torch.no_grad(): 89 | value = self.critic_net(state) 90 | return value.item() 91 | 92 | def save_param(self): 93 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net' + str(time.time())[:10], +'.pkl') 94 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net' + str(time.time())[:10], +'.pkl') 95 | 96 | def store_transition(self, transition): 97 | self.buffer.append(transition) 98 | self.counter += 1 99 | 100 | 101 | def update(self, i_ep): 102 | state = torch.tensor([t.state for t in self.buffer], dtype=torch.float) 103 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1) 104 | reward = [t.reward for t in self.buffer] 105 | # update: don't need next_state 106 | #reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 107 | #next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 108 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 109 | 110 | R = 0 111 | Gt = [] 112 | for r in reward[::-1]: 113 | R = r + gamma * R 114 | Gt.insert(0, R) 115 | Gt = torch.tensor(Gt, dtype=torch.float) 116 | #print("The agent is updateing....") 117 | for i in range(self.ppo_update_time): 118 | for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False): 119 | if self.training_step % 1000 ==0: 120 | print('I_ep {} ,train {} times'.format(i_ep,self.training_step)) 121 | #with torch.no_grad(): 122 | Gt_index = Gt[index].view(-1, 1) 123 | V = self.critic_net(state[index]) 124 | delta = Gt_index - V 125 | advantage = delta.detach() 126 | # epoch iteration, PPO core!!! 
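# Despite its name, a_log_prob stores the *probability* pi_old(a|s) returned by
# select_action, so the importance ratio below is computed directly as
# pi_new(a|s) / pi_old(a|s) rather than as an exp of a log-prob difference. The
# advantage uses the full-episode discounted return Gt as target with V(s) as
# baseline, and the actor and critic are updated by separate Adam optimizers
# under gradient-norm clipping.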
127 | action_prob = self.actor_net(state[index]).gather(1, action[index]) # new policy 128 | 129 | ratio = (action_prob/old_action_log_prob[index]) 130 | surr1 = ratio * advantage 131 | surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage 132 | 133 | # update actor network 134 | action_loss = -torch.min(surr1, surr2).mean() # MAX->MIN desent 135 | self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step) 136 | self.actor_optimizer.zero_grad() 137 | action_loss.backward() 138 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 139 | self.actor_optimizer.step() 140 | 141 | #update critic network 142 | value_loss = F.mse_loss(Gt_index, V) 143 | self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step) 144 | self.critic_net_optimizer.zero_grad() 145 | value_loss.backward() 146 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 147 | self.critic_net_optimizer.step() 148 | self.training_step += 1 149 | 150 | del self.buffer[:] # clear experience 151 | 152 | 153 | def main(): 154 | agent = PPO() 155 | for i_epoch in range(1000): 156 | state = env.reset() 157 | if render: env.render() 158 | 159 | for t in count(): 160 | action, action_prob = agent.select_action(state) 161 | next_state, reward, done, _ = env.step(action) 162 | trans = Transition(state, action, action_prob, reward, next_state) 163 | if render: env.render() 164 | agent.store_transition(trans) 165 | state = next_state 166 | 167 | if done : 168 | if len(agent.buffer) >= agent.batch_size:agent.update(i_epoch) 169 | agent.writer.add_scalar('liveTime/livestep', t, global_step=i_epoch) 170 | break 171 | 172 | if __name__ == '__main__': 173 | main() 174 | print("end") 175 | -------------------------------------------------------------------------------- /Char07 PPO/PPO_MountainCar-v0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os, time 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal, Categorical 16 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 17 | from tensorboardX import SummaryWriter 18 | 19 | # Parameters 20 | env_name = 'MountainCar-v0' 21 | gamma = 0.99 22 | render = False 23 | seed = 1 24 | log_interval = 10 25 | 26 | env = gym.make(env_name).unwrapped 27 | num_state = env.observation_space.shape[0] 28 | num_action = env.action_space.n 29 | torch.manual_seed(seed) 30 | env.seed(seed) 31 | Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state']) 32 | 33 | 34 | class Actor(nn.Module): 35 | def __init__(self): 36 | super(Actor, self).__init__() 37 | self.fc1 = nn.Linear(num_state, 128) 38 | self.action_head = nn.Linear(128, num_action) 39 | 40 | def forward(self, x): 41 | x = F.relu(self.fc1(x)) 42 | action_prob = F.softmax(self.action_head(x), dim=1) 43 | return action_prob 44 | 45 | 46 | class Critic(nn.Module): 47 | def __init__(self): 48 | super(Critic, self).__init__() 49 | self.fc1 = nn.Linear(num_state, 128) 50 | self.state_value = nn.Linear(128, 1) 51 | 52 | def forward(self, x): 53 | x = F.relu(self.fc1(x)) 54 | value = self.state_value(x) 55 | return value 56 | 57 | 58 | class 
PPO(): 59 | clip_param = 0.2 60 | max_grad_norm = 0.5 61 | ppo_update_time = 10 62 | buffer_capacity = 8000 63 | batch_size = 32 64 | 65 | def __init__(self): 66 | super(PPO, self).__init__() 67 | self.actor_net = Actor() 68 | self.critic_net = Critic() 69 | self.buffer = [] 70 | self.counter = 0 71 | self.training_step = 0 72 | self.writer = SummaryWriter('../exp') 73 | 74 | self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 1e-3) 75 | self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), 3e-3) 76 | if not os.path.exists('../param'): 77 | os.makedirs('../param/net_param') 78 | os.makedirs('../param/img') 79 | 80 | def select_action(self, state): 81 | state = torch.from_numpy(state).float().unsqueeze(0) 82 | with torch.no_grad(): 83 | action_prob = self.actor_net(state) 84 | c = Categorical(action_prob) 85 | action = c.sample() 86 | return action.item(), action_prob[:, action.item()].item() 87 | 88 | def get_value(self, state): 89 | state = torch.from_numpy(state) 90 | with torch.no_grad(): 91 | value = self.critic_net(state) 92 | return value.item() 93 | 94 | def save_param(self): 95 | torch.save(self.actor_net.state_dict(), '../param/net_param/actor_net' + str(time.time())[:10], +'.pkl') 96 | torch.save(self.critic_net.state_dict(), '../param/net_param/critic_net' + str(time.time())[:10], +'.pkl') 97 | 98 | def store_transition(self, transition): 99 | self.buffer.append(transition) 100 | self.counter += 1 101 | 102 | def update(self, i_ep): 103 | state = torch.tensor([t.state for t in self.buffer], dtype=torch.float) 104 | action = torch.tensor([t.action for t in self.buffer], dtype=torch.long).view(-1, 1) 105 | reward = [t.reward for t in self.buffer] 106 | # update: don't need next_state 107 | # reward = torch.tensor([t.reward for t in self.buffer], dtype=torch.float).view(-1, 1) 108 | # next_state = torch.tensor([t.next_state for t in self.buffer], dtype=torch.float) 109 | old_action_log_prob = torch.tensor([t.a_log_prob for t in self.buffer], dtype=torch.float).view(-1, 1) 110 | 111 | R = 0 112 | Gt = [] 113 | for r in reward[::-1]: 114 | R = r + gamma * R 115 | Gt.insert(0, R) 116 | Gt = torch.tensor(Gt, dtype=torch.float) 117 | # print("The agent is updateing....") 118 | for i in range(self.ppo_update_time): 119 | for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), self.batch_size, False): 120 | if self.training_step % 1000 == 0: 121 | print('I_ep {} ,train {} times'.format(i_ep, self.training_step)) 122 | # with torch.no_grad(): 123 | Gt_index = Gt[index].view(-1, 1) 124 | V = self.critic_net(state[index]) 125 | delta = Gt_index - V 126 | advantage = delta.detach() 127 | # epoch iteration, PPO core!!! 
128 | action_prob = self.actor_net(state[index]).gather(1, action[index]) # new policy 129 | 130 | ratio = (action_prob / old_action_log_prob[index]) 131 | surr1 = ratio * advantage 132 | surr2 = torch.clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage 133 | 134 | # update actor network 135 | action_loss = -torch.min(surr1, surr2).mean() # MAX->MIN desent 136 | self.writer.add_scalar('loss/action_loss', action_loss, global_step=self.training_step) 137 | self.actor_optimizer.zero_grad() 138 | action_loss.backward() 139 | nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm) 140 | self.actor_optimizer.step() 141 | 142 | # update critic network 143 | value_loss = F.mse_loss(Gt_index, V) 144 | self.writer.add_scalar('loss/value_loss', value_loss, global_step=self.training_step) 145 | self.critic_net_optimizer.zero_grad() 146 | value_loss.backward() 147 | nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm) 148 | self.critic_net_optimizer.step() 149 | self.training_step += 1 150 | 151 | del self.buffer[:] # clear experience 152 | 153 | 154 | def main(): 155 | agent = PPO() 156 | for i_epoch in range(1000): 157 | state = env.reset() 158 | if render: env.render() 159 | 160 | for t in count(): 161 | action, action_prob = agent.select_action(state) 162 | next_state, reward, done, _ = env.step(action) 163 | trans = Transition(state, action, action_prob, reward, next_state) 164 | if render: env.render() 165 | agent.store_transition(trans) 166 | state = next_state 167 | 168 | if done: 169 | if len(agent.buffer) >= agent.batch_size: agent.update(i_epoch) 170 | agent.writer.add_scalar('Steptime/steptime', t, global_step=i_epoch) 171 | break 172 | 173 | 174 | if __name__ == '__main__': 175 | main() 176 | print("end") 177 | -------------------------------------------------------------------------------- /Char07 PPO/PPO_pendulum.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | 5 | import matplotlib.pyplot as plt 6 | 7 | import gym 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.distributions import Normal 13 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 14 | 15 | parser = argparse.ArgumentParser(description='Solve the Pendulum-v0 with PPO') 16 | parser.add_argument( 17 | '--gamma', type=float, default=0.9, metavar='G', help='discount factor (default: 0.9)') 18 | parser.add_argument('--seed', type=int, default=0, metavar='N', help='random seed (default: 0)') 19 | parser.add_argument('--render', action='store_true', help='render the environment') 20 | parser.add_argument( 21 | '--log-interval', 22 | type=int, 23 | default=10, 24 | metavar='N', 25 | help='interval between training status logs (default: 10)') 26 | args = parser.parse_args() 27 | 28 | torch.manual_seed(args.seed) 29 | 30 | TrainingRecord = namedtuple('TrainingRecord', ['ep', 'reward']) 31 | Transition = namedtuple('Transition', ['s', 'a', 'a_log_p', 'r', 's_']) 32 | 33 | 34 | class ActorNet(nn.Module): 35 | 36 | def __init__(self): 37 | super(ActorNet, self).__init__() 38 | self.fc = nn.Linear(3, 100) 39 | self.mu_head = nn.Linear(100, 1) 40 | self.sigma_head = nn.Linear(100, 1) 41 | 42 | def forward(self, x): 43 | x = F.relu(self.fc(x)) 44 | mu = 2.0 * F.tanh(self.mu_head(x)) 45 | sigma = F.softplus(self.sigma_head(x)) 46 | return (mu, sigma) 47 | 48 | 49 | class 
CriticNet(nn.Module): 50 | 51 | def __init__(self): 52 | super(CriticNet, self).__init__() 53 | self.fc = nn.Linear(3, 100) 54 | self.v_head = nn.Linear(100, 1) 55 | 56 | def forward(self, x): 57 | x = F.relu(self.fc(x)) 58 | state_value = self.v_head(x) 59 | return state_value 60 | 61 | 62 | class Agent(): 63 | 64 | clip_param = 0.2 65 | max_grad_norm = 0.5 66 | ppo_epoch = 10 67 | buffer_capacity, batch_size = 1000, 32 68 | 69 | def __init__(self): 70 | self.training_step = 0 71 | self.anet = ActorNet().float() 72 | self.cnet = CriticNet().float() 73 | self.buffer = [] 74 | self.counter = 0 75 | 76 | self.optimizer_a = optim.Adam(self.anet.parameters(), lr=1e-4) 77 | self.optimizer_c = optim.Adam(self.cnet.parameters(), lr=3e-4) 78 | 79 | def select_action(self, state): 80 | state = torch.from_numpy(state).float().unsqueeze(0) 81 | with torch.no_grad(): 82 | (mu, sigma) = self.anet(state) 83 | dist = Normal(mu, sigma) 84 | action = dist.sample() 85 | action_log_prob = dist.log_prob(action) 86 | action = action.clamp(-2.0, 2.0) 87 | return action.item(), action_log_prob.item() 88 | 89 | def get_value(self, state): 90 | 91 | state = torch.from_numpy(state).float().unsqueeze(0) 92 | with torch.no_grad(): 93 | state_value = self.cnet(state) 94 | return state_value.item() 95 | 96 | def save_param(self): 97 | torch.save(self.anet.state_dict(), 'param/ppo_anet_params.pkl') 98 | torch.save(self.cnet.state_dict(), 'param/ppo_cnet_params.pkl') 99 | 100 | def store(self, transition): 101 | self.buffer.append(transition) 102 | self.counter += 1 103 | return self.counter % self.buffer_capacity == 0 104 | 105 | def update(self): 106 | self.training_step += 1 107 | 108 | s = torch.tensor([t.s for t in self.buffer], dtype=torch.float) 109 | a = torch.tensor([t.a for t in self.buffer], dtype=torch.float).view(-1, 1) 110 | r = torch.tensor([t.r for t in self.buffer], dtype=torch.float).view(-1, 1) 111 | s_ = torch.tensor([t.s_ for t in self.buffer], dtype=torch.float) 112 | 113 | old_action_log_probs = torch.tensor( 114 | [t.a_log_p for t in self.buffer], dtype=torch.float).view(-1, 1) 115 | 116 | r = (r - r.mean()) / (r.std() + 1e-5) 117 | with torch.no_grad(): 118 | target_v = r + args.gamma * self.cnet(s_) 119 | 120 | adv = (target_v - self.cnet(s)).detach() 121 | 122 | for _ in range(self.ppo_epoch): 123 | for index in BatchSampler( 124 | SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False): 125 | 126 | (mu, sigma) = self.anet(s[index]) 127 | dist = Normal(mu, sigma) 128 | action_log_probs = dist.log_prob(a[index]) 129 | ratio = torch.exp(action_log_probs - old_action_log_probs[index]) 130 | 131 | surr1 = ratio * adv[index] 132 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 133 | 1.0 + self.clip_param) * adv[index] 134 | action_loss = -torch.min(surr1, surr2).mean() 135 | 136 | self.optimizer_a.zero_grad() 137 | action_loss.backward() 138 | nn.utils.clip_grad_norm_(self.anet.parameters(), self.max_grad_norm) 139 | self.optimizer_a.step() 140 | 141 | value_loss = F.smooth_l1_loss(self.cnet(s[index]), target_v[index]) 142 | self.optimizer_c.zero_grad() 143 | value_loss.backward() 144 | nn.utils.clip_grad_norm_(self.cnet.parameters(), self.max_grad_norm) 145 | self.optimizer_c.step() 146 | 147 | del self.buffer[:] 148 | 149 | 150 | def main(): 151 | env = gym.make('Pendulum-v0') 152 | env.seed(args.seed) 153 | 154 | agent = Agent() 155 | 156 | training_records = [] 157 | running_reward = -1000 158 | state = env.reset() 159 | for i_ep in range(1000): 160 | score = 0 161 | state = 
env.reset() 162 | 163 | for t in range(200): 164 | action, action_log_prob = agent.select_action(state) 165 | state_, reward, done, _ = env.step([action]) 166 | if args.render: 167 | env.render() 168 | if agent.store(Transition(state, action, action_log_prob, (reward + 8) / 8, state_)): 169 | agent.update() 170 | score += reward 171 | state = state_ 172 | 173 | running_reward = running_reward * 0.9 + score * 0.1 174 | training_records.append(TrainingRecord(i_ep, running_reward)) 175 | 176 | if i_ep % args.log_interval == 0: 177 | print('Ep {}\tMoving average score: {:.2f}\t'.format(i_ep, running_reward)) 178 | if running_reward > -200: 179 | print("Solved! Moving average score is now {}!".format(running_reward)) 180 | env.close() 181 | agent.save_param() 182 | with open('log/ppo_training_records.pkl', 'wb') as f: 183 | pickle.dump(training_records, f) 184 | break 185 | 186 | plt.plot([r.ep for r in training_records], [r.reward for r in training_records]) 187 | plt.title('PPO') 188 | plt.xlabel('Episode') 189 | plt.ylabel('Moving averaged episode reward') 190 | plt.savefig("img/ppo.png") 191 | plt.show() 192 | 193 | 194 | if __name__ == '__main__': 195 | main() 196 | -------------------------------------------------------------------------------- /Char07 PPO/readme.md: -------------------------------------------------------------------------------- 1 | # PPO 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Openai Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | 6 | Notice: This is not the author's implementation! 7 | -------------------------------------------------------------------------------- /Char08 ACER/readme.md: -------------------------------------------------------------------------------- 1 | #ACER 2 | 3 | actor-critic with experience replay 4 | -------------------------------------------------------------------------------- /Char09 SAC/SAC.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | from collections import namedtuple 4 | from itertools import count 5 | 6 | import os 7 | import numpy as np 8 | 9 | 10 | import gym 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | from torch.distributions import Normal 16 | from torch.autograd import grad 17 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 18 | from tensorboardX import SummaryWriter 19 | 20 | 21 | ''' 22 | Implementation of soft actor critic 23 | Original paper: https://arxiv.org/abs/1801.01290 24 | Not the author's implementation ! 
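This version follows the original 2018 formulation: a separate state-value
network V(s) with a slowly-updated target copy, a single Q network (the paper
uses two), and a Gaussian policy squashed through tanh. The entropy temperature
is fixed at 1 (no automatic tuning).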
25 | ''' 26 | 27 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 28 | parser = argparse.ArgumentParser() 29 | 30 | 31 | parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name 32 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 33 | parser.add_argument('--target_update_interval', default=1, type=int) 34 | parser.add_argument('--gradient_steps', default=1, type=int) 35 | 36 | 37 | parser.add_argument('--learning_rate', default=3e-4, type=int) 38 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 39 | parser.add_argument('--capacity', default=10000, type=int) # replay buffer size 40 | parser.add_argument('--iteration', default=100000, type=int) # num of games 41 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 42 | parser.add_argument('--seed', default=1, type=int) 43 | 44 | # optional parameters 45 | parser.add_argument('--num_hidden_layers', default=2, type=int) 46 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 47 | parser.add_argument('--sample_frequency', default=256, type=int) 48 | parser.add_argument('--activation', default='Relu', type=str) 49 | parser.add_argument('--render', default=False, type=bool) # show UI or not 50 | parser.add_argument('--log_interval', default=2000, type=int) # 51 | parser.add_argument('--load', default=False, type=bool) # load model 52 | args = parser.parse_args() 53 | 54 | class NormalizedActions(gym.ActionWrapper): 55 | def _action(self, action): 56 | low = self.action_space.low 57 | high = self.action_space.high 58 | 59 | action = low + (action + 1.0) * 0.5 * (high - low) 60 | action = np.clip(action, low, high) 61 | 62 | return action 63 | 64 | def _reverse_action(self, action): 65 | low = self.action_space.low 66 | high = self.action_space.high 67 | 68 | action = 2 * (action - low) / (high - low) - 1 69 | action = np.clip(action, low, high) 70 | 71 | return action 72 | 73 | 74 | env = NormalizedActions(gym.make(args.env_name)) 75 | 76 | # Set seeds 77 | env.seed(args.seed) 78 | torch.manual_seed(args.seed) 79 | np.random.seed(args.seed) 80 | 81 | state_dim = env.observation_space.shape[0] 82 | action_dim = env.action_space.shape[0] 83 | max_action = float(env.action_space.high[0]) 84 | min_Val = torch.tensor(1e-7).float() 85 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_', 'd']) 86 | 87 | class Actor(nn.Module): 88 | def __init__(self, state_dim, min_log_std=-20, max_log_std=2): 89 | super(Actor, self).__init__() 90 | self.fc1 = nn.Linear(state_dim, 256) 91 | self.fc2 = nn.Linear(256, 256) 92 | self.mu_head = nn.Linear(256, 1) 93 | self.log_std_head = nn.Linear(256, 1) 94 | self.max_action = max_action 95 | 96 | self.min_log_std = min_log_std 97 | self.max_log_std = max_log_std 98 | 99 | def forward(self, x): 100 | x = F.relu(self.fc1(x)) 101 | x = F.relu(self.fc2(x)) 102 | mu = self.mu_head(x) 103 | log_std_head = F.relu(self.log_std_head(x)) 104 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 105 | return mu, log_std_head 106 | 107 | 108 | class Critic(nn.Module): 109 | def __init__(self, state_dim): 110 | super(Critic, self).__init__() 111 | self.fc1 = nn.Linear(state_dim, 256) 112 | self.fc2 = nn.Linear(256, 256) 113 | self.fc3 = nn.Linear(256, 1) 114 | 115 | def forward(self, x): 116 | x = F.relu(self.fc1(x)) 117 | x = F.relu(self.fc2(x)) 118 | x = self.fc3(x) 119 | return x 120 | 121 | 122 | class Q(nn.Module): 123 | def __init__(self, 
state_dim, action_dim): 124 | super(Q, self).__init__() 125 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 126 | self.fc2 = nn.Linear(256, 256) 127 | self.fc3 = nn.Linear(256, 1) 128 | 129 | def forward(self, s, a): 130 | s = s.reshape(-1, state_dim) 131 | a = a.reshape(-1, action_dim) 132 | x = torch.cat((s, a), -1) # combination s and a 133 | x = F.relu(self.fc1(x)) 134 | x = F.relu(self.fc2(x)) 135 | x = self.fc3(x) 136 | return x 137 | 138 | 139 | class SAC(): 140 | def __init__(self): 141 | super(SAC, self).__init__() 142 | 143 | self.policy_net = Actor(state_dim).to(device) 144 | self.value_net = Critic(state_dim).to(device) 145 | self.Q_net = Q(state_dim, action_dim).to(device) 146 | self.Target_value_net = Critic(state_dim).to(device) 147 | 148 | self.replay_buffer = [Transition] * args.capacity 149 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 150 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 151 | self.Q_optimizer = optim.Adam(self.Q_net.parameters(), lr=args.learning_rate) 152 | self.num_transition = 0 # pointer of replay buffer 153 | self.num_training = 1 154 | self.writer = SummaryWriter('./exp-SAC') 155 | 156 | self.value_criterion = nn.MSELoss() 157 | self.Q_criterion = nn.MSELoss() 158 | 159 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 160 | target_param.data.copy_(param.data) 161 | 162 | os.makedirs('./SAC_model/', exist_ok=True) 163 | 164 | def select_action(self, state): 165 | state = torch.FloatTensor(state).to(device) 166 | mu, log_sigma = self.policy_net(state) 167 | sigma = torch.exp(log_sigma) 168 | dist = Normal(mu, sigma) 169 | z = dist.sample() 170 | action = torch.tanh(z).detach().cpu().numpy() 171 | return action.item() # return a scalar, float32 172 | 173 | def store(self, s, a, r, s_, d): 174 | index = self.num_transition % args.capacity 175 | transition = Transition(s, a, r, s_, d) 176 | self.replay_buffer[index] = transition 177 | self.num_transition += 1 178 | 179 | def get_action_log_prob(self, state): 180 | 181 | batch_mu, batch_log_sigma = self.policy_net(state) 182 | batch_sigma = torch.exp(batch_log_sigma) 183 | dist = Normal(batch_mu, batch_sigma) 184 | z = dist.sample() 185 | action = torch.tanh(z) 186 | log_prob = dist.log_prob(z) - torch.log(1 - action.pow(2) + min_Val) 187 | return action, log_prob, z, batch_mu, batch_log_sigma 188 | 189 | 190 | def update(self): 191 | if self.num_training % 500 == 0: 192 | print("Training ... 
{} ".format(self.num_training)) 193 | s = torch.tensor([t.s for t in self.replay_buffer]).float().to(device) 194 | a = torch.tensor([t.a for t in self.replay_buffer]).to(device) 195 | r = torch.tensor([t.r for t in self.replay_buffer]).to(device) 196 | s_ = torch.tensor([t.s_ for t in self.replay_buffer]).float().to(device) 197 | d = torch.tensor([t.d for t in self.replay_buffer]).float().to(device) 198 | 199 | for _ in range(args.gradient_steps): 200 | #for index in BatchSampler(SubsetRandomSampler(range(args.capacity)), args.batch_size, False): 201 | index = np.random.choice(range(args.capacity), args.batch_size, replace=False) 202 | bn_s = s[index] 203 | bn_a = a[index].reshape(-1, 1) 204 | bn_r = r[index].reshape(-1, 1) 205 | bn_s_ = s_[index] 206 | bn_d = d[index].reshape(-1, 1) 207 | 208 | 209 | target_value = self.Target_value_net(bn_s_) 210 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 211 | 212 | excepted_value = self.value_net(bn_s) 213 | excepted_Q = self.Q_net(bn_s, bn_a) 214 | 215 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.get_action_log_prob(bn_s) 216 | excepted_new_Q = self.Q_net(bn_s, sample_action) 217 | next_value = excepted_new_Q - log_prob 218 | 219 | # !!!Note that the actions are sampled according to the current policy, 220 | # instead of replay buffer. (From original paper) 221 | 222 | V_loss = self.value_criterion(excepted_value, next_value.detach()) # J_V 223 | V_loss = V_loss.mean() 224 | 225 | # Single Q_net this is different from original paper!!! 226 | Q_loss = self.Q_criterion(excepted_Q, next_q_value.detach()) # J_Q 227 | Q_loss = Q_loss.mean() 228 | 229 | log_policy_target = excepted_new_Q - excepted_value 230 | 231 | pi_loss = log_prob * (log_prob- log_policy_target).detach() 232 | pi_loss = pi_loss.mean() 233 | 234 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 235 | self.writer.add_scalar('Loss/Q_loss', Q_loss, global_step=self.num_training) 236 | self.writer.add_scalar('Loss/pi_loss', pi_loss, global_step=self.num_training) 237 | # mini batch gradient descent 238 | self.value_optimizer.zero_grad() 239 | V_loss.backward(retain_graph=True) 240 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 241 | self.value_optimizer.step() 242 | 243 | self.Q_optimizer.zero_grad() 244 | Q_loss.backward(retain_graph = True) 245 | nn.utils.clip_grad_norm_(self.Q_net.parameters(), 0.5) 246 | self.Q_optimizer.step() 247 | 248 | self.policy_optimizer.zero_grad() 249 | pi_loss.backward(retain_graph = True) 250 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 251 | self.policy_optimizer.step() 252 | 253 | # soft update 254 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 255 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 256 | 257 | self.num_training += 1 258 | 259 | def save(self): 260 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 261 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 262 | torch.save(self.Q_net.state_dict(), './SAC_model/Q_net.pth') 263 | print("====================================") 264 | print("Model has been saved...") 265 | print("====================================") 266 | 267 | def load(self): 268 | torch.load(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 269 | torch.load(self.value_net.state_dict(), './SAC_model/value_net.pth') 270 | torch.load(self.Q_net.state_dict(), './SAC_model/Q_net.pth') 271 | print() 272 | 273 | def 
main(): 274 | 275 | agent = SAC() 276 | if args.load: agent.load() 277 | if args.render: env.render() 278 | print("====================================") 279 | print("Collection Experience...") 280 | print("====================================") 281 | 282 | ep_r = 0 283 | for i in range(args.iteration): 284 | state = env.reset() 285 | for t in range(200): 286 | action = agent.select_action(state) 287 | next_state, reward, done, info = env.step(np.float32(action)) 288 | ep_r += reward 289 | if args.render: env.render() 290 | agent.store(state, action, reward, next_state, done) 291 | 292 | if agent.num_transition >= args.capacity: 293 | agent.update() 294 | 295 | state = next_state 296 | if done or t == 199: 297 | if i % 10 == 0: 298 | print("Ep_i {}, the ep_r is {}, the t is {}".format(i, ep_r, t)) 299 | break 300 | if i % args.log_interval == 0: 301 | agent.save() 302 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 303 | ep_r = 0 304 | 305 | 306 | if __name__ == '__main__': 307 | main() -------------------------------------------------------------------------------- /Char09 SAC/SAC_BipedalWalker-v2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | import pickle 5 | 6 | import os, random 7 | import numpy as np 8 | 9 | import gym 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions import Normal 15 | from tensorboardX import SummaryWriter 16 | 17 | 18 | ''' 19 | Implementation of soft actor critic, dual Q network version 20 | Original paper: https://arxiv.org/abs/1801.01290 21 | Not the author's implementation ! 22 | ''' 23 | 24 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | parser = argparse.ArgumentParser() 26 | 27 | parser.add_argument("--env_name", default="BipedalWalker-v2") # OpenAI gym environment name 28 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 29 | parser.add_argument('--target_update_interval', default=1, type=int) 30 | parser.add_argument('--gradient_steps', default=1, type=int) 31 | 32 | parser.add_argument('--learning_rate', default=3e-4, type=float) 33 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 34 | parser.add_argument('--capacity', default=1000000, type=int) # replay buffer size 35 | parser.add_argument('--iteration', default=100000, type=int) # num of games 36 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 37 | parser.add_argument('--seed', default=1, type=int) 38 | 39 | # optional parameters 40 | parser.add_argument('--num_hidden_layers', default=2, type=int) 41 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 42 | parser.add_argument('--sample_frequency', default=256, type=int) 43 | parser.add_argument('--activation', default='Relu', type=str) 44 | parser.add_argument('--render', default=False, type=bool) # show UI or not 45 | parser.add_argument('--log_interval', default=50, type=int) # 46 | parser.add_argument('--load', default=False, type=bool) # load model 47 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 48 | args = parser.parse_args() 49 | 50 | class NormalizedActions(gym.ActionWrapper): 51 | def _action(self, action): 52 | low = self.action_space.low 53 | high = self.action_space.high 54 | 55 | action = low + (action + 
1.0) * 0.5 * (high - low) 56 | action = np.clip(action, low, high) 57 | 58 | return action 59 | 60 | def _reverse_action(self, action): 61 | low = self.action_space.low 62 | high = self.action_space.high 63 | 64 | action = 2 * (action - low) / (high - low) - 1 65 | action = np.clip(action, low, high) 66 | 67 | return action 68 | 69 | 70 | env = NormalizedActions(gym.make(args.env_name)) 71 | 72 | # Set seeds 73 | env.seed(args.seed) 74 | torch.manual_seed(args.seed) 75 | np.random.seed(args.seed) 76 | 77 | state_dim = env.observation_space.shape[0] 78 | action_dim = env.action_space.shape[0] 79 | max_action = float(env.action_space.high[0]) 80 | min_Val = torch.tensor(1e-7).float().to(device) 81 | 82 | class Replay_buffer(): 83 | def __init__(self, capacity): 84 | self.capacity = capacity 85 | self.state_pool = torch.zeros(self.capacity, state_dim).float().to(device) 86 | self.action_pool = torch.zeros(self.capacity, action_dim).float().to(device) 87 | self.reward_pool = torch.zeros(self.capacity, 1).float().to(device) 88 | self.next_state_pool = torch.zeros(self.capacity, state_dim).float().to(device) 89 | self.done_pool = torch.zeros(self.capacity, 1).float().to(device) 90 | self.num_transition = 0 91 | 92 | def push(self, s, a, r, s_, d): 93 | index = self.num_transition % self.capacity 94 | s = torch.tensor(s).float().to(device) 95 | a = torch.tensor(a).float().to(device) 96 | r = torch.tensor(r).float().to(device) 97 | s_ = torch.tensor(s_).float().to(device) 98 | d = torch.tensor(d).float().to(device) 99 | for pool, ele in zip([self.state_pool, self.action_pool, self.reward_pool, self.next_state_pool, self.done_pool], 100 | [s, a, r, s_, d]): 101 | pool[index] = ele 102 | self.num_transition += 1 103 | 104 | def sample(self, batch_size): 105 | index = np.random.choice(range(self.capacity), batch_size, replace=False) 106 | bn_s, bn_a, bn_r, bn_s_, bn_d = self.state_pool[index], self.action_pool[index], self.reward_pool[index],\ 107 | self.next_state_pool[index], self.done_pool[index] 108 | 109 | return bn_s, bn_a, bn_r, bn_s_, bn_d 110 | 111 | class Actor(nn.Module): 112 | def __init__(self, state_dim, action_dim=action_dim, min_log_std=-10, max_log_std=2): 113 | super(Actor, self).__init__() 114 | self.fc1 = nn.Linear(state_dim, 256) 115 | self.fc2 = nn.Linear(256, 512) 116 | self.mu_head = nn.Linear(512, action_dim) 117 | self.log_std_head = nn.Linear(512, action_dim) 118 | self.max_action = max_action 119 | 120 | self.min_log_std = min_log_std 121 | self.max_log_std = max_log_std 122 | 123 | def forward(self, x): 124 | x = F.relu(self.fc1(x)) 125 | x = F.relu(self.fc2(x)) 126 | mu = self.mu_head(x) 127 | log_std_head = F.relu(self.log_std_head(x)) 128 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 129 | return mu, log_std_head 130 | 131 | 132 | class Critic(nn.Module): 133 | def __init__(self, state_dim): 134 | super(Critic, self).__init__() 135 | self.fc1 = nn.Linear(state_dim, 256) 136 | self.fc2 = nn.Linear(256, 256) 137 | self.fc3 = nn.Linear(256, 1) 138 | 139 | def forward(self, x): 140 | x = F.relu(self.fc1(x)) 141 | x = F.relu(self.fc2(x)) 142 | x = self.fc3(x) 143 | return x 144 | 145 | 146 | class Q(nn.Module): 147 | def __init__(self, state_dim, action_dim): 148 | super(Q, self).__init__() 149 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 150 | self.fc2 = nn.Linear(256, 256) 151 | self.fc3 = nn.Linear(256, 1) 152 | 153 | def forward(self, s, a): 154 | s = s.reshape(-1, state_dim) 155 | a = a.reshape(-1, action_dim) 156 | x = 
torch.cat((s, a), -1) # combination s and a 157 | x = F.relu(self.fc1(x)) 158 | x = F.relu(self.fc2(x)) 159 | x = self.fc3(x) 160 | return x 161 | 162 | 163 | class SAC(): 164 | def __init__(self): 165 | super(SAC, self).__init__() 166 | 167 | self.policy_net = Actor(state_dim).to(device) 168 | self.value_net = Critic(state_dim).to(device) 169 | self.Target_value_net = Critic(state_dim).to(device) 170 | self.Q_net1 = Q(state_dim, action_dim).to(device) 171 | self.Q_net2 = Q(state_dim, action_dim).to(device) 172 | 173 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 174 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 175 | self.Q1_optimizer = optim.Adam(self.Q_net1.parameters(), lr=args.learning_rate) 176 | self.Q2_optimizer = optim.Adam(self.Q_net2.parameters(), lr=args.learning_rate) 177 | 178 | self.replay_buffer = Replay_buffer(args.capacity) 179 | self.num_transition = 0 # pointer of replay buffer 180 | self.num_training = 0 181 | self.writer = SummaryWriter('./exp-SAC_dual_Q_network') 182 | 183 | self.value_criterion = nn.MSELoss() 184 | self.Q1_criterion = nn.MSELoss() 185 | self.Q2_criterion = nn.MSELoss() 186 | 187 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 188 | target_param.data.copy_(param.data) 189 | 190 | os.makedirs('./SAC_model/', exist_ok=True) 191 | 192 | def select_action(self, state): 193 | state = torch.FloatTensor(state).to(device) 194 | mu, log_sigma = self.policy_net(state) 195 | sigma = torch.exp(log_sigma) 196 | dist = Normal(mu, sigma) 197 | z = dist.sample() 198 | action = torch.tanh(z).detach().cpu().numpy() 199 | return action # return a scalar, float32 200 | 201 | def evaluate(self, state): 202 | batch_mu, batch_log_sigma = self.policy_net(state) 203 | batch_sigma = torch.exp(batch_log_sigma) 204 | dist = Normal(batch_mu, batch_sigma) 205 | noise = Normal(0, 1) 206 | 207 | z = noise.sample() 208 | action = torch.tanh(batch_mu + batch_sigma*z.to(device)) 209 | log_prob = dist.log_prob(batch_mu + batch_sigma * z.to(device)) - torch.log(1 - action.pow(2) + min_Val) 210 | return action, log_prob, z, batch_mu, batch_log_sigma 211 | 212 | def update(self): 213 | if self.num_training % 500 == 0: 214 | print("Training ... \t{} times ".format(self.num_training)) 215 | 216 | for _ in range(args.gradient_steps): 217 | bn_s, bn_a, bn_r, bn_s_, bn_d = self.replay_buffer.sample(args.batch_size) 218 | 219 | target_value = self.Target_value_net(bn_s_) 220 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 221 | 222 | excepted_value = self.value_net(bn_s) 223 | excepted_Q1 = self.Q_net1(bn_s, bn_a) 224 | excepted_Q2 = self.Q_net2(bn_s, bn_a) 225 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.evaluate(bn_s) 226 | excepted_new_Q = torch.min(self.Q_net1(bn_s, sample_action), self.Q_net2(bn_s, sample_action)) 227 | next_value = excepted_new_Q - log_prob 228 | 229 | # !!!Note that the actions are sampled according to the current policy, 230 | # instead of replay buffer. 
(From original paper) 231 | V_loss = self.value_criterion(excepted_value, next_value.detach()).mean() # J_V 232 | 233 | # Dual Q net 234 | Q1_loss = self.Q1_criterion(excepted_Q1, next_q_value.detach()).mean() # J_Q 235 | Q2_loss = self.Q2_criterion(excepted_Q2, next_q_value.detach()).mean() 236 | 237 | pi_loss = (log_prob - excepted_new_Q).mean() # according to original paper 238 | 239 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 240 | self.writer.add_scalar('Loss/Q1_loss', Q1_loss, global_step=self.num_training) 241 | self.writer.add_scalar('Loss/Q2_loss', Q2_loss, global_step=self.num_training) 242 | self.writer.add_scalar('Loss/policy_loss', pi_loss, global_step=self.num_training) 243 | 244 | # mini batch gradient descent 245 | self.value_optimizer.zero_grad() 246 | V_loss.backward(retain_graph=True) 247 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 248 | self.value_optimizer.step() 249 | 250 | self.Q1_optimizer.zero_grad() 251 | Q1_loss.backward(retain_graph = True) 252 | nn.utils.clip_grad_norm_(self.Q_net1.parameters(), 0.5) 253 | self.Q1_optimizer.step() 254 | 255 | self.Q2_optimizer.zero_grad() 256 | Q2_loss.backward(retain_graph = True) 257 | nn.utils.clip_grad_norm_(self.Q_net2.parameters(), 0.5) 258 | self.Q2_optimizer.step() 259 | 260 | self.policy_optimizer.zero_grad() 261 | pi_loss.backward(retain_graph = True) 262 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 263 | self.policy_optimizer.step() 264 | 265 | # update target v net update 266 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 267 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 268 | 269 | self.num_training += 1 270 | 271 | def save(self): 272 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 273 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 274 | torch.save(self.Q_net1.state_dict(), './SAC_model/Q_net1.pth') 275 | torch.save(self.Q_net2.state_dict(), './SAC_model/Q_net2.pth') 276 | print("====================================") 277 | print("Model has been saved...") 278 | print("====================================") 279 | 280 | def load(self): 281 | self.policy_net.load_state_dict(torch.load('./SAC_model/policy_net.pth')) 282 | self.value_net.load_state_dict(torch.load( './SAC_model/value_net.pth')) 283 | self.Q_net1.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 284 | self.Q_net2.load_state_dict(torch.load('./SAC_model/Q_net2.pth')) 285 | print("====================================") 286 | print("model has been loaded...") 287 | print("====================================") 288 | 289 | 290 | def main(): 291 | agent = SAC() 292 | if args.load: agent.load() 293 | print("====================================") 294 | print("Collection Experience...") 295 | print("====================================") 296 | 297 | ep_r = 0 298 | for i in range(args.iteration): 299 | state = env.reset() 300 | for t in range(200): 301 | action = agent.select_action(state) 302 | next_state, reward, done, info = env.step(np.float32(action)) 303 | ep_r += reward 304 | if args.render and i >= args.render_interval : env.render() 305 | agent.replay_buffer.push(state, action, reward, next_state, done) 306 | 307 | state = next_state 308 | if done: 309 | if agent.replay_buffer.num_transition >= args.capacity: 310 | agent.update() 311 | if i > 100: 312 | print("Ep_i \t{}, the ep_r is \t{}, the step is \t{}".format(i, ep_r, t)) 313 | break 314 | if i % 
args.log_interval == 0: 315 | agent.save() 316 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 317 | ep_r = 0 318 | 319 | 320 | if __name__ == '__main__': 321 | main() 322 | -------------------------------------------------------------------------------- /Char09 SAC/SAC_dual_Q_net.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | 5 | import os 6 | import numpy as np 7 | 8 | 9 | import gym 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions import Normal 15 | from tensorboardX import SummaryWriter 16 | 17 | 18 | ''' 19 | Implementation of soft actor critic, dual Q network version 20 | Original paper: https://arxiv.org/abs/1801.01290 21 | Not the author's implementation ! 22 | ''' 23 | 24 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | parser = argparse.ArgumentParser() 26 | 27 | 28 | parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name 29 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 30 | parser.add_argument('--target_update_interval', default=1, type=int) 31 | parser.add_argument('--gradient_steps', default=1, type=int) 32 | 33 | 34 | parser.add_argument('--learning_rate', default=3e-4, type=int) 35 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 36 | parser.add_argument('--capacity', default=10000, type=int) # replay buffer size 37 | parser.add_argument('--iteration', default=100000, type=int) # num of games 38 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 39 | parser.add_argument('--seed', default=1, type=int) 40 | 41 | # optional parameters 42 | parser.add_argument('--num_hidden_layers', default=2, type=int) 43 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 44 | parser.add_argument('--sample_frequency', default=256, type=int) 45 | parser.add_argument('--activation', default='Relu', type=str) 46 | parser.add_argument('--render', default=False, type=bool) # show UI or not 47 | parser.add_argument('--log_interval', default=2000, type=int) # 48 | parser.add_argument('--load', default=False, type=bool) # load model 49 | args = parser.parse_args() 50 | 51 | class NormalizedActions(gym.ActionWrapper): 52 | def _action(self, action): 53 | low = self.action_space.low 54 | high = self.action_space.high 55 | 56 | action = low + (action + 1.0) * 0.5 * (high - low) 57 | action = np.clip(action, low, high) 58 | 59 | return action 60 | 61 | def _reverse_action(self, action): 62 | low = self.action_space.low 63 | high = self.action_space.high 64 | 65 | action = 2 * (action - low) / (high - low) - 1 66 | action = np.clip(action, low, high) 67 | 68 | return action 69 | 70 | 71 | env = NormalizedActions(gym.make(args.env_name)) 72 | 73 | # Set seeds 74 | env.seed(args.seed) 75 | torch.manual_seed(args.seed) 76 | np.random.seed(args.seed) 77 | 78 | state_dim = env.observation_space.shape[0] 79 | action_dim = env.action_space.shape[0] 80 | max_action = float(env.action_space.high[0]) 81 | min_Val = torch.tensor(1e-7).float().to(device) 82 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_', 'd']) 83 | 84 | class Actor(nn.Module): 85 | def __init__(self, state_dim, min_log_std=-10, max_log_std=2): 86 | super(Actor, self).__init__() 87 | self.fc1 = nn.Linear(state_dim, 256) 88 | self.fc2 = 
nn.Linear(256, 256) 89 | self.mu_head = nn.Linear(256, 1) 90 | self.log_std_head = nn.Linear(256, 1) 91 | self.max_action = max_action 92 | 93 | self.min_log_std = min_log_std 94 | self.max_log_std = max_log_std 95 | 96 | def forward(self, x): 97 | x = F.relu(self.fc1(x)) 98 | x = F.relu(self.fc2(x)) 99 | mu = self.mu_head(x) 100 | log_std_head = F.relu(self.log_std_head(x)) 101 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 102 | return mu, log_std_head 103 | 104 | 105 | class Critic(nn.Module): 106 | def __init__(self, state_dim): 107 | super(Critic, self).__init__() 108 | self.fc1 = nn.Linear(state_dim, 256) 109 | self.fc2 = nn.Linear(256, 256) 110 | self.fc3 = nn.Linear(256, 1) 111 | 112 | def forward(self, x): 113 | x = F.relu(self.fc1(x)) 114 | x = F.relu(self.fc2(x)) 115 | x = self.fc3(x) 116 | return x 117 | 118 | 119 | class Q(nn.Module): 120 | def __init__(self, state_dim, action_dim): 121 | super(Q, self).__init__() 122 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 123 | self.fc2 = nn.Linear(256, 256) 124 | self.fc3 = nn.Linear(256, 1) 125 | 126 | def forward(self, s, a): 127 | s = s.reshape(-1, state_dim) 128 | a = a.reshape(-1, action_dim) 129 | x = torch.cat((s, a), -1) # combination s and a 130 | x = F.relu(self.fc1(x)) 131 | x = F.relu(self.fc2(x)) 132 | x = self.fc3(x) 133 | return x 134 | 135 | 136 | class SAC(): 137 | def __init__(self): 138 | super(SAC, self).__init__() 139 | 140 | self.policy_net = Actor(state_dim).to(device) 141 | self.value_net = Critic(state_dim).to(device) 142 | self.Target_value_net = Critic(state_dim).to(device) 143 | self.Q_net1 = Q(state_dim, action_dim).to(device) 144 | self.Q_net2 = Q(state_dim, action_dim).to(device) 145 | 146 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 147 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 148 | self.Q1_optimizer = optim.Adam(self.Q_net1.parameters(), lr=args.learning_rate) 149 | self.Q2_optimizer = optim.Adam(self.Q_net2.parameters(), lr=args.learning_rate) 150 | 151 | self.replay_buffer = [Transition] * args.capacity 152 | self.num_transition = 0 # pointer of replay buffer 153 | self.num_training = 1 154 | self.writer = SummaryWriter('./exp-SAC_dual_Q_network') 155 | 156 | self.value_criterion = nn.MSELoss() 157 | self.Q1_criterion = nn.MSELoss() 158 | self.Q2_criterion = nn.MSELoss() 159 | 160 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 161 | target_param.data.copy_(param.data) 162 | 163 | os.makedirs('./SAC_model/', exist_ok=True) 164 | 165 | def select_action(self, state): 166 | state = torch.FloatTensor(state).to(device) 167 | mu, log_sigma = self.policy_net(state) 168 | sigma = torch.exp(log_sigma) 169 | dist = Normal(mu, sigma) 170 | z = dist.sample() 171 | action = torch.tanh(z).detach().cpu().numpy() 172 | return action.item() # return a scalar, float32 173 | 174 | def store(self, s, a, r, s_, d): 175 | index = self.num_transition % args.capacity 176 | transition = Transition(s, a, r, s_, d) 177 | self.replay_buffer[index] = transition 178 | self.num_transition += 1 179 | 180 | def evaluate(self, state): 181 | batch_mu, batch_log_sigma = self.policy_net(state) 182 | batch_sigma = torch.exp(batch_log_sigma) 183 | dist = Normal(batch_mu, batch_sigma) 184 | noise = Normal(0, 1) 185 | 186 | z = noise.sample() 187 | action = torch.tanh(batch_mu + batch_sigma*z.to(device)) 188 | log_prob = dist.log_prob(batch_mu + batch_sigma * 
z.to(device)) - torch.log(1 - action.pow(2) + min_Val) 189 | return action, log_prob, z, batch_mu, batch_log_sigma 190 | 191 | def update(self): 192 | if self.num_training % 500 == 0: 193 | print("Training ... {} times ".format(self.num_training)) 194 | s = torch.tensor([t.s for t in self.replay_buffer]).float().to(device) 195 | a = torch.tensor([t.a for t in self.replay_buffer]).to(device) 196 | r = torch.tensor([t.r for t in self.replay_buffer]).to(device) 197 | s_ = torch.tensor([t.s_ for t in self.replay_buffer]).float().to(device) 198 | d = torch.tensor([t.d for t in self.replay_buffer]).float().to(device) 199 | 200 | for _ in range(args.gradient_steps): 201 | #for index in BatchSampler(SubsetRandomSampler(range(args.capacity)), args.batch_size, False): 202 | index = np.random.choice(range(args.capacity), args.batch_size, replace=False) 203 | bn_s = s[index] 204 | bn_a = a[index].reshape(-1, 1) 205 | bn_r = r[index].reshape(-1, 1) 206 | bn_s_ = s_[index] 207 | bn_d = d[index].reshape(-1, 1) 208 | 209 | target_value = self.Target_value_net(bn_s_) 210 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 211 | 212 | excepted_value = self.value_net(bn_s) 213 | excepted_Q1 = self.Q_net1(bn_s, bn_a) 214 | excepted_Q2 = self.Q_net2(bn_s, bn_a) 215 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.evaluate(bn_s) 216 | excepted_new_Q = torch.min(self.Q_net1(bn_s, sample_action), self.Q_net2(bn_s, sample_action)) 217 | next_value = excepted_new_Q - log_prob 218 | 219 | # !!!Note that the actions are sampled according to the current policy, 220 | # instead of replay buffer. (From original paper) 221 | V_loss = self.value_criterion(excepted_value, next_value.detach()).mean() # J_V 222 | 223 | # Dual Q net 224 | Q1_loss = self.Q1_criterion(excepted_Q1, next_q_value.detach()).mean() # J_Q 225 | Q2_loss = self.Q2_criterion(excepted_Q2, next_q_value.detach()).mean() 226 | 227 | pi_loss = (log_prob - excepted_new_Q).mean() # according to original paper 228 | 229 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 230 | self.writer.add_scalar('Loss/Q1_loss', Q1_loss, global_step=self.num_training) 231 | self.writer.add_scalar('Loss/Q2_loss', Q2_loss, global_step=self.num_training) 232 | self.writer.add_scalar('Loss/policy_loss', pi_loss, global_step=self.num_training) 233 | 234 | # mini batch gradient descent 235 | self.value_optimizer.zero_grad() 236 | V_loss.backward(retain_graph=True) 237 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 238 | self.value_optimizer.step() 239 | 240 | self.Q1_optimizer.zero_grad() 241 | Q1_loss.backward(retain_graph = True) 242 | nn.utils.clip_grad_norm_(self.Q_net1.parameters(), 0.5) 243 | self.Q1_optimizer.step() 244 | 245 | self.Q2_optimizer.zero_grad() 246 | Q2_loss.backward(retain_graph = True) 247 | nn.utils.clip_grad_norm_(self.Q_net2.parameters(), 0.5) 248 | self.Q2_optimizer.step() 249 | 250 | self.policy_optimizer.zero_grad() 251 | pi_loss.backward(retain_graph = True) 252 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 253 | self.policy_optimizer.step() 254 | 255 | # update target v net update 256 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 257 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 258 | 259 | self.num_training += 1 260 | 261 | def save(self): 262 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 263 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 
264 | torch.save(self.Q_net1.state_dict(), './SAC_model/Q_net1.pth') 265 | torch.save(self.Q_net2.state_dict(), './SAC_model/Q_net2.pth') 266 | print("====================================") 267 | print("Model has been saved...") 268 | print("====================================") 269 | 270 | def load(self): 271 | self.policy_net.load_state_dict(torch.load('./SAC_model/policy_net.pth')) 272 | self.value_net.load_state_dict(torch.load( './SAC_model/value_net.pth')) 273 | self.Q_net1.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 274 | self.Q_net2.load_state_dict(torch.load('./SAC_model/Q_net2.pth')) 275 | print("model has been load") 276 | 277 | 278 | def main(): 279 | 280 | agent = SAC() 281 | if args.load: agent.load() 282 | if args.render: env.render() 283 | print("====================================") 284 | print("Collection Experience...") 285 | print("====================================") 286 | 287 | ep_r = 0 288 | for i in range(args.iteration): 289 | state = env.reset() 290 | for t in range(200): 291 | action = agent.select_action(state) 292 | next_state, reward, done, info = env.step(np.float32(action)) 293 | ep_r += reward 294 | if args.render: env.render() 295 | agent.store(state, action, reward, next_state, done) 296 | 297 | if agent.num_transition >= args.capacity: 298 | agent.update() 299 | 300 | state = next_state 301 | if done or t == 199: 302 | if i % 10 == 0: 303 | print("Ep_i {}, the ep_r is {}, the t is {}".format(i, ep_r, t)) 304 | break 305 | if i % args.log_interval == 0: 306 | agent.save() 307 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 308 | ep_r = 0 309 | 310 | 311 | if __name__ == '__main__': 312 | main() 313 | -------------------------------------------------------------------------------- /Char09 SAC/SAC_ep_r_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char09 SAC/SAC_ep_r_curve.png -------------------------------------------------------------------------------- /Char09 SAC/test_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | import pickle 5 | import os 6 | import numpy as np 7 | 8 | 9 | import gym 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions import Normal 15 | from tensorboardX import SummaryWriter 16 | 17 | 18 | ''' 19 | Implementation of soft actor critic, dual Q network version 20 | Original paper: https://arxiv.org/abs/1801.01290 21 | Not the author's implementation ! 
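Rollout / evaluation helper for the SAC agents in this folder. A typical invocation, assuming the checkpoints written by save() already exist under ./SAC_model/ (policy_net.pth, value_net.pth, Q_net1.pth, Q_net2.pth):

    python test_agent.py --mode test --env_name BipedalWalker-v2

With --mode test the saved networks are loaded and episodes are rendered; any other --mode value falls back to the training loop defined in main() below.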
22 | ''' 23 | 24 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 25 | parser = argparse.ArgumentParser() 26 | 27 | 28 | parser.add_argument("--env_name", default="BipedalWalker-v2") # OpenAI gym environment name 29 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 30 | parser.add_argument('--target_update_interval', default=1, type=int) 31 | parser.add_argument('--gradient_steps', default=1, type=int) 32 | parser.add_argument('--mode', default='test', type=str) # test or train 33 | 34 | parser.add_argument('--learning_rate', default=3e-4, type=int) 35 | parser.add_argument('--gamma', default=0.99, type=int) # discount gamma 36 | parser.add_argument('--capacity', default=10000, type=int) # replay buffer size 37 | parser.add_argument('--iteration', default=100000, type=int) # num of games 38 | parser.add_argument('--batch_size', default=128, type=int) # mini batch size 39 | parser.add_argument('--seed', default=1, type=int) 40 | 41 | # optional parameters 42 | parser.add_argument('--num_hidden_layers', default=2, type=int) 43 | parser.add_argument('--num_hidden_units_per_layer', default=256, type=int) 44 | parser.add_argument('--sample_frequency', default=256, type=int) 45 | parser.add_argument('--activation', default='Relu', type=str) 46 | parser.add_argument('--render', default=False, type=bool) # show UI or not 47 | parser.add_argument('--log_interval', default=50, type=int) # 48 | parser.add_argument('--load', default=False, type=bool) # load model 49 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 50 | args = parser.parse_args() 51 | 52 | class NormalizedActions(gym.ActionWrapper): 53 | def _action(self, action): 54 | low = self.action_space.low 55 | high = self.action_space.high 56 | 57 | action = low + (action + 1.0) * 0.5 * (high - low) 58 | action = np.clip(action, low, high) 59 | 60 | return action 61 | 62 | def _reverse_action(self, action): 63 | low = self.action_space.low 64 | high = self.action_space.high 65 | 66 | action = 2 * (action - low) / (high - low) - 1 67 | action = np.clip(action, low, high) 68 | 69 | return action 70 | 71 | 72 | env = NormalizedActions(gym.make(args.env_name)) 73 | 74 | # Set seeds 75 | env.seed(args.seed) 76 | torch.manual_seed(args.seed) 77 | np.random.seed(args.seed) 78 | 79 | state_dim = env.observation_space.shape[0] 80 | action_dim = env.action_space.shape[0] 81 | max_action = float(env.action_space.high[0]) 82 | min_Val = torch.tensor(1e-7).float().to(device) 83 | Transition = namedtuple('Transition', ['s', 'a', 'r', 's_', 'd']) 84 | 85 | class Actor(nn.Module): 86 | def __init__(self, state_dim, action_dim=action_dim, min_log_std=-10, max_log_std=2): 87 | super(Actor, self).__init__() 88 | self.fc1 = nn.Linear(state_dim, 256) 89 | self.fc2 = nn.Linear(256, 512) 90 | self.mu_head = nn.Linear(512, action_dim) 91 | self.log_std_head = nn.Linear(512, action_dim) 92 | self.max_action = max_action 93 | 94 | self.min_log_std = min_log_std 95 | self.max_log_std = max_log_std 96 | 97 | def forward(self, x): 98 | x = F.relu(self.fc1(x)) 99 | x = F.relu(self.fc2(x)) 100 | mu = self.mu_head(x) 101 | log_std_head = F.relu(self.log_std_head(x)) 102 | log_std_head = torch.clamp(log_std_head, self.min_log_std, self.max_log_std) 103 | return mu, log_std_head 104 | 105 | 106 | class Critic(nn.Module): 107 | def __init__(self, state_dim): 108 | super(Critic, self).__init__() 109 | self.fc1 = nn.Linear(state_dim, 256) 110 | self.fc2 = 
nn.Linear(256, 256) 111 | self.fc3 = nn.Linear(256, 1) 112 | 113 | def forward(self, x): 114 | x = F.relu(self.fc1(x)) 115 | x = F.relu(self.fc2(x)) 116 | x = self.fc3(x) 117 | return x 118 | 119 | 120 | class Q(nn.Module): 121 | def __init__(self, state_dim, action_dim): 122 | super(Q, self).__init__() 123 | self.fc1 = nn.Linear(state_dim + action_dim, 256) 124 | self.fc2 = nn.Linear(256, 256) 125 | self.fc3 = nn.Linear(256, 1) 126 | 127 | def forward(self, s, a): 128 | s = s.reshape(-1, state_dim) 129 | a = a.reshape(-1, action_dim) 130 | x = torch.cat((s, a), -1) # combination s and a 131 | x = F.relu(self.fc1(x)) 132 | x = F.relu(self.fc2(x)) 133 | x = self.fc3(x) 134 | return x 135 | 136 | 137 | class SAC(): 138 | def __init__(self): 139 | super(SAC, self).__init__() 140 | 141 | self.policy_net = Actor(state_dim).to(device) 142 | self.value_net = Critic(state_dim).to(device) 143 | self.Target_value_net = Critic(state_dim).to(device) 144 | self.Q_net1 = Q(state_dim, action_dim).to(device) 145 | self.Q_net2 = Q(state_dim, action_dim).to(device) 146 | 147 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.learning_rate) 148 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.learning_rate) 149 | self.Q1_optimizer = optim.Adam(self.Q_net1.parameters(), lr=args.learning_rate) 150 | self.Q2_optimizer = optim.Adam(self.Q_net2.parameters(), lr=args.learning_rate) 151 | 152 | self.replay_buffer = [Transition] * args.capacity 153 | self.num_transition = 0 # pointer of replay buffer 154 | self.num_training = 0 155 | self.writer = SummaryWriter('./test_agent') 156 | 157 | self.value_criterion = nn.MSELoss() 158 | self.Q1_criterion = nn.MSELoss() 159 | self.Q2_criterion = nn.MSELoss() 160 | 161 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 162 | target_param.data.copy_(param.data) 163 | 164 | os.makedirs('./SAC_model/', exist_ok=True) 165 | 166 | def select_action(self, state): 167 | state = torch.FloatTensor(state).to(device) 168 | mu, log_sigma = self.policy_net(state) 169 | sigma = torch.exp(log_sigma) 170 | dist = Normal(mu, sigma) 171 | z = dist.sample() 172 | action = torch.tanh(z).detach().cpu().numpy() 173 | return action # return a scalar, float32 174 | 175 | def store(self, s, a, r, s_, d): 176 | index = self.num_transition % args.capacity 177 | transition = Transition(s, a, r, s_, d) 178 | self.replay_buffer[index] = transition 179 | self.num_transition += 1 180 | 181 | def evaluate(self, state): 182 | batch_mu, batch_log_sigma = self.policy_net(state) 183 | batch_sigma = torch.exp(batch_log_sigma) 184 | dist = Normal(batch_mu, batch_sigma) 185 | noise = Normal(0, 1) 186 | 187 | z = noise.sample() 188 | action = torch.tanh(batch_mu + batch_sigma*z.to(device)) 189 | log_prob = dist.log_prob(batch_mu + batch_sigma * z.to(device)) - torch.log(1 - action.pow(2) + min_Val) 190 | return action, log_prob, z, batch_mu, batch_log_sigma 191 | 192 | def update(self): 193 | if self.num_training % 500 == 0: 194 | print("Training ... 
\t{} times ".format(self.num_training)) 195 | s = torch.tensor([t.s for t in self.replay_buffer]).float().to(device) 196 | a = torch.tensor([t.a for t in self.replay_buffer]).to(device) 197 | r = torch.tensor([t.r for t in self.replay_buffer]).to(device) 198 | s_ = torch.tensor([t.s_ for t in self.replay_buffer]).float().to(device) 199 | d = torch.tensor([t.d for t in self.replay_buffer]).float().to(device) 200 | 201 | for _ in range(args.gradient_steps): 202 | #for index in BatchSampler(SubsetRandomSampler(range(args.capacity)), args.batch_size, False): 203 | index = np.random.choice(range(args.capacity), args.batch_size, replace=False) 204 | bn_s = s[index].reshape(-1, state_dim) 205 | bn_a = a[index].reshape(-1, action_dim) 206 | bn_r = r[index].reshape(-1, 1) 207 | bn_s_ = s_[index].reshape(-1, state_dim) 208 | bn_d = d[index].reshape(-1, 1) 209 | 210 | target_value = self.Target_value_net(bn_s_) 211 | next_q_value = bn_r + (1 - bn_d) * args.gamma * target_value 212 | 213 | excepted_value = self.value_net(bn_s) 214 | excepted_Q1 = self.Q_net1(bn_s, bn_a) 215 | excepted_Q2 = self.Q_net2(bn_s, bn_a) 216 | sample_action, log_prob, z, batch_mu, batch_log_sigma = self.evaluate(bn_s) 217 | excepted_new_Q = torch.min(self.Q_net1(bn_s, sample_action), self.Q_net2(bn_s, sample_action)) 218 | next_value = excepted_new_Q - log_prob 219 | 220 | # !!!Note that the actions are sampled according to the current policy, 221 | # instead of replay buffer. (From original paper) 222 | V_loss = self.value_criterion(excepted_value, next_value.detach()).mean() # J_V 223 | 224 | # Dual Q net 225 | Q1_loss = self.Q1_criterion(excepted_Q1, next_q_value.detach()).mean() # J_Q 226 | Q2_loss = self.Q2_criterion(excepted_Q2, next_q_value.detach()).mean() 227 | 228 | pi_loss = (log_prob - excepted_new_Q).mean() # according to original paper 229 | 230 | self.writer.add_scalar('Loss/V_loss', V_loss, global_step=self.num_training) 231 | self.writer.add_scalar('Loss/Q1_loss', Q1_loss, global_step=self.num_training) 232 | self.writer.add_scalar('Loss/Q2_loss', Q2_loss, global_step=self.num_training) 233 | self.writer.add_scalar('Loss/policy_loss', pi_loss, global_step=self.num_training) 234 | 235 | # mini batch gradient descent 236 | self.value_optimizer.zero_grad() 237 | V_loss.backward(retain_graph=True) 238 | nn.utils.clip_grad_norm_(self.value_net.parameters(), 0.5) 239 | self.value_optimizer.step() 240 | 241 | self.Q1_optimizer.zero_grad() 242 | Q1_loss.backward(retain_graph = True) 243 | nn.utils.clip_grad_norm_(self.Q_net1.parameters(), 0.5) 244 | self.Q1_optimizer.step() 245 | 246 | self.Q2_optimizer.zero_grad() 247 | Q2_loss.backward(retain_graph = True) 248 | nn.utils.clip_grad_norm_(self.Q_net2.parameters(), 0.5) 249 | self.Q2_optimizer.step() 250 | 251 | self.policy_optimizer.zero_grad() 252 | pi_loss.backward(retain_graph = True) 253 | nn.utils.clip_grad_norm_(self.policy_net.parameters(), 0.5) 254 | self.policy_optimizer.step() 255 | 256 | # update target v net update 257 | for target_param, param in zip(self.Target_value_net.parameters(), self.value_net.parameters()): 258 | target_param.data.copy_(target_param * (1 - args.tau) + param * args.tau) 259 | 260 | self.num_training += 1 261 | 262 | def save(self): 263 | torch.save(self.policy_net.state_dict(), './SAC_model/policy_net.pth') 264 | torch.save(self.value_net.state_dict(), './SAC_model/value_net.pth') 265 | torch.save(self.Q_net1.state_dict(), './SAC_model/Q_net1.pth') 266 | torch.save(self.Q_net2.state_dict(), './SAC_model/Q_net1.pth') 267 | 
print("====================================") 268 | print("Model has been saved...") 269 | print("====================================") 270 | 271 | def load(self): 272 | self.policy_net.load_state_dict(torch.load('./SAC_model/policy_net.pth')) 273 | self.value_net.load_state_dict(torch.load( './SAC_model/value_net.pth')) 274 | self.Q_net1.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 275 | self.Q_net2.load_state_dict(torch.load('./SAC_model/Q_net1.pth')) 276 | print("model has been load") 277 | 278 | 279 | def main(): 280 | agent = SAC() 281 | ep_r = 0 282 | if args.mode == 'test': 283 | agent.load() 284 | for i in range(args.iteration): 285 | state = env.reset() 286 | for t in count(): 287 | action = agent.select_action(state) 288 | next_state, reward, done, info = env.step(np.float32(action)) 289 | ep_r += reward 290 | env.render() 291 | if done: 292 | break 293 | state = next_state 294 | else: 295 | print("====================================") 296 | print("Collection Experience...") 297 | print("====================================") 298 | 299 | 300 | for i in range(args.iteration): 301 | state = env.reset() 302 | for t in range(200): 303 | action = agent.select_action(state) 304 | next_state, reward, done, info = env.step(np.float32(action)) 305 | ep_r += reward 306 | if args.render and i >= args.render_interval : env.render() 307 | agent.store(state, action, reward, next_state, done) 308 | 309 | if agent.num_transition >= args.capacity: 310 | agent.update() 311 | 312 | state = next_state 313 | if done: 314 | if i > 100: 315 | print("Ep_i \t{}, the ep_r is \t{}, the step is \t{}".format(i, ep_r, t)) 316 | break 317 | if i % args.log_interval == 0: 318 | agent.save() 319 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 320 | ep_r = 0 321 | 322 | 323 | if __name__ == '__main__': 324 | main() 325 | -------------------------------------------------------------------------------- /Char10 TD3/Episode_reward_TD3_BipedakWalker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/Episode_reward_TD3_BipedakWalker.png -------------------------------------------------------------------------------- /Char10 TD3/TD3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | 5 | import os, sys, random 6 | import numpy as np 7 | 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Normal 14 | from tensorboardX import SummaryWriter 15 | 16 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--mode', default='train', type=str) # mode = 'train' or 'test' 20 | parser.add_argument("--env_name", default="Pendulum-v0") # OpenAI gym environment name, BipedalWalker-v2 21 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 22 | parser.add_argument('--target_update_interval', default=1, type=int) 23 | parser.add_argument('--iteration', default=5, type=int) 24 | 25 | parser.add_argument('--learning_rate', default=3e-4, type=float) 26 | parser.add_argument('--gamma', default=0.99, type=int) # discounted factor 27 | parser.add_argument('--capacity', default=50000, type=int) # replay buffer 
size 28 | parser.add_argument('--num_iteration', default=100000, type=int) # num of games 29 | parser.add_argument('--batch_size', default=100, type=int) # mini batch size 30 | parser.add_argument('--seed', default=1, type=int) 31 | 32 | # optional parameters 33 | parser.add_argument('--num_hidden_layers', default=2, type=int) 34 | parser.add_argument('--sample_frequency', default=256, type=int) 35 | parser.add_argument('--activation', default='Relu', type=str) 36 | parser.add_argument('--render', default=False, type=bool) # show UI or not 37 | parser.add_argument('--log_interval', default=50, type=int) # 38 | parser.add_argument('--load', default=False, type=bool) # load model 39 | parser.add_argument('--render_interval', default=100, type=int) # after render_interval, the env.render() will work 40 | parser.add_argument('--policy_noise', default=0.2, type=float) 41 | parser.add_argument('--noise_clip', default=0.5, type=float) 42 | parser.add_argument('--policy_delay', default=2, type=int) 43 | parser.add_argument('--exploration_noise', default=0.1, type=float) 44 | parser.add_argument('--max_episode', default=2000, type=int) 45 | parser.add_argument('--print_log', default=5, type=int) 46 | args = parser.parse_args() 47 | 48 | 49 | 50 | # Set seeds 51 | # env.seed(args.seed) 52 | # torch.manual_seed(args.seed) 53 | # np.random.seed(args.seed) 54 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 55 | script_name = os.path.basename(__file__) 56 | env = gym.make(args.env_name) 57 | 58 | state_dim = env.observation_space.shape[0] 59 | action_dim = env.action_space.shape[0] 60 | max_action = float(env.action_space.high[0]) 61 | min_Val = torch.tensor(1e-7).float().to(device) # min value 62 | 63 | directory = './exp' + script_name + args.env_name +'./' 64 | ''' 65 | Implementation of TD3 with pytorch 66 | Original paper: https://arxiv.org/abs/1802.09477 67 | Not the author's implementation ! 
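Update rule implemented in TD3.update() below, written with the module's own names (an informal sketch of the code, not a line-by-line specification):

    noise       = clamp(Normal(0, args.policy_noise), -args.noise_clip, args.noise_clip)
    next_action = clamp(actor_target(next_state) + noise, -max_action, max_action)
    target_Q    = reward + (1 - done) * args.gamma * min(critic_1_target(next_state, next_action),
                                                         critic_2_target(next_state, next_action))
    critic_i loss = MSE(critic_i(state, action), target_Q)        for i = 1, 2
    actor loss    = -critic_1(state, actor(state)).mean()         every args.policy_delay critic updates

followed by Polyak averaging of all three target networks with coefficient args.tau.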
68 | ''' 69 | 70 | class Replay_buffer(): 71 | ''' 72 | Code based on: 73 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 74 | Expects tuples of (state, next_state, action, reward, done) 75 | ''' 76 | def __init__(self, max_size=args.capacity): 77 | self.storage = [] 78 | self.max_size = max_size 79 | self.ptr = 0 80 | 81 | def push(self, data): 82 | if len(self.storage) == self.max_size: 83 | self.storage[int(self.ptr)] = data 84 | self.ptr = (self.ptr + 1) % self.max_size 85 | else: 86 | self.storage.append(data) 87 | 88 | def sample(self, batch_size): 89 | ind = np.random.randint(0, len(self.storage), size=batch_size) 90 | x, y, u, r, d = [], [], [], [], [] 91 | 92 | for i in ind: 93 | X, Y, U, R, D = self.storage[i] 94 | x.append(np.array(X, copy=False)) 95 | y.append(np.array(Y, copy=False)) 96 | u.append(np.array(U, copy=False)) 97 | r.append(np.array(R, copy=False)) 98 | d.append(np.array(D, copy=False)) 99 | 100 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 101 | 102 | 103 | class Actor(nn.Module): 104 | 105 | def __init__(self, state_dim, action_dim, max_action): 106 | super(Actor, self).__init__() 107 | 108 | self.fc1 = nn.Linear(state_dim, 400) 109 | self.fc2 = nn.Linear(400, 300) 110 | self.fc3 = nn.Linear(300, action_dim) 111 | 112 | self.max_action = max_action 113 | 114 | def forward(self, state): 115 | a = F.relu(self.fc1(state)) 116 | a = F.relu(self.fc2(a)) 117 | a = torch.tanh(self.fc3(a)) * self.max_action 118 | return a 119 | 120 | 121 | class Critic(nn.Module): 122 | 123 | def __init__(self, state_dim, action_dim): 124 | super(Critic, self).__init__() 125 | 126 | self.fc1 = nn.Linear(state_dim + action_dim, 400) 127 | self.fc2 = nn.Linear(400, 300) 128 | self.fc3 = nn.Linear(300, 1) 129 | 130 | def forward(self, state, action): 131 | state_action = torch.cat([state, action], 1) 132 | 133 | q = F.relu(self.fc1(state_action)) 134 | q = F.relu(self.fc2(q)) 135 | q = self.fc3(q) 136 | return q 137 | 138 | 139 | class TD3(): 140 | def __init__(self, state_dim, action_dim, max_action): 141 | 142 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 143 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 144 | self.critic_1 = Critic(state_dim, action_dim).to(device) 145 | self.critic_1_target = Critic(state_dim, action_dim).to(device) 146 | self.critic_2 = Critic(state_dim, action_dim).to(device) 147 | self.critic_2_target = Critic(state_dim, action_dim).to(device) 148 | 149 | self.actor_optimizer = optim.Adam(self.actor.parameters()) 150 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters()) 151 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters()) 152 | 153 | self.actor_target.load_state_dict(self.actor.state_dict()) 154 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 155 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 156 | 157 | self.max_action = max_action 158 | self.memory = Replay_buffer(args.capacity) 159 | self.writer = SummaryWriter(directory) 160 | self.num_critic_update_iteration = 0 161 | self.num_actor_update_iteration = 0 162 | self.num_training = 0 163 | 164 | def select_action(self, state): 165 | state = torch.tensor(state.reshape(1, -1)).float().to(device) 166 | return self.actor(state).cpu().data.numpy().flatten() 167 | 168 | def update(self, num_iteration): 169 | 170 | if self.num_training % 500 == 0: 171 | print("====================================") 172 | 
print("model has been trained for {} times...".format(self.num_training)) 173 | print("====================================") 174 | for i in range(num_iteration): 175 | x, y, u, r, d = self.memory.sample(args.batch_size) 176 | state = torch.FloatTensor(x).to(device) 177 | action = torch.FloatTensor(u).to(device) 178 | next_state = torch.FloatTensor(y).to(device) 179 | done = torch.FloatTensor(d).to(device) 180 | reward = torch.FloatTensor(r).to(device) 181 | 182 | # Select next action according to target policy: 183 | noise = torch.ones_like(action).data.normal_(0, args.policy_noise).to(device) 184 | noise = noise.clamp(-args.noise_clip, args.noise_clip) 185 | next_action = (self.actor_target(next_state) + noise) 186 | next_action = next_action.clamp(-self.max_action, self.max_action) 187 | 188 | # Compute target Q-value: 189 | target_Q1 = self.critic_1_target(next_state, next_action) 190 | target_Q2 = self.critic_2_target(next_state, next_action) 191 | target_Q = torch.min(target_Q1, target_Q2) 192 | target_Q = reward + ((1 - done) * args.gamma * target_Q).detach() 193 | 194 | # Optimize Critic 1: 195 | current_Q1 = self.critic_1(state, action) 196 | loss_Q1 = F.mse_loss(current_Q1, target_Q) 197 | self.critic_1_optimizer.zero_grad() 198 | loss_Q1.backward() 199 | self.critic_1_optimizer.step() 200 | self.writer.add_scalar('Loss/Q1_loss', loss_Q1, global_step=self.num_critic_update_iteration) 201 | 202 | # Optimize Critic 2: 203 | current_Q2 = self.critic_2(state, action) 204 | loss_Q2 = F.mse_loss(current_Q2, target_Q) 205 | self.critic_2_optimizer.zero_grad() 206 | loss_Q2.backward() 207 | self.critic_2_optimizer.step() 208 | self.writer.add_scalar('Loss/Q2_loss', loss_Q2, global_step=self.num_critic_update_iteration) 209 | # Delayed policy updates: 210 | if i % args.policy_delay == 0: 211 | # Compute actor loss: 212 | actor_loss = - self.critic_1(state, self.actor(state)).mean() 213 | 214 | # Optimize the actor 215 | self.actor_optimizer.zero_grad() 216 | actor_loss.backward() 217 | self.actor_optimizer.step() 218 | self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration) 219 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 220 | target_param.data.copy_(((1- args.tau) * target_param.data) + args.tau * param.data) 221 | 222 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 223 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 224 | 225 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 226 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 227 | 228 | self.num_actor_update_iteration += 1 229 | self.num_critic_update_iteration += 1 230 | self.num_training += 1 231 | 232 | def save(self): 233 | torch.save(self.actor.state_dict(), directory+'actor.pth') 234 | torch.save(self.actor_target.state_dict(), directory+'actor_target.pth') 235 | torch.save(self.critic_1.state_dict(), directory+'critic_1.pth') 236 | torch.save(self.critic_1_target.state_dict(), directory+'critic_1_target.pth') 237 | torch.save(self.critic_2.state_dict(), directory+'critic_2.pth') 238 | torch.save(self.critic_2_target.state_dict(), directory+'critic_2_target.pth') 239 | print("====================================") 240 | print("Model has been saved...") 241 | print("====================================") 242 | 243 | def load(self): 244 | 
self.actor.load_state_dict(torch.load(directory + 'actor.pth')) 245 | self.actor_target.load_state_dict(torch.load(directory + 'actor_target.pth')) 246 | self.critic_1.load_state_dict(torch.load(directory + 'critic_1.pth')) 247 | self.critic_1_target.load_state_dict(torch.load(directory + 'critic_1_target.pth')) 248 | self.critic_2.load_state_dict(torch.load(directory + 'critic_2.pth')) 249 | self.critic_2_target.load_state_dict(torch.load(directory + 'critic_2_target.pth')) 250 | print("====================================") 251 | print("model has been loaded...") 252 | print("====================================") 253 | 254 | 255 | def main(): 256 | agent = TD3(state_dim, action_dim, max_action) 257 | ep_r = 0 258 | 259 | if args.mode == 'test': 260 | agent.load() 261 | for i in range(args.iteration): 262 | state = env.reset() 263 | for t in count(): 264 | action = agent.select_action(state) 265 | next_state, reward, done, info = env.step(np.float32(action)) 266 | ep_r += reward 267 | env.render() 268 | if done or t ==2000 : 269 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 270 | break 271 | state = next_state 272 | 273 | elif args.mode == 'train': 274 | print("====================================") 275 | print("Collection Experience...") 276 | print("====================================") 277 | if args.load: agent.load() 278 | for i in range(args.num_iteration): 279 | state = env.reset() 280 | for t in range(2000): 281 | 282 | action = agent.select_action(state) 283 | action = action + np.random.normal(0, args.exploration_noise, size=env.action_space.shape[0]) 284 | action = action.clip(env.action_space.low, env.action_space.high) 285 | next_state, reward, done, info = env.step(action) 286 | ep_r += reward 287 | if args.render and i >= args.render_interval : env.render() 288 | agent.memory.push((state, next_state, action, reward, np.float(done))) 289 | if i+1 % 10 == 0: 290 | print('Episode {}, The memory size is {} '.format(i, len(agent.memory.storage))) 291 | if len(agent.memory.storage) >= args.capacity-1: 292 | agent.update(10) 293 | 294 | state = next_state 295 | if done or t == args.max_episode -1: 296 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 297 | if i % args.print_log == 0: 298 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 299 | ep_r = 0 300 | break 301 | 302 | if i % args.log_interval == 0: 303 | agent.save() 304 | 305 | else: 306 | raise NameError("mode wrong!!!") 307 | 308 | if __name__ == '__main__': 309 | main() 310 | -------------------------------------------------------------------------------- /Char10 TD3/TD3_BipedalWalker-v2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import namedtuple 3 | from itertools import count 4 | 5 | import os, sys, random 6 | import numpy as np 7 | 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Normal 14 | from tensorboardX import SummaryWriter 15 | 16 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 17 | parser = argparse.ArgumentParser() 18 | 19 | parser.add_argument('--mode', default='train', type=str) # mode = 'train' or 'test' 20 | # OpenAI gym environment name, # ['BipedalWalker-v2', 'Pendulum-v0'] or any continuous environment 21 | # Note that if you want test in another game, you should fine-tuning. 
22 | parser.add_argument("--env_name", default="BipedalWalker-v2") 23 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 24 | parser.add_argument('--target_update_interval', default=1, type=int) 25 | parser.add_argument('--test_iteration', default=10, type=int) 26 | 27 | parser.add_argument('--learning_rate', default=3e-4, type=float) 28 | parser.add_argument('--gamma', default=0.99, type=float) # discount factor 29 | parser.add_argument('--capacity', default=50000, type=int) # replay buffer size 30 | parser.add_argument('--num_iteration', default=100000, type=int) # num of games 31 | parser.add_argument('--batch_size', default=100, type=int) # mini batch size 32 | parser.add_argument('--seed', default=False, type=bool) 33 | parser.add_argument('--random_seed', default=9527, type=int) 34 | # optional parameters 35 | parser.add_argument('--num_hidden_layers', default=2, type=int) 36 | parser.add_argument('--sample_frequency', default=256, type=int) 37 | parser.add_argument('--render', default=False, type=bool) # show UI or not 38 | parser.add_argument('--log_interval', default=50, type=int) # save the model every log_interval episodes 39 | parser.add_argument('--load', default=False, type=bool) # load model 40 | parser.add_argument('--render_interval', default=100, type=int) # env.render() is only called after render_interval episodes 41 | parser.add_argument('--policy_noise', default=0.2, type=float) # std of the noise added to the target policy (target policy smoothing) 42 | parser.add_argument('--noise_clip', default=0.5, type=float) # clipping range of the target policy noise 43 | parser.add_argument('--policy_delay', default=2, type=int) # frequency of delayed actor and target updates 44 | parser.add_argument('--exploration_noise', default=0.1, type=float) # std of the Gaussian exploration noise 45 | parser.add_argument('--max_episode', default=2000, type=int) 46 | parser.add_argument('--print_log', default=5, type=int) 47 | args = parser.parse_args() 48 | 49 | 50 | 51 | 52 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 53 | script_name = os.path.basename(__file__) 54 | env = gym.make(args.env_name) 55 | if args.seed: 56 | env.seed(args.random_seed) 57 | torch.manual_seed(args.random_seed) 58 | np.random.seed(args.random_seed) 59 | 60 | state_dim = env.observation_space.shape[0] 61 | action_dim = env.action_space.shape[0] 62 | max_action = float(env.action_space.high[0]) 63 | min_Val = torch.tensor(1e-7).float().to(device) # min value 64 | 65 | directory = './exp' + script_name + args.env_name +'./' 66 | ''' 67 | Implementation of TD3 with pytorch 68 | Original paper: https://arxiv.org/abs/1802.09477 69 | Not the author's implementation !
70 | ''' 71 | 72 | class Replay_buffer(): 73 | ''' 74 | Code based on: 75 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 76 | Expects tuples of (state, next_state, action, reward, done) 77 | ''' 78 | def __init__(self, max_size=args.capacity): 79 | self.storage = [] 80 | self.max_size = max_size 81 | self.ptr = 0 82 | 83 | def push(self, data): 84 | if len(self.storage) == self.max_size: 85 | self.storage[int(self.ptr)] = data 86 | self.ptr = (self.ptr + 1) % self.max_size 87 | else: 88 | self.storage.append(data) 89 | 90 | def sample(self, batch_size): 91 | ind = np.random.randint(0, len(self.storage), size=batch_size) 92 | x, y, u, r, d = [], [], [], [], [] 93 | 94 | for i in ind: 95 | X, Y, U, R, D = self.storage[i] 96 | x.append(np.array(X, copy=False)) 97 | y.append(np.array(Y, copy=False)) 98 | u.append(np.array(U, copy=False)) 99 | r.append(np.array(R, copy=False)) 100 | d.append(np.array(D, copy=False)) 101 | 102 | return np.array(x), np.array(y), np.array(u), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 103 | 104 | 105 | class Actor(nn.Module): 106 | 107 | def __init__(self, state_dim, action_dim, max_action): 108 | super(Actor, self).__init__() 109 | 110 | self.fc1 = nn.Linear(state_dim, 400) 111 | self.fc2 = nn.Linear(400, 300) 112 | self.fc3 = nn.Linear(300, action_dim) 113 | 114 | self.max_action = max_action 115 | 116 | def forward(self, state): 117 | a = F.relu(self.fc1(state)) 118 | a = F.relu(self.fc2(a)) 119 | a = torch.tanh(self.fc3(a)) * self.max_action 120 | return a 121 | 122 | 123 | class Critic(nn.Module): 124 | 125 | def __init__(self, state_dim, action_dim): 126 | super(Critic, self).__init__() 127 | 128 | self.fc1 = nn.Linear(state_dim + action_dim, 400) 129 | self.fc2 = nn.Linear(400, 300) 130 | self.fc3 = nn.Linear(300, 1) 131 | 132 | def forward(self, state, action): 133 | state_action = torch.cat([state, action], 1) 134 | 135 | q = F.relu(self.fc1(state_action)) 136 | q = F.relu(self.fc2(q)) 137 | q = self.fc3(q) 138 | return q 139 | 140 | 141 | class TD3(): 142 | def __init__(self, state_dim, action_dim, max_action): 143 | 144 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 145 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 146 | self.critic_1 = Critic(state_dim, action_dim).to(device) 147 | self.critic_1_target = Critic(state_dim, action_dim).to(device) 148 | self.critic_2 = Critic(state_dim, action_dim).to(device) 149 | self.critic_2_target = Critic(state_dim, action_dim).to(device) 150 | 151 | self.actor_optimizer = optim.Adam(self.actor.parameters()) 152 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters()) 153 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters()) 154 | 155 | self.actor_target.load_state_dict(self.actor.state_dict()) 156 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 157 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 158 | 159 | self.max_action = max_action 160 | self.memory = Replay_buffer(args.capacity) 161 | self.writer = SummaryWriter(directory) 162 | self.num_critic_update_iteration = 0 163 | self.num_actor_update_iteration = 0 164 | self.num_training = 0 165 | 166 | def select_action(self, state): 167 | state = torch.tensor(state.reshape(1, -1)).float().to(device) 168 | return self.actor(state).cpu().data.numpy().flatten() 169 | 170 | def update(self, num_iteration): 171 | 172 | if self.num_training % 500 == 0: 173 | print("====================================") 174 | 
print("model has been trained for {} times...".format(self.num_training)) 175 | print("====================================") 176 | for i in range(num_iteration): 177 | x, y, u, r, d = self.memory.sample(args.batch_size) 178 | state = torch.FloatTensor(x).to(device) 179 | action = torch.FloatTensor(u).to(device) 180 | next_state = torch.FloatTensor(y).to(device) 181 | done = torch.FloatTensor(d).to(device) 182 | reward = torch.FloatTensor(r).to(device) 183 | 184 | # Select next action according to target policy: 185 | noise = torch.ones_like(action).data.normal_(0, args.policy_noise).to(device) 186 | noise = noise.clamp(-args.noise_clip, args.noise_clip) 187 | next_action = (self.actor_target(next_state) + noise) 188 | next_action = next_action.clamp(-self.max_action, self.max_action) 189 | 190 | # Compute target Q-value: 191 | target_Q1 = self.critic_1_target(next_state, next_action) 192 | target_Q2 = self.critic_2_target(next_state, next_action) 193 | target_Q = torch.min(target_Q1, target_Q2) 194 | target_Q = reward + ((1 - done) * args.gamma * target_Q).detach() 195 | 196 | # Optimize Critic 1: 197 | current_Q1 = self.critic_1(state, action) 198 | loss_Q1 = F.mse_loss(current_Q1, target_Q) 199 | self.critic_1_optimizer.zero_grad() 200 | loss_Q1.backward() 201 | self.critic_1_optimizer.step() 202 | self.writer.add_scalar('Loss/Q1_loss', loss_Q1, global_step=self.num_critic_update_iteration) 203 | 204 | # Optimize Critic 2: 205 | current_Q2 = self.critic_2(state, action) 206 | loss_Q2 = F.mse_loss(current_Q2, target_Q) 207 | self.critic_2_optimizer.zero_grad() 208 | loss_Q2.backward() 209 | self.critic_2_optimizer.step() 210 | self.writer.add_scalar('Loss/Q2_loss', loss_Q2, global_step=self.num_critic_update_iteration) 211 | # Delayed policy updates: 212 | if i % args.policy_delay == 0: 213 | # Compute actor loss: 214 | actor_loss = - self.critic_1(state, self.actor(state)).mean() 215 | 216 | # Optimize the actor 217 | self.actor_optimizer.zero_grad() 218 | actor_loss.backward() 219 | self.actor_optimizer.step() 220 | self.writer.add_scalar('Loss/actor_loss', actor_loss, global_step=self.num_actor_update_iteration) 221 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 222 | target_param.data.copy_(((1- args.tau) * target_param.data) + args.tau * param.data) 223 | 224 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 225 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 226 | 227 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 228 | target_param.data.copy_(((1 - args.tau) * target_param.data) + args.tau * param.data) 229 | 230 | self.num_actor_update_iteration += 1 231 | self.num_critic_update_iteration += 1 232 | self.num_training += 1 233 | 234 | def save(self): 235 | torch.save(self.actor.state_dict(), directory+'actor.pth') 236 | torch.save(self.actor_target.state_dict(), directory+'actor_target.pth') 237 | torch.save(self.critic_1.state_dict(), directory+'critic_1.pth') 238 | torch.save(self.critic_1_target.state_dict(), directory+'critic_1_target.pth') 239 | torch.save(self.critic_2.state_dict(), directory+'critic_2.pth') 240 | torch.save(self.critic_2_target.state_dict(), directory+'critic_2_target.pth') 241 | print("====================================") 242 | print("Model has been saved...") 243 | print("====================================") 244 | 245 | def load(self): 246 | 
self.actor.load_state_dict(torch.load(directory + 'actor.pth')) 247 | self.actor_target.load_state_dict(torch.load(directory + 'actor_target.pth')) 248 | self.critic_1.load_state_dict(torch.load(directory + 'critic_1.pth')) 249 | self.critic_1_target.load_state_dict(torch.load(directory + 'critic_1_target.pth')) 250 | self.critic_2.load_state_dict(torch.load(directory + 'critic_2.pth')) 251 | self.critic_2_target.load_state_dict(torch.load(directory + 'critic_2_target.pth')) 252 | print("====================================") 253 | print("model has been loaded...") 254 | print("====================================") 255 | 256 | 257 | def main(): 258 | agent = TD3(state_dim, action_dim, max_action) 259 | ep_r = 0 260 | 261 | if args.mode == 'test': 262 | agent.load() 263 | for i in range(args.test_iteration): 264 | state = env.reset() 265 | for t in count(): 266 | action = agent.select_action(state) 267 | next_state, reward, done, info = env.step(np.float32(action)) 268 | ep_r += reward 269 | env.render() 270 | if done or t == 2000: 271 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 272 | ep_r = 0 273 | break 274 | state = next_state 275 | 276 | 277 | elif args.mode == 'train': 278 | print("====================================") 279 | print("Collection Experience...") 280 | print("====================================") 281 | if args.load: agent.load() 282 | for i in range(args.num_iteration): 283 | state = env.reset() 284 | for t in range(2000): 285 | 286 | action = agent.select_action(state) 287 | action = action + np.random.normal(0, args.exploration_noise, size=env.action_space.shape[0]) 288 | action = action.clip(env.action_space.low, env.action_space.high) 289 | next_state, reward, done, info = env.step(action) 290 | ep_r += reward 291 | if args.render and i >= args.render_interval: env.render() 292 | agent.memory.push((state, next_state, action, reward, float(done))) 293 | if (i + 1) % 10 == 0: 294 | print('Episode {}, The memory size is {} '.format(i, len(agent.memory.storage))) 295 | if len(agent.memory.storage) >= args.capacity-1: 296 | agent.update(10) 297 | 298 | state = next_state 299 | if done or t == args.max_episode - 1: 300 | agent.writer.add_scalar('ep_r', ep_r, global_step=i) 301 | if i % args.print_log == 0: 302 | print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t)) 303 | ep_r = 0 304 | break 305 | 306 | if i % args.log_interval == 0: 307 | agent.save() 308 | 309 | else: 310 | raise NameError("mode wrong!!!") 311 | 312 | if __name__ == '__main__': 313 | main() 314 | -------------------------------------------------------------------------------- /Char10 TD3/TD3_Pendulum-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/TD3_Pendulum-v0.png -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./actor.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./actor_target.pth: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./actor_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_1.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_1_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_1_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_2.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3.pyPendulum-v0./critic_2_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3.pyPendulum-v0./critic_2_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./actor_target.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_1_target.pth -------------------------------------------------------------------------------- 
/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2.pth -------------------------------------------------------------------------------- /Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/Char10 TD3/expTD3_BipedalWalker-v2.pyBipedalWalker-v2./critic_2_target.pth -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Johnny He 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /More/Application in real world/README.md: -------------------------------------------------------------------------------- 1 | # In this repo we introduce some applications of RL in the real world 2 | 3 | 4 | This [post](https://towardsdatascience.com/advanced-reinforcement-learning-6d769f529eb3) gives you an idea of the applications of reinforcement learning in industry. 5 | 6 | Here I upload some papers about this topic. 7 | -------------------------------------------------------------------------------- /More/MARL/README.md: -------------------------------------------------------------------------------- 1 | # Multi-agent reinforcement learning 2 | 3 | Multi-Agent Reinforcement Learning is a very interesting research area, which has strong connections with single-agent RL, multi-agent systems, game theory, evolutionary computation and optimization theory. 4 | 5 | 6 | 1. Learning to Communicate with Deep Multi-Agent Reinforcement Learning in PyTorch 7 | 8 | You can click [here](https://github.com/sweetice/learning-to-communicate-pytorch) to read the code. 9 | 10 | 11 | 12 | 2. MARL papers 13 | 14 | Paper list of multi-agent reinforcement learning (MARL) 15 | 16 | You can click [here](https://github.com/LantaoYu/MARL-Papers) to read the repo.
17 | -------------------------------------------------------------------------------- /More/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | import re 6 | import os 7 | sns.set(style='darkgrid') 8 | 9 | def get_info(filename): 10 | filename = filename.replace('.npy', '') # remove the .npy extension 11 | algo, env, seed = re.split('_', filename) 12 | seed = int(seed) 13 | return algo, env, seed 14 | 15 | 16 | def get_file_name(path='./'): 17 | file_names = [] 18 | for _, __, file_name in os.walk(path): 19 | file_names += file_name 20 | data_name = [f for f in file_names if '.npy' in f] 21 | return data_name 22 | 23 | def exact_data(file_name, steps): 24 | ''' 25 | extract data from a single .npy file 26 | :param file_name: path of the .npy result file 27 | :return: a DataFrame including time, seed, algo_name, avg_reward 28 | ''' 29 | avg_reward = np.load(file_name).reshape(-1, 1) 30 | algo, env_name, seed = get_info(file_name) 31 | df = pd.DataFrame(avg_reward) 32 | df.columns = ['Average Return'] 33 | df['Time Steps (1e6)'] = steps 34 | df['Algorithm'] = algo 35 | df['env'] = env_name 36 | df['seed'] = seed 37 | return df 38 | 39 | 40 | if __name__ == '__main__': 41 | file_names = get_file_name('./') 42 | _, env_name, __ = get_info(file_names[0]) 43 | df = pd.DataFrame([]) 44 | steps = np.linspace(0, 1, 201) 45 | for file in file_names: 46 | data = exact_data(file, steps) 47 | df = pd.concat([df, data], axis=0) 48 | sns.lineplot(x='Time Steps (1e6)', y='Average Return', data=df, hue='Algorithm', ci=90) 49 | plt.title(env_name) 50 | plt.savefig(env_name + '.svg') 51 | plt.show() 52 | -------------------------------------------------------------------------------- /More/readme.md: -------------------------------------------------------------------------------- 1 | # More 2 | 3 | This folder gives you more insights about RL. 4 | -------------------------------------------------------------------------------- /figures/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndyYue1893/Deep-reinforcement-learning-with-pytorch/7b9fac7e5e40ffdc6f7ccb8b0a81e7841370a996/figures/test.png -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | **Status:** Active (under active development, breaking changes may occur) 2 | 3 | This repository implements classic and state-of-the-art deep reinforcement learning algorithms. Its aim is to provide clear PyTorch code for people to learn these algorithms. 4 | 5 | In the future, more state-of-the-art algorithms will be added and the existing code will also be maintained.
6 | 7 | ![demo](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/figures/grid.gif) 8 | 9 | ## Requirements 10 | - python <=3.6 11 | - tensorboardX 12 | - gym >= 0.10 13 | - pytorch >= 0.4 14 | 15 | **Note that TensorFlow 1.12 (installed below) does not support Python 3.7** 16 | 17 | ## Installation 18 | 19 | ``` 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | If that fails: 24 | 25 | - Install gym 26 | 27 | ``` 28 | pip install gym 29 | ``` 30 | 31 | 32 | 33 | - Install PyTorch 34 | ```bash 35 | please go to the official website to install it: https://pytorch.org/ 36 | 37 | We recommend using an Anaconda virtual environment to manage your packages 38 | 39 | ``` 40 | 41 | - Install tensorboardX 42 | ```bash 43 | pip install tensorboardX 44 | pip install tensorflow==1.12 45 | ``` 46 | 47 | - Test 48 | ``` 49 | cd Char10\ TD3/ 50 | python TD3_BipedalWalker-v2.py --mode test 51 | ``` 52 | 53 | You should see a BipedalWalker window if the installation succeeded. 54 | 55 | BipedalWalker: 56 | 57 | ![](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/figures/test.png) 58 | 59 | - Install openai-baselines (**Optional**) 60 | 61 | ```bash 62 | # clone the openai baselines 63 | git clone https://github.com/openai/baselines.git 64 | cd baselines 65 | pip install -e . 66 | 67 | ``` 68 | 69 | ## DQN 70 | 71 | Here I uploaded two DQN models, trained on CartPole-v0 and MountainCar-v0. 72 | 73 | ### Tips for MountainCar-v0 74 | 75 | This is a sparse binary reward task: only when the car reaches the top of the mountain is there a non-zero reward. In general it may take on the order of 1e5 steps with a stochastic policy. You can add a shaping reward term, for example one that is positively related to the car's current position. Of course, a more advanced approach is inverse reinforcement learning. 76 | 77 | ![value_loss](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char01%20DQN/DQN/pic/value_loss.jpg) 78 | ![step](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char01%20DQN/DQN/pic/finish_episode.jpg) 79 | This is the value loss for DQN. We can see that the loss increased to about 1e13, yet the network still works well. This is because the target_net and act_net drift apart as training goes on, so the computed loss grows large; the earlier loss was small because the reward was very sparse, resulting in only small updates to the two networks. 80 | 81 | ### Papers Related to the DQN 82 | 83 | 84 | 1. Playing Atari with Deep Reinforcement Learning [[arxiv]](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/1.dqn.ipynb) 85 | 2. Deep Reinforcement Learning with Double Q-learning [[arxiv]](https://arxiv.org/abs/1509.06461) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/2.double%20dqn.ipynb) 86 | 3. Dueling Network Architectures for Deep Reinforcement Learning [[arxiv]](https://arxiv.org/abs/1511.06581) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/3.dueling%20dqn.ipynb) 87 | 4. Prioritized Experience Replay [[arxiv]](https://arxiv.org/abs/1511.05952) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/4.prioritized%20dqn.ipynb) 88 | 5. Noisy Networks for Exploration [[arxiv]](https://arxiv.org/abs/1706.10295) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/5.noisy%20dqn.ipynb) 89 | 6. 
A Distributional Perspective on Reinforcement Learning [[arxiv]](https://arxiv.org/pdf/1707.06887.pdf) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/6.categorical%20dqn.ipynb) 90 | 7. Rainbow: Combining Improvements in Deep Reinforcement Learning [[arxiv]](https://arxiv.org/abs/1710.02298) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/7.rainbow%20dqn.ipynb) 91 | 8. Distributional Reinforcement Learning with Quantile Regression [[arxiv]](https://arxiv.org/pdf/1710.10044.pdf) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/8.quantile%20regression%20dqn.ipynb) 92 | 9. Hierarchical Deep Reinforcement Learning: Integrating Temporal Abstraction and Intrinsic Motivation [[arxiv]](https://arxiv.org/abs/1604.06057) [[code]](https://github.com/higgsfield/RL-Adventure/blob/master/9.hierarchical%20dqn.ipynb) 93 | 10. Neural Episodic Control [[arxiv]](https://arxiv.org/pdf/1703.01988.pdf) [[code]](#) 94 | 95 | 96 | ## Policy Gradient 97 | 98 | 99 | Use the following command to run a saved model 100 | 101 | 102 | ``` 103 | python Run_Model.py 104 | ``` 105 | 106 | 107 | Use the following command to train the model 108 | 109 | 110 | ``` 111 | python pytorch_MountainCar-v0.py 112 | ``` 113 | 114 | 115 | 116 | > policyNet.pkl 117 | 118 | This is a model that I have trained. 119 | 120 | 121 | ## Actor-Critic 122 | 123 | This is an algorithmic framework, and the classic REINFORCE method is stored under Actor-Critic. 124 | 125 | ## DDPG 126 | Episode reward in Pendulum-v0: 127 | 128 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char05%20DDPG/DDPG_exp.jpg) 129 | 130 | 131 | ## PPO 132 | 133 | - Original paper: https://arxiv.org/abs/1707.06347 134 | - OpenAI Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 135 | 136 | 137 | ## A2C 138 | 139 | Advantage Actor-Critic (A2C): a paper in 2017 pointed out that the difference in performance between A2C and A3C is not obvious. 140 | 141 | The Asynchronous Advantage Actor Critic method (A3C) has been very influential since the paper was published. The algorithm combines a few key ideas: 142 | 143 | - An updating scheme that operates on fixed-length segments of experience (say, 20 timesteps) and uses these segments to compute estimators of the returns and advantage function (a minimal sketch of this computation is given after this list). 144 | - Architectures that share layers between the policy and value function. 145 | - Asynchronous updates.
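A minimal sketch of the segment-based estimators described above (illustrative only: the function name, the toy numbers, and the plain-NumPy style are assumptions of this sketch, not code taken from `A2C.py`). One fixed-length rollout segment is turned into discounted returns by bootstrapping from the critic's value of the state that follows the segment, and the critic's predictions are subtracted to obtain advantages:

```python
import numpy as np

# Illustrative sketch, not taken from A2C.py in this repo.
def segment_returns_and_advantages(rewards, values, bootstrap_value, dones, gamma=0.99):
    """Discounted returns and advantages for one fixed-length rollout segment."""
    T = len(rewards)
    returns = np.zeros(T, dtype=np.float32)
    running = bootstrap_value                      # V(s_T) predicted by the critic
    for t in reversed(range(T)):
        # Zero the bootstrap wherever an episode terminated inside the segment.
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    advantages = returns - values                  # baseline-subtracted signal for the policy gradient
    return returns, advantages

# Toy 4-step segment, purely for illustration:
rewards = np.array([1.0, 0.0, 0.0, 1.0], dtype=np.float32)
values = np.array([0.5, 0.4, 0.3, 0.2], dtype=np.float32)   # critic predictions V(s_t)
dones = np.zeros(4, dtype=np.float32)
returns, advantages = segment_returns_and_advantages(rewards, values, 0.1, dones)
print(returns, advantages)
```

Masking the bootstrap with `(1 - done)` keeps returns from leaking across episode boundaries when several short episodes fall inside one segment.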
146 | 147 | ## A3C 148 | 149 | Original paper: https://arxiv.org/abs/1602.01783 150 | 151 | ## SAC 152 | 153 | **This is not the paper authors' implementation!** 154 | 155 | Episode reward in Pendulum-v0: 156 | 157 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char09%20SAC/SAC_ep_r_curve.png) 158 | 159 | ## TD3 160 | 161 | **This is not the paper authors' implementation!** 162 | 163 | Episode reward in Pendulum-v0: 164 | 165 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char10%20TD3/TD3_Pendulum-v0.png) 166 | 167 | Episode reward in BipedalWalker-v2: 168 | ![ep_r](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/blob/master/Char10%20TD3/Episode_reward_TD3_BipedakWalker.png) 169 | 170 | If you want to test your trained model: 171 | 172 | ``` 173 | python TD3_BipedalWalker-v2.py --mode test 174 | ``` 175 | 176 | ## Papers Related to Deep Reinforcement Learning 177 | [01] [A Brief Survey of Deep Reinforcement Learning](https://arxiv.org/abs/1708.05866) 178 | [02] [The Beta Policy for Continuous Control Reinforcement Learning](https://www.ri.cmu.edu/wp-content/uploads/2017/06/thesis-Chou.pdf) 179 | [03] [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) 180 | [04] [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) 181 | [05] [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581) 182 | [06] [Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971) 183 | [07] [Continuous Deep Q-Learning with Model-based Acceleration](https://arxiv.org/abs/1603.00748) 184 | [08] [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 185 | [09] [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477) 186 | [10] [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 187 | [11] [Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation](https://arxiv.org/abs/1708.05144) 188 | [12] [High-Dimensional Continuous Control Using Generalized Advantage Estimation](https://arxiv.org/abs/1506.02438) 189 | [13] [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor](https://arxiv.org/abs/1801.01290) 190 | [14] [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/abs/1802.09477) 191 | 192 | ## TO DO 193 | - [x] DDPG 194 | - [x] SAC 195 | - [x] TD3 196 | 197 | 198 | # Best RL courses 199 | - [OpenAI's spinning up](https://spinningup.openai.com/) 200 | - [David Silver's course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html) 201 | - [Berkeley deep RL](http://rll.berkeley.edu/deeprlcourse/) 202 | - [Practical RL](https://github.com/yandexdataschool/Practical_RL) 203 | - [Deep Reinforcement Learning by Hung-yi Lee](https://www.youtube.com/playlist?list=PLJV_el3uVTsODxQFgzMzPLa16h6B8kWM_) 204 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.0 2 | torchvision 3 | tensorflow==1.15.2 4 | tensorboardX 5 | gym 6 | gym[atari] 7 | --------------------------------------------------------------------------------