├── .gitignore
├── A2C.py
├── DDPG.py
├── PPO.py
├── README.md
├── SAC.py
├── TD3.py
├── algorithms.py
├── doc
│   ├── final_report.pdf
│   └── image
│       ├── buffer.png
│       ├── cheetah.png
│       ├── cheetahflip.jpeg
│       ├── halfcheetah.jpeg
│       ├── hiddendim.png
│       ├── hopper.jpeg
│       ├── hopper.png
│       ├── pendulum.jpeg
│       └── seed.png
├── main.py
├── plot_result.py
├── requirement.txt
└── utils
    ├── ReplayBuffer.py
    ├── __pycache__
    │   ├── ReplayBuffer.cpython-35.pyc
    │   ├── ReplayBuffer.cpython-36.pyc
    │   ├── models.cpython-35.pyc
    │   ├── models.cpython-36.pyc
    │   ├── multiprocessing_env.cpython-35.pyc
    │   └── multiprocessing_env.cpython-36.pyc
    ├── models.py
    └── multiprocessing_env.py
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | log/ 3 | weights/ 4 | __pycache__/ 5 | -------------------------------------------------------------------------------- /A2C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import argparse 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | from torch.distributions import Normal 10 | from torch.optim.lr_scheduler import ExponentialLR 11 | from tensorboardX import SummaryWriter 12 | 13 | from utils.models import ValueNetwork, GaussianPolicy 14 | from utils.multiprocessing_env import SubprocVecEnv 15 | 16 | 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | 19 | def make_env(env_name): 20 | def _thunk(): 21 | env = gym.make(env_name) 22 | return env 23 | return _thunk 24 | 25 | class A2C(): 26 | def __init__(self, args): 27 | self.args = args 28 | envs = [make_env(self.args.env_name) for i in range(self.args.num_envs)] 29 | self.envs = SubprocVecEnv(envs) 30 | state_dim = self.envs.observation_space.shape[0] 31 | action_dim = self.envs.action_space.shape[0] 32 | self.eps = np.linspace(0, 0.5, self.args.num_envs) 33 | 34 | self.actor = GaussianPolicy(state_dim, action_dim, 64, self.envs.action_space) 35 | self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr) 36 | self.actor_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=self.actor_optimizer, gamma=0.9) 37 | 38 | self.critic = ValueNetwork(state_dim, 64) 39 | self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr) 40 | self.critic_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=self.critic_optimizer, gamma=0.9) 41 | self.global_steps = 0 42 | self.writer = SummaryWriter("log/" + self.args.env_name) 43 | 44 | if self.args.last_episode > 0: 45 | try: 46 | self.load(self.args.last_episode) 47 | except: 48 | print("can't find last checkpoint file") 49 | 50 | # set random seed 51 | # note: SubprocVecEnv exposes no seed() method, so only torch and numpy are seeded here 52 | torch.manual_seed(self.args.seed) 53 | np.random.seed(self.args.seed) 54 | 55 | 56 | def compute_returns(self, next_value, rewards, dones): 57 | R = next_value 58 | returns = [] 59 | for step in reversed(range(len(rewards))): 60 | R = rewards[step] + self.args.gamma * R * (1 - dones[step]) 61 | returns.insert(0, R) 62 | return returns 63 | 64 | def get_value(self, state): 65 | state = torch.FloatTensor(state) 66 | with torch.no_grad(): 67 | value = self.critic(state) 68 | return value 69 | 70 | def evaluate(self, number = 1, render = True): 71 | env = gym.make(self.args.env_name) 72 | self.actor.eval() 73 | rewards = [] 74 | for _ in range(number): 75 | state = env.reset() 76 | done = False 77 | total_rews = 0 78 | count = 0 79 | while not done: 80 | state = 
torch.FloatTensor([state]).to(device) 81 | with torch.no_grad(): 82 | _, _, action = self.actor.sample(state) 83 | if render: 84 | env.render() 85 | state, reward, done, _ = env.step(action.cpu().numpy()[0]) 86 | total_rews += reward 87 | count += 1 88 | if count > 1000: 89 | print("time out") 90 | break 91 | if render: 92 | print("total reward of this episode is " + str(total_rews)) 93 | rewards.append(total_rews) 94 | env.close() 95 | rewards = np.array(rewards) 96 | if not render: 97 | self.writer.add_scalar('A2C_reward',rewards.mean(), self.global_steps) 98 | return rewards.max(), rewards.min(), rewards.mean() 99 | 100 | def train(self): 101 | state = self.envs.reset() 102 | episode_idx = self.args.last_episode 103 | 104 | self.actor.train() 105 | self.critic.train() 106 | 107 | while episode_idx < self.args.max_episode: 108 | log_probs = [] 109 | states = [] 110 | rewards = [] 111 | dones = [] 112 | 113 | # correct data 114 | for _ in range(self.args.max_length_trajectory): 115 | 116 | state_t = torch.FloatTensor(state).to(device) 117 | action, log_prob, _ = self.actor.sample(state_t, entropy = False) 118 | 119 | if True: 120 | random_action = torch.FloatTensor([self.envs.action_space.sample() for _ in range(self.args.num_envs)]) 121 | explore = (np.random.random(self.args.num_envs) < self.eps) 122 | action[explore] = random_action[explore] 123 | 124 | next_state, reward, done, _ = self.envs.step(action.cpu().detach().numpy()) 125 | self.global_steps += self.args.num_envs 126 | 127 | #value = self.get_value(state) 128 | log_probs.append(log_prob) 129 | states.append(state) 130 | rewards.append(reward) 131 | dones.append(done) 132 | 133 | state = next_state 134 | 135 | next_value = self.get_value(next_state).view(1, -1).cpu().numpy() 136 | returns = self.compute_returns(next_value, rewards, dones) 137 | 138 | log_probs = torch.cat(log_probs).view(-1, self.args.num_envs) 139 | returns = torch.FloatTensor(returns).view(-1, self.args.num_envs) 140 | states = torch.FloatTensor(states) 141 | values = self.critic(states).view(-1, self.args.num_envs) 142 | 143 | # update actor 144 | advantage = returns - values.detach() 145 | self.actor_optimizer.zero_grad() 146 | actor_loss = -(log_probs * advantage).sum() / self.args.num_envs 147 | actor_loss.backward() 148 | self.actor_optimizer.step() 149 | 150 | # update critic 151 | #values = self.critic(states).view(-1, args.num_envs) 152 | for _ in range(1): 153 | values = self.critic(states).view(-1, self.args.num_envs) 154 | self.critic_optimizer.zero_grad() 155 | critic_loss = F.smooth_l1_loss(values, returns) 156 | critic_loss.backward() 157 | self.critic_optimizer.step() 158 | 159 | episode_idx += 1 160 | 161 | if episode_idx % 200 == 0: 162 | self.actor_scheduler.step() 163 | self.critic_scheduler.step() 164 | self.eps = self.eps * 0.9 165 | 166 | 167 | if episode_idx % self.args.print_log == 0: 168 | print("epi {} best reward: {}".format(episode_idx, np.sum(rewards, axis = 0).max())) 169 | self.evaluate(10, False) 170 | self.save(episode_idx) 171 | 172 | def close(self): 173 | self.envs.close() 174 | self.writer.close() 175 | 176 | def save(self, episode = None): 177 | if episode == None: 178 | file_name = "weights/" + self.args.env_name + "_A2C_checkpoint.pt" 179 | else: 180 | file_name = "weights/" + self.args.env_name + "_A2C_checkpoint_" + str(episode) + ".pt" 181 | torch.save({'actor' : self.actor.state_dict(), 182 | 'critic' : self.critic.state_dict()}, file_name) 183 | print("save model to " + file_name) 184 | 185 | 186 | def load(self, 
episode = None): 187 | if episode == None: 188 | file_name = "weights/" + self.args.env_name + "_A2C_checkpoint.pt" 189 | else: 190 | file_name = "weights/" + self.args.env_name + "_A2C_checkpoint_" + str(episode) + ".pt" 191 | checkpoint = torch.load(file_name) 192 | self.actor.load_state_dict(checkpoint['actor']) 193 | self.critic.load_state_dict(checkpoint['critic']) 194 | print("successfully load model from " + file_name) 195 | -------------------------------------------------------------------------------- /DDPG.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import count 3 | 4 | import os, sys, random 5 | import numpy as np 6 | import _pickle as pickle 7 | 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from tensorboardX import SummaryWriter 14 | 15 | from utils.models import QNetwork, DeterministicPolicy 16 | from utils.ReplayBuffer import ReplayBuffer 17 | from algorithms import algorithms 18 | 19 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 20 | 21 | class DDPG(algorithms): 22 | def __init__(self, args): 23 | super().__init__(args) 24 | state_dim = self.env.observation_space.shape[0] 25 | action_dim = self.env.action_space.shape[0] 26 | 27 | self.actor = DeterministicPolicy(state_dim, action_dim, 64, self.env.action_space).to(device) 28 | self.actor_target = DeterministicPolicy(state_dim, action_dim, 64, self.env.action_space).to(device) 29 | self.actor_target.load_state_dict(self.actor.state_dict()) 30 | self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr) 31 | 32 | self.critic = QNetwork(state_dim, action_dim, 64).to(device) 33 | self.critic_target = QNetwork(state_dim, action_dim, 64).to(device) 34 | self.critic_target.load_state_dict(self.critic.state_dict()) 35 | self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr) 36 | 37 | self.replay_buffer = ReplayBuffer(self.args.capacity) 38 | self.num_critic_update_iteration = 0 39 | self.num_actor_update_iteration = 0 40 | self.num_training = 0 41 | self.global_steps = 0 42 | 43 | if self.args.last_episode > 0: 44 | self.load(self.args.last_episode) 45 | 46 | def update(self): 47 | for it in range(self.args.update_iteration): 48 | # sample from replay buffer 49 | x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size) 50 | state = torch.FloatTensor(x).to(device) 51 | action = torch.FloatTensor(u).to(device) 52 | next_state = torch.FloatTensor(y).to(device) 53 | done = torch.FloatTensor(d).to(device) 54 | reward = torch.FloatTensor(r).to(device) 55 | 56 | # computer the target Q value 57 | next_action, _, _ = self.actor_target.sample(next_state) 58 | target_Q = self.critic_target(next_state, next_action) 59 | target_Q = reward + ((1-done) * self.args.gamma * target_Q).detach() 60 | 61 | # get current Q estimate 62 | current_Q = self.critic(state, action) 63 | 64 | # compute cirtic loss and update 65 | critic_loss = F.mse_loss(current_Q, target_Q) 66 | self.critic_optimizer.zero_grad() 67 | critic_loss.backward() 68 | self.critic_optimizer.step() 69 | 70 | # computer actor loss 71 | actor_action, _, _ = self.actor.sample(state) 72 | actor_loss = -self.critic(state, actor_action).mean() 73 | self.actor_optimizer.zero_grad() 74 | actor_loss.backward() 75 | self.actor_optimizer.step() 76 | 77 | # update target model 78 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 79 | 
target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data) 80 | 81 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 82 | target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data) 83 | 84 | self.num_actor_update_iteration += 1 85 | self.num_critic_update_iteration += 1 86 | 87 | def train(self): 88 | for i in range(self.args.max_episode): 89 | state = self.env.reset() 90 | ep_r = 0 91 | for t in count(): 92 | action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device)) 93 | action = action.cpu().detach().numpy()[0] 94 | 95 | next_state, reward, done, info = self.env.step(action) 96 | self.global_steps += 1 97 | ep_r += reward 98 | self.replay_buffer.push((state, next_state, action, reward, np.float(done))) 99 | state = next_state 100 | 101 | if done or t > self.args.max_length_trajectory: 102 | if i % self.args.print_log == 0: 103 | print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps)) 104 | self.evaluate(10, False) 105 | break 106 | 107 | if len(self.replay_buffer.storage) >= self.args.capacity - 1: 108 | self.update() 109 | self.save(i+1) 110 | 111 | def evaluate(self, number = 1, render = True): 112 | rewards = [] 113 | for _ in range(number): 114 | total_rews = 0 115 | time_step = 0 116 | done = False 117 | state = self.env.reset() 118 | while not done: 119 | with torch.no_grad(): 120 | # use the mean action 121 | _, _, action = self.actor.sample(torch.FloatTensor([state]).to(device)) 122 | action = action.cpu().detach().numpy()[0] 123 | if render: 124 | self.env.render() 125 | state, reward, done, _ = self.env.step(action) 126 | total_rews += reward 127 | time_step += 1 128 | 129 | if render: 130 | print("total reward of this episode is " + str(total_rews)) 131 | rewards.append(total_rews) 132 | rewards = np.array(rewards) 133 | if not render: 134 | pickle.dump((self.global_steps, rewards), self.log_file) 135 | print("mean reward {}, max reward {}".format(rewards.mean(), rewards.max())) 136 | 137 | def load(self, episode = None): 138 | file_name = self.weights_file(episode) 139 | checkpoint = torch.load(file_name) 140 | self.actor.load_state_dict(checkpoint['actor']) 141 | self.actor_target.load_state_dict(checkpoint['actor_target']) 142 | self.critic.load_state_dict(checkpoint['critic']) 143 | self.critic_target.load_state_dict(checkpoint['critic_target']) 144 | print("successfully load model from " + file_name) 145 | 146 | def save(self, episode = None): 147 | file_name = self.weights_file(episode) 148 | torch.save({'actor' : self.actor.state_dict(), 149 | 'critic' : self.critic.state_dict(), 150 | 'actor_target' : self.actor_target.state_dict(), 151 | 'critic_target' : self.critic_target.state_dict()}, file_name) 152 | print("save model to " + file_name) 153 | -------------------------------------------------------------------------------- /PPO.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import _pickle as pickle 5 | import gym 6 | from collections import namedtuple 7 | from itertools import count 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from torch.distributions import Normal 14 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 15 | from tensorboardX import SummaryWriter 16 | 17 | from utils.models 
import ValueNetwork, GaussianFixstdPolicy 18 | from algorithms import algorithms 19 | 20 | Transition = namedtuple('Transition', ['state', 'action', 'reward', 'a_log_prob', 'next_state', 'done']) 21 | 22 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 23 | 24 | class PPO(algorithms): 25 | clip_param = 0.2 26 | max_grad_norm = 40 27 | ppo_epoch = 5 28 | buffer_capacity = 2048 29 | batch_size = 32 30 | 31 | def __init__(self, args): 32 | super().__init__(args) 33 | num_state = self.env.observation_space.shape[0] 34 | num_action = self.env.action_space.shape[0] 35 | 36 | self.actor = GaussianFixstdPolicy(num_state, num_action, 64, self.env.action_space) 37 | self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr) 38 | 39 | self.critic = ValueNetwork(num_state, 64) 40 | 41 | self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr) 42 | 43 | self.buffer = [] 44 | self.counter = 0 45 | self.training_step = 0 46 | self.global_steps = 0 47 | 48 | if self.args.last_episode > 0: 49 | self.load(self.args.last_episode) 50 | 51 | def store_transiction(self, transition): 52 | self.buffer.append(transition) 53 | self.counter += 1 54 | return self.counter % self.buffer_capacity == 0 55 | 56 | def compute_returns(self, next_value, rewards, dones): 57 | R = next_value 58 | returns = [] 59 | for step in reversed(range(len(rewards))): 60 | R = rewards[step] + self.args.gamma * R 61 | returns.insert(0, R) 62 | return torch.FloatTensor(returns) 63 | 64 | def update(self): 65 | self.training_step += 1 66 | 67 | state = torch.FloatTensor([t.state for t in self.buffer]) 68 | action = torch.FloatTensor([t.action for t in self.buffer]) 69 | rewards = torch.FloatTensor([t.reward for t in self.buffer]).view(-1, 1) 70 | next_state = torch.FloatTensor([t.next_state for t in self.buffer]) 71 | old_action_log_prob = torch.FloatTensor([t.a_log_prob for t in self.buffer]).view(-1, 1) 72 | dones = torch.FloatTensor([t.done for t in self.buffer]).view(-1, 1) 73 | 74 | rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8) 75 | with torch.no_grad(): 76 | target_v = rewards + self.args.gamma * self.critic(next_state) 77 | advantage = (target_v - self.critic(state)) 78 | 79 | for _ in range(self.ppo_epoch): 80 | for index in BatchSampler(SubsetRandomSampler(range(self.buffer_capacity)), self.batch_size, False): 81 | action_log_prob, entropy = self.actor.action_log_prob(state[index], action[index]) 82 | ratio = torch.exp(action_log_prob - old_action_log_prob[index]) 83 | 84 | L1 = ratio * advantage[index] 85 | L2 = torch.clamp(ratio, 1-self.clip_param, 1+self.clip_param) * advantage[index] 86 | 87 | action_loss = -torch.min(L1, L2).mean() - 0.02 * entropy.sum(-1).mean() 88 | self.actor_optimizer.zero_grad() 89 | action_loss.backward() 90 | #print(action_loss) 91 | nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm) 92 | self.actor_optimizer.step() 93 | 94 | value_loss = F.smooth_l1_loss(self.critic(state[index]), target_v[index]) 95 | self.critic_optimizer.zero_grad() 96 | value_loss.backward() 97 | nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm) 98 | self.critic_optimizer.step() 99 | 100 | del self.buffer[:] 101 | 102 | def train(self): 103 | self.actor.train() 104 | for i_epoch in range(self.args.max_episode): 105 | score = 0 106 | state = self.env.reset() 107 | for t in range(self.args.max_length_trajectory): 108 | action, action_log_prob, _ = self.actor.sample(torch.FloatTensor([state]).to(device)) 109 | action = 
action.cpu().detach().numpy()[0] 110 | action_log_prob = action_log_prob.cpu().detach().numpy()[0] 111 | 112 | #self.env.render() 113 | next_state, reward, done, info = self.env.step(action) 114 | trans = Transition(state, action, reward, action_log_prob, next_state, done) 115 | 116 | if self.store_transiction(trans): 117 | self.update() 118 | 119 | score += reward 120 | state = next_state 121 | 122 | if done: 123 | break 124 | 125 | if i_epoch % self.args.print_log == 0: 126 | print(self.actor.action_log_std.exp().detach()) 127 | print("Ep_i \t {}, time step {}, global_steps is {}".format(i_epoch, t, self.counter)) 128 | self.evaluate(10, False) 129 | 130 | self.save(i_epoch+1) 131 | 132 | def evaluate(self, number = 1, render = True): 133 | self.actor.eval() 134 | rewards = [] 135 | for _ in range(number): 136 | done = False 137 | total_rews = 0 138 | count = 0 139 | state = self.env.reset() 140 | while not done: 141 | with torch.no_grad(): 142 | action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device)) 143 | action = action.cpu().detach().numpy()[0] 144 | if render: 145 | self.env.render() 146 | state, reward, done, _ = self.env.step(action) 147 | 148 | total_rews += reward 149 | count += 1 150 | 151 | rewards.append(total_rews) 152 | if render: 153 | print("total reward of this episode is " + str(total_rews)) 154 | rewards = np.array(rewards) 155 | if not render: 156 | pickle.dump((self.counter, rewards), self.log_file) 157 | self.actor.train() 158 | print("mean reward {}, max reward {}".format(rewards.mean(), rewards.max())) 159 | 160 | def save(self, episode = None): 161 | file_name = self.weights_file(episode) 162 | torch.save({'actor' : self.actor.state_dict(), 163 | 'critic' : self.critic.state_dict()}, file_name) 164 | print("save model to " + file_name) 165 | 166 | def load(self, episode = None): 167 | file_name = self.weights_file(episode) 168 | checkpoint = torch.load(file_name) 169 | self.actor.load_state_dict(checkpoint['actor']) 170 | self.critic.load_state_dict(checkpoint['critic']) 171 | print("successfully load model from " + file_name) 172 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning Reimplementation 2 | This is my final project for [cse573: Artificial Intelligence](https://courses.cs.washington.edu/courses/cse573/20wi/). In this project, I reimplement five state-of-the-art algorithms (A2C, DDPG, PPO, TD3 and SAC) and run experiments to study how different design choices affect their performance. This repo is intended for learning purposes only and still differs in many ways from the published baselines. I borrowed some ideas from [sweetice](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch)'s repo during implementation. 3 | 4 | ## Basic Usage 5 | For example, to train TD3 on the Hopper-v2 environment for 2000 episodes, simply use 6 | 7 | ``` 8 | python main.py --model TD3 --env_name Hopper-v2 --max_episode 2000 9 | ``` 10 | 11 | To evaluate the training result 12 | 13 | ``` 14 | python main.py --model TD3 --env_name Hopper-v2 --last_episode 2000 --mode eval 15 | ``` 16 | There are also many other options specified in the `main.py` file. 
For example, change the random seed to 10 and the capacity of replay buffer to 10000 17 | ``` 18 | python main.py --model TD3 --env_name Hopper-v2 --max_episode 2000 --seed 10 --capacity 10000 19 | ``` 20 | 21 | To visualize the training log 22 | 23 | ``` 24 | python plot_result.py --dir log/Hopper-v2/TD3 25 | ``` 26 | -------------------------------------------------------------------------------- /SAC.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import count 3 | 4 | import os, sys, random 5 | import numpy as np 6 | import _pickle as pickle 7 | 8 | import gym 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | from tensorboardX import SummaryWriter 14 | 15 | from utils.models import QNetwork, GaussianPolicy 16 | from utils.ReplayBuffer import ReplayBuffer 17 | from algorithms import algorithms 18 | 19 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 20 | 21 | class SAC(algorithms): 22 | def __init__(self, args): 23 | super().__init__(args) 24 | state_dim = self.env.observation_space.shape[0] 25 | action_dim = self.env.action_space.shape[0] 26 | 27 | self.actor = GaussianPolicy(state_dim, action_dim, 64, self.env.action_space).to(device) 28 | self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr) 29 | 30 | self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device) 31 | self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(),self.args.lr) 32 | self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device) 33 | self.critic_target_1.load_state_dict(self.critic_1.state_dict()) 34 | 35 | self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device) 36 | self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(), self.args.lr) 37 | self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device) 38 | self.critic_target_2.load_state_dict(self.critic_2.state_dict()) 39 | 40 | self.replay_buffer = ReplayBuffer(self.args.capacity) 41 | 42 | self.global_steps = 0 43 | 44 | def update(self): 45 | for it in range(self.args.update_iteration): 46 | # sample from replay buffer 47 | x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size) 48 | state = torch.FloatTensor(x).to(device) 49 | action = torch.FloatTensor(u).to(device) 50 | next_state = torch.FloatTensor(y).to(device) 51 | done = torch.FloatTensor(d).to(device) 52 | reward = torch.FloatTensor(r).to(device) 53 | 54 | # get the next action and compute target Q 55 | with torch.no_grad(): 56 | next_action, log_prob, _ = self.actor.sample(next_state) 57 | target_Q1 = self.critic_target_1(next_state, next_action) 58 | target_Q2 = self.critic_target_2(next_state, next_action) 59 | target_Q = torch.min(target_Q1, target_Q2) - self.args.alpha * log_prob 60 | y_Q = reward + self.args.gamma * (1 - done) * target_Q 61 | 62 | # update critic 63 | current_Q1 = self.critic_1(state, action) 64 | critic_loss1 = F.mse_loss(current_Q1, y_Q) 65 | self.critic_optimizer_1.zero_grad() 66 | critic_loss1.backward() 67 | self.critic_optimizer_1.step() 68 | 69 | current_Q2 = self.critic_2(state, action) 70 | critic_loss2 = F.mse_loss(current_Q2, y_Q) 71 | self.critic_optimizer_2.zero_grad() 72 | critic_loss2.backward() 73 | self.critic_optimizer_2.step() 74 | 75 | # update actor 76 | actor_action, actor_log_prob, _ = self.actor.sample(state) 77 | Q1 = self.critic_1(state, actor_action) 78 | Q2 = self.critic_2(state, actor_action) 79 | actor_loss = -(torch.min(Q1, Q2) 
- self.args.alpha * actor_log_prob).mean() 80 | self.actor_optimizer.zero_grad() 81 | actor_loss.backward() 82 | self.actor_optimizer.step() 83 | 84 | # update target network 85 | for param, target_param in zip(self.critic_1.parameters(), self.critic_target_1.parameters()): 86 | target_param.data.copy_((1-self.args.tau) * target_param.data + self.args.tau * param.data) 87 | 88 | for param, target_param in zip(self.critic_2.parameters(), self.critic_target_2.parameters()): 89 | target_param.data.copy_((1-self.args.tau) * target_param.data + self.args.tau * param.data) 90 | 91 | def train(self): 92 | for i in range(self.args.max_episode): 93 | state = self.env.reset() 94 | ep_r = 0 95 | for t in count(): 96 | action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device)) 97 | action = action.cpu().detach().numpy()[0] 98 | next_state, reward, done, info = self.env.step(action) 99 | self.global_steps += 1 100 | ep_r += reward 101 | self.replay_buffer.push((state, next_state, action, reward, np.float(done))) 102 | state = next_state 103 | 104 | if done or t > self.args.max_length_trajectory: 105 | if i % self.args.print_log == 0: 106 | print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps)) 107 | self.evaluate(10, False) 108 | ep_r = 0 109 | break 110 | 111 | if len(self.replay_buffer.storage) >= self.args.capacity - 1: 112 | self.update() 113 | 114 | self.save(i+1) 115 | 116 | def evaluate(self, number = 1, render = True): 117 | rewards = [] 118 | for _ in range(number): 119 | state = self.env.reset() 120 | done = False 121 | total_rews = 0 122 | time_step = 0 123 | while not done: 124 | with torch.no_grad(): 125 | # use the mean action 126 | action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device)) 127 | action = action.cpu().detach().numpy()[0] 128 | if render: 129 | self.env.render() 130 | state, reward, done, _ = self.env.step(action) 131 | total_rews += reward 132 | time_step += 1 133 | 134 | if render: 135 | print("total reward of this episode is " + str(total_rews)) 136 | rewards.append(total_rews) 137 | rewards = np.array(rewards) 138 | if not render: 139 | pickle.dump((self.global_steps, rewards), self.log_file) 140 | return rewards.max(), rewards.min(), rewards.mean() 141 | 142 | def save(self, episode): 143 | file_name = self.weights_file(episode) 144 | torch.save({'actor' : self.actor.state_dict(), 145 | 'critic_1' : self.critic_1.state_dict(), 146 | 'critic_2' : self.critic_2.state_dict(), 147 | 'critic_target_1' : self.critic_target_1.state_dict(), 148 | 'critic_target_2' : self.critic_target_2.state_dict()}, file_name) 149 | print("save model to " + file_name) 150 | 151 | def load(self, episode): 152 | file_name = self.weights_file(episode) 153 | checkpoint = torch.load(file_name) 154 | self.actor.load_state_dict(checkpoint['actor']) 155 | self.critic_1.load_state_dict(checkpoint['critic_1']) 156 | self.critic_2.load_state_dict(checkpoint['critic_2']) 157 | self.critic_target_1.load_state_dict(checkpoint['critic_target_1']) 158 | self.critic_target_2.load_state_dict(checkpoint['critic_target_2']) 159 | print("successfully load model from " + file_name) 160 | -------------------------------------------------------------------------------- /TD3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from itertools import count 3 | 4 | import os, sys, random 5 | import numpy as np 6 | 7 | import gym 8 | import torch 9 | import torch.nn as nn 10 
| import torch.nn.functional as F 11 | import torch.optim as optim 12 | from tensorboardX import SummaryWriter 13 | 14 | from utils.models import QNetwork, DeterministicPolicy 15 | from utils.ReplayBuffer import ReplayBuffer 16 | from algorithms import algorithms 17 | import _pickle as pickle 18 | 19 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 20 | 21 | class TD3(algorithms): 22 | def __init__(self, args): 23 | super().__init__(args) 24 | state_dim = self.env.observation_space.shape[0] 25 | action_dim = self.env.action_space.shape[0] 26 | 27 | self.actor = DeterministicPolicy(state_dim, action_dim, self.args.hidden_dim, self.env.action_space).to(device) 28 | self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr) 29 | self.actor_target = DeterministicPolicy(state_dim, action_dim, self.args.hidden_dim, self.env.action_space).to(device) 30 | self.actor_target.load_state_dict(self.actor.state_dict()) 31 | 32 | self.critic_1 = QNetwork(state_dim, action_dim, self.args.hidden_dim).to(device) 33 | self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(), self.args.lr) 34 | self.critic_target_1 = QNetwork(state_dim, action_dim, self.args.hidden_dim).to(device) 35 | self.critic_target_1.load_state_dict(self.critic_1.state_dict()) 36 | 37 | self.critic_2 = QNetwork(state_dim, action_dim, self.args.hidden_dim).to(device) 38 | self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(), self.args.lr) 39 | self.critic_target_2 = QNetwork(state_dim, action_dim, self.args.hidden_dim).to(device) 40 | self.critic_target_2.load_state_dict(self.critic_2.state_dict()) 41 | 42 | self.replay_buffer = ReplayBuffer(self.args.capacity) 43 | self.num_critic_update_iteration = 0 44 | self.num_actor_update_iteration = 0 45 | self.num_training = 0 46 | self.global_steps = 0 47 | 48 | if self.args.last_episode > 0: 49 | self.load(self.args.last_episode) 50 | 51 | def update(self): 52 | for it in range(self.args.update_iteration): 53 | # sample from replay buffer 54 | x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size) 55 | state = torch.FloatTensor(x).to(device) 56 | action = torch.FloatTensor(u).to(device) 57 | next_state = torch.FloatTensor(y).to(device) 58 | done = torch.FloatTensor(d).to(device) 59 | reward = torch.FloatTensor(r).to(device) 60 | 61 | # compute next action from actor target 62 | next_action, _, _ = self.actor_target.sample(next_state) 63 | 64 | # compute target Q 65 | target_Q1 = self.critic_target_1(next_state, next_action) 66 | target_Q2 = self.critic_target_2(next_state, next_action) 67 | target_Q = torch.min(target_Q1, target_Q2) 68 | target_Q = reward + ((1-done) * self.args.gamma * target_Q).detach() 69 | 70 | # optimize critic1 71 | current_Q1 = self.critic_1(state, action) 72 | critic_loss1 = F.mse_loss(current_Q1, target_Q) 73 | self.critic_optimizer_1.zero_grad() 74 | critic_loss1.backward() 75 | self.critic_optimizer_1.step() 76 | 77 | # optimize critic2 78 | current_Q2 = self.critic_2(state, action) 79 | critic_loss2 = F.mse_loss(current_Q2, target_Q) 80 | self.critic_optimizer_2.zero_grad() 81 | critic_loss2.backward() 82 | self.critic_optimizer_2.step() 83 | 84 | # delayed policy update 85 | if it % self.args.policy_delay == 0: 86 | actor_loss = -self.critic_1(state, self.actor(state)).mean() 87 | self.actor_optimizer.zero_grad() 88 | actor_loss.backward() 89 | self.actor_optimizer.step() 90 | 91 | # update target network 92 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 93 | 
target_param.data.copy_((1-self.args.tau) * target_param.data + self.args.tau * param.data) 94 | 95 | for param, target_param in zip(self.critic_1.parameters(), self.critic_target_1.parameters()): 96 | target_param.data.copy_((1-self.args.tau) * target_param.data + self.args.tau * param.data) 97 | 98 | for param, target_param in zip(self.critic_2.parameters(), self.critic_target_2.parameters()): 99 | target_param.data.copy_((1-self.args.tau) * target_param.data + self.args.tau * param.data) 100 | 101 | self.num_actor_update_iteration += 1 102 | self.num_critic_update_iteration += 1 103 | self.num_training += 1 104 | 105 | def train(self): 106 | for i in range(self.args.max_episode): 107 | state = self.env.reset() 108 | ep_r = 0 109 | for t in count(): 110 | action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device)) 111 | action = action.cpu().detach().numpy()[0] 112 | next_state, reward, done, info = self.env.step(action) 113 | self.global_steps += 1 114 | 115 | ep_r += reward 116 | self.replay_buffer.push((state, next_state, action, reward, np.float(done))) 117 | state = next_state 118 | 119 | if done or t > self.args.max_length_trajectory: 120 | if i % self.args.print_log == 0: 121 | print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}".format(i, ep_r, t, self.global_steps)) 122 | self.evaluate(10, False) 123 | break 124 | 125 | if len(self.replay_buffer.storage) >= self.args.capacity * 0.3 - 1: 126 | self.update() 127 | self.save(i+1) 128 | 129 | def evaluate(self, number = 1, render = True): 130 | rewards = [] 131 | for _ in range(number): 132 | total_rews = 0 133 | done = False 134 | time_step = 0 135 | state = self.env.reset() 136 | while not done: 137 | with torch.no_grad(): 138 | # use the mean action 139 | _, _, action = self.actor.sample(torch.FloatTensor([state]).to(device)) 140 | action = action.cpu().detach().numpy()[0] 141 | if render: 142 | self.env.render() 143 | state, reward, done, _ = self.env.step(action) 144 | total_rews += reward 145 | time_step += 1 146 | 147 | # if time_step > 1000: 148 | # print("time out") 149 | # break 150 | if render: 151 | print("total reward of this episode is " + str(total_rews)) 152 | rewards.append(total_rews) 153 | rewards = np.array(rewards) 154 | if not render: 155 | pickle.dump((self.global_steps, rewards), self.log_file) 156 | print("mean reward {}, max reward {}".format(rewards.mean(), rewards.max())) 157 | 158 | def load(self, episode = None): 159 | file_name = self.weights_file(episode) 160 | checkpoint = torch.load(file_name) 161 | self.actor.load_state_dict(checkpoint['actor']) 162 | self.actor_target.load_state_dict(checkpoint['actor_target']) 163 | self.critic_1.load_state_dict(checkpoint['critic_1']) 164 | self.critic_2.load_state_dict(checkpoint['critic_2']) 165 | self.critic_target_1.load_state_dict(checkpoint['critic_target_1']) 166 | self.critic_target_2.load_state_dict(checkpoint['critic_target_2']) 167 | print("successfully load model from " + file_name) 168 | 169 | def save(self, episode = None): 170 | file_name = self.weights_file(episode) 171 | torch.save({'actor' : self.actor.state_dict(), 172 | 'critic_1' : self.critic_1.state_dict(), 173 | 'critic_2' : self.critic_2.state_dict(), 174 | 'actor_target' : self.actor_target.state_dict(), 175 | 'critic_target_1' : self.critic_target_1.state_dict(), 176 | 'critic_target_2' : self.critic_target_2.state_dict()}, file_name) 177 | print("save model to " + file_name) 178 | 
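
DDPG, TD3 and SAC above all refresh their target networks with the same Polyak-averaging loop over parameter pairs. The helper below is a minimal sketch of that pattern; `soft_update` is a hypothetical name, not a function in this repo, and it assumes source and target share the same architecture.

```
import torch

def soft_update(source, target, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise,
    # mirroring the copy_ loops in DDPG.update, TD3.update and SAC.update
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

# e.g. inside TD3.update: soft_update(self.critic_1, self.critic_target_1, self.args.tau)
```

A small tau (0.005 by default in `main.py`) keeps the target networks moving slowly, which stabilizes the bootstrapped Q targets.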
-------------------------------------------------------------------------------- /algorithms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import gym 4 | import numpy as np 5 | 6 | class algorithms(): 7 | def __init__(self, args): 8 | self.args = args 9 | self.env = gym.make(self.args.env_name) 10 | 11 | log_file = os.path.join("log", self.args.env_name, self.args.model) 12 | if not os.path.exists(log_file): 13 | os.makedirs(log_file) 14 | log_file = os.path.join(log_file, self.args.exp_name + ".pck") 15 | if self.args.last_episode > 0: 16 | print("continue logging to " + log_file) 17 | elif self.args.mode == 'train' and os.path.exists(log_file): 18 | os.remove(log_file) 19 | self.log_file = open(log_file, 'ab') 20 | 21 | # set reandom seed 22 | self.env.seed(self.args.seed) 23 | torch.manual_seed(self.args.seed) 24 | np.random.seed(self.args.seed) 25 | 26 | def weights_file(self, episode = None): 27 | file_name = os.path.join("weights", self.args.env_name, self.args.model) 28 | file_name = "weights/" + self.args.env_name + "/" + self.args.model + "/" 29 | if not os.path.exists(file_name): 30 | os.makedirs(file_name) 31 | if episode == None: 32 | file_name = os.path.join(file_name, self.args.exp_name + ".pt") 33 | else: 34 | file_name = os.path.join(file_name, self.args.exp_name + "_" + str(episode) + ".pt") 35 | return file_name 36 | 37 | def close(self): 38 | self.env.close() 39 | self.log_file.close() -------------------------------------------------------------------------------- /doc/final_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/final_report.pdf -------------------------------------------------------------------------------- /doc/image/buffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/buffer.png -------------------------------------------------------------------------------- /doc/image/cheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/cheetah.png -------------------------------------------------------------------------------- /doc/image/cheetahflip.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/cheetahflip.jpeg -------------------------------------------------------------------------------- /doc/image/halfcheetah.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/halfcheetah.jpeg -------------------------------------------------------------------------------- /doc/image/hiddendim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/hiddendim.png 
-------------------------------------------------------------------------------- /doc/image/hopper.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/hopper.jpeg -------------------------------------------------------------------------------- /doc/image/hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/hopper.png -------------------------------------------------------------------------------- /doc/image/pendulum.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/pendulum.jpeg -------------------------------------------------------------------------------- /doc/image/seed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/doc/image/seed.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from SAC import SAC 4 | from TD3 import TD3 5 | from DDPG import DDPG 6 | from PPO import PPO 7 | from A2C import A2C 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', default='Pendulum-v0') 12 | parser.add_argument('--model', default='SAC') 13 | parser.add_argument('--mode', default='train') 14 | parser.add_argument('--num_envs', default=8) 15 | 16 | parser.add_argument('--lr', default=0.001, type=float) 17 | parser.add_argument('--gamma', default=0.99, type=float) 18 | parser.add_argument('--tau', default=0.005, type=float) # target smoothing coefficient 19 | parser.add_argument('--alpha', default=0.2, type=float) 20 | 21 | parser.add_argument('--capacity', default=500000, type=int) # replay buffer size 22 | parser.add_argument('--hidden_dim', default=64, type=int) 23 | 24 | parser.add_argument('--max_episode', default=2000, type=int) # num of games 25 | parser.add_argument('--last_episode', default=0, type=int) 26 | parser.add_argument('--max_length_trajectory', default=5000, type=int) 27 | parser.add_argument('--print_log', default=50, type=int) 28 | parser.add_argument('--exploration_noise', default=0.1) 29 | parser.add_argument('--policy_delay', default=2) 30 | 31 | parser.add_argument('--update_iteration', default=10, type=int) 32 | parser.add_argument('--batch_size', default=64, type=int) # mini batch size 33 | 34 | # experiment relater 35 | parser.add_argument('--seed', default=0, type=int) 36 | parser.add_argument('--exp_name', default='experiment') 37 | args = parser.parse_args() 38 | 39 | def main(): 40 | if args.model == "TD3": 41 | agent = TD3(args) 42 | elif args.model == "DDPG": 43 | agent = DDPG(args) 44 | elif args.model == "PPO": 45 | agent = PPO(args) 46 | elif args.model == "A2C": 47 | agent = A2C(args) 48 | else: 49 | agent = SAC(args) 50 | 51 | if args.mode == 'train': 52 | agent.train() 53 | elif args.mode == 'eval': 54 | agent.evaluate(5) 55 | agent.close() 56 | 57 | if __name__ == '__main__': 58 | main() 
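
Besides the command-line entry point above, an agent can also be driven directly from Python by building the same argument namespace that `main.py` parses. The snippet below is a minimal sketch under that assumption: the field values mirror the parser defaults above (with a smaller buffer and episode budget so updates actually start on Pendulum-v0), and nothing here is an official API of the repo beyond the constructors already shown.

```
import argparse
from SAC import SAC

# mirror the flags defined in main.py's parser
args = argparse.Namespace(
    env_name='Pendulum-v0', model='SAC', mode='train', num_envs=8,
    lr=0.001, gamma=0.99, tau=0.005, alpha=0.2,
    capacity=10000, hidden_dim=64,
    max_episode=100, last_episode=0, max_length_trajectory=5000,
    print_log=50, exploration_noise=0.1, policy_delay=2,
    update_iteration=10, batch_size=64,
    seed=0, exp_name='experiment')

agent = SAC(args)   # checkpoints go to weights/, evaluation logs to log/
agent.train()
agent.close()
```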
-------------------------------------------------------------------------------- /plot_result.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib 5 | matplotlib.use('Qt5Agg') 6 | from matplotlib import pyplot as plt 7 | import argparse 8 | import os 9 | import pandas as pd 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--dir', help = "dir to the log files") 13 | args = parser.parse_args() 14 | 15 | log_files = [os.path.join(args.dir, p) for p in sorted(os.listdir(args.dir))] 16 | 17 | all_result = [] 18 | all_steps = [] 19 | names = [] 20 | for log in log_files: 21 | if log.endswith(".pck"): 22 | names.append(log.split('.')[0].split('/')[-1]) 23 | file = open(log, 'rb') 24 | steps = [] 25 | results = [] 26 | while True: 27 | try: 28 | s, r = pickle.load(file) 29 | steps.append(s) 30 | results.append(r) 31 | except: 32 | file.close() 33 | break 34 | all_result.append(np.array(results)) 35 | all_steps.append(np.array(steps)) 36 | 37 | for i in range(len(all_result)): 38 | steps = all_steps[i] 39 | results = all_result[i] 40 | df = pd.DataFrame(results.transpose()) 41 | df.columns = steps 42 | df = df.melt() 43 | df = df.rename(columns={'variable': 'time_step', 'value' : 'reward'}) 44 | sns.lineplot(x = 'time_step', y = 'reward', data = df) 45 | plt.legend(names) 46 | plt.show() 47 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | seaborn==0.10.0 4 | gym==0.10.11 5 | torch==1.4.0 6 | tensorboardX -------------------------------------------------------------------------------- /utils/ReplayBuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size): 5 | self.storage = [] 6 | self.max_size = max_size 7 | self.ptr = 0 8 | 9 | def push(self, data): 10 | if len(self.storage) == self.max_size: 11 | self.storage[int(self.ptr)] = data 12 | self.ptr = (self.ptr + 1) % self.max_size 13 | else: 14 | self.storage.append(data) 15 | 16 | def sample(self, batch_size): 17 | ind = np.random.randint(0, len(self.storage), size = batch_size) 18 | x, y, u, r, d = [], [], [], [], [] 19 | for i in ind: 20 | X, Y, U, R, D = self.storage[i] 21 | x.append(np.array(X, copy = False)) 22 | y.append(np.array(Y, copy = False)) 23 | u.append(np.array(U, copy = False)) 24 | r.append(np.array(R, copy = False)) 25 | d.append(np.array(D, copy = False)) 26 | return np.array(x).reshape(batch_size, -1), np.array(y).reshape(batch_size, -1), np.array(u).reshape(batch_size, -1), np.array(r).reshape(-1, 1), np.array(d).reshape(-1, 1) 27 | -------------------------------------------------------------------------------- /utils/__pycache__/ReplayBuffer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/utils/__pycache__/ReplayBuffer.cpython-35.pyc -------------------------------------------------------------------------------- /utils/__pycache__/ReplayBuffer.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/utils/__pycache__/ReplayBuffer.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/models.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/utils/__pycache__/models.cpython-35.pyc -------------------------------------------------------------------------------- /utils/__pycache__/models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/utils/__pycache__/models.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/multiprocessing_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/utils/__pycache__/multiprocessing_env.cpython-35.pyc -------------------------------------------------------------------------------- /utils/__pycache__/multiprocessing_env.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tony23545/Deep-Reinforcement-Learning-Reimplementation/b6d852658afd1fdfc9592c0b52235fafb5d1240a/utils/__pycache__/multiprocessing_env.cpython-36.pyc -------------------------------------------------------------------------------- /utils/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import numpy as np 6 | 7 | LOG_SIG_MAX = 2 8 | LOG_SIG_MIN = -20 9 | epsilon = 1e-6 10 | 11 | # Initialize Policy weights 12 | def weights_init_(m): 13 | if isinstance(m, nn.Linear): 14 | torch.nn.init.xavier_uniform_(m.weight, gain=1) 15 | torch.nn.init.constant_(m.bias, 0) 16 | 17 | 18 | class ValueNetwork(nn.Module): 19 | def __init__(self, num_inputs, hidden_dim): 20 | super(ValueNetwork, self).__init__() 21 | 22 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 23 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 24 | self.linear3 = nn.Linear(hidden_dim, 1) 25 | 26 | self.apply(weights_init_) 27 | 28 | def forward(self, state): 29 | x = F.relu(self.linear1(state)) 30 | x = F.relu(self.linear2(x)) 31 | x = self.linear3(x) 32 | return x 33 | 34 | 35 | class QNetwork(nn.Module): 36 | def __init__(self, num_inputs, num_actions, hidden_dim): 37 | super(QNetwork, self).__init__() 38 | 39 | # Q1 architecture 40 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_dim) 41 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 42 | self.linear4 = nn.Linear(hidden_dim, hidden_dim) 43 | self.linear3 = nn.Linear(hidden_dim, 1) 44 | 45 | self.apply(weights_init_) 46 | 47 | def forward(self, state, action): 48 | 49 | xu = torch.cat([state, action], 1) 50 | 51 | x = F.relu(self.linear1(xu)) 52 | x = F.relu(self.linear2(x)) 53 | x = F.relu(self.linear4(x)) 54 | x = self.linear3(x) 55 | return x 56 | 57 | 58 | class GaussianFixstdPolicy(nn.Module): 59 | def __init__(self, num_inputs, num_actions, 
hidden_dim, action_space=None): 60 | super(GaussianFixstdPolicy, self).__init__() 61 | 62 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 63 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 64 | 65 | self.mean_linear = nn.Linear(hidden_dim, num_actions) 66 | self.apply(weights_init_) 67 | 68 | self.action_log_std = nn.Parameter(torch.ones(1, num_actions) * 0.5) 69 | 70 | # action rescaling 71 | if action_space is None: 72 | self.action_scale = torch.tensor(1.) 73 | self.action_bias = torch.tensor(0.) 74 | else: 75 | self.action_scale = torch.FloatTensor( 76 | (action_space.high - action_space.low) / 2.) 77 | self.action_bias = torch.FloatTensor( 78 | (action_space.high + action_space.low) / 2.) 79 | 80 | def forward(self, state): 81 | x = F.tanh(self.linear1(state)) 82 | x = F.tanh(self.linear2(x)) 83 | mean = self.mean_linear(x) 84 | return mean 85 | 86 | def sample(self, state): 87 | mean = self.forward(state) 88 | action_log_std = self.action_log_std.expand_as(mean) 89 | action_std = torch.exp(action_log_std) 90 | #print(action_std) 91 | #print(action_std) 92 | normal = Normal(mean, action_std) 93 | x_t = normal.sample() # for reparameterization trick (mean + std * N(0,1)) 94 | # y_t = torch.tanh(x_t) 95 | # action = y_t * self.action_scale + self.action_bias 96 | log_prob = normal.log_prob(x_t).sum(-1, keepdim=True) 97 | # mean = torch.tanh(mean) * self.action_scale + self.action_bias 98 | return x_t, log_prob, mean 99 | 100 | # for ppo 101 | def action_log_prob(self, state, action): 102 | mean = self.forward(state) 103 | #std = torch.exp(self.log_std) 104 | action_log_std = self.action_log_std.expand_as(mean) 105 | action_std = torch.exp(action_log_std) 106 | 107 | normal = Normal(mean, action_std) 108 | # action = (action - self.action_bias) / self.action_scale 109 | # # inverse tanh 110 | # action = 0.5*torch.log((1+action+0.00000001)/(1-action + 0.00000001)) 111 | log_prob = normal.log_prob(action).sum(-1, keepdim=True) 112 | entropy = normal.entropy() 113 | return log_prob, entropy 114 | 115 | class GaussianPolicy(nn.Module): 116 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None): 117 | super(GaussianPolicy, self).__init__() 118 | 119 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 120 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 121 | 122 | self.mean_linear = nn.Linear(hidden_dim, num_actions) 123 | self.log_std_linear = nn.Linear(hidden_dim, num_actions) 124 | 125 | self.apply(weights_init_) 126 | 127 | # action rescaling 128 | if action_space is None: 129 | self.action_scale = torch.tensor(1.) 130 | self.action_bias = torch.tensor(0.) 131 | else: 132 | self.action_scale = torch.FloatTensor( 133 | (action_space.high - action_space.low) / 2.) 134 | self.action_bias = torch.FloatTensor( 135 | (action_space.high + action_space.low) / 2.) 
136 | 137 | def forward(self, state): 138 | x = F.relu(self.linear1(state)) 139 | x = F.relu(self.linear2(x)) 140 | mean = self.mean_linear(x) 141 | log_std = self.log_std_linear(x) 142 | #log_std = torch.clamp(log_std, min=LOG_SIG_MIN, max=LOG_SIG_MAX) 143 | return mean, log_std 144 | 145 | def sample(self, state, entropy = True): 146 | mean, log_std = self.forward(state) 147 | std = log_std.exp() 148 | normal = Normal(mean, std) 149 | x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) 150 | y_t = torch.tanh(x_t) 151 | action = y_t * self.action_scale + self.action_bias 152 | log_prob = normal.log_prob(x_t) 153 | # Enforcing Action Bound 154 | if entropy: 155 | log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon) 156 | log_prob = log_prob.sum(1, keepdim=True) 157 | mean = torch.tanh(mean) * self.action_scale + self.action_bias 158 | return action, log_prob, mean 159 | 160 | def to(self, device): 161 | self.action_scale = self.action_scale.to(device) 162 | self.action_bias = self.action_bias.to(device) 163 | return super(GaussianPolicy, self).to(device) 164 | 165 | 166 | class DeterministicPolicy(nn.Module): 167 | def __init__(self, num_inputs, num_actions, hidden_dim, action_space=None): 168 | super(DeterministicPolicy, self).__init__() 169 | self.linear1 = nn.Linear(num_inputs, hidden_dim) 170 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 171 | self.linear3 = nn.Linear(hidden_dim, hidden_dim) 172 | 173 | self.mean = nn.Linear(hidden_dim, num_actions) 174 | self.noise = torch.Tensor(num_actions) 175 | 176 | self.apply(weights_init_) 177 | 178 | # action rescaling 179 | if action_space is None: 180 | self.action_scale = 1. 181 | self.action_bias = 0. 182 | else: 183 | self.action_scale = torch.FloatTensor( 184 | (action_space.high - action_space.low) / 2.) 185 | self.action_bias = torch.FloatTensor( 186 | (action_space.high + action_space.low) / 2.) 
187 | 188 | def forward(self, state): 189 | x = F.relu(self.linear1(state)) 190 | x = F.relu(self.linear2(x)) 191 | x = F.relu(self.linear3(x)) 192 | mean = torch.tanh(self.mean(x)) * self.action_scale + self.action_bias 193 | return mean 194 | 195 | def sample(self, state): 196 | mean = self.forward(state) 197 | noise = self.noise.normal_(0., std=0.2) 198 | noise = noise.clamp(-0.5, 0.5) 199 | action = mean + noise 200 | return action, torch.tensor(0.), mean 201 | 202 | def to(self, device): 203 | self.action_scale = self.action_scale.to(device) 204 | self.action_bias = self.action_bias.to(device) 205 | self.noise = self.noise.to(device) 206 | return super(DeterministicPolicy, self).to(device) -------------------------------------------------------------------------------- /utils/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | 4 | def worker(remote, parent_remote, env_fn_wrapper): 5 | parent_remote.close() 6 | env = env_fn_wrapper.x() 7 | while True: 8 | cmd, data = remote.recv() 9 | if cmd == 'step': 10 | ob, reward, done, info = env.step(data) 11 | if done: 12 | ob = env.reset() 13 | remote.send((ob, reward, done, info)) 14 | elif cmd == 'reset': 15 | ob = env.reset() 16 | remote.send(ob) 17 | elif cmd == 'reset_task': 18 | ob = env.reset_task() 19 | remote.send(ob) 20 | elif cmd == 'close': 21 | remote.close() 22 | break 23 | elif cmd == 'get_spaces': 24 | remote.send((env.observation_space, env.action_space)) 25 | else: 26 | raise NotImplementedError 27 | 28 | class VecEnv(object): 29 | ''' 30 | An abstract asynchronous vectorized environment 31 | ''' 32 | def __init__(self, num_envs, observation_space, action_space): 33 | self.num_envs = num_envs 34 | self.observation_space = observation_space 35 | self.action_space = action_space 36 | 37 | def reset(self): 38 | pass 39 | 40 | def step_async(self, actions): 41 | ''' 42 | Tell all the environments to start taking a step 43 | with the given actions. 44 | Call step_wait() to get the results of the step. 45 | You should not call this if a step_async run is already pending. 46 | ''' 47 | pass 48 | 49 | def step_wait(self): 50 | ''' 51 | Wait for the step taken with step_async(). 
52 | Return (obs, rews, dones, infos) 53 | ''' 54 | pass 55 | 56 | def close(self): 57 | pass 58 | 59 | def step(self, actions): 60 | self.step_async(actions) 61 | return self.step_wait() 62 | 63 | class CloudpickleWrapper(): 64 | ''' 65 | Use cloudpickle to serialize contents 66 | ''' 67 | def __init__(self, x): 68 | self.x = x 69 | def __getstate__(self): 70 | import cloudpickle 71 | return cloudpickle.dumps(self.x) 72 | def __setstate__(self, ob): 73 | import pickle 74 | self.x = pickle.loads(ob) 75 | 76 | class SubprocVecEnv(VecEnv): 77 | def __init__(self, env_fns, spaces = None): 78 | self.waiting = False 79 | self.closed = False 80 | nenvs = len(env_fns) 81 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 82 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 83 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 84 | 85 | for p in self.ps: 86 | p.daemon = True 87 | p.start() 88 | for remote in self.work_remotes: 89 | remote.close() 90 | 91 | self.remotes[0].send(('get_spaces', None)) 92 | observation_space, action_space = self.remotes[0].recv() 93 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 94 | 95 | def step_async(self, actions): 96 | for remote, action in zip(self.remotes, actions): 97 | remote.send(('step', action)) 98 | self.waiting = True 99 | 100 | def step_wait(self): 101 | results = [remote.recv() for remote in self.remotes] 102 | self.waiting = False 103 | obs, rews, dones, infos = zip(*results) 104 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 105 | 106 | def reset(self): 107 | for remote in self.remotes: 108 | remote.send(('reset', None)) 109 | return np.stack([remote.recv() for remote in self.remotes]) 110 | 111 | def reset_task(self): 112 | for remote in self.remotes: 113 | remote.send(('reset_task', None)) 114 | return np.stack([remote.recv() for remote in self.remotes]) 115 | 116 | def close(self): 117 | if self.closed: 118 | return 119 | if self.waiting: 120 | for remote in self.remotes: 121 | remote.recv() 122 | 123 | for remote in self.remotes: 124 | remote.send(('close', None)) 125 | for p in self.ps: 126 | p.join() 127 | self.closed = True 128 | 129 | def __len__(self): 130 | return self.num_envs 131 | 132 | --------------------------------------------------------------------------------
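
To make the vectorized wrapper concrete, here is a minimal, self-contained sketch of how `A2C.py` drives `SubprocVecEnv`: each list entry is a thunk that builds one environment inside its own worker process, and `step` sends one action per worker and stacks the results. The environment name and `num_envs` are arbitrary choices for illustration.

```
import gym
import numpy as np
from utils.multiprocessing_env import SubprocVecEnv

def make_env(env_name):
    def _thunk():
        return gym.make(env_name)   # constructed lazily inside the worker process
    return _thunk

if __name__ == '__main__':          # guard matters if the multiprocessing start method is 'spawn'
    num_envs = 4
    envs = SubprocVecEnv([make_env('Pendulum-v0') for _ in range(num_envs)])
    states = envs.reset()                                     # shape (num_envs, obs_dim)
    actions = np.stack([envs.action_space.sample() for _ in range(num_envs)])
    next_states, rewards, dones, infos = envs.step(actions)   # one synchronized step across workers
    print(next_states.shape, rewards.shape)
    envs.close()
```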