├── .gitignore
├── README.md
├── __init__.py
├── a2c
│   ├── a2c.py
│   ├── a2c_test.py
│   ├── decoupled_a2c.py
│   ├── decoupled_a2c_test.py
│   └── models.py
├── a3c
│   ├── a3c.py
│   ├── a3c_model.pth
│   ├── a3c_policy_model.pth
│   ├── a3c_test.py
│   ├── a3c_value_model.pth
│   ├── models.py
│   └── worker.py
├── common
│   ├── __init__.py
│   ├── noise.py
│   ├── replay_buffers.py
│   └── utils.py
├── ddpg
│   ├── __init__.py
│   ├── ddpg.py
│   ├── ddpg_test.py
│   └── models.py
├── ipynb
│   ├── A2C.ipynb
│   ├── DDPG.ipynb
│   ├── SAC2018.ipynb
│   ├── SAC2019.ipynb
│   └── TD3.ipynb
├── sac
│   ├── __init__.py
│   ├── models.py
│   ├── sac2018.py
│   ├── sac2019.py
│   └── sac_test.py
├── setup.py
└── td3
    ├── __init__.py
    ├── models.py
    ├── td3.py
    └── td3_test.py
/.gitignore: -------------------------------------------------------------------------------- 1 | policygradients.egg-info/ 2 | common/__pycache__/ 3 | ddpg/__pycache__/ 4 | td3/__pycache__/ 5 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Policy-Gradient-Methods 2 | 3 | Author: Chris Yoon 4 | 5 | Implementations of important policy gradient algorithms in deep reinforcement learning. 6 | 7 | 8 | 9 | ## Implementations 10 | 11 | - **Advantage Actor-Critic (A2C)** 12 | 13 | Paper: ["Asynchronous Methods for Deep Reinforcement Learning" (Mnih et al., 2016)](https://arxiv.org/pdf/1602.01783.pdf) 14 | 15 | - **Asynchronous Advantage Actor-Critic (A3C)** 16 | 17 | Paper: ["Asynchronous Methods for Deep Reinforcement Learning" (Mnih et al., 2016)](https://arxiv.org/pdf/1602.01783.pdf) 18 | 19 | - **Deep Deterministic Policy Gradients (DDPG)** 20 | 21 | Paper: ["Continuous control with deep reinforcement learning" (Lillicrap et al., 2015)](https://arxiv.org/abs/1509.02971) 22 | 23 | - **Twin Delayed Deep Deterministic Policy Gradients (TD3)** 24 | 25 | Paper: ["Addressing Function Approximation Error in Actor-Critic Methods" (Fujimoto et al., 2018)](https://arxiv.org/abs/1802.09477) 26 | 27 | - **Soft Actor-Critic (SAC)** 28 | 29 | - Paper (`sac2018.py`): ["Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor" (Haarnoja et al., 2018)](https://arxiv.org/abs/1801.01290) 30 | - Paper (`sac2019.py`): ["Soft Actor-Critic Algorithms and Applications" (Haarnoja et al., 2019)](https://arxiv.org/abs/1812.05905) 31 | - The algorithm in the 2018 paper uses a value network, double Q networks, and a Gaussian policy; the 2019 version drops the separate value network, keeps the double Q networks and Gaussian policy, and adds automatic entropy tuning. 32 | - TODO: SAC for discrete action spaces 33 | 34 | More implementations will be added soon. 35 | 36 | ## Known Dependencies 37 | 38 | - Python 3.6 39 | - PyTorch 1.0.2 40 | - gym 0.12.5 41 | 42 | ## How to run 43 | 44 | Install the package: 45 | 46 | ```bash 47 | git clone git@github.com:cyoon1729/Policy-Gradient-Methods.git 48 | cd Policy-Gradient-Methods 49 | pip install .
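# An editable install (pip's standard -e flag) may be more convenient while modifying the source:
# pip install -e .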
50 | ``` 51 | 52 | Example: 53 | 54 | ```python 55 | import gym 56 | 57 | from policygradients.common.utils import mini_batch_train # import training function 58 | from policygradients.td3.td3 import TD3Agent # import agent from algorithm of interest 59 | 60 | # Create Gym environment 61 | env = gym.make("Pendulum-v0") 62 | 63 | # check agent class for initialization parameters and initialize agent 64 | gamma = 0.99 65 | tau = 1e-2 66 | noise_std = 0.2 67 | bound = 0.5 68 | delay_step = 2 69 | buffer_maxlen = 100000 70 | critic_lr = 1e-3 71 | actor_lr = 1e-3 72 | 73 | agent = TD3Agent(env, gamma, tau, buffer_maxlen, delay_step, noise_std, bound, critic_lr, actor_lr) 74 | 75 | # define training parameters 76 | max_episodes = 100 77 | max_steps = 500 78 | batch_size = 32 79 | 80 | # train agent with mini_batch_train function 81 | episode_rewards = mini_batch_train(env, agent, max_episodes, max_steps, batch_size) 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/__init__.py -------------------------------------------------------------------------------- /a2c/a2c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | from torch.distributions import Categorical 5 | 6 | from models import TwoHeadNetwork 7 | 8 | 9 | class A2CAgent(): 10 | 11 | def __init__(self, env, gamma, lr): 12 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | self.env = env 15 | self.obs_dim = env.observation_space.shape[0] 16 | self.action_dim = env.action_space.n 17 | 18 | self.gamma = gamma 19 | self.lr = lr 20 | 21 | self.model = TwoHeadNetwork(self.obs_dim, self.action_dim) 22 | self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr) 23 | 24 | def get_action(self, state): 25 | state = torch.FloatTensor(state).to(self.device) 26 | logits, _ = self.model.forward(state) 27 | dist = F.softmax(logits, dim=0) 28 | probs = Categorical(dist) 29 | 30 | return probs.sample().cpu().detach().item() 31 | 32 | def compute_loss(self, trajectory): 33 | states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) 34 | actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device) 35 | rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) 36 | next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device) 37 | dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device) 38 | 39 | # compute discounted rewards 40 | discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ 41 | * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. 
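# discounted_rewards[j] is the full Monte-Carlo return from step j:
#   G_j = r_j + gamma * r_{j+1} + gamma^2 * r_{j+2} + ...
# Note that G_j already contains the immediate reward r_j, so value_targets below
# (rewards + G_j) counts r_j twice; the conventional actor-critic target would be G_j alone.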
42 | value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device) 43 | 44 | logits, values = self.model.forward(states) 45 | dists = F.softmax(logits, dim=1) 46 | probs = Categorical(dists) 47 | 48 | # compute value loss 49 | value_loss = F.mse_loss(values, value_targets.detach()) 50 | 51 | 52 | # compute entropy bonus 53 | entropy = [] 54 | for dist in dists: 55 | entropy.append(-torch.sum(dist.mean() * torch.log(dist))) 56 | entropy = torch.stack(entropy).sum() 57 | 58 | # compute policy loss 59 | advantage = value_targets - values 60 | policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach() 61 | policy_loss = policy_loss.mean() 62 | 63 | total_loss = policy_loss + value_loss - 0.001 * entropy 64 | return total_loss 65 | 66 | def update(self, trajectory): 67 | loss = self.compute_loss(trajectory) 68 | 69 | self.optimizer.zero_grad() 70 | loss.backward() 71 | self.optimizer.step() -------------------------------------------------------------------------------- /a2c/a2c_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from a2c import A2CAgent 4 | 5 | 6 | env = gym.make("CartPole-v0") 7 | obs_dim = env.observation_space.shape[0] 8 | action_dim = env.action_space.n 9 | MAX_EPISODE = 1500 10 | MAX_STEPS = 500 11 | 12 | lr = 1e-4 13 | gamma = 0.99 14 | 15 | agent = A2CAgent(env, gamma, lr) 16 | 17 | def run(): 18 | for episode in range(MAX_EPISODE): 19 | state = env.reset() 20 | trajectory = [] # [[s, a, r, s', done], [], ...] 21 | episode_reward = 0 22 | for steps in range(MAX_STEPS): 23 | action = agent.get_action(state) 24 | next_state, reward, done, _ = env.step(action) 25 | trajectory.append([state, action, reward, next_state, done]) 26 | episode_reward += reward 27 | 28 | if done: 29 | break 30 | 31 | state = next_state 32 | if episode % 10 == 0: 33 | print("Episode " + str(episode) + ": " + str(episode_reward)) 34 | agent.update(trajectory) 35 | 36 | run() -------------------------------------------------------------------------------- /a2c/decoupled_a2c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | from torch.distributions import Categorical 5 | 6 | from models import ValueNetwork, PolicyNetwork 7 | 8 | 9 | # unlike A2CAgent in a2c.py, here I separated value and policy network. 
10 | class A2CAgent(): 11 | 12 | def __init__(self, env, gamma, lr): 13 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | 15 | self.env = env 16 | self.obs_dim = env.observation_space.shape[0] 17 | self.action_dim = env.action_space.n 18 | 19 | self.gamma = gamma 20 | self.lr = lr 21 | 22 | self.value_network = ValueNetwork(self.obs_dim, 1) 23 | self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim) 24 | 25 | self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr) 26 | self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr) 27 | 28 | def get_action(self, state): 29 | state = torch.FloatTensor(state).to(self.device) 30 | logits = self.policy_network.forward(state) 31 | dist = F.softmax(logits, dim=0) 32 | probs = Categorical(dist) 33 | 34 | return probs.sample().cpu().detach().item() 35 | 36 | def compute_loss(self, trajectory): 37 | states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) 38 | actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device) 39 | rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) 40 | next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device) 41 | dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device) 42 | 43 | # compute value target 44 | discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ 45 | * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. 46 | value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device) 47 | 48 | # compute value loss 49 | values = self.value_network.forward(states) 50 | value_loss = F.mse_loss(values, value_targets.detach()) 51 | 52 | 53 | # compute policy loss with entropy bonus 54 | logits = self.policy_network.forward(states) 55 | dists = F.softmax(logits, dim=1) 56 | probs = Categorical(dists) 57 | 58 | # compute entropy bonus 59 | entropy = [] 60 | for dist in dists: 61 | entropy.append(-torch.sum(dist.mean() * torch.log(dist))) 62 | entropy = torch.stack(entropy).sum() 63 | 64 | advantage = value_targets - values 65 | policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach() 66 | policy_loss = policy_loss.mean() - 0.001 * entropy 67 | 68 | return value_loss, policy_loss 69 | 70 | def update(self, trajectory): 71 | value_loss, policy_loss = self.compute_loss(trajectory) 72 | 73 | self.value_optimizer.zero_grad() 74 | value_loss.backward() 75 | self.value_optimizer.step() 76 | 77 | self.policy_optimizer.zero_grad() 78 | policy_loss.backward() 79 | self.policy_optimizer.step() -------------------------------------------------------------------------------- /a2c/decoupled_a2c_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from decoupled_a2c import A2CAgent 4 | 5 | 6 | env = gym.make("CartPole-v0") 7 | obs_dim = env.observation_space.shape[0] 8 | action_dim = env.action_space.n 9 | MAX_EPISODE = 1500 10 | MAX_STEPS = 500 11 | 12 | lr = 1e-4 13 | gamma = 0.99 14 | 15 | agent = A2CAgent(env, gamma, lr) 16 | 17 | def run(): 18 | for episode in range(MAX_EPISODE): 19 | state = env.reset() 20 | trajectory = [] # [[s, a, r, s', done], [], ...] 
21 | episode_reward = 0 22 | for steps in range(MAX_STEPS): 23 | action = agent.get_action(state) 24 | next_state, reward, done, _ = env.step(action) 25 | trajectory.append([state, action, reward, next_state, done]) 26 | episode_reward += reward 27 | 28 | if done: 29 | break 30 | 31 | state = next_state 32 | if episode % 10 == 0: 33 | print("Episode " + str(episode) + ": " + str(episode_reward)) 34 | agent.update(trajectory) 35 | 36 | run() -------------------------------------------------------------------------------- /a2c/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class TwoHeadNetwork(nn.Module): 7 | 8 | def __init__(self, input_dim, output_dim): 9 | super(TwoHeadNetwork, self).__init__() 10 | self.policy1 = nn.Linear(input_dim, 256) 11 | self.policy2 = nn.Linear(256, output_dim) 12 | 13 | self.value1 = nn.Linear(input_dim, 256) 14 | self.value2 = nn.Linear(256, 1) 15 | 16 | def forward(self, state): 17 | logits = F.relu(self.policy1(state)) 18 | logits = self.policy2(logits) 19 | 20 | value = F.relu(self.value1(state)) 21 | value = self.value2(value) 22 | 23 | return logits, value 24 | 25 | 26 | class ValueNetwork(nn.Module): 27 | 28 | def __init__(self, input_dim, output_dim): 29 | super(ValueNetwork, self).__init__() 30 | self.fc1 = nn.Linear(input_dim, 256) 31 | self.fc2 = nn.Linear(256, output_dim) 32 | 33 | def forward(self, state): 34 | value = F.relu(self.fc1(state)) 35 | value = self.fc2(value) 36 | 37 | return value 38 | 39 | 40 | class PolicyNetwork(nn.Module): 41 | 42 | def __init__(self, input_dim, output_dim): 43 | super(PolicyNetwork, self).__init__() 44 | self.fc1 = nn.Linear(input_dim, 256) 45 | self.fc2 = nn.Linear(256, output_dim) 46 | 47 | def forward(self, state): 48 | logits = F.relu(self.fc1(state)) 49 | logits = self.fc2(logits) 50 | 51 | return logits -------------------------------------------------------------------------------- /a3c/a3c.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.multiprocessing as mp 4 | import gym 5 | 6 | from models import TwoHeadNetwork, ValueNetwork, PolicyNetwork 7 | from worker import Worker, DecoupledWorker 8 | 9 | 10 | class A3CAgent: 11 | 12 | def __init__(self, env, gamma, lr, global_max_episode): 13 | self.env = env 14 | 15 | self.gamma = gamma 16 | self.lr = lr 17 | self.global_episode = mp.Value('i', 0) 18 | self.GLOBAL_MAX_EPISODE = global_max_episode 19 | 20 | self.global_network = TwoHeadNetwork(self.env.observation_space.shape[0], self.env.action_space.n) 21 | self.global_network.share_memory() 22 | self.global_optimizer = optim.Adam(self.global_network.parameters(), lr=lr) 23 | self.workers = [Worker(i, env, self.gamma, self.global_network, self.global_optimizer, self.global_episode, self.GLOBAL_MAX_EPISODE) for i in range(mp.cpu_count())] 24 | 25 | def train(self): 26 | print("Training on {} cores".format(mp.cpu_count())) 27 | input("Enter to start") 28 | 29 | [worker.start() for worker in self.workers] 30 | [worker.join() for worker in self.workers] 31 | 32 | def save_model(self): 33 | torch.save(self.global_network.state_dict(), "a3c_model.pth") 34 | 35 | 36 | class DecoupledA3CAgent: 37 | 38 | def __init__(self, env, gamma, lr, global_max_episode): 39 | self.env = env 40 | 41 | self.gamma = gamma 42 | self.lr = lr 43 | self.global_episode = mp.Value('i', 0) 44 | 
self.GLOBAL_MAX_EPISODE = global_max_episode 45 | 46 | self.global_value_network = ValueNetwork(self.env.observation_space.shape[0], 1) 47 | self.global_value_network.share_memory() 48 | self.global_policy_network = PolicyNetwork(self.env.observation_space.shape[0], self.env.action_space.n) 49 | self.global_policy_network.share_memory() 50 | self.global_value_optimizer = optim.Adam(self.global_value_network.parameters(), lr=lr) 51 | self.global_policy_optimizer = optim.Adam(self.global_policy_network.parameters(), lr=lr) 52 | 53 | self.workers = [DecoupledWorker(i, env, self.gamma, self.global_value_network, self.global_policy_network,\ 54 | self.global_value_optimizer, self.global_policy_optimizer, self.global_episode, self.GLOBAL_MAX_EPISODE) for i in range(mp.cpu_count())] 55 | 56 | def train(self): 57 | print("Training on {} cores".format(mp.cpu_count())) 58 | input("Enter to start") 59 | 60 | [worker.start() for worker in self.workers] 61 | [worker.join() for worker in self.workers] 62 | 63 | def save_model(self): 64 | torch.save(self.global_value_network.state_dict(), "a3c_value_model.pth") 65 | torch.save(self.global_policy_network.state_dict(), "a3c_policy_model.pth") 66 | 67 | -------------------------------------------------------------------------------- /a3c/a3c_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/a3c/a3c_model.pth -------------------------------------------------------------------------------- /a3c/a3c_policy_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/a3c/a3c_policy_model.pth -------------------------------------------------------------------------------- /a3c/a3c_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from a3c import A3CAgent, DecoupledA3CAgent 3 | 4 | def train_twoHeaded(): 5 | agent = A3CAgent(env, gamma, lr, GLOBAL_MAX_EPISODE) 6 | agent.train() 7 | agent.save_model() 8 | 9 | def train_decoupled(): 10 | agent = DecoupledA3CAgent(env, gamma, lr, GLOBAL_MAX_EPISODE) 11 | agent.train() 12 | agent.save_model() 13 | 14 | 15 | if __name__ == "__main__": 16 | env = gym.make("CartPole-v0") 17 | gamma = 0.99 18 | lr = 1e-3 19 | GLOBAL_MAX_EPISODE = 1000 20 | 21 | #train_twoHeaded() 22 | train_decoupled() 23 | -------------------------------------------------------------------------------- /a3c/a3c_value_model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/a3c/a3c_value_model.pth -------------------------------------------------------------------------------- /a3c/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class TwoHeadNetwork(nn.Module): 7 | 8 | def __init__(self, input_dim, output_dim): 9 | super(TwoHeadNetwork, self).__init__() 10 | self.policy1 = nn.Linear(input_dim, 256) 11 | self.policy2 = nn.Linear(256, output_dim) 12 | 13 | self.value1 = nn.Linear(input_dim, 256) 14 | self.value2 = nn.Linear(256, 1) 15 | 16 | def forward(self, state): 17 | logits = F.relu(self.policy1(state)) 18 | logits = 
self.policy2(logits) 19 | 20 | value = F.relu(self.value1(state)) 21 | value = self.value2(value) 22 | 23 | return logits, value 24 | 25 | 26 | class ValueNetwork(nn.Module): 27 | 28 | def __init__(self, input_dim, output_dim): 29 | super(ValueNetwork, self).__init__() 30 | self.fc1 = nn.Linear(input_dim, 256) 31 | self.fc2 = nn.Linear(256, output_dim) 32 | 33 | def forward(self, state): 34 | value = F.relu(self.fc1(state)) 35 | value = self.fc2(value) 36 | 37 | return value 38 | 39 | 40 | class PolicyNetwork(nn.Module): 41 | 42 | def __init__(self, input_dim, output_dim): 43 | super(PolicyNetwork, self).__init__() 44 | self.fc1 = nn.Linear(input_dim, 256) 45 | self.fc2 = nn.Linear(256, output_dim) 46 | 47 | def forward(self, state): 48 | logits = F.relu(self.fc1(state)) 49 | logits = self.fc2(logits) 50 | 51 | return logits -------------------------------------------------------------------------------- /a3c/worker.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.multiprocessing as mp 4 | from torch.distributions import Categorical 5 | 6 | from models import TwoHeadNetwork, ValueNetwork, PolicyNetwork 7 | 8 | 9 | class Worker(mp.Process): 10 | 11 | def __init__(self, id, env, gamma, global_network, global_optimizer, global_episode, GLOBAL_MAX_EPISODE): 12 | super(Worker, self).__init__() 13 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | self.name = "w%i" % id 15 | 16 | self.env = env 17 | self.env.seed(id) 18 | self.obs_dim = env.observation_space.shape[0] 19 | self.action_dim = env.action_space.n 20 | 21 | self.gamma = gamma 22 | self.local_network = TwoHeadNetwork(self.obs_dim, self.action_dim) 23 | 24 | self.global_network = global_network 25 | self.global_episode = global_episode 26 | self.global_optimizer = global_optimizer 27 | self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE 28 | 29 | # sync local networks with global 30 | self.sync_with_global() 31 | 32 | def get_action(self, state): 33 | state = torch.FloatTensor(state).to(self.device) 34 | logits, _ = self.local_network.forward(state) 35 | dist = F.softmax(logits, dim=0) 36 | probs = Categorical(dist) 37 | 38 | return probs.sample().cpu().detach().item() 39 | 40 | def compute_loss(self, trajectory): 41 | states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) 42 | actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device) 43 | rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) 44 | next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device) 45 | dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device) 46 | 47 | # compute discounted rewards 48 | discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ 49 | * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. 
50 | 51 | logits, values = self.local_network.forward(states) 52 | dists = F.softmax(logits, dim=1) 53 | probs = Categorical(dists) 54 | 55 | # compute value loss 56 | value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device) 57 | value_loss = F.mse_loss(values, value_targets.detach()) 58 | 59 | # compute entropy bonus 60 | entropy = [] 61 | for dist in dists: 62 | entropy.append(-torch.sum(dist.mean() * torch.log(dist))) 63 | entropy = torch.stack(entropy).sum() 64 | 65 | # compute policy loss 66 | advantage = value_targets - values 67 | policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach() 68 | policy_loss = policy_loss.mean() 69 | 70 | total_loss = policy_loss + value_loss - 0.001 * entropy 71 | return total_loss 72 | 73 | def update_global(self, trajectory): 74 | loss = self.compute_loss(trajectory) 75 | 76 | self.global_optimizer.zero_grad() 77 | loss.backward() 78 | # propagate local gradients to global parameters 79 | for local_params, global_params in zip(self.local_network.parameters(), self.global_network.parameters()): 80 | global_params._grad = local_params._grad 81 | self.global_optimizer.step() 82 | 83 | def sync_with_global(self): 84 | self.local_network.load_state_dict(self.global_network.state_dict()) 85 | 86 | def run(self): 87 | state = self.env.reset() 88 | trajectory = [] # [[s, a, r, s', done], [], ...] 89 | episode_reward = 0 90 | 91 | while self.global_episode.value < self.GLOBAL_MAX_EPISODE: 92 | action = self.get_action(state) 93 | next_state, reward, done, _ = self.env.step(action) 94 | trajectory.append([state, action, reward, next_state, done]) 95 | episode_reward += reward 96 | 97 | if done: 98 | with self.global_episode.get_lock(): 99 | self.global_episode.value += 1 100 | print(self.name + " | episode: "+ str(self.global_episode.value) + " " + str(episode_reward)) 101 | 102 | self.update_global(trajectory) 103 | self.sync_with_global() 104 | 105 | trajectory = [] 106 | episode_reward = 0 107 | state = self.env.reset() 108 | else: 109 | state = next_state 110 | 111 | 112 | class DecoupledWorker(mp.Process): 113 | 114 | def __init__(self, id, env, gamma, global_value_network, global_policy_network, global_value_optimizer, global_policy_optimizer, global_episode, GLOBAL_MAX_EPISODE): 115 | super(DecoupledWorker, self).__init__() 116 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 117 | self.name = "w%i" % id 118 | 119 | self.env = env 120 | self.env.seed(id) 121 | self.obs_dim = env.observation_space.shape[0] 122 | self.action_dim = env.action_space.n 123 | 124 | self.gamma = gamma 125 | self.local_value_network = ValueNetwork(self.obs_dim, 1) 126 | self.local_policy_network = PolicyNetwork(self.obs_dim , self.action_dim) 127 | 128 | self.global_value_network = global_value_network 129 | self.global_policy_network = global_policy_network 130 | self.global_episode = global_episode 131 | self.global_value_optimizer = global_value_optimizer 132 | self.global_policy_optimizer = global_policy_optimizer 133 | self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE 134 | 135 | # sync local networks with global networks 136 | self.sync_with_global() 137 | 138 | def get_action(self, state): 139 | state = torch.FloatTensor(state).to(self.device) 140 | logits = self.local_policy_network.forward(state) 141 | dist = F.softmax(logits, dim=0) 142 | probs = Categorical(dist) 143 | 144 | return probs.sample().cpu().detach().item() 145 | 146 | def compute_loss(self, 
trajectory): 147 | states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device) 148 | actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device) 149 | rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device) 150 | next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device) 151 | dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device) 152 | 153 | # compute value target 154 | discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\ 155 | * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code. 156 | value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device) 157 | 158 | # compute value loss 159 | values = self.local_value_network.forward(states) 160 | value_loss = F.mse_loss(values, value_targets.detach()) 161 | 162 | 163 | # compute policy loss with entropy bonus 164 | logits = self.local_policy_network.forward(states) 165 | dists = F.softmax(logits, dim=1) 166 | probs = Categorical(dists) 167 | 168 | # compute entropy bonus 169 | entropy = [] 170 | for dist in dists: 171 | entropy.append(-torch.sum(dist.mean() * torch.log(dist))) 172 | entropy = torch.stack(entropy).sum() 173 | 174 | advantage = value_targets - values 175 | policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach() 176 | policy_loss = policy_loss.mean() - 0.001 * entropy 177 | 178 | return value_loss, policy_loss 179 | 180 | def update_global(self, trajectory): 181 | value_loss, policy_loss = self.compute_loss(trajectory) 182 | 183 | self.global_value_optimizer.zero_grad() 184 | value_loss.backward() 185 | # propagate local gradients to global parameters 186 | for local_params, global_params in zip(self.local_value_network.parameters(), self.global_value_network.parameters()): 187 | global_params._grad = local_params._grad 188 | self.global_value_optimizer.step() 189 | 190 | self.global_policy_optimizer.zero_grad() 191 | policy_loss.backward() 192 | # propagate local gradients to global parameters 193 | for local_params, global_params in zip(self.local_policy_network.parameters(), self.global_policy_network.parameters()): 194 | global_params._grad = local_params._grad 195 | self.global_policy_optimizer.step() 196 | 197 | def sync_with_global(self): 198 | self.local_value_network.load_state_dict(self.global_value_network.state_dict()) 199 | self.local_policy_network.load_state_dict(self.global_policy_network.state_dict()) 200 | 201 | def run(self): 202 | state = self.env.reset() 203 | trajectory = [] # [[s, a, r, s', done], [], ...] 
204 | episode_reward = 0 205 | 206 | while self.global_episode.value < self.GLOBAL_MAX_EPISODE: 207 | action = self.get_action(state) 208 | next_state, reward, done, _ = self.env.step(action) 209 | trajectory.append([state, action, reward, next_state, done]) 210 | episode_reward += reward 211 | 212 | if done: 213 | with self.global_episode.get_lock(): 214 | self.global_episode.value += 1 215 | print(self.name + " | episode: "+ str(self.global_episode.value) + " " + str(episode_reward)) 216 | 217 | self.update_global(trajectory) 218 | self.sync_with_global() 219 | 220 | trajectory = [] 221 | episode_reward = 0 222 | state = self.env.reset() 223 | else: 224 | state = next_state -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/common/__init__.py -------------------------------------------------------------------------------- /common/noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.autograd as autograd 5 | 6 | import math 7 | import numpy as np 8 | 9 | 10 | class NoisyLinear(nn.Module): 11 | 12 | def __init__(self, num_in, num_out, is_training=True): 13 | super(NoisyLinear, self).__init__() 14 | self.num_in = num_in 15 | self.num_out = num_out 16 | self.is_training = is_training 17 | 18 | self.mu_weight = nn.Parameter(torch.FloatTensor(num_out, num_in)) 19 | self.mu_bias = nn.Parameter(torch.FloatTensor(num_out)) 20 | self.sigma_weight = nn.Parameter(torch.FloatTensor(num_out, num_in)) 21 | self.sigma_bias = nn.Parameter(torch.FloatTensor(num_out)) 22 | self.register_buffer("epsilon_weight", torch.FloatTensor(num_out, num_in)) 23 | self.register_buffer("epsilon_bias", torch.FloatTensor(num_out)) 24 | 25 | self.reset_parameters() 26 | self.reset_noise() 27 | 28 | def forward(self, x): 29 | self.reset_noise() 30 | 31 | if self.is_training: 32 | weight = self.mu_weight + self.sigma_weight.mul(autograd.Variable(self.epsilon_weight)) 33 | bias = self.mu_bias + self.sigma_bias.mul(autograd.Variable(self.epsilon_bias)) 34 | else: 35 | weight = self.mu_weight 36 | buas = self.mu_bias 37 | 38 | y = F.linear(x, weight, bias) 39 | 40 | return y 41 | 42 | def reset_parameters(self): 43 | std = math.sqrt(3 / self.num_in) 44 | self.mu_weight.data.uniform_(-std, std) 45 | self.mu_bias.data.uniform_(-std,std) 46 | 47 | self.sigma_weight.data.fill_(0.017) 48 | self.sigma_bias.data.fill_(0.017) 49 | 50 | def reset_noise(self): 51 | self.epsilon_weight.data.normal_() 52 | self.epsilon_bias.data.normal_() 53 | 54 | 55 | class FactorizedNoisyLinear(nn.Module): 56 | 57 | def __init__(self, num_in, num_out, is_training=True): 58 | super(FactorizedNoisyLinear, self).__init__() 59 | self.num_in = num_in 60 | self.num_out = num_out 61 | self.is_training = is_training 62 | 63 | self.mu_weight = nn.Parameter(torch.FloatTensor(num_out, num_in)) 64 | self.mu_bias = nn.Parameter(torch.FloatTensor(num_out)) 65 | self.sigma_weight = nn.Parameter(torch.FloatTensor(num_out, num_in)) 66 | self.sigma_bias = nn.Parameter(torch.FloatTensor(num_out)) 67 | self.register_buffer("epsilon_i", torch.FloatTensor(num_in)) 68 | self.register_buffer("epsilon_j", torch.FloatTensor(num_out)) 69 | 70 | self.reset_parameters() 71 | self.reset_noise() 72 | 73 | def 
forward(self, x): 74 | self.reset_noise() 75 | 76 | if self.is_training: 77 | epsilon_weight = self.epsilon_j.ger(self.epsilon_i) 78 | epsilon_bias = self.epsilon_j 79 | weight = self.mu_weight + self.sigma_weight.mul(autograd.Variable(epsilon_weight)) 80 | bias = self.mu_bias + self.sigma_bias.mul(autograd.Variable(epsilon_bias)) 81 | else: 82 | weight = self.mu_weight 83 | bias = self.mu_bias 84 | 85 | y = F.linear(x, weight, bias) 86 | 87 | return y 88 | 89 | def reset_parameters(self): 90 | std = 1 / math.sqrt(self.num_in) 91 | self.mu_weight.data.uniform_(-std, std) 92 | self.mu_bias.data.uniform_(-std, std) 93 | 94 | self.sigma_weight.data.fill_(0.5 / math.sqrt(self.num_in)) 95 | self.sigma_bias.data.fill_(0.5 / math.sqrt(self.num_in)) 96 | 97 | def reset_noise(self): 98 | eps_i = torch.randn(self.num_in) 99 | eps_j = torch.randn(self.num_out) 100 | self.epsilon_i = eps_i.sign() * (eps_i.abs()).sqrt() 101 | self.epsilon_j = eps_j.sign() * (eps_j.abs()).sqrt() 102 | 103 | # Ornstein-Ulhenbeck Noise 104 | # Taken from #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py 105 | class OUNoise(object): 106 | def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000): 107 | self.mu = mu 108 | self.theta = theta 109 | self.sigma = max_sigma 110 | self.max_sigma = max_sigma 111 | self.min_sigma = min_sigma 112 | self.decay_period = decay_period 113 | self.action_dim = action_space.shape[0] 114 | self.low = action_space.low 115 | self.high = action_space.high 116 | self.reset() 117 | 118 | def reset(self): 119 | self.state = np.ones(self.action_dim) * self.mu 120 | 121 | def evolve_state(self): 122 | x = self.state 123 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) 124 | self.state = x + dx 125 | return self.state 126 | 127 | def get_action(self, action, t=0): 128 | ou_state = self.evolve_state() 129 | self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) 130 | return np.clip(action + ou_state, self.low, self.high) -------------------------------------------------------------------------------- /common/replay_buffers.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | from collections import deque 4 | 5 | class BasicBuffer: 6 | 7 | def __init__(self, max_size): 8 | self.max_size = max_size 9 | self.buffer = deque(maxlen=max_size) 10 | 11 | def push(self, state, action, reward, next_state, done): 12 | experience = (state, action, np.array([reward]), next_state, done) 13 | self.buffer.append(experience) 14 | 15 | def sample(self, batch_size): 16 | state_batch = [] 17 | action_batch = [] 18 | reward_batch = [] 19 | next_state_batch = [] 20 | done_batch = [] 21 | 22 | batch = random.sample(self.buffer, batch_size) 23 | 24 | for experience in batch: 25 | state, action, reward, next_state, done = experience 26 | state_batch.append(state) 27 | action_batch.append(action) 28 | reward_batch.append(reward) 29 | next_state_batch.append(next_state) 30 | done_batch.append(done) 31 | 32 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch) 33 | 34 | def sample_sequence(self, batch_size): 35 | state_batch = [] 36 | action_batch = [] 37 | reward_batch = [] 38 | next_state_batch = [] 39 | done_batch = [] 40 | 41 | min_start = len(self.buffer) - batch_size 42 | start = np.random.randint(0, min_start) 43 | 44 | for sample in range(start, start + 
batch_size): 45 | experience = self.buffer[sample] 46 | state, action, reward, next_state, done = experience 47 | state_batch.append(state) 48 | action_batch.append(action) 49 | reward_batch.append(reward) 50 | next_state_batch.append(next_state) 51 | done_batch.append(done) 52 | 53 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch) 54 | 55 | def __len__(self): 56 | return len(self.buffer) 57 |
-------------------------------------------------------------------------------- /common/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import gym 4 | import torch 5 | 6 | 7 | def mini_batch_train(env, agent, max_episodes, max_steps, batch_size): 8 | episode_rewards = [] 9 | 10 | for episode in range(max_episodes): 11 | state = env.reset() 12 | episode_reward = 0 13 | 14 | for step in range(max_steps): 15 | action = agent.get_action(state) 16 | next_state, reward, done, _ = env.step(action) 17 | agent.replay_buffer.push(state, action, reward, next_state, done) 18 | episode_reward += reward 19 | 20 | if len(agent.replay_buffer) > batch_size: 21 | agent.update(batch_size) 22 | 23 | if done or step == max_steps-1: 24 | episode_rewards.append(episode_reward) 25 | print("Episode " + str(episode) + ": " + str(episode_reward)) 26 | break 27 | 28 | state = next_state 29 | 30 | return episode_rewards 31 | 32 | def mini_batch_train_frames(env, agent, max_frames, batch_size): 33 | episode_rewards = [] 34 | state = env.reset() 35 | episode_reward = 0 36 | 37 | for frame in range(max_frames): 38 | action = agent.get_action(state) 39 | next_state, reward, done, _ = env.step(action) 40 | agent.replay_buffer.push(state, action, reward, next_state, done) 41 | episode_reward += reward 42 | 43 | if len(agent.replay_buffer) > batch_size: 44 | agent.update(batch_size) 45 | 46 | if done: 47 | episode_rewards.append(episode_reward) 48 | print("Frame " + str(frame) + ": " + str(episode_reward)) 49 | state = env.reset() 50 | episode_reward = 0 51 | 52 | state = next_state 53 | 54 | return episode_rewards 55 | 56 | # process episode rewards for multiple trials 57 | def process_episode_rewards(many_episode_rewards): 58 | minimum = [np.min(episode_reward) for episode_reward in many_episode_rewards] 59 | maximum = [np.max(episode_reward) for episode_reward in many_episode_rewards] 60 | mean = [np.mean(episode_reward) for episode_reward in many_episode_rewards] 61 | 62 | return minimum, maximum, mean
-------------------------------------------------------------------------------- /ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/ddpg/__init__.py
-------------------------------------------------------------------------------- /ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | import torch.autograd as autograd 4 | import torch.nn.functional as F 5 | 6 | from models import Critic, Actor 7 | from common.replay_buffers import BasicBuffer 8 | from common.noise import OUNoise 9 | 10 | 11 | class DDPGAgent: 12 | 13 | def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate): 14 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | 16 | self.env = env 17 | self.obs_dim = env.observation_space.shape[0] 18 | self.action_dim = env.action_space.shape[0] 19 | 20 | # hyperparameters 21 | self.env = env 22 | self.gamma = gamma 23 | self.tau = tau 24 | 25 | # initialize actor and critic networks 26 | self.critic = Critic(self.obs_dim, self.action_dim).to(self.device) 27 | self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device) 28 | 29 | self.actor = Actor(self.obs_dim, self.action_dim).to(self.device) 30 | self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device) 31 | 32 | # Copy critic target parameters 33 | for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): 34 | target_param.data.copy_(param.data) 35 | 36 | # optimizers 37 | self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate) 38 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate) 39 | 40 | self.replay_buffer = BasicBuffer(buffer_maxlen) 41 | self.noise = OUNoise(self.env.action_space) 42 | 43 | def get_action(self, obs): 44 | state = torch.FloatTensor(obs).unsqueeze(0).to(self.device) 45 | action = self.actor.forward(state) 46 | action = action.squeeze(0).cpu().detach().numpy() 47 | 48 | return action 49 | 50 | def update(self, batch_size): 51 | # sample a single mini-batch of transitions (the done flags come back as masks) 52 | state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size) 53 | state_batch = torch.FloatTensor(state_batch).to(self.device) 54 | action_batch = torch.FloatTensor(action_batch).to(self.device) 55 | reward_batch = torch.FloatTensor(reward_batch).to(self.device) 56 | next_state_batch = torch.FloatTensor(next_state_batch).to(self.device) 57 | masks = torch.FloatTensor(masks).to(self.device) 58 | 59 | curr_Q = self.critic.forward(state_batch, action_batch) 60 | next_actions = self.actor_target.forward(next_state_batch) 61 | next_Q = self.critic_target.forward(next_state_batch, next_actions.detach()) 62 | expected_Q = reward_batch + self.gamma * next_Q 63 | 64 | # update critic 65 | q_loss = F.mse_loss(curr_Q, expected_Q.detach()) 66 | 67 | self.critic_optimizer.zero_grad() 68 | q_loss.backward() 69 | self.critic_optimizer.step() 70 | 71 | # update actor 72 | policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean() 73 | 74 | self.actor_optimizer.zero_grad() 75 | policy_loss.backward() 76 | self.actor_optimizer.step() 77 | 78 | # update target networks 79 | for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()): 80 | target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau)) 81 | 82 | for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): 83 | target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
-------------------------------------------------------------------------------- /ddpg/ddpg_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from common.utils import mini_batch_train 4 | from ddpg import DDPGAgent 5 | 6 | env = gym.make("Pendulum-v0") 7 | 8 | max_episodes = 100 9 | max_steps = 500 10 | batch_size = 32 11 | 12 | gamma = 0.99 13 | tau = 1e-2 14 | buffer_maxlen = 100000 15 | critic_lr = 1e-3 16 | actor_lr = 1e-3 17 | 18 | agent = DDPGAgent(env, gamma, tau, buffer_maxlen, critic_lr, actor_lr) 19 | episode_rewards = mini_batch_train(env, agent, max_episodes, max_steps, batch_size)
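DDPGAgent.get_action above returns the deterministic actor output directly, and in the code shown here the OUNoise object the agent builds is never used, so ddpg_test.py trains without exploration noise. Below is a minimal sketch of a rollout loop that injects the Ornstein-Uhlenbeck noise at action-selection time; the helper train_with_ou_noise is hypothetical (not part of the repository) and assumes the DDPGAgent and OUNoise classes shown above.

```python
import gym

from ddpg import DDPGAgent  # assumes the DDPGAgent defined in ddpg/ddpg.py above


def train_with_ou_noise(env, agent, max_episodes, max_steps, batch_size):
    """Mini-batch training loop that perturbs actions with the agent's OU noise."""
    episode_rewards = []
    for episode in range(max_episodes):
        state = env.reset()
        agent.noise.reset()  # re-centre the OU process at the start of each episode
        episode_reward = 0
        for step in range(max_steps):
            action = agent.get_action(state)
            # add temporally correlated exploration noise and clip to the action bounds
            action = agent.noise.get_action(action, t=step)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward

            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)

            if done or step == max_steps - 1:
                episode_rewards.append(episode_reward)
                print("Episode " + str(episode) + ": " + str(episode_reward))
                break

            state = next_state
    return episode_rewards


if __name__ == "__main__":
    env = gym.make("Pendulum-v0")
    agent = DDPGAgent(env, gamma=0.99, tau=1e-2, buffer_maxlen=100000,
                      critic_learning_rate=1e-3, actor_learning_rate=1e-3)
    train_with_ou_noise(env, agent, max_episodes=100, max_steps=500, batch_size=32)
```

Apart from the two noise lines, this mirrors mini_batch_train in common/utils.py, so it can be swapped in without touching the agent itself.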
-------------------------------------------------------------------------------- /ddpg/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Critic(nn.Module): 7 | 8 | def __init__(self, obs_dim, action_dim): 9 | super(Critic, self).__init__() 10 | 11 | self.obs_dim = obs_dim 12 | self.action_dim = action_dim 13 | 14 | self.linear1 = nn.Linear(self.obs_dim, 1024) 15 | self.linear2 = nn.Linear(1024 + self.action_dim, 512) 16 | self.linear3 = nn.Linear(512, 300) 17 | self.linear4 = nn.Linear(300, 1) 18 | 19 | def forward(self, x, a): 20 | x = F.relu(self.linear1(x)) 21 | xa_cat = torch.cat([x,a], 1) 22 | xa = F.relu(self.linear2(xa_cat)) 23 | xa = F.relu(self.linear3(xa)) 24 | qval = self.linear4(xa) 25 | 26 | return qval 27 | 28 | class Actor(nn.Module): 29 | 30 | def __init__(self, obs_dim, action_dim): 31 | super(Actor, self).__init__() 32 | 33 | self.obs_dim = obs_dim 34 | self.action_dim = action_dim 35 | 36 | self.linear1 = nn.Linear(self.obs_dim, 512) 37 | self.linear2 = nn.Linear(512, 128) 38 | self.linear3 = nn.Linear(128, self.action_dim) 39 | 40 | def forward(self, obs): 41 | x = F.relu(self.linear1(obs)) 42 | x = F.relu(self.linear2(x)) 43 | x = torch.tanh(self.linear3(x)) 44 | 45 | return x -------------------------------------------------------------------------------- /ipynb/A2C.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "a2c.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "qS89Jy2-00HL", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import torch\n", 26 | "import torch.nn as nn\n", 27 | "import numpy as np \n", 28 | "import torch.nn.functional as F \n", 29 | "import torch.optim as optim\n", 30 | "from torch.distributions import Categorical\n", 31 | "import gym" 32 | ], 33 | "execution_count": 0, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "Cg2Z_G6UmDZ5", 40 | "colab_type": "code", 41 | "colab": {} 42 | }, 43 | "source": [ 44 | "def run(env, agent):\n", 45 | " for episode in range(MAX_EPISODE):\n", 46 | " state = env.reset()\n", 47 | " trajectory = [] # [[s, a, r, s', done], [], ...]\n", 48 | " episode_reward = 0\n", 49 | " for steps in range(MAX_STEPS):\n", 50 | " action = agent.get_action(state)\n", 51 | " next_state, reward, done, _ = env.step(action)\n", 52 | " trajectory.append([state, action, reward, next_state, done])\n", 53 | " episode_reward += reward\n", 54 | "\n", 55 | " if done:\n", 56 | " break\n", 57 | " \n", 58 | " state = next_state\n", 59 | " if episode % 10 == 0:\n", 60 | " print(\"Episode \" + str(episode) + \": \" + str(episode_reward))\n", 61 | " agent.update(trajectory)" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "wLHVlNXDmH9i", 70 | "colab_type": "code", 71 | "colab": {} 72 | }, 73 | "source": [ 74 | "class TwoHeadNetwork(nn.Module):\n", 75 | "\n", 76 | " def __init__(self, input_dim, output_dim):\n", 77 | " super(TwoHeadNetwork, self).__init__()\n", 78 | " self.policy1 = nn.Linear(input_dim, 256) \n", 79 | " self.policy2 
= nn.Linear(256, output_dim)\n", 80 | "\n", 81 | " self.value1 = nn.Linear(input_dim, 256)\n", 82 | " self.value2 = nn.Linear(256, 1)\n", 83 | " \n", 84 | " def forward(self, state):\n", 85 | " logits = F.relu(self.policy1(state))\n", 86 | " logits = self.policy2(logits)\n", 87 | "\n", 88 | " value = F.relu(self.value1(state))\n", 89 | " value = self.value2(value)\n", 90 | "\n", 91 | " return logits, value\n", 92 | "\n", 93 | "\n", 94 | "class ValueNetwork(nn.Module):\n", 95 | "\n", 96 | " def __init__(self, input_dim, output_dim):\n", 97 | " super(ValueNetwork, self).__init__()\n", 98 | " self.fc1 = nn.Linear(input_dim, 256)\n", 99 | " self.fc2 = nn.Linear(256, output_dim)\n", 100 | "\n", 101 | " def forward(self, state):\n", 102 | " value = F.relu(self.fc1(state))\n", 103 | " value = self.fc2(value)\n", 104 | "\n", 105 | " return value\n", 106 | " \n", 107 | "\n", 108 | "class PolicyNetwork(nn.Module):\n", 109 | "\n", 110 | " def __init__(self, input_dim, output_dim):\n", 111 | " super(PolicyNetwork, self).__init__()\n", 112 | " self.fc1 = nn.Linear(input_dim, 256)\n", 113 | " self.fc2 = nn.Linear(256, output_dim)\n", 114 | " \n", 115 | " def forward(self, state):\n", 116 | " logits = F.relu(self.fc1(state))\n", 117 | " logits = self.fc2(logits)\n", 118 | "\n", 119 | " return logits" 120 | ], 121 | "execution_count": 0, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "2mvzHR62mPxU", 128 | "colab_type": "code", 129 | "colab": {} 130 | }, 131 | "source": [ 132 | "class A2CAgent():\n", 133 | "\n", 134 | " def __init__(self, env, gamma, lr):\n", 135 | " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 136 | " \n", 137 | " self.env = env\n", 138 | " self.obs_dim = env.observation_space.shape[0]\n", 139 | " self.action_dim = env.action_space.n\n", 140 | " \n", 141 | " self.gamma = gamma\n", 142 | " self.lr = lr\n", 143 | " \n", 144 | " self.model = TwoHeadNetwork(self.obs_dim, self.action_dim)\n", 145 | " self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)\n", 146 | " \n", 147 | " def get_action(self, state):\n", 148 | " state = torch.FloatTensor(state).to(self.device)\n", 149 | " logits, _ = self.model.forward(state)\n", 150 | " dist = F.softmax(logits, dim=0)\n", 151 | " probs = Categorical(dist)\n", 152 | "\n", 153 | " return probs.sample().cpu().detach().item()\n", 154 | " \n", 155 | " def compute_loss(self, trajectory):\n", 156 | " states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)\n", 157 | " actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)\n", 158 | " rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)\n", 159 | " next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)\n", 160 | " dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)\n", 161 | " \n", 162 | " # compute discounted rewards\n", 163 | " discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\\\n", 164 | " * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code.\n", 165 | " value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)\n", 166 | " \n", 167 | " logits, values = self.model.forward(states)\n", 168 | " dists = F.softmax(logits, dim=1)\n", 169 | " probs = Categorical(dists)\n", 170 | " \n", 171 | " # compute value loss\n", 172 | " value_loss = 
F.mse_loss(values, value_targets.detach())\n", 173 | " \n", 174 | " \n", 175 | " # compute entropy bonus\n", 176 | " entropy = []\n", 177 | " for dist in dists:\n", 178 | " entropy.append(-torch.sum(dist.mean() * torch.log(dist)))\n", 179 | " entropy = torch.stack(entropy).sum()\n", 180 | " \n", 181 | " # compute policy loss\n", 182 | " advantage = value_targets - values\n", 183 | " policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()\n", 184 | " policy_loss = policy_loss.mean()\n", 185 | " \n", 186 | " total_loss = policy_loss + value_loss - 0.001 * entropy \n", 187 | " return total_loss\n", 188 | " \n", 189 | " def update(self, trajectory):\n", 190 | " loss = self.compute_loss(trajectory)\n", 191 | "\n", 192 | " self.optimizer.zero_grad()\n", 193 | " loss.backward()\n", 194 | " self.optimizer.step()" 195 | ], 196 | "execution_count": 0, 197 | "outputs": [] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "metadata": { 202 | "id": "xwZeX_SVmSgZ", 203 | "colab_type": "code", 204 | "colab": {} 205 | }, 206 | "source": [ 207 | "# unlike A2CAgent in a2c.py, here I separated value and policy network.\n", 208 | "class decoupled_A2CAgent():\n", 209 | "\n", 210 | " def __init__(self, env, gamma, lr):\n", 211 | " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 212 | " \n", 213 | " self.env = env\n", 214 | " self.obs_dim = env.observation_space.shape[0]\n", 215 | " self.action_dim = env.action_space.n\n", 216 | " \n", 217 | " self.gamma = gamma\n", 218 | " self.lr = lr\n", 219 | " \n", 220 | " self.value_network = ValueNetwork(self.obs_dim, 1)\n", 221 | " self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)\n", 222 | " \n", 223 | " self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)\n", 224 | " self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)\n", 225 | " \n", 226 | " def get_action(self, state):\n", 227 | " state = torch.FloatTensor(state).to(self.device)\n", 228 | " logits = self.policy_network.forward(state)\n", 229 | " dist = F.softmax(logits, dim=0)\n", 230 | " probs = Categorical(dist)\n", 231 | "\n", 232 | " return probs.sample().cpu().detach().item()\n", 233 | " \n", 234 | " def compute_loss(self, trajectory):\n", 235 | " states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)\n", 236 | " actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)\n", 237 | " rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)\n", 238 | " next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)\n", 239 | " dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)\n", 240 | " \n", 241 | " # compute value target\n", 242 | " discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma**i for i in range(rewards[j:].size(0))])\\\n", 243 | " * rewards[j:]) for j in range(rewards.size(0))] # sorry, not the most readable code.\n", 244 | " value_targets = rewards.view(-1, 1) + torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)\n", 245 | " \n", 246 | " # compute value loss\n", 247 | " values = self.value_network.forward(states)\n", 248 | " value_loss = F.mse_loss(values, value_targets.detach())\n", 249 | " \n", 250 | " \n", 251 | " # compute policy loss with entropy bonus\n", 252 | " logits = self.policy_network.forward(states)\n", 253 | " dists = F.softmax(logits, dim=1)\n", 254 | " probs = Categorical(dists)\n", 255 | " \n", 
256 | " # compute entropy bonus\n", 257 | " entropy = []\n", 258 | " for dist in dists:\n", 259 | " entropy.append(-torch.sum(dist.mean() * torch.log(dist)))\n", 260 | " entropy = torch.stack(entropy).sum()\n", 261 | " \n", 262 | " advantage = value_targets - values\n", 263 | " policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()\n", 264 | " policy_loss = policy_loss.mean() - 0.001 * entropy\n", 265 | " \n", 266 | " return value_loss, policy_loss\n", 267 | " \n", 268 | " def update(self, trajectory):\n", 269 | " value_loss, policy_loss = self.compute_loss(trajectory)\n", 270 | "\n", 271 | " self.value_optimizer.zero_grad()\n", 272 | " value_loss.backward()\n", 273 | " self.value_optimizer.step()\n", 274 | "\n", 275 | " self.policy_optimizer.zero_grad()\n", 276 | " policy_loss.backward()\n", 277 | " self.policy_optimizer.step()" 278 | ], 279 | "execution_count": 0, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "metadata": { 285 | "id": "WYAQuxls03zP", 286 | "colab_type": "code", 287 | "colab": {} 288 | }, 289 | "source": [ 290 | "env = gym.make(\"CartPole-v0\")\n", 291 | "\n", 292 | "lr = 1e-4\n", 293 | "gamma = 0.99\n", 294 | "\n", 295 | "MAX_EPISODE = 1500\n", 296 | "MAX_STEPS = 500\n", 297 | "\n", 298 | "agent = Agent(env, gamma, lr)\n", 299 | "\n", 300 | "run(env, agent)" 301 | ], 302 | "execution_count": 0, 303 | "outputs": [] 304 | } 305 | ] 306 | } -------------------------------------------------------------------------------- /ipynb/DDPG.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "DDPG.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "YJ6-cbagZ5CT", 20 | "colab_type": "code", 21 | "colab": {} 22 | }, 23 | "source": [ 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "import torch.nn.functional as F\n", 27 | "import torch.autograd as autograd\n", 28 | "import torch.optim as optim\n", 29 | "\n", 30 | "import gym\n", 31 | "import random\n", 32 | "import numpy as np\n", 33 | "from collections import deque" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "DCYDiWHiZ8Gp", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):\n", 47 | " episode_rewards = []\n", 48 | "\n", 49 | " for episode in range(max_episodes):\n", 50 | " state = env.reset()\n", 51 | " episode_reward = 0\n", 52 | " \n", 53 | " for step in range(max_steps):\n", 54 | " action = agent.get_action(state)\n", 55 | " next_state, reward, done, _ = env.step(action)\n", 56 | " agent.replay_buffer.push(state, action, reward, next_state, done)\n", 57 | " episode_reward += reward\n", 58 | "\n", 59 | " if len(agent.replay_buffer) > batch_size:\n", 60 | " agent.update(batch_size) \n", 61 | "\n", 62 | " if done or step == max_steps-1:\n", 63 | " episode_rewards.append(episode_reward)\n", 64 | " print(\"Episode \" + str(episode) + \": \" + str(episode_reward))\n", 65 | " break\n", 66 | "\n", 67 | " state = next_state\n", 68 | "\n", 69 | " return episode_rewards" 70 | ], 71 | "execution_count": 0, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "metadata": { 77 | 
"id": "Rz0bMnzdaJyN", 78 | "colab_type": "code", 79 | "colab": {} 80 | }, 81 | "source": [ 82 | "class BasicBuffer:\n", 83 | "\n", 84 | " def __init__(self, max_size):\n", 85 | " self.max_size = max_size\n", 86 | " self.buffer = deque(maxlen=max_size)\n", 87 | "\n", 88 | " def push(self, state, action, reward, next_state, done):\n", 89 | " experience = (state, action, np.array([reward]), next_state, done)\n", 90 | " self.buffer.append(experience)\n", 91 | "\n", 92 | " def sample(self, batch_size):\n", 93 | " state_batch = []\n", 94 | " action_batch = []\n", 95 | " reward_batch = []\n", 96 | " next_state_batch = []\n", 97 | " done_batch = []\n", 98 | "\n", 99 | " batch = random.sample(self.buffer, batch_size)\n", 100 | "\n", 101 | " for experience in batch:\n", 102 | " state, action, reward, next_state, done = experience\n", 103 | " state_batch.append(state)\n", 104 | " action_batch.append(action)\n", 105 | " reward_batch.append(reward)\n", 106 | " next_state_batch.append(next_state)\n", 107 | " done_batch.append(done)\n", 108 | "\n", 109 | " return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)\n", 110 | "\n", 111 | " def __len__(self):\n", 112 | " return len(self.buffer)" 113 | ], 114 | "execution_count": 0, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "USTdUdkGaWmg", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "# Ornstein-Ulhenbeck Noise\n", 126 | "# Taken from #https://github.com/vitchyr/rlkit/blob/master/rlkit/exploration_strategies/ou_strategy.py\n", 127 | "class OUNoise(object):\n", 128 | " def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):\n", 129 | " self.mu = mu\n", 130 | " self.theta = theta\n", 131 | " self.sigma = max_sigma\n", 132 | " self.max_sigma = max_sigma\n", 133 | " self.min_sigma = min_sigma\n", 134 | " self.decay_period = decay_period\n", 135 | " self.action_dim = action_space.shape[0]\n", 136 | " self.low = action_space.low\n", 137 | " self.high = action_space.high\n", 138 | " self.reset()\n", 139 | " \n", 140 | " def reset(self):\n", 141 | " self.state = np.ones(self.action_dim) * self.mu\n", 142 | " \n", 143 | " def evolve_state(self):\n", 144 | " x = self.state\n", 145 | " dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)\n", 146 | " self.state = x + dx\n", 147 | " return self.state\n", 148 | " \n", 149 | " def get_action(self, action, t=0):\n", 150 | " ou_state = self.evolve_state()\n", 151 | " self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)\n", 152 | " return np.clip(action + ou_state, self.low, self.high)" 153 | ], 154 | "execution_count": 0, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "wYw7DI5-aAiw", 161 | "colab_type": "code", 162 | "colab": {} 163 | }, 164 | "source": [ 165 | "class Critic(nn.Module):\n", 166 | "\n", 167 | " def __init__(self, obs_dim, action_dim):\n", 168 | " super(Critic, self).__init__()\n", 169 | "\n", 170 | " self.obs_dim = obs_dim\n", 171 | " self.action_dim = action_dim\n", 172 | "\n", 173 | " self.linear1 = nn.Linear(self.obs_dim, 1024)\n", 174 | " self.linear2 = nn.Linear(1024 + self.action_dim, 512)\n", 175 | " self.linear3 = nn.Linear(512, 300)\n", 176 | " self.linear4 = nn.Linear(300, 1)\n", 177 | "\n", 178 | " def forward(self, x, a):\n", 179 | " x = F.relu(self.linear1(x))\n", 180 | " xa_cat = torch.cat([x,a], 1)\n", 181 | " xa = 
F.relu(self.linear2(xa_cat))\n", 182 | " xa = F.relu(self.linear3(xa))\n", 183 | " qval = self.linear4(xa)\n", 184 | "\n", 185 | " return qval\n", 186 | "\n", 187 | "class Actor(nn.Module):\n", 188 | "\n", 189 | " def __init__(self, obs_dim, action_dim):\n", 190 | " super(Actor, self).__init__()\n", 191 | "\n", 192 | " self.obs_dim = obs_dim\n", 193 | " self.action_dim = action_dim\n", 194 | "\n", 195 | " self.linear1 = nn.Linear(self.obs_dim, 512)\n", 196 | " self.linear2 = nn.Linear(512, 128)\n", 197 | " self.linear3 = nn.Linear(128, self.action_dim)\n", 198 | "\n", 199 | " def forward(self, obs):\n", 200 | " x = F.relu(self.linear1(obs))\n", 201 | " x = F.relu(self.linear2(x))\n", 202 | " x = torch.tanh(self.linear3(x))\n", 203 | "\n", 204 | " return x" 205 | ], 206 | "execution_count": 0, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "gADalF9XaAkv", 213 | "colab_type": "code", 214 | "colab": {} 215 | }, 216 | "source": [ 217 | "class DDPGAgent:\n", 218 | " \n", 219 | " def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):\n", 220 | " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 221 | " \n", 222 | " self.env = env\n", 223 | " self.obs_dim = env.observation_space.shape[0]\n", 224 | " self.action_dim = env.action_space.shape[0]\n", 225 | " \n", 226 | " # hyperparameters\n", 227 | " self.env = env\n", 228 | " self.gamma = gamma\n", 229 | " self.tau = tau\n", 230 | " \n", 231 | " # initialize actor and critic networks\n", 232 | " self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)\n", 233 | " self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)\n", 234 | " \n", 235 | " self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)\n", 236 | " self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)\n", 237 | " \n", 238 | " # Copy critic target parameters\n", 239 | " for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):\n", 240 | " target_param.data.copy_(param.data)\n", 241 | " \n", 242 | " # optimizers\n", 243 | " self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)\n", 244 | " self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)\n", 245 | " \n", 246 | " self.replay_buffer = BasicBuffer(buffer_maxlen) \n", 247 | " self.noise = OUNoise(self.env.action_space)\n", 248 | " \n", 249 | " def get_action(self, obs):\n", 250 | " state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)\n", 251 | " action = self.actor.forward(state)\n", 252 | " action = action.squeeze(0).cpu().detach().numpy()\n", 253 | "\n", 254 | " return action\n", 255 | " \n", 256 | " def update(self, batch_size):\n", 257 | " states, actions, rewards, next_states, _ = self.replay_buffer.sample(batch_size)\n", 258 | " state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)\n", 259 | " state_batch = torch.FloatTensor(state_batch).to(self.device)\n", 260 | " action_batch = torch.FloatTensor(action_batch).to(self.device)\n", 261 | " reward_batch = torch.FloatTensor(reward_batch).to(self.device)\n", 262 | " next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)\n", 263 | " masks = torch.FloatTensor(masks).to(self.device)\n", 264 | " \n", 265 | " curr_Q = self.critic.forward(state_batch, action_batch)\n", 266 | " next_actions = self.actor_target.forward(next_state_batch)\n", 267 | " 
next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())\n", 268 | " expected_Q = reward_batch + self.gamma * next_Q\n", 269 | " \n", 270 | " # update critic\n", 271 | " q_loss = F.mse_loss(curr_Q, expected_Q.detach())\n", 272 | "\n", 273 | " self.critic_optimizer.zero_grad()\n", 274 | " q_loss.backward() \n", 275 | " self.critic_optimizer.step()\n", 276 | "\n", 277 | " # update actor\n", 278 | " policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()\n", 279 | " \n", 280 | " self.actor_optimizer.zero_grad()\n", 281 | " policy_loss.backward()\n", 282 | " self.actor_optimizer.step()\n", 283 | "\n", 284 | " # update target networks \n", 285 | " for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):\n", 286 | " target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))\n", 287 | " \n", 288 | " for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):\n", 289 | " target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))" 290 | ], 291 | "execution_count": 0, 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "id": "ifOTkUA4aRkt", 298 | "colab_type": "code", 299 | "colab": {} 300 | }, 301 | "source": [ 302 | "env = gym.make(\"Pendulum-v0\")\n", 303 | "\n", 304 | "max_episodes = 100\n", 305 | "max_steps = 500\n", 306 | "batch_size = 32\n", 307 | "\n", 308 | "gamma = 0.99\n", 309 | "tau = 1e-2\n", 310 | "buffer_maxlen = 100000\n", 311 | "critic_lr = 1e-3\n", 312 | "actor_lr = 1e-3\n", 313 | "\n", 314 | "agent = DDPGAgent(env, gamma, tau, buffer_maxlen, critic_lr, actor_lr)\n", 315 | "episode_rewards = mini_batch_train(env, agent, max_episodes, max_steps, batch_size)" 316 | ], 317 | "execution_count": 0, 318 | "outputs": [] 319 | } 320 | ] 321 | } -------------------------------------------------------------------------------- /ipynb/SAC2018.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SAC2018.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "4Bp5fR_MSBLe", 20 | "colab_type": "code", 21 | "colab": {} 22 | }, 23 | "source": [ 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "import torch.nn.functional as F\n", 27 | "import torch.optim as optim\n", 28 | "from torch.distributions import Normal\n", 29 | "\n", 30 | "import gym\n", 31 | "import random\n", 32 | "import numpy as np\n", 33 | "from collections import deque" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "t6LsUoK3SGTR", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):\n", 47 | " episode_rewards = []\n", 48 | " update_step = 0\n", 49 | "\n", 50 | " for episode in range(max_episodes):\n", 51 | " state = env.reset()\n", 52 | " episode_reward = 0\n", 53 | "\n", 54 | " for step in range(max_steps):\n", 55 | " action = agent.get_action(state)\n", 56 | " next_state, reward, done, _ = env.step(action)\n", 57 | " agent.replay_buffer.push(state, action, reward, next_state, done)\n", 58 | " episode_reward += reward\n", 59 | "\n", 60 | " if 
len(agent.replay_buffer) > batch_size:\n", 61 | " agent.update(batch_size)\n", 62 | " update_step += 1\n", 63 | "\n", 64 | " if done or step == max_steps-1:\n", 65 | " episode_rewards.append(episode_reward)\n", 66 | " break\n", 67 | "\n", 68 | " state = next_state\n", 69 | " \n", 70 | " print(\"Episode \" + str(episode) + \": \" + str(episode_reward))\n", 71 | "\n", 72 | " return episode_rewards" 73 | ], 74 | "execution_count": 0, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "FZSveC3rSHBb", 81 | "colab_type": "code", 82 | "colab": {} 83 | }, 84 | "source": [ 85 | "class BasicBuffer:\n", 86 | "\n", 87 | " def __init__(self, max_size):\n", 88 | " self.max_size = max_size\n", 89 | " self.buffer = deque(maxlen=max_size)\n", 90 | "\n", 91 | " def push(self, state, action, reward, next_state, done):\n", 92 | " experience = (state, action, np.array([reward]), next_state, done)\n", 93 | " self.buffer.append(experience)\n", 94 | "\n", 95 | " def sample(self, batch_size):\n", 96 | " state_batch = []\n", 97 | " action_batch = []\n", 98 | " reward_batch = []\n", 99 | " next_state_batch = []\n", 100 | " done_batch = []\n", 101 | "\n", 102 | " batch = random.sample(self.buffer, batch_size)\n", 103 | "\n", 104 | " for experience in batch:\n", 105 | " state, action, reward, next_state, done = experience\n", 106 | " state_batch.append(state)\n", 107 | " action_batch.append(action)\n", 108 | " reward_batch.append(reward)\n", 109 | " next_state_batch.append(next_state)\n", 110 | " done_batch.append(done)\n", 111 | "\n", 112 | " return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)\n", 113 | "\n", 114 | " def __len__(self):\n", 115 | " return len(self.buffer)" 116 | ], 117 | "execution_count": 0, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": "VjrRWx-2Sx1p", 124 | "colab_type": "code", 125 | "colab": {} 126 | }, 127 | "source": [ 128 | "class ValueNetwork(nn.Module):\n", 129 | "\n", 130 | " def __init__(self, input_dim, output_dim, init_w=3e-3):\n", 131 | " super(ValueNetwork, self).__init__()\n", 132 | " self.fc1 = nn.Linear(input_dim, 256)\n", 133 | " self.fc2 = nn.Linear(256, 256)\n", 134 | " self.fc3 = nn.Linear(256, output_dim)\n", 135 | "\n", 136 | " self.fc3.weight.data.uniform_(-init_w, init_w)\n", 137 | " self.fc3.bias.data.uniform_(-init_w, init_w)\n", 138 | "\n", 139 | " def forward(self, state):\n", 140 | " x = F.relu(self.fc1(state))\n", 141 | " x = F.relu(self.fc2(x))\n", 142 | " x = self.fc3(x)\n", 143 | "\n", 144 | " return x\n", 145 | "\n", 146 | "\n", 147 | "class SoftQNetwork(nn.Module):\n", 148 | " \n", 149 | " def __init__(self, num_inputs, num_actions, hidden_size=256, init_w=3e-3):\n", 150 | " super(SoftQNetwork, self).__init__()\n", 151 | " self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)\n", 152 | " self.linear2 = nn.Linear(hidden_size, hidden_size)\n", 153 | " self.linear3 = nn.Linear(hidden_size, 1)\n", 154 | "\n", 155 | " self.linear3.weight.data.uniform_(-init_w, init_w)\n", 156 | " self.linear3.bias.data.uniform_(-init_w, init_w)\n", 157 | "\n", 158 | " def forward(self, state, action):\n", 159 | " x = torch.cat([state, action], 1)\n", 160 | " x = F.relu(self.linear1(x))\n", 161 | " x = F.relu(self.linear2(x))\n", 162 | " x = self.linear3(x)\n", 163 | " return x\n", 164 | "\n", 165 | "\n", 166 | "class PolicyNetwork(nn.Module):\n", 167 | " \n", 168 | " def __init__(self, num_inputs, num_actions, hidden_size=256, init_w=3e-3, log_std_min=-20, 
log_std_max=2):\n", 169 | " super(PolicyNetwork, self).__init__()\n", 170 | " self.log_std_min = log_std_min\n", 171 | " self.log_std_max = log_std_max\n", 172 | "\n", 173 | " self.linear1 = nn.Linear(num_inputs, hidden_size)\n", 174 | " self.linear2 = nn.Linear(hidden_size, hidden_size)\n", 175 | "\n", 176 | " self.mean_linear = nn.Linear(hidden_size, num_actions)\n", 177 | " self.mean_linear.weight.data.uniform_(-init_w, init_w)\n", 178 | " self.mean_linear.bias.data.uniform_(-init_w, init_w)\n", 179 | "\n", 180 | " self.log_std_linear = nn.Linear(hidden_size, num_actions)\n", 181 | " self.log_std_linear.weight.data.uniform_(-init_w, init_w)\n", 182 | " self.log_std_linear.bias.data.uniform_(-init_w, init_w)\n", 183 | "\n", 184 | " def forward(self, state):\n", 185 | " x = F.relu(self.linear1(state))\n", 186 | " x = F.relu(self.linear2(x))\n", 187 | "\n", 188 | " mean = self.mean_linear(x)\n", 189 | " log_std = self.log_std_linear(x)\n", 190 | " log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)\n", 191 | "\n", 192 | " return mean, log_std\n", 193 | "\n", 194 | " def sample(self, state, epsilon=1e-6):\n", 195 | " mean, log_std = self.forward(state)\n", 196 | " std = log_std.exp()\n", 197 | "\n", 198 | " normal = Normal(mean, std)\n", 199 | " z = normal.rsample()\n", 200 | " action = torch.tanh(z)\n", 201 | "\n", 202 | " log_pi = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)\n", 203 | " log_pi = log_pi.sum(1, keepdim=True)\n", 204 | "\n", 205 | " return action, log_pi" 206 | ], 207 | "execution_count": 0, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "metadata": { 213 | "id": "vQJka_ocSLSX", 214 | "colab_type": "code", 215 | "colab": {} 216 | }, 217 | "source": [ 218 | "class SACAgent:\n", 219 | " \n", 220 | " def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):\n", 221 | " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 222 | " \n", 223 | " self.env = env \n", 224 | " self.action_range = [env.action_space.low, env.action_space.high]\n", 225 | "\n", 226 | " # hyperparameters\n", 227 | " self.gamma = gamma\n", 228 | " self.tau = tau \n", 229 | " self.update_step = 0\n", 230 | " self.delay_step = 2\n", 231 | "\n", 232 | " # initialize networks \n", 233 | " self.value_net = ValueNetwork(env.observation_space.shape[0], 1).to(self.device)\n", 234 | " self.target_value_net = ValueNetwork(env.observation_space.shape[0], 1).to(self.device)\n", 235 | " self.q_net1 = SoftQNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 236 | " self.q_net2 = SoftQNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 237 | " self.policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 238 | " \n", 239 | " # copy params to target param\n", 240 | " for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):\n", 241 | " target_param.data.copy_(param)\n", 242 | " \n", 243 | " # initialize optimizers \n", 244 | " self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)\n", 245 | " self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)\n", 246 | " self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)\n", 247 | " self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)\n", 248 | "\n", 249 | " self.replay_buffer = BasicBuffer(buffer_maxlen)\n", 250 | " \n", 251 | " def get_action(self, state):\n", 252 
| " state = torch.FloatTensor(state).unsqueeze(0).to(self.device)\n", 253 | " mean, log_std = self.policy_net.forward(state)\n", 254 | " std = log_std.exp()\n", 255 | " \n", 256 | " normal = Normal(mean, std)\n", 257 | " z = normal.sample()\n", 258 | " action = torch.tanh(z)\n", 259 | " action = action.cpu().detach().squeeze(0).numpy()\n", 260 | " \n", 261 | " return self.rescale_action(action)\n", 262 | " \n", 263 | " def rescale_action(self, action):\n", 264 | " return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\\\n", 265 | " (self.action_range[1] + self.action_range[0]) / 2.0\n", 266 | " \n", 267 | " def update(self, batch_size):\n", 268 | " states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)\n", 269 | " states = torch.FloatTensor(states).to(self.device)\n", 270 | " actions = torch.FloatTensor(actions).to(self.device)\n", 271 | " rewards = torch.FloatTensor(rewards).to(self.device)\n", 272 | " next_states = torch.FloatTensor(next_states).to(self.device)\n", 273 | " dones = torch.FloatTensor(dones).to(self.device)\n", 274 | " dones = dones.view(dones.size(0), -1)\n", 275 | " \n", 276 | " next_actions, next_log_pi = self.policy_net.sample(next_states)\n", 277 | " next_q1 = self.q_net1(next_states, next_actions)\n", 278 | " next_q2 = self.q_net2(next_states, next_actions)\n", 279 | " next_v = self.target_value_net(next_states)\n", 280 | " \n", 281 | " # value Loss\n", 282 | " next_v_target = torch.min(next_q1, next_q2) - next_log_pi\n", 283 | " curr_v = self.value_net.forward(states)\n", 284 | " v_loss = F.mse_loss(curr_v, next_v_target.detach())\n", 285 | " \n", 286 | " # q loss\n", 287 | " curr_q1 = self.q_net1.forward(states, actions)\n", 288 | " curr_q2 = self.q_net2.forward(states, actions)\n", 289 | " expected_q = rewards + (1 - dones) * self.gamma * next_v\n", 290 | " q1_loss = F.mse_loss(curr_q1, expected_q.detach())\n", 291 | " q2_loss = F.mse_loss(curr_q2, expected_q.detach())\n", 292 | " \n", 293 | " # update value network and q networks\n", 294 | " self.value_optimizer.zero_grad()\n", 295 | " v_loss.backward()\n", 296 | " self.value_optimizer.step()\n", 297 | " \n", 298 | " self.q1_optimizer.zero_grad()\n", 299 | " q1_loss.backward()\n", 300 | " self.q1_optimizer.step()\n", 301 | " \n", 302 | " self.q2_optimizer.zero_grad()\n", 303 | " q2_loss.backward()\n", 304 | " self.q2_optimizer.step()\n", 305 | " \n", 306 | " #delayed update for policy net and target value nets\n", 307 | " if self.update_step % self.delay_step == 0:\n", 308 | " new_actions, log_pi = self.policy_net.sample(states)\n", 309 | " min_q = torch.min(\n", 310 | " self.q_net1.forward(states, new_actions),\n", 311 | " self.q_net2.forward(states, new_actions)\n", 312 | " )\n", 313 | " policy_loss = (log_pi - min_q).mean()\n", 314 | " \n", 315 | " self.policy_optimizer.zero_grad()\n", 316 | " policy_loss.backward()\n", 317 | " self.policy_optimizer.step()\n", 318 | " \n", 319 | " # target networks\n", 320 | " for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):\n", 321 | " target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)\n", 322 | " \n", 323 | " self.update_step += 1" 324 | ], 325 | "execution_count": 0, 326 | "outputs": [] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "metadata": { 331 | "id": "1wWTGaoGSbWS", 332 | "colab_type": "code", 333 | "colab": {} 334 | }, 335 | "source": [ 336 | "env = gym.make(\"Pendulum-v0\")\n", 337 | "\n", 338 | "# SAC 2018 Params\n", 339 | "tau = 0.005\n", 340 | 
"gamma = 0.99\n", 341 | "value_lr = 3e-3\n", 342 | "q_lr = 3e-3\n", 343 | "policy_lr = 3e-3\n", 344 | "buffer_maxlen = 1000000\n", 345 | "\n", 346 | "# 2018 agent\n", 347 | "agent = SACAgent(env, gamma, tau, value_lr, q_lr, policy_lr, buffer_maxlen)\n", 348 | "\n", 349 | "# train\n", 350 | "episode_rewards = mini_batch_train(env, agent, 50, 500, 64)" 351 | ], 352 | "execution_count": 0, 353 | "outputs": [] 354 | } 355 | ] 356 | } -------------------------------------------------------------------------------- /ipynb/SAC2019.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SAC2019.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "metadata": { 20 | "id": "l38j4LVlQk1l", 21 | "colab_type": "code", 22 | "colab": {} 23 | }, 24 | "source": [ 25 | "import torch\n", 26 | "import torch.nn as nn\n", 27 | "import torch.nn.functional as F\n", 28 | "import torch.optim as optim\n", 29 | "from torch.distributions import Normal\n", 30 | "\n", 31 | "import gym\n", 32 | "import random\n", 33 | "import numpy as np\n", 34 | "from collections import deque" 35 | ], 36 | "execution_count": 0, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "G29sGsAVQl-s", 43 | "colab_type": "code", 44 | "colab": {} 45 | }, 46 | "source": [ 47 | "def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):\n", 48 | " episode_rewards = []\n", 49 | " update_step = 0\n", 50 | "\n", 51 | " for episode in range(max_episodes):\n", 52 | " state = env.reset()\n", 53 | " episode_reward = 0\n", 54 | "\n", 55 | " for step in range(max_steps):\n", 56 | " action = agent.get_action(state)\n", 57 | " next_state, reward, done, _ = env.step(action)\n", 58 | " agent.replay_buffer.push(state, action, reward, next_state, done)\n", 59 | " episode_reward += reward\n", 60 | "\n", 61 | " if len(agent.replay_buffer) > batch_size:\n", 62 | " agent.update(batch_size)\n", 63 | " update_step += 1\n", 64 | "\n", 65 | " if done or step == max_steps-1:\n", 66 | " episode_rewards.append(episode_reward)\n", 67 | " break\n", 68 | "\n", 69 | " state = next_state\n", 70 | " \n", 71 | " print(\"Episode \" + str(episode) + \": \" + str(episode_reward))\n", 72 | "\n", 73 | " return episode_rewards" 74 | ], 75 | "execution_count": 0, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "Q7gOX2DjREI5", 82 | "colab_type": "code", 83 | "colab": {} 84 | }, 85 | "source": [ 86 | "class BasicBuffer:\n", 87 | "\n", 88 | " def __init__(self, max_size):\n", 89 | " self.max_size = max_size\n", 90 | " self.buffer = deque(maxlen=max_size)\n", 91 | "\n", 92 | " def push(self, state, action, reward, next_state, done):\n", 93 | " experience = (state, action, np.array([reward]), next_state, done)\n", 94 | " self.buffer.append(experience)\n", 95 | "\n", 96 | " def sample(self, batch_size):\n", 97 | " state_batch = []\n", 98 | " action_batch = []\n", 99 | " reward_batch = []\n", 100 | " next_state_batch = []\n", 101 | " done_batch = []\n", 102 | "\n", 103 | " batch = random.sample(self.buffer, batch_size)\n", 104 | "\n", 105 | " for experience in batch:\n", 106 | " state, action, reward, next_state, done = experience\n", 107 | " state_batch.append(state)\n", 108 | " 
action_batch.append(action)\n", 109 | " reward_batch.append(reward)\n", 110 | " next_state_batch.append(next_state)\n", 111 | " done_batch.append(done)\n", 112 | "\n", 113 | " return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)\n", 114 | "\n", 115 | " def __len__(self):\n", 116 | " return len(self.buffer)" 117 | ], 118 | "execution_count": 0, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "FuOsdm6hRQp2", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "source": [ 129 | "class ValueNetwork(nn.Module):\n", 130 | "\n", 131 | " def __init__(self, input_dim, output_dim, init_w=3e-3):\n", 132 | " super(ValueNetwork, self).__init__()\n", 133 | " self.fc1 = nn.Linear(input_dim, 256)\n", 134 | " self.fc2 = nn.Linear(256, 256)\n", 135 | " self.fc3 = nn.Linear(256, output_dim)\n", 136 | "\n", 137 | " self.fc3.weight.data.uniform_(-init_w, init_w)\n", 138 | " self.fc3.bias.data.uniform_(-init_w, init_w)\n", 139 | "\n", 140 | " def forward(self, state):\n", 141 | " x = F.relu(self.fc1(state))\n", 142 | " x = F.relu(self.fc2(x))\n", 143 | " x = self.fc3(x)\n", 144 | "\n", 145 | " return x\n", 146 | "\n", 147 | "\n", 148 | "class SoftQNetwork(nn.Module):\n", 149 | " \n", 150 | " def __init__(self, num_inputs, num_actions, hidden_size=256, init_w=3e-3):\n", 151 | " super(SoftQNetwork, self).__init__()\n", 152 | " self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)\n", 153 | " self.linear2 = nn.Linear(hidden_size, hidden_size)\n", 154 | " self.linear3 = nn.Linear(hidden_size, 1)\n", 155 | "\n", 156 | " self.linear3.weight.data.uniform_(-init_w, init_w)\n", 157 | " self.linear3.bias.data.uniform_(-init_w, init_w)\n", 158 | "\n", 159 | " def forward(self, state, action):\n", 160 | " x = torch.cat([state, action], 1)\n", 161 | " x = F.relu(self.linear1(x))\n", 162 | " x = F.relu(self.linear2(x))\n", 163 | " x = self.linear3(x)\n", 164 | " return x\n", 165 | "\n", 166 | "\n", 167 | "class PolicyNetwork(nn.Module):\n", 168 | " \n", 169 | " def __init__(self, num_inputs, num_actions, hidden_size=256, init_w=3e-3, log_std_min=-20, log_std_max=2):\n", 170 | " super(PolicyNetwork, self).__init__()\n", 171 | " self.log_std_min = log_std_min\n", 172 | " self.log_std_max = log_std_max\n", 173 | "\n", 174 | " self.linear1 = nn.Linear(num_inputs, hidden_size)\n", 175 | " self.linear2 = nn.Linear(hidden_size, hidden_size)\n", 176 | "\n", 177 | " self.mean_linear = nn.Linear(hidden_size, num_actions)\n", 178 | " self.mean_linear.weight.data.uniform_(-init_w, init_w)\n", 179 | " self.mean_linear.bias.data.uniform_(-init_w, init_w)\n", 180 | "\n", 181 | " self.log_std_linear = nn.Linear(hidden_size, num_actions)\n", 182 | " self.log_std_linear.weight.data.uniform_(-init_w, init_w)\n", 183 | " self.log_std_linear.bias.data.uniform_(-init_w, init_w)\n", 184 | "\n", 185 | " def forward(self, state):\n", 186 | " x = F.relu(self.linear1(state))\n", 187 | " x = F.relu(self.linear2(x))\n", 188 | "\n", 189 | " mean = self.mean_linear(x)\n", 190 | " log_std = self.log_std_linear(x)\n", 191 | " log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)\n", 192 | "\n", 193 | " return mean, log_std\n", 194 | "\n", 195 | " def sample(self, state, epsilon=1e-6):\n", 196 | " mean, log_std = self.forward(state)\n", 197 | " std = log_std.exp()\n", 198 | "\n", 199 | " normal = Normal(mean, std)\n", 200 | " z = normal.rsample()\n", 201 | " action = torch.tanh(z)\n", 202 | "\n", 203 | " log_pi = normal.log_prob(z) - torch.log(1 - 
action.pow(2) + epsilon)\n", 204 | " log_pi = log_pi.sum(1, keepdim=True)\n", 205 | "\n", 206 | " return action, log_pi" 207 | ], 208 | "execution_count": 0, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "metadata": { 214 | "id": "1wiYqqOVQmM2", 215 | "colab_type": "code", 216 | "colab": {} 217 | }, 218 | "source": [ 219 | "class SACAgent:\n", 220 | " \n", 221 | " def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen):\n", 222 | " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 223 | " \n", 224 | " self.env = env\n", 225 | " self.action_range = [env.action_space.low, env.action_space.high]\n", 226 | "\n", 227 | " # hyperparameters\n", 228 | " self.gamma = gamma\n", 229 | " self.tau = tau\n", 230 | " self.update_step = 0\n", 231 | " self.delay_step = 2\n", 232 | " \n", 233 | " # initialize networks \n", 234 | " self.q_net1 = SoftQNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 235 | " self.q_net2 = SoftQNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 236 | " self.target_q_net1 = SoftQNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 237 | " self.target_q_net2 = SoftQNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 238 | " self.policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.shape[0]).to(self.device)\n", 239 | "\n", 240 | " # copy params to target param\n", 241 | " for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):\n", 242 | " target_param.data.copy_(param)\n", 243 | "\n", 244 | " for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):\n", 245 | " target_param.data.copy_(param)\n", 246 | "\n", 247 | " # initialize optimizers \n", 248 | " self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)\n", 249 | " self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)\n", 250 | " self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)\n", 251 | "\n", 252 | " # entropy temperature\n", 253 | " self.alpha = alpha\n", 254 | " self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape).to(self.device)).item()\n", 255 | " self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)\n", 256 | " self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)\n", 257 | "\n", 258 | " self.replay_buffer = BasicBuffer(buffer_maxlen)\n", 259 | "\n", 260 | " def get_action(self, state):\n", 261 | " state = torch.FloatTensor(state).unsqueeze(0).to(self.device)\n", 262 | " mean, log_std = self.policy_net.forward(state)\n", 263 | " std = log_std.exp()\n", 264 | " \n", 265 | " normal = Normal(mean, std)\n", 266 | " z = normal.sample()\n", 267 | " action = torch.tanh(z)\n", 268 | " action = action.cpu().detach().squeeze(0).numpy()\n", 269 | " \n", 270 | " return self.rescale_action(action)\n", 271 | " \n", 272 | " def rescale_action(self, action):\n", 273 | " return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\\\n", 274 | " (self.action_range[1] + self.action_range[0]) / 2.0\n", 275 | " \n", 276 | " def update(self, batch_size):\n", 277 | " states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)\n", 278 | " states = torch.FloatTensor(states).to(self.device)\n", 279 | " actions = torch.FloatTensor(actions).to(self.device)\n", 280 | " rewards = 
torch.FloatTensor(rewards).to(self.device)\n", 281 | " next_states = torch.FloatTensor(next_states).to(self.device)\n", 282 | " dones = torch.FloatTensor(dones).to(self.device)\n", 283 | " dones = dones.view(dones.size(0), -1)\n", 284 | " \n", 285 | " next_actions, next_log_pi = self.policy_net.sample(next_states)\n", 286 | " next_q1 = self.target_q_net1(next_states, next_actions)\n", 287 | " next_q2 = self.target_q_net2(next_states, next_actions)\n", 288 | " next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi\n", 289 | " expected_q = rewards + (1 - dones) * self.gamma * next_q_target\n", 290 | "\n", 291 | " # q loss\n", 292 | " curr_q1 = self.q_net1.forward(states, actions)\n", 293 | " curr_q2 = self.q_net2.forward(states, actions) \n", 294 | " q1_loss = F.mse_loss(curr_q1, expected_q.detach())\n", 295 | " q2_loss = F.mse_loss(curr_q2, expected_q.detach())\n", 296 | "\n", 297 | " # update q networks \n", 298 | " self.q1_optimizer.zero_grad()\n", 299 | " q1_loss.backward()\n", 300 | " self.q1_optimizer.step()\n", 301 | " \n", 302 | " self.q2_optimizer.zero_grad()\n", 303 | " q2_loss.backward()\n", 304 | " self.q2_optimizer.step()\n", 305 | " \n", 306 | " # delayed update for policy network and target q networks\n", 307 | " new_actions, log_pi = self.policy_net.sample(states)\n", 308 | " if self.update_step % self.delay_step == 0:\n", 309 | " min_q = torch.min(\n", 310 | " self.q_net1.forward(states, new_actions),\n", 311 | " self.q_net2.forward(states, new_actions)\n", 312 | " )\n", 313 | " policy_loss = (self.alpha * log_pi - min_q).mean()\n", 314 | " \n", 315 | " self.policy_optimizer.zero_grad()\n", 316 | " policy_loss.backward()\n", 317 | " self.policy_optimizer.step()\n", 318 | " \n", 319 | " # target networks\n", 320 | " for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):\n", 321 | " target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)\n", 322 | "\n", 323 | " for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):\n", 324 | " target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)\n", 325 | "\n", 326 | " # update temperature\n", 327 | " alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean()\n", 328 | "\n", 329 | " self.alpha_optim.zero_grad()\n", 330 | " alpha_loss.backward()\n", 331 | " self.alpha_optim.step()\n", 332 | " self.alpha = self.log_alpha.exp()\n", 333 | "\n", 334 | " self.update_step += 1" 335 | ], 336 | "execution_count": 0, 337 | "outputs": [] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "metadata": { 342 | "id": "OiHtp87wRT5Z", 343 | "colab_type": "code", 344 | "colab": {} 345 | }, 346 | "source": [ 347 | "env = gym.make(\"Pendulum-v0\")\n", 348 | "\n", 349 | "# SAC 2019 Params\n", 350 | "gamma = 0.99\n", 351 | "tau = 0.01\n", 352 | "alpha = 0.2\n", 353 | "a_lr = 3e-4\n", 354 | "q_lr = 3e-4\n", 355 | "p_lr = 3e-4\n", 356 | "buffer_maxlen = 1000000\n", 357 | "\n", 358 | "# 2019 agent\n", 359 | "agent = SACAgent(env, gamma, tau, alpha, q_lr, p_lr, a_lr, buffer_maxlen)\n", 360 | "\n", 361 | "# train\n", 362 | "episode_rewards = mini_batch_train(env, agent, 50, 500, 64)" 363 | ], 364 | "execution_count": 0, 365 | "outputs": [] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "metadata": { 370 | "id": "7isB3QMpRfTL", 371 | "colab_type": "code", 372 | "colab": {} 373 | }, 374 | "source": [ 375 | "" 376 | ], 377 | "execution_count": 0, 378 | "outputs": [] 379 | } 380 | ] 381 | } 
-------------------------------------------------------------------------------- /ipynb/TD3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "TD3.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "eFY4nKwHXR7x", 20 | "colab_type": "code", 21 | "colab": {} 22 | }, 23 | "source": [ 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "import torch.nn.functional as F\n", 27 | "import torch.autograd as autograd\n", 28 | "import torch.optim as optim\n", 29 | "\n", 30 | "import gym\n", 31 | "import random\n", 32 | "import numpy as np\n", 33 | "from collections import deque" 34 | ], 35 | "execution_count": 0, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "TVXINdceXXVt", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):\n", 47 | " episode_rewards = []\n", 48 | "\n", 49 | " for episode in range(max_episodes):\n", 50 | " state = env.reset()\n", 51 | " episode_reward = 0\n", 52 | " \n", 53 | " for step in range(max_steps):\n", 54 | " action = agent.get_action(state)\n", 55 | " next_state, reward, done, _ = env.step(action)\n", 56 | " agent.replay_buffer.push(state, action, reward, next_state, done)\n", 57 | " episode_reward += reward\n", 58 | "\n", 59 | " if len(agent.replay_buffer) > batch_size:\n", 60 | " agent.update(batch_size) \n", 61 | "\n", 62 | " if done or step == max_steps-1:\n", 63 | " episode_rewards.append(episode_reward)\n", 64 | " print(\"Episode \" + str(episode) + \": \" + str(episode_reward))\n", 65 | " break\n", 66 | "\n", 67 | " state = next_state" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "Qex2Y_nAXsIN", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "class BasicBuffer:\n", 81 | "\n", 82 | " def __init__(self, max_size):\n", 83 | " self.max_size = max_size\n", 84 | " self.buffer = deque(maxlen=max_size)\n", 85 | "\n", 86 | " def push(self, state, action, reward, next_state, done):\n", 87 | " experience = (state, action, np.array([reward]), next_state, done)\n", 88 | " self.buffer.append(experience)\n", 89 | "\n", 90 | " def sample(self, batch_size):\n", 91 | " state_batch = []\n", 92 | " action_batch = []\n", 93 | " reward_batch = []\n", 94 | " next_state_batch = []\n", 95 | " done_batch = []\n", 96 | "\n", 97 | " batch = random.sample(self.buffer, batch_size)\n", 98 | "\n", 99 | " for experience in batch:\n", 100 | " state, action, reward, next_state, done = experience\n", 101 | " state_batch.append(state)\n", 102 | " action_batch.append(action)\n", 103 | " reward_batch.append(reward)\n", 104 | " next_state_batch.append(next_state)\n", 105 | " done_batch.append(done)\n", 106 | "\n", 107 | " return (state_batch, action_batch, reward_batch, next_state_batch, done_batch)\n", 108 | "\n", 109 | " def __len__(self):\n", 110 | " return len(self.buffer)" 111 | ], 112 | "execution_count": 0, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "id": "rPpQ_mNqXXft", 119 | "colab_type": "code", 120 | "colab": {} 121 | }, 122 | "source": [ 123 | "class Critic(nn.Module):\n", 124 | "\n", 125 | 
" def __init__(self, obs_dim, action_dim):\n", 126 | " super(Critic, self).__init__()\n", 127 | "\n", 128 | " self.obs_dim = obs_dim\n", 129 | " self.action_dim = action_dim\n", 130 | "\n", 131 | " self.linear1 = nn.Linear(self.obs_dim, 1024)\n", 132 | " self.linear2 = nn.Linear(1024 + self.action_dim, 512)\n", 133 | " self.linear3 = nn.Linear(512, 300)\n", 134 | " self.linear4 = nn.Linear(300, 1)\n", 135 | "\n", 136 | " def forward(self, x, a):\n", 137 | " x = F.relu(self.linear1(x))\n", 138 | " xa_cat = torch.cat([x,a], 1)\n", 139 | " xa = F.relu(self.linear2(xa_cat))\n", 140 | " xa = F.relu(self.linear3(xa))\n", 141 | " qval = self.linear4(xa)\n", 142 | "\n", 143 | " return qval\n", 144 | "\n", 145 | "class Actor(nn.Module):\n", 146 | "\n", 147 | " def __init__(self, obs_dim, action_dim):\n", 148 | " super(Actor, self).__init__()\n", 149 | "\n", 150 | " self.obs_dim = obs_dim\n", 151 | " self.action_dim = action_dim\n", 152 | "\n", 153 | " self.linear1 = nn.Linear(self.obs_dim, 512)\n", 154 | " self.linear2 = nn.Linear(512, 128)\n", 155 | " self.linear3 = nn.Linear(128, self.action_dim)\n", 156 | "\n", 157 | " def forward(self, obs):\n", 158 | " x = F.relu(self.linear1(obs))\n", 159 | " x = F.relu(self.linear2(x))\n", 160 | " x = torch.tanh(self.linear3(x))\n", 161 | "\n", 162 | " return x" 163 | ], 164 | "execution_count": 0, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "metadata": { 170 | "id": "f0mCWDBUXXii", 171 | "colab_type": "code", 172 | "colab": {} 173 | }, 174 | "source": [ 175 | "class TD3Agent:\n", 176 | "\n", 177 | " def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std, noise_bound, critic_lr, actor_lr):\n", 178 | " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 179 | " \n", 180 | " self.env = env\n", 181 | " self.obs_dim = env.observation_space.shape[0]\n", 182 | " self.action_dim = env.action_space.shape[0]\n", 183 | "\n", 184 | " # hyperparameters \n", 185 | " self.gamma = gamma\n", 186 | " self.tau = tau\n", 187 | " self.noise_std = noise_std\n", 188 | " self.noise_bound = noise_bound\n", 189 | " self.update_step = 0 \n", 190 | " self.delay_step = delay_step\n", 191 | " \n", 192 | " # initialize actor and critic networks\n", 193 | " self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)\n", 194 | " self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)\n", 195 | " self.critic1_target = Critic(self.obs_dim, self.action_dim).to(self.device)\n", 196 | " self.critic2_target = Critic(self.obs_dim, self.action_dim).to(self.device)\n", 197 | " \n", 198 | " self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)\n", 199 | " self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)\n", 200 | " \n", 201 | " # Copy critic target parameters\n", 202 | " for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):\n", 203 | " target_param.data.copy_(param.data)\n", 204 | "\n", 205 | " for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):\n", 206 | " target_param.data.copy_(param.data)\n", 207 | "\n", 208 | " # initialize optimizers \n", 209 | " self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=critic_lr)\n", 210 | " self.critic2_optimizer = optim.Adam(self.critic1.parameters(), lr=critic_lr) \n", 211 | " self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)\n", 212 | " \n", 213 | " self.replay_buffer = BasicBuffer(buffer_maxlen) \n", 214 | "\n", 215 
| " def get_action(self, obs):\n", 216 | " state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)\n", 217 | " action = self.actor.forward(state)\n", 218 | " action = action.squeeze(0).cpu().detach().numpy()\n", 219 | "\n", 220 | " return action\n", 221 | " \n", 222 | " def update(self, batch_size):\n", 223 | " state_batch, action_batch, reward_batch, next_state_batch, masks = self.replay_buffer.sample(batch_size)\n", 224 | " state_batch = torch.FloatTensor(state_batch).to(self.device)\n", 225 | " action_batch = torch.FloatTensor(action_batch).to(self.device)\n", 226 | " reward_batch = torch.FloatTensor(reward_batch).to(self.device)\n", 227 | " next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)\n", 228 | " masks = torch.FloatTensor(masks).to(self.device)\n", 229 | " \n", 230 | " action_space_noise = self.generate_action_space_noise(action_batch)\n", 231 | " next_actions = self.actor.forward(state_batch) + action_space_noise\n", 232 | " next_Q1 = self.critic1_target.forward(next_state_batch, next_actions)\n", 233 | " next_Q2 = self.critic2_target.forward(next_state_batch, next_actions)\n", 234 | " expected_Q = reward_batch + self.gamma * torch.min(next_Q1, next_Q2)\n", 235 | "\n", 236 | " # critic loss\n", 237 | " curr_Q1 = self.critic1.forward(state_batch, action_batch)\n", 238 | " curr_Q2 = self.critic2.forward(state_batch, action_batch)\n", 239 | " critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())\n", 240 | " critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())\n", 241 | " \n", 242 | " # update critics\n", 243 | " self.critic1_optimizer.zero_grad()\n", 244 | " critic1_loss.backward()\n", 245 | " self.critic1_optimizer.step()\n", 246 | "\n", 247 | " self.critic2_optimizer.zero_grad()\n", 248 | " critic2_loss.backward()\n", 249 | " self.critic2_optimizer.step()\n", 250 | "\n", 251 | " # delyaed update for actor & target networks \n", 252 | " if(self.update_step % self.delay_step == 0):\n", 253 | " # actor\n", 254 | " self.actor_optimizer.zero_grad()\n", 255 | " policy_gradient = -self.critic1(state_batch, self.actor(state_batch)).mean()\n", 256 | " policy_gradient.backward()\n", 257 | " self.actor_optimizer.step()\n", 258 | "\n", 259 | " # target networks\n", 260 | " self.update_targets()\n", 261 | "\n", 262 | " self.update_step += 1\n", 263 | "\n", 264 | " def generate_action_space_noise(self, action_batch):\n", 265 | " noise = torch.normal(torch.zeros(action_batch.size()), self.noise_std).clamp(-self.noise_bound, self.noise_bound).to(self.device)\n", 266 | " return noise\n", 267 | "\n", 268 | " def update_targets(self):\n", 269 | " for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):\n", 270 | " target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))\n", 271 | "\n", 272 | " for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):\n", 273 | " target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))\n", 274 | " \n", 275 | " for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):\n", 276 | " target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))" 277 | ], 278 | "execution_count": 0, 279 | "outputs": [] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "metadata": { 284 | "id": "sn9QzzHoX3qs", 285 | "colab_type": "code", 286 | "colab": {} 287 | }, 288 | "source": [ 289 | "env = gym.make(\"Pendulum-v0\")\n", 290 | "gamma = 0.99\n", 291 | "tau = 1e-2\n", 292 | 
"noise_std = 0.2\n", 293 | "bound = 0.5\n", 294 | "delay_step = 2\n", 295 | "buffer_maxlen = 100000\n", 296 | "critic_lr = 1e-3\n", 297 | "actor_lr = 1e-3\n", 298 | "\n", 299 | "max_episodes = 100\n", 300 | "max_steps = 500\n", 301 | "batch_size = 32\n", 302 | "\n", 303 | "agent = TD3Agent(env, gamma, tau, buffer_maxlen, delay_step, noise_std, bound, critic_lr, actor_lr)\n", 304 | "episode_rewards = mini_batch_train(env, agent, 50, 500, 64)" 305 | ], 306 | "execution_count": 0, 307 | "outputs": [] 308 | } 309 | ] 310 | } -------------------------------------------------------------------------------- /sac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/sac/__init__.py -------------------------------------------------------------------------------- /sac/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | 6 | 7 | class ValueNetwork(nn.Module): 8 | 9 | def __init__(self, input_dim, output_dim, init_w=3e-3): 10 | super(ValueNetwork, self).__init__() 11 | self.fc1 = nn.Linear(input_dim, 256) 12 | self.fc2 = nn.Linear(256, 256) 13 | self.fc3 = nn.Linear(256, output_dim) 14 | 15 | self.fc3.weight.data.uniform_(-init_w, init_w) 16 | self.fc3.bias.data.uniform_(-init_w, init_w) 17 | 18 | def forward(self, state): 19 | x = F.relu(self.fc1(state)) 20 | x = F.relu(self.fc2(x)) 21 | x = self.fc3(x) 22 | 23 | return x 24 | 25 | 26 | class SoftQNetwork(nn.Module): 27 | 28 | def __init__(self, num_inputs, num_actions, hidden_size=256, init_w=3e-3): 29 | super(SoftQNetwork, self).__init__() 30 | self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) 31 | self.linear2 = nn.Linear(hidden_size, hidden_size) 32 | self.linear3 = nn.Linear(hidden_size, 1) 33 | 34 | self.linear3.weight.data.uniform_(-init_w, init_w) 35 | self.linear3.bias.data.uniform_(-init_w, init_w) 36 | 37 | def forward(self, state, action): 38 | x = torch.cat([state, action], 1) 39 | x = F.relu(self.linear1(x)) 40 | x = F.relu(self.linear2(x)) 41 | x = self.linear3(x) 42 | return x 43 | 44 | 45 | class PolicyNetwork(nn.Module): 46 | 47 | def __init__(self, num_inputs, num_actions, hidden_size=256, init_w=3e-3, log_std_min=-20, log_std_max=2): 48 | super(PolicyNetwork, self).__init__() 49 | self.log_std_min = log_std_min 50 | self.log_std_max = log_std_max 51 | 52 | self.linear1 = nn.Linear(num_inputs, hidden_size) 53 | self.linear2 = nn.Linear(hidden_size, hidden_size) 54 | 55 | self.mean_linear = nn.Linear(hidden_size, num_actions) 56 | self.mean_linear.weight.data.uniform_(-init_w, init_w) 57 | self.mean_linear.bias.data.uniform_(-init_w, init_w) 58 | 59 | self.log_std_linear = nn.Linear(hidden_size, num_actions) 60 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 61 | self.log_std_linear.bias.data.uniform_(-init_w, init_w) 62 | 63 | def forward(self, state): 64 | x = F.relu(self.linear1(state)) 65 | x = F.relu(self.linear2(x)) 66 | 67 | mean = self.mean_linear(x) 68 | log_std = self.log_std_linear(x) 69 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 70 | 71 | return mean, log_std 72 | 73 | def sample(self, state, epsilon=1e-6): 74 | mean, log_std = self.forward(state) 75 | std = log_std.exp() 76 | 77 | normal = Normal(mean, std) 78 | z = normal.rsample() 79 | action = torch.tanh(z) 
80 | 81 | log_pi = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon) 82 | log_pi = log_pi.sum(1, keepdim=True) 83 | 84 | return action, log_pi -------------------------------------------------------------------------------- /sac/sac2018.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | from torch.distributions import Normal 5 | 6 | from models import ValueNetwork, SoftQNetwork, PolicyNetwork 7 | from common.replay_buffers import BasicBuffer 8 | 9 | 10 | class SACAgent: 11 | 12 | def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen): 13 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | 15 | self.env = env 16 | self.action_range = [env.action_space.low, env.action_space.high] 17 | self.obs_dim = env.observation_space.shape[0] 18 | self.action_dim = env.action_space.shape[0] 19 | 20 | # hyperparameters 21 | self.gamma = gamma 22 | self.tau = tau 23 | self.update_step = 0 24 | self.delay_step = 2 25 | 26 | # initialize networks 27 | self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device) 28 | self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device) 29 | self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) 30 | self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) 31 | self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) 32 | 33 | # copy params to target param 34 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 35 | target_param.data.copy_(param) 36 | 37 | # initialize optimizers 38 | self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr) 39 | self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) 40 | self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) 41 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) 42 | 43 | self.replay_buffer = BasicBuffer(buffer_maxlen) 44 | 45 | def get_action(self, state): 46 | state = torch.FloatTensor(state).unsqueeze(0).to(self.device) 47 | mean, log_std = self.policy_net.forward(state) 48 | std = log_std.exp() 49 | 50 | normal = Normal(mean, std) 51 | z = normal.sample() 52 | action = torch.tanh(z) 53 | action = action.cpu().detach().squeeze(0).numpy() 54 | 55 | return self.rescale_action(action) 56 | 57 | def rescale_action(self, action): 58 | return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\ 59 | (self.action_range[1] + self.action_range[0]) / 2.0 60 | 61 | def update(self, batch_size): 62 | states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size) 63 | states = torch.FloatTensor(states).to(self.device) 64 | actions = torch.FloatTensor(actions).to(self.device) 65 | rewards = torch.FloatTensor(rewards).to(self.device) 66 | next_states = torch.FloatTensor(next_states).to(self.device) 67 | dones = torch.FloatTensor(dones).to(self.device) 68 | dones = dones.view(dones.size(0), -1) 69 | 70 | next_actions, next_log_pi = self.policy_net.sample(next_states) 71 | next_q1 = self.q_net1(next_states, next_actions) 72 | next_q2 = self.q_net2(next_states, next_actions) 73 | next_v = self.target_value_net(next_states) 74 | 75 | # value Loss 76 | next_v_target = torch.min(next_q1, next_q2) - next_log_pi 77 | curr_v = self.value_net.forward(states) 78 | v_loss = F.mse_loss(curr_v, next_v_target.detach()) 79 | 80 | # q loss 81 | curr_q1 = 
self.q_net1.forward(states, actions) 82 | curr_q2 = self.q_net2.forward(states, actions) 83 | expected_q = rewards + (1 - dones) * self.gamma * next_v 84 | q1_loss = F.mse_loss(curr_q1, expected_q.detach()) 85 | q2_loss = F.mse_loss(curr_q2, expected_q.detach()) 86 | 87 | # update value network and q networks 88 | self.value_optimizer.zero_grad() 89 | v_loss.backward() 90 | self.value_optimizer.step() 91 | 92 | self.q1_optimizer.zero_grad() 93 | q1_loss.backward() 94 | self.q1_optimizer.step() 95 | 96 | self.q2_optimizer.zero_grad() 97 | q2_loss.backward() 98 | self.q2_optimizer.step() 99 | 100 | #delayed update for policy net and target value nets 101 | if self.update_step % self.delay_step == 0: 102 | new_actions, log_pi = self.policy_net.sample(states) 103 | min_q = torch.min( 104 | self.q_net1.forward(states, new_actions), 105 | self.q_net2.forward(states, new_actions) 106 | ) 107 | policy_loss = (log_pi - min_q).mean() 108 | 109 | self.policy_optimizer.zero_grad() 110 | policy_loss.backward() 111 | self.policy_optimizer.step() 112 | 113 | # target networks 114 | for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): 115 | target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) 116 | 117 | self.update_step += 1 -------------------------------------------------------------------------------- /sac/sac2019.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | from torch.distributions import Normal 5 | 6 | from models import SoftQNetwork, PolicyNetwork 7 | from common.replay_buffers import BasicBuffer 8 | 9 | 10 | class SACAgent: 11 | 12 | def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen): 13 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | 15 | self.env = env 16 | self.action_range = [env.action_space.low, env.action_space.high] 17 | self.obs_dim = env.observation_space.shape[0] 18 | self.action_dim = env.action_space.shape[0] 19 | 20 | # hyperparameters 21 | self.gamma = gamma 22 | self.tau = tau 23 | self.update_step = 0 24 | self.delay_step = 2 25 | 26 | # initialize networks 27 | self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) 28 | self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) 29 | self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) 30 | self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device) 31 | self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device) 32 | 33 | # copy params to target param 34 | for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): 35 | target_param.data.copy_(param) 36 | 37 | for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): 38 | target_param.data.copy_(param) 39 | 40 | # initialize optimizers 41 | self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr) 42 | self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr) 43 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) 44 | 45 | # entropy temperature 46 | self.alpha = alpha 47 | self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape).to(self.device)).item() 48 | self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) 49 | self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr) 50 | 51 | 
self.replay_buffer = BasicBuffer(buffer_maxlen) 52 | 53 | def get_action(self, state): 54 | state = torch.FloatTensor(state).unsqueeze(0).to(self.device) 55 | mean, log_std = self.policy_net.forward(state) 56 | std = log_std.exp() 57 | 58 | normal = Normal(mean, std) 59 | z = normal.sample() 60 | action = torch.tanh(z) 61 | action = action.cpu().detach().squeeze(0).numpy() 62 | 63 | return self.rescale_action(action) 64 | 65 | def rescale_action(self, action): 66 | return action * (self.action_range[1] - self.action_range[0]) / 2.0 +\ 67 | (self.action_range[1] + self.action_range[0]) / 2.0 68 | 69 | def update(self, batch_size): 70 | states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size) 71 | states = torch.FloatTensor(states).to(self.device) 72 | actions = torch.FloatTensor(actions).to(self.device) 73 | rewards = torch.FloatTensor(rewards).to(self.device) 74 | next_states = torch.FloatTensor(next_states).to(self.device) 75 | dones = torch.FloatTensor(dones).to(self.device) 76 | dones = dones.view(dones.size(0), -1) 77 | 78 | next_actions, next_log_pi = self.policy_net.sample(next_states) 79 | next_q1 = self.target_q_net1(next_states, next_actions) 80 | next_q2 = self.target_q_net2(next_states, next_actions) 81 | next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi 82 | expected_q = rewards + (1 - dones) * self.gamma * next_q_target 83 | 84 | # q loss 85 | curr_q1 = self.q_net1.forward(states, actions) 86 | curr_q2 = self.q_net2.forward(states, actions) 87 | q1_loss = F.mse_loss(curr_q1, expected_q.detach()) 88 | q2_loss = F.mse_loss(curr_q2, expected_q.detach()) 89 | 90 | # update q networks 91 | self.q1_optimizer.zero_grad() 92 | q1_loss.backward() 93 | self.q1_optimizer.step() 94 | 95 | self.q2_optimizer.zero_grad() 96 | q2_loss.backward() 97 | self.q2_optimizer.step() 98 | 99 | # delayed update for policy network and target q networks 100 | new_actions, log_pi = self.policy_net.sample(states) 101 | if self.update_step % self.delay_step == 0: 102 | min_q = torch.min( 103 | self.q_net1.forward(states, new_actions), 104 | self.q_net2.forward(states, new_actions) 105 | ) 106 | policy_loss = (self.alpha * log_pi - min_q).mean() 107 | 108 | self.policy_optimizer.zero_grad() 109 | policy_loss.backward() 110 | self.policy_optimizer.step() 111 | 112 | # target networks 113 | for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()): 114 | target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) 115 | 116 | for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()): 117 | target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param) 118 | 119 | # update temperature 120 | alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean() 121 | 122 | self.alpha_optim.zero_grad() 123 | alpha_loss.backward() 124 | self.alpha_optim.step() 125 | self.alpha = self.log_alpha.exp() 126 | 127 | self.update_step += 1 -------------------------------------------------------------------------------- /sac/sac_test.py: -------------------------------------------------------------------------------- 1 | from sac2018 import SACAgent 2 | #from sac2019 import SACAgent 3 | from common.utils import mini_batch_train 4 | import gym 5 | 6 | env = gym.make("Pendulum-v0") 7 | 8 | #SAC 2018 Params 9 | tau = 0.005 10 | gamma = 0.99 11 | value_lr = 3e-3 12 | q_lr = 3e-3 13 | policy_lr = 3e-3 14 | buffer_maxlen = 1000000 15 | 16 | # SAC 2019 Params 17 | # gamma = 0.99 18 | # 
tau = 0.01 19 | # alpha = 0.2 20 | # a_lr = 3e-4 21 | # q_lr = 3e-4 22 | # p_lr = 3e-4 23 | # buffer_maxlen = 1000000 24 | 25 | state = env.reset() 26 | #2018 agent 27 | agent = SACAgent(env, gamma, tau, value_lr, q_lr, policy_lr, buffer_maxlen) 28 | 29 | #2019 agent 30 | # agent = SACAgent(env, gamma, tau, alpha, q_lr, p_lr, a_lr, buffer_maxlen) 31 | 32 | # train 33 | episode_rewards = mini_batch_train(env, agent, 50, 500, 64) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='policygradients', 5 | packages=find_packages(), 6 | version='0.0.1') 7 | -------------------------------------------------------------------------------- /td3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyoon1729/Policy-Gradient-Methods/c003fc7f624c31cc8e9ab304f6a6947c9f440214/td3/__init__.py -------------------------------------------------------------------------------- /td3/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.autograd as autograd 5 | 6 | 7 | class Critic(nn.Module): 8 | 9 | def __init__(self, obs_dim, action_dim): 10 | super(Critic, self).__init__() 11 | 12 | self.obs_dim = obs_dim 13 | self.action_dim = action_dim 14 | 15 | self.linear1 = nn.Linear(self.obs_dim, 1024) 16 | self.linear2 = nn.Linear(1024 + self.action_dim, 512) 17 | self.linear3 = nn.Linear(512, 300) 18 | self.linear4 = nn.Linear(300, 1) 19 | 20 | def forward(self, x, a): 21 | x = F.relu(self.linear1(x)) 22 | xa_cat = torch.cat([x,a], 1) 23 | xa = F.relu(self.linear2(xa_cat)) 24 | xa = F.relu(self.linear3(xa)) 25 | qval = self.linear4(xa) 26 | 27 | return qval 28 | 29 | class Actor(nn.Module): 30 | 31 | def __init__(self, obs_dim, action_dim): 32 | super(Actor, self).__init__() 33 | 34 | self.obs_dim = obs_dim 35 | self.action_dim = action_dim 36 | 37 | self.linear1 = nn.Linear(self.obs_dim, 512) 38 | self.linear2 = nn.Linear(512, 128) 39 | self.linear3 = nn.Linear(128, self.action_dim) 40 | 41 | def forward(self, obs): 42 | x = F.relu(self.linear1(obs)) 43 | x = F.relu(self.linear2(x)) 44 | x = torch.tanh(self.linear3(x)) 45 | 46 | return x -------------------------------------------------------------------------------- /td3/td3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | from models import Critic, Actor 7 | from common.replay_buffers import BasicBuffer 8 | 9 | 10 | class TD3Agent: 11 | 12 | def __init__(self, env, gamma, tau, buffer_maxlen, delay_step, noise_std, noise_bound, critic_lr, actor_lr): 13 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | 15 | self.env = env 16 | self.obs_dim = env.observation_space.shape[0] 17 | self.action_dim = env.action_space.shape[0] 18 | 19 | # hyperparameters 20 | self.gamma = gamma 21 | self.tau = tau 22 | self.noise_std = noise_std 23 | self.noise_bound = noise_bound 24 | self.update_step = 0 25 | self.delay_step = delay_step 26 | 27 | # initialize actor and critic networks 28 | self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device) 29 | self.critic2 = Critic(self.obs_dim, 
30 |         self.critic1_target = Critic(self.obs_dim, self.action_dim).to(self.device)
31 |         self.critic2_target = Critic(self.obs_dim, self.action_dim).to(self.device)
32 | 
33 |         self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
34 |         self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)
35 | 
36 |         # copy parameters to the target networks
37 |         self.critic1_target.load_state_dict(self.critic1.state_dict())
38 |         self.critic2_target.load_state_dict(self.critic2.state_dict())
39 | 
40 |         # the actor target should also start from the actor's weights
41 |         self.actor_target.load_state_dict(self.actor.state_dict())
42 | 
43 |         # initialize optimizers
44 |         self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=critic_lr)
45 |         self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=critic_lr)
46 |         self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
47 | 
48 |         self.replay_buffer = BasicBuffer(buffer_maxlen)
49 | 
50 |     def get_action(self, obs):
51 |         state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
52 |         action = self.actor.forward(state)
53 |         action = action.squeeze(0).cpu().detach().numpy()
54 | 
55 |         return action
56 | 
57 |     def update(self, batch_size):
58 |         state_batch, action_batch, reward_batch, next_state_batch, dones = self.replay_buffer.sample(batch_size)
59 |         state_batch = torch.FloatTensor(state_batch).to(self.device)
60 |         action_batch = torch.FloatTensor(action_batch).to(self.device)
61 |         reward_batch = torch.FloatTensor(reward_batch).view(-1, 1).to(self.device)  # [batch, 1] to match the critic outputs
62 |         next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
63 |         dones = torch.FloatTensor(dones).view(-1, 1).to(self.device)
64 | 
65 |         action_space_noise = self.generate_action_space_noise(action_batch)
66 |         next_actions = self.actor_target.forward(next_state_batch) + action_space_noise  # target policy smoothing
67 |         next_Q1 = self.critic1_target.forward(next_state_batch, next_actions)
68 |         next_Q2 = self.critic2_target.forward(next_state_batch, next_actions)
69 |         expected_Q = reward_batch + (1 - dones) * self.gamma * torch.min(next_Q1, next_Q2)
70 | 
71 |         # critic loss
72 |         curr_Q1 = self.critic1.forward(state_batch, action_batch)
73 |         curr_Q2 = self.critic2.forward(state_batch, action_batch)
74 |         critic1_loss = F.mse_loss(curr_Q1, expected_Q.detach())
75 |         critic2_loss = F.mse_loss(curr_Q2, expected_Q.detach())
76 | 
77 |         # update critics
78 |         self.critic1_optimizer.zero_grad()
79 |         critic1_loss.backward()
80 |         self.critic1_optimizer.step()
81 | 
82 |         self.critic2_optimizer.zero_grad()
83 |         critic2_loss.backward()
84 |         self.critic2_optimizer.step()
85 | 
86 |         # delayed update for actor & target networks
87 |         if self.update_step % self.delay_step == 0:
88 |             # actor
89 |             self.actor_optimizer.zero_grad()
90 |             policy_loss = -self.critic1(state_batch, self.actor(state_batch)).mean()
91 |             policy_loss.backward()
92 |             self.actor_optimizer.step()
93 | 
94 |             # target networks
95 |             self.update_targets()
96 | 
97 |         self.update_step += 1
98 | 
99 |     def generate_action_space_noise(self, action_batch):
100 |         noise = torch.normal(torch.zeros(action_batch.size()), self.noise_std).clamp(-self.noise_bound, self.noise_bound).to(self.device)
101 |         return noise
102 | 
103 |     def update_targets(self):
104 |         for target_param, param in zip(self.critic1_target.parameters(), self.critic1.parameters()):
105 |             target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
106 | 
107 |         for target_param, param in zip(self.critic2_target.parameters(), self.critic2.parameters()):
108 |             target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
109 | 
110 |         for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
111 |             target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
--------------------------------------------------------------------------------
/td3/td3_test.py:
--------------------------------------------------------------------------------
1 | from td3 import TD3Agent
2 | from common.utils import mini_batch_train
3 | import gym
4 | 
5 | env = gym.make("Pendulum-v0")
6 | gamma = 0.99
7 | tau = 1e-2
8 | noise_std = 0.2
9 | bound = 0.5
10 | delay_step = 2
11 | buffer_maxlen = 100000
12 | critic_lr = 1e-3
13 | actor_lr = 1e-3
14 | 
15 | max_episodes = 100
16 | max_steps = 500
17 | batch_size = 32
18 | 
19 | agent = TD3Agent(env, gamma, tau, buffer_maxlen, delay_step, noise_std, bound, critic_lr, actor_lr)
20 | episode_rewards = mini_batch_train(env, agent, max_episodes, max_steps, batch_size)
--------------------------------------------------------------------------------
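
Note: the test scripts above depend on `common/replay_buffers.py` and `common/utils.py`, which are not reproduced in this part of the listing. The sketch below is a minimal stand-in inferred only from how those helpers are called here: `BasicBuffer(buffer_maxlen)`, `replay_buffer.sample(batch_size)` returning `(states, actions, rewards, next_states, dones)`, and `mini_batch_train(env, agent, max_episodes, max_steps, batch_size)` returning per-episode rewards. The `push` method and the "update once the buffer holds a batch" rule are assumptions, not the repository's actual code.

```python
import random
from collections import deque

import numpy as np


class BasicBuffer:
    """Fixed-size FIFO replay buffer (sketch; the real common/replay_buffers.py may differ)."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        # store one transition tuple
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform random mini-batch, unpacked into per-field arrays
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)


def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):
    """Roll out episodes, store transitions, and update the agent (sketch, old gym step API)."""
    episode_rewards = []

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward

            # assumed trigger: update once enough samples are stored
            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)

            if done or step == max_steps - 1:
                episode_rewards.append(episode_reward)
                break

            state = next_state

    return episode_rewards
```

With helpers of this shape in place, `sac_test.py` and `td3_test.py` should run end-to-end on `Pendulum-v0` as written above.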