├── LICENSE
├── README.md
├── REINFORCE.py
├── actor_critic.py
├── dqn.py
├── ppo.py
├── a3c.py
├── ppo-lstm.py
├── ddpg.py
├── vtrace.py
├── acer.py
├── ppo-continuous.py
├── a2c.py
└── sac.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 seungeunrho
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # minimalRL-pytorch
2 | 
3 | Implementations of basic RL algorithms with minimal lines of code! (PyTorch based)
4 | 
5 | * Each algorithm is complete within a single file.
6 | 
7 | * Each file is around 100~150 lines of code.
8 | 
9 | * Every algorithm can be trained within 30 seconds, even without a GPU.
10 | 
11 | * Envs are fixed to "CartPole-v1" (and "Pendulum-v1" for the continuous-action algorithms), so you can just focus on the implementations.
12 | 
13 | 
14 | 
15 | ## Algorithms
16 | 1. [REINFORCE](https://github.com/seungeunrho/minimalRL/blob/master/REINFORCE.py) (67 lines)
17 | 2. [Vanilla Actor-Critic](https://github.com/seungeunrho/minimalRL/blob/master/actor_critic.py) (98 lines)
18 | 3. [DQN](https://github.com/seungeunrho/minimalRL/blob/master/dqn.py) (112 lines, including replay memory and target network)
19 | 4. [PPO](https://github.com/seungeunrho/minimalRL/blob/master/ppo.py) (119 lines, including GAE)
20 | 5. [DDPG](https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py) (145 lines, including OU noise and soft target update)
21 | 6. [A3C](https://github.com/seungeunrho/minimalRL/blob/master/a3c.py) (129 lines)
22 | 7. [ACER](https://github.com/seungeunrho/minimalRL/blob/master/acer.py) (149 lines)
23 | 8. [A2C](https://github.com/seungeunrho/minimalRL/blob/master/a2c.py) (188 lines)
24 | 9. [SAC](https://github.com/seungeunrho/minimalRL/blob/master/sac.py) (171 lines) added!!
25 | 10. [PPO-Continuous](https://github.com/seungeunrho/minimalRL/blob/master/ppo-continuous.py) (161 lines) added!!
26 | 11. [Vtrace](https://github.com/seungeunrho/minimalRL/blob/master/vtrace.py) (137 lines) added!!
27 | 12. Any suggestions ...?
28 | 
29 | 
30 | ## Dependencies
31 | 1. PyTorch
32 | 2. OpenAI Gym (> 0.26.2, IMPORTANT!! Previous versions are no longer supported)
33 | 
34 | ## Usage
35 | ```bash
36 | # Works only with Python 3.
37 | # e.g.
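# (assumption, not part of the original README) install the dependencies first if needed:
#   pip3 install torch "gym>0.26.2"
# then run any of the scripts directly: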
38 | python3 REINFORCE.py 39 | python3 actor_critic.py 40 | python3 dqn.py 41 | python3 ppo.py 42 | python3 ddpg.py 43 | python3 a3c.py 44 | python3 a2c.py 45 | python3 acer.py 46 | python3 sac.py 47 | ``` 48 | -------------------------------------------------------------------------------- /REINFORCE.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0002 10 | gamma = 0.98 11 | 12 | class Policy(nn.Module): 13 | def __init__(self): 14 | super(Policy, self).__init__() 15 | self.data = [] 16 | 17 | self.fc1 = nn.Linear(4, 128) 18 | self.fc2 = nn.Linear(128, 2) 19 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.softmax(self.fc2(x), dim=0) 24 | return x 25 | 26 | def put_data(self, item): 27 | self.data.append(item) 28 | 29 | def train_net(self): 30 | R = 0 31 | self.optimizer.zero_grad() 32 | for r, prob in self.data[::-1]: 33 | R = r + gamma * R 34 | loss = -torch.log(prob) * R 35 | loss.backward() 36 | self.optimizer.step() 37 | self.data = [] 38 | 39 | def main(): 40 | env = gym.make('CartPole-v1') 41 | pi = Policy() 42 | score = 0.0 43 | print_interval = 20 44 | 45 | 46 | for n_epi in range(10000): 47 | s, _ = env.reset() 48 | done = False 49 | 50 | while not done: # CartPole-v1 forced to terminates at 500 step. 51 | prob = pi(torch.from_numpy(s).float()) 52 | m = Categorical(prob) 53 | a = m.sample() 54 | s_prime, r, done, truncated, info = env.step(a.item()) 55 | pi.put_data((r,prob[a])) 56 | s = s_prime 57 | score += r 58 | 59 | pi.train_net() 60 | 61 | if n_epi%print_interval==0 and n_epi!=0: 62 | print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval)) 63 | score = 0.0 64 | env.close() 65 | 66 | if __name__ == '__main__': 67 | main() -------------------------------------------------------------------------------- /actor_critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0002 10 | gamma = 0.98 11 | n_rollout = 10 12 | 13 | class ActorCritic(nn.Module): 14 | def __init__(self): 15 | super(ActorCritic, self).__init__() 16 | self.data = [] 17 | 18 | self.fc1 = nn.Linear(4,256) 19 | self.fc_pi = nn.Linear(256,2) 20 | self.fc_v = nn.Linear(256,1) 21 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 22 | 23 | def pi(self, x, softmax_dim = 0): 24 | x = F.relu(self.fc1(x)) 25 | x = self.fc_pi(x) 26 | prob = F.softmax(x, dim=softmax_dim) 27 | return prob 28 | 29 | def v(self, x): 30 | x = F.relu(self.fc1(x)) 31 | v = self.fc_v(x) 32 | return v 33 | 34 | def put_data(self, transition): 35 | self.data.append(transition) 36 | 37 | def make_batch(self): 38 | s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], [] 39 | for transition in self.data: 40 | s,a,r,s_prime,done = transition 41 | s_lst.append(s) 42 | a_lst.append([a]) 43 | r_lst.append([r/100.0]) 44 | s_prime_lst.append(s_prime) 45 | done_mask = 0.0 if done else 1.0 46 | done_lst.append([done_mask]) 47 | 48 | s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, 
dtype=torch.float), torch.tensor(a_lst), \ 49 | torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \ 50 | torch.tensor(done_lst, dtype=torch.float) 51 | self.data = [] 52 | return s_batch, a_batch, r_batch, s_prime_batch, done_batch 53 | 54 | def train_net(self): 55 | s, a, r, s_prime, done = self.make_batch() 56 | td_target = r + gamma * self.v(s_prime) * done 57 | delta = td_target - self.v(s) 58 | 59 | pi = self.pi(s, softmax_dim=1) 60 | pi_a = pi.gather(1,a) 61 | loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach()) 62 | 63 | self.optimizer.zero_grad() 64 | loss.mean().backward() 65 | self.optimizer.step() 66 | 67 | def main(): 68 | env = gym.make('CartPole-v1') 69 | model = ActorCritic() 70 | print_interval = 20 71 | score = 0.0 72 | 73 | for n_epi in range(10000): 74 | done = False 75 | s, _ = env.reset() 76 | while not done: 77 | for t in range(n_rollout): 78 | prob = model.pi(torch.from_numpy(s).float()) 79 | m = Categorical(prob) 80 | a = m.sample().item() 81 | s_prime, r, done, truncated, info = env.step(a) 82 | model.put_data((s,a,r,s_prime,done)) 83 | 84 | s = s_prime 85 | score += r 86 | 87 | if done: 88 | break 89 | 90 | model.train_net() 91 | 92 | if n_epi%print_interval==0 and n_epi!=0: 93 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 94 | score = 0.0 95 | env.close() 96 | 97 | if __name__ == '__main__': 98 | main() -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | 10 | #Hyperparameters 11 | learning_rate = 0.0005 12 | gamma = 0.98 13 | buffer_limit = 50000 14 | batch_size = 32 15 | 16 | class ReplayBuffer(): 17 | def __init__(self): 18 | self.buffer = collections.deque(maxlen=buffer_limit) 19 | 20 | def put(self, transition): 21 | self.buffer.append(transition) 22 | 23 | def sample(self, n): 24 | mini_batch = random.sample(self.buffer, n) 25 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 26 | 27 | for transition in mini_batch: 28 | s, a, r, s_prime, done_mask = transition 29 | s_lst.append(s) 30 | a_lst.append([a]) 31 | r_lst.append([r]) 32 | s_prime_lst.append(s_prime) 33 | done_mask_lst.append([done_mask]) 34 | 35 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 36 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 37 | torch.tensor(done_mask_lst) 38 | 39 | def size(self): 40 | return len(self.buffer) 41 | 42 | class Qnet(nn.Module): 43 | def __init__(self): 44 | super(Qnet, self).__init__() 45 | self.fc1 = nn.Linear(4, 128) 46 | self.fc2 = nn.Linear(128, 128) 47 | self.fc3 = nn.Linear(128, 2) 48 | 49 | def forward(self, x): 50 | x = F.relu(self.fc1(x)) 51 | x = F.relu(self.fc2(x)) 52 | x = self.fc3(x) 53 | return x 54 | 55 | def sample_action(self, obs, epsilon): 56 | out = self.forward(obs) 57 | coin = random.random() 58 | if coin < epsilon: 59 | return random.randint(0,1) 60 | else : 61 | return out.argmax().item() 62 | 63 | def train(q, q_target, memory, optimizer): 64 | for i in range(10): 65 | s,a,r,s_prime,done_mask = memory.sample(batch_size) 66 | 67 | q_out = q(s) 68 | q_a = q_out.gather(1,a) 69 | max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1) 70 | target = r + gamma * max_q_prime * 
done_mask 71 | loss = F.smooth_l1_loss(q_a, target) 72 | 73 | optimizer.zero_grad() 74 | loss.backward() 75 | optimizer.step() 76 | 77 | def main(): 78 | env = gym.make('CartPole-v1') 79 | q = Qnet() 80 | q_target = Qnet() 81 | q_target.load_state_dict(q.state_dict()) 82 | memory = ReplayBuffer() 83 | 84 | print_interval = 20 85 | score = 0.0 86 | optimizer = optim.Adam(q.parameters(), lr=learning_rate) 87 | 88 | for n_epi in range(10000): 89 | epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1% 90 | s, _ = env.reset() 91 | done = False 92 | 93 | while not done: 94 | a = q.sample_action(torch.from_numpy(s).float(), epsilon) 95 | s_prime, r, done, truncated, info = env.step(a) 96 | done_mask = 0.0 if done else 1.0 97 | memory.put((s,a,r/100.0,s_prime, done_mask)) 98 | s = s_prime 99 | 100 | score += r 101 | if done: 102 | break 103 | 104 | if memory.size()>2000: 105 | train(q, q_target, memory, optimizer) 106 | 107 | if n_epi%print_interval==0 and n_epi!=0: 108 | q_target.load_state_dict(q.state_dict()) 109 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 110 | n_epi, score/print_interval, memory.size(), epsilon*100)) 111 | score = 0.0 112 | env.close() 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0005 10 | gamma = 0.98 11 | lmbda = 0.95 12 | eps_clip = 0.1 13 | K_epoch = 3 14 | T_horizon = 20 15 | 16 | class PPO(nn.Module): 17 | def __init__(self): 18 | super(PPO, self).__init__() 19 | self.data = [] 20 | 21 | self.fc1 = nn.Linear(4,256) 22 | self.fc_pi = nn.Linear(256,2) 23 | self.fc_v = nn.Linear(256,1) 24 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 25 | 26 | def pi(self, x, softmax_dim = 0): 27 | x = F.relu(self.fc1(x)) 28 | x = self.fc_pi(x) 29 | prob = F.softmax(x, dim=softmax_dim) 30 | return prob 31 | 32 | def v(self, x): 33 | x = F.relu(self.fc1(x)) 34 | v = self.fc_v(x) 35 | return v 36 | 37 | def put_data(self, transition): 38 | self.data.append(transition) 39 | 40 | def make_batch(self): 41 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] 42 | for transition in self.data: 43 | s, a, r, s_prime, prob_a, done = transition 44 | 45 | s_lst.append(s) 46 | a_lst.append([a]) 47 | r_lst.append([r]) 48 | s_prime_lst.append(s_prime) 49 | prob_a_lst.append([prob_a]) 50 | done_mask = 0 if done else 1 51 | done_lst.append([done_mask]) 52 | 53 | s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 54 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 55 | torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) 56 | self.data = [] 57 | return s, a, r, s_prime, done_mask, prob_a 58 | 59 | def train_net(self): 60 | s, a, r, s_prime, done_mask, prob_a = self.make_batch() 61 | 62 | for i in range(K_epoch): 63 | td_target = r + gamma * self.v(s_prime) * done_mask 64 | delta = td_target - self.v(s) 65 | delta = delta.detach().numpy() 66 | 67 | advantage_lst = [] 68 | advantage = 0.0 69 | for delta_t in delta[::-1]: 70 | advantage = gamma * lmbda * advantage + delta_t[0] 71 | advantage_lst.append([advantage]) 72 | 
advantage_lst.reverse() 73 | advantage = torch.tensor(advantage_lst, dtype=torch.float) 74 | 75 | pi = self.pi(s, softmax_dim=1) 76 | pi_a = pi.gather(1,a) 77 | ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) 78 | 79 | surr1 = ratio * advantage 80 | surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage 81 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach()) 82 | 83 | self.optimizer.zero_grad() 84 | loss.mean().backward() 85 | self.optimizer.step() 86 | 87 | def main(): 88 | env = gym.make('CartPole-v1') 89 | model = PPO() 90 | score = 0.0 91 | print_interval = 20 92 | 93 | for n_epi in range(10000): 94 | s, _ = env.reset() 95 | done = False 96 | while not done: 97 | for t in range(T_horizon): 98 | prob = model.pi(torch.from_numpy(s).float()) 99 | m = Categorical(prob) 100 | a = m.sample().item() 101 | s_prime, r, done, truncated, info = env.step(a) 102 | 103 | model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done)) 104 | s = s_prime 105 | 106 | score += r 107 | if done: 108 | break 109 | 110 | model.train_net() 111 | 112 | if n_epi%print_interval==0 and n_epi!=0: 113 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 114 | score = 0.0 115 | 116 | env.close() 117 | 118 | if __name__ == '__main__': 119 | main() -------------------------------------------------------------------------------- /a3c.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import time 9 | 10 | # Hyperparameters 11 | n_train_processes = 3 12 | learning_rate = 0.0002 13 | update_interval = 5 14 | gamma = 0.98 15 | max_train_ep = 300 16 | max_test_ep = 400 17 | 18 | 19 | class ActorCritic(nn.Module): 20 | def __init__(self): 21 | super(ActorCritic, self).__init__() 22 | self.fc1 = nn.Linear(4, 256) 23 | self.fc_pi = nn.Linear(256, 2) 24 | self.fc_v = nn.Linear(256, 1) 25 | 26 | def pi(self, x, softmax_dim=0): 27 | x = F.relu(self.fc1(x)) 28 | x = self.fc_pi(x) 29 | prob = F.softmax(x, dim=softmax_dim) 30 | return prob 31 | 32 | def v(self, x): 33 | x = F.relu(self.fc1(x)) 34 | v = self.fc_v(x) 35 | return v 36 | 37 | 38 | def train(global_model, rank): 39 | local_model = ActorCritic() 40 | local_model.load_state_dict(global_model.state_dict()) 41 | 42 | optimizer = optim.Adam(global_model.parameters(), lr=learning_rate) 43 | 44 | env = gym.make('CartPole-v1') 45 | 46 | for n_epi in range(max_train_ep): 47 | done = False 48 | s = env.reset() 49 | while not done: 50 | s_lst, a_lst, r_lst = [], [], [] 51 | for t in range(update_interval): 52 | prob = local_model.pi(torch.from_numpy(s).float()) 53 | m = Categorical(prob) 54 | a = m.sample().item() 55 | s_prime, r, done, info = env.step(a) 56 | 57 | s_lst.append(s) 58 | a_lst.append([a]) 59 | r_lst.append(r/100.0) 60 | 61 | s = s_prime 62 | if done: 63 | break 64 | 65 | s_final = torch.tensor(s_prime, dtype=torch.float) 66 | R = 0.0 if done else local_model.v(s_final).item() 67 | td_target_lst = [] 68 | for reward in r_lst[::-1]: 69 | R = gamma * R + reward 70 | td_target_lst.append([R]) 71 | td_target_lst.reverse() 72 | 73 | s_batch, a_batch, td_target = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 74 | torch.tensor(td_target_lst) 75 | advantage = td_target - local_model.v(s_batch) 76 | 77 | pi = 
local_model.pi(s_batch, softmax_dim=1) 78 | pi_a = pi.gather(1, a_batch) 79 | loss = -torch.log(pi_a) * advantage.detach() + \ 80 | F.smooth_l1_loss(local_model.v(s_batch), td_target.detach()) 81 | 82 | optimizer.zero_grad() 83 | loss.mean().backward() 84 | for global_param, local_param in zip(global_model.parameters(), local_model.parameters()): 85 | global_param._grad = local_param.grad 86 | optimizer.step() 87 | local_model.load_state_dict(global_model.state_dict()) 88 | 89 | env.close() 90 | print("Training process {} reached maximum episode.".format(rank)) 91 | 92 | 93 | def test(global_model): 94 | env = gym.make('CartPole-v1') 95 | score = 0.0 96 | print_interval = 20 97 | 98 | for n_epi in range(max_test_ep): 99 | done = False 100 | s = env.reset() 101 | while not done: 102 | prob = global_model.pi(torch.from_numpy(s).float()) 103 | a = Categorical(prob).sample().item() 104 | s_prime, r, done, info = env.step(a) 105 | s = s_prime 106 | score += r 107 | 108 | if n_epi % print_interval == 0 and n_epi != 0: 109 | print("# of episode :{}, avg score : {:.1f}".format( 110 | n_epi, score/print_interval)) 111 | score = 0.0 112 | time.sleep(1) 113 | env.close() 114 | 115 | 116 | if __name__ == '__main__': 117 | global_model = ActorCritic() 118 | global_model.share_memory() 119 | 120 | processes = [] 121 | for rank in range(n_train_processes + 1): # + 1 for test process 122 | if rank == 0: 123 | p = mp.Process(target=test, args=(global_model,)) 124 | else: 125 | p = mp.Process(target=train, args=(global_model, rank,)) 126 | p.start() 127 | processes.append(p) 128 | for p in processes: 129 | p.join() -------------------------------------------------------------------------------- /ppo-lstm.py: -------------------------------------------------------------------------------- 1 | #PPO-LSTM 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.distributions import Categorical 8 | import time 9 | import numpy as np 10 | 11 | #Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | lmbda = 0.95 15 | eps_clip = 0.1 16 | K_epoch = 2 17 | T_horizon = 20 18 | 19 | class PPO(nn.Module): 20 | def __init__(self): 21 | super(PPO, self).__init__() 22 | self.data = [] 23 | 24 | self.fc1 = nn.Linear(4,64) 25 | self.lstm = nn.LSTM(64,32) 26 | self.fc_pi = nn.Linear(32,2) 27 | self.fc_v = nn.Linear(32,1) 28 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 29 | 30 | def pi(self, x, hidden): 31 | x = F.relu(self.fc1(x)) 32 | x = x.view(-1, 1, 64) 33 | x, lstm_hidden = self.lstm(x, hidden) 34 | x = self.fc_pi(x) 35 | prob = F.softmax(x, dim=2) 36 | return prob, lstm_hidden 37 | 38 | def v(self, x, hidden): 39 | x = F.relu(self.fc1(x)) 40 | x = x.view(-1, 1, 64) 41 | x, lstm_hidden = self.lstm(x, hidden) 42 | v = self.fc_v(x) 43 | return v 44 | 45 | def put_data(self, transition): 46 | self.data.append(transition) 47 | 48 | def make_batch(self): 49 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, h_in_lst, h_out_lst, done_lst = [], [], [], [], [], [], [], [] 50 | for transition in self.data: 51 | s, a, r, s_prime, prob_a, h_in, h_out, done = transition 52 | 53 | s_lst.append(s) 54 | a_lst.append([a]) 55 | r_lst.append([r]) 56 | s_prime_lst.append(s_prime) 57 | prob_a_lst.append([prob_a]) 58 | h_in_lst.append(h_in) 59 | h_out_lst.append(h_out) 60 | done_mask = 0 if done else 1 61 | done_lst.append([done_mask]) 62 | 63 | s,a,r,s_prime,done_mask,prob_a = torch.tensor(s_lst, dtype=torch.float), 
torch.tensor(a_lst), \ 64 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 65 | torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) 66 | self.data = [] 67 | return s,a,r,s_prime, done_mask, prob_a, h_in_lst[0], h_out_lst[0] 68 | 69 | def train_net(self): 70 | s,a,r,s_prime,done_mask, prob_a, (h1_in, h2_in), (h1_out, h2_out) = self.make_batch() 71 | first_hidden = (h1_in.detach(), h2_in.detach()) 72 | second_hidden = (h1_out.detach(), h2_out.detach()) 73 | 74 | for i in range(K_epoch): 75 | v_prime = self.v(s_prime, second_hidden).squeeze(1) 76 | td_target = r + gamma * v_prime * done_mask 77 | v_s = self.v(s, first_hidden).squeeze(1) 78 | delta = td_target - v_s 79 | delta = delta.detach().numpy() 80 | 81 | advantage_lst = [] 82 | advantage = 0.0 83 | for item in delta[::-1]: 84 | advantage = gamma * lmbda * advantage + item[0] 85 | advantage_lst.append([advantage]) 86 | advantage_lst.reverse() 87 | advantage = torch.tensor(advantage_lst, dtype=torch.float) 88 | 89 | pi, _ = self.pi(s, first_hidden) 90 | pi_a = pi.squeeze(1).gather(1,a) 91 | ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == log(exp(a)-exp(b)) 92 | 93 | surr1 = ratio * advantage 94 | surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage 95 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(v_s, td_target.detach()) 96 | 97 | self.optimizer.zero_grad() 98 | loss.mean().backward(retain_graph=True) 99 | self.optimizer.step() 100 | 101 | def main(): 102 | env = gym.make('CartPole-v1') 103 | model = PPO() 104 | score = 0.0 105 | print_interval = 20 106 | 107 | for n_epi in range(10000): 108 | h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float)) 109 | s, _ = env.reset() 110 | done = False 111 | 112 | while not done: 113 | for t in range(T_horizon): 114 | h_in = h_out 115 | prob, h_out = model.pi(torch.from_numpy(s).float(), h_in) 116 | prob = prob.view(-1) 117 | m = Categorical(prob) 118 | a = m.sample().item() 119 | s_prime, r, done, truncated, info = env.step(a) 120 | 121 | model.put_data((s, a, r/100.0, s_prime, prob[a].item(), h_in, h_out, done)) 122 | s = s_prime 123 | 124 | score += r 125 | if done: 126 | break 127 | 128 | model.train_net() 129 | 130 | if n_epi%print_interval==0 and n_epi!=0: 131 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 132 | score = 0.0 133 | 134 | env.close() 135 | 136 | if __name__ == '__main__': 137 | main() -------------------------------------------------------------------------------- /ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import collections 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | 10 | #Hyperparameters 11 | lr_mu = 0.0005 12 | lr_q = 0.001 13 | gamma = 0.99 14 | batch_size = 32 15 | buffer_limit = 50000 16 | tau = 0.005 # for target network soft update 17 | 18 | class ReplayBuffer(): 19 | def __init__(self): 20 | self.buffer = collections.deque(maxlen=buffer_limit) 21 | 22 | def put(self, transition): 23 | self.buffer.append(transition) 24 | 25 | def sample(self, n): 26 | mini_batch = random.sample(self.buffer, n) 27 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 28 | 29 | for transition in mini_batch: 30 | s, a, r, s_prime, done = transition 31 | s_lst.append(s) 32 | a_lst.append([a]) 33 | r_lst.append([r]) 34 | s_prime_lst.append(s_prime) 35 | 
done_mask = 0.0 if done else 1.0 36 | done_mask_lst.append([done_mask]) 37 | 38 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst, dtype=torch.float), \ 39 | torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \ 40 | torch.tensor(done_mask_lst, dtype=torch.float) 41 | 42 | def size(self): 43 | return len(self.buffer) 44 | 45 | class MuNet(nn.Module): 46 | def __init__(self): 47 | super(MuNet, self).__init__() 48 | self.fc1 = nn.Linear(3, 128) 49 | self.fc2 = nn.Linear(128, 64) 50 | self.fc_mu = nn.Linear(64, 1) 51 | 52 | def forward(self, x): 53 | x = F.relu(self.fc1(x)) 54 | x = F.relu(self.fc2(x)) 55 | mu = torch.tanh(self.fc_mu(x))*2 # Multipled by 2 because the action space of the Pendulum-v0 is [-2,2] 56 | return mu 57 | 58 | class QNet(nn.Module): 59 | def __init__(self): 60 | super(QNet, self).__init__() 61 | self.fc_s = nn.Linear(3, 64) 62 | self.fc_a = nn.Linear(1,64) 63 | self.fc_q = nn.Linear(128, 32) 64 | self.fc_out = nn.Linear(32,1) 65 | 66 | def forward(self, x, a): 67 | h1 = F.relu(self.fc_s(x)) 68 | h2 = F.relu(self.fc_a(a)) 69 | cat = torch.cat([h1,h2], dim=1) 70 | q = F.relu(self.fc_q(cat)) 71 | q = self.fc_out(q) 72 | return q 73 | 74 | class OrnsteinUhlenbeckNoise: 75 | def __init__(self, mu): 76 | self.theta, self.dt, self.sigma = 0.1, 0.01, 0.1 77 | self.mu = mu 78 | self.x_prev = np.zeros_like(self.mu) 79 | 80 | def __call__(self): 81 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \ 82 | self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 83 | self.x_prev = x 84 | return x 85 | 86 | def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer): 87 | s,a,r,s_prime,done_mask = memory.sample(batch_size) 88 | 89 | target = r + gamma * q_target(s_prime, mu_target(s_prime)) * done_mask 90 | q_loss = F.smooth_l1_loss(q(s,a), target.detach()) 91 | q_optimizer.zero_grad() 92 | q_loss.backward() 93 | q_optimizer.step() 94 | 95 | mu_loss = -q(s,mu(s)).mean() # That's all for the policy loss. 
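    # Deterministic policy gradient: ascend Q(s, mu(s)) w.r.t. the actor parameters.
    # The backward pass below also fills gradients for q's parameters, but only
    # mu_optimizer.step() is called here, so this update moves the actor alone.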
96 | mu_optimizer.zero_grad() 97 | mu_loss.backward() 98 | mu_optimizer.step() 99 | 100 | def soft_update(net, net_target): 101 | for param_target, param in zip(net_target.parameters(), net.parameters()): 102 | param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau) 103 | 104 | def main(): 105 | env = gym.make('Pendulum-v1', max_episode_steps=200, autoreset=True) 106 | memory = ReplayBuffer() 107 | 108 | q, q_target = QNet(), QNet() 109 | q_target.load_state_dict(q.state_dict()) 110 | mu, mu_target = MuNet(), MuNet() 111 | mu_target.load_state_dict(mu.state_dict()) 112 | 113 | score = 0.0 114 | print_interval = 20 115 | 116 | mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu) 117 | q_optimizer = optim.Adam(q.parameters(), lr=lr_q) 118 | ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1)) 119 | 120 | for n_epi in range(10000): 121 | s, _ = env.reset() 122 | done = False 123 | 124 | count = 0 125 | while count < 200 and not done: 126 | a = mu(torch.from_numpy(s).float()) 127 | a = a.item() + ou_noise()[0] 128 | s_prime, r, done, truncated, info = env.step([a]) 129 | memory.put((s,a,r/100.0,s_prime,done)) 130 | score +=r 131 | s = s_prime 132 | count += 1 133 | 134 | if memory.size()>2000: 135 | for i in range(10): 136 | train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer) 137 | soft_update(mu, mu_target) 138 | soft_update(q, q_target) 139 | 140 | if n_epi%print_interval==0 and n_epi!=0: 141 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 142 | score = 0.0 143 | 144 | env.close() 145 | 146 | if __name__ == '__main__': 147 | main() -------------------------------------------------------------------------------- /vtrace.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0005 10 | gamma = 0.98 11 | T_horizon = 20 12 | clip_rho_threshold = 1.0 13 | clip_c_threshold = 1.0 14 | print_interval = 20 15 | 16 | class Vtrace(nn.Module): 17 | def __init__(self): 18 | super(Vtrace, self).__init__() 19 | self.data = [] 20 | 21 | self.fc1 = nn.Linear(4,256) 22 | self.fc_pi = nn.Linear(256,2) 23 | self.fc_v = nn.Linear(256,1) 24 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 25 | 26 | self.clip_rho_threshold = torch.tensor(clip_rho_threshold, dtype=torch.float) 27 | self.clip_c_threshold = torch.tensor(clip_c_threshold, dtype=torch.float) 28 | 29 | def pi(self, x, softmax_dim = 0): 30 | x = F.relu(self.fc1(x)) 31 | x = self.fc_pi(x) 32 | prob = F.softmax(x, dim=softmax_dim) 33 | return prob 34 | 35 | def v(self, x): 36 | x = F.relu(self.fc1(x)) 37 | v = self.fc_v(x) 38 | return v 39 | 40 | def put_data(self, transition): 41 | self.data.append(transition) 42 | 43 | def make_batch(self): 44 | s_lst, a_lst, r_lst, s_prime_lst, mu_a_lst, done_lst = [], [], [], [], [], [] 45 | for transition in self.data: 46 | s, a, r, s_prime, mu_a, done = transition 47 | 48 | s_lst.append(s) 49 | a_lst.append([a]) 50 | r_lst.append([r]) 51 | s_prime_lst.append(s_prime) 52 | mu_a_lst.append([mu_a]) 53 | done_mask = 0 if done else 1 54 | done_lst.append([done_mask]) 55 | 56 | s,a,r,s_prime,done_mask, mu_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 57 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 58 | torch.tensor(done_lst, dtype=torch.float), 
torch.tensor(mu_a_lst) 59 | self.data = [] 60 | return s, a, r, s_prime, done_mask, mu_a 61 | 62 | def vtrace(self, s, a, r, s_prime, done_mask, mu_a): 63 | with torch.no_grad(): 64 | pi = self.pi(s, softmax_dim=1) 65 | pi_a = pi.gather(1,a) 66 | v, v_prime = self.v(s), self.v(s_prime) 67 | ratio = torch.exp(torch.log(pi_a) - torch.log(mu_a)) # a/b == exp(log(a)-log(b)) 68 | 69 | rhos = torch.min(self.clip_rho_threshold, ratio) 70 | cs = torch.min(self.clip_c_threshold, ratio).numpy() 71 | td_target = r + gamma * v_prime * done_mask 72 | delta = rhos*(td_target - v).numpy() 73 | 74 | vs_minus_v_xs_lst = [] 75 | vs_minus_v_xs = 0.0 76 | vs_minus_v_xs_lst.append([vs_minus_v_xs]) 77 | 78 | for i in range(len(delta)-1, -1, -1): 79 | vs_minus_v_xs = gamma * cs[i][0] * vs_minus_v_xs + delta[i][0] 80 | vs_minus_v_xs_lst.append([vs_minus_v_xs]) 81 | vs_minus_v_xs_lst.reverse() 82 | 83 | vs_minus_v_xs = torch.tensor(vs_minus_v_xs_lst, dtype=torch.float) 84 | vs = vs_minus_v_xs[:-1] + v.numpy() 85 | vs_prime = vs_minus_v_xs[1:] + v_prime.numpy() 86 | advantage = r + gamma * vs_prime - v.numpy() 87 | 88 | return vs, advantage, rhos 89 | 90 | def train_net(self): 91 | s, a, r, s_prime, done_mask, mu_a = self.make_batch() 92 | vs, advantage, rhos = self.vtrace(s, a, r, s_prime, done_mask, mu_a) 93 | 94 | pi = self.pi(s, softmax_dim=1) 95 | pi_a = pi.gather(1,a) 96 | 97 | val_loss = F.smooth_l1_loss(self.v(s) , vs) 98 | pi_loss = -rhos * torch.log(pi_a) * advantage 99 | loss = pi_loss + val_loss 100 | 101 | self.optimizer.zero_grad() 102 | loss.mean().backward() 103 | self.optimizer.step() 104 | 105 | def main(): 106 | env = gym.make('CartPole-v1') 107 | model = Vtrace() 108 | score = 0.0 109 | 110 | for n_epi in range(10000): 111 | s, _ = env.reset() 112 | done = False 113 | while not done: 114 | for t in range(T_horizon): 115 | prob = model.pi(torch.from_numpy(s).float()) 116 | m = Categorical(prob) 117 | a = m.sample().item() 118 | s_prime, r, done, truncated, info = env.step(a) 119 | 120 | model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done)) 121 | s = s_prime 122 | 123 | score += r 124 | if done: 125 | break 126 | 127 | model.train_net() 128 | 129 | if n_epi%print_interval==0 and n_epi!=0: 130 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 131 | score = 0.0 132 | 133 | env.close() 134 | 135 | if __name__ == '__main__': 136 | main() -------------------------------------------------------------------------------- /acer.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import collections 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.distributions import Categorical 9 | 10 | # Characteristics 11 | # 1. Discrete action space, single thread version. 12 | # 2. Does not support trust-region updates. 
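# 3. The off-policy critic target is the truncated importance-weighted recursion Q_ret
#    from the ACER paper: Q_ret <- r_t + gamma*Q_ret, then Q_ret <- rho_bar_t*(Q_ret - Q(s_t,a_t)) + V(s_t),
#    with rho_bar_t = min(c, pi(a_t|s_t)/mu(a_t|s_t)); the bias-correction term in the loss
#    (weighted by max(0, 1 - c/rho)) compensates for this truncation.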
13 | 14 | #Hyperparameters 15 | learning_rate = 0.0002 16 | gamma = 0.98 17 | buffer_limit = 6000 18 | rollout_len = 10 19 | batch_size = 4 # Indicates 4 sequences per mini-batch (4*rollout_len = 40 samples total) 20 | c = 1.0 # For truncating importance sampling ratio 21 | 22 | class ReplayBuffer(): 23 | def __init__(self): 24 | self.buffer = collections.deque(maxlen=buffer_limit) 25 | 26 | def put(self, seq_data): 27 | self.buffer.append(seq_data) 28 | 29 | def sample(self, on_policy=False): 30 | if on_policy: 31 | mini_batch = [self.buffer[-1]] 32 | else: 33 | mini_batch = random.sample(self.buffer, batch_size) 34 | 35 | s_lst, a_lst, r_lst, prob_lst, done_lst, is_first_lst = [], [], [], [], [], [] 36 | for seq in mini_batch: 37 | is_first = True # Flag for indicating whether the transition is the first item from a sequence 38 | for transition in seq: 39 | s, a, r, prob, done = transition 40 | 41 | s_lst.append(s) 42 | a_lst.append([a]) 43 | r_lst.append(r) 44 | prob_lst.append(prob) 45 | done_mask = 0.0 if done else 1.0 46 | done_lst.append(done_mask) 47 | is_first_lst.append(is_first) 48 | is_first = False 49 | 50 | s,a,r,prob,done_mask,is_first = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 51 | r_lst, torch.tensor(prob_lst, dtype=torch.float), done_lst, \ 52 | is_first_lst 53 | return s,a,r,prob,done_mask,is_first 54 | 55 | def size(self): 56 | return len(self.buffer) 57 | 58 | class ActorCritic(nn.Module): 59 | def __init__(self): 60 | super(ActorCritic, self).__init__() 61 | self.fc1 = nn.Linear(4,256) 62 | self.fc_pi = nn.Linear(256,2) 63 | self.fc_q = nn.Linear(256,2) 64 | 65 | def pi(self, x, softmax_dim = 0): 66 | x = F.relu(self.fc1(x)) 67 | x = self.fc_pi(x) 68 | pi = F.softmax(x, dim=softmax_dim) 69 | return pi 70 | 71 | def q(self, x): 72 | x = F.relu(self.fc1(x)) 73 | q = self.fc_q(x) 74 | return q 75 | 76 | def train(model, optimizer, memory, on_policy=False): 77 | s,a,r,prob,done_mask,is_first = memory.sample(on_policy) 78 | 79 | q = model.q(s) 80 | q_a = q.gather(1,a) 81 | pi = model.pi(s, softmax_dim = 1) 82 | pi_a = pi.gather(1,a) 83 | v = (q * pi).sum(1).unsqueeze(1).detach() 84 | 85 | rho = pi.detach()/prob 86 | rho_a = rho.gather(1,a) 87 | rho_bar = rho_a.clamp(max=c) 88 | correction_coeff = (1-c/rho).clamp(min=0) 89 | 90 | q_ret = v[-1] * done_mask[-1] 91 | q_ret_lst = [] 92 | for i in reversed(range(len(r))): 93 | q_ret = r[i] + gamma * q_ret 94 | q_ret_lst.append(q_ret.item()) 95 | q_ret = rho_bar[i] * (q_ret - q_a[i]) + v[i] 96 | 97 | if is_first[i] and i!=0: 98 | q_ret = v[i-1] * done_mask[i-1] # When a new sequence begins, q_ret is initialized 99 | 100 | q_ret_lst.reverse() 101 | q_ret = torch.tensor(q_ret_lst, dtype=torch.float).unsqueeze(1) 102 | 103 | loss1 = -rho_bar * torch.log(pi_a) * (q_ret - v) 104 | loss2 = -correction_coeff * pi.detach() * torch.log(pi) * (q.detach()-v) # bias correction term 105 | loss = loss1 + loss2.sum(1) + F.smooth_l1_loss(q_a, q_ret) 106 | 107 | optimizer.zero_grad() 108 | loss.mean().backward() 109 | optimizer.step() 110 | 111 | def main(): 112 | env = gym.make('CartPole-v1') 113 | memory = ReplayBuffer() 114 | model = ActorCritic() 115 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 116 | 117 | score = 0.0 118 | print_interval = 20 119 | 120 | for n_epi in range(10000): 121 | s, _ = env.reset() 122 | done = False 123 | 124 | while not done: 125 | seq_data = [] 126 | for t in range(rollout_len): 127 | prob = model.pi(torch.from_numpy(s).float()) 128 | a = Categorical(prob).sample().item() 129 
| s_prime, r, done, truncated, info = env.step(a) 130 | seq_data.append((s, a, r/100.0, prob.detach().numpy(), done)) 131 | 132 | score +=r 133 | s = s_prime 134 | if done: 135 | break 136 | 137 | memory.put(seq_data) 138 | if memory.size()>500: 139 | train(model, optimizer, memory, on_policy=True) 140 | train(model, optimizer, memory) 141 | 142 | if n_epi%print_interval==0 and n_epi!=0: 143 | print("# of episode :{}, avg score : {:.1f}, buffer size : {}".format(n_epi, score/print_interval, memory.size())) 144 | score = 0.0 145 | 146 | env.close() 147 | 148 | if __name__ == '__main__': 149 | main() -------------------------------------------------------------------------------- /ppo-continuous.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Normal 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0003 10 | gamma = 0.9 11 | lmbda = 0.9 12 | eps_clip = 0.2 13 | K_epoch = 10 14 | rollout_len = 3 15 | buffer_size = 10 16 | minibatch_size = 32 17 | 18 | class PPO(nn.Module): 19 | def __init__(self): 20 | super(PPO, self).__init__() 21 | self.data = [] 22 | 23 | self.fc1 = nn.Linear(3,128) 24 | self.fc_mu = nn.Linear(128,1) 25 | self.fc_std = nn.Linear(128,1) 26 | self.fc_v = nn.Linear(128,1) 27 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 28 | self.optimization_step = 0 29 | 30 | def pi(self, x, softmax_dim = 0): 31 | x = F.relu(self.fc1(x)) 32 | mu = 2.0*torch.tanh(self.fc_mu(x)) 33 | std = F.softplus(self.fc_std(x)) 34 | return mu, std 35 | 36 | def v(self, x): 37 | x = F.relu(self.fc1(x)) 38 | v = self.fc_v(x) 39 | return v 40 | 41 | def put_data(self, transition): 42 | self.data.append(transition) 43 | 44 | def make_batch(self): 45 | s_batch, a_batch, r_batch, s_prime_batch, prob_a_batch, done_batch = [], [], [], [], [], [] 46 | data = [] 47 | 48 | for j in range(buffer_size): 49 | for i in range(minibatch_size): 50 | rollout = self.data.pop() 51 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] 52 | 53 | for transition in rollout: 54 | s, a, r, s_prime, prob_a, done = transition 55 | 56 | s_lst.append(s) 57 | a_lst.append([a]) 58 | r_lst.append([r]) 59 | s_prime_lst.append(s_prime) 60 | prob_a_lst.append([prob_a]) 61 | done_mask = 0 if done else 1 62 | done_lst.append([done_mask]) 63 | 64 | s_batch.append(s_lst) 65 | a_batch.append(a_lst) 66 | r_batch.append(r_lst) 67 | s_prime_batch.append(s_prime_lst) 68 | prob_a_batch.append(prob_a_lst) 69 | done_batch.append(done_lst) 70 | 71 | mini_batch = torch.tensor(s_batch, dtype=torch.float), torch.tensor(a_batch, dtype=torch.float), \ 72 | torch.tensor(r_batch, dtype=torch.float), torch.tensor(s_prime_batch, dtype=torch.float), \ 73 | torch.tensor(done_batch, dtype=torch.float), torch.tensor(prob_a_batch, dtype=torch.float) 74 | data.append(mini_batch) 75 | 76 | return data 77 | 78 | def calc_advantage(self, data): 79 | data_with_adv = [] 80 | for mini_batch in data: 81 | s, a, r, s_prime, done_mask, old_log_prob = mini_batch 82 | with torch.no_grad(): 83 | td_target = r + gamma * self.v(s_prime) * done_mask 84 | delta = td_target - self.v(s) 85 | delta = delta.numpy() 86 | 87 | advantage_lst = [] 88 | advantage = 0.0 89 | for delta_t in delta[::-1]: 90 | advantage = gamma * lmbda * advantage + delta_t[0] 91 | advantage_lst.append([advantage]) 92 | advantage_lst.reverse() 93 | advantage = 
torch.tensor(advantage_lst, dtype=torch.float) 94 | data_with_adv.append((s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage)) 95 | 96 | return data_with_adv 97 | 98 | 99 | def train_net(self): 100 | if len(self.data) == minibatch_size * buffer_size: 101 | data = self.make_batch() 102 | data = self.calc_advantage(data) 103 | 104 | for i in range(K_epoch): 105 | for mini_batch in data: 106 | s, a, r, s_prime, done_mask, old_log_prob, td_target, advantage = mini_batch 107 | 108 | mu, std = self.pi(s, softmax_dim=1) 109 | dist = Normal(mu, std) 110 | log_prob = dist.log_prob(a) 111 | ratio = torch.exp(log_prob - old_log_prob) # a/b == exp(log(a)-log(b)) 112 | 113 | surr1 = ratio * advantage 114 | surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage 115 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target) 116 | 117 | self.optimizer.zero_grad() 118 | loss.mean().backward() 119 | nn.utils.clip_grad_norm_(self.parameters(), 1.0) 120 | self.optimizer.step() 121 | self.optimization_step += 1 122 | 123 | def main(): 124 | env = gym.make('Pendulum-v1') 125 | model = PPO() 126 | score = 0.0 127 | print_interval = 20 128 | rollout = [] 129 | 130 | for n_epi in range(10000): 131 | s, _ = env.reset() 132 | done = False 133 | count = 0 134 | while count < 200 and not done: 135 | for t in range(rollout_len): 136 | mu, std = model.pi(torch.from_numpy(s).float()) 137 | dist = Normal(mu, std) 138 | a = dist.sample() 139 | log_prob = dist.log_prob(a) 140 | s_prime, r, done, truncated, info = env.step([a.item()]) 141 | 142 | rollout.append((s, a, r/10.0, s_prime, log_prob.item(), done)) 143 | if len(rollout) == rollout_len: 144 | model.put_data(rollout) 145 | rollout = [] 146 | 147 | s = s_prime 148 | score += r 149 | count += 1 150 | 151 | model.train_net() 152 | 153 | if n_epi%print_interval==0 and n_epi!=0: 154 | print("# of episode :{}, avg score : {:.1f}, optmization step: {}".format(n_epi, score/print_interval, model.optimization_step)) 155 | score = 0.0 156 | 157 | env.close() 158 | 159 | if __name__ == '__main__': 160 | main() -------------------------------------------------------------------------------- /a2c.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import numpy as np 9 | 10 | # Hyperparameters 11 | n_train_processes = 3 12 | learning_rate = 0.0002 13 | update_interval = 5 14 | gamma = 0.98 15 | max_train_steps = 60000 16 | PRINT_INTERVAL = update_interval * 100 17 | 18 | class ActorCritic(nn.Module): 19 | def __init__(self): 20 | super(ActorCritic, self).__init__() 21 | self.fc1 = nn.Linear(4, 256) 22 | self.fc_pi = nn.Linear(256, 2) 23 | self.fc_v = nn.Linear(256, 1) 24 | 25 | def pi(self, x, softmax_dim=1): 26 | x = F.relu(self.fc1(x)) 27 | x = self.fc_pi(x) 28 | prob = F.softmax(x, dim=softmax_dim) 29 | return prob 30 | 31 | def v(self, x): 32 | x = F.relu(self.fc1(x)) 33 | v = self.fc_v(x) 34 | return v 35 | 36 | def worker(worker_id, master_end, worker_end): 37 | master_end.close() # Forbid worker to use the master end for messaging 38 | env = gym.make('CartPole-v1') 39 | env.seed(worker_id) 40 | 41 | while True: 42 | cmd, data = worker_end.recv() 43 | if cmd == 'step': 44 | ob, reward, done, info = env.step(data) 45 | if done: 46 | ob = env.reset() 47 | worker_end.send((ob, reward, done, 
info)) 48 | elif cmd == 'reset': 49 | ob = env.reset() 50 | worker_end.send(ob) 51 | elif cmd == 'reset_task': 52 | ob = env.reset_task() 53 | worker_end.send(ob) 54 | elif cmd == 'close': 55 | worker_end.close() 56 | break 57 | elif cmd == 'get_spaces': 58 | worker_end.send((env.observation_space, env.action_space)) 59 | else: 60 | raise NotImplementedError 61 | 62 | class ParallelEnv: 63 | def __init__(self, n_train_processes): 64 | self.nenvs = n_train_processes 65 | self.waiting = False 66 | self.closed = False 67 | self.workers = list() 68 | 69 | master_ends, worker_ends = zip(*[mp.Pipe() for _ in range(self.nenvs)]) 70 | self.master_ends, self.worker_ends = master_ends, worker_ends 71 | 72 | for worker_id, (master_end, worker_end) in enumerate(zip(master_ends, worker_ends)): 73 | p = mp.Process(target=worker, 74 | args=(worker_id, master_end, worker_end)) 75 | p.daemon = True 76 | p.start() 77 | self.workers.append(p) 78 | 79 | # Forbid master to use the worker end for messaging 80 | for worker_end in worker_ends: 81 | worker_end.close() 82 | 83 | def step_async(self, actions): 84 | for master_end, action in zip(self.master_ends, actions): 85 | master_end.send(('step', action)) 86 | self.waiting = True 87 | 88 | def step_wait(self): 89 | results = [master_end.recv() for master_end in self.master_ends] 90 | self.waiting = False 91 | obs, rews, dones, infos = zip(*results) 92 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 93 | 94 | def reset(self): 95 | for master_end in self.master_ends: 96 | master_end.send(('reset', None)) 97 | return np.stack([master_end.recv() for master_end in self.master_ends]) 98 | 99 | def step(self, actions): 100 | self.step_async(actions) 101 | return self.step_wait() 102 | 103 | def close(self): # For clean up resources 104 | if self.closed: 105 | return 106 | if self.waiting: 107 | [master_end.recv() for master_end in self.master_ends] 108 | for master_end in self.master_ends: 109 | master_end.send(('close', None)) 110 | for worker in self.workers: 111 | worker.join() 112 | self.closed = True 113 | 114 | def test(step_idx, model): 115 | env = gym.make('CartPole-v1') 116 | score = 0.0 117 | done = False 118 | num_test = 10 119 | 120 | for _ in range(num_test): 121 | s = env.reset() 122 | while not done: 123 | prob = model.pi(torch.from_numpy(s).float(), softmax_dim=0) 124 | a = Categorical(prob).sample().numpy() 125 | s_prime, r, done, info = env.step(a) 126 | s = s_prime 127 | score += r 128 | done = False 129 | print(f"Step # :{step_idx}, avg score : {score/num_test:.1f}") 130 | 131 | env.close() 132 | 133 | def compute_target(v_final, r_lst, mask_lst): 134 | G = v_final.reshape(-1) 135 | td_target = list() 136 | 137 | for r, mask in zip(r_lst[::-1], mask_lst[::-1]): 138 | G = r + gamma * G * mask 139 | td_target.append(G) 140 | 141 | return torch.tensor(td_target[::-1]).float() 142 | 143 | if __name__ == '__main__': 144 | envs = ParallelEnv(n_train_processes) 145 | 146 | model = ActorCritic() 147 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 148 | 149 | step_idx = 0 150 | s = envs.reset() 151 | while step_idx < max_train_steps: 152 | s_lst, a_lst, r_lst, mask_lst = list(), list(), list(), list() 153 | for _ in range(update_interval): 154 | prob = model.pi(torch.from_numpy(s).float()) 155 | a = Categorical(prob).sample().numpy() 156 | s_prime, r, done, info = envs.step(a) 157 | 158 | s_lst.append(s) 159 | a_lst.append(a) 160 | r_lst.append(r/100.0) 161 | mask_lst.append(1 - done) 162 | 163 | s = s_prime 164 | step_idx += 1 
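        # n-step bootstrap: evaluate V(s') for the last states of each worker, then
        # compute_target() rolls the collected rewards backwards (masked at episode
        # ends) to form the TD targets used for both the critic loss and the advantage.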
165 | 166 | s_final = torch.from_numpy(s_prime).float() 167 | v_final = model.v(s_final).detach().clone().numpy() 168 | td_target = compute_target(v_final, r_lst, mask_lst) 169 | 170 | td_target_vec = td_target.reshape(-1) 171 | s_vec = torch.tensor(s_lst).float().reshape(-1, 4) # 4 == Dimension of state 172 | a_vec = torch.tensor(a_lst).reshape(-1).unsqueeze(1) 173 | advantage = td_target_vec - model.v(s_vec).reshape(-1) 174 | 175 | pi = model.pi(s_vec, softmax_dim=1) 176 | pi_a = pi.gather(1, a_vec).reshape(-1) 177 | loss = -(torch.log(pi_a) * advantage.detach()).mean() +\ 178 | F.smooth_l1_loss(model.v(s_vec).reshape(-1), td_target_vec) 179 | 180 | optimizer.zero_grad() 181 | loss.backward() 182 | optimizer.step() 183 | 184 | if step_idx % PRINT_INTERVAL == 0: 185 | test(step_idx, model) 186 | 187 | envs.close() -------------------------------------------------------------------------------- /sac.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Normal 7 | import numpy as np 8 | import collections, random 9 | 10 | #Hyperparameters 11 | lr_pi = 0.0005 12 | lr_q = 0.001 13 | init_alpha = 0.01 14 | gamma = 0.98 15 | batch_size = 32 16 | buffer_limit = 50000 17 | tau = 0.01 # for target network soft update 18 | target_entropy = -1.0 # for automated alpha update 19 | lr_alpha = 0.001 # for automated alpha update 20 | 21 | class ReplayBuffer(): 22 | def __init__(self): 23 | self.buffer = collections.deque(maxlen=buffer_limit) 24 | 25 | def put(self, transition): 26 | self.buffer.append(transition) 27 | 28 | def sample(self, n): 29 | mini_batch = random.sample(self.buffer, n) 30 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 31 | 32 | for transition in mini_batch: 33 | s, a, r, s_prime, done = transition 34 | s_lst.append(s) 35 | a_lst.append([a]) 36 | r_lst.append([r]) 37 | s_prime_lst.append(s_prime) 38 | done_mask = 0.0 if done else 1.0 39 | done_mask_lst.append([done_mask]) 40 | 41 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst, dtype=torch.float), \ 42 | torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \ 43 | torch.tensor(done_mask_lst, dtype=torch.float) 44 | 45 | def size(self): 46 | return len(self.buffer) 47 | 48 | class PolicyNet(nn.Module): 49 | def __init__(self, learning_rate): 50 | super(PolicyNet, self).__init__() 51 | self.fc1 = nn.Linear(3, 128) 52 | self.fc_mu = nn.Linear(128,1) 53 | self.fc_std = nn.Linear(128,1) 54 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 55 | 56 | self.log_alpha = torch.tensor(np.log(init_alpha)) 57 | self.log_alpha.requires_grad = True 58 | self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_alpha) 59 | 60 | def forward(self, x): 61 | x = F.relu(self.fc1(x)) 62 | mu = self.fc_mu(x) 63 | std = F.softplus(self.fc_std(x)) 64 | dist = Normal(mu, std) 65 | action = dist.rsample() 66 | log_prob = dist.log_prob(action) 67 | real_action = torch.tanh(action) 68 | real_log_prob = log_prob - torch.log(1-torch.tanh(action).pow(2) + 1e-7) 69 | return real_action, real_log_prob 70 | 71 | def train_net(self, q1, q2, mini_batch): 72 | s, _, _, _, _ = mini_batch 73 | a, log_prob = self.forward(s) 74 | entropy = -self.log_alpha.exp() * log_prob 75 | 76 | q1_val, q2_val = q1(s,a), q2(s,a) 77 | q1_q2 = torch.cat([q1_val, q2_val], dim=1) 78 | min_q = torch.min(q1_q2, 
1, keepdim=True)[0] 79 | 80 | loss = -min_q - entropy # for gradient ascent 81 | self.optimizer.zero_grad() 82 | loss.mean().backward() 83 | self.optimizer.step() 84 | 85 | self.log_alpha_optimizer.zero_grad() 86 | alpha_loss = -(self.log_alpha.exp() * (log_prob + target_entropy).detach()).mean() 87 | alpha_loss.backward() 88 | self.log_alpha_optimizer.step() 89 | 90 | class QNet(nn.Module): 91 | def __init__(self, learning_rate): 92 | super(QNet, self).__init__() 93 | self.fc_s = nn.Linear(3, 64) 94 | self.fc_a = nn.Linear(1,64) 95 | self.fc_cat = nn.Linear(128,32) 96 | self.fc_out = nn.Linear(32,1) 97 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 98 | 99 | def forward(self, x, a): 100 | h1 = F.relu(self.fc_s(x)) 101 | h2 = F.relu(self.fc_a(a)) 102 | cat = torch.cat([h1,h2], dim=1) 103 | q = F.relu(self.fc_cat(cat)) 104 | q = self.fc_out(q) 105 | return q 106 | 107 | def train_net(self, target, mini_batch): 108 | s, a, r, s_prime, done = mini_batch 109 | loss = F.smooth_l1_loss(self.forward(s, a) , target) 110 | self.optimizer.zero_grad() 111 | loss.mean().backward() 112 | self.optimizer.step() 113 | 114 | def soft_update(self, net_target): 115 | for param_target, param in zip(net_target.parameters(), self.parameters()): 116 | param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau) 117 | 118 | def calc_target(pi, q1, q2, mini_batch): 119 | s, a, r, s_prime, done = mini_batch 120 | 121 | with torch.no_grad(): 122 | a_prime, log_prob= pi(s_prime) 123 | entropy = -pi.log_alpha.exp() * log_prob 124 | q1_val, q2_val = q1(s_prime,a_prime), q2(s_prime,a_prime) 125 | q1_q2 = torch.cat([q1_val, q2_val], dim=1) 126 | min_q = torch.min(q1_q2, 1, keepdim=True)[0] 127 | target = r + gamma * done * (min_q + entropy) 128 | 129 | return target 130 | 131 | def main(): 132 | env = gym.make('Pendulum-v1') 133 | memory = ReplayBuffer() 134 | q1, q2, q1_target, q2_target = QNet(lr_q), QNet(lr_q), QNet(lr_q), QNet(lr_q) 135 | pi = PolicyNet(lr_pi) 136 | 137 | q1_target.load_state_dict(q1.state_dict()) 138 | q2_target.load_state_dict(q2.state_dict()) 139 | 140 | score = 0.0 141 | print_interval = 20 142 | 143 | for n_epi in range(10000): 144 | s, _ = env.reset() 145 | done = False 146 | count = 0 147 | 148 | while count < 200 and not done: 149 | a, log_prob= pi(torch.from_numpy(s).float()) 150 | s_prime, r, done, truncated, info = env.step([2.0*a.item()]) 151 | memory.put((s, a.item(), r/10.0, s_prime, done)) 152 | score +=r 153 | s = s_prime 154 | count += 1 155 | 156 | if memory.size()>1000: 157 | for i in range(20): 158 | mini_batch = memory.sample(batch_size) 159 | td_target = calc_target(pi, q1_target, q2_target, mini_batch) 160 | q1.train_net(td_target, mini_batch) 161 | q2.train_net(td_target, mini_batch) 162 | entropy = pi.train_net(q1, q2, mini_batch) 163 | q1.soft_update(q1_target) 164 | q2.soft_update(q2_target) 165 | 166 | if n_epi%print_interval==0 and n_epi!=0: 167 | print("# of episode :{}, avg score : {:.1f} alpha:{:.4f}".format(n_epi, score/print_interval, pi.log_alpha.exp())) 168 | score = 0.0 169 | 170 | env.close() 171 | 172 | if __name__ == '__main__': 173 | main() --------------------------------------------------------------------------------