├── README.md ├── REINFORCE.py ├── acer.py ├── actor-critic.py ├── apex-dqn.py ├── asynchronous-ppo.py ├── c51.py ├── curiosity-exploration.py ├── ddpg.py ├── diayn.py ├── dqn.py ├── dueling-double-per-dqn.py ├── goalgan.py ├── iqn.py ├── ppo.py ├── qr-dqn.py ├── sac.py ├── single-acer.py └── td3.py /README.md: -------------------------------------------------------------------------------- 1 | # Deep-reinforcement-learning-pytorch 2 | #### Popular deep-rl algorithms will be implemented here 3 | ## Algorithms available now 4 | 1. [DQN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/dqn.py) 5 | 2. [Dueling-Double-PER-DQN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/dueling-double-per-dqn.py) 6 | 3. [Ape-X DQN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/apex-dqn.py) 7 | 4. [REINFORCE](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/REINFORCE.py) 8 | 5. [Vanilla Actor-Critic](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/actor-critic.py) 9 | 6. [PPO](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/ppo.py) 10 | 7. [DDPG](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/ddpg.py) 11 | 8. [TD3](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/td3.py) 12 | 9. [SAC](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/sac.py) 13 | 10. [Asynchronous-PPO(A3C)](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/asynchronous-ppo.py) 14 | 11. [ACER(single-thread)](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/single-acer.py) 15 | 12. [ACER](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/acer.py) 16 | ## Algorithms coming soon 17 | 13. [C51](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/c51.py) 18 | 14. [QR-DQN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/qr-dqn.py) 19 | 15. [IQN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/iqn.py) 20 | 16. [DIAYN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/diayn.py) 21 | 17. [Curiosity-Exploration](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/curiosity-exploration.py) 22 | 18. [GoalGAN](https://github.com/rl-max/deep-reinforcement-learning-pytorch/blob/main/goalgan.py) 23 | 24 | (These algorithms require further testing) 25 | ## Algorithms to be added in the future 26 | 19. FQF(Fully parameterized Quantile Function) 27 | 20. D4PG 28 | 21. IMPALA 29 | 22. Synchronous-PPO 30 | 23. 
TRPO -------------------------------------------------------------------------------- /REINFORCE.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | EPISODES = 10000 10 | learning_rate = 0.0002 11 | discount_factor = 0.98 12 | 13 | class PolicyNet(nn.Module): 14 | def __init__(self): 15 | super().__init__() 16 | self.fc1 = nn.Linear(4, 128) 17 | self.fc2 = nn.Linear(128, 128) 18 | self.fc3 = nn.Linear(128, 2) 19 | 20 | def forward(self, x): 21 | x = F.relu(self.fc1(x)) 22 | x = F.relu(self.fc2(x)) 23 | x = F.softmax(self.fc3(x), dim=0) 24 | return x 25 | 26 | def train(net, optimizer, samples): 27 | R, loss = 0, 0 28 | optimizer.zero_grad() 29 | for prob, r in reversed(samples): 30 | R = r + discount_factor * R 31 | loss = -torch.log(prob) * R 32 | loss.backward() 33 | optimizer.step() 34 | 35 | if __name__ == '__main__': 36 | env = gym.make('CartPole-v1') 37 | net = PolicyNet() 38 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 39 | score = 0.0 40 | 41 | for ep in range(EPISODES): 42 | obs = env.reset() 43 | samples = [] 44 | done = False 45 | while not done: 46 | prob = net(torch.tensor(obs).float()) 47 | prob_ = Categorical(prob) 48 | action = prob_.sample().item() 49 | next_obs, reward, done, info = env.step(action) 50 | samples.append((prob[action], reward/100.0)) 51 | score += reward 52 | obs = next_obs 53 | 54 | train(net, optimizer, samples) 55 | 56 | if ep%10==0 and ep!=0: 57 | print('episode:{}, avg_score:{}'.format(ep, score/10.0)) 58 | score = 0.0 59 | env.close() -------------------------------------------------------------------------------- /acer.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import torch.autograd as autograd 7 | import random 8 | import itertools 9 | import threading as T 10 | from collections import deque 11 | from torch.distributions import Categorical 12 | 13 | #Hyperparameters 14 | EPISODES = 10000 15 | learning_rate = 0.0002 16 | discount_factor = 0.98 17 | num_agents = 3 18 | train_interval = 10 19 | replay_iter = 8 20 | buffer_len, start_train = 20000, 500 21 | is_clipping = 1.2 22 | trpo_delta = 1.0 23 | avgnet_ratio = 0.995 24 | 25 | class Network(nn.Module): 26 | def __init__(self): 27 | super().__init__() 28 | self.fc1 = nn.Linear(4, 128) 29 | self.fc2 = nn.Linear(128, 128) 30 | self.policy = nn.Linear(128, 2) 31 | self.qval = nn.Linear(128, 2) 32 | 33 | def p(self, x): 34 | x = F.relu(self.fc1(x)) 35 | self.pi = F.relu(self.fc2(x)) 36 | self.pi.retain_grad() 37 | prob = F.softmax(self.policy(self.pi), dim=1) 38 | return prob 39 | 40 | def q(self, x): 41 | x = F.relu(self.fc1(x)) 42 | x = F.relu(self.fc2(x)) 43 | return self.qval(x) 44 | 45 | def mini_batch(data): 46 | obs, acts, probs, rewards, next_obs, done = [], [], [], [], [], [] 47 | for transition in data: 48 | s, a, p, r, s_, d = transition 49 | obs.append(s); acts.append(a); probs.append(p); rewards.append(r) 50 | next_obs.append(s_), done.append(d) 51 | if d: 52 | break 53 | 54 | return torch.tensor(obs).float(), torch.tensor(acts), \ 55 | torch.stack(probs, dim=0).float(), torch.tensor(rewards).float(),\ 56 | torch.tensor(next_obs).float(), torch.tensor(done) 57 | 58 | def 
train_process(net, global_avgnet, samples, global_optimizer): 59 | obs, acts, old_probs, rewards, next_obs, done = samples 60 | acts, rewards = acts.view(-1, 1), rewards.view(-1, 1) 61 | final_q, final_p = net.q(next_obs[-1].unsqueeze(0)), net.p(next_obs[-1].unsqueeze(0)) 62 | final_v = torch.sum(final_q * final_p, dim=1) 63 | qval = net.q(obs) 64 | current_p = net.p(obs) 65 | avg_p = global_avgnet.p(obs) 66 | value = torch.sum(qval*current_p, dim=1, keepdim=True) 67 | 68 | act_q = qval.gather(1, acts) 69 | ratio = torch.exp(torch.log(current_p) - torch.log(old_probs)) 70 | ret_ratio = torch.min(torch.tensor(1.0), ratio.gather(1, acts)) 71 | policy_ratio = torch.min(torch.tensor(is_clipping), ratio.gather(1, acts)) 72 | 73 | ret_q = [] 74 | R = final_v if not done[-1] else torch.tensor([0.0]) 75 | for idx, r in enumerate(torch.flip(rewards, [0, 1])): 76 | R = r + discount_factor * R 77 | ret_q.append(R) 78 | R = ret_ratio[-1-idx]*(R - act_q[-1-idx]) + value[-1-idx] 79 | ret_q.reverse() 80 | ret_q = torch.stack(ret_q, dim=0) 81 | 82 | p_obj1 = policy_ratio.detach() * torch.log(current_p.gather(1, acts)) * \ 83 | (ret_q - value).detach() 84 | p_obj2 = 0 85 | for a in range(2): 86 | coeff = torch.max(torch.tensor(0), 1-is_clipping/ratio[:, a]).view(-1, 1) 87 | a_prob, a_qval = current_p[:, a].view(-1, 1), qval[:, a].view(-1, 1) 88 | p_obj2 += (coeff*a_prob).detach() * torch.log(a_prob) * (a_qval - value).detach() 89 | 90 | policy_obj = (p_obj1 + p_obj2).mean() 91 | 92 | g = autograd.grad(policy_obj, net.pi, retain_graph=True)[0] 93 | kld = F.kl_div(avg_p.detach(), current_p) 94 | k = autograd.grad(kld, net.pi, retain_graph=True)[0] 95 | #trust-region update 96 | k_norm = torch.linalg.norm(k, dim=1).view(-1, 1, 1)**2 97 | g_, k_ = g.unsqueeze(2), k.unsqueeze(1) 98 | solve = (torch.bmm(k_, g_) - trpo_delta) / k_norm 99 | new_g = g - torch.max(torch.tensor(0), solve.view(-1, 1))*k 100 | q_loss = F.smooth_l1_loss(act_q, ret_q.detach()) 101 | 102 | global_optimizer.zero_grad() 103 | net.policy.weight._grad = autograd.grad(-policy_obj, net.policy.weight, retain_graph=True)[0] 104 | net.pi.backward(-new_g) 105 | q_loss.backward() 106 | for global_param, local_param in zip(global_net.parameters(), net.parameters()): 107 | global_param._grad = local_param.grad 108 | global_optimizer.step() 109 | 110 | def train(net, global_avgnet, online_sample, buffer, global_optimizer): 111 | train_process(net, global_avgnet, mini_batch(online_sample), global_optimizer) 112 | 113 | if len(buffer) > start_train: 114 | for _ in range(replay_iter): 115 | key = random.randint(0, len(buffer)-train_interval) 116 | replay_sample = itertools.islice(buffer, key, key+train_interval) 117 | train_process(net, global_avgnet, mini_batch(replay_sample), global_optimizer) 118 | 119 | def agent(rank): 120 | env = gym.make('CartPole-v1') 121 | net = Network() 122 | net.load_state_dict(global_net.state_dict()) 123 | global_optimizer = optim.Adam(global_net.parameters(), lr=learning_rate) 124 | buffer = deque(maxlen=buffer_len) 125 | samples, score, step = [], 0.0, 0 126 | 127 | for ep in range(EPISODES): 128 | obs = env.reset() 129 | done = False 130 | while not done: 131 | prob = net.p(torch.tensor(obs).unsqueeze(0).float()) 132 | prob_ = Categorical(prob) 133 | action = prob_.sample().item() 134 | next_obs, reward, done, info = env.step(action) 135 | data = (obs, action, prob[0], reward/100.0, next_obs, done) 136 | samples.append(data) 137 | buffer.append(data) 138 | score += reward 139 | step += 1 140 | obs = next_obs 141 | 142 | 
if step%train_interval==0: 143 | train(net, global_avgnet, samples, buffer, global_optimizer) 144 | for a_param, param in zip(global_avgnet.parameters(), global_net.parameters()): 145 | a_param.data.copy_(a_param.data*avgnet_ratio + param.data*(1-avgnet_ratio)) 146 | net.load_state_dict(global_net.state_dict()) 147 | samples = [] 148 | 149 | if ep%10==0 and ep!=0: 150 | print('agent_num:{}, episode:{}, avg_score:{}'.format(rank,ep,score/10.0)) 151 | score = 0.0 152 | env.close() 153 | 154 | if __name__ == '__main__': 155 | global_net, global_avgnet = Network(), Network() 156 | global_avgnet.load_state_dict(global_net.state_dict()) 157 | agents = [] 158 | for rank in range(num_agents): 159 | actor = T.Thread(target=agent, args=(rank,)) 160 | actor.start() 161 | agents.append(actor) 162 | for t in agents: 163 | t.join() -------------------------------------------------------------------------------- /actor-critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | EPISODES = 10000 10 | learning_rate = 0.0002 11 | discount_factor = 0.98 12 | train_interval = 20 13 | 14 | class Network(nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.fc1 = nn.Linear(4, 128) 18 | self.fc2 = nn.Linear(128, 128) 19 | self.p = nn.Linear(128, 2) 20 | self.value = nn.Linear(128, 1) 21 | 22 | def pi(self, x): 23 | x = F.relu(self.fc1(x)) 24 | x = F.relu(self.fc2(x)) 25 | prob = F.softmax(self.p(x), dim=1) 26 | return prob 27 | 28 | def v(self, x): 29 | x = F.relu(self.fc1(x)) 30 | x = F.relu(self.fc2(x)) 31 | return self.value(x) 32 | 33 | def train(net, optimizer, samples): 34 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 35 | for transition in samples: 36 | s, a, r, s_, d = transition 37 | d = 0.0 if d else 1.0 38 | obs.append(s); acts.append(a); rewards.append(r) 39 | next_obs.append(s_), done.append(d) 40 | 41 | obs, acts, rewards, next_obs, done = torch.tensor(obs).float(),\ 42 | torch.tensor(acts), torch.tensor(rewards).float(), torch.tensor(next_obs).float(),\ 43 | torch.tensor(done) 44 | 45 | target = rewards.view(-1, 1) + discount_factor * net.v(next_obs) * done.view(-1, 1) 46 | td = target - net.v(obs) 47 | prob = net.pi(obs).gather(1, acts.view(-1, 1)) 48 | 49 | loss = -torch.log(prob) * td.detach() + F.smooth_l1_loss(net.v(obs), target.detach()) 50 | 51 | optimizer.zero_grad() 52 | loss.mean().backward() 53 | optimizer.step() 54 | 55 | 56 | if __name__ == '__main__': 57 | env = gym.make('CartPole-v1') 58 | net = Network() 59 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 60 | samples, score, step = [], 0.0, 0 61 | 62 | for ep in range(EPISODES): 63 | obs = env.reset() 64 | done = False 65 | while not done: 66 | prob = net.pi(torch.tensor(obs).unsqueeze(0).float()) 67 | prob_ = Categorical(prob) 68 | action = prob_.sample().item() 69 | next_obs, reward, done, info = env.step(action) 70 | samples.append((obs, action, reward/100.0, next_obs, done)) 71 | score += reward 72 | step += 1 73 | obs = next_obs 74 | 75 | if step%train_interval==0: 76 | train(net, optimizer, samples) 77 | samples = [] 78 | 79 | if ep%10==0 and ep!=0: 80 | print('episode:{}, num_train:{}, avg_score:{}'.format(ep, \ 81 | step//train_interval, score/10.0)) 82 | score = 0.0 83 | env.close() 
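The greedy-evaluation helper below is a minimal sketch, not part of the repository: it assumes the old gym API used throughout these scripts (env.reset() returning the observation, env.step() returning a 4-tuple) and a trained policy object exposing the pi() method defined in actor-critic.py above (ppo.py and asynchronous-ppo.py expose the same interface). The function name evaluate and the greedy action choice are illustrative only.

    import gym
    import torch

    def evaluate(net, episodes=10):
        """Roll out the learned policy greedily and return the average score."""
        env = gym.make('CartPole-v1')
        total = 0.0
        for _ in range(episodes):
            obs, done = env.reset(), False
            while not done:
                with torch.no_grad():
                    prob = net.pi(torch.tensor(obs).unsqueeze(0).float())
                action = prob.argmax(dim=1).item()  # greedy action at test time
                obs, reward, done, info = env.step(action)
                total += reward
        env.close()
        return total / episodes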
-------------------------------------------------------------------------------- /apex-dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | import numpy as np 8 | import threading as T 9 | import time 10 | from collections import deque 11 | 12 | #Hyperparameters 13 | EPISODES = 10000 14 | num_actors = 3 15 | learning_rate = 0.0005 16 | discount_factor = 0.98 17 | buffer_size, start_train = 100000, 2000 18 | epsilon, alpha, beta = 0.1, 0.6, 0.4 #for PER 19 | batch_size = 64 20 | 21 | class QNet(nn.Module): 22 | def __init__(self): 23 | super().__init__() 24 | self.fc1 = nn.Linear(4, 128) 25 | self.fc2 = nn.Linear(128, 128) 26 | self.v = nn.Linear(128, 1) 27 | self.adv = nn.Linear(128, 2) 28 | 29 | def forward(self, x): 30 | x = F.relu(self.fc1(x)) 31 | x = F.relu(self.fc2(x)) 32 | v = self.v(x) 33 | adv = self.adv(x) 34 | #Dueling network architecture 35 | mean_adv = 0.5*torch.sum(adv, dim=1, keepdim=True) 36 | q = v + adv - mean_adv 37 | return q 38 | 39 | def mini_batch(buffer, priority, new_step): 40 | for _ in range(new_step): 41 | priority.append(None) 42 | priority = np.array(priority) 43 | 44 | real_p = priority[priority!=None] #get real(calculated from TD-error) priority 45 | max_p = max(real_p) if len(real_p)!=0 else 1.0 46 | #priority of unvisited data should be max-priority 47 | prior = [p**alpha if p!=None else max_p**alpha for p in priority] 48 | prob = np.array(prior)/sum(prior) 49 | 50 | indices = np.random.choice(len(buffer), batch_size, p=prob) 51 | mini_batch = np.array(buffer, dtype=object)[indices] 52 | indices_prob = prob[indices] 53 | 54 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 55 | for sample in mini_batch: 56 | s, a, r, s_, d, _ = sample 57 | d = 0.0 if d else 1.0 58 | obs.append(s); acts.append(a); rewards.append(r); 59 | next_obs.append(s_); done.append(d) 60 | 61 | return torch.tensor(obs).float(), torch.tensor(acts), torch.tensor(rewards).float(), \ 62 | torch.tensor(next_obs).float(), torch.tensor(done), indices, \ 63 | torch.tensor(indices_prob).float() 64 | 65 | def train(net, target_net, optimizer, buffer, priority, new_step): 66 | obs, acts, rewards, next_obs, done, indices, prob = mini_batch(buffer, priority, new_step) 67 | target_a = net(next_obs).argmax(dim=1).view(-1, 1) 68 | q_target = target_net(next_obs).gather(1, target_a) 69 | target_q = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) * q_target 70 | q = net(obs).gather(1, acts.view(-1, 1)) 71 | 72 | weight = (len(buffer)*prob) ** -beta #Importance-sampling weight of PER 73 | loss = weight.view(-1, 1) * F.smooth_l1_loss(q, target_q.detach(), reduce=False) 74 | 75 | optimizer.zero_grad() 76 | loss.mean().backward() 77 | optimizer.step() 78 | 79 | #update priority 80 | prior = (torch.abs(target_q - q) + epsilon).view(-1) 81 | prior = prior.detach().numpy() 82 | priority = np.array(priority) 83 | priority[indices] = prior 84 | priority = deque(priority, maxlen=buffer_size) 85 | return priority 86 | 87 | def actor(rank, act_net, learn): 88 | global priority, learning, new_step 89 | env = gym.make('CartPole-v1') 90 | score = 0.0 91 | epsilon, epsilon_decay = 0.6, 1-5e-5 92 | actnet_update = 20 93 | 94 | for ep in range(EPISODES): 95 | obs = env.reset() 96 | done = False 97 | while not done and not learning: #When the learner is not in the process of learning 98 | q_value = 
act_net(torch.tensor(obs).unsqueeze(0).float()) 99 | rand = random.random() 100 | if rand < epsilon: 101 | action = random.randint(0, 1) 102 | else: 103 | action = q_value.argmax().item() 104 | next_obs, reward, done, info = env.step(action) 105 | if not learning: #When the learner is not in the process of learning 106 | #Priority is initialized by 'None 107 | buffer.append((obs, action, reward/100.0, next_obs, done, None)) 108 | new_step += 1 109 | score += reward 110 | epsilon *= epsilon_decay 111 | obs = next_obs 112 | 113 | if learn: #if learner 114 | learning = True #variable notifying learning-start 115 | if len(buffer) > start_train: 116 | #train and get updated priority 117 | priority = train(net, target_net, optimizer, buffer, priority, new_step) 118 | new_step = 0 119 | if ep%target_interval==0 and ep!=0: 120 | target_net.load_state_dict(net.state_dict()) 121 | time.sleep(0.01) #delay for loop-stability 122 | learning = False 123 | 124 | if ep%actnet_update==0 and ep!=0: 125 | act_net.load_state_dict(net.state_dict()) 126 | 127 | if ep%10==0 and ep!=0 and learn: 128 | #print learner's data 129 | print('actor_num:{}, episode:{}, avg_score:{}, len_buffer:{}, \ 130 | epsilon:{}'.format(rank, ep, score/10.0, len(buffer), epsilon)) 131 | score = 0.0 132 | env.close() 133 | 134 | if __name__ == '__main__': 135 | #global objectives 136 | net, target_net = QNet(), QNet() 137 | target_net.load_state_dict(net.state_dict()) 138 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 139 | buffer, priority = deque(maxlen=buffer_size), deque(maxlen=buffer_size) 140 | learning, new_step = False, 0 141 | target_interval = 20 142 | 143 | #actor-0 doubles the parts of learner 144 | learner = T.Thread(target=actor, args=(0, net, True)) 145 | learner.start() 146 | threads = [learner] 147 | 148 | for rank in range(1, num_actors): 149 | act = T.Thread(target=actor, args=(rank, net, False)) 150 | act.start() 151 | threads.append(act) 152 | for t in threads: 153 | t.join() -------------------------------------------------------------------------------- /asynchronous-ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import threading as T 8 | 9 | #Hyperparameters 10 | EPISODES = 10000 11 | learning_rate = 0.0002 12 | discount_factor = 0.98 13 | train_interval = 20 14 | num_agents = 3 15 | lmbda = 0.5 16 | epsilon = 0.2 17 | #Note:Unlike single-agent PPO, train-iteration is fixed at 1 18 | 19 | class Network(nn.Module): 20 | def __init__(self): 21 | super().__init__() 22 | self.fc1 = nn.Linear(4, 128) 23 | self.fc2 = nn.Linear(128, 128) 24 | self.p = nn.Linear(128, 2) 25 | self.value = nn.Linear(128, 1) 26 | 27 | def pi(self, x): 28 | x = F.relu(self.fc1(x)) 29 | x = F.relu(self.fc2(x)) 30 | prob = F.softmax(self.p(x), dim=1) 31 | return prob 32 | 33 | def v(self, x): 34 | x = F.relu(self.fc1(x)) 35 | x = F.relu(self.fc2(x)) 36 | return self.value(x) 37 | 38 | def mini_batch(samples): 39 | obs, acts, probs, rewards, next_obs, done = [], [], [], [], [], [] 40 | for transition in samples: 41 | s, a, p, r, s_, d = transition 42 | d = 0.0 if d else 1.0 43 | obs.append(s); acts.append(a); probs.append(p), rewards.append(r) 44 | next_obs.append(s_), done.append(d) 45 | 46 | return torch.tensor(obs).float(), torch.tensor(acts), torch.tensor(probs).float(), \ 47 | torch.tensor(rewards).float(), 
torch.tensor(next_obs).float(), torch.tensor(done) 48 | 49 | def train(net, samples, global_optimizer): 50 | obs, acts, probs, rewards, next_obs, done = mini_batch(samples) 51 | target = rewards.view(-1, 1) + discount_factor * net.v(next_obs) * done.view(-1, 1) 52 | td = target - net.v(obs) 53 | #Implementation of GAE(Generalized Advantage Estimation) 54 | advantage, R = [], 0.0 55 | for delta in torch.flip(td, dims=[0, 1]): 56 | R = delta + discount_factor * lmbda * R 57 | advantage.append(R) 58 | advantage.reverse() 59 | advantage = torch.tensor(advantage).float().unsqueeze(1) 60 | 61 | pi_a = net.pi(obs).gather(1, acts.view(-1, 1)) 62 | ratio = torch.exp(torch.log(pi_a) - torch.log(probs.view(-1, 1)).detach()) 63 | clipped = torch.clamp(ratio, 1-epsilon, 1+epsilon) 64 | 65 | p_loss = -torch.min(ratio*advantage, clipped*advantage) 66 | v_loss = F.smooth_l1_loss(net.v(obs), target.detach()) 67 | loss = (p_loss + v_loss).mean() 68 | 69 | global_optimizer.zero_grad() 70 | loss.backward() 71 | for global_param, local_param in zip(global_net.parameters(), net.parameters()): 72 | global_param._grad = local_param.grad 73 | global_optimizer.step() 74 | 75 | def agent(rank): 76 | env = gym.make('CartPole-v1') 77 | net = Network() #define local network 78 | net.load_state_dict(global_net.state_dict()) 79 | global_optimizer = optim.Adam(global_net.parameters(), lr=learning_rate) 80 | samples, score, step = [], 0.0, 0 81 | 82 | for ep in range(EPISODES): 83 | obs = env.reset() 84 | done = False 85 | while not done: 86 | prob = net.pi(torch.tensor(obs).unsqueeze(0).float()) 87 | prob_ = Categorical(prob) 88 | action = prob_.sample().item() 89 | next_obs, reward, done, info = env.step(action) 90 | samples.append((obs, action, prob[0][action], reward/100.0, next_obs, done)) 91 | score += reward 92 | step += 1 93 | obs = next_obs 94 | 95 | if step%train_interval==0: 96 | train(net, samples, global_optimizer) 97 | net.load_state_dict(global_net.state_dict()) 98 | samples = [] 99 | 100 | if ep%10==0 and ep!=0: 101 | print('agent_num:{}, episode:{}, avg_score:{}'.format(rank,ep,score/10.0)) 102 | score = 0.0 103 | env.close() 104 | 105 | if __name__ == '__main__': 106 | global_net = Network() 107 | agents = [] 108 | for rank in range(num_agents): 109 | actor = T.Thread(target=agent, args=(rank,)) 110 | actor.start() 111 | agents.append(actor) 112 | for t in agents: 113 | t.join() -------------------------------------------------------------------------------- /c51.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | import random 8 | from collections import deque 9 | 10 | #Note: This is not being trained well. Use this for reference only. 
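# A possible contributor to the poor training noted above: F.kl_div expects its
# first argument to be log-probabilities, while the loss in train() below passes
# the softmax output `dists` directly. A hedged alternative is the usual C51
# objective, cross-entropy against the projected target distribution, using the
# same `dists`/`target_dists` tensors (a suggestion only, not this file's code):
#     loss = -(target_dists.detach() * torch.log(dists + 1e-8)).sum(dim=1).mean()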
11 | 12 | EPISODES = 10000 13 | learning_rate = 0.0005 14 | discount_factor = 0.9 15 | buffer_size, start_train = 100000, 2000 16 | batch_size = 32 17 | min_sprt, max_sprt = 0, 2 18 | num_sprt = 51 19 | interval = (max_sprt - min_sprt)/(num_sprt - 1) 20 | num_act = 2 21 | 22 | class QNet(nn.Module): 23 | def __init__(self): 24 | super().__init__() 25 | self.fc1 = nn.Linear(4, 256) 26 | self.fc2 = nn.Linear(256, 256) 27 | self.acts = [nn.Linear(256, num_sprt) for _ in range(num_act)] 28 | 29 | def forward(self, x): 30 | x = F.relu(self.fc1(x)) 31 | x = F.relu(self.fc2(x)) 32 | acts = [F.softmax(self.acts[idx](x), dim=1) for idx in range(num_act)] 33 | return acts 34 | 35 | def mini_batch(buffer): 36 | batch = random.sample(buffer, batch_size) 37 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 38 | for sample in batch: 39 | s, a, r, s_, d = sample 40 | d = 0.0 if d else 1.0 41 | obs.append(s); acts.append(a); rewards.append(r); 42 | next_obs.append(s_); done.append(d) 43 | 44 | return torch.tensor(obs).float(), torch.tensor(acts), \ 45 | torch.tensor(rewards).float(), torch.tensor(next_obs).float(), \ 46 | torch.tensor(done) 47 | 48 | def train(net, target_net, optimizer, buffer): 49 | obs, acts, rewards, next_obs, done = mini_batch(buffer) 50 | next_actdists = target_net(next_obs) 51 | support = torch.arange(min_sprt, max_sprt+1e-2, interval).unsqueeze(0) 52 | supports = support.repeat(batch_size, 1) 53 | target_sprts = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) * supports 54 | target_sprts = torch.clamp(target_sprts, min_sprt, max_sprt) 55 | next_actvals = torch.stack([expect(next_actdists[idx]) for idx in range(num_act)], dim=0) 56 | next_maxact = torch.argmax(next_actvals, dim=0) 57 | next_dists = [] 58 | for idx, maxact in enumerate(next_maxact): 59 | next_dists.append(next_actdists[maxact][idx]) 60 | next_dists = torch.stack(next_dists, dim=0).detach() 61 | #projection 62 | target_dists = [] 63 | for supprt, target_sprt, dist in zip(supports, target_sprts, next_dists): 64 | sub_dists = [] 65 | for idx, ts in enumerate(target_sprt): 66 | diff = np.abs(supprt - ts) 67 | diff[diff > interval] = interval 68 | proportion = (interval - diff)/interval 69 | t_d = np.array(dist[idx] * proportion) 70 | sub_dists.append(t_d) 71 | t_dist = np.sum(np.array(sub_dists), axis=0) 72 | target_dists.append(t_dist) 73 | target_dists = torch.tensor(target_dists).float() 74 | dists = [] 75 | actdists = net(obs) 76 | for idx, act in enumerate(acts): 77 | dists.append(actdists[act][idx]) 78 | dists = torch.stack(dists, dim=0).float() 79 | 80 | loss = F.kl_div(dists, target_dists.detach()) 81 | optimizer.zero_grad() 82 | loss.backward() 83 | optimizer.step() 84 | 85 | def expect(dists): #function calculating expectation value 86 | support = torch.arange(min_sprt, max_sprt+1e-2, interval) 87 | supports = support.repeat(len(dists), 1) 88 | q_val = torch.sum(supports * dists, dim=1) 89 | return q_val 90 | 91 | if __name__ == '__main__': 92 | env = gym.make('CartPole-v1') 93 | net, target_net = QNet(), QNet() 94 | target_net.load_state_dict(net.state_dict()) 95 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 96 | 97 | buffer = deque(maxlen=buffer_size) 98 | score, step = 0.0, 0 99 | epsilon, epsilon_decay = 0.4, 1-5e-5 100 | target_interval = 10 101 | 102 | for ep in range(EPISODES): 103 | obs = env.reset() 104 | done = False 105 | while not done: 106 | acts_dist = net(torch.tensor(obs).unsqueeze(0).float()) 107 | acts_val = np.array([expect(acts_dist[idx]).item() for idx in 
range(num_act)]) 108 | rand = random.random() 109 | if rand < epsilon: 110 | action = random.randint(0, num_act-1) 111 | else: 112 | action = acts_val.argmax() 113 | next_obs, reward, done, info = env.step(action) 114 | buffer.append((obs, action, reward/50.0, next_obs, done)) 115 | obs = next_obs 116 | step += 1 117 | score += reward 118 | epsilon *= epsilon_decay 119 | 120 | if len(buffer) > start_train: 121 | train(net, target_net, optimizer, buffer) 122 | 123 | if ep%target_interval==0 and ep!=0: 124 | target_net.load_state_dict(net.state_dict()) 125 | 126 | if ep%10==0 and ep!=0: 127 | print('episode:{}, step:{}, avg_score:{}, len_buffer:{}, epsilon:{}'.format(ep, step, \ 128 | score/10.0, len(buffer), epsilon)) 129 | score = 0 130 | env.close() -------------------------------------------------------------------------------- /curiosity-exploration.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | EPISODES = 10000 11 | discount_fact = 0.98 12 | buffer_size, start_train = 50000, 100 13 | batch_size = 64 14 | reward_eta = 1.0 15 | q_lr = 0.0005 16 | encoder_lr = 0.0005 17 | forward_lr, inverse_lr = 0.0005, 0.0005 18 | 19 | class QNet(nn.Module): 20 | def __init__(self): 21 | super().__init__() 22 | self.conv1 = nn.Conv2d(3, 32, 8, stride=3) 23 | self.conv2 = nn.Conv2d(32, 32, 4, stride=2) 24 | self.batch_norm = nn.BatchNorm2d(32) 25 | self.conv3 = nn.Conv2d(32, 32, 5, stride=2) 26 | self.fc1 = nn.Linear(4800, 512) 27 | self.q = nn.Linear(512, 4) 28 | 29 | def forward(self, x): 30 | x = F.leaky_relu(self.conv1(x)) 31 | x = F.leaky_relu(self.conv2(x)) 32 | x = self.batch_norm(x) 33 | x = F.leaky_relu(self.conv3(x)) 34 | x = x.view(x.size(0), -1) 35 | x = F.leaky_relu(self.fc1(x)) 36 | return self.q(x) 37 | 38 | class Encoder(nn.Module): 39 | def __init__(self): 40 | super().__init__() 41 | self.conv1 = nn.Conv2d(3, 32, 7, stride=3) 42 | self.batch_norm = nn.BatchNorm2d(32) 43 | self.conv2 = nn.Conv2d(32, 32, 5, stride=2) 44 | self.encode = nn.Conv2d(32, 1, 1) 45 | 46 | def forward(self, x): 47 | x = F.leaky_relu(self.conv1(x)) 48 | x = self.batch_norm(x) 49 | x = F.leaky_relu(self.conv2(x)) 50 | encode = F.tanh(self.encode(x)) 51 | return encode 52 | 53 | class Forward_model(nn.Module): 54 | def __init__(self): 55 | #action shape: (1, 1) 56 | #state shape: (1, 1, 32, 24) 57 | super().__init__() 58 | self.conv_s = nn.Conv2d(1, 1, 1) 59 | self.conv_a = nn.Conv2d(1, 1, 1) 60 | self.conv1 = nn.Conv2d(1, 32, 4) 61 | self.conv2 = nn.Conv2d(32, 32, 2, stride=2) 62 | self.trans = nn.ConvTranspose2d(32, 1, 6, stride=2) 63 | 64 | def forward(self, x, a): 65 | a = a.view(a.size(0), 1, 1, 1).repeat(1, 1, 32, 24) 66 | a = self.conv_a(a) 67 | x = F.leaky_relu(self.conv_s(x)) 68 | x = torch.add(x, a) 69 | x = F.leaky_relu(self.conv1(x)) 70 | x = F.leaky_relu(self.conv2(x)) 71 | next_x = torch.tanh(self.trans(x)) 72 | return next_x 73 | 74 | class Inverse_model(nn.Module): 75 | def __init__(self): 76 | super().__init__() 77 | #state shape: (1, 1, 32, 24) 78 | self.encode1 = nn.Conv2d(1, 1, 3) 79 | self.encode2 = nn.Conv2d(1, 1, 3) 80 | self.conv1 = nn.Conv2d(1, 32, 4) 81 | self.conv2 = nn.Conv2d(32, 32, 2, stride=2) 82 | self.fc = nn.Linear(3744, 4) 83 | 84 | def forward(self, pre_x, curr_x): 85 | pre_x = self.encode1(pre_x) 86 | curr_x = self.encode2(curr_x) 87 | x 
= F.leaky_relu(torch.add(pre_x, curr_x)) 88 | x = F.leaky_relu(self.conv1(x)) 89 | x = F.leaky_relu(self.conv2(x)) 90 | x = x.view(x.size(0), -1) 91 | return self.fc(x) 92 | 93 | def mini_batch(buffer): 94 | mini_batch = random.sample(buffer, batch_size) 95 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 96 | for sample in mini_batch: 97 | s, a, r, s_, d = sample 98 | d = 0.0 if d else 1.0 99 | obs.append(s); acts.append(a); rewards.append(r); 100 | next_obs.append(s_); done.append(d) 101 | 102 | return torch.tensor(obs).float(), torch.tensor(acts), torch.tensor(rewards).float(),\ 103 | torch.tensor(next_obs).float(), torch.tensor(done) 104 | 105 | def train(networks, optimizers, buffer): 106 | q_net, q_target, encode, forward, inverse = networks 107 | q_optim, en_optim, fwd_optim, inv_optim = optimizers 108 | obs, acts, rewards, next_obs, done = mini_batch(buffer) 109 | 110 | acts_pred = inverse(encode(obs), encode(next_obs)) 111 | inv_loss = F.cross_entropy(acts_pred, acts) 112 | obs_pred = forward(encode(obs), acts.view(-1, 1).float()) 113 | fwrd_loss = F.mse_loss(obs_pred, encode(next_obs).detach()) 114 | 115 | en_optim.zero_grad(); fwd_optim.zero_grad(); inv_optim.zero_grad() 116 | (inv_loss + fwrd_loss).backward() 117 | en_optim.step() 118 | fwd_optim.step() 119 | inv_optim.step() 120 | 121 | target_q = rewards + discount_fact * done * q_target(next_obs).max(dim=1)[0] 122 | target_q = target_q.view(-1, 1) 123 | q = q_net(obs).gather(1, acts.view(-1, 1).long()) 124 | q_loss = F.smooth_l1_loss(q, target_q.detach()) 125 | q_optim.zero_grad() 126 | q_loss.backward() 127 | q_optim.step() 128 | 129 | if __name__ == '__main__': 130 | env = gym.make('Breakout-v0') 131 | q_net, q_target, encode, forward, inverse = QNet(), QNet(), Encoder(), \ 132 | Forward_model(), Inverse_model() 133 | q_target.load_state_dict(q_net.state_dict()) 134 | q_optim = optim.Adam(q_net.parameters(), lr=q_lr) 135 | en_optim = optim.Adam(encode.parameters(), lr=encoder_lr) 136 | fwd_optim = optim.Adam(forward.parameters(), lr=forward_lr) 137 | inv_optim = optim.Adam(inverse.parameters(), lr=inverse_lr) 138 | 139 | buffer = deque(maxlen=buffer_size) 140 | score, step = 0, 0 141 | epsilon, epsilon_decay = 0.6, 1-1e-5 142 | target_interval = 20 143 | 144 | for ep in range(EPISODES): 145 | obs = env.reset() 146 | obs = torch.tensor(obs).permute(2, 0, 1).float() 147 | done = False 148 | while not done: 149 | q_value = q_net(obs.unsqueeze(0)) 150 | rand = random.random() 151 | if rand < epsilon: 152 | action = random.randint(0, 3) 153 | else: 154 | action = q_value.argmax().item() 155 | next_obs, _, done, info = env.step(action) 156 | next_obs = torch.tensor(next_obs).permute(2, 0, 1).float() 157 | 158 | obs_pred = forward(encode(obs.unsqueeze(0)), torch.tensor([[action]]).float()) 159 | obs_ = encode(next_obs.unsqueeze(0)) 160 | reward = reward_eta * F.mse_loss(obs_pred.squeeze(), obs_.squeeze()).item() 161 | buffer.append((obs.numpy(), action, reward, next_obs.numpy(), done)) 162 | obs = next_obs 163 | step += 1 164 | score += reward 165 | epsilon *= epsilon_decay 166 | 167 | if len(buffer) > start_train: 168 | train((q_net, q_target, encode, forward, inverse), \ 169 | (q_optim, en_optim, fwd_optim, inv_optim), buffer) 170 | 171 | if ep%target_interval==0 and ep!=0: 172 | q_target.load_state_dict(q_net.state_dict()) 173 | 174 | if ep%10==0 and ep!=0: 175 | print('episode:{}, step:{}, avg_score:{}, len_buffer:{}, epsilon:{}'.format(ep, step, \ 176 | score/10.0, len(buffer), epsilon)) 177 | score = 0 178 | 
env.close() -------------------------------------------------------------------------------- /ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | EPISODES = 1000 11 | q_lr = 0.0005 12 | mu_lr = 0.0001 13 | discount_factor = 0.98 14 | train_interval = 10 15 | buffer_size, start_train = 100000, 2000 16 | batch_size = 32 17 | target_update = 0.995 18 | 19 | class QNet(nn.Module): 20 | def __init__(self): 21 | super().__init__() 22 | self.obs = nn.Linear(3, 128) 23 | self.act = nn.Linear(1, 128) 24 | self.fc = nn.Linear(256, 128) 25 | self.q = nn.Linear(128, 1) 26 | 27 | def forward(self, x, a): 28 | x = F.relu(self.obs(x)) 29 | a = F.relu(self.act(a)) 30 | x = torch.cat([x, a], dim=1) 31 | x = F.relu(self.fc(x)) 32 | return self.q(x) 33 | 34 | class ActionNet(nn.Module): 35 | def __init__(self): 36 | super().__init__() 37 | self.fc1 = nn.Linear(3, 128) 38 | self.fc2 = nn.Linear(128, 128) 39 | self.action = nn.Linear(128, 1) 40 | 41 | def forward(self, x): 42 | x = F.relu(self.fc1(x)) 43 | x = F.relu(self.fc2(x)) 44 | a = torch.tanh(self.action(x)) 45 | return 2*a 46 | 47 | def train(q, q_target, mu, mu_target, buffer, q_optimizer, mu_optimizer): 48 | mini_batch = random.sample(buffer, batch_size) 49 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 50 | for samples in mini_batch: 51 | s, a, r, s_, d = samples 52 | d = 0.0 if d else 1.0 53 | obs.append(s); acts.append(a); rewards.append(r); 54 | next_obs.append(s_); done.append(d) 55 | obs, acts, rewards, next_obs, done = torch.tensor(obs).float(), \ 56 | torch.tensor(acts).float(), torch.tensor(rewards).float(), \ 57 | torch.tensor(next_obs).float(), torch.tensor(done) 58 | 59 | target_q = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) \ 60 | * q_target(next_obs, mu_target(next_obs)) 61 | 62 | q_loss = F.smooth_l1_loss(q(obs, acts.view(-1, 1)), target_q.detach()) 63 | q_optimizer.zero_grad() 64 | q_loss.backward() 65 | q_optimizer.step() 66 | 67 | mu_obj = -q(obs, mu(obs)).mean() 68 | mu_optimizer.zero_grad() 69 | mu_obj.backward() 70 | mu_optimizer.step() 71 | 72 | #Implementation of soft-update 73 | def soft_update(t_net, net, target_ratio): 74 | for t_param, param in zip(t_net.parameters(), net.parameters()): 75 | t_param.data.copy_(t_param.data*target_ratio + param.data*(1-target_ratio)) 76 | 77 | if __name__ == '__main__': 78 | env = gym.make('Pendulum-v0') 79 | qnet, q_target, munet, mu_target = QNet(), QNet(), ActionNet(), ActionNet() 80 | q_target.load_state_dict(qnet.state_dict()) 81 | mu_target.load_state_dict(munet.state_dict()) 82 | q_optimizer, mu_optimizer = optim.Adam(qnet.parameters(), lr=q_lr), \ 83 | optim.Adam(munet.parameters(), lr=mu_lr) 84 | 85 | buffer = deque(maxlen=buffer_size) 86 | score, step = 0.0, 0 87 | 88 | for ep in range(EPISODES): 89 | done = False 90 | obs = env.reset() 91 | while not done: 92 | a = munet(torch.tensor(obs).float()) 93 | noise = torch.randn(1) * 0.5 94 | action = torch.clamp(a+noise, -2, 2).item() 95 | next_obs, reward, done, info = env.step([action]) 96 | buffer.append((obs, action, reward/100.0, next_obs, done)) 97 | score += reward 98 | step += 1 99 | obs = next_obs 100 | if step%train_interval==0 and len(buffer) > start_train: 101 | train(qnet, q_target, munet, mu_target,\ 102 | buffer, q_optimizer, mu_optimizer) 103 | 
soft_update(q_target, qnet, target_update) 104 | soft_update(mu_target, munet, target_update) 105 | 106 | if ep%10==0 and ep!=0: 107 | print('epsidoes:{}, buffer_size:{}, avg_score:{}'.format(ep, len(buffer), score/10.0)) 108 | score = 0.0 109 | env.close() -------------------------------------------------------------------------------- /diayn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | import numpy as np 8 | from torch.distributions import Normal 9 | from collections import deque 10 | 11 | #Hyperparameters 12 | EPISODES = 1000 13 | discount_factor = 0.98 14 | log_alpha = torch.tensor(np.log(0.1), requires_grad=True) 15 | target_entropy = -4.0 16 | train_interval = 10 17 | q_lr = 0.0005 18 | policy_lr = 0.0002 19 | discrim_lr = 0.0005 20 | alpha_lr = 0.001 21 | buffer_size, start_train = 100000, 2000 22 | batch_size = 32 23 | target_update = 0.995 24 | skill_num, z_size = 16, 8 25 | #constant 26 | state_space, action_space = 24, 4 27 | 28 | class PolicyNet(nn.Module): 29 | def __init__(self): 30 | super().__init__() 31 | self.fc_s = nn.Linear(state_space, 256) 32 | self.fc_z = nn.Linear(z_size, 256) 33 | self.hidden = nn.Linear(512, 512) 34 | self.mu = nn.Linear(512, action_space) 35 | self.sigma = nn.Linear(512, action_space) 36 | 37 | def forward(self, obs, z): 38 | obs = F.relu(self.fc_s(obs)) 39 | z = F.relu(self.fc_z(z)) 40 | x = torch.cat([obs, z], dim=1) 41 | x = F.relu(self.hidden(x)) 42 | mu, sigma = self.mu(x), F.softplus(self.sigma(x)) 43 | dists = Normal(mu, sigma) 44 | actions = dists.rsample() 45 | log_probs = torch.sum(dists.log_prob(actions), dim=1, keepdim=True) 46 | return actions, log_probs 47 | 48 | class QNet(nn.Module): 49 | def __init__(self): 50 | super().__init__() 51 | self.fc_s = nn.Linear(state_space, 256) 52 | self.fc_z = nn.Linear(z_size, 256) 53 | self.fc_a = nn.Linear(action_space, 256) 54 | self.fc = nn.Linear(768, 768) 55 | self.q = nn.Linear(768, 1) 56 | 57 | def forward(self, obs, z, a): 58 | obs = F.relu(self.fc_s(obs)) 59 | z = F.relu(self.fc_z(z)) 60 | a = F.relu(self.fc_a(a)) 61 | x = torch.cat([obs, z, a], dim=1) 62 | x = F.relu(self.fc(x)) 63 | return self.q(x) 64 | 65 | class Discriminator(nn.Module): 66 | def __init__(self): 67 | super().__init__() 68 | self.fc1 = nn.Linear(state_space, 512) 69 | self.fc2 = nn.Linear(512, 512) 70 | self.fc3 = nn.Linear(512, 512) 71 | self.discrim = nn.Linear(512, skill_num) 72 | 73 | def forward(self, obs): 74 | x = F.relu(self.fc1(obs)) 75 | x = F.relu(self.fc2(x)) 76 | x = F.relu(self.fc3(x)) 77 | log_prob = F.log_softmax(self.discrim(x), dim=0) 78 | return log_prob 79 | 80 | def make_batch(buffer): 81 | mini_batch = random.sample(buffer, batch_size) 82 | obs, skills, acts, rewards, next_obs, done = [], [], [], [], [], [] 83 | for samples in mini_batch: 84 | s, z, a, r, s_, d = samples 85 | d = 0.0 if d else 1.0 86 | obs.append(s); skills.append(z); acts.append(a); 87 | rewards.append(r); next_obs.append(s_); done.append(d) 88 | 89 | return torch.tensor(obs).float(), torch.tensor(skills).float(), \ 90 | torch.tensor(acts).float(), torch.tensor(rewards).float(), \ 91 | torch.tensor(next_obs).float(), torch.tensor(done) 92 | 93 | def train(networks, buffer, optimizers): 94 | obs, skills, acts, rewards, next_obs, done = make_batch(buffer) 95 | 96 | q1, q1_target, q2, q2_target, pi = networks 97 | q1_optimizer, q2_optimizer, 
pi_optimizer, alpha_optimizer = optimizers 98 | 99 | next_acts, log_prob = pi(next_obs, skills) 100 | q_target = torch.min(q1_target(next_obs, skills, next_acts), \ 101 | q2_target(next_obs, skills, next_acts)) 102 | target = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) * \ 103 | (q_target - torch.exp(log_alpha)*log_prob) 104 | 105 | q1_loss = F.smooth_l1_loss(q1(obs, skills, acts), target.detach()) 106 | q2_loss = F.smooth_l1_loss(q2(obs, skills, acts), target.detach()) 107 | 108 | q1_optimizer.zero_grad(); q1_loss.backward(); q1_optimizer.step() 109 | q2_optimizer.zero_grad(); q2_loss.backward(); q2_optimizer.step() 110 | 111 | sampled_a, log_prob = pi(obs, skills) 112 | q_value = torch.min(q1(obs, skills, sampled_a), q2(obs, skills, sampled_a)) 113 | policy_obj = -q_value + torch.exp(log_alpha)*log_prob 114 | pi_optimizer.zero_grad() 115 | policy_obj.mean().backward() 116 | pi_optimizer.step() 117 | 118 | alpha_obj = -torch.exp(log_alpha)*(log_prob.detach() + target_entropy) 119 | alpha_optimizer.zero_grad() 120 | alpha_obj.mean().backward() 121 | alpha_optimizer.step() 122 | 123 | def soft_update(t_net, net, target_ratio): 124 | for t_param, param in zip(t_net.parameters(), net.parameters()): 125 | t_param.data.copy_(t_param.data*target_ratio + param.data*(1-target_ratio)) 126 | 127 | if __name__ == '__main__': 128 | env = gym.make('BipedalWalkerHardcore-v3') 129 | q1net, q1_target, q2net, q2_target, pinet, discriminator = QNet(),\ 130 | QNet(), QNet(), QNet(), PolicyNet(), Discriminator() 131 | q1_target.load_state_dict(q1net.state_dict()) 132 | q2_target.load_state_dict(q2net.state_dict()) 133 | q1_optimizer = optim.Adam(q1net.parameters(), lr=q_lr) 134 | q2_optimizer = optim.Adam(q2net.parameters(), lr=q_lr) 135 | pi_optimizer = optim.Adam(pinet.parameters(), lr=policy_lr) 136 | alpha_optimizer = optim.Adam([log_alpha], lr=alpha_lr) 137 | disc_optimizer = optim.Adam(discriminator.parameters(), lr=discrim_lr) 138 | 139 | buffer = deque(maxlen=buffer_size) 140 | skills = [np.random.rand(z_size) for _ in range(skill_num)] 141 | score, step = 0.0, 0 142 | for ep in range(EPISODES): 143 | done = False 144 | obs = env.reset() 145 | select_z = random.randint(0, skill_num-1) 146 | z = skills[select_z] 147 | while not done: 148 | action, _ = pinet(torch.tensor(obs).unsqueeze(0).float(),\ 149 | torch.tensor(z).unsqueeze(0).float()) 150 | env.render() 151 | next_obs, _, done, info = env.step(action[0].detach()) 152 | reward = discriminator(torch.tensor(next_obs).float())[select_z] 153 | buffer.append((obs, z, action[0].detach().numpy(), reward.item()/100.0, next_obs, done)) 154 | score += reward.item() 155 | step += 1 156 | obs = next_obs 157 | 158 | #discriminator update 159 | disc_optimizer.zero_grad() 160 | (-reward).backward() 161 | disc_optimizer.step() 162 | 163 | if step%train_interval==0 and len(buffer) > start_train: 164 | train((q1net, q1_target, q2net, q2_target, pinet), buffer, \ 165 | (q1_optimizer, q2_optimizer, pi_optimizer, alpha_optimizer)) 166 | soft_update(q1_target, q1net, target_update) 167 | soft_update(q2_target, q2net, target_update) 168 | 169 | if ep%10==0 and ep!=0: 170 | print('episode:{}, buffer_size:{}, alpha:{}, avg_score:{}'.format( 171 | ep, len(buffer), torch.exp(log_alpha).item(), score/10.0)) 172 | score = 0.0 173 | env.close() -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn 
as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | EPISODES = 10000 11 | learning_rate = 0.0005 12 | discount_factor = 0.98 13 | buffer_size, start_train = 50000, 2000 14 | batch_size = 32 15 | 16 | class QNet(nn.Module): 17 | def __init__(self): 18 | super().__init__() 19 | self.fc1 = nn.Linear(4, 128) 20 | self.fc2 = nn.Linear(128, 128) 21 | self.fc3 = nn.Linear(128, 2) 22 | 23 | def forward(self, x): 24 | x = F.relu(self.fc1(x)) 25 | x = F.relu(self.fc2(x)) 26 | q = self.fc3(x) 27 | return q 28 | 29 | def minibatch_and_train(net, target_net, optimizer, buffer): 30 | mini_batch = random.sample(buffer, batch_size) 31 | 32 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 33 | for sample in mini_batch: 34 | s, a, r, s_, d = sample 35 | d = 0.0 if d else 1.0 36 | obs.append(s); acts.append(a); rewards.append(r); 37 | next_obs.append(s_); done.append(d) 38 | 39 | obs, acts, rewards, next_obs, done = torch.tensor(obs).float(),\ 40 | torch.tensor(acts), torch.tensor(rewards).float(), torch.tensor(next_obs).float(),\ 41 | torch.tensor(done) 42 | 43 | target_q = rewards + discount_factor * done * target_net(next_obs).max(dim=1)[0] 44 | target_q = target_q.view(-1, 1) 45 | q = net(obs).gather(1, acts.view(-1, 1)) 46 | loss = F.smooth_l1_loss(q, target_q.detach()) 47 | 48 | optimizer.zero_grad() 49 | loss.backward() 50 | optimizer.step() 51 | 52 | if __name__ == '__main__': 53 | env = gym.make('CartPole-v1') 54 | net, target_net = QNet(), QNet() 55 | target_net.load_state_dict(net.state_dict()) 56 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 57 | 58 | buffer = deque(maxlen=buffer_size) 59 | score, step = 0, 0 60 | epsilon, epsilon_decay = 0.6, 1-1e-5 61 | target_interval = 20 62 | 63 | for ep in range(EPISODES): 64 | obs = env.reset()[0] 65 | done = False 66 | while not done: 67 | q_value = net(torch.tensor(obs).float()) 68 | rand = random.random() 69 | if rand < epsilon: 70 | action = random.randint(0, 1) 71 | else: 72 | action = q_value.argmax().item() 73 | 74 | next_obs, reward, done, _, info = env.step(action) 75 | buffer.append((obs, action, reward/100.0, next_obs, done)) 76 | obs = next_obs 77 | step += 1 78 | score += reward 79 | epsilon *= epsilon_decay 80 | 81 | if len(buffer) > start_train: 82 | minibatch_and_train(net, target_net, optimizer, buffer) 83 | 84 | if ep%target_interval==0 and ep!=0: 85 | target_net.load_state_dict(net.state_dict()) 86 | 87 | if ep%10==0 and ep!=0: 88 | print('episode:{}, step:{}, avg_score:{}, len_buffer:{}, epsilon:{}'.format(ep, step, \ 89 | score/10.0, len(buffer), epsilon)) 90 | score = 0 91 | env.close() -------------------------------------------------------------------------------- /dueling-double-per-dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | import numpy as np 8 | from collections import deque 9 | 10 | #Hyperparameters 11 | EPISODES = 10000 12 | learning_rate = 0.0005 13 | discount_factor = 0.98 14 | buffer_size, start_train = 50000, 2000 15 | epsilon, alpha, beta = 0.1, 0.6, 0.4 #for PER 16 | batch_size = 64 17 | 18 | class QNet(nn.Module): 19 | def __init__(self): 20 | super().__init__() 21 | self.fc1 = nn.Linear(4, 128) 22 | self.fc2 = nn.Linear(128, 128) 23 | self.v = nn.Linear(128, 1) 24 | self.adv = nn.Linear(128, 2) 25 | 26 | 
def forward(self, x): 27 | x = F.relu(self.fc1(x)) 28 | x = F.relu(self.fc2(x)) 29 | v = self.v(x) 30 | adv = self.adv(x) 31 | #Dueling network architecture 32 | mean_adv = 0.5*torch.sum(adv, dim=1, keepdim=True) 33 | q = v + adv - mean_adv 34 | return q 35 | 36 | def mini_batch(buffer, priority): 37 | real_p = priority[priority!=None] #get real(calculated from TD-error) priority 38 | max_p = max(real_p) if len(real_p)!=0 else 1.0 39 | #priority of unvisited data should be max-priority 40 | prior = np.array([p**alpha if p!=None else max_p**alpha for p in priority]) 41 | prob = prior/sum(prior) 42 | 43 | indices = np.random.choice(len(buffer), batch_size, p=prob) 44 | mini_batch = np.array(buffer, dtype=object)[indices] 45 | indices_prob = prob[indices] 46 | 47 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 48 | for sample in mini_batch: 49 | s, a, r, s_, d = sample 50 | d = 0.0 if d else 1.0 51 | obs.append(s); acts.append(a); rewards.append(r); 52 | next_obs.append(s_); done.append(d) 53 | 54 | return torch.tensor(obs).float(), torch.tensor(acts), torch.tensor(rewards).float(), \ 55 | torch.tensor(next_obs).float(), torch.tensor(done), indices, \ 56 | torch.tensor(indices_prob).float() 57 | 58 | def train(net, target_net, optimizer, buffer, priority): 59 | priority = np.array(priority) 60 | obs, acts, rewards, next_obs, done, indices, prob = mini_batch(buffer, priority) 61 | 62 | target_a = net(next_obs).argmax(dim=1).view(-1, 1) 63 | q_target = target_net(next_obs).gather(1, target_a) 64 | target_q = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) * q_target 65 | q = net(obs).gather(1, acts.view(-1, 1)) 66 | 67 | weight = (len(buffer)*prob) ** -beta #Importance-sampling weight of PER 68 | loss = weight.view(-1, 1) * F.smooth_l1_loss(q, target_q.detach(), reduce=False) 69 | 70 | optimizer.zero_grad() 71 | loss.mean().backward() 72 | optimizer.step() 73 | 74 | #update priority 75 | prior = (torch.abs(target_q - q) + epsilon).view(-1) 76 | prior = prior.detach().numpy() 77 | priority[indices] = prior 78 | priority = deque(priority, maxlen=buffer_size) 79 | return priority 80 | 81 | if __name__ == '__main__': 82 | env = gym.make('CartPole-v1') 83 | net, target_net = QNet(), QNet() 84 | target_net.load_state_dict(net.state_dict()) 85 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 86 | 87 | buffer = deque(maxlen=buffer_size) 88 | priority = deque(maxlen=buffer_size) 89 | score, step = 0, 0 90 | epsilon, epsilon_decay = 0.6, 1-1e-5 91 | target_interval = 20 92 | 93 | for ep in range(EPISODES): 94 | obs = env.reset() 95 | done = False 96 | while not done: 97 | q_value = net(torch.tensor(obs).unsqueeze(0).float()) 98 | rand = random.random() 99 | if rand < epsilon: 100 | action = random.randint(0, 1) 101 | else: 102 | action = q_value.argmax().item() 103 | next_obs, reward, done, info = env.step(action) 104 | buffer.append((obs, action, reward/100.0, next_obs, done)) 105 | priority.append(None) #Priority is initialized by 'None' 106 | obs = next_obs 107 | step += 1 108 | score += reward 109 | epsilon *= epsilon_decay 110 | 111 | if len(buffer) > start_train: 112 | #train and get updated priority 113 | priority = train(net, target_net, optimizer, buffer, priority) 114 | 115 | if ep%target_interval==0 and ep!=0: 116 | target_net.load_state_dict(net.state_dict()) 117 | 118 | if ep%10==0 and ep!=0: 119 | print('episode:{}, step:{}, avg_score:{}, len_buffer:{}, epsilon:{}'.format(ep, step, \ 120 | score/10.0, len(buffer), epsilon)) 121 | score = 0.0 122 | env.close() 
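Both dueling-double-per-dqn.py above and apex-dqn.py draw transitions with probability proportional to priority and correct the resulting bias with importance-sampling weights: p_i = |TD error_i| + epsilon, P(i) = p_i^alpha / sum_k p_k^alpha, and w_i = (N * P(i))^(-beta). Below is a minimal numeric illustration of those two formulas with the repository's alpha = 0.6 and beta = 0.4; the TD errors are made-up values, and the max-normalization in the last line follows the PER paper rather than these scripts, which use the raw weights.

    import numpy as np

    alpha, beta, eps = 0.6, 0.4, 0.1
    td_errors = np.array([0.05, 0.8, 2.3])           # hypothetical |TD errors|
    prior = (np.abs(td_errors) + eps) ** alpha       # p_i^alpha
    prob = prior / prior.sum()                       # P(i): sampling probability
    weight = (len(td_errors) * prob) ** -beta        # w_i = (N * P(i))^(-beta)
    print(prob)                  # larger TD error -> sampled more often
    print(weight / weight.max()) # rarely sampled transitions get larger weights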
-------------------------------------------------------------------------------- /goalgan.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | iteration = 1000 11 | update_epi = 20 12 | discount_factor = 0.98 13 | train_interval = 10 14 | buffer_size, start_train = 10000, 16 15 | batch_size = 16 16 | target_update = 0.995 17 | noise_size = 12 18 | goal_tolerance = 0.5 19 | gen_num, sample_num = 1, 2 20 | smple_start = 2 21 | max_step = 100 22 | r_min, r_max = 0.05, 0.95 23 | q_lr = 0.0005 24 | mu_lr = 0.0001 25 | gen_lr = 0.0005 26 | disc_lr = 0.0005 27 | state_space, action_space = 24, 4 #action_space is continuous 28 | 29 | 30 | class Generator(nn.Module): 31 | def __init__(self): 32 | super().__init__() 33 | self.fc1 = nn.Linear(noise_size, 128) 34 | self.fc2 = nn.Linear(128, 256) 35 | self.fc3 = nn.Linear(256, 256) 36 | self.goal = nn.Linear(256, state_space) 37 | 38 | def forward(self, z): 39 | x = F.relu(self.fc1(z)) 40 | x = F.relu(self.fc2(x)) 41 | x = F.relu(self.fc3(x)) 42 | return self.goal(x) 43 | 44 | class Discriminator(nn.Module): 45 | def __init__(self): 46 | super().__init__() 47 | self.fc1 = nn.Linear(state_space, 256) 48 | self.fc2 = nn.Linear(256, 256) 49 | self.fc3 = nn.Linear(256, 256) 50 | self.output = nn.Linear(256, 1) 51 | 52 | def forward(self, goal): 53 | x = F.relu(self.fc1(goal)) 54 | x = F.relu(self.fc2(x)) 55 | x = F.relu(self.fc3(x)) 56 | #least-square loss 57 | return self.output(x) 58 | 59 | class QNet(nn.Module): 60 | def __init__(self): 61 | super().__init__() 62 | self.obs = nn.Linear(state_space, 128) 63 | self.goal = nn.Linear(state_space, 128) 64 | self.act = nn.Linear(action_space, 128) 65 | self.fc1 = nn.Linear(384, 720) 66 | self.fc2 = nn.Linear(720, 720) 67 | self.q = nn.Linear(720, 1) 68 | 69 | def forward(self, x, goal, a): 70 | x = F.relu(self.obs(x)) 71 | z = F.relu(self.goal(goal)) 72 | a = F.relu(self.act(a)) 73 | x = torch.cat([x, z, a], dim=1) 74 | x = F.relu(self.fc1(x)) 75 | x = F.relu(self.fc2(x)) 76 | return self.q(x) 77 | 78 | class ActionNet(nn.Module): 79 | def __init__(self): 80 | super().__init__() 81 | self.obs = nn.Linear(state_space, 256) 82 | self.goal = nn.Linear(state_space, 256) 83 | self.fc2 = nn.Linear(512, 512) 84 | self.fc3 = nn.Linear(512, 512) 85 | self.action = nn.Linear(512, action_space) 86 | 87 | def forward(self, x, goal): 88 | x = F.relu(self.obs(x)) 89 | z = F.relu(self.goal(goal)) 90 | x = torch.cat([x, z], dim=1) 91 | x = F.relu(self.fc2(x)) 92 | x = F.relu(self.fc3(x)) 93 | a = torch.tanh(self.action(x)) 94 | return a 95 | 96 | def train(q, q_target, mu, mu_target, buffer, q_optimizer, mu_optimizer): 97 | mini_batch = random.sample(buffer, batch_size) 98 | obs, goals, acts, rewards, next_obs, done = [], [], [], [], [], [] 99 | for samples in mini_batch: 100 | s, g, a, r, s_, d = samples 101 | d = 0.0 if d else 1.0 102 | obs.append(s); goals.append(g); acts.append(a); rewards.append(r); 103 | next_obs.append(s_); done.append(d) 104 | 105 | obs, goals, acts, rewards, next_obs, done = torch.tensor(obs).float(), \ 106 | torch.tensor(goals).float(), torch.tensor(acts).float(), \ 107 | torch.tensor(rewards).float(), torch.tensor(next_obs).float(), \ 108 | torch.tensor(done) 109 | 110 | target_a = mu_target(next_obs, goals) 111 | target_q = rewards.view(-1, 1) + discount_factor 
* done.view(-1, 1) \ 112 | * q_target(next_obs, goals, target_a) 113 | 114 | q_loss = F.smooth_l1_loss(q(obs, goals, acts.view(-1, action_space)), target_q.detach()) 115 | q_optimizer.zero_grad() 116 | q_loss.backward() 117 | q_optimizer.step() 118 | mu_obj = -q(obs, goals, mu(obs, goals)).mean() 119 | mu_optimizer.zero_grad() 120 | mu_obj.backward() 121 | mu_optimizer.step() 122 | 123 | #Implementation of soft-update 124 | def soft_update(t_net, net, target_ratio): 125 | for t_param, param in zip(t_net.parameters(), net.parameters()): 126 | t_param.data.copy_(t_param.data*target_ratio + param.data*(1-target_ratio)) 127 | 128 | def update_policy(env, episode, goals, buffers): 129 | global qnet, q_target, munet, mu_target 130 | global q_optimizer, mu_optimer 131 | goal_label = [] 132 | for idx, goal in enumerate(goals): 133 | score = 0.0 134 | for ep in range(episode): 135 | done = False 136 | obs = env.reset() 137 | step = 0 138 | while not done: 139 | a = munet(torch.tensor(obs).unsqueeze(0).float(), goal.unsqueeze(0)) 140 | noise = torch.randn(4) * 0.3 141 | action = torch.clamp(a+noise, -1, 1)[0].detach().numpy() 142 | next_obs, _, _, info = env.step(action) 143 | dist = F.pairwise_distance(torch.tensor(next_obs).unsqueeze(0).float(), \ 144 | goal.unsqueeze(0)) 145 | reward, done = 0.0, False 146 | if dist.item() < goal_tolerance: 147 | reward, done = 1.0, True 148 | if step > max_step: done = True 149 | 150 | buffers[idx].append((obs, goal.detach().numpy(), action, \ 151 | reward/10.0, next_obs, done)) 152 | score += reward 153 | step += 1 154 | obs = next_obs 155 | if step%train_interval==0 and len(buffers[idx]) > start_train: 156 | train(qnet, q_target, munet, mu_target,\ 157 | buffers[idx], q_optimizer, mu_optimizer) 158 | soft_update(q_target, qnet, target_update) 159 | soft_update(mu_target, munet, target_update) 160 | 161 | print('epsidoes:{}, goal_success:{}'.format(ep, score/episode)) 162 | prob = score/episode 163 | label = 1 if prob >= r_min and prob <= r_max else 0 164 | goal_label.append(label) 165 | return buffers, goal_label 166 | 167 | def train_gan(goals, label): 168 | global generator, discrim, gen_optimizer, disc_optimizer 169 | logit = discrim(torch.stack(goals, dim=0)) 170 | label = torch.tensor(label).unsqueeze(1) 171 | 172 | data_loss = label*(logit - 1.0).pow(2) + (1-label)*(logit + 1.0).pow(2) 173 | z = torch.randn(label.size(0), noise_size) 174 | gen_loss = (discrim(generator(z).detach()) + 1).pow(2) 175 | discrim_loss = (data_loss + gen_loss).mean() 176 | disc_optimizer.zero_grad() 177 | discrim_loss.backward() 178 | disc_optimizer.step() 179 | 180 | z = torch.randn(8, noise_size) 181 | gen_loss = discrim(generator(z)).pow(2).mean() 182 | gen_optimizer.zero_grad() 183 | gen_loss.backward() 184 | gen_optimizer.step() 185 | 186 | if __name__ == '__main__': 187 | env = gym.make('BipedalWalkerHardcore-v3') 188 | qnet, q_target, munet, mu_target, generator, discrim = QNet(), QNet(), \ 189 | ActionNet(), ActionNet(), Generator(), Discriminator() 190 | q_target.load_state_dict(qnet.state_dict()) 191 | mu_target.load_state_dict(munet.state_dict()) 192 | q_optimizer, mu_optimizer = optim.Adam(qnet.parameters(), lr=q_lr), \ 193 | optim.Adam(munet.parameters(), lr=mu_lr) 194 | gen_optimizer = optim.Adam(generator.parameters(), lr=gen_lr) 195 | disc_optimizer = optim.Adam(discrim.parameters(), lr=disc_lr) 196 | buffers, goals_old = {}, [] 197 | 198 | for i in range(iteration): 199 | gen_goal = list(generator(torch.randn(gen_num, noise_size))) 200 | 
buffers.update({g:deque(maxlen=buffer_size) for g in gen_goal}) 201 | #generate goals-list with new goal and sampled goal 202 | goals = gen_goal 203 | if len(goals_old) > smple_start: 204 | smple_goal = random.sample(goals_old, sample_num) 205 | goals += smple_goal 206 | goal_buffers = [buffers[g] for g in goals] 207 | #train policy with goals-list 208 | goal_buffers, goal_label = update_policy(env, update_epi, goals, goal_buffers) 209 | buffers.update({g:goal_buffers[idx] for idx, g in enumerate(goals)}) 210 | #goal evaluation 211 | for idx, g in enumerate(goals): 212 | if goal_label[idx]: 213 | goals_old.append(g) 214 | train_gan(goals, goal_label) 215 | env.close() -------------------------------------------------------------------------------- /iqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | EPISODES = 10000 11 | learning_rate = 0.0001 12 | discount_factor = 0.98 13 | buffer_size, start_train = 100000, 2000 14 | batch_size = 32 15 | target_sprt = 64 16 | pred_sprt = 32 17 | embed_dim = 2 18 | cvar_eta = 0.75 19 | k = 1.0 20 | #constant 21 | state_space, action_space = 8, 4 22 | PI = 3.1416 23 | 24 | class Quantile(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | self.embed = nn.Linear(embed_dim, 256) 28 | self.fc1 = nn.Linear(state_space, 256) 29 | self.fc2 = nn.Linear(256, 256) 30 | self.fc3 = nn.Linear(256, 256) 31 | self.acts = nn.Linear(256, action_space) 32 | 33 | def forward(self, obs, tau): 34 | taus = tau.view(-1, 1).expand(-1, embed_dim) 35 | embed_tau = taus * torch.arange(0, embed_dim) * PI 36 | embed_tau = F.relu(self.embed(torch.cos(embed_tau))) 37 | obs = F.relu(self.fc1(obs)) 38 | x = obs * embed_tau 39 | x = F.relu(self.fc2(x)) 40 | x = F.relu(self.fc3(x)) 41 | return self.acts(x) 42 | 43 | def mini_batch(buffer): 44 | mini_batch = random.sample(buffer, batch_size) 45 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 46 | 47 | for sample in mini_batch: 48 | s, a, r, s_, d = sample 49 | d = 0.0 if d else 1.0 50 | obs.append(s); acts.append(a); rewards.append(r); 51 | next_obs.append(s_); done.append(d) 52 | 53 | return torch.tensor(obs).float(), torch.tensor(acts), \ 54 | torch.tensor(rewards).float(), torch.tensor(next_obs).float(),\ 55 | torch.tensor(done) 56 | 57 | def train(net, target_net, optimizer, buffer): 58 | obs, acts, rewards, next_obs, done = mini_batch(buffer) 59 | 60 | next_q = [predict(target_net, next_obs, cvar_eta)[0] for _ in range(target_sprt)] 61 | next_q = torch.stack(next_q, dim=2) 62 | max_act = next_q.mean(dim=2).argmax(dim=1) 63 | next_qval = [next_q[idx][max_a] for idx, max_a in enumerate(max_act)] 64 | next_qval = torch.stack(next_qval, dim=0) 65 | target_q = rewards.view(-1, 1) + discount_factor * next_qval 66 | 67 | current_q, probs = [], [] 68 | for _ in range(pred_sprt): 69 | val, taus = predict(net, obs, cvar_eta) 70 | current_q.append(val); probs.append(taus) 71 | current_q = torch.stack(current_q, dim=2) 72 | curr_qval = [current_q[idx][a] for idx, a in enumerate(acts)] 73 | curr_qval = torch.stack(curr_qval, dim=0) 74 | probs = torch.stack(probs, dim=1).unsqueeze(1) 75 | 76 | #Quantile Regresion Loss 77 | target_q = target_q.view(batch_size, -1, 1).expand(-1, target_sprt, pred_sprt).detach() 78 | curr_q = curr_qval.view(batch_size, 1, -1).expand(-1, target_sprt, 
pred_sprt) 79 | diff = target_q - curr_q 80 | soft_diff = torch.where(torch.abs(diff)<=k, 0.5*torch.pow(diff, 2), \ 81 | k*(torch.abs(diff) - 0.5*k)) 82 | s_diff1 = soft_diff * probs 83 | s_diff2 = soft_diff * (1 - probs) 84 | error = torch.where(diff>=0, s_diff1, s_diff2) 85 | loss = torch.sum(error) / (batch_size * target_sprt) 86 | optimizer.zero_grad() 87 | loss.backward() 88 | optimizer.step() 89 | 90 | def predict(net, obs, cvar_eta): 91 | tau_ = cvar_eta * torch.rand(obs.size(0)) 92 | return net(obs, tau_), tau_ 93 | 94 | if __name__ == '__main__': 95 | env = gym.make('LunarLander-v2') 96 | net, target_net = Quantile(), Quantile() 97 | target_net.load_state_dict(net.state_dict()) 98 | optimizer = optim.Adam(net.parameters(), lr=learning_rate, eps=0.01/32) 99 | 100 | buffer = deque(maxlen=buffer_size) 101 | score, step = 0, 0 102 | epsilon, epsilon_decay = 0.4, 1-5e-6 103 | target_interval = 15 104 | 105 | for ep in range(EPISODES): 106 | obs = env.reset() 107 | done = False 108 | while not done: 109 | qvals = [predict(net, torch.tensor(obs).unsqueeze(0).float(), cvar_eta)[0] \ 110 | for _ in range(pred_sprt)] 111 | 112 | qvals = torch.stack(qvals, dim=0).mean(dim=0) 113 | rand = random.random() 114 | if rand < epsilon: 115 | action = random.randint(0, action_space-1) 116 | else: 117 | action = qvals.argmax().item() 118 | next_obs, reward, done, info = env.step(action) 119 | buffer.append((obs, action, reward/100.0, next_obs, done)) 120 | obs = next_obs 121 | step += 1 122 | score += reward 123 | epsilon *= epsilon_decay 124 | 125 | if len(buffer) > start_train: 126 | train(net, target_net, optimizer, buffer) 127 | 128 | if ep%target_interval==0 and ep!=0: 129 | target_net.load_state_dict(net.state_dict()) 130 | 131 | if ep%10==0 and ep!=0: 132 | print('episode:{}, step:{}, avg_score:{}, len_buffer:{}, epsilon:{}'.format(ep,\ 133 | step, score/10.0, len(buffer), epsilon)) 134 | score = 0 135 | env.close() -------------------------------------------------------------------------------- /ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | EPISODES = 10000 10 | learning_rate = 0.0002 11 | discount_factor, lmbda = 0.98, 0.5 12 | train_interval = 20 13 | train_iter = 3 14 | epsilon = 0.1 15 | 16 | class Network(nn.Module): 17 | def __init__(self): 18 | super().__init__() 19 | self.fc1 = nn.Linear(4, 128) 20 | self.fc2 = nn.Linear(128, 128) 21 | self.p = nn.Linear(128, 2) 22 | self.value = nn.Linear(128, 1) 23 | 24 | def pi(self, x): 25 | x = F.relu(self.fc1(x)) 26 | x = F.relu(self.fc2(x)) 27 | prob = F.softmax(self.p(x), dim=1) 28 | return prob 29 | 30 | def v(self, x): 31 | x = F.relu(self.fc1(x)) 32 | x = F.relu(self.fc2(x)) 33 | return self.value(x) 34 | 35 | def train(net, optimizer, samples): 36 | #mini-batch 37 | obs, acts, probs, rewards, next_obs, done = [], [], [], [], [], [] 38 | for transition in samples: 39 | s, a, p, r, s_, d = transition 40 | d = 0.0 if d else 1.0 41 | obs.append(s); acts.append(a); probs.append(p); rewards.append(r) 42 | next_obs.append(s_), done.append(d) 43 | 44 | obs, acts, probs, rewards, next_obs, done = torch.tensor(obs).float(), \ 45 | torch.tensor(acts), torch.tensor(probs).float(), torch.tensor(rewards).float(),\ 46 | torch.tensor(next_obs).float(), torch.tensor(done) 47 | 48 | #train 49 | for _ in 
range(train_iter): 50 | target = rewards.view(-1, 1) + discount_factor * net.v(next_obs) * done.view(-1, 1) 51 | td = target - net.v(obs) 52 | #Implementation of GAE(Generalized Advantage Estimation) 53 | advantage, R = [], 0.0 54 | for delta in torch.flip(td, dims=[0, 1]): 55 | R = delta + discount_factor * lmbda * R 56 | advantage.append(R) 57 | advantage.reverse() 58 | advantage = torch.tensor(advantage).float().unsqueeze(1) 59 | 60 | pi_a = net.pi(obs).gather(1, acts.view(-1, 1)) 61 | probs = probs.view(-1, 1) 62 | ratio = torch.exp(torch.log(pi_a) - torch.log(probs).detach()) 63 | clipped = torch.clamp(ratio, 1-epsilon, 1+epsilon) 64 | 65 | p_loss = -torch.min(ratio*advantage, clipped*advantage) 66 | v_loss = F.smooth_l1_loss(net.v(obs), target.detach()) 67 | loss = (p_loss + v_loss).mean() 68 | 69 | optimizer.zero_grad() 70 | loss.backward() 71 | optimizer.step() 72 | 73 | 74 | if __name__ == '__main__': 75 | env = gym.make('CartPole-v1') 76 | net = Network() 77 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 78 | samples, score, step = [], 0.0, 0 79 | 80 | for ep in range(EPISODES): 81 | obs = env.reset() 82 | done = False 83 | while not done: 84 | prob = net.pi(torch.tensor(obs).unsqueeze(0).float()) 85 | prob_ = Categorical(prob) 86 | action = prob_.sample().item() 87 | next_obs, reward, done, info = env.step(action) 88 | samples.append((obs, action, prob[0][action].item(), reward/100.0, next_obs, done)) 89 | score += reward 90 | step += 1 91 | obs = next_obs 92 | 93 | if step%train_interval==0: 94 | train(net, optimizer, samples) 95 | samples = [] 96 | 97 | if ep%10==0 and ep!=0: 98 | print('episode:{}, num_train:{}, avg_score:{}'.format(ep, \ 99 | step//train_interval, score/10.0)) 100 | score = 0.0 101 | env.close() -------------------------------------------------------------------------------- /qr-dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | EPISODES = 10000 11 | learning_rate = 0.0001 12 | discount_factor = 0.98 13 | buffer_size, start_train = 100000, 2000 14 | batch_size = 32 15 | num_support = 20 16 | k = 1.0 #for huber loss 17 | state_space, action_space = 8, 4 18 | 19 | #============================ base formula ============================ 20 | #make culminative distribution(tau-1 .... tau-n) from uniform probablity 21 | tau_prob = [n/num_support for n in range(num_support+1)] 22 | #get middle of two-taus(which is *unique minimizer* of wasserstein distance) 23 | mid_prob = [(tau_prob[i] + tau_prob[i+1])/2 for i in range(num_support)] 24 | 25 | class Quantile(nn.Module): 26 | def __init__(self): 27 | super().__init__() 28 | self.fc1 = nn.Linear(state_space, 256) 29 | self.fc2 = nn.Linear(256, 256) 30 | self.fc3 = nn.Linear(256, 256) 31 | self.acts = [nn.Linear(256, num_support) for _ in range(action_space)] 32 | 33 | def forward(self, x): 34 | ''' 35 | network-input: state 36 | output: q-distribution for each action -> Z(s, .) 
37 | output-shape: (actions, *batch_size*, supports) 38 | ''' 39 | x = F.relu(self.fc1(x)) 40 | x = F.relu(self.fc2(x)) 41 | x = F.relu(self.fc3(x)) 42 | value = [self.acts[i](x) for i in range(action_space)] 43 | return value 44 | 45 | def make_batch(buffer): 46 | ''' 47 | Make batch of train-samples by sampling from the buffer 48 | ''' 49 | mini_batch = random.sample(buffer, batch_size) 50 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 51 | 52 | for sample in mini_batch: 53 | s, a, r, s_, d = sample 54 | d = 0.0 if d else 1.0 55 | obs.append(s); acts.append(a); rewards.append(r); 56 | next_obs.append(s_); done.append(d) 57 | 58 | return torch.tensor(obs).float(), torch.tensor(acts), \ 59 | torch.tensor(rewards).float(), torch.tensor(next_obs).float(),\ 60 | torch.tensor(done) 61 | 62 | def train(net, target_net, optimizer, buffer): 63 | ''' 64 | Train network by samples from buffer 65 | 66 | In this function, 67 | next_supports means *q-distribution* over next-states 68 | supports means *q-distribution* over states 69 | ''' 70 | obs, acts, rewards, next_obs, done = make_batch(buffer) 71 | next_supports = target_net(next_obs) 72 | #next_supports(=list)'s shape is (*act_space*, batch_size, num_support) 73 | next_supports = torch.stack(next_supports, dim=1) #convert to tensor 74 | #now, shape is (*batch_size*, act_space, num_support) 75 | 76 | next_q = (1/num_support) * torch.sum(next_supports, dim=2) #get Q-value from dist. 77 | #next_q(expectation over support)'s shape is (batch_size, act_space) 78 | max_acts = next_q.argmax(dim=1) 79 | #max_acts'shape is just (batch_size,) 80 | 81 | #============= get next-supports of optimal actions => Z(s', a*) ============== 82 | max_quantile = [next_supports[idx][max_a] for idx, max_a in enumerate(max_acts)] 83 | max_quantile = torch.stack(max_quantile, dim=0) #just convert to tensor 84 | #max_quantile's shape is (batch_size, *num_support*) (actions were reduced) 85 | 86 | target_supports = rewards.view(-1, 1) + discount_factor * max_quantile 87 | 88 | supports = torch.stack(net(obs), dim=1) #supports over states 89 | 90 | #============= get supports of action => Z(s, a) ============== 91 | supports_a = [supports[idx][a] for idx, a in enumerate(acts)] 92 | supports_a = torch.stack(supports_a, dim=0) #just convert to tensor 93 | 94 | #============== Quantile Regression Loss Calculation ================= 95 | target_supports = target_supports.view(batch_size, -1, 1).expand(-1, num_support, num_support).detach() 96 | supports_a = supports_a.view(batch_size, 1, -1).expand(-1, num_support, num_support) 97 | diff = target_supports - supports_a 98 | 99 | #Huber loss calculation 100 | soft_diff = torch.where(torch.abs(diff)<=k, 0.5*torch.pow(diff, 2), \ 101 | k*(torch.abs(diff) - 0.5*k)) 102 | s_diff1 = torch.tensor(mid_prob) * soft_diff 103 | s_diff2 = (1 - torch.tensor(mid_prob)) * soft_diff 104 | error = torch.where(diff>=0, s_diff1, s_diff2) 105 | loss = torch.sum(error) / (batch_size * num_support) 106 | 107 | optimizer.zero_grad() 108 | loss.backward() 109 | optimizer.step() 110 | 111 | if __name__ == '__main__': 112 | env = gym.make('LunarLander-v2') 113 | net, target_net = Quantile(), Quantile() 114 | target_net.load_state_dict(net.state_dict()) 115 | optimizer = optim.Adam(net.parameters(), lr=learning_rate, eps=0.01/32) 116 | 117 | buffer = deque(maxlen=buffer_size) 118 | score, step = 0, 0 119 | epsilon, epsilon_decay = 0.2, 1-5e-6 120 | target_interval = 20 121 | 122 | for ep in range(EPISODES): 123 | obs = env.reset() 124 | done = 
False 125 | while not done: 126 | quantiles = net(torch.tensor(obs).unsqueeze(0).float()) 127 | #shape: (action_space, 1, num_supports) 128 | quantiles = torch.stack(quantiles, dim=1) #shape: (1, action_space, num_supports) 129 | qvals = (1/num_support) * torch.sum(quantiles, dim=2) #mean over supports 130 | rand = random.random() 131 | if rand < epsilon: 132 | action = random.randint(0, action_space-1) 133 | else: 134 | action = qvals.argmax().item() 135 | next_obs, reward, done, info = env.step(action) 136 | buffer.append((obs, action, reward/100.0, next_obs, done)) 137 | obs = next_obs 138 | step += 1 139 | score += reward 140 | epsilon *= epsilon_decay 141 | 142 | if len(buffer) > start_train: 143 | train(net, target_net, optimizer, buffer) 144 | 145 | if ep%target_interval==0 and ep!=0: 146 | target_net.load_state_dict(net.state_dict()) 147 | 148 | if ep%10==0 and ep!=0: 149 | print('episode:{}, step:{}, avg_score:{}, len_buffer:{}, epsilon:{}'.format(ep, step, \ 150 | score/10.0, len(buffer), epsilon)) 151 | score = 0 152 | env.close() -------------------------------------------------------------------------------- /sac.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | import numpy as np 8 | from torch.distributions import Normal 9 | from collections import deque 10 | 11 | #Hyperparameters 12 | EPISODES = 1000 13 | discount_factor = 0.98 14 | log_alpha = torch.tensor(np.log(0.01), requires_grad=True) 15 | target_entropy = -1.0 16 | train_interval = 10 17 | q_lr = 0.0005 18 | policy_lr = 0.0002 19 | alpha_lr = 0.001 20 | buffer_size, start_train = 100000, 2000 21 | batch_size = 32 22 | target_update = 0.995 23 | 24 | class QNet(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | self.obs = nn.Linear(3, 128) 28 | self.act = nn.Linear(1, 128) 29 | self.fc = nn.Linear(256, 256) 30 | self.q = nn.Linear(256, 1) 31 | 32 | def forward(self, x, a): 33 | x = F.relu(self.obs(x)) 34 | a = F.relu(self.act(a)) 35 | #print(x.shape, a.shape) 36 | x = torch.cat([x, a], dim=1) 37 | x = F.relu(self.fc(x)) 38 | return self.q(x) 39 | 40 | class PolicyNet(nn.Module): 41 | def __init__(self): 42 | super().__init__() 43 | self.fc1 = nn.Linear(3, 128) 44 | self.fc2 = nn.Linear(128, 128) 45 | self.mu = nn.Linear(128, 1) 46 | self.sigma = nn.Linear(128, 1) 47 | 48 | def forward(self, x): 49 | x = F.relu(self.fc1(x)) 50 | x = F.relu(self.fc2(x)) 51 | dist = Normal(self.mu(x), F.softplus(self.sigma(x))) 52 | action = dist.rsample() 53 | log_prob = dist.log_prob(action) 54 | action = 2*torch.tanh(action) 55 | return action, log_prob 56 | 57 | def make_batch(buffer): 58 | mini_batch = random.sample(buffer, batch_size) 59 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 60 | for samples in mini_batch: 61 | s, a, r, s_, d = samples 62 | d = 0.0 if d else 1.0 63 | obs.append(s); acts.append(a); rewards.append(r); 64 | next_obs.append(s_); done.append(d) 65 | 66 | return torch.tensor(obs).float(), torch.tensor(acts).float(), \ 67 | torch.tensor(rewards).float(), torch.tensor(next_obs).float(), \ 68 | torch.tensor(done) 69 | 70 | def train(networks, buffer, optimizers): 71 | obs, acts, rewards, next_obs, done = make_batch(buffer) 72 | q1, q1_target, q2, q2_target, pi = networks 73 | q1_optimizer, q2_optimizer, pi_optimizer, alpha_optimizer = optimizers 74 | 75 | next_acts, log_prob = pi(next_obs) 76 | q_target = 
torch.min(q1_target(next_obs, next_acts), q2_target(next_obs, next_acts)) 77 | target = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) * \ 78 | (q_target - torch.exp(log_alpha)*log_prob) 79 | 80 | q1_loss = F.smooth_l1_loss(q1(obs, acts.view(-1, 1)), target.detach()) 81 | q2_loss = F.smooth_l1_loss(q2(obs, acts.view(-1, 1)), target.detach()) 82 | 83 | q1_optimizer.zero_grad(); q1_loss.backward(); q1_optimizer.step() 84 | q2_optimizer.zero_grad(); q2_loss.backward(); q2_optimizer.step() 85 | 86 | sampled_a, log_prob = pi(obs) 87 | q_value = torch.min(q1(obs, sampled_a), q2(obs, sampled_a)) 88 | policy_obj = -q_value + torch.exp(log_alpha)*log_prob 89 | pi_optimizer.zero_grad() 90 | policy_obj.mean().backward() 91 | pi_optimizer.step() 92 | 93 | alpha_obj = -torch.exp(log_alpha)*(log_prob.detach() + target_entropy) 94 | alpha_optimizer.zero_grad() 95 | alpha_obj.mean().backward() 96 | alpha_optimizer.step() 97 | 98 | def soft_update(t_net, net, target_ratio): 99 | for t_param, param in zip(t_net.parameters(), net.parameters()): 100 | t_param.data.copy_(t_param.data*target_ratio + param.data*(1-target_ratio)) 101 | 102 | if __name__ == '__main__': 103 | env = gym.make('Pendulum-v0') 104 | q1net, q1_target, q2net, q2_target, pinet = QNet(), QNet(), QNet(), \ 105 | QNet(), PolicyNet() 106 | q1_target.load_state_dict(q1net.state_dict()) 107 | q2_target.load_state_dict(q2net.state_dict()) 108 | q1_optimizer = optim.Adam(q1net.parameters(), lr=q_lr) 109 | q2_optimizer = optim.Adam(q2net.parameters(), lr=q_lr) 110 | pi_optimizer = optim.Adam(pinet.parameters(), lr=policy_lr) 111 | alpha_optimizer = optim.Adam([log_alpha], lr=alpha_lr) 112 | 113 | buffer = deque(maxlen=buffer_size) 114 | score, step = 0.0, 0 115 | for ep in range(EPISODES): 116 | done = False 117 | obs = env.reset() 118 | while not done: 119 | action, _ = pinet(torch.tensor(obs).float()) 120 | next_obs, reward, done, info = env.step([action.item()]) 121 | buffer.append((obs, action.item(), reward/10.0, next_obs, done)) 122 | score += reward 123 | step += 1 124 | obs = next_obs 125 | 126 | if step%train_interval==0 and len(buffer) > start_train: 127 | train((q1net, q1_target, q2net, q2_target, pinet), buffer, \ 128 | (q1_optimizer, q2_optimizer, pi_optimizer, alpha_optimizer)) 129 | soft_update(q1_target, q1net, target_update) 130 | soft_update(q2_target, q2net, target_update) 131 | 132 | if ep%10==0 and ep!=0: 133 | print('episode:{}, buffer_size:{}, alpha:{}, avg_score:{}'.format( 134 | ep, len(buffer), torch.exp(log_alpha).item(), score/10.0)) 135 | score = 0.0 136 | env.close() -------------------------------------------------------------------------------- /single-acer.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import torch.autograd as autograd 7 | import random 8 | import itertools 9 | from collections import deque 10 | from torch.distributions import Categorical 11 | 12 | #Note: single-thread version, support trpo-update 13 | #Hyperparameters 14 | EPISODES = 10000 15 | learning_rate = 0.0002 16 | discount_factor = 0.98 17 | train_interval = 10 18 | replay_iter = 8 19 | buffer_len, start_train = 20000, 500 20 | is_clipping = 1.2 21 | trpo_delta = 1.0 22 | avgnet_ratio = 0.995 23 | 24 | class Network(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | self.fc1 = nn.Linear(4, 128) 28 | self.fc2 = nn.Linear(128, 128) 29 | self.policy = 
nn.Linear(128, 2) 30 | self.qval = nn.Linear(128, 2) 31 | 32 | def p(self, x): 33 | x = F.relu(self.fc1(x)) 34 | self.pi = F.relu(self.fc2(x)) 35 | self.pi.retain_grad() 36 | prob = F.softmax(self.policy(self.pi), dim=1) 37 | return prob 38 | 39 | def q(self, x): 40 | x = F.relu(self.fc1(x)) 41 | x = F.relu(self.fc2(x)) 42 | return self.qval(x) 43 | 44 | def mini_batch(data): 45 | obs, acts, probs, rewards, next_obs, done = [], [], [], [], [], [] 46 | for transition in data: 47 | s, a, p, r, s_, d = transition 48 | obs.append(s); acts.append(a); probs.append(p); rewards.append(r) 49 | next_obs.append(s_), done.append(d) 50 | if d: 51 | break 52 | 53 | return torch.tensor(obs).float(), torch.tensor(acts), \ 54 | torch.stack(probs, dim=0).float(), torch.tensor(rewards).float(),\ 55 | torch.tensor(next_obs).float(), torch.tensor(done) 56 | 57 | def train_process(net, avg_net, samples, optimizer): 58 | obs, acts, old_probs, rewards, next_obs, done = samples 59 | acts, rewards = acts.view(-1, 1), rewards.view(-1, 1) 60 | final_q, final_p = net.q(next_obs[-1].unsqueeze(0)), net.p(next_obs[-1].unsqueeze(0)) 61 | final_v = torch.sum(final_q * final_p, dim=1) 62 | qval = net.q(obs) 63 | current_p = net.p(obs) 64 | avg_p = avg_net.p(obs) 65 | value = torch.sum(qval*current_p, dim=1, keepdim=True) 66 | 67 | act_q = qval.gather(1, acts) 68 | ratio = torch.exp(torch.log(current_p) - torch.log(old_probs)) 69 | ret_ratio = torch.min(torch.tensor(1.0), ratio.gather(1, acts)) 70 | policy_ratio = torch.min(torch.tensor(is_clipping), ratio.gather(1, acts)) 71 | 72 | ret_q = [] 73 | R = final_v if not done[-1] else torch.tensor([0.0]) 74 | for idx, r in enumerate(torch.flip(rewards, [0, 1])): 75 | R = r + discount_factor * R 76 | ret_q.append(R) 77 | R = ret_ratio[-1-idx]*(R - act_q[-1-idx]) + value[-1-idx] 78 | ret_q.reverse() 79 | ret_q = torch.stack(ret_q, dim=0) 80 | 81 | p_obj1 = policy_ratio.detach() * torch.log(current_p.gather(1, acts)) * \ 82 | (ret_q - value).detach() 83 | p_obj2 = 0 84 | for a in range(2): 85 | coeff = torch.max(torch.tensor(0), 1-is_clipping/ratio[:, a]).view(-1, 1) 86 | a_prob, a_qval = current_p[:, a].view(-1, 1), qval[:, a].view(-1, 1) 87 | p_obj2 += (coeff*a_prob).detach() * torch.log(a_prob) * (a_qval - value).detach() 88 | 89 | policy_obj = (p_obj1 + p_obj2).mean() 90 | 91 | g = autograd.grad(policy_obj, net.pi, retain_graph=True)[0] 92 | kld = F.kl_div(avg_p.detach(), current_p) 93 | k = autograd.grad(kld, net.pi, retain_graph=True)[0] 94 | #trust-region update 95 | k_norm = torch.linalg.norm(k, dim=1).view(-1, 1, 1)**2 96 | g_, k_ = g.unsqueeze(2), k.unsqueeze(1) 97 | solve = (torch.bmm(k_, g_) - trpo_delta) / k_norm 98 | new_g = g - torch.max(torch.tensor(0), solve.view(-1, 1))*k 99 | 100 | q_loss = F.smooth_l1_loss(act_q, ret_q.detach()) 101 | optimizer.zero_grad() 102 | net.policy.weight._grad = autograd.grad(-policy_obj, net.policy.weight, retain_graph=True)[0] 103 | net.pi.backward(-new_g) 104 | q_loss.backward() 105 | optimizer.step() 106 | 107 | def train(net, avg_net, online_sample, buffer, optimizer): 108 | train_process(net, avg_net, mini_batch(online_sample), optimizer) 109 | 110 | if len(buffer) > start_train: 111 | for _ in range(replay_iter): 112 | key = random.randint(0, len(buffer)-train_interval) 113 | replay_sample = itertools.islice(buffer, key, key+train_interval) 114 | train_process(net, avg_net, mini_batch(replay_sample), optimizer) 115 | 116 | if __name__ == '__main__': 117 | env = gym.make('CartPole-v1') 118 | net, avg_net = Network(), Network() 119 
| avg_net.load_state_dict(net.state_dict()) 120 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 121 | buffer = deque(maxlen=buffer_len) 122 | samples, score, step = [], 0.0, 0 123 | 124 | for ep in range(EPISODES): 125 | obs = env.reset() 126 | done = False 127 | while not done: 128 | prob = net.p(torch.tensor(obs).unsqueeze(0).float()) 129 | prob_ = Categorical(prob) 130 | action = prob_.sample().item() 131 | next_obs, reward, done, info = env.step(action) 132 | data = (obs, action, prob[0], reward/100.0, next_obs, done) 133 | samples.append(data) 134 | buffer.append(data) 135 | score += reward 136 | step += 1 137 | obs = next_obs 138 | 139 | if step%train_interval==0: 140 | train(net, avg_net, samples, buffer, optimizer) 141 | for a_param, param in zip(avg_net.parameters(), net.parameters()): 142 | a_param.data.copy_(a_param.data*avgnet_ratio + param.data*(1-avgnet_ratio)) 143 | samples = [] 144 | 145 | if ep%10==0 and ep!=0: 146 | print('episode:{}, num_train:{}, avg_score:{}'.format(ep, \ 147 | step//train_interval, score/10.0)) 148 | score = 0.0 149 | env.close() -------------------------------------------------------------------------------- /td3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import random 7 | from collections import deque 8 | 9 | #Hyperparameters 10 | EPISODES = 1000 11 | q_lr = 0.0005 12 | mu_lr = 0.0001 13 | discount_factor = 0.98 14 | train_interval = 10 15 | policy_delay = 20 16 | buffer_size, start_train = 100000, 2000 17 | batch_size = 32 18 | target_update = 0.995 19 | 20 | class QNet(nn.Module): 21 | def __init__(self): 22 | super().__init__() 23 | self.obs = nn.Linear(3, 128) 24 | self.act = nn.Linear(1, 128) 25 | self.fc = nn.Linear(256, 256) 26 | self.q = nn.Linear(256, 1) 27 | 28 | def forward(self, x, a): 29 | x = F.relu(self.obs(x)) 30 | a = F.relu(self.act(a)) 31 | x = torch.cat([x, a], dim=1) 32 | x = F.relu(self.fc(x)) 33 | return self.q(x) 34 | 35 | class ActionNet(nn.Module): 36 | def __init__(self): 37 | super().__init__() 38 | self.fc1 = nn.Linear(3, 128) 39 | self.fc2 = nn.Linear(128, 128) 40 | self.action = nn.Linear(128, 1) 41 | 42 | def forward(self, x): 43 | x = F.relu(self.fc1(x)) 44 | x = F.relu(self.fc2(x)) 45 | a = torch.tanh(self.action(x)) 46 | return 2*a 47 | 48 | def make_bach(buffer): 49 | mini_batch = random.sample(buffer, batch_size) 50 | obs, acts, rewards, next_obs, done = [], [], [], [], [] 51 | for samples in mini_batch: 52 | s, a, r, s_, d = samples 53 | d = 0.0 if d else 1.0 54 | obs.append(s); acts.append(a); rewards.append(r); 55 | next_obs.append(s_); done.append(d) 56 | 57 | return torch.tensor(obs).float(), torch.tensor(acts).float(), \ 58 | torch.tensor(rewards).float(), torch.tensor(next_obs).float(), \ 59 | torch.tensor(done) 60 | 61 | def train(networks, buffer, optimizers, step): 62 | obs, acts, rewards, next_obs, done = make_bach(buffer) 63 | q1, q1_target, q2, q2_target, mu, mu_target = networks 64 | q1_optimizer, q2_optimizer, mu_optimizer = optimizers 65 | 66 | noise = torch.clamp(torch.randn(batch_size, 1)*0.5, -0.2, 0.2) 67 | target_act = torch.clamp(mu_target(next_obs) + noise, -2.0, 2.0) 68 | q_target = torch.min(q1_target(next_obs, target_act),\ 69 | q2_target(next_obs, target_act)) 70 | target = rewards.view(-1, 1) + discount_factor * done.view(-1, 1) * q_target 71 | 72 | q1_loss = F.smooth_l1_loss(q1(obs, acts.view(-1, 1)), 
target.detach()) 73 | q2_loss = F.smooth_l1_loss(q2(obs, acts.view(-1, 1)), target.detach()) 74 | q1_optimizer.zero_grad(); q1_loss.backward(); q1_optimizer.step() 75 | q2_optimizer.zero_grad(); q2_loss.backward(); q2_optimizer.step() 76 | 77 | if step%policy_delay==0: 78 | mu_obj = -q1(obs, mu(obs)).mean() 79 | mu_optimizer.zero_grad() 80 | mu_obj.backward() 81 | mu_optimizer.step() 82 | 83 | #Implementation of soft-update 84 | def soft_update(t_net, net, target_ratio): 85 | for t_param, param in zip(t_net.parameters(), net.parameters()): 86 | t_param.data.copy_(t_param.data*target_ratio + param.data*(1-target_ratio)) 87 | 88 | if __name__ == '__main__': 89 | env = gym.make('Pendulum-v0') 90 | q1net, q1_target, q2net, q2_target, munet, mu_target = QNet(), QNet(), QNet(), QNet(),\ 91 | ActionNet(), ActionNet() 92 | q1_target.load_state_dict(q1net.state_dict()) 93 | q2_target.load_state_dict(q2net.state_dict()) 94 | mu_target.load_state_dict(munet.state_dict()) 95 | q1_optimizer = optim.Adam(q1net.parameters(), lr=q_lr) 96 | q2_optimizer = optim.Adam(q2net.parameters(), lr=q_lr) 97 | mu_optimizer = optim.Adam(munet.parameters(), lr=mu_lr) 98 | 99 | buffer = deque(maxlen=buffer_size) 100 | score, step = 0.0, 0 101 | 102 | for ep in range(EPISODES): 103 | done = False 104 | obs = env.reset() 105 | while not done: 106 | a = munet(torch.tensor(obs).float()) 107 | noise = torch.randn(1) * 0.5 108 | action = torch.clamp(a+noise, -2.0, 2.0).item() 109 | next_obs, reward, done, info = env.step([action]) 110 | buffer.append((obs, action, reward/100.0, next_obs, done)) 111 | score += reward 112 | step += 1 113 | obs = next_obs 114 | 115 | if step%train_interval==0 and len(buffer) > start_train: 116 | train((q1net, q1_target, q2net, q2_target, munet, mu_target), buffer, \ 117 | (q1_optimizer, q2_optimizer, mu_optimizer), step) 118 | soft_update(q1_target, q1net, target_update) 119 | soft_update(q2_target, q2net, target_update) 120 | soft_update(mu_target, munet, target_update) 121 | 122 | if ep%10==0 and ep!=0: 123 | print('epsidoes:{}, buffer_size:{}, avg_score:{}'.format(ep, len(buffer), score/10.0)) 124 | score = 0.0 --------------------------------------------------------------------------------
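A note on qr-dqn.py and iqn.py: both scripts build a `done` mask in their batch functions but never multiply it into the distributional target (rewards plus discount_factor times the next-state quantiles), so bootstrapping continues through terminal states. The sketch below is a minimal, self-contained version of the quantile-regression Huber loss that both files implement, with the terminal mask applied the way sac.py and td3.py apply theirs. Names such as theta, theta_target and tau_hat are illustrative and not taken from the repository.

import torch

def quantile_huber_loss(theta, theta_target, tau_hat, k=1.0):
    # theta:        (batch, N)  current quantiles of Z(s, a)
    # theta_target: (batch, N') target quantiles r + gamma * mask * Z(s', a*)
    # tau_hat:      (N,)        quantile midpoints, e.g. (2i + 1) / (2N)
    batch, n = theta.shape
    n_target = theta_target.shape[1]
    diff = theta_target.unsqueeze(2) - theta.unsqueeze(1)   # pairwise TD errors, shape (batch, N', N)
    huber = torch.where(diff.abs() <= k, 0.5 * diff.pow(2), k * (diff.abs() - 0.5 * k))
    # asymmetric weight |tau_hat - 1{diff < 0}|: tau_hat where diff >= 0, (1 - tau_hat) otherwise,
    # the same split as torch.where(diff >= 0, tau * huber, (1 - tau) * huber) in the scripts
    weight = torch.abs(tau_hat.view(1, 1, n) - (diff.detach() < 0).float())
    return (weight * huber).sum() / (batch * n_target)

if __name__ == '__main__':
    torch.manual_seed(0)
    n = 4
    tau_hat = (torch.arange(n).float() + 0.5) / n           # quantile midpoints
    theta = torch.zeros(2, n, requires_grad=True)           # current quantiles for two transitions
    rewards = torch.tensor([[1.0], [0.5]])
    mask = torch.tensor([[1.0], [0.0]])                     # 0.0 on the terminal transition
    next_quantiles = torch.ones(2, n)
    theta_target = rewards + 0.98 * mask * next_quantiles   # terminal row does not bootstrap
    loss = quantile_huber_loss(theta, theta_target.detach(), tau_hat)
    loss.backward()
    print(loss.item(), theta.grad)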
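In qr-dqn.py the per-action quantile heads are kept in a plain Python list (self.acts = [nn.Linear(256, num_support) for _ in range(action_space)]). PyTorch registers sub-modules only when they are assigned directly as attributes or wrapped in a container such as nn.ModuleList, so heads held in a plain list do not appear in net.parameters() (the Adam optimizer built from net.parameters() never updates them) nor in state_dict() (so target_net.load_state_dict(net.state_dict()) does not copy them). Below is a minimal sketch of the registered variant; the class name is illustrative, while the layer sizes follow the script.

import torch
import torch.nn as nn
import torch.nn.functional as F

num_support, state_space, action_space = 20, 8, 4

class QuantileHeads(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(state_space, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        # nn.ModuleList registers every head, so their weights appear in
        # parameters() and in state_dict()
        self.acts = nn.ModuleList([nn.Linear(256, num_support) for _ in range(action_space)])

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return [head(x) for head in self.acts]

if __name__ == '__main__':
    net = QuantileHeads()
    # with a plain list, the action-head parameters would be missing from this count
    print(sum(p.numel() for p in net.parameters()))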
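ppo.py computes GAE with a backward recursion over the TD errors, R = delta_t + gamma * lmbda * R. The small numerical sketch below, assuming a single uninterrupted episode so no terminal masking is needed inside the recursion, checks that recursion against the closed form A_t = sum_l (gamma * lmbda)^l * delta_{t+l}; the delta values are made up for the check.

import torch

gamma, lmbda = 0.98, 0.5
deltas = torch.tensor([0.3, -0.1, 0.7, 0.2])   # TD errors delta_t for one episode

# backward recursion, as in the train() loop of ppo.py
adv, R = [], 0.0
for delta in torch.flip(deltas, dims=[0]):
    R = delta + gamma * lmbda * R
    adv.append(R)
adv.reverse()
adv = torch.stack(adv)

# closed form: A_t = sum_l (gamma * lmbda)^l * delta_{t+l}
closed = torch.tensor([
    sum((gamma * lmbda) ** l * deltas[t + l].item() for l in range(len(deltas) - t))
    for t in range(len(deltas))
])

print(adv)
print(closed)
assert torch.allclose(adv, closed)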
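sac.py's PolicyNet returns dist.log_prob(action) of the pre-squash Gaussian sample while the action actually executed is 2*tanh(action). The usual correction for a tanh-squashed Gaussian subtracts log(1 - tanh(u)^2) per action dimension (plus the log of the scale); whether to include it in this script is a design choice, so the following is only a sketch of a squashed-Gaussian head with the correction applied. Class and variable names are illustrative; the scale of 2.0 mirrors the 2*torch.tanh(action) used in sac.py.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

class SquashedGaussianPolicy(nn.Module):
    def __init__(self, obs_dim=3, act_dim=1, scale=2.0):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.mu = nn.Linear(128, act_dim)
        self.sigma = nn.Linear(128, act_dim)
        self.scale = scale

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        dist = Normal(self.mu(x), F.softplus(self.sigma(x)))
        u = dist.rsample()                         # reparameterised pre-squash sample
        log_prob = dist.log_prob(u)
        # change of variables for a = scale * tanh(u):
        # log pi(a|s) = log N(u) - log(1 - tanh(u)^2) - log(scale)
        log_prob = log_prob - torch.log(1 - torch.tanh(u).pow(2) + 1e-7) \
                   - torch.log(torch.tensor(self.scale))
        return self.scale * torch.tanh(u), log_prob.sum(dim=-1, keepdim=True)

if __name__ == '__main__':
    pi = SquashedGaussianPolicy()
    a, log_p = pi(torch.randn(4, 3))
    print(a.shape, log_p.shape)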