├── deep_sea_treasure_env ├── __init__.py └── deep_sea_treasure_env.py ├── LICENSE ├── mp_queue_test.py ├── REINFORCE.py ├── README.md ├── actor_critic.py ├── dqn.py ├── ppo.py ├── a3c.py ├── ddpg.py ├── ppo-lstm.py ├── acer.py ├── a2c.py └── a3c_dst.py /deep_sea_treasure_env/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 seungeunrho 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mp_queue_test.py: -------------------------------------------------------------------------------- 1 | import torch.multiprocessing as mp 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import datetime 5 | import time 6 | 7 | max_step = 10 8 | 9 | 10 | def train(rank, model, Q): 11 | print('Do something') 12 | for step in range(0,max_step): 13 | Q.put(rank, True, 0.1) 14 | time.sleep(0.5) 15 | 16 | 17 | def log(model, Q): 18 | print('Start logging') 19 | writer = SummaryWriter(filename_suffix=datetime.datetime.now().ctime().replace(" ", "_")) 20 | 21 | for step in range(0, max_step): 22 | while not Q.empty(): 23 | data = Q.get() 24 | print(f'received data: {data}') 25 | writer.add_scalar('data', data, step) 26 | time.sleep(0.5) 27 | 28 | 29 | if __name__ == '__main__': 30 | num_processes = 4 31 | model = torch.nn.Module() 32 | # NOTE: this is required for the ``fork`` method to work 33 | model.share_memory() 34 | Q = mp.Queue() 35 | #summary_writer = SummaryWriter(filename_suffix=datetime.datetime.now().ctime().replace(" ", "_")) 36 | processes = [] 37 | for rank in range(num_processes): 38 | if rank == 0: 39 | # p = mp.Process(target=log, args=(model, Q, summary_writer)) 40 | p = mp.Process(target=log, args=(model, Q, )) 41 | else: 42 | p = mp.Process(target=train, args=(rank, model, Q)) 43 | p.start() 44 | processes.append(p) 45 | for p in processes: 46 | p.join() -------------------------------------------------------------------------------- /REINFORCE.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import 
Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0002 10 | gamma = 0.98 11 | 12 | class Policy(nn.Module): 13 | def __init__(self): 14 | super(Policy, self).__init__() 15 | self.data = [] 16 | 17 | self.fc1 = nn.Linear(4, 128) 18 | self.fc2 = nn.Linear(128, 2) 19 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 20 | 21 | def forward(self, x): 22 | x = F.relu(self.fc1(x)) 23 | x = F.softmax(self.fc2(x), dim=0) 24 | return x 25 | 26 | def put_data(self, item): 27 | self.data.append(item) 28 | 29 | def train_net(self): 30 | R = 0 31 | self.optimizer.zero_grad() 32 | for r, prob in self.data[::-1]: 33 | R = r + gamma * R 34 | loss = -torch.log(prob) * R 35 | loss.backward() 36 | self.optimizer.step() 37 | self.data = [] 38 | 39 | def main(): 40 | env = gym.make('CartPole-v1') 41 | pi = Policy() 42 | score = 0.0 43 | print_interval = 20 44 | 45 | 46 | for n_epi in range(10000): 47 | s = env.reset() 48 | done = False 49 | 50 | while not done: # CartPole-v1 is forced to terminate at 500 steps. 51 | prob = pi(torch.from_numpy(s).float()) 52 | m = Categorical(prob) 53 | a = m.sample() 54 | s_prime, r, done, info = env.step(a.item()) 55 | pi.put_data((r,prob[a])) 56 | s = s_prime 57 | score += r 58 | 59 | pi.train_net() 60 | 61 | if n_epi%print_interval==0 and n_epi!=0: 62 | print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval)) 63 | score = 0.0 64 | env.close() 65 | 66 | if __name__ == '__main__': 67 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-objective reinforcement learning 2 | 3 | This repo is a fork of minimalRL-pytorch in which the A3C code was made distributed so that it can handle multiple objectives. Additionally, a new 4 | Gym environment, Deep Sea Treasure, is implemented. This is a standard multi-objective optimization problem used to test different optimization 5 | strategies. 6 | 7 | For a description of the Deep Sea Treasure problem, see: 8 | Vamplew P, Dazeley R, Berry A, Issabekov R, Dekker E (2011) Empirical evaluation methods for multiobjective reinforcement learning algorithms. Mach Learn 84(1-2):51–80 9 | 10 | ... 11 | 12 | # minimalRL-pytorch 13 | 14 | Implementations of basic RL algorithms with minimal lines of code! (PyTorch based) 15 | 16 | * Each algorithm is complete within a single file. 17 | 18 | * Length of each file is up to 100~150 lines of code. 19 | 20 | * Every algorithm can be trained within 30 seconds, even without GPU. 21 | 22 | * Envs are fixed to "CartPole-v1". You can just focus on the implementations. 23 | 24 | 25 | 26 | ## Algorithms 27 | 1. [REINFORCE](https://github.com/seungeunrho/minimalRL/blob/master/REINFORCE.py) (67 lines) 28 | 2. [Vanilla Actor-Critic](https://github.com/seungeunrho/minimalRL/blob/master/actor_critic.py) (98 lines) 29 | 3. [DQN](https://github.com/seungeunrho/minimalRL/blob/master/dqn.py) (112 lines, including replay memory and target network) 30 | 4. [PPO](https://github.com/seungeunrho/minimalRL/blob/master/ppo.py) (119 lines, including GAE) 31 | 5. [DDPG](https://github.com/seungeunrho/minimalRL/blob/master/ddpg.py) (147 lines, including OU noise and soft target update) 32 | 6. [A3C](https://github.com/seungeunrho/minimalRL/blob/master/a3c.py) (129 lines) 33 | 7. [ACER](https://github.com/seungeunrho/minimalRL/blob/master/acer.py) (149 lines) 34 | 8. [A2C](https://github.com/seungeunrho/minimalRL/blob/master/a2c.py) added!
(188 lines) 35 | 9. Any suggestion ..? 36 | 37 | 38 | ## Dependencies 39 | 1. PyTorch 40 | 2. OpenAI GYM 41 | 42 | ## Usage 43 | ```bash 44 | # Works only with Python 3. 45 | # e.g. 46 | python3 REINFORCE.py 47 | python3 actor_critic.py 48 | python3 dqn.py 49 | python3 ppo.py 50 | python3 ddpg.py 51 | python3 a3c.py 52 | python3 a2c.py 53 | python3 acer.py 54 | ``` 55 | -------------------------------------------------------------------------------- /actor_critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0002 10 | gamma = 0.98 11 | n_rollout = 10 12 | 13 | class ActorCritic(nn.Module): 14 | def __init__(self): 15 | super(ActorCritic, self).__init__() 16 | self.data = [] 17 | 18 | self.fc1 = nn.Linear(4,256) 19 | self.fc_pi = nn.Linear(256,2) 20 | self.fc_v = nn.Linear(256,1) 21 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 22 | 23 | def pi(self, x, softmax_dim = 0): 24 | x = F.relu(self.fc1(x)) 25 | x = self.fc_pi(x) 26 | prob = F.softmax(x, dim=softmax_dim) 27 | return prob 28 | 29 | def v(self, x): 30 | x = F.relu(self.fc1(x)) 31 | v = self.fc_v(x) 32 | return v 33 | 34 | def put_data(self, transition): 35 | self.data.append(transition) 36 | 37 | def make_batch(self): 38 | s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], [] 39 | for transition in self.data: 40 | s,a,r,s_prime,done = transition 41 | s_lst.append(s) 42 | a_lst.append([a]) 43 | r_lst.append([r/100.0]) 44 | s_prime_lst.append(s_prime) 45 | done_mask = 0.0 if done else 1.0 46 | done_lst.append([done_mask]) 47 | 48 | s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 49 | torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \ 50 | torch.tensor(done_lst, dtype=torch.float) 51 | self.data = [] 52 | return s_batch, a_batch, r_batch, s_prime_batch, done_batch 53 | 54 | def train_net(self): 55 | s, a, r, s_prime, done = self.make_batch() 56 | td_target = r + gamma * self.v(s_prime) * done 57 | delta = td_target - self.v(s) 58 | 59 | pi = self.pi(s, softmax_dim=1) 60 | pi_a = pi.gather(1,a) 61 | loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach()) 62 | 63 | self.optimizer.zero_grad() 64 | loss.mean().backward() 65 | self.optimizer.step() 66 | 67 | def main(): 68 | env = gym.make('CartPole-v1') 69 | model = ActorCritic() 70 | print_interval = 20 71 | score = 0.0 72 | 73 | for n_epi in range(10000): 74 | done = False 75 | s = env.reset() 76 | while not done: 77 | for t in range(n_rollout): 78 | prob = model.pi(torch.from_numpy(s).float()) 79 | m = Categorical(prob) 80 | a = m.sample().item() 81 | s_prime, r, done, info = env.step(a) 82 | model.put_data((s,a,r,s_prime,done)) 83 | 84 | s = s_prime 85 | score += r 86 | 87 | if done: 88 | break 89 | 90 | model.train_net() 91 | 92 | if n_epi%print_interval==0 and n_epi!=0: 93 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 94 | score = 0.0 95 | env.close() 96 | 97 | if __name__ == '__main__': 98 | main() -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 
4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | 10 | #Hyperparameters 11 | learning_rate = 0.0005 12 | gamma = 0.98 13 | buffer_limit = 50000 14 | batch_size = 32 15 | 16 | class ReplayBuffer(): 17 | def __init__(self): 18 | self.buffer = collections.deque(maxlen=buffer_limit) 19 | 20 | def put(self, transition): 21 | self.buffer.append(transition) 22 | 23 | def sample(self, n): 24 | mini_batch = random.sample(self.buffer, n) 25 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 26 | 27 | for transition in mini_batch: 28 | s, a, r, s_prime, done_mask = transition 29 | s_lst.append(s) 30 | a_lst.append([a]) 31 | r_lst.append([r]) 32 | s_prime_lst.append(s_prime) 33 | done_mask_lst.append([done_mask]) 34 | 35 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 36 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 37 | torch.tensor(done_mask_lst) 38 | 39 | def size(self): 40 | return len(self.buffer) 41 | 42 | class Qnet(nn.Module): 43 | def __init__(self): 44 | super(Qnet, self).__init__() 45 | self.fc1 = nn.Linear(4, 128) 46 | self.fc2 = nn.Linear(128, 128) 47 | self.fc3 = nn.Linear(128, 2) 48 | 49 | def forward(self, x): 50 | x = F.relu(self.fc1(x)) 51 | x = F.relu(self.fc2(x)) 52 | x = self.fc3(x) 53 | return x 54 | 55 | def sample_action(self, obs, epsilon): 56 | out = self.forward(obs) 57 | coin = random.random() 58 | if coin < epsilon: 59 | return random.randint(0,1) 60 | else : 61 | return out.argmax().item() 62 | 63 | def train(q, q_target, memory, optimizer): 64 | for i in range(10): 65 | s,a,r,s_prime,done_mask = memory.sample(batch_size) 66 | 67 | q_out = q(s) 68 | q_a = q_out.gather(1,a) 69 | max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1) 70 | target = r + gamma * max_q_prime * done_mask 71 | loss = F.smooth_l1_loss(q_a, target) 72 | 73 | optimizer.zero_grad() 74 | loss.backward() 75 | optimizer.step() 76 | 77 | def main(): 78 | env = gym.make('CartPole-v1') 79 | q = Qnet() 80 | q_target = Qnet() 81 | q_target.load_state_dict(q.state_dict()) 82 | memory = ReplayBuffer() 83 | 84 | print_interval = 20 85 | score = 0.0 86 | optimizer = optim.Adam(q.parameters(), lr=learning_rate) 87 | 88 | for n_epi in range(10000): 89 | epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1% 90 | s = env.reset() 91 | done = False 92 | 93 | while not done: 94 | a = q.sample_action(torch.from_numpy(s).float(), epsilon) 95 | s_prime, r, done, info = env.step(a) 96 | done_mask = 0.0 if done else 1.0 97 | memory.put((s,a,r/100.0,s_prime, done_mask)) 98 | s = s_prime 99 | 100 | score += r 101 | if done: 102 | break 103 | 104 | if memory.size()>2000: 105 | train(q, q_target, memory, optimizer) 106 | 107 | if n_epi%print_interval==0 and n_epi!=0: 108 | q_target.load_state_dict(q.state_dict()) 109 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 110 | n_epi, score/print_interval, memory.size(), epsilon*100)) 111 | score = 0.0 112 | env.close() 113 | 114 | if __name__ == '__main__': 115 | main() -------------------------------------------------------------------------------- /ppo.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | #Hyperparameters 9 | learning_rate = 0.0005 10 | gamma = 0.98 11 | lmbda = 0.95 12 
| eps_clip = 0.1 13 | K_epoch = 3 14 | T_horizon = 20 15 | 16 | class PPO(nn.Module): 17 | def __init__(self): 18 | super(PPO, self).__init__() 19 | self.data = [] 20 | 21 | self.fc1 = nn.Linear(4,256) 22 | self.fc_pi = nn.Linear(256,2) 23 | self.fc_v = nn.Linear(256,1) 24 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 25 | 26 | def pi(self, x, softmax_dim = 0): 27 | x = F.relu(self.fc1(x)) 28 | x = self.fc_pi(x) 29 | prob = F.softmax(x, dim=softmax_dim) 30 | return prob 31 | 32 | def v(self, x): 33 | x = F.relu(self.fc1(x)) 34 | v = self.fc_v(x) 35 | return v 36 | 37 | def put_data(self, transition): 38 | self.data.append(transition) 39 | 40 | def make_batch(self): 41 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] 42 | for transition in self.data: 43 | s, a, r, s_prime, prob_a, done = transition 44 | 45 | s_lst.append(s) 46 | a_lst.append([a]) 47 | r_lst.append([r]) 48 | s_prime_lst.append(s_prime) 49 | prob_a_lst.append([prob_a]) 50 | done_mask = 0 if done else 1 51 | done_lst.append([done_mask]) 52 | 53 | s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 54 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 55 | torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) 56 | self.data = [] 57 | return s, a, r, s_prime, done_mask, prob_a 58 | 59 | def train_net(self): 60 | s, a, r, s_prime, done_mask, prob_a = self.make_batch() 61 | 62 | for i in range(K_epoch): 63 | td_target = r + gamma * self.v(s_prime) * done_mask 64 | delta = td_target - self.v(s) 65 | delta = delta.detach().numpy() 66 | 67 | advantage_lst = [] 68 | advantage = 0.0 69 | for delta_t in delta[::-1]: 70 | advantage = gamma * lmbda * advantage + delta_t[0] 71 | advantage_lst.append([advantage]) 72 | advantage_lst.reverse() 73 | advantage = torch.tensor(advantage_lst, dtype=torch.float) 74 | 75 | pi = self.pi(s, softmax_dim=1) 76 | pi_a = pi.gather(1,a) 77 | ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) 78 | 79 | surr1 = ratio * advantage 80 | surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage 81 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach()) 82 | 83 | self.optimizer.zero_grad() 84 | loss.mean().backward() 85 | self.optimizer.step() 86 | 87 | def main(): 88 | env = gym.make('CartPole-v1') 89 | model = PPO() 90 | score = 0.0 91 | print_interval = 20 92 | 93 | for n_epi in range(10000): 94 | s = env.reset() 95 | done = False 96 | while not done: 97 | for t in range(T_horizon): 98 | prob = model.pi(torch.from_numpy(s).float()) 99 | m = Categorical(prob) 100 | a = m.sample().item() 101 | s_prime, r, done, info = env.step(a) 102 | 103 | model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done)) 104 | s = s_prime 105 | 106 | score += r 107 | if done: 108 | break 109 | 110 | model.train_net() 111 | 112 | if n_epi%print_interval==0 and n_epi!=0: 113 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 114 | score = 0.0 115 | 116 | env.close() 117 | 118 | if __name__ == '__main__': 119 | main() -------------------------------------------------------------------------------- /a3c.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | # import torch.multiprocessing as mp 8 | 
import multiprocessing as mp 9 | import time 10 | 11 | # Hyperparameters 12 | n_train_processes = 5 13 | learning_rate = 0.0002 14 | update_interval = 5 15 | gamma = 0.98 16 | max_train_ep = 300 17 | max_test_ep = 400 18 | 19 | 20 | class ActorCritic(nn.Module): 21 | def __init__(self): 22 | super(ActorCritic, self).__init__() 23 | self.fc1 = nn.Linear(4, 256) 24 | self.fc_pi = nn.Linear(256, 2) 25 | self.fc_v = nn.Linear(256, 1) 26 | 27 | def pi(self, x, softmax_dim=0): 28 | x = F.relu(self.fc1(x)) 29 | x = self.fc_pi(x) 30 | prob = F.softmax(x, dim=softmax_dim) 31 | return prob 32 | 33 | def v(self, x): 34 | x = F.relu(self.fc1(x)) 35 | v = self.fc_v(x) 36 | return v 37 | 38 | 39 | def train(global_model, rank): 40 | local_model = ActorCritic() 41 | local_model.load_state_dict(global_model.state_dict()) 42 | 43 | optimizer = optim.Adam(global_model.parameters(), lr=learning_rate) 44 | 45 | env = gym.make('CartPole-v1') 46 | 47 | for n_epi in range(max_train_ep): 48 | done = False 49 | s = env.reset() 50 | while not done: 51 | s_lst, a_lst, r_lst = [], [], [] 52 | for t in range(update_interval): 53 | prob = local_model.pi(torch.from_numpy(s).float()) 54 | m = Categorical(prob) 55 | a = m.sample().item() 56 | s_prime, r, done, info = env.step(a) 57 | 58 | s_lst.append(s) 59 | a_lst.append([a]) 60 | r_lst.append(r/100.0) 61 | 62 | s = s_prime 63 | if done: 64 | break 65 | 66 | s_final = torch.tensor(s_prime, dtype=torch.float) 67 | R = 0.0 if done else local_model.v(s_final).item() 68 | td_target_lst = [] 69 | for reward in r_lst[::-1]: 70 | R = gamma * R + reward 71 | td_target_lst.append([R]) 72 | td_target_lst.reverse() 73 | 74 | s_batch, a_batch, td_target = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 75 | torch.tensor(td_target_lst) 76 | advantage = td_target - local_model.v(s_batch) 77 | 78 | pi = local_model.pi(s_batch, softmax_dim=1) 79 | pi_a = pi.gather(1, a_batch) 80 | loss = -torch.log(pi_a) * advantage.detach() + \ 81 | F.smooth_l1_loss(local_model.v(s_batch), td_target.detach()) 82 | 83 | optimizer.zero_grad() 84 | loss.mean().backward() 85 | for global_param, local_param in zip(global_model.parameters(), local_model.parameters()): 86 | global_param._grad = local_param.grad 87 | optimizer.step() 88 | local_model.load_state_dict(global_model.state_dict()) 89 | 90 | env.close() 91 | print("Training process {} reached maximum episode.".format(rank)) 92 | 93 | 94 | def test(global_model): 95 | env = gym.make('CartPole-v1') 96 | score = 0.0 97 | print_interval = 20 98 | 99 | for n_epi in range(max_test_ep): 100 | done = False 101 | s = env.reset() 102 | while not done: 103 | prob = global_model.pi(torch.from_numpy(s).float()) 104 | a = Categorical(prob).sample().item() 105 | s_prime, r, done, info = env.step(a) 106 | s = s_prime 107 | score += r 108 | 109 | if n_epi % print_interval == 0 and n_epi != 0: 110 | print("# of episode :{}, avg score : {:.1f}".format( 111 | n_epi, score/print_interval)) 112 | score = 0.0 113 | time.sleep(1) 114 | env.close() 115 | 116 | 117 | if __name__ == '__main__': 118 | global_model = ActorCritic() 119 | global_model.share_memory() 120 | 121 | processes = [] 122 | for rank in range(n_train_processes + 1): # + 1 for test process 123 | if rank == 0: 124 | p = mp.Process(target=test, args=(global_model,)) 125 | else: 126 | p = mp.Process(target=train, args=(global_model, rank,)) 127 | p.start() 128 | processes.append(p) 129 | for p in processes: 130 | p.join() 
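The core A3C mechanic in a3c.py above is the gradient hand-off from each worker's local model to the shared global model: the worker backpropagates through its own copy, assigns the resulting gradients to the shared parameters, steps the shared optimizer, and then re-synchronizes its copy. A minimal sketch of just that step (the helper name is illustrative), assuming `global_model` and `local_model` are two `ActorCritic` instances and `optimizer` wraps `global_model.parameters()`:

```python
def push_gradients_and_sync(global_model, local_model, optimizer):
    # Hand each locally computed gradient to the corresponding shared parameter.
    for global_param, local_param in zip(global_model.parameters(), local_model.parameters()):
        global_param._grad = local_param.grad
    optimizer.step()  # Adam step on the shared (global) parameters
    # Pull the freshly updated weights back into the worker's local copy.
    local_model.load_state_dict(global_model.state_dict())
```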
-------------------------------------------------------------------------------- /ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import collections 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.optim as optim 9 | 10 | #Hyperparameters 11 | lr_mu = 0.0005 12 | lr_q = 0.001 13 | gamma = 0.99 14 | batch_size = 32 15 | buffer_limit = 50000 16 | tau = 0.005 # for target network soft update 17 | 18 | class ReplayBuffer(): 19 | def __init__(self): 20 | self.buffer = collections.deque(maxlen=buffer_limit) 21 | 22 | def put(self, transition): 23 | self.buffer.append(transition) 24 | 25 | def sample(self, n): 26 | mini_batch = random.sample(self.buffer, n) 27 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 28 | 29 | for transition in mini_batch: 30 | s, a, r, s_prime, done_mask = transition 31 | s_lst.append(s) 32 | a_lst.append([a]) 33 | r_lst.append([r]) 34 | s_prime_lst.append(s_prime) 35 | done_mask_lst.append([done_mask]) 36 | 37 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 38 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 39 | torch.tensor(done_mask_lst) 40 | 41 | def size(self): 42 | return len(self.buffer) 43 | 44 | class MuNet(nn.Module): 45 | def __init__(self): 46 | super(MuNet, self).__init__() 47 | self.fc1 = nn.Linear(3, 128) 48 | self.fc2 = nn.Linear(128, 64) 49 | self.fc_mu = nn.Linear(64, 1) 50 | 51 | def forward(self, x): 52 | x = F.relu(self.fc1(x)) 53 | x = F.relu(self.fc2(x)) 54 | mu = torch.tanh(self.fc_mu(x))*2 # Multipled by 2 because the action space of the Pendulum-v0 is [-2,2] 55 | return mu 56 | 57 | class QNet(nn.Module): 58 | def __init__(self): 59 | super(QNet, self).__init__() 60 | 61 | self.fc_s = nn.Linear(3, 64) 62 | self.fc_a = nn.Linear(1,64) 63 | self.fc_q = nn.Linear(128, 32) 64 | self.fc_3 = nn.Linear(32,1) 65 | 66 | def forward(self, x, a): 67 | h1 = F.relu(self.fc_s(x)) 68 | h2 = F.relu(self.fc_a(a)) 69 | cat = torch.cat([h1,h2], dim=1) 70 | q = F.relu(self.fc_q(cat)) 71 | q = self.fc_3(q) 72 | return q 73 | 74 | class OrnsteinUhlenbeckNoise: 75 | def __init__(self, mu): 76 | self.theta, self.dt, self.sigma = 0.1, 0.01, 0.1 77 | self.mu = mu 78 | self.x_prev = np.zeros_like(self.mu) 79 | 80 | def __call__(self): 81 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \ 82 | self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 83 | self.x_prev = x 84 | return x 85 | 86 | def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer): 87 | s,a,r,s_prime,done_mask = memory.sample(batch_size) 88 | 89 | target = r + gamma * q_target(s_prime, mu_target(s_prime)) 90 | q_loss = F.smooth_l1_loss(q(s,a), target.detach()) 91 | q_optimizer.zero_grad() 92 | q_loss.backward() 93 | q_optimizer.step() 94 | 95 | mu_loss = -q(s,mu(s)).mean() # That's all for the policy loss. 
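# This is the deterministic policy gradient (DPG) actor update: maximizing Q(s, mu(s))
# by minimizing its negative mean. The gradient flows through the critic q into mu's
# parameters, but only the actor is stepped here, because mu_optimizer was constructed
# from mu.parameters() alone; the critic is updated separately above with q_optimizer.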
96 | mu_optimizer.zero_grad() 97 | mu_loss.backward() 98 | mu_optimizer.step() 99 | 100 | def soft_update(net, net_target): 101 | for param_target, param in zip(net_target.parameters(), net.parameters()): 102 | param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau) 103 | 104 | def main(): 105 | env = gym.make('Pendulum-v0') 106 | memory = ReplayBuffer() 107 | 108 | q, q_target = QNet(), QNet() 109 | q_target.load_state_dict(q.state_dict()) 110 | mu, mu_target = MuNet(), MuNet() 111 | mu_target.load_state_dict(mu.state_dict()) 112 | 113 | score = 0.0 114 | print_interval = 20 115 | 116 | mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu) 117 | q_optimizer = optim.Adam(q.parameters(), lr=lr_q) 118 | ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1)) 119 | 120 | for n_epi in range(10000): 121 | s = env.reset() 122 | 123 | for t in range(300): # maximum length of episode is 200 for Pendulum-v0 124 | a = mu(torch.from_numpy(s).float()) 125 | a = a.item() + ou_noise()[0] 126 | s_prime, r, done, info = env.step([a]) 127 | memory.put((s,a,r/100.0,s_prime,done)) 128 | score +=r 129 | s = s_prime 130 | 131 | if done: 132 | break 133 | 134 | if memory.size()>2000: 135 | for i in range(10): 136 | train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer) 137 | soft_update(mu, mu_target) 138 | soft_update(q, q_target) 139 | 140 | if n_epi%print_interval==0 and n_epi!=0: 141 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 142 | score = 0.0 143 | 144 | env.close() 145 | 146 | if __name__ == '__main__': 147 | main() -------------------------------------------------------------------------------- /ppo-lstm.py: -------------------------------------------------------------------------------- 1 | #PPO-LSTM 2 | import gym 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.distributions import Categorical 8 | import time 9 | import numpy as np 10 | 11 | #Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | lmbda = 0.95 15 | eps_clip = 0.1 16 | K_epoch = 2 17 | T_horizon = 20 18 | 19 | class PPO(nn.Module): 20 | def __init__(self): 21 | super(PPO, self).__init__() 22 | self.data = [] 23 | 24 | self.fc1 = nn.Linear(4,64) 25 | self.lstm = nn.LSTM(64,32) 26 | self.fc_pi = nn.Linear(32,2) 27 | self.fc_v = nn.Linear(32,1) 28 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 29 | 30 | def pi(self, x, hidden): 31 | x = F.relu(self.fc1(x)) 32 | x = x.view(-1, 1, 64) 33 | x, lstm_hidden = self.lstm(x, hidden) 34 | x = self.fc_pi(x) 35 | prob = F.softmax(x, dim=2) 36 | return prob, lstm_hidden 37 | 38 | def v(self, x, hidden): 39 | x = F.relu(self.fc1(x)) 40 | x = x.view(-1, 1, 64) 41 | x, lstm_hidden = self.lstm(x, hidden) 42 | v = self.fc_v(x) 43 | return v 44 | 45 | def put_data(self, transition): 46 | self.data.append(transition) 47 | 48 | def make_batch(self): 49 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, h_in_lst, h_out_lst, done_lst = [], [], [], [], [], [], [], [] 50 | for transition in self.data: 51 | s, a, r, s_prime, prob_a, h_in, h_out, done = transition 52 | 53 | s_lst.append(s) 54 | a_lst.append([a]) 55 | r_lst.append([r]) 56 | s_prime_lst.append(s_prime) 57 | prob_a_lst.append([prob_a]) 58 | h_in_lst.append(h_in) 59 | h_out_lst.append(h_out) 60 | done_mask = 0 if done else 1 61 | done_lst.append([done_mask]) 62 | 63 | s,a,r,s_prime,done_mask,prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 64 | 
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 65 | torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) 66 | self.data = [] 67 | return s,a,r,s_prime, done_mask, prob_a, h_in_lst[0], h_out_lst[0] 68 | 69 | def train_net(self): 70 | s,a,r,s_prime,done_mask, prob_a, (h1_in, h2_in), (h1_out, h2_out) = self.make_batch() 71 | first_hidden = (h1_in.detach(), h2_in.detach()) 72 | second_hidden = (h1_out.detach(), h2_out.detach()) 73 | 74 | for i in range(K_epoch): 75 | v_prime = self.v(s_prime, second_hidden).squeeze(1) 76 | td_target = r + gamma * v_prime * done_mask 77 | v_s = self.v(s, first_hidden).squeeze(1) 78 | delta = td_target - v_s 79 | delta = delta.detach().numpy() 80 | 81 | advantage_lst = [] 82 | advantage = 0.0 83 | for item in delta[::-1]: 84 | advantage = gamma * lmbda * advantage + item[0] 85 | advantage_lst.append([advantage]) 86 | advantage_lst.reverse() 87 | advantage = torch.tensor(advantage_lst, dtype=torch.float) 88 | 89 | pi, _ = self.pi(s, first_hidden) 90 | pi_a = pi.squeeze(1).gather(1,a) 91 | ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) 92 | 93 | surr1 = ratio * advantage 94 | surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage 95 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(v_s, td_target.detach()) 96 | 97 | self.optimizer.zero_grad() 98 | loss.mean().backward(retain_graph=True) 99 | self.optimizer.step() 100 | 101 | def main(): 102 | env = gym.make('CartPole-v1') 103 | model = PPO() 104 | score = 0.0 105 | print_interval = 20 106 | 107 | for n_epi in range(10000): 108 | h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float)) 109 | s = env.reset() 110 | done = False 111 | 112 | while not done: 113 | for t in range(T_horizon): 114 | h_in = h_out 115 | prob, h_out = model.pi(torch.from_numpy(s).float(), h_in) 116 | prob = prob.view(-1) 117 | m = Categorical(prob) 118 | a = m.sample().item() 119 | s_prime, r, done, info = env.step(a) 120 | 121 | model.put_data((s, a, r/100.0, s_prime, prob[a].item(), h_in, h_out, done)) 122 | s = s_prime 123 | 124 | score += r 125 | if done: 126 | break 127 | 128 | model.train_net() 129 | 130 | if n_epi%print_interval==0 and n_epi!=0: 131 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval)) 132 | score = 0.0 133 | 134 | env.close() 135 | 136 | if __name__ == '__main__': 137 | main() -------------------------------------------------------------------------------- /deep_sea_treasure_env/deep_sea_treasure_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.spaces import Discrete 3 | from gym.spaces import Box 4 | import numpy as np 5 | 6 | class DeepSeaTreasureEnv(gym.Env): 7 | ''' 8 | This is an implementation of a standard multi-objective optimization problem called 9 | Deep Sea Treasure (Vamplew et al., 2011). 10 | In this environment there are two goals: 1) collect as much treasure as possible and 2) take as little 11 | time as possible. There is no single optimal policy that solves both, but a collection of policies which are 12 | equally (Pareto) optimal. 13 | ''' 14 | def __init__(self, max_steps=30): 15 | 16 | #limit on time spent seeking treasure 17 | self.max_steps = max_steps 18 | 19 | self.scale_time = 0.01 20 | self.scale_treasure = 1.
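# Note: scale_time and scale_treasure are only referenced by the commented-out
# scalarized reward in step(); the active reward returned by step() is the tuple
# (-time_penalty, treasure_value), leaving the scalarization to the agent.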
21 | 22 | #Grid world 23 | # values: 24 | # -- 0 = sea 25 | # -- -1 = sea floor 26 | # -- > 0 = Treasure 27 | self.grid = np.zeros((10,11), dtype=int) 28 | # sea floor 29 | self.grid[0, 2] = -1 30 | self.grid[0:1, 3] = -1 31 | self.grid[0:2, 4] = -1 32 | self.grid[0:5, 5] = -1 33 | self.grid[0:5, 6] = -1 34 | self.grid[0:5, 7] = -1 35 | self.grid[0:7, 8] = -1 36 | self.grid[0:7, 9] = -1 37 | self.grid[0:8, 10] = -1 38 | #treasure 39 | self.grid[0, 1] = 1 40 | self.grid[1, 2] = 2 41 | self.grid[2, 3] = 3 42 | self.grid[3, 4] = 5 43 | self.grid[4, 4] = 8 44 | self.grid[5, 4] = 16 45 | self.grid[6, 7] = 24 46 | self.grid[7, 7] = 50 47 | self.grid[8, 8] = 74 48 | self.grid[9, 10] = 124 49 | 50 | self.position = 0 51 | self.steps_taken = 0 52 | self.treasure_value = 0 53 | self.time_penalty = 0 54 | 55 | #actions: 56 | # 0 - Up 57 | # 1 - Down 58 | # 2 - Left 59 | # 3 - Right 60 | self.action_space = Discrete(4) 61 | # The agent observes: 62 | # - It's own position 63 | # - The treasure value 64 | # - The time penalty 65 | self.observation_space = Box(0,200, (3,)) 66 | 67 | def move(self, action): 68 | row = self.position // 10 69 | col = self.position % 10 70 | 71 | if action == 0: # Up 72 | if row > 0: 73 | row -= 1 74 | if action == 1: # Down 75 | if row < 10: 76 | row += 1 77 | if action == 2: # Left 78 | if col > 0: 79 | col -= 1 80 | if action == 3: # Right 81 | if col < 9: 82 | col += 1 83 | 84 | #account for time spent (even doing illegal moves) 85 | if self.grid[col, row] == -1: 86 | self.steps_taken += 1 87 | self.time_penalty += 1 88 | return 89 | else: 90 | self.position = row * 10 + col 91 | self.steps_taken += 1 92 | self.time_penalty += 1 93 | self.treasure_value = self.grid[self.position % 10, self.position // 10] 94 | 95 | 96 | def close(self): 97 | return 98 | 99 | def reset(self): 100 | self.position = 0 101 | self.steps_taken = 0 102 | self.time_penalty = 0 103 | self.treasure_value = 0 104 | obs = np.array([self.position, self.time_penalty, self.treasure_value]) 105 | return obs 106 | 107 | def step(self, action): 108 | """Run one timestep of the environment's dynamics. When end of 109 | episode is reached, you are responsible for calling `reset()` 110 | to reset this environment's state. 111 | Accepts an action and returns a tuple (observation, reward, done, info). 
112 | Args: 113 | action (object): an action provided by the agent 114 | Returns: 115 | observation (object): agent's observation of the current environment 116 | reward (float) : amount of reward returned after previous action 117 | done (bool): whether the episode has ended, in which case further step() calls will return undefined results 118 | info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning) 119 | """ 120 | obs = np.zeros(1, dtype=int) 121 | reward = 0 122 | done = False 123 | info = {} 124 | 125 | self.move(action) 126 | 127 | obs = np.array([self.position, self.time_penalty, self.treasure_value]) 128 | 129 | reward = -self.time_penalty, self.treasure_value 130 | 131 | # reset time penalty after treasure find 132 | if (self.treasure_value > 0): 133 | self.time_penalty = 0 134 | # reward = self.scale_treasure * self.treasure_value - self.scale_time * self.time_spent 135 | 136 | if (self.steps_taken >= self.max_steps): 137 | done = True 138 | 139 | if(self.treasure_value > 0): 140 | done = True 141 | # reward = - self.scale_time * self.time_spent 142 | 143 | # reward = -self.time_spent + self.treasure_value 144 | # done = (self.treasure_value > 0) or (self.time_spent == self.max_steps) 145 | 146 | return obs, reward, done, info -------------------------------------------------------------------------------- /acer.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import collections 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.distributions import Categorical 9 | 10 | # Characteristics 11 | # 1. Discrete action space, single thread version. 12 | # 2. Does not support trust-region updates. 
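# Note on training: the replay buffer stores whole rollout sequences. After the
# warm-up threshold, each iteration runs one on-policy update on the most recent
# rollout and one off-policy update on a randomly sampled mini-batch, using
# importance weights rho = pi/behavior_prob truncated at c, a bias-correction
# term for the truncated mass, and a Retrace-style Q_ret target.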
13 | 14 | #Hyperparameters 15 | learning_rate = 0.0002 16 | gamma = 0.98 17 | buffer_limit = 6000 18 | rollout_len = 10 19 | batch_size = 4 # Indicates 4 sequences per mini-batch (4*rollout_len = 40 samples total) 20 | c = 1.0 # For truncating importance sampling ratio 21 | 22 | class ReplayBuffer(): 23 | def __init__(self): 24 | self.buffer = collections.deque(maxlen=buffer_limit) 25 | 26 | def put(self, seq_data): 27 | self.buffer.append(seq_data) 28 | 29 | def sample(self, on_policy=False): 30 | if on_policy: 31 | mini_batch = [self.buffer[-1]] 32 | else: 33 | mini_batch = random.sample(self.buffer, batch_size) 34 | 35 | s_lst, a_lst, r_lst, prob_lst, done_lst, is_first_lst = [], [], [], [], [], [] 36 | for seq in mini_batch: 37 | is_first = True # Flag for indicating whether the transition is the first item from a sequence 38 | for transition in seq: 39 | s, a, r, prob, done = transition 40 | 41 | s_lst.append(s) 42 | a_lst.append([a]) 43 | r_lst.append(r) 44 | prob_lst.append(prob) 45 | done_mask = 0.0 if done else 1.0 46 | done_lst.append(done_mask) 47 | is_first_lst.append(is_first) 48 | is_first = False 49 | 50 | s,a,r,prob,done_mask,is_first = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 51 | r_lst, torch.tensor(prob_lst, dtype=torch.float), done_lst, \ 52 | is_first_lst 53 | return s,a,r,prob,done_mask,is_first 54 | 55 | def size(self): 56 | return len(self.buffer) 57 | 58 | class ActorCritic(nn.Module): 59 | def __init__(self): 60 | super(ActorCritic, self).__init__() 61 | self.fc1 = nn.Linear(4,256) 62 | self.fc_pi = nn.Linear(256,2) 63 | self.fc_q = nn.Linear(256,2) 64 | 65 | def pi(self, x, softmax_dim = 0): 66 | x = F.relu(self.fc1(x)) 67 | x = self.fc_pi(x) 68 | pi = F.softmax(x, dim=softmax_dim) 69 | return pi 70 | 71 | def q(self, x): 72 | x = F.relu(self.fc1(x)) 73 | q = self.fc_q(x) 74 | return q 75 | 76 | def train(model, optimizer, memory, on_policy=False): 77 | s,a,r,prob,done_mask,is_first = memory.sample(on_policy) 78 | 79 | q = model.q(s) 80 | q_a = q.gather(1,a) 81 | pi = model.pi(s, softmax_dim = 1) 82 | pi_a = pi.gather(1,a) 83 | v = (q * pi).sum(1).unsqueeze(1).detach() 84 | 85 | rho = pi.detach()/prob 86 | rho_a = rho.gather(1,a) 87 | rho_bar = rho_a.clamp(max=c) 88 | correction_coeff = (1-c/rho).clamp(min=0) 89 | 90 | q_ret = v[-1] * done_mask[-1] 91 | q_ret_lst = [] 92 | for i in reversed(range(len(r))): 93 | q_ret = r[i] + gamma * q_ret 94 | q_ret_lst.append(q_ret.item()) 95 | q_ret = rho_bar[i] * (q_ret - q_a[i]) + v[i] 96 | 97 | if is_first[i] and i!=0: 98 | q_ret = v[i-1] * done_mask[i-1] # When a new sequence begins, q_ret is initialized 99 | 100 | q_ret_lst.reverse() 101 | q_ret = torch.tensor(q_ret_lst, dtype=torch.float).unsqueeze(1) 102 | 103 | loss1 = -rho_bar * torch.log(pi_a) * (q_ret - v) 104 | loss2 = -correction_coeff * pi.detach() * torch.log(pi) * (q.detach()-v) # bias correction term 105 | loss = loss1 + loss2.sum(1) + F.smooth_l1_loss(q_a, q_ret) 106 | 107 | optimizer.zero_grad() 108 | loss.mean().backward() 109 | optimizer.step() 110 | 111 | def main(): 112 | env = gym.make('CartPole-v1') 113 | memory = ReplayBuffer() 114 | model = ActorCritic() 115 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 116 | 117 | score = 0.0 118 | print_interval = 20 119 | 120 | for n_epi in range(10000): 121 | s = env.reset() 122 | done = False 123 | 124 | while not done: 125 | seq_data = [] 126 | for t in range(rollout_len): 127 | prob = model.pi(torch.from_numpy(s).float()) 128 | a = Categorical(prob).sample().item() 129 | 
s_prime, r, done, info = env.step(a) 130 | seq_data.append((s, a, r/100.0, prob.detach().numpy(), done)) 131 | 132 | score +=r 133 | s = s_prime 134 | if done: 135 | break 136 | 137 | memory.put(seq_data) 138 | if memory.size()>500: 139 | train(model, optimizer, memory, on_policy=True) 140 | train(model, optimizer, memory) 141 | 142 | if n_epi%print_interval==0 and n_epi!=0: 143 | print("# of episode :{}, avg score : {:.1f}, buffer size : {}".format(n_epi, score/print_interval, memory.size())) 144 | score = 0.0 145 | 146 | env.close() 147 | 148 | if __name__ == '__main__': 149 | main() -------------------------------------------------------------------------------- /a2c.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import time 9 | import numpy as np 10 | from deep_sea_treasure_env.deep_sea_treasure_env import DeepSeaTreasureEnv 11 | 12 | # Hyperparameters 13 | n_train_processes = 3 14 | learning_rate = 0.0002 15 | update_interval = 5 16 | gamma = 0.98 17 | max_train_steps = 60000 18 | PRINT_INTERVAL = update_interval * 100 19 | 20 | class ActorCritic(nn.Module): 21 | def __init__(self): 22 | super(ActorCritic, self).__init__() 23 | self.fc1 = nn.Linear(3, 256) 24 | self.fc_pi = nn.Linear(256, 4) 25 | self.fc_v = nn.Linear(256, 1) 26 | 27 | def pi(self, x, softmax_dim=1): 28 | x = F.relu(self.fc1(x)) 29 | x = self.fc_pi(x) 30 | prob = F.softmax(x, dim=softmax_dim) 31 | return prob 32 | 33 | def v(self, x): 34 | x = F.relu(self.fc1(x)) 35 | v = self.fc_v(x) 36 | return v 37 | 38 | def worker(worker_id, master_end, worker_end): 39 | master_end.close() # Forbid worker to use the master end for messaging 40 | #env = gym.make('CartPole-v1') 41 | env = DeepSeaTreasureEnv() 42 | env.seed(worker_id) 43 | 44 | while True: 45 | cmd, data = worker_end.recv() 46 | if cmd == 'step': 47 | ob, reward, done, info = env.step(data) 48 | if done: 49 | ob = env.reset() 50 | worker_end.send((ob, reward, done, info)) 51 | elif cmd == 'reset': 52 | ob = env.reset() 53 | worker_end.send(ob) 54 | elif cmd == 'reset_task': 55 | ob = env.reset_task() 56 | worker_end.send(ob) 57 | elif cmd == 'close': 58 | worker_end.close() 59 | break 60 | elif cmd == 'get_spaces': 61 | worker_end.send((env.observation_space, env.action_space)) 62 | else: 63 | raise NotImplementedError 64 | 65 | class ParallelEnv: 66 | def __init__(self, n_train_processes): 67 | self.nenvs = n_train_processes 68 | self.waiting = False 69 | self.closed = False 70 | self.workers = list() 71 | 72 | master_ends, worker_ends = zip(*[mp.Pipe() for _ in range(self.nenvs)]) 73 | self.master_ends, self.worker_ends = master_ends, worker_ends 74 | 75 | for worker_id, (master_end, worker_end) in enumerate(zip(master_ends, worker_ends)): 76 | p = mp.Process(target=worker, 77 | args=(worker_id, master_end, worker_end)) 78 | p.daemon = True 79 | p.start() 80 | self.workers.append(p) 81 | 82 | # Forbid master to use the worker end for messaging 83 | for worker_end in worker_ends: 84 | worker_end.close() 85 | 86 | def step_async(self, actions): 87 | for master_end, action in zip(self.master_ends, actions): 88 | master_end.send(('step', action)) 89 | self.waiting = True 90 | 91 | def step_wait(self): 92 | results = [master_end.recv() for master_end in self.master_ends] 93 | self.waiting = False 94 | obs, rews, dones, 
infos = zip(*results) 95 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 96 | 97 | def reset(self): 98 | for master_end in self.master_ends: 99 | master_end.send(('reset', None)) 100 | return np.stack([master_end.recv() for master_end in self.master_ends]) 101 | 102 | def step(self, actions): 103 | self.step_async(actions) 104 | return self.step_wait() 105 | 106 | def close(self): # For clean up resources 107 | if self.closed: 108 | return 109 | if self.waiting: 110 | [master_end.recv() for master_end in self.master_ends] 111 | for master_end in self.master_ends: 112 | master_end.send(('close', None)) 113 | for worker in self.workers: 114 | worker.join() 115 | self.closed = True 116 | 117 | def test(step_idx, model): 118 | #env = gym.make('CartPole-v1') 119 | env = DeepSeaTreasureEnv() 120 | score = 0.0 121 | done = False 122 | num_test = 10 123 | 124 | for _ in range(num_test): 125 | s = env.reset() 126 | while not done: 127 | prob = model.pi(torch.from_numpy(s).float(), softmax_dim=0) 128 | a = Categorical(prob).sample().numpy() 129 | s_prime, r, done, info = env.step(a) 130 | s = s_prime 131 | score += r 132 | done = False 133 | print(f"Step # :{step_idx}, avg score : {score/num_test:.1f}") 134 | 135 | env.close() 136 | 137 | def compute_target(v_final, r_lst, mask_lst): 138 | G = v_final.reshape(-1) 139 | td_target = list() 140 | 141 | for r, mask in zip(r_lst[::-1], mask_lst[::-1]): 142 | G = r + gamma * G * mask 143 | td_target.append(G) 144 | 145 | return torch.tensor(td_target[::-1]).float() 146 | 147 | if __name__ == '__main__': 148 | envs = ParallelEnv(n_train_processes) 149 | 150 | model = ActorCritic() 151 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 152 | 153 | step_idx = 0 154 | s = envs.reset() 155 | while step_idx < max_train_steps: 156 | s_lst, a_lst, r_lst, mask_lst = list(), list(), list(), list() 157 | for _ in range(update_interval): 158 | prob = model.pi(torch.from_numpy(s).float()) 159 | a = Categorical(prob).sample().numpy() 160 | s_prime, r, done, info = envs.step(a) 161 | 162 | s_lst.append(s) 163 | a_lst.append(a) 164 | r_lst.append(r/100.0) 165 | mask_lst.append(1 - done) 166 | 167 | s = s_prime 168 | step_idx += 1 169 | 170 | s_final = torch.from_numpy(s_prime).float() 171 | v_final = model.v(s_final).detach().clone().numpy() 172 | td_target = compute_target(v_final, r_lst, mask_lst) 173 | 174 | td_target_vec = td_target.reshape(-1) 175 | s_vec = torch.tensor(s_lst).float().reshape(-1, 3) # 4 == Dimension of state 176 | a_vec = torch.tensor(a_lst).reshape(-1).unsqueeze(1) 177 | advantage = td_target_vec - model.v(s_vec).reshape(-1) 178 | 179 | pi = model.pi(s_vec, softmax_dim=1) 180 | pi_a = pi.gather(1, a_vec).reshape(-1) 181 | loss = -(torch.log(pi_a) * advantage.detach()).mean() +\ 182 | F.smooth_l1_loss(model.v(s_vec).reshape(-1), td_target_vec) 183 | 184 | optimizer.zero_grad() 185 | loss.backward() 186 | optimizer.step() 187 | 188 | if step_idx % PRINT_INTERVAL == 0: 189 | test(step_idx, model) 190 | 191 | envs.close() -------------------------------------------------------------------------------- /a3c_dst.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | # import torch.multiprocessing as mp 4 | import queue 5 | import datetime 6 | import multiprocessing as mp 7 | import time 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.distributions 
import Categorical 15 | from deap.tools.indicator import hv 16 | from torch.utils.tensorboard import SummaryWriter 17 | from deep_sea_treasure_env.deep_sea_treasure_env import DeepSeaTreasureEnv 18 | 19 | # Hyperparameters 20 | n_train_processes = 5 # Number of workers 21 | learning_rate = 0.00002 # LR of 0.00002 works well with deep sea treasure env. 22 | update_interval = 5 # nr of steps before actor critic network update 23 | gamma = 0.98 # discount factor 24 | max_train_ep = 1000 # maximum episodes for training 25 | max_test_ep = 1000 # maximum episodes for testing/logging 26 | goal_size = 10 # weight range for agents 27 | goal_partition = 5 # step size for weight range 28 | log_interval = 1000 # interval for log messages 29 | queue_read_interval = 3000 # max items read from queue 30 | run_timestamp = datetime.datetime.now().ctime().replace(" ", "_") 31 | 32 | 33 | class ActorCritic(nn.Module): 34 | def __init__(self): 35 | super(ActorCritic, self).__init__() 36 | self.fc1 = nn.Linear(3, 256) 37 | self.fc_pi = nn.Linear(256, 4) 38 | self.fc_v = nn.Linear(256, 1) 39 | 40 | def pi(self, x, softmax_dim=0): 41 | x = F.relu(self.fc1(x)) 42 | x = self.fc_pi(x) 43 | prob = F.softmax(x, dim=softmax_dim) 44 | return prob 45 | 46 | def v(self, x): 47 | x = F.relu(self.fc1(x)) 48 | v = self.fc_v(x) 49 | return v 50 | 51 | 52 | def train(rank, weights, data_pool ): 53 | print(f'agent_{rank} starting...', flush=True) 54 | local_model = ActorCritic() 55 | 56 | optimizer = optim.Adam(local_model.parameters(), lr=learning_rate) 57 | 58 | env = DeepSeaTreasureEnv() 59 | 60 | for n_epi in range(max_train_ep): 61 | if n_epi % log_interval == 0: 62 | print(f'agent {rank} starting epoch {n_epi}', flush=True) 63 | epoch_reward1 = [] 64 | epoch_reward2 = [] 65 | epoch_loss = [] 66 | epoch_pi = [] 67 | epoch_v = [] 68 | epoch_advantage = [] 69 | done = False 70 | s = env.reset() 71 | while not done: 72 | s_lst, a_lst, r_lst = [], [], [] 73 | s_prime = None 74 | for t in range(update_interval): 75 | prob = local_model.pi(torch.from_numpy(s).float()) 76 | m = Categorical(prob) 77 | a = m.sample().item() 78 | s_prime, r, done, info = env.step(a) 79 | 80 | s_lst.append(s) 81 | a_lst.append([a]) 82 | r_lst.append(r) 83 | 84 | s = s_prime 85 | if done: 86 | break 87 | 88 | s_final = torch.tensor(s_prime, dtype=torch.float) 89 | R = 0.0 if done else local_model.v(s_final).item() 90 | td_target_lst = [] 91 | for reward in r_lst[::-1]: 92 | R = gamma * R + weights[0] * reward[0] + weights[1] * reward[1] 93 | td_target_lst.append([np.double(R)]) 94 | td_target_lst.reverse() 95 | 96 | s_batch, a_batch, td_target = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 97 | torch.tensor(td_target_lst) 98 | advantage = td_target - local_model.v(s_batch) 99 | 100 | pi = local_model.pi(s_batch, softmax_dim=1) 101 | pi_a = pi.gather(1, a_batch) 102 | loss = -torch.log(pi_a) * advantage.float().detach() + \ 103 | F.smooth_l1_loss(local_model.v(s_batch), td_target.float().detach()) 104 | 105 | optimizer.zero_grad() 106 | loss.mean().backward() 107 | optimizer.step() 108 | 109 | avg_reward_1 = sum([reward[0] for reward in r_lst]) / len(r_lst) 110 | avg_reward_2 = sum([reward[1] for reward in r_lst]) / len(r_lst) 111 | epoch_reward1.append(avg_reward_1) 112 | epoch_reward2.append(avg_reward_2) 113 | epoch_v.append(local_model.v(s_batch).detach().mean()) 114 | epoch_pi.append(pi.argmax(dim=1).median()) 115 | epoch_advantage.append(advantage.detach().mean()) 116 | epoch_loss.append(loss.detach().mean()) 117 | 118 | sent = 
False 119 | while not sent: 120 | try: 121 | data_pool.put_nowait((n_epi, rank, sum(epoch_loss) / len(epoch_loss), 122 | sum(epoch_pi) / len(epoch_pi), 123 | sum(epoch_advantage) / len(epoch_advantage), 124 | sum(epoch_reward1) / len(epoch_reward1), 125 | sum(epoch_reward2) / len(epoch_reward2))) 126 | 127 | sent = True 128 | except queue.Full: 129 | print('queue in the queue, waiting....', flush=True) 130 | time.sleep(0.1) 131 | 132 | time.sleep(0) # Yield remaining time 133 | # if n_epi % 100 == 0: 134 | # time.sleep(1) 135 | 136 | env.close() 137 | 138 | while not data_pool.empty(): 139 | print(f'Agent {rank}: not all data consumed, waiting...', flush=True) 140 | time.sleep(10) 141 | 142 | print("Training process {} reached maximum episode.".format(rank)) 143 | 144 | torch.save(local_model.state_dict(), f'./agents/{run_timestamp}_agent_{rank}.ai') 145 | 146 | def data_complete(epi_list, epoch): 147 | for i in range(n_train_processes): 148 | if epi_list[i, epoch] != epoch: 149 | return False 150 | 151 | return True 152 | 153 | def first_missing(epi_list, epoch): 154 | if epoch >= max_test_ep: 155 | return -1 156 | 157 | for i in range(n_train_processes): 158 | if epi_list[i, epoch] != epoch: 159 | return i 160 | 161 | return -1 162 | 163 | def test(weights, data_pools): 164 | summary_writer = SummaryWriter(filename_suffix=run_timestamp) 165 | 166 | epi_list = np.empty((n_train_processes+1, max_test_ep), dtype=int) 167 | reward_list = np.empty((n_train_processes+1, max_test_ep), dtype=tuple) 168 | loss_list = np.empty((n_train_processes+1, max_test_ep), dtype=float) 169 | pi_list = np.empty((n_train_processes+1, max_test_ep), dtype=float) 170 | advantage_list = np.empty((n_train_processes+1, max_test_ep), dtype=float) 171 | 172 | i_epi = 0 173 | while i_epi < (max_test_ep): 174 | test_iteration = 0 175 | while data_complete(epi_list, i_epi) and test_iteration < queue_read_interval: 176 | # receive rewards 177 | if i_epi % log_interval == 0: 178 | print(f'processing data for epoch {i_epi}', flush=True) 179 | 180 | reward_set = reward_list[:, i_epi] 181 | reward_set = list(filter(None, reward_set)) 182 | if reward_set: 183 | hypervolume = hv.hypervolume(reward_set, [100, 100]) 184 | # if i_epi % log_interval == 0: 185 | print(f'Hypervolume indicator for episode {i_epi}: {hypervolume} for {len(reward_set)} points', flush=True) 186 | summary_writer.add_scalar("hypervolume_indicator", hypervolume, i_epi) 187 | else: 188 | print('reward_set is empty') 189 | 190 | for agent_rank in range(1, n_train_processes + 1): 191 | summary_writer.add_scalar(f'agent_{agent_rank}_weight_1', weights[agent_rank][0], i_epi) 192 | summary_writer.add_scalar(f'agent_{agent_rank}_weight_2', weights[agent_rank][1], i_epi) 193 | 194 | if loss_list[agent_rank][i_epi]: 195 | summary_writer.add_scalar(f'agent_{agent_rank}_loss', loss_list[agent_rank][i_epi], i_epi) 196 | 197 | if pi_list[agent_rank][i_epi]: 198 | summary_writer.add_scalar(f'agent_{agent_rank}_pi', pi_list[agent_rank][i_epi], i_epi) 199 | 200 | if advantage_list[agent_rank][i_epi]: 201 | summary_writer.add_scalar(f'agent_{agent_rank}_advantage', advantage_list[agent_rank][i_epi], i_epi) 202 | 203 | if reward_list[agent_rank][i_epi]: 204 | summary_writer.add_scalar(f'agent_{agent_rank}_reward_1', reward_list[agent_rank][i_epi][0], i_epi) 205 | 206 | if reward_list[agent_rank][i_epi]: 207 | summary_writer.add_scalar(f'agent_{agent_rank}_reward_2', reward_list[agent_rank][i_epi][1], i_epi) 208 | 209 | test_iteration += 1 210 | i_epi += 1 211 | if not i_epi < 
max_test_ep: 212 | break 213 | 214 | print(f'Waiting for epoch {i_epi} to be completed by all workers', flush=True) 215 | print(f'Waiting for worker: {first_missing(loss_list, i_epi)}', flush=True) 216 | 217 | for i in range(n_train_processes): # iterate over worker queues 218 | queue_not_empty = True 219 | 220 | read_counter = 0 221 | if epi_list[i, max_test_ep-1] != (max_test_ep-1): # only read from unfinished workers 222 | while queue_not_empty and read_counter < queue_read_interval: 223 | try: 224 | data = data_pools[i].get_nowait() 225 | 226 | n_epi = data[0] 227 | rank = data[1] 228 | loss = data[2] 229 | pi = data[3] 230 | advantage = data[4] 231 | avg_reward_1 = data[5] 232 | avg_reward_2 = data[6] 233 | 234 | epi_list[rank][n_epi] = n_epi 235 | reward_list[rank][n_epi] = (avg_reward_1, avg_reward_2) 236 | loss_list[rank][n_epi] = loss 237 | pi_list[rank][n_epi] = pi 238 | advantage_list[rank][n_epi] = advantage 239 | 240 | read_counter += 1 241 | except queue.Empty: 242 | queue_not_empty = False 243 | if read_counter > 0: 244 | print(f'read_queue for agent {i}, got {read_counter} datapoints', flush=True) 245 | print(f'last datapoint {data}', flush=True) 246 | if read_counter == queue_read_interval: 247 | print(f'read queue for agent {i}, got {read_counter} datapoints', flush=True) 248 | print(f'last datapoint {data}', flush=True) 249 | 250 | if __name__ == '__main__': 251 | mp.set_start_method('spawn') # Deal with fork issues 252 | try: 253 | os.mkdir('./agents',) 254 | except FileExistsError: 255 | pass 256 | global_model = ActorCritic() 257 | global_model.share_memory() 258 | data_pools = [] 259 | 260 | for i in range(n_train_processes): 261 | data_pools.append(mp.Queue()) 262 | 263 | weights = np.array(list(itertools.product(range(0, goal_size, int(goal_size / goal_partition)), 264 | range(0, goal_size, int(goal_size / goal_partition))))) 265 | 266 | #randomly sample from weightspace 267 | selected_weights = np.random.choice(len(weights), n_train_processes+1, replace=False) 268 | 269 | processes = [] 270 | for rank in range(0, n_train_processes + 1): # + 1 for test process 271 | if rank == 0: 272 | p = mp.Process(target=test, args=(weights[selected_weights], data_pools)) 273 | else: 274 | p = mp.Process(target=train, args=(rank-1, weights[selected_weights][rank-1], data_pools[rank-1])) 275 | p.start() 276 | processes.append(p) 277 | for p in processes: 278 | p.join() 279 | --------------------------------------------------------------------------------
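For reference, the multi-objective piece of `a3c_dst.py` is the per-worker scalarization inside `train()`: each worker collapses the Deep Sea Treasure reward tuple (negative time penalty, treasure value) into a single discounted return using its own weight vector before the usual A3C update. A minimal standalone sketch of that target computation (the helper name and the example rollout/weights are hypothetical):

```python
gamma = 0.98

def scalarized_returns(rewards, weights, bootstrap=0.0):
    """Discounted TD targets for one rollout after collapsing each
    (time_term, treasure_term) reward with a fixed weight vector."""
    R = bootstrap  # 0.0 at episode end, otherwise the critic's value of the last state
    targets = []
    for r in reversed(rewards):
        R = gamma * R + weights[0] * r[0] + weights[1] * r[1]
        targets.append(R)
    targets.reverse()
    return targets

# Hypothetical 3-step rollout of (-time_penalty, treasure_value) pairs, weights (2, 8):
print(scalarized_returns([(-1, 0), (-2, 0), (-3, 1)], weights=(2, 8)))
```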