├── LICENSE ├── README.md ├── maddpg ├── agent.py ├── buffer.py ├── ddpg │ ├── agent.py │ ├── buffer.py │ ├── main.py │ ├── networks.py │ ├── run.py │ └── utils.py ├── env_test.py ├── maddpg.py ├── networks.py ├── plot.py ├── plots │ └── maddpg_vs_ddpg.png ├── run.py └── utils.py └── mappo └── mappo ├── agent.py ├── mappo.py ├── memory.py ├── networks.py ├── run.py ├── utils.py └── vec_env.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Phil Tabor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent-Reinforcement-Learning 2 | PyTorch implementations of MADDPG, MAPPO (coming) 3 | 4 | The implementation of MADDPG is compatible with PyTorch 1.13 and PettingZoo 1.23.1. 5 | 6 | I recommend using a virtual environment to install dependencies, as I can't guarantee 7 | that future versions (i.e. torch 2) won't break this implementation. 
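A quick way to catch a version mismatch before training (a minimal sketch, not part of the repo) is to compare the installed packages against the versions listed above:

```python
# Minimal version check -- hypothetical helper, not shipped with this repo.
import torch
import pettingzoo

TESTED = {'torch': '1.13', 'pettingzoo': '1.23.1'}

for name, module in (('torch', torch), ('pettingzoo', pettingzoo)):
    installed = module.__version__
    if not installed.startswith(TESTED[name]):
        print(f'warning: {name} {installed} installed; '
              f'this code was tested with {TESTED[name]}')
```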
8 | 9 | This code is part of my course on multi agent reinforcement learning, found on the 10 | Neuralnet Academy, which you can find here: https://www.neuralnet.ai/courses 11 | -------------------------------------------------------------------------------- /maddpg/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn.functional as F 4 | from networks import ActorNetwork, CriticNetwork 5 | 6 | 7 | class Agent: 8 | def __init__(self, actor_dims, critic_dims, n_actions, 9 | n_agents, agent_idx, chkpt_dir, min_action, 10 | max_action, alpha=1e-4, beta=1e-3, fc1=64, 11 | fc2=64, gamma=0.95, tau=0.01): 12 | self.gamma = gamma 13 | self.tau = tau 14 | self.n_actions = n_actions 15 | agent_name = 'agent_%s' % agent_idx 16 | self.agent_idx = agent_idx 17 | self.min_action = min_action 18 | self.max_action = max_action 19 | 20 | self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions, 21 | chkpt_dir=chkpt_dir, 22 | name=agent_name+'_actor') 23 | self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, 24 | n_actions, chkpt_dir=chkpt_dir, 25 | name=agent_name+'target__actor') 26 | 27 | self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, 28 | chkpt_dir=chkpt_dir, 29 | name=agent_name+'_critic') 30 | self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, 31 | chkpt_dir=chkpt_dir, 32 | name=agent_name+'_target__critic') 33 | 34 | self.update_network_parameters(tau=1) 35 | 36 | def choose_action(self, observation, evaluate=False): 37 | state = T.tensor(observation[np.newaxis, :], dtype=T.float, 38 | device=self.actor.device) 39 | actions = self.actor.forward(state) 40 | noise = T.randn(size=(self.n_actions,)).to(self.actor.device) 41 | noise *= T.tensor(1 - int(evaluate)) 42 | action = T.clamp(actions + noise, 43 | T.tensor(self.min_action, device=self.actor.device), 44 | T.tensor(self.max_action, device=self.actor.device)) 45 | return action.data.cpu().numpy()[0] 46 | 47 | def update_network_parameters(self, tau=None): 48 | tau = tau or self.tau 49 | 50 | src = self.actor 51 | dest = self.target_actor 52 | 53 | for param, target in zip(src.parameters(), dest.parameters()): 54 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 55 | 56 | src = self.critic 57 | dest = self.target_critic 58 | 59 | for param, target in zip(src.parameters(), dest.parameters()): 60 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 61 | 62 | def save_models(self): 63 | self.actor.save_checkpoint() 64 | self.target_actor.save_checkpoint() 65 | self.critic.save_checkpoint() 66 | self.target_critic.save_checkpoint() 67 | 68 | def load_models(self): 69 | self.actor.load_checkpoint() 70 | self.target_actor.load_checkpoint() 71 | self.critic.load_checkpoint() 72 | self.target_critic.load_checkpoint() 73 | 74 | def learn(self, memory, agent_list): 75 | if not memory.ready(): 76 | return 77 | 78 | actor_states, states, actions, rewards,\ 79 | actor_new_states, states_, dones = memory.sample_buffer() 80 | 81 | device = self.actor.device 82 | 83 | states = T.tensor(np.array(states), dtype=T.float, device=device) 84 | rewards = T.tensor(np.array(rewards), dtype=T.float, device=device) 85 | states_ = T.tensor(np.array(states_), dtype=T.float, device=device) 86 | dones = T.tensor(np.array(dones), device=device) 87 | 88 | actor_states = [T.tensor(actor_states[idx], 89 | device=device, dtype=T.float) 90 | for idx in range(len(agent_list))] 91 | actor_new_states = 
[T.tensor(actor_new_states[idx], 92 | device=device, dtype=T.float) 93 | for idx in range(len(agent_list))] 94 | actions = [T.tensor(actions[idx], device=device, dtype=T.float) 95 | for idx in range(len(agent_list))] 96 | 97 | with T.no_grad(): 98 | new_actions = T.cat([agent.target_actor(actor_new_states[idx]) 99 | for idx, agent in enumerate(agent_list)], 100 | dim=1) 101 | critic_value_ = self.target_critic.forward( 102 | states_, new_actions).squeeze() 103 | critic_value_[dones[:, self.agent_idx]] = 0.0 104 | target = rewards[:, self.agent_idx] + self.gamma * critic_value_ 105 | 106 | old_actions = T.cat([actions[idx] for idx in range(len(agent_list))], 107 | dim=1) 108 | critic_value = self.critic.forward(states, old_actions).squeeze() 109 | critic_loss = F.mse_loss(target, critic_value) 110 | 111 | self.critic.optimizer.zero_grad() 112 | critic_loss.backward() 113 | T.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.0) 114 | self.critic.optimizer.step() 115 | 116 | actions[self.agent_idx] = self.actor.forward( 117 | actor_states[self.agent_idx]) 118 | actions = T.cat([actions[i] for i in range(len(agent_list))], dim=1) 119 | actor_loss = -self.critic.forward(states, actions).mean() 120 | self.actor.optimizer.zero_grad() 121 | actor_loss.backward() 122 | T.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.0) 123 | self.actor.optimizer.step() 124 | 125 | self.update_network_parameters() 126 | -------------------------------------------------------------------------------- /maddpg/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MultiAgentReplayBuffer: 5 | def __init__(self, max_size, critic_dims, actor_dims, 6 | n_actions, n_agents, batch_size): 7 | self.mem_size = max_size 8 | self.mem_cntr = 0 9 | self.n_agents = n_agents 10 | self.actor_dims = actor_dims 11 | self.batch_size = batch_size 12 | self.n_actions = n_actions 13 | 14 | self.state_memory = np.zeros((self.mem_size, critic_dims)) 15 | self.new_state_memory = np.zeros((self.mem_size, critic_dims)) 16 | self.reward_memory = np.zeros((self.mem_size, n_agents)) 17 | self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool) 18 | 19 | self.init_actor_memory() 20 | 21 | def init_actor_memory(self): 22 | self.actor_state_memory = [] 23 | self.actor_new_state_memory = [] 24 | self.actor_action_memory = [] 25 | 26 | for i in range(self.n_agents): 27 | self.actor_state_memory.append( 28 | np.zeros((self.mem_size, self.actor_dims[i]))) 29 | self.actor_new_state_memory.append( 30 | np.zeros((self.mem_size, self.actor_dims[i]))) 31 | self.actor_action_memory.append( 32 | np.zeros((self.mem_size, self.n_actions[i]))) 33 | 34 | def store_transition(self, raw_obs, state, action, reward, 35 | raw_obs_, state_, done): 36 | 37 | index = self.mem_cntr % self.mem_size 38 | for agent_idx in range(self.n_agents): 39 | self.actor_state_memory[agent_idx][index] = raw_obs[agent_idx] 40 | self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_idx] 41 | self.actor_action_memory[agent_idx][index] = action[agent_idx] 42 | 43 | self.state_memory[index] = state 44 | self.new_state_memory[index] = state_ 45 | self.reward_memory[index] = reward 46 | self.terminal_memory[index] = done 47 | self.mem_cntr += 1 48 | 49 | def sample_buffer(self): 50 | max_mem = min(self.mem_cntr, self.mem_size) 51 | 52 | batch = np.random.choice(max_mem, self.batch_size, replace=False) 53 | 54 | states = self.state_memory[batch] 55 | states_ = 
self.new_state_memory[batch] 56 | rewards = self.reward_memory[batch] 57 | terminal = self.terminal_memory[batch] 58 | 59 | actor_states = [] 60 | actor_new_states = [] 61 | actions = [] 62 | for agent_idx in range(self.n_agents): 63 | actor_states.append(self.actor_state_memory[agent_idx][batch]) 64 | actor_new_states.append( 65 | self.actor_new_state_memory[agent_idx][batch]) 66 | actions.append(self.actor_action_memory[agent_idx][batch]) 67 | 68 | return actor_states, states, actions, rewards, \ 69 | actor_new_states, states_, terminal 70 | 71 | def ready(self): 72 | if self.mem_cntr >= self.batch_size: 73 | return True 74 | return False 75 | -------------------------------------------------------------------------------- /maddpg/ddpg/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn.functional as F 4 | from networks import ActorNetwork, CriticNetwork 5 | from buffer import ReplayBuffer 6 | 7 | 8 | class Agent: 9 | def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99, 10 | max_size=1000000, fc1_dims=400, fc2_dims=300, 11 | batch_size=64): 12 | self.gamma = gamma 13 | self.tau = tau 14 | self.batch_size = batch_size 15 | self.alpha = alpha 16 | self.beta = beta 17 | self.n_actions = n_actions 18 | 19 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 20 | 21 | self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 22 | n_actions=n_actions, name='actor') 23 | self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 24 | n_actions=n_actions, name='critic') 25 | 26 | self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 27 | n_actions=n_actions, 28 | name='target_actor') 29 | 30 | self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, 31 | fc2_dims, n_actions=n_actions, 32 | name='target_critic') 33 | 34 | self.update_network_parameters(tau=1) 35 | 36 | def choose_action(self, observation, eval=False): 37 | state = T.tensor(observation[np.newaxis, :], dtype=T.float, 38 | device=self.actor.device) 39 | mu = self.actor.forward(state).to(self.actor.device) 40 | noise = T.rand(self.n_actions).to(self.actor.device) 41 | noise *= T.tensor(1 - int(eval)) 42 | mu_prime = mu + noise 43 | mu_prime = T.clamp(mu_prime, 0., 1.) 
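        # T.rand draws uniform exploration noise in [0, 1); the (1 - int(eval))
        # factor disables it during evaluation, and the clamp keeps the action
        # inside the [0, 1] range produced by the sigmoid-bounded ActorNetwork.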
44 | 45 | return mu_prime.cpu().detach().numpy()[0] 46 | 47 | def remember(self, state, action, reward, state_, done): 48 | self.memory.store_transition(state, action, reward, state_, done) 49 | 50 | def save_models(self): 51 | self.actor.save_checkpoint() 52 | self.target_actor.save_checkpoint() 53 | self.critic.save_checkpoint() 54 | self.target_critic.save_checkpoint() 55 | 56 | def load_models(self): 57 | self.actor.load_checkpoint() 58 | self.target_actor.load_checkpoint() 59 | self.critic.load_checkpoint() 60 | self.target_critic.load_checkpoint() 61 | 62 | def learn(self): 63 | if self.memory.mem_cntr < self.batch_size: 64 | return 65 | 66 | states, actions, rewards, states_, done = \ 67 | self.memory.sample_buffer(self.batch_size) 68 | 69 | states = T.tensor(states, dtype=T.float).to(self.actor.device) 70 | states_ = T.tensor(states_, dtype=T.float).to(self.actor.device) 71 | actions = T.tensor(actions, dtype=T.float).to(self.actor.device) 72 | rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device) 73 | done = T.tensor(done).to(self.actor.device) 74 | 75 | target_actions = self.target_actor.forward(states_) 76 | critic_value_ = self.target_critic.forward(states_, target_actions) 77 | critic_value = self.critic.forward(states, actions) 78 | 79 | critic_value_[done] = 0.0 80 | critic_value_ = critic_value_.view(-1) 81 | 82 | target = rewards + self.gamma*critic_value_ 83 | target = target.view(self.batch_size, 1) 84 | 85 | self.critic.optimizer.zero_grad() 86 | critic_loss = F.mse_loss(target, critic_value) 87 | critic_loss.backward() 88 | self.critic.optimizer.step() 89 | 90 | self.actor.optimizer.zero_grad() 91 | actor_loss = -self.critic.forward(states, self.actor.forward(states)) 92 | actor_loss = T.mean(actor_loss) 93 | actor_loss.backward() 94 | self.actor.optimizer.step() 95 | 96 | self.update_network_parameters() 97 | 98 | def update_network_parameters(self, tau=None): 99 | tau = tau or self.tau 100 | src = self.actor 101 | dest = self.target_actor 102 | for param, target in zip(src.parameters(), dest.parameters()): 103 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 104 | src = self.critic 105 | dest = self.target_critic 106 | for param, target in zip(src.parameters(), dest.parameters()): 107 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 108 | -------------------------------------------------------------------------------- /maddpg/ddpg/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayBuffer: 5 | def __init__(self, max_size, input_shape, n_actions): 6 | self.mem_size = max_size 7 | self.mem_cntr = 0 8 | self.state_memory = np.zeros((self.mem_size, input_shape)) 9 | self.new_state_memory = np.zeros((self.mem_size, input_shape)) 10 | self.action_memory = np.zeros((self.mem_size, n_actions)) 11 | self.reward_memory = np.zeros(self.mem_size) 12 | self.terminal_memory = np.zeros(self.mem_size, dtype=bool) 13 | 14 | def store_transition(self, state, action, reward, state_, done): 15 | index = self.mem_cntr % self.mem_size 16 | self.state_memory[index] = state 17 | self.action_memory[index] = action 18 | self.reward_memory[index] = reward 19 | self.new_state_memory[index] = state_ 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | 
actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /maddpg/ddpg/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from pettingzoo.mpe import simple_speaker_listener_v4 4 | 5 | 6 | def obs_list_to_state_vector(observation): 7 | state = np.array([]) 8 | for obs in observation: 9 | state = np.concatenate([state, obs]) 10 | return state 11 | 12 | 13 | if __name__ == '__main__': 14 | env = simple_speaker_listener_v4.parallel_env( 15 | continuous_actions=True) 16 | scenario = 'simple_speaker_listener' 17 | 18 | initial_temp = env.reset() 19 | n_agents = env.max_num_agents 20 | 21 | agents = [] 22 | 23 | for agent in env.agents: 24 | input_dims = env.observation_space(agent).shape[0] 25 | n_actions = env.action_space(agent).shape[0] 26 | 27 | agents.append(Agent(alpha=1e-3, beta=1e-3, 28 | input_dims=input_dims, tau=0.01, gamma=0.95, 29 | batch_size=1024, fc1_dims=64, fc2_dims=64, 30 | n_actions=n_actions)) 31 | 32 | N_GAMES = 25_000 33 | PRINT_INTERVAL = 500 34 | total_steps = 0 35 | score_history = [] 36 | evaluate = False 37 | best_score = 0 38 | 39 | if evaluate: 40 | for agent in agents: 41 | agent.load_models() 42 | 43 | total_steps = 0 44 | 45 | for i in range(N_GAMES): 46 | observation, _ = env.reset() 47 | terminal = [False] * n_agents 48 | score = 0 49 | observation = list(observation.values()) 50 | 51 | while not any(terminal): 52 | action = [agent.choose_action(observation[idx]) 53 | for idx, agent in enumerate(agents)] 54 | action = {agent: act for agent, act in zip(env.agents, action)} 55 | observation_, reward, done, trunc, info = env.step(action) 56 | 57 | observation_ = list(observation_.values()) 58 | reward = list(reward.values()) 59 | done = list(done.values()) 60 | trunc = list(trunc.values()) 61 | action = list(action.values()) 62 | 63 | terminal = [d or t for d, t in zip(done, trunc)] 64 | 65 | for idx, agent in enumerate(agents): 66 | agent.remember(observation[idx], action[idx], 67 | reward[idx], observation_[idx], terminal[idx]) 68 | if total_steps % 100 == 0 and not evaluate: 69 | for agent in agents: 70 | agent.learn() 71 | score += sum(reward) 72 | observation = observation_ 73 | total_steps += 1 74 | score_history.append(score) 75 | avg_score = np.mean(score_history[-100:]) 76 | 77 | if avg_score > best_score: 78 | best_score = avg_score 79 | # agent.save_models() 80 | if i % PRINT_INTERVAL == 0 and i > 0: 81 | print(f'episode {i} avg score {avg_score:.1f}') 82 | -------------------------------------------------------------------------------- /maddpg/ddpg/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, 10 | n_actions, name, chkpt_dir='tmp/ddpg'): 11 | super(CriticNetwork, self).__init__() 12 | 13 | self.chkpt_file = os.path.join(chkpt_dir, name) 14 | self.fc1 = nn.Linear(input_dims+n_actions, fc1_dims) 15 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 16 | self.q = nn.Linear(fc2_dims, 1) 17 | 18 | self.optimizer = optim.Adam(self.parameters(), 
lr=beta) 19 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 20 | 21 | self.to(self.device) 22 | 23 | def forward(self, state, action): 24 | x = F.relu(self.fc1(T.cat([state, action], dim=1))) 25 | x = F.relu(self.fc2(x)) 26 | q = self.q(x) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.chkpt_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.chkpt_file)) 35 | 36 | 37 | class ActorNetwork(nn.Module): 38 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, 39 | n_actions, name, chkpt_dir='tmp/ddpg'): 40 | super(ActorNetwork, self).__init__() 41 | 42 | self.chkpt_file = os.path.join(chkpt_dir, name) 43 | 44 | self.fc1 = nn.Linear(input_dims, fc1_dims) 45 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 46 | 47 | self.pi = nn.Linear(fc2_dims, n_actions) 48 | 49 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 50 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 51 | 52 | self.to(self.device) 53 | 54 | def forward(self, state): 55 | x = self.fc1(state) 56 | x = F.relu(x) 57 | x = F.relu(self.fc2(x)) 58 | pi = T.sigmoid(self.pi(x)) 59 | 60 | return pi 61 | 62 | def save_checkpoint(self): 63 | T.save(self.state_dict(), self.chkpt_file) 64 | 65 | def load_checkpoint(self): 66 | self.load_state_dict(T.load(self.chkpt_file)) 67 | -------------------------------------------------------------------------------- /maddpg/ddpg/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from pettingzoo.mpe import simple_speaker_listener_v4 4 | 5 | 6 | def obs_list_to_state_vector(observation): 7 | state = np.array([]) 8 | for obs in observation: 9 | state = np.concatenate([state, obs]) 10 | return state 11 | 12 | 13 | def run(): 14 | parallel_env = simple_speaker_listener_v4.parallel_env( 15 | continuous_actions=True) 16 | _, _ = parallel_env.reset() 17 | n_agents = parallel_env.max_num_agents 18 | 19 | n_actions = [] 20 | agents = [] 21 | 22 | for agent in parallel_env.agents: 23 | input_dims = parallel_env.observation_space(agent).shape[0] 24 | n_actions = parallel_env.action_space(agent).shape[0] 25 | 26 | agents.append(Agent(input_dims=input_dims, n_actions=n_actions, 27 | gamma=0.95, tau=0.01, alpha=1e-4, beta=1e-3)) 28 | 29 | EVAL_INTERVAL = 1000 30 | MAX_STEPS = 10_000 31 | 32 | total_steps = 0 33 | episode = 0 34 | 35 | eval_scores = [] 36 | eval_steps = [] 37 | score = evaluate(agents, parallel_env, episode, total_steps) 38 | eval_scores.append(score) 39 | eval_steps.append(total_steps) 40 | 41 | while total_steps < MAX_STEPS: 42 | obs, _ = parallel_env.reset() 43 | terminal = [False] * n_agents 44 | obs = list(obs.values()) 45 | while not any(terminal): 46 | action = [agent.choose_action(obs[idx]) 47 | for idx, agent in enumerate(agents)] 48 | action = {agent: act 49 | for agent, act in zip(parallel_env.agents, action)} 50 | obs_, reward, done, truncated, info = parallel_env.step(action) 51 | list_done = list(done.values()) 52 | list_reward = list(reward.values()) 53 | list_action = list(action.values()) 54 | obs_ = list(obs_.values()) 55 | list_trunc = list(truncated.values()) 56 | 57 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 58 | 59 | for idx, agent in enumerate(agents): 60 | agent.remember(obs[idx], list_action[idx], 61 | list_reward[idx], obs_[idx], terminal[idx]) 62 | 63 | if total_steps % 100 == 0: 64 | for agent in agents: 65 | agent.learn() 66 | obs = obs_ 67 
| total_steps += 1 68 | 69 | if total_steps % EVAL_INTERVAL == 0 and total_steps > 0: 70 | score = evaluate(agents, parallel_env, episode, total_steps) 71 | eval_scores.append(score) 72 | eval_steps.append(total_steps) 73 | 74 | episode += 1 75 | 76 | np.save('../data/ddpg_scores.npy', np.array(eval_scores)) 77 | np.save('../data/ddpg_steps.npy', np.array(eval_steps)) 78 | 79 | 80 | def evaluate(agents, env, ep, step): 81 | score_history = [] 82 | for i in range(3): 83 | obs, _ = env.reset() 84 | score = 0 85 | terminal = [False] * env.max_num_agents 86 | obs = list(obs.values()) 87 | while not any(terminal): 88 | action = [agent.choose_action(obs[idx], eval=True) 89 | for idx, agent in enumerate(agents)] 90 | action = {agent: act 91 | for agent, act in zip(env.agents, action)} 92 | 93 | obs_, reward, done, truncated, info = env.step(action) 94 | obs_ = list(obs_.values()) 95 | list_trunc = list(truncated.values()) 96 | list_reward = list(reward.values()) 97 | list_done = list(done.values()) 98 | 99 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 100 | 101 | obs = obs_ 102 | score += sum(list_reward) 103 | score_history.append(score) 104 | avg_score = np.mean(score_history) 105 | print(f'Evaluation episode {ep} train steps {step}' 106 | f' average score {avg_score:.1f}') 107 | 108 | return avg_score 109 | 110 | 111 | if __name__ == '__main__': 112 | run() 113 | -------------------------------------------------------------------------------- /maddpg/ddpg/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /maddpg/env_test.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_speaker_listener_v4 2 | 3 | 4 | env = simple_speaker_listener_v4.parallel_env(continuous_actions=True) 5 | _, _ = env.reset() 6 | for agent in env.agents: 7 | print(f'agent observation space {env.observation_space(agent)}') 8 | obs, info = env.reset() 9 | print(f'initial observation: {obs} debug info: {info}') 10 | terminal = [False] * env.max_num_agents 11 | while not any(terminal): 12 | actions = {} 13 | for agent in env.agents: 14 | actions[agent] = env.action_space(agent).sample() 15 | obs_, reward, done, trunc, info = env.step(actions) 16 | terminal = [d or t for d, t in zip(done.values(), trunc.values())] 17 | print(f'actions taken {actions}') 18 | print(f'obs values {obs.values()}') 19 | obs = list(obs.values()) 20 | print(f'obs as a list {obs}') 21 | -------------------------------------------------------------------------------- /maddpg/maddpg.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | 3 | 4 | class MADDPG: 5 | def __init__(self, actor_dims, critic_dims, n_agents, n_actions, env, 6 | alpha=1e-4, beta=1e-3, fc1=64, fc2=64, gamma=0.95, tau=0.01, 7 | chkpt_dir='tmp/maddpg/', scenario='co-op_navigation'): 8 | self.agents = [] 9 | chkpt_dir += scenario 10 | for agent_idx in range(n_agents): 11 | agent = list(env.action_spaces.keys())[agent_idx] 12 | min_action = 
env.action_space(agent).low 13 | max_action = env.action_space(agent).high 14 | self.agents.append(Agent(actor_dims[agent_idx], critic_dims, 15 | n_actions[agent_idx], n_agents, agent_idx, 16 | alpha=alpha, beta=beta, tau=tau, fc1=fc1, 17 | fc2=fc2, chkpt_dir=chkpt_dir, 18 | gamma=gamma, min_action=min_action, 19 | max_action=max_action)) 20 | 21 | def save_checkpoint(self): 22 | for agent in self.agents: 23 | agent.save_models() 24 | 25 | def load_checkpoint(self): 26 | for agent in self.agents: 27 | agent.load_models() 28 | 29 | def choose_action(self, raw_obs, evaluate=False): 30 | actions = {} 31 | for agent_id, agent in zip(raw_obs, self.agents): 32 | action = agent.choose_action(raw_obs[agent_id], evaluate) 33 | actions[agent_id] = action 34 | return actions 35 | 36 | def learn(self, memory): 37 | for agent in self.agents: 38 | agent.learn(memory, self.agents) 39 | -------------------------------------------------------------------------------- /maddpg/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1, fc2, 10 | name, chkpt_dir): 11 | super(CriticNetwork, self).__init__() 12 | 13 | self.chkpt_file = os.path.join(chkpt_dir, name) 14 | self.fc1 = nn.Linear(input_dims, fc1) 15 | self.fc2 = nn.Linear(fc1, fc2) 16 | self.q = nn.Linear(fc2, 1) 17 | 18 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 19 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 20 | 21 | self.to(self.device) 22 | 23 | def forward(self, state, action): 24 | x = F.relu(self.fc1(T.cat([state, action], dim=1))) 25 | x = F.relu(self.fc2(x)) 26 | q = self.q(x) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.chkpt_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.chkpt_file)) 35 | 36 | 37 | class ActorNetwork(nn.Module): 38 | def __init__(self, alpha, input_dims, fc1, fc2, 39 | n_actions, name, chkpt_dir): 40 | super(ActorNetwork, self).__init__() 41 | 42 | self.chkpt_file = os.path.join(chkpt_dir, name) 43 | 44 | self.fc1 = nn.Linear(input_dims, fc1) 45 | self.fc2 = nn.Linear(fc1, fc2) 46 | self.pi = nn.Linear(fc2, n_actions) 47 | 48 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 49 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 50 | 51 | self.to(self.device) 52 | 53 | def forward(self, state): 54 | x = F.relu(self.fc1(state)) 55 | x = F.relu(self.fc2(x)) 56 | pi = T.tanh(self.pi(x)) 57 | 58 | return pi 59 | 60 | def save_checkpoint(self): 61 | T.save(self.state_dict(), self.chkpt_file) 62 | 63 | def load_checkpoint(self): 64 | self.load_state_dict(T.load(self.chkpt_file)) 65 | -------------------------------------------------------------------------------- /maddpg/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import plot_learning_curve 3 | 4 | maddpg_scores = np.load('data/maddpg_scores.npy') 5 | maddpg_steps = np.load('data/maddpg_steps.npy') 6 | 7 | ddpg_scores = np.load('data/ddpg_scores.npy') 8 | ddpg_steps = np.load('data/ddpg_steps.npy') 9 | 10 | plot_learning_curve(x=maddpg_steps, 11 | scores=(maddpg_scores, ddpg_scores), 12 | filename='plots/maddpg_vs_ddpg.png') 13 | 
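# This script assumes maddpg/run.py has written data/maddpg_scores.npy and
# data/maddpg_steps.npy, and that ddpg/run.py (which saves to '../data/') has
# written data/ddpg_scores.npy and data/ddpg_steps.npy into the same directory;
# run it from the maddpg/ directory so the relative paths resolve.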
-------------------------------------------------------------------------------- /maddpg/plots/maddpg_vs_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Multi-Agent-Reinforcement-Learning/d236304b5bd06b9efc276b24aa0e2890fd5bf65d/maddpg/plots/maddpg_vs_ddpg.png -------------------------------------------------------------------------------- /maddpg/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maddpg import MADDPG 3 | from buffer import MultiAgentReplayBuffer 4 | from pettingzoo.mpe import simple_speaker_listener_v4 5 | 6 | 7 | def obs_list_to_state_vector(observation): 8 | state = np.array([]) 9 | for obs in observation: 10 | state = np.concatenate([state, obs]) 11 | return state 12 | 13 | 14 | def run(): 15 | parallel_env = simple_speaker_listener_v4.parallel_env( 16 | continuous_actions=True) 17 | _, _ = parallel_env.reset() 18 | n_agents = parallel_env.max_num_agents 19 | 20 | actor_dims = [] 21 | n_actions = [] 22 | for agent in parallel_env.agents: 23 | actor_dims.append(parallel_env.observation_space(agent).shape[0]) 24 | n_actions.append(parallel_env.action_space(agent).shape[0]) 25 | critic_dims = sum(actor_dims) + sum(n_actions) 26 | 27 | maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions, 28 | env=parallel_env, gamma=0.95, alpha=1e-4, beta=1e-3) 29 | critic_dims = sum(actor_dims) 30 | memory = MultiAgentReplayBuffer(1_000_000, critic_dims, actor_dims, 31 | n_actions, n_agents, batch_size=1024) 32 | 33 | EVAL_INTERVAL = 1000 34 | MAX_STEPS = 10_000 35 | 36 | total_steps = 0 37 | episode = 0 38 | eval_scores = [] 39 | eval_steps = [] 40 | 41 | score = evaluate(maddpg_agents, parallel_env, episode, total_steps) 42 | eval_scores.append(score) 43 | eval_steps.append(total_steps) 44 | 45 | while total_steps < MAX_STEPS: 46 | obs, _ = parallel_env.reset() 47 | terminal = [False] * n_agents 48 | while not any(terminal): 49 | actions = maddpg_agents.choose_action(obs) 50 | 51 | obs_, reward, done, trunc, info = parallel_env.step(actions) 52 | 53 | list_done = list(done.values()) 54 | list_obs = list(obs.values()) 55 | list_reward = list(reward.values()) 56 | list_actions = list(actions.values()) 57 | list_obs_ = list(obs_.values()) 58 | list_trunc = list(trunc.values()) 59 | 60 | state = obs_list_to_state_vector(list_obs) 61 | state_ = obs_list_to_state_vector(list_obs_) 62 | 63 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 64 | memory.store_transition(list_obs, state, list_actions, list_reward, 65 | list_obs_, state_, terminal) 66 | 67 | if total_steps % 100 == 0: 68 | maddpg_agents.learn(memory) 69 | obs = obs_ 70 | total_steps += 1 71 | 72 | if total_steps % EVAL_INTERVAL == 0: 73 | score = evaluate(maddpg_agents, parallel_env, episode, total_steps) 74 | eval_scores.append(score) 75 | eval_steps.append(total_steps) 76 | 77 | episode += 1 78 | 79 | np.save('data/maddpg_scores.npy', np.array(eval_scores)) 80 | np.save('data/maddpg_steps.npy', np.array(eval_steps)) 81 | 82 | 83 | def evaluate(agents, env, ep, step, n_eval=3): 84 | score_history = [] 85 | for i in range(n_eval): 86 | obs, _ = env.reset() 87 | score = 0 88 | terminal = [False] * env.max_num_agents 89 | while not any(terminal): 90 | actions = agents.choose_action(obs, evaluate=True) 91 | obs_, reward, done, trunc, info = env.step(actions) 92 | 93 | list_trunc = list(trunc.values()) 94 | list_reward = list(reward.values()) 95 | 
list_done = list(done.values()) 96 | 97 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 98 | 99 | obs = obs_ 100 | score += sum(list_reward) 101 | score_history.append(score) 102 | avg_score = np.mean(score_history) 103 | print(f'Evaluation episode {ep} train steps {step}' 104 | f' average score {avg_score:.1f}') 105 | return avg_score 106 | 107 | 108 | if __name__ == '__main__': 109 | run() 110 | -------------------------------------------------------------------------------- /maddpg/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_learning_curve(x, scores, filename, lines=None): 6 | maddpg_scores, ddpg_scores = scores 7 | 8 | fig = plt.figure() 9 | ax = fig.add_subplot(111, label="1") 10 | ax2 = fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | N = len(maddpg_scores) 13 | running_avg = np.empty(N) 14 | for t in range(N): 15 | running_avg[t] = np.mean( 16 | maddpg_scores[max(0, t-100):(t+1)]) 17 | 18 | ax.plot(x, running_avg, color="C0") 19 | ax.set_xlabel("Training Steps", color="C0") 20 | ax.set_ylabel("MADDPG Score", color="C0") 21 | ax.tick_params(axis='x', colors="C0") 22 | ax.tick_params(axis='y', colors="C0") 23 | 24 | N = len(ddpg_scores) 25 | running_avg = np.empty(N) 26 | for t in range(N): 27 | running_avg[t] = np.mean(ddpg_scores[max(0, t-100):(t+1)]) 28 | 29 | ax2.plot(x, running_avg, color="C1") 30 | ax2.axes.get_xaxis().set_visible(False) 31 | ax2.yaxis.tick_right() 32 | ax2.set_ylabel('DDPG Score', color="C1") 33 | ax2.yaxis.set_label_position('right') 34 | ax2.tick_params(axis='y', colors="C1") 35 | 36 | if lines is not None: 37 | for line in lines: 38 | plt.axvline(x=line) 39 | 40 | plt.savefig(filename) 41 | -------------------------------------------------------------------------------- /mappo/mappo/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from networks import ContinuousActorNetwork, ContinuousCriticNetwork 4 | 5 | 6 | class Agent: 7 | def __init__(self, actor_dims, critic_dims, 8 | n_actions, agent_idx, agent_name, 9 | gamma=0.99, alpha=3e-4, T=2048, 10 | gae_lambda=0.95, policy_clip=0.2, 11 | batch_size=64, n_epochs=10, 12 | n_procs=8, chkpt_dir=None, 13 | scenario=None): 14 | self.gamma = gamma 15 | self.policy_clip = policy_clip 16 | self.n_epochs = n_epochs 17 | self.gae_lambda = gae_lambda 18 | self.entropy_coefficient = 1e-3 19 | self.agent_idx = agent_idx 20 | self.agent_name = agent_name 21 | self.n_procs = n_procs 22 | 23 | self.actor = ContinuousActorNetwork(n_actions, actor_dims, alpha, 24 | chkpt_dir=chkpt_dir, 25 | scenario=scenario) 26 | self.critic = ContinuousCriticNetwork(critic_dims, alpha, 27 | chkpt_dir=chkpt_dir, 28 | scenario=scenario) 29 | self.n_actions = n_actions 30 | 31 | def save_models(self): 32 | self.actor.save_checkpoint() 33 | self.critic.save_checkpoint() 34 | 35 | def load_models(self): 36 | self.actor.load_checkpoint() 37 | self.critic.load_checkpoint() 38 | 39 | def choose_action(self, observation): 40 | with T.no_grad(): 41 | state = T.tensor(observation, dtype=T.float, 42 | device=self.actor.device) 43 | 44 | dist = self.actor(state) 45 | action = dist.sample() 46 | probs = dist.log_prob(action) 47 | return action.cpu().numpy(), probs.cpu().numpy() 48 | 49 | def calc_adv_and_returns(self, memories): 50 | states, new_states, r, dones = memories 51 | with T.no_grad(): 52 | values = 
self.critic(states).squeeze() 53 | values_ = self.critic(new_states).squeeze() 54 | deltas = r[:, :, self.agent_idx] + self.gamma * values_ - values 55 | deltas = deltas.cpu().numpy() 56 | adv = [0] 57 | for step in reversed(range(deltas.shape[0])): 58 | advantage = deltas[step] +\ 59 | self.gamma*self.gae_lambda*adv[-1]*np.array(dones[step]) 60 | adv.append(advantage) 61 | adv.reverse() 62 | adv = np.array(adv[:-1]) 63 | adv = T.tensor(adv, device=self.critic.device).unsqueeze(2) 64 | returns = adv + values.unsqueeze(2) 65 | adv = (adv - adv.mean()) / (adv.std()+1e-4) 66 | return adv, returns 67 | 68 | def learn(self, memory): 69 | actor_states, states, actions, old_probs, rewards, actor_new_states, \ 70 | states_, dones = memory.recall() 71 | device = self.critic.device 72 | state_arr = T.tensor(states, dtype=T.float, device=device) 73 | states__arr = T.tensor(states_, dtype=T.float, device=device) 74 | r = T.tensor(rewards, dtype=T.float, device=device) 75 | action_arr = T.tensor(actions[self.agent_name], 76 | dtype=T.float, device=device) 77 | old_probs_arr = T.tensor(old_probs[self.agent_name], dtype=T.float, 78 | device=device) 79 | actor_states_arr = T.tensor(actor_states[self.agent_name], 80 | dtype=T.float, device=device) 81 | adv, returns = self.calc_adv_and_returns((state_arr, states__arr, 82 | r, dones)) 83 | for epoch in range(self.n_epochs): 84 | batches = memory.generate_batches() 85 | for batch in batches: 86 | old_probs = old_probs_arr[batch] 87 | actions = action_arr[batch] 88 | actor_states = actor_states_arr[batch] 89 | dist = self.actor(actor_states) 90 | new_probs = dist.log_prob(actions) 91 | prob_ratio = T.exp(new_probs.sum(2, keepdims=True) - old_probs. 92 | sum(2, keepdims=True)) 93 | weighted_probs = adv[batch] * prob_ratio 94 | weighted_clipped_probs = T.clamp( 95 | prob_ratio, 1-self.policy_clip, 1+self.policy_clip) * \ 96 | adv[batch] 97 | entropy = dist.entropy().sum(2, keepdims=True) 98 | actor_loss = -T.min(weighted_probs, 99 | weighted_clipped_probs) 100 | actor_loss -= self.entropy_coefficient * entropy 101 | self.actor.optimizer.zero_grad() 102 | actor_loss.mean().backward() 103 | T.nn.utils.clip_grad_norm_(self.actor.parameters(), 40) 104 | self.actor.optimizer.step() 105 | 106 | states = state_arr[batch] 107 | critic_value = self.critic(states).squeeze() 108 | critic_loss = \ 109 | (critic_value - returns[batch].squeeze()).pow(2).mean() 110 | self.critic.optimizer.zero_grad() 111 | critic_loss.backward() 112 | self.critic.optimizer.step() 113 | -------------------------------------------------------------------------------- /mappo/mappo/mappo.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | 3 | 4 | class MAPPO: 5 | def __init__(self, actor_dims, critic_dims, n_agents, n_actions, 6 | env, T, n_procs, n_epochs, 7 | alpha=1e-4, gamma=0.95, chkpt_dir='tmp/mappo/', 8 | scenario='co-op_navigation'): 9 | self.agents = [] 10 | chkpt_dir += scenario 11 | for agent_idx, agent in enumerate(env.agents): 12 | self.agents.append(Agent(actor_dims[agent], critic_dims, 13 | n_actions[agent], agent_idx, 14 | alpha=alpha, chkpt_dir=chkpt_dir, 15 | gamma=gamma, agent_name=agent, 16 | scenario=scenario)) 17 | 18 | def save_checkpoint(self): 19 | for agent in self.agents: 20 | agent.save_models() 21 | 22 | def load_checkpoint(self): 23 | for agent in self.agents: 24 | agent.load_models() 25 | 26 | def choose_action(self, raw_obs): 27 | actions = {} 28 | probs = {} 29 | for agent_id, agent in zip(raw_obs, 
self.agents): 30 | action, prob = agent.choose_action(raw_obs[agent_id]) 31 | actions[agent_id] = action 32 | probs[agent_id] = prob 33 | return actions, probs 34 | 35 | def learn(self, memory): 36 | for agent in self.agents: 37 | agent.learn(memory) 38 | -------------------------------------------------------------------------------- /mappo/mappo/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PPOMemory: 5 | def __init__(self, batch_size, T, n_agents, agents, n_procs, 6 | critic_dims, actor_dims, n_actions): 7 | 8 | self.states = np.zeros((T, n_procs, critic_dims), dtype=np.float32) 9 | self.rewards = np.zeros((T, n_procs, n_agents), dtype=np.float32) 10 | self.dones = np.zeros((T, n_procs), dtype=np.float32) 11 | self.new_states = np.zeros((T, n_procs, critic_dims), dtype=np.float32) 12 | 13 | self.actor_states = {a: np.zeros((T, n_procs, actor_dims[a])) 14 | for a in agents} 15 | self.actor_new_states = {a: np.zeros((T, n_procs, actor_dims[a])) 16 | for a in agents} 17 | self.actions = {a: np.zeros((T, n_procs, n_actions[a])) 18 | for a in agents} 19 | self.probs = {a: np.zeros((T, n_procs, n_actions[a])) 20 | for a in agents} 21 | 22 | self.mem_cntr = 0 23 | self.n_states = T 24 | self.n_procs = n_procs 25 | self.critic_dims = critic_dims 26 | self.actor_dims = actor_dims 27 | self.n_actions = n_actions 28 | self.n_agents = n_agents 29 | self.agents = agents 30 | self.batch_size = batch_size 31 | 32 | def recall(self): 33 | return self.actor_states, \ 34 | self.states, \ 35 | self.actions, \ 36 | self.probs, \ 37 | self.rewards, \ 38 | self.actor_new_states, \ 39 | self.new_states, \ 40 | self.dones 41 | 42 | def generate_batches(self): 43 | # batch_start = np.arange(0, n_states, self.batch_size) 44 | n_batches = int(self.n_states // self.batch_size) 45 | indices = np.arange(self.n_states, dtype=np.int64) 46 | np.random.shuffle(indices) 47 | # batches = [indices[i:i+self.batch_size] for i in batch_start] 48 | batches = [indices[i*self.batch_size:(i+1)*self.batch_size] 49 | for i in range(n_batches)] 50 | return batches 51 | 52 | def store_memory(self, raw_obs, state, action, probs, reward, 53 | raw_obs_, state_, done): 54 | index = self.mem_cntr % self.n_states 55 | self.states[index] = state 56 | self.new_states[index] = state_ 57 | self.dones[index] = done 58 | self.rewards[index] = reward 59 | 60 | for agent in self.agents: 61 | self.actions[agent][index] = action[agent] 62 | self.actor_states[agent][index] = raw_obs[agent] 63 | self.actor_new_states[agent][index] = raw_obs_[agent] 64 | self.probs[agent][index] = probs[agent] 65 | self.mem_cntr += 1 66 | 67 | def clear_memory(self): 68 | self.states = np.zeros((self.n_states, self.n_procs, self.critic_dims), 69 | dtype=np.float32) 70 | self.rewards = np.zeros((self.n_states, self.n_procs, self.n_agents), 71 | dtype=np.float32) 72 | self.dones = np.zeros((self.n_states, self.n_procs), dtype=np.float32) 73 | self.new_states = np.zeros((self.n_states, self.n_procs, 74 | self.critic_dims), dtype=np.float32) 75 | 76 | self.actor_states = {a: np.zeros( 77 | (self.n_states, self.n_procs, self.actor_dims[a])) 78 | for a in self.agents} 79 | self.actor_new_states = {a: np.zeros( 80 | (self.n_states, self.n_procs, self.actor_dims[a])) 81 | for a in self.agents} 82 | self.actions = {a: np.zeros( 83 | (self.n_states, self.n_procs, self.n_actions[a])) 84 | for a in self.agents} 85 | self.probs = {a: np.zeros( 86 | (self.n_states, self.n_procs, 
self.n_actions[a])) 87 | for a in self.agents} 88 | -------------------------------------------------------------------------------- /mappo/mappo/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Beta, Categorical 7 | 8 | 9 | class ContinuousActorNetwork(nn.Module): 10 | def __init__(self, n_actions, input_dims, alpha, 11 | fc1_dims=128, fc2_dims=128, chkpt_dir='models/', 12 | scenario=None): 13 | super(ContinuousActorNetwork, self).__init__() 14 | chkpt_dir += scenario 15 | if not os.path.exists(chkpt_dir): 16 | os.makedirs(chkpt_dir) 17 | self.checkpoint_file = os.path.join(chkpt_dir, 18 | 'actor_continuous_ppo') 19 | self.fc1 = nn.Linear(input_dims, fc1_dims) 20 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 21 | self.alpha = nn.Linear(fc2_dims, n_actions) 22 | self.beta = nn.Linear(fc2_dims, n_actions) 23 | 24 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | x = T.tanh(self.fc1(state)) 30 | x = T.tanh(self.fc2(x)) 31 | alpha = F.relu(self.alpha(x)) + 1.0 32 | beta = F.relu(self.beta(x)) + 1.0 33 | dist = Beta(alpha, beta) 34 | return dist 35 | 36 | def save_checkpoint(self): 37 | T.save(self.state_dict(), self.checkpoint_file) 38 | 39 | def load_checkpoint(self): 40 | self.load_state_dict(T.load(self.checkpoint_file)) 41 | 42 | 43 | class ContinuousCriticNetwork(nn.Module): 44 | def __init__(self, input_dims, alpha, 45 | fc1_dims=128, fc2_dims=128, chkpt_dir='models/', 46 | scenario=None): 47 | super(ContinuousCriticNetwork, self).__init__() 48 | chkpt_dir += scenario 49 | if not os.path.exists(chkpt_dir): 50 | os.makedirs(chkpt_dir) 51 | 52 | self.checkpoint_file = os.path.join(chkpt_dir, 53 | 'critic_continuous_ppo') 54 | self.fc1 = nn.Linear(input_dims, fc1_dims) 55 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 56 | self.v = nn.Linear(fc2_dims, 1) 57 | 58 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 59 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 60 | self.to(self.device) 61 | 62 | def forward(self, state): 63 | x = T.tanh(self.fc1(state)) 64 | x = T.tanh(self.fc2(x)) 65 | v = self.v(x) 66 | 67 | return v 68 | 69 | def save_checkpoint(self): 70 | T.save(self.state_dict(), self.checkpoint_file) 71 | 72 | def load_checkpoint(self): 73 | self.load_state_dict(T.load(self.checkpoint_file)) 74 | -------------------------------------------------------------------------------- /mappo/mappo/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mappo import MAPPO 3 | from memory import PPOMemory 4 | from utils import obs_list_to_state_vector 5 | from vec_env import make_vec_envs 6 | 7 | 8 | def run(): 9 | env_id = 'Simple_Speaker_Listener' 10 | random_seed = 0 11 | n_procs = 2 12 | env = make_vec_envs(env_id, random_seed, n_procs) 13 | N = 2048 14 | batch_size = 64 15 | n_epochs = 10 16 | alpha = 3e-4 17 | scenario = 'simple_speaker_listener' 18 | 19 | n_agents = env.max_num_agents 20 | 21 | actor_dims = {} 22 | n_actions = {} 23 | for agent in env.agents: 24 | actor_dims[agent] = env.observation_space(agent).shape[0] 25 | n_actions[agent] = env.action_space(agent).shape[0] 26 | critic_dims = sum([actor_dims[a] for a in env.agents]) 27 | 28 
| mappo_agents = MAPPO(actor_dims=actor_dims, critic_dims=critic_dims, 29 | n_agents=n_agents, n_actions=n_actions, 30 | n_epochs=n_epochs, env=env, gamma=0.95, alpha=alpha, 31 | T=N, n_procs=n_procs, scenario=scenario) 32 | 33 | memory = PPOMemory(batch_size, N, n_agents, env.agents, 34 | n_procs, critic_dims, actor_dims, n_actions) 35 | 36 | MAX_STEPS = 1_000_000 37 | total_steps = 0 38 | episode = 1 39 | traj_length = 0 40 | score_history, steps_history = [], [] 41 | 42 | while total_steps < MAX_STEPS: 43 | observation, _ = env.reset() 44 | terminal = [False] * n_procs 45 | score = [0] * n_procs 46 | while not any(terminal): 47 | a_p = [mappo_agents.choose_action(observation[idx]) for 48 | idx in range(n_procs)] 49 | action = [a[0] for a in a_p] 50 | prob = [a[1] for a in a_p] 51 | 52 | observation_, reward, done, trunc, info = env.step(action) 53 | 54 | 55 | 56 | 57 | total_steps += 1 58 | traj_length += 1 59 | 60 | done_arr = [list(d.values()) for d in done] 61 | obs_arr = [list(o.values()) for o in observation] 62 | reward_arr = [list(r.values()) for r in reward] 63 | new_obs_arr = [list(o.values()) for o in observation_] 64 | trunc_arr = [list(t.values()) for t in trunc] 65 | 66 | action_dict = {agent: [list(a[agent]) for a in action] 67 | for agent in env.agents} 68 | obs_dict = {agent: [list(o[agent]) for o in observation] 69 | for agent in env.agents} 70 | new_obs_dict = {agent: [list(o[agent]) for o in observation_] 71 | for agent in env.agents} 72 | probs_dict = {agent: [list(p[agent]) for p in prob] 73 | for agent in env.agents} 74 | 75 | state = obs_list_to_state_vector(obs_arr) 76 | state_ = obs_list_to_state_vector(new_obs_arr) 77 | 78 | score = [s + sum(r) for s, r in zip(score, reward_arr)] 79 | 80 | terminal = [any(d) or any(t) for d, t in zip(done_arr, trunc_arr)] 81 | mask = [0.0 if t else 1.0 for t in terminal] 82 | memory.store_memory(obs_dict, state, action_dict, 83 | probs_dict, reward_arr, 84 | new_obs_dict, state_, mask) 85 | 86 | if traj_length % N == 0: 87 | mappo_agents.learn(memory) 88 | traj_length = 0 89 | memory.clear_memory() 90 | observation = observation_ 91 | score_history.append(sum(score)/n_procs) 92 | steps_history.append(total_steps) 93 | avg_score = np.mean(score_history[-100:]) 94 | print(f'{env_id} Episode {episode} total steps {total_steps}' 95 | f' avg score {avg_score :.1f}') 96 | 97 | episode += 1 98 | 99 | np.save('data/mappo_scores.npy', np.array(score_history)) 100 | np.save('data/mappo_steps.npy', np.array(steps_history)) 101 | env.close() 102 | 103 | 104 | if __name__ == '__main__': 105 | run() 106 | -------------------------------------------------------------------------------- /mappo/mappo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | 12 | 13 | def obs_list_to_state_vector(observation): 14 | state = [] 15 | for row in observation: 16 | obs = np.array([]) 17 | for o in row: 18 | obs = np.concatenate([obs, o]) 19 | state.append(obs) 20 | return np.array(state) 21 | -------------------------------------------------------------------------------- /mappo/mappo/vec_env.py: 
-------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import multiprocessing as mp 3 | import gym 4 | import numpy as np 5 | import torch as T 6 | from pettingzoo.mpe import simple_speaker_listener_v4 7 | 8 | 9 | # based on: 10 | # https://github.com/openai/baselines/blob/master/baselines/common/vec_env/subproc_vec_env.py 11 | # and: 12 | # https://github.com/maximecb/gym-miniworld/blob/master/pytorch-a2c-ppo-acktr/vec_env/subproc_vec_env.py 13 | 14 | def worker(remote, parent_remote, env_fn_wrapper): 15 | parent_remote.close() 16 | env = env_fn_wrapper.x() 17 | while True: 18 | cmd, data = remote.recv() 19 | if cmd == 'step': 20 | ob, reward, done, trunc, info = env.step(data) 21 | remote.send((ob, reward, done, trunc, info)) 22 | elif cmd == 'reset': 23 | ob, info = env.reset() 24 | remote.send((ob, info)) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces': 29 | remote.send((env.observation_space, env.action_space)) 30 | elif cmd == 'max_num_agents': 31 | remote.send(env.max_num_agents) 32 | elif cmd == 'agents': 33 | remote.send(env.agents) 34 | else: 35 | raise NotImplementedError 36 | 37 | 38 | class SubprocVecEnv: 39 | def __init__(self, env_fns, spaces=None): 40 | self.waiting = False 41 | self.closed = False 42 | nenvs = len(env_fns) 43 | mp.set_start_method('forkserver') 44 | self.remotes, self.work_remotes = zip(*[mp.Pipe() 45 | for _ in range(nenvs)]) 46 | self.ps = [mp.Process(target=worker, args=(work_remote, remote, 47 | CloudpickleWrapper(env_fn))) 48 | for (work_remote, remote, env_fn) in 49 | zip(self.work_remotes, self.remotes, env_fns)] 50 | 51 | for p in self.ps: 52 | p.daemon = True 53 | p.start() 54 | 55 | for remote in self.work_remotes: 56 | remote.close() 57 | 58 | self.remotes[0].send(('get_spaces', None)) 59 | observation_space, action_space = self.remotes[0].recv() 60 | self.observation_space = observation_space 61 | self.action_space = action_space 62 | 63 | self.remotes[0].send(('reset', None)) 64 | _, _ = self.remotes[0].recv() 65 | 66 | self.remotes[0].send(('max_num_agents', None)) 67 | self.max_num_agents = self.remotes[0].recv() 68 | self.remotes[0].send(('agents', None)) 69 | self.agents = self.remotes[0].recv() 70 | 71 | def step_async(self, actions): 72 | assert not self.closed, "trying to operate after calling close()" 73 | for remote, action in zip(self.remotes, actions): 74 | remote.send(('step', action)) 75 | # self.waiting = True 76 | results = [remote.recv() for remote in self.remotes] 77 | obs, rews, dones, truncs, infos = zip(*results) 78 | return np.stack(obs), np.stack(rews), np.stack(dones), \ 79 | np.stack(truncs), infos 80 | """ 81 | def step_wait(self): 82 | assert not self.closed, "trying to operate after calling close()" 83 | results = [remote.recv() for remote in self.remotes] 84 | self.waiting = False 85 | obs, rews, dones, infos = zip(*results) 86 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 87 | """ 88 | def reset(self): 89 | assert not self.closed, "trying to operate after calling close()" 90 | for remote in self.remotes: 91 | remote.send(('reset', None)) 92 | obs_arr, info_arr = [], [] 93 | for remote in self.remotes: 94 | obs, info = remote.recv() 95 | obs_arr.append(obs) 96 | info_arr.append(info) 97 | return np.array(obs_arr), np.array(info_arr) 98 | 99 | def close_extras(self): 100 | if self.closed: 101 | return 102 | """ 103 | if self.waiting: 104 | for remote in self.remotes: 105 | remote.recv() 106 | """ 107 | for 
remote in self.remotes: 108 | remote.send(('close', None)) 109 | for p in self.ps: 110 | p.join() 111 | self.closed = True 112 | 113 | def close(self): 114 | if self.closed: 115 | return 116 | self.close_extras() 117 | self.closed = True 118 | 119 | def step(self, actions): 120 | # self.step_async(actions) 121 | obs, reward, dones, truncs, info = self.step_async(actions) 122 | return obs, reward, dones, truncs, info 123 | # return self.step_wait() 124 | 125 | def __del__(self): 126 | if not self.closed: 127 | self.close() 128 | 129 | 130 | class CloudpickleWrapper: 131 | def __init__(self, x): 132 | self.x = x 133 | 134 | def __getstate__(self): 135 | import cloudpickle 136 | return cloudpickle.dumps(self.x) 137 | 138 | def __setstate__(self, ob): 139 | import pickle 140 | self.x = pickle.loads(ob) 141 | 142 | 143 | def make_env(env_id, seed, rank): 144 | def _thunk(): 145 | env = simple_speaker_listener_v4.parallel_env( 146 | continuous_actions=True) 147 | _, _ = env.reset(seed=seed+rank) 148 | # env.seed(seed + rank) 149 | return env 150 | 151 | return _thunk 152 | 153 | 154 | def make_vec_envs(env_name, seed, num_processes): 155 | mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 156 | seed = seed + 10000 * mpi_rank if seed is not None else None 157 | set_global_seeds(seed) 158 | envs = [make_env(env_name, seed, i) for i in range(num_processes)] 159 | 160 | if len(envs) > 1: 161 | envs = SubprocVecEnv(envs) 162 | 163 | return envs 164 | 165 | 166 | def set_global_seeds(seed): 167 | import random 168 | np.random.seed(seed) 169 | random.seed(seed) 170 | T.manual_seed(seed) 171 | --------------------------------------------------------------------------------
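For reference, here is a minimal driver sketch (not part of the repo) showing how `make_vec_envs` can be exercised on its own, mirroring the calls made in mappo/run.py. It assumes it is launched from inside mappo/mappo/ with at least two worker processes; the `__main__` guard is needed because the workers are started with the forkserver method.

```python
# Hypothetical standalone driver for vec_env.make_vec_envs -- a sketch that
# mirrors run.py's usage, not an official entry point of this repo.
from vec_env import make_vec_envs

if __name__ == '__main__':
    n_procs = 2
    env = make_vec_envs('Simple_Speaker_Listener', seed=0,
                        num_processes=n_procs)
    obs, _ = env.reset()
    # One random action dict per worker copy of the environment.
    actions = [{agent: env.action_space(agent).sample()
                for agent in env.agents} for _ in range(n_procs)]
    obs_, rewards, dones, truncs, infos = env.step(actions)
    print(rewards)
    env.close()
```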