├── LICENSE ├── README.md ├── maddpg ├── agent.py ├── buffer.py ├── ddpg │ ├── agent.py │ ├── buffer.py │ ├── main.py │ ├── networks.py │ ├── run.py │ └── utils.py ├── env_test.py ├── maddpg.py ├── networks.py ├── plot.py ├── plots │ └── maddpg_vs_ddpg.png ├── run.py └── utils.py └── mappo └── mappo ├── agent.py ├── mappo.py ├── memory.py ├── networks.py ├── run.py ├── utils.py └── vec_env.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Phil Tabor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent-Reinforcement-Learning 2 | PyTorch implementations of MADDPG, MAPPO (coming) 3 | 4 | The implementation of MADDPG is compatible with PyTorch 1.13 and PettingZoo 1.23.1. 5 | 6 | I recommend using a virtual environment to install dependencies, as I can't guarantee 7 | that future versions (i.e. torch 2) won't break this implementation. 
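A quick way to catch a version mismatch before training (a minimal sketch, not part of the repo) is to compare the installed packages against the versions listed above:

```python
# Minimal version check -- hypothetical helper, not shipped with this repo.
import torch
import pettingzoo

TESTED = {'torch': '1.13', 'pettingzoo': '1.23.1'}

for name, module in (('torch', torch), ('pettingzoo', pettingzoo)):
    installed = module.__version__
    if not installed.startswith(TESTED[name]):
        print(f'warning: {name} {installed} installed; '
              f'this code was tested with {TESTED[name]}')
```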
8 | 9 | This code is part of my course on multi agent reinforcement learning, found on the 10 | Neuralnet Academy, which you can find here: https://www.neuralnet.ai/courses 11 | -------------------------------------------------------------------------------- /maddpg/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn.functional as F 4 | from networks import ActorNetwork, CriticNetwork 5 | 6 | 7 | class Agent: 8 | def __init__(self, actor_dims, critic_dims, n_actions, 9 | n_agents, agent_idx, chkpt_dir, min_action, 10 | max_action, alpha=1e-4, beta=1e-3, fc1=64, 11 | fc2=64, gamma=0.95, tau=0.01): 12 | self.gamma = gamma 13 | self.tau = tau 14 | self.n_actions = n_actions 15 | agent_name = 'agent_%s' % agent_idx 16 | self.agent_idx = agent_idx 17 | self.min_action = min_action 18 | self.max_action = max_action 19 | 20 | self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions, 21 | chkpt_dir=chkpt_dir, 22 | name=agent_name+'_actor') 23 | self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, 24 | n_actions, chkpt_dir=chkpt_dir, 25 | name=agent_name+'target__actor') 26 | 27 | self.critic = CriticNetwork(beta, critic_dims, fc1, fc2, 28 | chkpt_dir=chkpt_dir, 29 | name=agent_name+'_critic') 30 | self.target_critic = CriticNetwork(beta, critic_dims, fc1, fc2, 31 | chkpt_dir=chkpt_dir, 32 | name=agent_name+'_target__critic') 33 | 34 | self.update_network_parameters(tau=1) 35 | 36 | def choose_action(self, observation, evaluate=False): 37 | state = T.tensor(observation[np.newaxis, :], dtype=T.float, 38 | device=self.actor.device) 39 | actions = self.actor.forward(state) 40 | noise = T.randn(size=(self.n_actions,)).to(self.actor.device) 41 | noise *= T.tensor(1 - int(evaluate)) 42 | action = T.clamp(actions + noise, 43 | T.tensor(self.min_action, device=self.actor.device), 44 | T.tensor(self.max_action, device=self.actor.device)) 45 | return action.data.cpu().numpy()[0] 46 | 47 | def update_network_parameters(self, tau=None): 48 | tau = tau or self.tau 49 | 50 | src = self.actor 51 | dest = self.target_actor 52 | 53 | for param, target in zip(src.parameters(), dest.parameters()): 54 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 55 | 56 | src = self.critic 57 | dest = self.target_critic 58 | 59 | for param, target in zip(src.parameters(), dest.parameters()): 60 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 61 | 62 | def save_models(self): 63 | self.actor.save_checkpoint() 64 | self.target_actor.save_checkpoint() 65 | self.critic.save_checkpoint() 66 | self.target_critic.save_checkpoint() 67 | 68 | def load_models(self): 69 | self.actor.load_checkpoint() 70 | self.target_actor.load_checkpoint() 71 | self.critic.load_checkpoint() 72 | self.target_critic.load_checkpoint() 73 | 74 | def learn(self, memory, agent_list): 75 | if not memory.ready(): 76 | return 77 | 78 | actor_states, states, actions, rewards,\ 79 | actor_new_states, states_, dones = memory.sample_buffer() 80 | 81 | device = self.actor.device 82 | 83 | states = T.tensor(np.array(states), dtype=T.float, device=device) 84 | rewards = T.tensor(np.array(rewards), dtype=T.float, device=device) 85 | states_ = T.tensor(np.array(states_), dtype=T.float, device=device) 86 | dones = T.tensor(np.array(dones), device=device) 87 | 88 | actor_states = [T.tensor(actor_states[idx], 89 | device=device, dtype=T.float) 90 | for idx in range(len(agent_list))] 91 | actor_new_states = 
[T.tensor(actor_new_states[idx], 92 | device=device, dtype=T.float) 93 | for idx in range(len(agent_list))] 94 | actions = [T.tensor(actions[idx], device=device, dtype=T.float) 95 | for idx in range(len(agent_list))] 96 | 97 | with T.no_grad(): 98 | new_actions = T.cat([agent.target_actor(actor_new_states[idx]) 99 | for idx, agent in enumerate(agent_list)], 100 | dim=1) 101 | critic_value_ = self.target_critic.forward( 102 | states_, new_actions).squeeze() 103 | critic_value_[dones[:, self.agent_idx]] = 0.0 104 | target = rewards[:, self.agent_idx] + self.gamma * critic_value_ 105 | 106 | old_actions = T.cat([actions[idx] for idx in range(len(agent_list))], 107 | dim=1) 108 | critic_value = self.critic.forward(states, old_actions).squeeze() 109 | critic_loss = F.mse_loss(target, critic_value) 110 | 111 | self.critic.optimizer.zero_grad() 112 | critic_loss.backward() 113 | T.nn.utils.clip_grad_norm_(self.critic.parameters(), 10.0) 114 | self.critic.optimizer.step() 115 | 116 | actions[self.agent_idx] = self.actor.forward( 117 | actor_states[self.agent_idx]) 118 | actions = T.cat([actions[i] for i in range(len(agent_list))], dim=1) 119 | actor_loss = -self.critic.forward(states, actions).mean() 120 | self.actor.optimizer.zero_grad() 121 | actor_loss.backward() 122 | T.nn.utils.clip_grad_norm_(self.actor.parameters(), 10.0) 123 | self.actor.optimizer.step() 124 | 125 | self.update_network_parameters() 126 | -------------------------------------------------------------------------------- /maddpg/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class MultiAgentReplayBuffer: 5 | def __init__(self, max_size, critic_dims, actor_dims, 6 | n_actions, n_agents, batch_size): 7 | self.mem_size = max_size 8 | self.mem_cntr = 0 9 | self.n_agents = n_agents 10 | self.actor_dims = actor_dims 11 | self.batch_size = batch_size 12 | self.n_actions = n_actions 13 | 14 | self.state_memory = np.zeros((self.mem_size, critic_dims)) 15 | self.new_state_memory = np.zeros((self.mem_size, critic_dims)) 16 | self.reward_memory = np.zeros((self.mem_size, n_agents)) 17 | self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool) 18 | 19 | self.init_actor_memory() 20 | 21 | def init_actor_memory(self): 22 | self.actor_state_memory = [] 23 | self.actor_new_state_memory = [] 24 | self.actor_action_memory = [] 25 | 26 | for i in range(self.n_agents): 27 | self.actor_state_memory.append( 28 | np.zeros((self.mem_size, self.actor_dims[i]))) 29 | self.actor_new_state_memory.append( 30 | np.zeros((self.mem_size, self.actor_dims[i]))) 31 | self.actor_action_memory.append( 32 | np.zeros((self.mem_size, self.n_actions[i]))) 33 | 34 | def store_transition(self, raw_obs, state, action, reward, 35 | raw_obs_, state_, done): 36 | 37 | index = self.mem_cntr % self.mem_size 38 | for agent_idx in range(self.n_agents): 39 | self.actor_state_memory[agent_idx][index] = raw_obs[agent_idx] 40 | self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_idx] 41 | self.actor_action_memory[agent_idx][index] = action[agent_idx] 42 | 43 | self.state_memory[index] = state 44 | self.new_state_memory[index] = state_ 45 | self.reward_memory[index] = reward 46 | self.terminal_memory[index] = done 47 | self.mem_cntr += 1 48 | 49 | def sample_buffer(self): 50 | max_mem = min(self.mem_cntr, self.mem_size) 51 | 52 | batch = np.random.choice(max_mem, self.batch_size, replace=False) 53 | 54 | states = self.state_memory[batch] 55 | states_ = 
self.new_state_memory[batch] 56 | rewards = self.reward_memory[batch] 57 | terminal = self.terminal_memory[batch] 58 | 59 | actor_states = [] 60 | actor_new_states = [] 61 | actions = [] 62 | for agent_idx in range(self.n_agents): 63 | actor_states.append(self.actor_state_memory[agent_idx][batch]) 64 | actor_new_states.append( 65 | self.actor_new_state_memory[agent_idx][batch]) 66 | actions.append(self.actor_action_memory[agent_idx][batch]) 67 | 68 | return actor_states, states, actions, rewards, \ 69 | actor_new_states, states_, terminal 70 | 71 | def ready(self): 72 | if self.mem_cntr >= self.batch_size: 73 | return True 74 | return False 75 | -------------------------------------------------------------------------------- /maddpg/ddpg/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn.functional as F 4 | from networks import ActorNetwork, CriticNetwork 5 | from buffer import ReplayBuffer 6 | 7 | 8 | class Agent: 9 | def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99, 10 | max_size=1000000, fc1_dims=400, fc2_dims=300, 11 | batch_size=64): 12 | self.gamma = gamma 13 | self.tau = tau 14 | self.batch_size = batch_size 15 | self.alpha = alpha 16 | self.beta = beta 17 | self.n_actions = n_actions 18 | 19 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 20 | 21 | self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 22 | n_actions=n_actions, name='actor') 23 | self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 24 | n_actions=n_actions, name='critic') 25 | 26 | self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 27 | n_actions=n_actions, 28 | name='target_actor') 29 | 30 | self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, 31 | fc2_dims, n_actions=n_actions, 32 | name='target_critic') 33 | 34 | self.update_network_parameters(tau=1) 35 | 36 | def choose_action(self, observation, eval=False): 37 | state = T.tensor(observation[np.newaxis, :], dtype=T.float, 38 | device=self.actor.device) 39 | mu = self.actor.forward(state).to(self.actor.device) 40 | noise = T.rand(self.n_actions).to(self.actor.device) 41 | noise *= T.tensor(1 - int(eval)) 42 | mu_prime = mu + noise 43 | mu_prime = T.clamp(mu_prime, 0., 1.) 
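        # T.rand draws uniform exploration noise in [0, 1); the (1 - int(eval))
        # factor disables it during evaluation, and the clamp keeps the action
        # inside the [0, 1] range produced by the sigmoid-bounded ActorNetwork.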
44 | 45 | return mu_prime.cpu().detach().numpy()[0] 46 | 47 | def remember(self, state, action, reward, state_, done): 48 | self.memory.store_transition(state, action, reward, state_, done) 49 | 50 | def save_models(self): 51 | self.actor.save_checkpoint() 52 | self.target_actor.save_checkpoint() 53 | self.critic.save_checkpoint() 54 | self.target_critic.save_checkpoint() 55 | 56 | def load_models(self): 57 | self.actor.load_checkpoint() 58 | self.target_actor.load_checkpoint() 59 | self.critic.load_checkpoint() 60 | self.target_critic.load_checkpoint() 61 | 62 | def learn(self): 63 | if self.memory.mem_cntr < self.batch_size: 64 | return 65 | 66 | states, actions, rewards, states_, done = \ 67 | self.memory.sample_buffer(self.batch_size) 68 | 69 | states = T.tensor(states, dtype=T.float).to(self.actor.device) 70 | states_ = T.tensor(states_, dtype=T.float).to(self.actor.device) 71 | actions = T.tensor(actions, dtype=T.float).to(self.actor.device) 72 | rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device) 73 | done = T.tensor(done).to(self.actor.device) 74 | 75 | target_actions = self.target_actor.forward(states_) 76 | critic_value_ = self.target_critic.forward(states_, target_actions) 77 | critic_value = self.critic.forward(states, actions) 78 | 79 | critic_value_[done] = 0.0 80 | critic_value_ = critic_value_.view(-1) 81 | 82 | target = rewards + self.gamma*critic_value_ 83 | target = target.view(self.batch_size, 1) 84 | 85 | self.critic.optimizer.zero_grad() 86 | critic_loss = F.mse_loss(target, critic_value) 87 | critic_loss.backward() 88 | self.critic.optimizer.step() 89 | 90 | self.actor.optimizer.zero_grad() 91 | actor_loss = -self.critic.forward(states, self.actor.forward(states)) 92 | actor_loss = T.mean(actor_loss) 93 | actor_loss.backward() 94 | self.actor.optimizer.step() 95 | 96 | self.update_network_parameters() 97 | 98 | def update_network_parameters(self, tau=None): 99 | tau = tau or self.tau 100 | src = self.actor 101 | dest = self.target_actor 102 | for param, target in zip(src.parameters(), dest.parameters()): 103 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 104 | src = self.critic 105 | dest = self.target_critic 106 | for param, target in zip(src.parameters(), dest.parameters()): 107 | target.data.copy_(tau * param.data + (1 - tau) * target.data) 108 | -------------------------------------------------------------------------------- /maddpg/ddpg/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayBuffer: 5 | def __init__(self, max_size, input_shape, n_actions): 6 | self.mem_size = max_size 7 | self.mem_cntr = 0 8 | self.state_memory = np.zeros((self.mem_size, input_shape)) 9 | self.new_state_memory = np.zeros((self.mem_size, input_shape)) 10 | self.action_memory = np.zeros((self.mem_size, n_actions)) 11 | self.reward_memory = np.zeros(self.mem_size) 12 | self.terminal_memory = np.zeros(self.mem_size, dtype=bool) 13 | 14 | def store_transition(self, state, action, reward, state_, done): 15 | index = self.mem_cntr % self.mem_size 16 | self.state_memory[index] = state 17 | self.action_memory[index] = action 18 | self.reward_memory[index] = reward 19 | self.new_state_memory[index] = state_ 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | 
actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /maddpg/ddpg/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from pettingzoo.mpe import simple_speaker_listener_v4 4 | 5 | 6 | def obs_list_to_state_vector(observation): 7 | state = np.array([]) 8 | for obs in observation: 9 | state = np.concatenate([state, obs]) 10 | return state 11 | 12 | 13 | if __name__ == '__main__': 14 | env = simple_speaker_listener_v4.parallel_env( 15 | continuous_actions=True) 16 | scenario = 'simple_speaker_listener' 17 | 18 | initial_temp = env.reset() 19 | n_agents = env.max_num_agents 20 | 21 | agents = [] 22 | 23 | for agent in env.agents: 24 | input_dims = env.observation_space(agent).shape[0] 25 | n_actions = env.action_space(agent).shape[0] 26 | 27 | agents.append(Agent(alpha=1e-3, beta=1e-3, 28 | input_dims=input_dims, tau=0.01, gamma=0.95, 29 | batch_size=1024, fc1_dims=64, fc2_dims=64, 30 | n_actions=n_actions)) 31 | 32 | N_GAMES = 25_000 33 | PRINT_INTERVAL = 500 34 | total_steps = 0 35 | score_history = [] 36 | evaluate = False 37 | best_score = 0 38 | 39 | if evaluate: 40 | for agent in agents: 41 | agent.load_models() 42 | 43 | total_steps = 0 44 | 45 | for i in range(N_GAMES): 46 | observation, _ = env.reset() 47 | terminal = [False] * n_agents 48 | score = 0 49 | observation = list(observation.values()) 50 | 51 | while not any(terminal): 52 | action = [agent.choose_action(observation[idx]) 53 | for idx, agent in enumerate(agents)] 54 | action = {agent: act for agent, act in zip(env.agents, action)} 55 | observation_, reward, done, trunc, info = env.step(action) 56 | 57 | observation_ = list(observation_.values()) 58 | reward = list(reward.values()) 59 | done = list(done.values()) 60 | trunc = list(trunc.values()) 61 | action = list(action.values()) 62 | 63 | terminal = [d or t for d, t in zip(done, trunc)] 64 | 65 | for idx, agent in enumerate(agents): 66 | agent.remember(observation[idx], action[idx], 67 | reward[idx], observation_[idx], terminal[idx]) 68 | if total_steps % 100 == 0 and not evaluate: 69 | for agent in agents: 70 | agent.learn() 71 | score += sum(reward) 72 | observation = observation_ 73 | total_steps += 1 74 | score_history.append(score) 75 | avg_score = np.mean(score_history[-100:]) 76 | 77 | if avg_score > best_score: 78 | best_score = avg_score 79 | # agent.save_models() 80 | if i % PRINT_INTERVAL == 0 and i > 0: 81 | print(f'episode {i} avg score {avg_score:.1f}') 82 | -------------------------------------------------------------------------------- /maddpg/ddpg/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, 10 | n_actions, name, chkpt_dir='tmp/ddpg'): 11 | super(CriticNetwork, self).__init__() 12 | 13 | self.chkpt_file = os.path.join(chkpt_dir, name) 14 | self.fc1 = nn.Linear(input_dims+n_actions, fc1_dims) 15 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 16 | self.q = nn.Linear(fc2_dims, 1) 17 | 18 | self.optimizer = optim.Adam(self.parameters(), 
lr=beta) 19 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 20 | 21 | self.to(self.device) 22 | 23 | def forward(self, state, action): 24 | x = F.relu(self.fc1(T.cat([state, action], dim=1))) 25 | x = F.relu(self.fc2(x)) 26 | q = self.q(x) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.chkpt_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.chkpt_file)) 35 | 36 | 37 | class ActorNetwork(nn.Module): 38 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, 39 | n_actions, name, chkpt_dir='tmp/ddpg'): 40 | super(ActorNetwork, self).__init__() 41 | 42 | self.chkpt_file = os.path.join(chkpt_dir, name) 43 | 44 | self.fc1 = nn.Linear(input_dims, fc1_dims) 45 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 46 | 47 | self.pi = nn.Linear(fc2_dims, n_actions) 48 | 49 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 50 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 51 | 52 | self.to(self.device) 53 | 54 | def forward(self, state): 55 | x = self.fc1(state) 56 | x = F.relu(x) 57 | x = F.relu(self.fc2(x)) 58 | pi = T.sigmoid(self.pi(x)) 59 | 60 | return pi 61 | 62 | def save_checkpoint(self): 63 | T.save(self.state_dict(), self.chkpt_file) 64 | 65 | def load_checkpoint(self): 66 | self.load_state_dict(T.load(self.chkpt_file)) 67 | -------------------------------------------------------------------------------- /maddpg/ddpg/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from pettingzoo.mpe import simple_speaker_listener_v4 4 | 5 | 6 | def obs_list_to_state_vector(observation): 7 | state = np.array([]) 8 | for obs in observation: 9 | state = np.concatenate([state, obs]) 10 | return state 11 | 12 | 13 | def run(): 14 | parallel_env = simple_speaker_listener_v4.parallel_env( 15 | continuous_actions=True) 16 | _, _ = parallel_env.reset() 17 | n_agents = parallel_env.max_num_agents 18 | 19 | n_actions = [] 20 | agents = [] 21 | 22 | for agent in parallel_env.agents: 23 | input_dims = parallel_env.observation_space(agent).shape[0] 24 | n_actions = parallel_env.action_space(agent).shape[0] 25 | 26 | agents.append(Agent(input_dims=input_dims, n_actions=n_actions, 27 | gamma=0.95, tau=0.01, alpha=1e-4, beta=1e-3)) 28 | 29 | EVAL_INTERVAL = 1000 30 | MAX_STEPS = 10_000 31 | 32 | total_steps = 0 33 | episode = 0 34 | 35 | eval_scores = [] 36 | eval_steps = [] 37 | score = evaluate(agents, parallel_env, episode, total_steps) 38 | eval_scores.append(score) 39 | eval_steps.append(total_steps) 40 | 41 | while total_steps < MAX_STEPS: 42 | obs, _ = parallel_env.reset() 43 | terminal = [False] * n_agents 44 | obs = list(obs.values()) 45 | while not any(terminal): 46 | action = [agent.choose_action(obs[idx]) 47 | for idx, agent in enumerate(agents)] 48 | action = {agent: act 49 | for agent, act in zip(parallel_env.agents, action)} 50 | obs_, reward, done, truncated, info = parallel_env.step(action) 51 | list_done = list(done.values()) 52 | list_reward = list(reward.values()) 53 | list_action = list(action.values()) 54 | obs_ = list(obs_.values()) 55 | list_trunc = list(truncated.values()) 56 | 57 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 58 | 59 | for idx, agent in enumerate(agents): 60 | agent.remember(obs[idx], list_action[idx], 61 | list_reward[idx], obs_[idx], terminal[idx]) 62 | 63 | if total_steps % 100 == 0: 64 | for agent in agents: 65 | agent.learn() 66 | obs = obs_ 67 
| total_steps += 1 68 | 69 | if total_steps % EVAL_INTERVAL == 0 and total_steps > 0: 70 | score = evaluate(agents, parallel_env, episode, total_steps) 71 | eval_scores.append(score) 72 | eval_steps.append(total_steps) 73 | 74 | episode += 1 75 | 76 | np.save('../data/ddpg_scores.npy', np.array(eval_scores)) 77 | np.save('../data/ddpg_steps.npy', np.array(eval_steps)) 78 | 79 | 80 | def evaluate(agents, env, ep, step): 81 | score_history = [] 82 | for i in range(3): 83 | obs, _ = env.reset() 84 | score = 0 85 | terminal = [False] * env.max_num_agents 86 | obs = list(obs.values()) 87 | while not any(terminal): 88 | action = [agent.choose_action(obs[idx], eval=True) 89 | for idx, agent in enumerate(agents)] 90 | action = {agent: act 91 | for agent, act in zip(env.agents, action)} 92 | 93 | obs_, reward, done, truncated, info = env.step(action) 94 | obs_ = list(obs_.values()) 95 | list_trunc = list(truncated.values()) 96 | list_reward = list(reward.values()) 97 | list_done = list(done.values()) 98 | 99 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 100 | 101 | obs = obs_ 102 | score += sum(list_reward) 103 | score_history.append(score) 104 | avg_score = np.mean(score_history) 105 | print(f'Evaluation episode {ep} train steps {step}' 106 | f' average score {avg_score:.1f}') 107 | 108 | return avg_score 109 | 110 | 111 | if __name__ == '__main__': 112 | run() 113 | -------------------------------------------------------------------------------- /maddpg/ddpg/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /maddpg/env_test.py: -------------------------------------------------------------------------------- 1 | from pettingzoo.mpe import simple_speaker_listener_v4 2 | 3 | 4 | env = simple_speaker_listener_v4.parallel_env(continuous_actions=True) 5 | _, _ = env.reset() 6 | for agent in env.agents: 7 | print(f'agent observation space {env.observation_space(agent)}') 8 | obs, info = env.reset() 9 | print(f'initial observation: {obs} debug info: {info}') 10 | terminal = [False] * env.max_num_agents 11 | while not any(terminal): 12 | actions = {} 13 | for agent in env.agents: 14 | actions[agent] = env.action_space(agent).sample() 15 | obs_, reward, done, trunc, info = env.step(actions) 16 | terminal = [d or t for d, t in zip(done.values(), trunc.values())] 17 | print(f'actions taken {actions}') 18 | print(f'obs values {obs.values()}') 19 | obs = list(obs.values()) 20 | print(f'obs as a list {obs}') 21 | -------------------------------------------------------------------------------- /maddpg/maddpg.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | 3 | 4 | class MADDPG: 5 | def __init__(self, actor_dims, critic_dims, n_agents, n_actions, env, 6 | alpha=1e-4, beta=1e-3, fc1=64, fc2=64, gamma=0.95, tau=0.01, 7 | chkpt_dir='tmp/maddpg/', scenario='co-op_navigation'): 8 | self.agents = [] 9 | chkpt_dir += scenario 10 | for agent_idx in range(n_agents): 11 | agent = list(env.action_spaces.keys())[agent_idx] 12 | min_action = 
env.action_space(agent).low 13 | max_action = env.action_space(agent).high 14 | self.agents.append(Agent(actor_dims[agent_idx], critic_dims, 15 | n_actions[agent_idx], n_agents, agent_idx, 16 | alpha=alpha, beta=beta, tau=tau, fc1=fc1, 17 | fc2=fc2, chkpt_dir=chkpt_dir, 18 | gamma=gamma, min_action=min_action, 19 | max_action=max_action)) 20 | 21 | def save_checkpoint(self): 22 | for agent in self.agents: 23 | agent.save_models() 24 | 25 | def load_checkpoint(self): 26 | for agent in self.agents: 27 | agent.load_models() 28 | 29 | def choose_action(self, raw_obs, evaluate=False): 30 | actions = {} 31 | for agent_id, agent in zip(raw_obs, self.agents): 32 | action = agent.choose_action(raw_obs[agent_id], evaluate) 33 | actions[agent_id] = action 34 | return actions 35 | 36 | def learn(self, memory): 37 | for agent in self.agents: 38 | agent.learn(memory, self.agents) 39 | -------------------------------------------------------------------------------- /maddpg/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1, fc2, 10 | name, chkpt_dir): 11 | super(CriticNetwork, self).__init__() 12 | 13 | self.chkpt_file = os.path.join(chkpt_dir, name) 14 | self.fc1 = nn.Linear(input_dims, fc1) 15 | self.fc2 = nn.Linear(fc1, fc2) 16 | self.q = nn.Linear(fc2, 1) 17 | 18 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 19 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 20 | 21 | self.to(self.device) 22 | 23 | def forward(self, state, action): 24 | x = F.relu(self.fc1(T.cat([state, action], dim=1))) 25 | x = F.relu(self.fc2(x)) 26 | q = self.q(x) 27 | 28 | return q 29 | 30 | def save_checkpoint(self): 31 | T.save(self.state_dict(), self.chkpt_file) 32 | 33 | def load_checkpoint(self): 34 | self.load_state_dict(T.load(self.chkpt_file)) 35 | 36 | 37 | class ActorNetwork(nn.Module): 38 | def __init__(self, alpha, input_dims, fc1, fc2, 39 | n_actions, name, chkpt_dir): 40 | super(ActorNetwork, self).__init__() 41 | 42 | self.chkpt_file = os.path.join(chkpt_dir, name) 43 | 44 | self.fc1 = nn.Linear(input_dims, fc1) 45 | self.fc2 = nn.Linear(fc1, fc2) 46 | self.pi = nn.Linear(fc2, n_actions) 47 | 48 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 49 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 50 | 51 | self.to(self.device) 52 | 53 | def forward(self, state): 54 | x = F.relu(self.fc1(state)) 55 | x = F.relu(self.fc2(x)) 56 | pi = T.tanh(self.pi(x)) 57 | 58 | return pi 59 | 60 | def save_checkpoint(self): 61 | T.save(self.state_dict(), self.chkpt_file) 62 | 63 | def load_checkpoint(self): 64 | self.load_state_dict(T.load(self.chkpt_file)) 65 | -------------------------------------------------------------------------------- /maddpg/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import plot_learning_curve 3 | 4 | maddpg_scores = np.load('data/maddpg_scores.npy') 5 | maddpg_steps = np.load('data/maddpg_steps.npy') 6 | 7 | ddpg_scores = np.load('data/ddpg_scores.npy') 8 | ddpg_steps = np.load('data/ddpg_steps.npy') 9 | 10 | plot_learning_curve(x=maddpg_steps, 11 | scores=(maddpg_scores, ddpg_scores), 12 | filename='plots/maddpg_vs_ddpg.png') 13 | 
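# This script assumes maddpg/run.py has written data/maddpg_scores.npy and
# data/maddpg_steps.npy, and that ddpg/run.py (which saves to '../data/') has
# written data/ddpg_scores.npy and data/ddpg_steps.npy into the same directory;
# run it from the maddpg/ directory so the relative paths resolve.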
-------------------------------------------------------------------------------- /maddpg/plots/maddpg_vs_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Multi-Agent-Reinforcement-Learning/d236304b5bd06b9efc276b24aa0e2890fd5bf65d/maddpg/plots/maddpg_vs_ddpg.png -------------------------------------------------------------------------------- /maddpg/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maddpg import MADDPG 3 | from buffer import MultiAgentReplayBuffer 4 | from pettingzoo.mpe import simple_speaker_listener_v4 5 | 6 | 7 | def obs_list_to_state_vector(observation): 8 | state = np.array([]) 9 | for obs in observation: 10 | state = np.concatenate([state, obs]) 11 | return state 12 | 13 | 14 | def run(): 15 | parallel_env = simple_speaker_listener_v4.parallel_env( 16 | continuous_actions=True) 17 | _, _ = parallel_env.reset() 18 | n_agents = parallel_env.max_num_agents 19 | 20 | actor_dims = [] 21 | n_actions = [] 22 | for agent in parallel_env.agents: 23 | actor_dims.append(parallel_env.observation_space(agent).shape[0]) 24 | n_actions.append(parallel_env.action_space(agent).shape[0]) 25 | critic_dims = sum(actor_dims) + sum(n_actions) 26 | 27 | maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions, 28 | env=parallel_env, gamma=0.95, alpha=1e-4, beta=1e-3) 29 | critic_dims = sum(actor_dims) 30 | memory = MultiAgentReplayBuffer(1_000_000, critic_dims, actor_dims, 31 | n_actions, n_agents, batch_size=1024) 32 | 33 | EVAL_INTERVAL = 1000 34 | MAX_STEPS = 10_000 35 | 36 | total_steps = 0 37 | episode = 0 38 | eval_scores = [] 39 | eval_steps = [] 40 | 41 | score = evaluate(maddpg_agents, parallel_env, episode, total_steps) 42 | eval_scores.append(score) 43 | eval_steps.append(total_steps) 44 | 45 | while total_steps < MAX_STEPS: 46 | obs, _ = parallel_env.reset() 47 | terminal = [False] * n_agents 48 | while not any(terminal): 49 | actions = maddpg_agents.choose_action(obs) 50 | 51 | obs_, reward, done, trunc, info = parallel_env.step(actions) 52 | 53 | list_done = list(done.values()) 54 | list_obs = list(obs.values()) 55 | list_reward = list(reward.values()) 56 | list_actions = list(actions.values()) 57 | list_obs_ = list(obs_.values()) 58 | list_trunc = list(trunc.values()) 59 | 60 | state = obs_list_to_state_vector(list_obs) 61 | state_ = obs_list_to_state_vector(list_obs_) 62 | 63 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 64 | memory.store_transition(list_obs, state, list_actions, list_reward, 65 | list_obs_, state_, terminal) 66 | 67 | if total_steps % 100 == 0: 68 | maddpg_agents.learn(memory) 69 | obs = obs_ 70 | total_steps += 1 71 | 72 | if total_steps % EVAL_INTERVAL == 0: 73 | score = evaluate(maddpg_agents, parallel_env, episode, total_steps) 74 | eval_scores.append(score) 75 | eval_steps.append(total_steps) 76 | 77 | episode += 1 78 | 79 | np.save('data/maddpg_scores.npy', np.array(eval_scores)) 80 | np.save('data/maddpg_steps.npy', np.array(eval_steps)) 81 | 82 | 83 | def evaluate(agents, env, ep, step, n_eval=3): 84 | score_history = [] 85 | for i in range(n_eval): 86 | obs, _ = env.reset() 87 | score = 0 88 | terminal = [False] * env.max_num_agents 89 | while not any(terminal): 90 | actions = agents.choose_action(obs, evaluate=True) 91 | obs_, reward, done, trunc, info = env.step(actions) 92 | 93 | list_trunc = list(trunc.values()) 94 | list_reward = list(reward.values()) 95 | 
list_done = list(done.values()) 96 | 97 | terminal = [d or t for d, t in zip(list_done, list_trunc)] 98 | 99 | obs = obs_ 100 | score += sum(list_reward) 101 | score_history.append(score) 102 | avg_score = np.mean(score_history) 103 | print(f'Evaluation episode {ep} train steps {step}' 104 | f' average score {avg_score:.1f}') 105 | return avg_score 106 | 107 | 108 | if __name__ == '__main__': 109 | run() 110 | -------------------------------------------------------------------------------- /maddpg/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_learning_curve(x, scores, filename, lines=None): 6 | maddpg_scores, ddpg_scores = scores 7 | 8 | fig = plt.figure() 9 | ax = fig.add_subplot(111, label="1") 10 | ax2 = fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | N = len(maddpg_scores) 13 | running_avg = np.empty(N) 14 | for t in range(N): 15 | running_avg[t] = np.mean( 16 | maddpg_scores[max(0, t-100):(t+1)]) 17 | 18 | ax.plot(x, running_avg, color="C0") 19 | ax.set_xlabel("Training Steps", color="C0") 20 | ax.set_ylabel("MADDPG Score", color="C0") 21 | ax.tick_params(axis='x', colors="C0") 22 | ax.tick_params(axis='y', colors="C0") 23 | 24 | N = len(ddpg_scores) 25 | running_avg = np.empty(N) 26 | for t in range(N): 27 | running_avg[t] = np.mean(ddpg_scores[max(0, t-100):(t+1)]) 28 | 29 | ax2.plot(x, running_avg, color="C1") 30 | ax2.axes.get_xaxis().set_visible(False) 31 | ax2.yaxis.tick_right() 32 | ax2.set_ylabel('DDPG Score', color="C1") 33 | ax2.yaxis.set_label_position('right') 34 | ax2.tick_params(axis='y', colors="C1") 35 | 36 | if lines is not None: 37 | for line in lines: 38 | plt.axvline(x=line) 39 | 40 | plt.savefig(filename) 41 | -------------------------------------------------------------------------------- /mappo/mappo/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from networks import ContinuousActorNetwork, ContinuousCriticNetwork 4 | 5 | 6 | class Agent: 7 | def __init__(self, actor_dims, critic_dims, 8 | n_actions, agent_idx, agent_name, 9 | gamma=0.99, alpha=3e-4, T=2048, 10 | gae_lambda=0.95, policy_clip=0.2, 11 | batch_size=64, n_epochs=10, 12 | n_procs=8, chkpt_dir=None, 13 | scenario=None): 14 | self.gamma = gamma 15 | self.policy_clip = policy_clip 16 | self.n_epochs = n_epochs 17 | self.gae_lambda = gae_lambda 18 | self.entropy_coefficient = 1e-3 19 | self.agent_idx = agent_idx 20 | self.agent_name = agent_name 21 | self.n_procs = n_procs 22 | 23 | self.actor = ContinuousActorNetwork(n_actions, actor_dims, alpha, 24 | chkpt_dir=chkpt_dir, 25 | scenario=scenario) 26 | self.critic = ContinuousCriticNetwork(critic_dims, alpha, 27 | chkpt_dir=chkpt_dir, 28 | scenario=scenario) 29 | self.n_actions = n_actions 30 | 31 | def save_models(self): 32 | self.actor.save_checkpoint() 33 | self.critic.save_checkpoint() 34 | 35 | def load_models(self): 36 | self.actor.load_checkpoint() 37 | self.critic.load_checkpoint() 38 | 39 | def choose_action(self, observation): 40 | with T.no_grad(): 41 | state = T.tensor(observation, dtype=T.float, 42 | device=self.actor.device) 43 | 44 | dist = self.actor(state) 45 | action = dist.sample() 46 | probs = dist.log_prob(action) 47 | return action.cpu().numpy(), probs.cpu().numpy() 48 | 49 | def calc_adv_and_returns(self, memories): 50 | states, new_states, r, dones = memories 51 | with T.no_grad(): 52 | values = 
self.critic(states).squeeze() 53 | values_ = self.critic(new_states).squeeze() 54 | deltas = r[:, :, self.agent_idx] + self.gamma * values_ - values 55 | deltas = deltas.cpu().numpy() 56 | adv = [0] 57 | for step in reversed(range(deltas.shape[0])): 58 | advantage = deltas[step] +\ 59 | self.gamma*self.gae_lambda*adv[-1]*np.array(dones[step]) 60 | adv.append(advantage) 61 | adv.reverse() 62 | adv = np.array(adv[:-1]) 63 | adv = T.tensor(adv, device=self.critic.device).unsqueeze(2) 64 | returns = adv + values.unsqueeze(2) 65 | adv = (adv - adv.mean()) / (adv.std()+1e-4) 66 | return adv, returns 67 | 68 | def learn(self, memory): 69 | actor_states, states, actions, old_probs, rewards, actor_new_states, \ 70 | states_, dones = memory.recall() 71 | device = self.critic.device 72 | state_arr = T.tensor(states, dtype=T.float, device=device) 73 | states__arr = T.tensor(states_, dtype=T.float, device=device) 74 | r = T.tensor(rewards, dtype=T.float, device=device) 75 | action_arr = T.tensor(actions[self.agent_name], 76 | dtype=T.float, device=device) 77 | old_probs_arr = T.tensor(old_probs[self.agent_name], dtype=T.float, 78 | device=device) 79 | actor_states_arr = T.tensor(actor_states[self.agent_name], 80 | dtype=T.float, device=device) 81 | adv, returns = self.calc_adv_and_returns((state_arr, states__arr, 82 | r, dones)) 83 | for epoch in range(self.n_epochs): 84 | batches = memory.generate_batches() 85 | for batch in batches: 86 | old_probs = old_probs_arr[batch] 87 | actions = action_arr[batch] 88 | actor_states = actor_states_arr[batch] 89 | dist = self.actor(actor_states) 90 | new_probs = dist.log_prob(actions) 91 | prob_ratio = T.exp(new_probs.sum(2, keepdims=True) - old_probs. 92 | sum(2, keepdims=True)) 93 | weighted_probs = adv[batch] * prob_ratio 94 | weighted_clipped_probs = T.clamp( 95 | prob_ratio, 1-self.policy_clip, 1+self.policy_clip) * \ 96 | adv[batch] 97 | entropy = dist.entropy().sum(2, keepdims=True) 98 | actor_loss = -T.min(weighted_probs, 99 | weighted_clipped_probs) 100 | actor_loss -= self.entropy_coefficient * entropy 101 | self.actor.optimizer.zero_grad() 102 | actor_loss.mean().backward() 103 | T.nn.utils.clip_grad_norm_(self.actor.parameters(), 40) 104 | self.actor.optimizer.step() 105 | 106 | states = state_arr[batch] 107 | critic_value = self.critic(states).squeeze() 108 | critic_loss = \ 109 | (critic_value - returns[batch].squeeze()).pow(2).mean() 110 | self.critic.optimizer.zero_grad() 111 | critic_loss.backward() 112 | self.critic.optimizer.step() 113 | -------------------------------------------------------------------------------- /mappo/mappo/mappo.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | 3 | 4 | class MAPPO: 5 | def __init__(self, actor_dims, critic_dims, n_agents, n_actions, 6 | env, T, n_procs, n_epochs, 7 | alpha=1e-4, gamma=0.95, chkpt_dir='tmp/mappo/', 8 | scenario='co-op_navigation'): 9 | self.agents = [] 10 | chkpt_dir += scenario 11 | for agent_idx, agent in enumerate(env.agents): 12 | self.agents.append(Agent(actor_dims[agent], critic_dims, 13 | n_actions[agent], agent_idx, 14 | alpha=alpha, chkpt_dir=chkpt_dir, 15 | gamma=gamma, agent_name=agent, 16 | scenario=scenario)) 17 | 18 | def save_checkpoint(self): 19 | for agent in self.agents: 20 | agent.save_models() 21 | 22 | def load_checkpoint(self): 23 | for agent in self.agents: 24 | agent.load_models() 25 | 26 | def choose_action(self, raw_obs): 27 | actions = {} 28 | probs = {} 29 | for agent_id, agent in zip(raw_obs, 
self.agents): 30 | action, prob = agent.choose_action(raw_obs[agent_id]) 31 | actions[agent_id] = action 32 | probs[agent_id] = prob 33 | return actions, probs 34 | 35 | def learn(self, memory): 36 | for agent in self.agents: 37 | agent.learn(memory) 38 | -------------------------------------------------------------------------------- /mappo/mappo/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PPOMemory: 5 | def __init__(self, batch_size, T, n_agents, agents, n_procs, 6 | critic_dims, actor_dims, n_actions): 7 | 8 | self.states = np.zeros((T, n_procs, critic_dims), dtype=np.float32) 9 | self.rewards = np.zeros((T, n_procs, n_agents), dtype=np.float32) 10 | self.dones = np.zeros((T, n_procs), dtype=np.float32) 11 | self.new_states = np.zeros((T, n_procs, critic_dims), dtype=np.float32) 12 | 13 | self.actor_states = {a: np.zeros((T, n_procs, actor_dims[a])) 14 | for a in agents} 15 | self.actor_new_states = {a: np.zeros((T, n_procs, actor_dims[a])) 16 | for a in agents} 17 | self.actions = {a: np.zeros((T, n_procs, n_actions[a])) 18 | for a in agents} 19 | self.probs = {a: np.zeros((T, n_procs, n_actions[a])) 20 | for a in agents} 21 | 22 | self.mem_cntr = 0 23 | self.n_states = T 24 | self.n_procs = n_procs 25 | self.critic_dims = critic_dims 26 | self.actor_dims = actor_dims 27 | self.n_actions = n_actions 28 | self.n_agents = n_agents 29 | self.agents = agents 30 | self.batch_size = batch_size 31 | 32 | def recall(self): 33 | return self.actor_states, \ 34 | self.states, \ 35 | self.actions, \ 36 | self.probs, \ 37 | self.rewards, \ 38 | self.actor_new_states, \ 39 | self.new_states, \ 40 | self.dones 41 | 42 | def generate_batches(self): 43 | # batch_start = np.arange(0, n_states, self.batch_size) 44 | n_batches = int(self.n_states // self.batch_size) 45 | indices = np.arange(self.n_states, dtype=np.int64) 46 | np.random.shuffle(indices) 47 | # batches = [indices[i:i+self.batch_size] for i in batch_start] 48 | batches = [indices[i*self.batch_size:(i+1)*self.batch_size] 49 | for i in range(n_batches)] 50 | return batches 51 | 52 | def store_memory(self, raw_obs, state, action, probs, reward, 53 | raw_obs_, state_, done): 54 | index = self.mem_cntr % self.n_states 55 | self.states[index] = state 56 | self.new_states[index] = state_ 57 | self.dones[index] = done 58 | self.rewards[index] = reward 59 | 60 | for agent in self.agents: 61 | self.actions[agent][index] = action[agent] 62 | self.actor_states[agent][index] = raw_obs[agent] 63 | self.actor_new_states[agent][index] = raw_obs_[agent] 64 | self.probs[agent][index] = probs[agent] 65 | self.mem_cntr += 1 66 | 67 | def clear_memory(self): 68 | self.states = np.zeros((self.n_states, self.n_procs, self.critic_dims), 69 | dtype=np.float32) 70 | self.rewards = np.zeros((self.n_states, self.n_procs, self.n_agents), 71 | dtype=np.float32) 72 | self.dones = np.zeros((self.n_states, self.n_procs), dtype=np.float32) 73 | self.new_states = np.zeros((self.n_states, self.n_procs, 74 | self.critic_dims), dtype=np.float32) 75 | 76 | self.actor_states = {a: np.zeros( 77 | (self.n_states, self.n_procs, self.actor_dims[a])) 78 | for a in self.agents} 79 | self.actor_new_states = {a: np.zeros( 80 | (self.n_states, self.n_procs, self.actor_dims[a])) 81 | for a in self.agents} 82 | self.actions = {a: np.zeros( 83 | (self.n_states, self.n_procs, self.n_actions[a])) 84 | for a in self.agents} 85 | self.probs = {a: np.zeros( 86 | (self.n_states, self.n_procs, 
self.n_actions[a])) 87 | for a in self.agents} 88 | -------------------------------------------------------------------------------- /mappo/mappo/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Beta, Categorical 7 | 8 | 9 | class ContinuousActorNetwork(nn.Module): 10 | def __init__(self, n_actions, input_dims, alpha, 11 | fc1_dims=128, fc2_dims=128, chkpt_dir='models/', 12 | scenario=None): 13 | super(ContinuousActorNetwork, self).__init__() 14 | chkpt_dir += scenario 15 | if not os.path.exists(chkpt_dir): 16 | os.makedirs(chkpt_dir) 17 | self.checkpoint_file = os.path.join(chkpt_dir, 18 | 'actor_continuous_ppo') 19 | self.fc1 = nn.Linear(input_dims, fc1_dims) 20 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 21 | self.alpha = nn.Linear(fc2_dims, n_actions) 22 | self.beta = nn.Linear(fc2_dims, n_actions) 23 | 24 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 25 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 26 | self.to(self.device) 27 | 28 | def forward(self, state): 29 | x = T.tanh(self.fc1(state)) 30 | x = T.tanh(self.fc2(x)) 31 | alpha = F.relu(self.alpha(x)) + 1.0 32 | beta = F.relu(self.beta(x)) + 1.0 33 | dist = Beta(alpha, beta) 34 | return dist 35 | 36 | def save_checkpoint(self): 37 | T.save(self.state_dict(), self.checkpoint_file) 38 | 39 | def load_checkpoint(self): 40 | self.load_state_dict(T.load(self.checkpoint_file)) 41 | 42 | 43 | class ContinuousCriticNetwork(nn.Module): 44 | def __init__(self, input_dims, alpha, 45 | fc1_dims=128, fc2_dims=128, chkpt_dir='models/', 46 | scenario=None): 47 | super(ContinuousCriticNetwork, self).__init__() 48 | chkpt_dir += scenario 49 | if not os.path.exists(chkpt_dir): 50 | os.makedirs(chkpt_dir) 51 | 52 | self.checkpoint_file = os.path.join(chkpt_dir, 53 | 'critic_continuous_ppo') 54 | self.fc1 = nn.Linear(input_dims, fc1_dims) 55 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 56 | self.v = nn.Linear(fc2_dims, 1) 57 | 58 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 59 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 60 | self.to(self.device) 61 | 62 | def forward(self, state): 63 | x = T.tanh(self.fc1(state)) 64 | x = T.tanh(self.fc2(x)) 65 | v = self.v(x) 66 | 67 | return v 68 | 69 | def save_checkpoint(self): 70 | T.save(self.state_dict(), self.checkpoint_file) 71 | 72 | def load_checkpoint(self): 73 | self.load_state_dict(T.load(self.checkpoint_file)) 74 | -------------------------------------------------------------------------------- /mappo/mappo/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mappo import MAPPO 3 | from memory import PPOMemory 4 | from utils import obs_list_to_state_vector 5 | from vec_env import make_vec_envs 6 | 7 | 8 | def run(): 9 | env_id = 'Simple_Speaker_Listener' 10 | random_seed = 0 11 | n_procs = 2 12 | env = make_vec_envs(env_id, random_seed, n_procs) 13 | N = 2048 14 | batch_size = 64 15 | n_epochs = 10 16 | alpha = 3e-4 17 | scenario = 'simple_speaker_listener' 18 | 19 | n_agents = env.max_num_agents 20 | 21 | actor_dims = {} 22 | n_actions = {} 23 | for agent in env.agents: 24 | actor_dims[agent] = env.observation_space(agent).shape[0] 25 | n_actions[agent] = env.action_space(agent).shape[0] 26 | critic_dims = sum([actor_dims[a] for a in env.agents]) 27 | 28 
| mappo_agents = MAPPO(actor_dims=actor_dims, critic_dims=critic_dims, 29 | n_agents=n_agents, n_actions=n_actions, 30 | n_epochs=n_epochs, env=env, gamma=0.95, alpha=alpha, 31 | T=N, n_procs=n_procs, scenario=scenario) 32 | 33 | memory = PPOMemory(batch_size, N, n_agents, env.agents, 34 | n_procs, critic_dims, actor_dims, n_actions) 35 | 36 | MAX_STEPS = 1_000_000 37 | total_steps = 0 38 | episode = 1 39 | traj_length = 0 40 | score_history, steps_history = [], [] 41 | 42 | while total_steps < MAX_STEPS: 43 | observation, _ = env.reset() 44 | terminal = [False] * n_procs 45 | score = [0] * n_procs 46 | while not any(terminal): 47 | a_p = [mappo_agents.choose_action(observation[idx]) for 48 | idx in range(n_procs)] 49 | action = [a[0] for a in a_p] 50 | prob = [a[1] for a in a_p] 51 | 52 | observation_, reward, done, trunc, info = env.step(action) 53 | 54 | 55 | 56 | 57 | total_steps += 1 58 | traj_length += 1 59 | 60 | done_arr = [list(d.values()) for d in done] 61 | obs_arr = [list(o.values()) for o in observation] 62 | reward_arr = [list(r.values()) for r in reward] 63 | new_obs_arr = [list(o.values()) for o in observation_] 64 | trunc_arr = [list(t.values()) for t in trunc] 65 | 66 | action_dict = {agent: [list(a[agent]) for a in action] 67 | for agent in env.agents} 68 | obs_dict = {agent: [list(o[agent]) for o in observation] 69 | for agent in env.agents} 70 | new_obs_dict = {agent: [list(o[agent]) for o in observation_] 71 | for agent in env.agents} 72 | probs_dict = {agent: [list(p[agent]) for p in prob] 73 | for agent in env.agents} 74 | 75 | state = obs_list_to_state_vector(obs_arr) 76 | state_ = obs_list_to_state_vector(new_obs_arr) 77 | 78 | score = [s + sum(r) for s, r in zip(score, reward_arr)] 79 | 80 | terminal = [any(d) or any(t) for d, t in zip(done_arr, trunc_arr)] 81 | mask = [0.0 if t else 1.0 for t in terminal] 82 | memory.store_memory(obs_dict, state, action_dict, 83 | probs_dict, reward_arr, 84 | new_obs_dict, state_, mask) 85 | 86 | if traj_length % N == 0: 87 | mappo_agents.learn(memory) 88 | traj_length = 0 89 | memory.clear_memory() 90 | observation = observation_ 91 | score_history.append(sum(score)/n_procs) 92 | steps_history.append(total_steps) 93 | avg_score = np.mean(score_history[-100:]) 94 | print(f'{env_id} Episode {episode} total steps {total_steps}' 95 | f' avg score {avg_score :.1f}') 96 | 97 | episode += 1 98 | 99 | np.save('data/mappo_scores.npy', np.array(score_history)) 100 | np.save('data/mappo_steps.npy', np.array(steps_history)) 101 | env.close() 102 | 103 | 104 | if __name__ == '__main__': 105 | run() 106 | -------------------------------------------------------------------------------- /mappo/mappo/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | 12 | 13 | def obs_list_to_state_vector(observation): 14 | state = [] 15 | for row in observation: 16 | obs = np.array([]) 17 | for o in row: 18 | obs = np.concatenate([obs, o]) 19 | state.append(obs) 20 | return np.array(state) 21 | -------------------------------------------------------------------------------- /mappo/mappo/vec_env.py: 
-------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import multiprocessing as mp 3 | import gym 4 | import numpy as np 5 | import torch as T 6 | from pettingzoo.mpe import simple_speaker_listener_v4 7 | 8 | 9 | # based on: 10 | # https://github.com/openai/baselines/blob/master/baselines/common/vec_env/subproc_vec_env.py 11 | # and: 12 | # https://github.com/maximecb/gym-miniworld/blob/master/pytorch-a2c-ppo-acktr/vec_env/subproc_vec_env.py 13 | 14 | def worker(remote, parent_remote, env_fn_wrapper): 15 | parent_remote.close() 16 | env = env_fn_wrapper.x() 17 | while True: 18 | cmd, data = remote.recv() 19 | if cmd == 'step': 20 | ob, reward, done, trunc, info = env.step(data) 21 | remote.send((ob, reward, done, trunc, info)) 22 | elif cmd == 'reset': 23 | ob, info = env.reset() 24 | remote.send((ob, info)) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces': 29 | remote.send((env.observation_space, env.action_space)) 30 | elif cmd == 'max_num_agents': 31 | remote.send(env.max_num_agents) 32 | elif cmd == 'agents': 33 | remote.send(env.agents) 34 | else: 35 | raise NotImplementedError 36 | 37 | 38 | class SubprocVecEnv: 39 | def __init__(self, env_fns, spaces=None): 40 | self.waiting = False 41 | self.closed = False 42 | nenvs = len(env_fns) 43 | mp.set_start_method('forkserver') 44 | self.remotes, self.work_remotes = zip(*[mp.Pipe() 45 | for _ in range(nenvs)]) 46 | self.ps = [mp.Process(target=worker, args=(work_remote, remote, 47 | CloudpickleWrapper(env_fn))) 48 | for (work_remote, remote, env_fn) in 49 | zip(self.work_remotes, self.remotes, env_fns)] 50 | 51 | for p in self.ps: 52 | p.daemon = True 53 | p.start() 54 | 55 | for remote in self.work_remotes: 56 | remote.close() 57 | 58 | self.remotes[0].send(('get_spaces', None)) 59 | observation_space, action_space = self.remotes[0].recv() 60 | self.observation_space = observation_space 61 | self.action_space = action_space 62 | 63 | self.remotes[0].send(('reset', None)) 64 | _, _ = self.remotes[0].recv() 65 | 66 | self.remotes[0].send(('max_num_agents', None)) 67 | self.max_num_agents = self.remotes[0].recv() 68 | self.remotes[0].send(('agents', None)) 69 | self.agents = self.remotes[0].recv() 70 | 71 | def step_async(self, actions): 72 | assert not self.closed, "trying to operate after calling close()" 73 | for remote, action in zip(self.remotes, actions): 74 | remote.send(('step', action)) 75 | # self.waiting = True 76 | results = [remote.recv() for remote in self.remotes] 77 | obs, rews, dones, truncs, infos = zip(*results) 78 | return np.stack(obs), np.stack(rews), np.stack(dones), \ 79 | np.stack(truncs), infos 80 | """ 81 | def step_wait(self): 82 | assert not self.closed, "trying to operate after calling close()" 83 | results = [remote.recv() for remote in self.remotes] 84 | self.waiting = False 85 | obs, rews, dones, infos = zip(*results) 86 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 87 | """ 88 | def reset(self): 89 | assert not self.closed, "trying to operate after calling close()" 90 | for remote in self.remotes: 91 | remote.send(('reset', None)) 92 | obs_arr, info_arr = [], [] 93 | for remote in self.remotes: 94 | obs, info = remote.recv() 95 | obs_arr.append(obs) 96 | info_arr.append(info) 97 | return np.array(obs_arr), np.array(info_arr) 98 | 99 | def close_extras(self): 100 | if self.closed: 101 | return 102 | """ 103 | if self.waiting: 104 | for remote in self.remotes: 105 | remote.recv() 106 | """ 107 | for 
remote in self.remotes: 108 | remote.send(('close', None)) 109 | for p in self.ps: 110 | p.join() 111 | self.closed = True 112 | 113 | def close(self): 114 | if self.closed: 115 | return 116 | self.close_extras() 117 | self.closed = True 118 | 119 | def step(self, actions): 120 | # self.step_async(actions) 121 | obs, reward, dones, truncs, info = self.step_async(actions) 122 | return obs, reward, dones, truncs, info 123 | # return self.step_wait() 124 | 125 | def __del__(self): 126 | if not self.closed: 127 | self.close() 128 | 129 | 130 | class CloudpickleWrapper: 131 | def __init__(self, x): 132 | self.x = x 133 | 134 | def __getstate__(self): 135 | import cloudpickle 136 | return cloudpickle.dumps(self.x) 137 | 138 | def __setstate__(self, ob): 139 | import pickle 140 | self.x = pickle.loads(ob) 141 | 142 | 143 | def make_env(env_id, seed, rank): 144 | def _thunk(): 145 | env = simple_speaker_listener_v4.parallel_env( 146 | continuous_actions=True) 147 | _, _ = env.reset(seed=seed+rank) 148 | # env.seed(seed + rank) 149 | return env 150 | 151 | return _thunk 152 | 153 | 154 | def make_vec_envs(env_name, seed, num_processes): 155 | mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 156 | seed = seed + 10000 * mpi_rank if seed is not None else None 157 | set_global_seeds(seed) 158 | envs = [make_env(env_name, seed, i) for i in range(num_processes)] 159 | 160 | if len(envs) > 1: 161 | envs = SubprocVecEnv(envs) 162 | 163 | return envs 164 | 165 | 166 | def set_global_seeds(seed): 167 | import random 168 | np.random.seed(seed) 169 | random.seed(seed) 170 | T.manual_seed(seed) 171 | --------------------------------------------------------------------------------
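For reference, here is a minimal driver sketch (not part of the repo) showing how `make_vec_envs` can be exercised on its own, mirroring the calls made in mappo/run.py. It assumes it is launched from inside mappo/mappo/ with at least two worker processes; the `__main__` guard is needed because the workers are started with the forkserver method.

```python
# Hypothetical standalone driver for vec_env.make_vec_envs -- a sketch that
# mirrors run.py's usage, not an official entry point of this repo.
from vec_env import make_vec_envs

if __name__ == '__main__':
    n_procs = 2
    env = make_vec_envs('Simple_Speaker_Listener', seed=0,
                        num_processes=n_procs)
    obs, _ = env.reset()
    # One random action dict per worker copy of the environment.
    actions = [{agent: env.action_space(agent).sample()
                for agent in env.agents} for _ in range(n_procs)]
    obs_, rewards, dones, truncs, infos = env.step(actions)
    print(rewards)
    env.close()
```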