├── README.md
├── networks.py
├── main.py
├── agent.py
├── buffer.py
└── maddpg.py

/README.md:
--------------------------------------------------------------------------------
# Multi-Agent-Deep-Deterministic-Policy-Gradients
A PyTorch implementation of the Multi-Agent Deep Deterministic Policy Gradient (MADDPG) algorithm.

This is my implementation of the algorithm presented in the paper "Multi-Agent Actor-Critic for
Mixed Cooperative-Competitive Environments", which you can find here:
https://arxiv.org/pdf/1706.02275.pdf

You will need to install the Multi-Agent Particle Environment (MAPE), which you can find here:
https://github.com/openai/multiagent-particle-envs

Make sure to create a virtual environment with the dependencies for the MAPE, since they are somewhat out of date.
I also recommend running this with PyTorch version 1.4.0, as the latest version (1.8) seems to have an issue with
an in-place operation I use in the calculation of the critic loss.

It's probably easiest to clone this repo into the same directory as the MAPE, since main.py imports the
make_env function from that package.

The video for this tutorial is found here:
https://youtu.be/tZTQ6S9PfkE
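
As a quick sanity check of the environment install before training, a minimal sketch (assuming the MAPE repo is on your Python path so that `make_env` can be imported) looks like this:

```python
from make_env import make_env

env = make_env('simple_adversary')
obs = env.reset()                 # list with one observation vector per agent
print(env.n)                      # number of agents
print([space.shape[0] for space in env.observation_space])  # per-agent observation sizes
print(env.action_space[0].n)      # number of discrete actions per agent
```

If that runs, `python main.py` starts training; setting `evaluate = True` in main.py instead loads the saved checkpoints and renders the agents.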
--------------------------------------------------------------------------------
/networks.py:
--------------------------------------------------------------------------------
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CriticNetwork(nn.Module):
    def __init__(self, beta, input_dims, fc1_dims, fc2_dims,
                 n_agents, n_actions, name, chkpt_dir):
        super(CriticNetwork, self).__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims+n_agents*n_actions, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.q = nn.Linear(fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        x = F.relu(self.fc1(T.cat([state, action], dim=1)))
        x = F.relu(self.fc2(x))
        q = self.q(x)

        return q

    def save_checkpoint(self):
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.chkpt_file))


class ActorNetwork(nn.Module):
    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions, name, chkpt_dir):
        super(ActorNetwork, self).__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.softmax(self.pi(x), dim=1)

        return pi

    def save_checkpoint(self):
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.chkpt_file))
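
# Quick smoke test of the two networks with hypothetical dimensions (3 agents,
# 8 values per observation, 5 discrete actions), purely for illustration: the
# centralized critic sees the concatenated state of all agents (3*8 = 24) plus
# all agents' actions (3*5 = 15), while each actor only sees its own
# 8-dim observation.
if __name__ == '__main__':
    critic = CriticNetwork(beta=0.01, input_dims=24, fc1_dims=64, fc2_dims=64,
                           n_agents=3, n_actions=5, name='demo_critic',
                           chkpt_dir='tmp/maddpg/')
    actor = ActorNetwork(alpha=0.01, input_dims=8, fc1_dims=64, fc2_dims=64,
                         n_actions=5, name='demo_actor', chkpt_dir='tmp/maddpg/')

    state = T.rand(4, 24).to(critic.device)    # batch of 4 joint states
    action = T.rand(4, 15).to(critic.device)   # batch of 4 joint actions
    obs = T.rand(4, 8).to(actor.device)        # batch of 4 single-agent observations

    print(critic.forward(state, action).shape)  # torch.Size([4, 1])
    print(actor.forward(obs).shape)             # torch.Size([4, 5])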
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import numpy as np
from maddpg import MADDPG
from buffer import MultiAgentReplayBuffer
from make_env import make_env

def obs_list_to_state_vector(observation):
    state = np.array([])
    for obs in observation:
        state = np.concatenate([state, obs])
    return state

if __name__ == '__main__':
    #scenario = 'simple'
    scenario = 'simple_adversary'
    env = make_env(scenario)
    n_agents = env.n
    actor_dims = []
    for i in range(n_agents):
        actor_dims.append(env.observation_space[i].shape[0])
    critic_dims = sum(actor_dims)

    # action space is a list of Discrete spaces; assume every agent has the same number of actions
    n_actions = env.action_space[0].n
    maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions,
                           fc1=64, fc2=64,
                           alpha=0.01, beta=0.01, scenario=scenario,
                           chkpt_dir='tmp/maddpg/')

    memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims,
                                    n_actions, n_agents, batch_size=1024)

    PRINT_INTERVAL = 500
    N_GAMES = 50000
    MAX_STEPS = 25
    total_steps = 0
    score_history = []
    evaluate = False
    best_score = 0

    if evaluate:
        maddpg_agents.load_checkpoint()

    for i in range(N_GAMES):
        obs = env.reset()
        score = 0
        done = [False]*n_agents
        episode_step = 0
        while not any(done):
            if evaluate:
                env.render()
                #time.sleep(0.1)  # slow down the action for the video (requires `import time`)
            actions = maddpg_agents.choose_action(obs)
            obs_, reward, done, info = env.step(actions)

            state = obs_list_to_state_vector(obs)
            state_ = obs_list_to_state_vector(obs_)

            if episode_step >= MAX_STEPS:
                done = [True]*n_agents

            memory.store_transition(obs, state, actions, reward, obs_, state_, done)

            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_

            score += sum(reward)
            total_steps += 1
            episode_step += 1

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if not evaluate:
            if avg_score > best_score:
                maddpg_agents.save_checkpoint()
                best_score = avg_score
        if i % PRINT_INTERVAL == 0 and i > 0:
            print('episode', i, 'average score {:.1f}'.format(avg_score))
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
import torch as T
from networks import ActorNetwork, CriticNetwork

class Agent:
    def __init__(self, actor_dims, critic_dims, n_actions, n_agents, agent_idx, chkpt_dir,
                 alpha=0.01, beta=0.01, fc1=64,
                 fc2=64, gamma=0.95, tau=0.01):
        self.gamma = gamma
        self.tau = tau
        self.n_actions = n_actions
        self.agent_name = 'agent_%s' % agent_idx
        self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                  chkpt_dir=chkpt_dir, name=self.agent_name+'_actor')
        self.critic = CriticNetwork(beta, critic_dims,
                                    fc1, fc2, n_agents, n_actions,
                                    chkpt_dir=chkpt_dir, name=self.agent_name+'_critic')
        self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                         chkpt_dir=chkpt_dir,
                                         name=self.agent_name+'_target_actor')
        self.target_critic = CriticNetwork(beta, critic_dims,
                                           fc1, fc2, n_agents, n_actions,
                                           chkpt_dir=chkpt_dir,
                                           name=self.agent_name+'_target_critic')

        # hard copy of the online networks into the targets on construction
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        actions = self.actor.forward(state)
        # exploration: add uniform noise to the deterministic (softmax) output
        noise = T.rand(self.n_actions).to(self.actor.device)
        action = actions + noise

        return action.detach().cpu().numpy()[0]

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        # soft update: target <- tau*online + (1 - tau)*target
        target_actor_params = self.target_actor.named_parameters()
        actor_params = self.actor.named_parameters()

        target_actor_state_dict = dict(target_actor_params)
        actor_state_dict = dict(actor_params)
        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                    (1-tau)*target_actor_state_dict[name].clone()

        self.target_actor.load_state_dict(actor_state_dict)

        target_critic_params = self.target_critic.named_parameters()
        critic_params = self.critic.named_parameters()

        target_critic_state_dict = dict(target_critic_params)
        critic_state_dict = dict(critic_params)
        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                    (1-tau)*target_critic_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
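
# Quick smoke test with hypothetical dimensions (one of 3 agents, an 8-dim
# observation, 5 discrete actions, a 24-dim concatenated critic state), purely
# for illustration of the Agent interface.
if __name__ == '__main__':
    import numpy as np

    agent = Agent(actor_dims=8, critic_dims=24, n_actions=5, n_agents=3,
                  agent_idx=0, chkpt_dir='tmp/maddpg/')
    action = agent.choose_action(np.random.rand(8))
    print(action.shape)  # (5,) -- softmax output plus exploration noise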
--------------------------------------------------------------------------------
/buffer.py:
--------------------------------------------------------------------------------
import numpy as np

class MultiAgentReplayBuffer:
    def __init__(self, max_size, critic_dims, actor_dims,
                 n_actions, n_agents, batch_size):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.n_agents = n_agents
        self.actor_dims = actor_dims
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.state_memory = np.zeros((self.mem_size, critic_dims))
        self.new_state_memory = np.zeros((self.mem_size, critic_dims))
        self.reward_memory = np.zeros((self.mem_size, n_agents))
        self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool)

        self.init_actor_memory()

    def init_actor_memory(self):
        self.actor_state_memory = []
        self.actor_new_state_memory = []
        self.actor_action_memory = []

        for i in range(self.n_agents):
            self.actor_state_memory.append(
                    np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_new_state_memory.append(
                    np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_action_memory.append(
                    np.zeros((self.mem_size, self.n_actions)))

    def store_transition(self, raw_obs, state, action, reward,
                         raw_obs_, state_, done):
        # The commented-out reset below introduces a bug: if we fill up the
        # memory capacity and then zero out our actor memory, the critic will
        # still have memories to access while the actor will have nothing but
        # zeros to sample -- obviously not what we intend. In reality, there's
        # no problem with just using the same index for both the actor and
        # critic states; I'm not sure why I thought this was necessary in the
        # first place. Sorry for the confusion!
        #if self.mem_cntr % self.mem_size == 0 and self.mem_cntr > 0:
        #    self.init_actor_memory()

        index = self.mem_cntr % self.mem_size

        for agent_idx in range(self.n_agents):
            self.actor_state_memory[agent_idx][index] = raw_obs[agent_idx]
            self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_idx]
            self.actor_action_memory[agent_idx][index] = action[agent_idx]

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self):
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        states = self.state_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        actor_states = []
        actor_new_states = []
        actions = []
        for agent_idx in range(self.n_agents):
            actor_states.append(self.actor_state_memory[agent_idx][batch])
            actor_new_states.append(self.actor_new_state_memory[agent_idx][batch])
            actions.append(self.actor_action_memory[agent_idx][batch])

        return actor_states, states, actions, rewards, \
               actor_new_states, states_, terminal

    def ready(self):
        return self.mem_cntr >= self.batch_size
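
# Quick smoke test with hypothetical dimensions (3 agents with observation
# sizes [8, 10, 10], so a 28-dim critic state, and 5 actions each), purely for
# illustration of what store_transition() expects and sample_buffer() returns.
if __name__ == '__main__':
    actor_dims = [8, 10, 10]
    n_agents, n_actions, batch_size = 3, 5, 4

    memory = MultiAgentReplayBuffer(100, sum(actor_dims), actor_dims,
                                    n_actions, n_agents, batch_size)

    for _ in range(batch_size):
        raw_obs = [np.random.rand(dims) for dims in actor_dims]
        state = np.concatenate(raw_obs)
        action = [np.random.rand(n_actions) for _ in range(n_agents)]
        reward = np.random.rand(n_agents)
        memory.store_transition(raw_obs, state, action, reward,
                                raw_obs, state, [False]*n_agents)

    actor_states, states, actions, rewards, \
        actor_new_states, states_, dones = memory.sample_buffer()
    print(states.shape)           # (4, 28)
    print(actor_states[0].shape)  # (4, 8)
    print(actions[0].shape)       # (4, 5)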
--------------------------------------------------------------------------------
/maddpg.py:
--------------------------------------------------------------------------------
import torch as T
import torch.nn.functional as F
from agent import Agent

class MADDPG:
    def __init__(self, actor_dims, critic_dims, n_agents, n_actions,
                 scenario='simple', alpha=0.01, beta=0.01, fc1=64,
                 fc2=64, gamma=0.95, tau=0.01, chkpt_dir='tmp/maddpg/'):
        self.agents = []
        self.n_agents = n_agents
        self.n_actions = n_actions
        chkpt_dir += scenario
        for agent_idx in range(self.n_agents):
            self.agents.append(Agent(actor_dims[agent_idx], critic_dims,
                               n_actions, n_agents, agent_idx,
                               alpha=alpha, beta=beta, fc1=fc1, fc2=fc2,
                               gamma=gamma, tau=tau, chkpt_dir=chkpt_dir))

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        for agent in self.agents:
            agent.save_models()

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        for agent in self.agents:
            agent.load_models()

    def choose_action(self, raw_obs):
        actions = []
        for agent_idx, agent in enumerate(self.agents):
            action = agent.choose_action(raw_obs[agent_idx])
            actions.append(action)
        return actions

    def learn(self, memory):
        if not memory.ready():
            return

        actor_states, states, actions, rewards, \
            actor_new_states, states_, dones = memory.sample_buffer()

        device = self.agents[0].actor.device

        states = T.tensor(states, dtype=T.float).to(device)
        actions = T.tensor(actions, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        states_ = T.tensor(states_, dtype=T.float).to(device)
        dones = T.tensor(dones).to(device)

        all_agents_new_actions = []     # target-actor actions for the next states
        all_agents_new_mu_actions = []  # current-actor actions for the current states
        old_agents_actions = []         # actions actually taken (from the buffer)

        for agent_idx, agent in enumerate(self.agents):
            new_states = T.tensor(actor_new_states[agent_idx],
                                  dtype=T.float).to(device)

            new_pi = agent.target_actor.forward(new_states)

            all_agents_new_actions.append(new_pi)
            mu_states = T.tensor(actor_states[agent_idx],
                                 dtype=T.float).to(device)
            pi = agent.actor.forward(mu_states)
            all_agents_new_mu_actions.append(pi)
            old_agents_actions.append(actions[agent_idx])

        new_actions = T.cat([acts for acts in all_agents_new_actions], dim=1)
        mu = T.cat([acts for acts in all_agents_new_mu_actions], dim=1)
        old_actions = T.cat([acts for acts in old_agents_actions], dim=1)

        for agent_idx, agent in enumerate(self.agents):
            # centralized critic target, with bootstrapped values zeroed out
            # for terminal states (the in-place operation mentioned in the README)
            critic_value_ = agent.target_critic.forward(states_, new_actions).flatten()
            critic_value_[dones[:, 0]] = 0.0
            critic_value = agent.critic.forward(states, old_actions).flatten()

            # TD target: y_i = r_i + gamma * Q_i'(x', a_1', ..., a_N')
            target = rewards[:, agent_idx] + agent.gamma*critic_value_
            critic_loss = F.mse_loss(target, critic_value)
            agent.critic.optimizer.zero_grad()
            critic_loss.backward(retain_graph=True)
            agent.critic.optimizer.step()

            # actor loss: maximize the centralized critic's value of the
            # current policies' joint action for the sampled states
            actor_loss = agent.critic.forward(states, mu).flatten()
            actor_loss = -T.mean(actor_loss)
            agent.actor.optimizer.zero_grad()
            actor_loss.backward(retain_graph=True)
            agent.actor.optimizer.step()

            agent.update_network_parameters()
--------------------------------------------------------------------------------