├── README.md
├── networks.py
├── main.py
├── agent.py
├── buffer.py
└── maddpg.py

/README.md:
--------------------------------------------------------------------------------
# Multi-Agent-Deep-Deterministic-Policy-Gradients
A PyTorch implementation of the Multi-Agent Deep Deterministic Policy Gradient (MADDPG) algorithm.

This is my implementation of the algorithm presented in the paper "Multi-Agent Actor-Critic for
Mixed Cooperative-Competitive Environments", which you can find here:
https://arxiv.org/pdf/1706.02275.pdf

You will need to install the Multi-Agent Particle Environment (MAPE), which you can find here:
https://github.com/openai/multiagent-particle-envs

Make sure to create a virtual environment with the dependencies for the MAPE, since they are somewhat out of date.
I also recommend running this with PyTorch version 1.4.0, as the latest version (1.8) seems to have an issue with
an in-place operation I use in the calculation of the critic loss.

It's probably easiest to clone this repo into the same directory as the MAPE, since main.py imports the
make_env function from that package.

The video for this tutorial is found here:
https://youtu.be/tZTQ6S9PfkE
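
As a quick sanity check of the environment install before training, a minimal sketch (assuming the MAPE repo is on your Python path so that `make_env` can be imported) looks like this:

```python
from make_env import make_env

env = make_env('simple_adversary')
obs = env.reset()                 # list with one observation vector per agent
print(env.n)                      # number of agents
print([space.shape[0] for space in env.observation_space])  # per-agent observation sizes
print(env.action_space[0].n)      # number of discrete actions per agent
```

If that runs, `python main.py` starts training; setting `evaluate = True` in main.py instead loads the saved checkpoints and renders the agents.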
--------------------------------------------------------------------------------
/networks.py:
--------------------------------------------------------------------------------
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CriticNetwork(nn.Module):
    def __init__(self, beta, input_dims, fc1_dims, fc2_dims,
                 n_agents, n_actions, name, chkpt_dir):
        super(CriticNetwork, self).__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims+n_agents*n_actions, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.q = nn.Linear(fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        x = F.relu(self.fc1(T.cat([state, action], dim=1)))
        x = F.relu(self.fc2(x))
        q = self.q(x)

        return q

    def save_checkpoint(self):
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.chkpt_file))


class ActorNetwork(nn.Module):
    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims,
                 n_actions, name, chkpt_dir):
        super(ActorNetwork, self).__init__()

        self.chkpt_file = os.path.join(chkpt_dir, name)

        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.softmax(self.pi(x), dim=1)

        return pi

    def save_checkpoint(self):
        T.save(self.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.chkpt_file))
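
# Quick smoke test of the two networks with hypothetical dimensions (3 agents,
# 8 values per observation, 5 discrete actions), purely for illustration: the
# centralized critic sees the concatenated state of all agents (3*8 = 24) plus
# all agents' actions (3*5 = 15), while each actor only sees its own
# 8-dim observation.
if __name__ == '__main__':
    critic = CriticNetwork(beta=0.01, input_dims=24, fc1_dims=64, fc2_dims=64,
                           n_agents=3, n_actions=5, name='demo_critic',
                           chkpt_dir='tmp/maddpg/')
    actor = ActorNetwork(alpha=0.01, input_dims=8, fc1_dims=64, fc2_dims=64,
                         n_actions=5, name='demo_actor', chkpt_dir='tmp/maddpg/')

    state = T.rand(4, 24).to(critic.device)    # batch of 4 joint states
    action = T.rand(4, 15).to(critic.device)   # batch of 4 joint actions
    obs = T.rand(4, 8).to(actor.device)        # batch of 4 single-agent observations

    print(critic.forward(state, action).shape)  # torch.Size([4, 1])
    print(actor.forward(obs).shape)             # torch.Size([4, 5])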
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import numpy as np
from maddpg import MADDPG
from buffer import MultiAgentReplayBuffer
from make_env import make_env

def obs_list_to_state_vector(observation):
    state = np.array([])
    for obs in observation:
        state = np.concatenate([state, obs])
    return state

if __name__ == '__main__':
    #scenario = 'simple'
    scenario = 'simple_adversary'
    env = make_env(scenario)
    n_agents = env.n
    actor_dims = []
    for i in range(n_agents):
        actor_dims.append(env.observation_space[i].shape[0])
    critic_dims = sum(actor_dims)

    # action space is a list of Discrete spaces; assume every agent has the same number of actions
    n_actions = env.action_space[0].n
    maddpg_agents = MADDPG(actor_dims, critic_dims, n_agents, n_actions,
                           fc1=64, fc2=64,
                           alpha=0.01, beta=0.01, scenario=scenario,
                           chkpt_dir='tmp/maddpg/')

    memory = MultiAgentReplayBuffer(1000000, critic_dims, actor_dims,
                                    n_actions, n_agents, batch_size=1024)

    PRINT_INTERVAL = 500
    N_GAMES = 50000
    MAX_STEPS = 25
    total_steps = 0
    score_history = []
    evaluate = False
    best_score = 0

    if evaluate:
        maddpg_agents.load_checkpoint()

    for i in range(N_GAMES):
        obs = env.reset()
        score = 0
        done = [False]*n_agents
        episode_step = 0
        while not any(done):
            if evaluate:
                env.render()
                #time.sleep(0.1)  # slow down the action for the video (requires `import time`)
            actions = maddpg_agents.choose_action(obs)
            obs_, reward, done, info = env.step(actions)

            state = obs_list_to_state_vector(obs)
            state_ = obs_list_to_state_vector(obs_)

            if episode_step >= MAX_STEPS:
                done = [True]*n_agents

            memory.store_transition(obs, state, actions, reward, obs_, state_, done)

            if total_steps % 100 == 0 and not evaluate:
                maddpg_agents.learn(memory)

            obs = obs_

            score += sum(reward)
            total_steps += 1
            episode_step += 1

        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if not evaluate:
            if avg_score > best_score:
                maddpg_agents.save_checkpoint()
                best_score = avg_score
        if i % PRINT_INTERVAL == 0 and i > 0:
            print('episode', i, 'average score {:.1f}'.format(avg_score))
--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
import torch as T
from networks import ActorNetwork, CriticNetwork

class Agent:
    def __init__(self, actor_dims, critic_dims, n_actions, n_agents, agent_idx, chkpt_dir,
                 alpha=0.01, beta=0.01, fc1=64,
                 fc2=64, gamma=0.95, tau=0.01):
        self.gamma = gamma
        self.tau = tau
        self.n_actions = n_actions
        self.agent_name = 'agent_%s' % agent_idx
        self.actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                  chkpt_dir=chkpt_dir, name=self.agent_name+'_actor')
        self.critic = CriticNetwork(beta, critic_dims,
                                    fc1, fc2, n_agents, n_actions,
                                    chkpt_dir=chkpt_dir, name=self.agent_name+'_critic')
        self.target_actor = ActorNetwork(alpha, actor_dims, fc1, fc2, n_actions,
                                         chkpt_dir=chkpt_dir,
                                         name=self.agent_name+'_target_actor')
        self.target_critic = CriticNetwork(beta, critic_dims,
                                           fc1, fc2, n_agents, n_actions,
                                           chkpt_dir=chkpt_dir,
                                           name=self.agent_name+'_target_critic')

        # hard copy of the online networks into the targets on construction
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)
        actions = self.actor.forward(state)
        # exploration: add uniform noise to the deterministic (softmax) output
        noise = T.rand(self.n_actions).to(self.actor.device)
        action = actions + noise

        return action.detach().cpu().numpy()[0]

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        # soft update: target <- tau*online + (1 - tau)*target
        target_actor_params = self.target_actor.named_parameters()
        actor_params = self.actor.named_parameters()

        target_actor_state_dict = dict(target_actor_params)
        actor_state_dict = dict(actor_params)
        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                    (1-tau)*target_actor_state_dict[name].clone()

        self.target_actor.load_state_dict(actor_state_dict)

        target_critic_params = self.target_critic.named_parameters()
        critic_params = self.critic.named_parameters()

        target_critic_state_dict = dict(target_critic_params)
        critic_state_dict = dict(critic_params)
        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                    (1-tau)*target_critic_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
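
# Quick smoke test with hypothetical dimensions (one of 3 agents, an 8-dim
# observation, 5 discrete actions, a 24-dim concatenated critic state), purely
# for illustration of the Agent interface.
if __name__ == '__main__':
    import numpy as np

    agent = Agent(actor_dims=8, critic_dims=24, n_actions=5, n_agents=3,
                  agent_idx=0, chkpt_dir='tmp/maddpg/')
    action = agent.choose_action(np.random.rand(8))
    print(action.shape)  # (5,) -- softmax output plus exploration noise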
--------------------------------------------------------------------------------
/buffer.py:
--------------------------------------------------------------------------------
import numpy as np

class MultiAgentReplayBuffer:
    def __init__(self, max_size, critic_dims, actor_dims,
                 n_actions, n_agents, batch_size):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.n_agents = n_agents
        self.actor_dims = actor_dims
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.state_memory = np.zeros((self.mem_size, critic_dims))
        self.new_state_memory = np.zeros((self.mem_size, critic_dims))
        self.reward_memory = np.zeros((self.mem_size, n_agents))
        self.terminal_memory = np.zeros((self.mem_size, n_agents), dtype=bool)

        self.init_actor_memory()

    def init_actor_memory(self):
        self.actor_state_memory = []
        self.actor_new_state_memory = []
        self.actor_action_memory = []

        for i in range(self.n_agents):
            self.actor_state_memory.append(
                    np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_new_state_memory.append(
                    np.zeros((self.mem_size, self.actor_dims[i])))
            self.actor_action_memory.append(
                    np.zeros((self.mem_size, self.n_actions)))

    def store_transition(self, raw_obs, state, action, reward,
                         raw_obs_, state_, done):
        # The commented-out reset below introduces a bug: if we fill up the
        # memory capacity and then zero out our actor memory, the critic will
        # still have memories to access while the actor will have nothing but
        # zeros to sample -- obviously not what we intend. In reality, there's
        # no problem with just using the same index for both the actor and
        # critic states; I'm not sure why I thought this was necessary in the
        # first place. Sorry for the confusion!
        #if self.mem_cntr % self.mem_size == 0 and self.mem_cntr > 0:
        #    self.init_actor_memory()

        index = self.mem_cntr % self.mem_size

        for agent_idx in range(self.n_agents):
            self.actor_state_memory[agent_idx][index] = raw_obs[agent_idx]
            self.actor_new_state_memory[agent_idx][index] = raw_obs_[agent_idx]
            self.actor_action_memory[agent_idx][index] = action[agent_idx]

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self):
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        states = self.state_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        terminal = self.terminal_memory[batch]

        actor_states = []
        actor_new_states = []
        actions = []
        for agent_idx in range(self.n_agents):
            actor_states.append(self.actor_state_memory[agent_idx][batch])
            actor_new_states.append(self.actor_new_state_memory[agent_idx][batch])
            actions.append(self.actor_action_memory[agent_idx][batch])

        return actor_states, states, actions, rewards, \
               actor_new_states, states_, terminal

    def ready(self):
        return self.mem_cntr >= self.batch_size
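
# Quick smoke test with hypothetical dimensions (3 agents with observation
# sizes [8, 10, 10], so a 28-dim critic state, and 5 actions each), purely for
# illustration of what store_transition() expects and sample_buffer() returns.
if __name__ == '__main__':
    actor_dims = [8, 10, 10]
    n_agents, n_actions, batch_size = 3, 5, 4

    memory = MultiAgentReplayBuffer(100, sum(actor_dims), actor_dims,
                                    n_actions, n_agents, batch_size)

    for _ in range(batch_size):
        raw_obs = [np.random.rand(dims) for dims in actor_dims]
        state = np.concatenate(raw_obs)
        action = [np.random.rand(n_actions) for _ in range(n_agents)]
        reward = np.random.rand(n_agents)
        memory.store_transition(raw_obs, state, action, reward,
                                raw_obs, state, [False]*n_agents)

    actor_states, states, actions, rewards, \
        actor_new_states, states_, dones = memory.sample_buffer()
    print(states.shape)           # (4, 28)
    print(actor_states[0].shape)  # (4, 8)
    print(actions[0].shape)       # (4, 5)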
--------------------------------------------------------------------------------
/maddpg.py:
--------------------------------------------------------------------------------
import torch as T
import torch.nn.functional as F
from agent import Agent

class MADDPG:
    def __init__(self, actor_dims, critic_dims, n_agents, n_actions,
                 scenario='simple', alpha=0.01, beta=0.01, fc1=64,
                 fc2=64, gamma=0.95, tau=0.01, chkpt_dir='tmp/maddpg/'):
        self.agents = []
        self.n_agents = n_agents
        self.n_actions = n_actions
        chkpt_dir += scenario
        for agent_idx in range(self.n_agents):
            self.agents.append(Agent(actor_dims[agent_idx], critic_dims,
                               n_actions, n_agents, agent_idx,
                               alpha=alpha, beta=beta, fc1=fc1, fc2=fc2,
                               gamma=gamma, tau=tau, chkpt_dir=chkpt_dir))

    def save_checkpoint(self):
        print('... saving checkpoint ...')
        for agent in self.agents:
            agent.save_models()

    def load_checkpoint(self):
        print('... loading checkpoint ...')
        for agent in self.agents:
            agent.load_models()

    def choose_action(self, raw_obs):
        actions = []
        for agent_idx, agent in enumerate(self.agents):
            action = agent.choose_action(raw_obs[agent_idx])
            actions.append(action)
        return actions

    def learn(self, memory):
        if not memory.ready():
            return

        actor_states, states, actions, rewards, \
            actor_new_states, states_, dones = memory.sample_buffer()

        device = self.agents[0].actor.device

        states = T.tensor(states, dtype=T.float).to(device)
        actions = T.tensor(actions, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        states_ = T.tensor(states_, dtype=T.float).to(device)
        dones = T.tensor(dones).to(device)

        all_agents_new_actions = []     # target-actor actions for the next states
        all_agents_new_mu_actions = []  # current-actor actions for the current states
        old_agents_actions = []         # actions actually taken (from the buffer)

        for agent_idx, agent in enumerate(self.agents):
            new_states = T.tensor(actor_new_states[agent_idx],
                                  dtype=T.float).to(device)

            new_pi = agent.target_actor.forward(new_states)

            all_agents_new_actions.append(new_pi)
            mu_states = T.tensor(actor_states[agent_idx],
                                 dtype=T.float).to(device)
            pi = agent.actor.forward(mu_states)
            all_agents_new_mu_actions.append(pi)
            old_agents_actions.append(actions[agent_idx])

        new_actions = T.cat([acts for acts in all_agents_new_actions], dim=1)
        mu = T.cat([acts for acts in all_agents_new_mu_actions], dim=1)
        old_actions = T.cat([acts for acts in old_agents_actions], dim=1)

        for agent_idx, agent in enumerate(self.agents):
            # centralized critic target, with bootstrapped values zeroed out
            # for terminal states (the in-place operation mentioned in the README)
            critic_value_ = agent.target_critic.forward(states_, new_actions).flatten()
            critic_value_[dones[:, 0]] = 0.0
            critic_value = agent.critic.forward(states, old_actions).flatten()

            # TD target: y_i = r_i + gamma * Q_i'(x', a_1', ..., a_N')
            target = rewards[:, agent_idx] + agent.gamma*critic_value_
            critic_loss = F.mse_loss(target, critic_value)
            agent.critic.optimizer.zero_grad()
            critic_loss.backward(retain_graph=True)
            agent.critic.optimizer.step()

            # actor loss: maximize the centralized critic's value of the
            # current policies' joint action for the sampled states
            actor_loss = agent.critic.forward(states, mu).flatten()
            actor_loss = -T.mean(actor_loss)
            agent.actor.optimizer.zero_grad()
            actor_loss.backward(retain_graph=True)
            agent.actor.optimizer.step()

            agent.update_network_parameters()
--------------------------------------------------------------------------------