├── .gitignore ├── README.md ├── algo ├── __init__.py ├── ac.py ├── base.py ├── q_learning.py └── tools.py ├── replay.gif ├── requirements.txt ├── senarios └── senario_battle.py └── train_battle.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | /data -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch Version for Mean Field Multi-Agent Reinforcement Learning 2 | Pytorch implementation of MF-Q and MF-AC in the paper [Mean Field Multi-Agent Reinforcement Learning](https://arxiv.org/pdf/1802.05438.pdf). 3 | 4 | The original code can be found in [mlii/mfrl](https://github.com/mlii/mfrl). 5 | 6 | Please uncomment the following two lines of code in `base.py` if the algorithm occasionally fails to converge. 7 | ```python 8 | #distribution = torch.distributions.Categorical(predict) 9 | #actions = distribution.sample().detach().cpu().numpy() 10 | ``` 11 | 12 | ## Example 13 | ![image](https://github.com/deligentfool/mfrl_pytorch/blob/master/replay.gif) 14 | -------------------------------------------------------------------------------- /algo/__init__.py: -------------------------------------------------------------------------------- 1 | from . import q_learning 2 | from . import ac 3 | 4 | IQL = q_learning.DQN 5 | MFQ = q_learning.MFQ 6 | AC = ac.ActorCritic 7 | MFAC = ac.MFAC 8 | 9 | 10 | def spawn_ai(algo_name, env, handle, human_name, max_steps, cuda=True): 11 | if algo_name == 'mfq': 12 | model = MFQ(env, human_name, handle, max_steps, memory_size=80000) 13 | elif algo_name == 'iql': 14 | model = IQL(env, human_name, handle, max_steps, memory_size=80000) 15 | elif algo_name == 'ac': 16 | model = AC(env, human_name, handle, use_cuda=cuda) 17 | elif algo_name == 'mfac': 18 | model = MFAC(env, human_name, handle, use_cuda=cuda) 19 | if cuda: 20 | model = model.cuda() 21 | return model -------------------------------------------------------------------------------- /algo/ac.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | 7 | from . import tools 8 | 9 | 10 | class ActorCritic(nn.Module): 11 | def __init__(self, env, name, handle, value_coef=0.1, ent_coef=0.08, gamma=0.95, batch_size=64, learning_rate=1e-4, use_cuda=False): 12 | super(ActorCritic, self).__init__() 13 | 14 | self.env = env 15 | self.name = name 16 | self.view_space = env.unwrapped.env.get_view_space(handle) 17 | assert len(self.view_space) == 3 18 | self.feature_space = env.unwrapped.env.get_feature_space(handle)[0] 19 | self.num_actions = env.unwrapped.env.get_action_space(handle)[0] 20 | self.gamma = gamma 21 | 22 | self.batch_size = batch_size 23 | self.learning_rate = learning_rate 24 | 25 | self.value_coef = value_coef # coefficient of value in the total loss 26 | self.ent_coef = ent_coef # coefficient of entropy in the total loss 27 | 28 | # init training buffers 29 | self.view_buf = np.empty([1,] + list(self.view_space)) 30 | self.feature_buf = np.empty([1,] + [self.feature_space]) 31 | self.action_buf = np.empty(1, dtype=np.int32) 32 | self.reward_buf = np.empty(1, dtype=np.float32) 33 | self.replay_buffer = tools.EpisodesBuffer() 34 | 35 | self.net = self._construct_net() 36 | self.optim = torch.optim.Adam(lr=self.learning_rate, params=self.get_all_params()) 37 | self.use_cuda = use_cuda 38 | 39 | def get_all_params(self): 40 | params = [] 41 | for k, v in self.net.items(): 42 | params += list(v.parameters()) 43 | return params 44 | 45 | def _construct_net(self): 46 | temp_dict = nn.ModuleDict() 47 | temp_dict['obs_linear'] = nn.Linear(np.prod(self.view_space), 256) 48 | temp_dict['emb_linear'] = nn.Linear(self.feature_space, 256) 49 | temp_dict['cat_linear'] = nn.Linear(256 * 2, 256 * 2) 50 | temp_dict['policy_linear'] = nn.Linear(256 * 2, self.num_actions) 51 | temp_dict['value_linear'] = nn.Linear(256 * 2, 1) 52 | return temp_dict 53 | 54 | def _calc_value(self, **kwargs): 55 | if self.use_cuda: 56 | obs = torch.FloatTensor(kwargs['obs']).cuda().unsqueeze(0) 57 | feature = torch.FloatTensor(kwargs['feature']).cuda().unsqueeze(0) 58 | else: 59 | obs = torch.FloatTensor(kwargs['obs']).unsqueeze(0) 60 | feature = torch.FloatTensor(kwargs['feature']).unsqueeze(0) 61 | flatten_view = obs.reshape(obs.size()[0], -1) 62 | h_view = F.relu(self.net['obs_linear'](flatten_view)) 63 | h_emb = F.relu(self.net['emb_linear'](feature)) 64 | dense = torch.cat([h_view, h_emb], dim=-1) 65 | dense = F.relu(self.net['cat_linear'](dense)) 66 | value = self.net['value_linear'](dense) 67 | value = value.flatten() 68 | return value.detach().cpu().numpy() 69 | 70 | def train(self, cuda): 71 | # calc buffer size 72 | n = 0 73 | # batch_data = sample_buffer.episodes() 74 | batch_data = self.replay_buffer.episodes() 75 | self.replay_buffer = tools.EpisodesBuffer() 76 | 77 | for episode in batch_data: 78 | n += len(episode.rewards) 79 | 80 | self.view_buf.resize([n,] + list(self.view_space), refcheck=False) 81 | self.feature_buf.resize([n,] + [self.feature_space], refcheck=False) 82 | self.action_buf.resize(n, refcheck=False) 83 | self.reward_buf.resize(n, refcheck=False) 84 | view, feature = self.view_buf, self.feature_buf 85 | action, reward = self.action_buf, self.reward_buf 86 | 87 | ct = 0 88 | gamma = self.gamma 89 | # collect episodes from multiple separate buffers to a continuous buffer 90 | for episode in batch_data: 91 | v, f, a, r = episode.views, episode.features, episode.actions, episode.rewards 92 | m = len(episode.rewards) 93 | 94 | r = np.array(r) 95 | 96 | keep = self._calc_value(obs=v[-1], feature=f[-1]) 97 | 98 | for i in reversed(range(m)): 99 | keep = keep * gamma + r[i] 100 | r[i] = keep 101 | 102 | view[ct:ct + m] = v 103 | feature[ct:ct + m] = f 104 | action[ct:ct + m] = a 105 | reward[ct:ct + m] = r 106 | ct += m 107 | 108 | assert n == ct 109 | 110 | if self.use_cuda: 111 | view = torch.FloatTensor(view).cuda() 112 | feature = torch.FloatTensor(feature).cuda() 113 | action = torch.LongTensor(action).cuda() 114 | reward = torch.FloatTensor(reward).cuda() 115 | action_mask = torch.zeros([action.size(0), self.num_actions]).cuda().scatter_(1, action.unsqueeze(-1), 1).float() 116 | else: 117 | view = torch.FloatTensor(view) 118 | feature = torch.FloatTensor(feature) 119 | action = torch.LongTensor(action) 120 | reward = torch.FloatTensor(reward) 121 | action_mask = torch.zeros([action.size(0), self.num_actions]).scatter_(1, action.unsqueeze(-1), 1).float() 122 | 123 | # train 124 | flatten_view = view.flatten(1) 125 | h_view = F.relu(self.net['obs_linear'](flatten_view)) 126 | h_emb = F.relu(self.net['emb_linear'](feature)) 127 | dense = torch.cat([h_view, h_emb], dim=-1) 128 | dense = F.relu(self.net['cat_linear'](dense)) 129 | policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1) 130 | policy = torch.clamp(policy, 1e-10, 1-1e-10) 131 | value = self.net['value_linear'](dense) 132 | value = value.flatten() 133 | 134 | advantage = (reward - value).detach() 135 | log_policy = (policy + 1e-6).log() 136 | log_prob = (log_policy * action_mask).sum(1) 137 | 138 | pg_loss = -(advantage * log_prob).mean() 139 | vf_loss = self.value_coef * (reward - value).pow(2).mean() 140 | neg_entropy = self.ent_coef * (policy * log_policy).sum(1).mean() 141 | total_loss = pg_loss + vf_loss + neg_entropy 142 | 143 | # train op (clip gradient) 144 | self.optim.zero_grad() 145 | total_loss.backward() 146 | grad_norm = torch.nn.utils.clip_grad_norm_(self.get_all_params(), 5.0) 147 | self.optim.step() 148 | 149 | print('[*] PG_LOSS:', np.round(pg_loss.detach().cpu().item(), 6), '/ VF_LOSS:', np.round(vf_loss.detach().cpu().item(), 6), '/ ENT_LOSS:', np.round(neg_entropy.detach().cpu().item()), '/ Value:', np.mean(value.detach().cpu().numpy())) 150 | 151 | def act(self, **kwargs): 152 | flatten_view = kwargs['obs'].reshape(kwargs['obs'].size()[0], -1) 153 | h_view = F.relu(self.net['obs_linear'](flatten_view)) 154 | h_emb = F.relu(self.net['emb_linear'](kwargs['feature'])) 155 | dense = torch.cat([h_view, h_emb], dim=-1) 156 | dense = F.relu(self.net['cat_linear'](dense)) 157 | policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1) 158 | policy = torch.clamp(policy, 1e-10, 1-1e-10) 159 | distribution = torch.distributions.Categorical(policy) 160 | action = distribution.sample().detach().cpu().numpy() 161 | return action.astype(np.int32).reshape((-1,)) 162 | 163 | def flush_buffer(self, **kwargs): 164 | self.replay_buffer.push(**kwargs) 165 | 166 | def save(self, dir_path, step=0): 167 | os.makedirs(dir_path, exist_ok=True) 168 | file_path = os.path.join(dir_path, "ac_{}".format(step)) 169 | torch.save(self.net.state_dict(), file_path) 170 | print("[*] Model saved") 171 | 172 | def load(self, dir_path, step=0): 173 | file_path = os.path.join(dir_path, "ac_{}".format(step)) 174 | 175 | self.net.load_state_dict(torch.load(file_path)) 176 | print("[*] Loaded model") 177 | 178 | 179 | 180 | class MFAC(nn.Module): 181 | def __init__(self, env, name, handle, value_coef=0.1, ent_coef=0.08, gamma=0.95, batch_size=64, learning_rate=1e-4, use_cuda=False): 182 | super(MFAC, self).__init__() 183 | 184 | self.env = env 185 | self.name = name 186 | self.view_space = env.unwrapped.env.get_view_space(handle) 187 | assert len(self.view_space) == 3 188 | self.feature_space = env.unwrapped.env.get_feature_space(handle)[0] 189 | self.num_actions = env.unwrapped.env.get_action_space(handle)[0] 190 | self.gamma = gamma 191 | 192 | self.batch_size = batch_size 193 | self.learning_rate = learning_rate 194 | 195 | self.value_coef = value_coef # coefficient of value in the total loss 196 | self.ent_coef = ent_coef # coefficient of entropy in the total loss 197 | 198 | # init training buffers 199 | self.view_buf = np.empty([1,] + list(self.view_space)) 200 | self.feature_buf = np.empty([1,] + [self.feature_space]) 201 | self.action_buf = np.empty(1, dtype=np.int32) 202 | self.reward_buf = np.empty(1, dtype=np.float32) 203 | self.replay_buffer = tools.EpisodesBuffer(use_mean=True) 204 | 205 | self.net = self._construct_net() 206 | self.optim = torch.optim.Adam(lr=self.learning_rate, params=self.get_all_params()) 207 | self.use_cuda = use_cuda 208 | 209 | def get_all_params(self): 210 | params = [] 211 | for k, v in self.net.items(): 212 | params += list(v.parameters()) 213 | return params 214 | 215 | def _construct_net(self): 216 | temp_dict = nn.ModuleDict() 217 | temp_dict['obs_linear'] = nn.Linear(np.prod(self.view_space), 256) 218 | temp_dict['emb_linear'] = nn.Linear(self.feature_space, 256) 219 | # * use the action_prob 220 | temp_dict['action_linear_1'] = nn.Linear(self.num_actions, 64) 221 | temp_dict['action_linear_2'] = nn.Linear(64, 32) 222 | temp_dict['act_obs_emb_linear'] = nn.Linear(32 + 256 * 2, 256) 223 | temp_dict['value_linear'] = nn.Linear(256, 1) 224 | 225 | temp_dict['cat_linear'] = nn.Linear(256 * 2, 256 * 2) 226 | temp_dict['policy_linear'] = nn.Linear(256 * 2, self.num_actions) 227 | return temp_dict 228 | 229 | def _calc_value(self, **kwargs): 230 | if self.use_cuda: 231 | obs = torch.FloatTensor(kwargs['obs']).cuda().unsqueeze(0) 232 | feature = torch.FloatTensor(kwargs['feature']).cuda().unsqueeze(0) 233 | input_act_prob = torch.FloatTensor(kwargs['prob']).cuda().unsqueeze(0) 234 | else: 235 | obs = torch.FloatTensor(kwargs['obs']).unsqueeze(0) 236 | feature = torch.FloatTensor(kwargs['feature']).unsqueeze(0) 237 | input_act_prob = torch.FloatTensor(kwargs['prob']).unsqueeze(0) 238 | flatten_view = obs.flatten(1) 239 | h_view = F.relu(self.net['obs_linear'](flatten_view)) 240 | h_emb = F.relu(self.net['emb_linear'](feature)) 241 | cat_layer = torch.cat([h_view, h_emb], dim=-1) 242 | action_dense = F.relu(self.net['action_linear_1'](input_act_prob)) 243 | action_dense = F.relu(self.net['action_linear_2'](action_dense)) 244 | cat_act_obs_emb = torch.cat([action_dense, cat_layer], dim=-1) 245 | dense_act_obs_emb = F.relu(self.net['act_obs_emb_linear'](cat_act_obs_emb)) 246 | value = self.net['value_linear'](dense_act_obs_emb) 247 | value = value.flatten() 248 | return value.detach().cpu().numpy() 249 | 250 | def train(self, cuda): 251 | # calc buffer size 252 | n = 0 253 | # batch_data = sample_buffer.episodes() 254 | batch_data = self.replay_buffer.episodes() 255 | self.replay_buffer = tools.EpisodesBuffer(use_mean=True) 256 | 257 | for episode in batch_data: 258 | n += len(episode.rewards) 259 | 260 | self.view_buf.resize([n,] + list(self.view_space), refcheck=False) 261 | self.feature_buf.resize([n,] + [self.feature_space], refcheck=False) 262 | self.action_buf.resize(n, refcheck=False) 263 | self.reward_buf.resize(n, refcheck=False) 264 | view, feature = self.view_buf, self.feature_buf 265 | action, reward = self.action_buf, self.reward_buf 266 | act_prob_buff = np.zeros((n, self.num_actions), dtype=np.float32) 267 | 268 | ct = 0 269 | gamma = self.gamma 270 | # collect episodes from multiple separate buffers to a continuous buffer 271 | for episode in batch_data: 272 | v, f, a, r, prob = episode.views, episode.features, episode.actions, episode.rewards, episode.probs 273 | m = len(episode.rewards) 274 | 275 | assert len(prob) > 0 276 | 277 | r = np.array(r) 278 | 279 | keep = self._calc_value(obs=v[-1], feature=f[-1], prob=prob[-1]) 280 | 281 | for i in reversed(range(m)): 282 | keep = keep * gamma + r[i] 283 | r[i] = keep 284 | 285 | view[ct:ct + m] = v 286 | feature[ct:ct + m] = f 287 | action[ct:ct + m] = a 288 | reward[ct:ct + m] = r 289 | act_prob_buff[ct:ct + m] = prob 290 | ct += m 291 | 292 | assert n == ct 293 | 294 | if self.use_cuda: 295 | view = torch.FloatTensor(view).cuda() 296 | feature = torch.FloatTensor(feature).cuda() 297 | action = torch.LongTensor(action).cuda() 298 | reward = torch.FloatTensor(reward).cuda() 299 | act_prob_buff = torch.FloatTensor(act_prob_buff).cuda() 300 | action_mask = torch.zeros([action.size(0), self.num_actions]).cuda().scatter_(1, action.unsqueeze(-1), 1).float() 301 | else: 302 | view = torch.FloatTensor(view) 303 | feature = torch.FloatTensor(feature) 304 | action = torch.LongTensor(action) 305 | reward = torch.FloatTensor(reward) 306 | act_prob_buff = torch.FloatTensor(act_prob_buff) 307 | action_mask = torch.zeros([action.size(0), self.num_actions]).scatter_(1, action.unsqueeze(-1), 1).float() 308 | 309 | # train 310 | flatten_view = view.flatten(1) 311 | h_view = F.relu(self.net['obs_linear'](flatten_view)) 312 | h_emb = F.relu(self.net['emb_linear'](feature)) 313 | cat_layer = torch.cat([h_view, h_emb], dim=-1) 314 | dense = F.relu(self.net['cat_linear'](cat_layer)) 315 | policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1) 316 | policy = torch.clamp(policy, 1e-10, 1-1e-10) 317 | action_dense = F.relu(self.net['action_linear_1'](act_prob_buff)) 318 | action_dense = F.relu(self.net['action_linear_2'](action_dense)) 319 | cat_act_obs_emb = torch.cat([action_dense, cat_layer], dim=-1) 320 | dense_act_obs_emb = F.relu(self.net['act_obs_emb_linear'](cat_act_obs_emb)) 321 | value = self.net['value_linear'](dense_act_obs_emb) 322 | value = value.flatten() 323 | 324 | advantage = (reward - value).detach() 325 | log_policy = (policy + 1e-6).log() 326 | log_prob = (log_policy * action_mask).sum(1) 327 | 328 | pg_loss = -(advantage * log_prob).mean() 329 | vf_loss = self.value_coef * (reward - value).pow(2).mean() 330 | neg_entropy = self.ent_coef * (policy * log_policy).sum(1).mean() 331 | total_loss = pg_loss + vf_loss + neg_entropy 332 | 333 | # train op (clip gradient) 334 | self.optim.zero_grad() 335 | total_loss.backward() 336 | grad_norm = torch.nn.utils.clip_grad_norm_(self.get_all_params(), 5.0) 337 | self.optim.step() 338 | 339 | print('[*] PG_LOSS:', np.round(pg_loss.detach().cpu().item(), 6), '/ VF_LOSS:', np.round(vf_loss.detach().cpu().item(), 6), '/ ENT_LOSS:', np.round(neg_entropy.detach().cpu().item()), '/ Value:', np.mean(value.detach().cpu().numpy())) 340 | 341 | def act(self, **kwargs): 342 | flatten_view = kwargs['obs'].reshape(kwargs['obs'].size()[0], -1) 343 | h_view = F.relu(self.net['obs_linear'](flatten_view)) 344 | h_emb = F.relu(self.net['emb_linear'](kwargs['feature'])) 345 | cat_layer = torch.cat([h_view, h_emb], dim=-1) 346 | dense = F.relu(self.net['cat_linear'](cat_layer)) 347 | policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1) 348 | policy = torch.clamp(policy, 1e-10, 1-1e-10) 349 | distribution = torch.distributions.Categorical(policy) 350 | action = distribution.sample().detach().cpu().numpy() 351 | return action.astype(np.int32).reshape((-1,)) 352 | 353 | def flush_buffer(self, **kwargs): 354 | self.replay_buffer.push(**kwargs) 355 | 356 | def save(self, dir_path, step=0): 357 | os.makedirs(dir_path, exist_ok=True) 358 | file_path = os.path.join(dir_path, "mfac_{}".format(step)) 359 | torch.save(self.net.state_dict(), file_path) 360 | print("[*] Model saved") 361 | 362 | def load(self, dir_path, step=0): 363 | file_path = os.path.join(dir_path, "mfac_{}".format(step)) 364 | 365 | self.net.load_state_dict(torch.load(file_path)) 366 | print("[*] Loaded model") 367 | -------------------------------------------------------------------------------- /algo/base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | 7 | class ValueNet(nn.Module): 8 | def __init__(self, env, name, handle, update_every=5, use_mf=False, learning_rate=1e-4, tau=0.005, gamma=0.95): 9 | super(ValueNet, self).__init__() 10 | self.env = env 11 | self.name = name 12 | self._saver = None 13 | 14 | self.view_space = env.unwrapped.env.get_view_space(handle) 15 | assert len(self.view_space) == 3 16 | self.feature_space = env.unwrapped.env.get_feature_space(handle)[0] 17 | self.num_actions = env.unwrapped.env.get_action_space(handle)[0] 18 | 19 | self.update_every = update_every 20 | self.use_mf = use_mf # trigger of using mean field 21 | self.temperature = 0.1 22 | 23 | self.lr= learning_rate 24 | self.tau = tau 25 | self.gamma = gamma 26 | 27 | self.eval_net = self._construct_net() 28 | self.target_net = self._construct_net() 29 | 30 | self.optim = torch.optim.Adam(lr=self.lr, params=self.get_params(self.eval_net)) 31 | 32 | def _construct_net(self): 33 | temp_dict = nn.ModuleDict() 34 | temp_dict['conv1'] = nn.Conv2d(in_channels=self.view_space[2], out_channels=32, kernel_size=3) 35 | temp_dict['conv2'] = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3) 36 | temp_dict['obs_linear'] = nn.Linear(self.get_flatten_dim(temp_dict), 256) 37 | temp_dict['emb_linear'] = nn.Linear(self.feature_space, 32) 38 | if self.use_mf: 39 | temp_dict['prob_emb_linear'] = nn.Sequential( 40 | nn.Linear(self.num_actions, 64), 41 | nn.ReLU(), 42 | nn.Linear(64, 32) 43 | ) 44 | temp_dict['final_linear'] = nn.Sequential( 45 | nn.Linear(320 if self.use_mf else 288, 128), 46 | nn.ReLU(), 47 | nn.Linear(128, 64), 48 | nn.ReLU(), 49 | nn.Linear(64, self.num_actions) 50 | ) 51 | return temp_dict 52 | 53 | def get_flatten_dim(self, dict): 54 | return dict['conv2'](dict['conv1'](torch.zeros(1, self.view_space[2], self.view_space[0], self.view_space[1]))).flatten().size()[0] 55 | 56 | def get_params(self, dict): 57 | params = [] 58 | for k, v in dict.items(): 59 | params += list(v.parameters()) 60 | return params 61 | 62 | def get_all_params(self): 63 | params = [] 64 | eval_params = self.get_params(self.eval_net) 65 | target_params = self.get_params(self.target_net) 66 | params += eval_params 67 | params += target_params 68 | return params 69 | 70 | def calc_target_q(self, obs, feature, dones, rewards, prob=None): 71 | t_h = F.relu(self.target_net['conv2'](F.relu(self.target_net['conv1'](obs)))).flatten(start_dim=1) 72 | t_h = torch.cat([self.target_net['obs_linear'](t_h), self.target_net['emb_linear'](feature)], -1) 73 | if self.use_mf: 74 | t_h = torch.cat([t_h, self.target_net['prob_emb_linear'](prob)], -1) 75 | t_q = self.target_net['final_linear'](t_h) 76 | 77 | e_h = F.relu(self.eval_net['conv2'](F.relu(self.eval_net['conv1'](obs)))).flatten(start_dim=1) 78 | e_h = torch.cat([self.eval_net['obs_linear'](e_h), self.eval_net['emb_linear'](feature)], -1) 79 | if self.use_mf: 80 | e_h = torch.cat([e_h, self.eval_net['prob_emb_linear'](prob)], -1) 81 | e_q = self.eval_net['final_linear'](e_h) 82 | 83 | act_idx = e_q.max(1)[1] 84 | q_values = torch.gather(t_q, 1, act_idx.unsqueeze(-1)) 85 | target_q_value = rewards + (1. - dones) * q_values.reshape(-1) * self.gamma 86 | return target_q_value 87 | 88 | def update(self): 89 | for k, v in self.target_net.items(): 90 | for param, target_param in zip(self.eval_net[k].parameters(), self.target_net[k].parameters()): 91 | target_param.detach().copy_(self.tau * param.detach() + (1. - self.tau) * target_param.detach()) 92 | 93 | def act(self, obs, feature, prob=None, eps=None): 94 | if eps is not None: 95 | self.temperature = eps 96 | 97 | e_h = F.relu(self.eval_net['conv2'](F.relu(self.eval_net['conv1'](obs)))).flatten(start_dim=1) 98 | e_h = torch.cat([self.eval_net['obs_linear'](e_h), self.eval_net['emb_linear'](feature)], -1) 99 | if self.use_mf: 100 | e_h = torch.cat([e_h, self.eval_net['prob_emb_linear'](prob)], -1) 101 | e_q = self.eval_net['final_linear'](e_h) 102 | predict = F.softmax(e_q / self.temperature, dim=-1) 103 | #distribution = torch.distributions.Categorical(predict) 104 | #actions = distribution.sample().detach().cpu().numpy() 105 | actions = predict.max(1)[1].detach().cpu().numpy() 106 | return actions 107 | 108 | def train(self, obs, feature, target_q, acts, prob=None, mask=None): 109 | e_h = F.relu(self.eval_net['conv2'](F.relu(self.eval_net['conv1'](obs)))).flatten(start_dim=1) 110 | e_h = torch.cat([self.eval_net['obs_linear'](e_h), self.eval_net['emb_linear'](feature)], -1) 111 | if self.use_mf: 112 | e_h = torch.cat([e_h, self.eval_net['prob_emb_linear'](prob)], -1) 113 | e_q = self.eval_net['final_linear'](e_h) 114 | 115 | e_q = torch.gather(e_q, 1, acts.unsqueeze(-1)).squeeze() 116 | if mask is not None: 117 | loss = ((e_q - target_q.detach()).pow(2) * mask).sum() / mask.sum() 118 | else: 119 | loss = (e_q - target_q.detach()).pow(2).mean() 120 | 121 | self.optim.zero_grad() 122 | loss.backward() 123 | self.optim.step() 124 | return loss.item(), {'Eval-Q': np.round(np.mean(e_q.detach().cpu().numpy()), 6), 'Target-Q': np.round(np.mean(target_q.detach().cpu().numpy()), 6)} 125 | -------------------------------------------------------------------------------- /algo/q_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | 7 | from . import base 8 | from . import tools 9 | 10 | 11 | class DQN(base.ValueNet): 12 | def __init__(self, env, name, handle, sub_len, memory_size=2**10, batch_size=64, update_every=5, use_mf=False, learning_rate=0.0001, tau=0.005, gamma=0.95): 13 | super().__init__(env, name, handle, update_every=update_every, use_mf=use_mf, learning_rate=learning_rate, tau=tau, gamma=gamma) 14 | 15 | self.replay_buffer = tools.MemoryGroup(self.view_space, self.feature_space, self.num_actions, memory_size, batch_size, sub_len) 16 | 17 | def flush_buffer(self, **kwargs): 18 | self.replay_buffer.push(**kwargs) 19 | 20 | def train(self, cuda): 21 | self.replay_buffer.tight() 22 | batch_num = self.replay_buffer.get_batch_num() 23 | 24 | for i in range(batch_num): 25 | obs, feat, obs_next, feat_next, dones, rewards, acts, masks = self.replay_buffer.sample() 26 | 27 | obs = torch.FloatTensor(obs).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs).permute([0, 3, 1, 2]) 28 | obs_next = torch.FloatTensor(obs_next).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs_next).permute([0, 3, 1, 2]) 29 | feat = torch.FloatTensor(feat).cuda() if cuda else torch.FloatTensor(feat) 30 | feat_next = torch.FloatTensor(feat_next).cuda() if cuda else torch.FloatTensor(feat_next) 31 | acts = torch.LongTensor(acts).cuda() if cuda else torch.LongTensor(acts) 32 | rewards = torch.FloatTensor(rewards).cuda() if cuda else torch.FloatTensor(rewards) 33 | dones = torch.FloatTensor(dones).cuda() if cuda else torch.FloatTensor(dones) 34 | masks = torch.FloatTensor(masks).cuda() if cuda else torch.FloatTensor(masks) 35 | 36 | target_q = self.calc_target_q(obs=obs_next, feature=feat_next, rewards=rewards, dones=dones) 37 | loss, q = super().train(obs=obs, feature=feat, target_q=target_q, acts=acts, mask=masks) 38 | 39 | self.update() 40 | 41 | if i % 50 == 0: 42 | print('[*] LOSS:', loss, '/ Q:', q) 43 | 44 | def save(self, dir_path, step=0): 45 | os.makedirs(dir_path, exist_ok=True) 46 | eval_file_path = os.path.join(dir_path, "dqn_eval_{}".format(step)) 47 | target_file_path = os.path.join(dir_path, "dqn_target_{}".format(step)) 48 | torch.save(self.eval_net.state_dict(), eval_file_path) 49 | torch.save(self.target_net.state_dict(), target_file_path) 50 | print("[*] Model saved") 51 | 52 | def load(self, dir_path, step=0): 53 | eval_file_path = os.path.join(dir_path, "dqn_eval_{}".format(step)) 54 | target_file_path = os.path.join(dir_path, "dqn_target_{}".format(step)) 55 | 56 | self.target_net.load_state_dict(torch.load(target_file_path)) 57 | self.eval_net.load_state_dict(torch.load(eval_file_path)) 58 | print("[*] Loaded model") 59 | 60 | 61 | 62 | class MFQ(base.ValueNet): 63 | def __init__(self, env, name, handle, sub_len, eps=1.0, memory_size=2**10, batch_size=64, update_every=5, use_mf=True, learning_rate=0.0001, tau=0.005, gamma=0.95): 64 | super().__init__(env, name, handle, update_every=update_every, use_mf=use_mf, learning_rate=learning_rate, tau=tau, gamma=gamma) 65 | 66 | config = { 67 | 'max_len': memory_size, 68 | 'batch_size': batch_size, 69 | 'obs_shape': self.view_space, 70 | 'feat_shape': self.feature_space, 71 | 'act_n': self.num_actions, 72 | 'use_mean': True, 73 | 'sub_len': sub_len 74 | } 75 | 76 | self.train_ct = 0 77 | self.replay_buffer = tools.MemoryGroup(**config) 78 | self.update_every = update_every 79 | 80 | def flush_buffer(self, **kwargs): 81 | self.replay_buffer.push(**kwargs) 82 | 83 | def train(self, cuda): 84 | self.replay_buffer.tight() 85 | batch_name = self.replay_buffer.get_batch_num() 86 | 87 | for i in range(batch_name): 88 | obs, feat, acts, act_prob, obs_next, feat_next, act_prob_next, rewards, dones, masks = self.replay_buffer.sample() 89 | 90 | obs = torch.FloatTensor(obs).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs).permute([0, 3, 1, 2]) 91 | obs_next = torch.FloatTensor(obs_next).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs_next).permute([0, 3, 1, 2]) 92 | feat = torch.FloatTensor(feat).cuda() if cuda else torch.FloatTensor(feat) 93 | feat_next = torch.FloatTensor(feat_next).cuda() if cuda else torch.FloatTensor(feat_next) 94 | acts = torch.LongTensor(acts).cuda() if cuda else torch.LongTensor(acts) 95 | act_prob = torch.FloatTensor(act_prob).cuda() if cuda else torch.FloatTensor(act_prob) 96 | act_prob_next = torch.FloatTensor(act_prob_next).cuda() if cuda else torch.FloatTensor(act_prob_next) 97 | rewards = torch.FloatTensor(rewards).cuda() if cuda else torch.FloatTensor(rewards) 98 | dones = torch.FloatTensor(dones).cuda() if cuda else torch.FloatTensor(dones) 99 | masks = torch.FloatTensor(masks).cuda() if cuda else torch.FloatTensor(masks) 100 | 101 | target_q = self.calc_target_q(obs=obs_next, feature=feat_next, rewards=rewards, dones=dones, prob=act_prob_next) 102 | loss, q = super().train(obs=obs, feature=feat, target_q=target_q, prob=act_prob, acts=acts, mask=masks) 103 | 104 | self.update() 105 | 106 | if i % 50 == 0: 107 | print('[*] LOSS:', loss, '/ Q:', q) 108 | 109 | def save(self, dir_path, step=0): 110 | os.makedirs(dir_path, exist_ok=True) 111 | eval_file_path = os.path.join(dir_path, "mfq_eval_{}".format(step)) 112 | target_file_path = os.path.join(dir_path, "mfq_target_{}".format(step)) 113 | torch.save(self.eval_net.state_dict(), eval_file_path) 114 | torch.save(self.target_net.state_dict(), target_file_path) 115 | print("[*] Model saved") 116 | 117 | def load(self, dir_path, step=0): 118 | eval_file_path = os.path.join(dir_path, "mfq_eval_{}".format(step)) 119 | target_file_path = os.path.join(dir_path, "mfq_target_{}".format(step)) 120 | 121 | self.target_net.load_state_dict(torch.load(target_file_path)) 122 | self.eval_net.load_state_dict(torch.load(eval_file_path)) 123 | print("[*] Loaded model") -------------------------------------------------------------------------------- /algo/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import os 5 | import numpy as np 6 | from moviepy.editor import ImageSequenceClip 7 | 8 | 9 | class Color: 10 | INFO = '\033[1;34m{}\033[0m' 11 | WARNING = '\033[1;33m{}\033[0m' 12 | ERROR = '\033[1;31m{}\033[0m' 13 | 14 | 15 | class Buffer: 16 | def __init__(self): 17 | pass 18 | 19 | def push(self, **kwargs): 20 | raise NotImplementedError 21 | 22 | 23 | class MetaBuffer(object): 24 | def __init__(self, shape, max_len, dtype='float32'): 25 | self.max_len = max_len 26 | self.data = np.zeros([max_len] + list(shape if isinstance(shape, tuple) else [shape])).astype(dtype) 27 | self.start = 0 28 | self.length = 0 29 | self._flag = 0 30 | 31 | def __len__(self): 32 | return self.length 33 | 34 | def __getitem__(self, idx): 35 | if idx < 0 or idx >= self.length: 36 | raise KeyError() 37 | return self.data[idx] 38 | 39 | def sample(self, idx): 40 | return self.data[idx % self.length] 41 | 42 | def pull(self): 43 | return self.data[:self.length] 44 | 45 | def append(self, value): 46 | start = 0 47 | num = len(value) 48 | 49 | if self._flag + num > self.max_len: 50 | tail = self.max_len - self._flag 51 | self.data[self._flag:] = value[:tail] 52 | num -= tail 53 | start = tail 54 | self._flag = 0 55 | 56 | self.data[self._flag:self._flag + num] = value[start:] 57 | self._flag += num 58 | self.length = min(self.length + len(value), self.max_len) 59 | 60 | def reset_new(self, start, value): 61 | self.data[start:] = value 62 | 63 | 64 | class EpisodesBufferEntry: 65 | """Entry for episode buffer""" 66 | def __init__(self): 67 | self.views = [] 68 | self.features = [] 69 | self.actions = [] 70 | self.rewards = [] 71 | self.probs = [] 72 | self.terminal = False 73 | 74 | def append(self, view, feature, action, reward, alive, probs=None): 75 | self.views.append(view.copy()) 76 | self.features.append(feature.copy()) 77 | self.actions.append(action) 78 | self.rewards.append(reward) 79 | if probs is not None: 80 | self.probs.append(probs) 81 | if not alive: 82 | self.terminal = True 83 | 84 | 85 | class EpisodesBuffer(Buffer): 86 | """Replay buffer to store a whole episode for all agents 87 | one entry for one agent 88 | """ 89 | def __init__(self, use_mean=False): 90 | super().__init__() 91 | self.buffer = {} 92 | self.use_mean = use_mean 93 | 94 | def push(self, **kwargs): 95 | view, feature = kwargs['state'] 96 | acts = kwargs['acts'] 97 | rewards = kwargs['rewards'] 98 | alives = kwargs['alives'] 99 | ids = kwargs['ids'] 100 | 101 | if self.use_mean: 102 | probs = kwargs['prob'] 103 | 104 | buffer = self.buffer 105 | index = np.random.permutation(len(view)) 106 | 107 | for i in range(len(ids)): 108 | i = index[i] 109 | entry = buffer.get(ids[i]) 110 | if entry is None: 111 | entry = EpisodesBufferEntry() 112 | buffer[ids[i]] = entry 113 | 114 | if self.use_mean: 115 | entry.append(view[i], feature[i], acts[i], rewards[i], alives[i], probs=probs[i]) 116 | else: 117 | entry.append(view[i], feature[i], acts[i], rewards[i], alives[i]) 118 | 119 | def reset(self): 120 | """ clear replay buffer """ 121 | self.buffer = {} 122 | 123 | def episodes(self): 124 | """ get episodes """ 125 | return self.buffer.values() 126 | 127 | 128 | class AgentMemory(object): 129 | def __init__(self, obs_shape, feat_shape, act_n, max_len, use_mean=False): 130 | self.obs0 = MetaBuffer(obs_shape, max_len) 131 | self.feat0 = MetaBuffer(feat_shape, max_len) 132 | self.actions = MetaBuffer((), max_len, dtype='int32') 133 | self.rewards = MetaBuffer((), max_len) 134 | self.terminals = MetaBuffer((), max_len, dtype='bool') 135 | self.use_mean = use_mean 136 | 137 | if self.use_mean: 138 | self.prob = MetaBuffer((act_n,), max_len) 139 | 140 | def append(self, obs0, feat0, act, reward, alive, prob=None): 141 | self.obs0.append(np.array([obs0])) 142 | self.feat0.append(np.array([feat0])) 143 | self.actions.append(np.array([act], dtype=np.int32)) 144 | self.rewards.append(np.array([reward])) 145 | self.terminals.append(np.array([not alive], dtype=np.bool)) 146 | 147 | if self.use_mean: 148 | self.prob.append(np.array([prob])) 149 | 150 | def pull(self): 151 | res = { 152 | 'obs0': self.obs0.pull(), 153 | 'feat0': self.feat0.pull(), 154 | 'act': self.actions.pull(), 155 | 'rewards': self.rewards.pull(), 156 | 'terminals': self.terminals.pull(), 157 | 'prob': None if not self.use_mean else self.prob.pull() 158 | } 159 | 160 | return res 161 | 162 | 163 | class MemoryGroup(object): 164 | def __init__(self, obs_shape, feat_shape, act_n, max_len, batch_size, sub_len, use_mean=False): 165 | self.agent = dict() 166 | self.max_len = max_len 167 | self.batch_size = batch_size 168 | self.obs_shape = obs_shape 169 | self.feat_shape = feat_shape 170 | self.sub_len = sub_len 171 | self.use_mean = use_mean 172 | self.act_n = act_n 173 | 174 | self.obs0 = MetaBuffer(obs_shape, max_len) 175 | self.feat0 = MetaBuffer(feat_shape, max_len) 176 | self.actions = MetaBuffer((), max_len, dtype='int32') 177 | self.rewards = MetaBuffer((), max_len) 178 | self.terminals = MetaBuffer((), max_len, dtype='bool') 179 | self.masks = MetaBuffer((), max_len, dtype='bool') 180 | if use_mean: 181 | self.prob = MetaBuffer((act_n,), max_len) 182 | self._new_add = 0 183 | 184 | def _flush(self, **kwargs): 185 | self.obs0.append(kwargs['obs0']) 186 | self.feat0.append(kwargs['feat0']) 187 | self.actions.append(kwargs['act']) 188 | self.rewards.append(kwargs['rewards']) 189 | self.terminals.append(kwargs['terminals']) 190 | 191 | if self.use_mean: 192 | self.prob.append(kwargs['prob']) 193 | 194 | mask = np.where(kwargs['terminals'] == True, False, True) 195 | mask[-1] = False 196 | self.masks.append(mask) 197 | 198 | def push(self, **kwargs): 199 | for i, _id in enumerate(kwargs['ids']): 200 | if self.agent.get(_id) is None: 201 | self.agent[_id] = AgentMemory(self.obs_shape, self.feat_shape, self.act_n, self.sub_len, use_mean=self.use_mean) 202 | if self.use_mean: 203 | self.agent[_id].append(obs0=kwargs['state'][0][i], feat0=kwargs['state'][1][i], act=kwargs['acts'][i], reward=kwargs['rewards'][i], alive=kwargs['alives'][i], prob=kwargs['prob'][i]) 204 | else: 205 | self.agent[_id].append(obs0=kwargs['state'][0][i], feat0=kwargs['state'][1][i], act=kwargs['acts'][i], reward=kwargs['rewards'][i], alive=kwargs['alives'][i]) 206 | 207 | def tight(self): 208 | ids = list(self.agent.keys()) 209 | np.random.shuffle(ids) 210 | for ele in ids: 211 | tmp = self.agent[ele].pull() 212 | self._new_add += len(tmp['obs0']) 213 | self._flush(**tmp) 214 | self.agent = dict() # clear 215 | 216 | def sample(self): 217 | idx = np.random.choice(self.nb_entries, size=self.batch_size) 218 | next_idx = (idx + 1) % self.nb_entries 219 | 220 | obs = self.obs0.sample(idx) 221 | obs_next = self.obs0.sample(next_idx) 222 | feature = self.feat0.sample(idx) 223 | feature_next = self.feat0.sample(next_idx) 224 | actions = self.actions.sample(idx) 225 | rewards = self.rewards.sample(idx) 226 | dones = self.terminals.sample(idx) 227 | masks = self.masks.sample(idx) 228 | 229 | if self.use_mean: 230 | act_prob = self.prob.sample(idx) 231 | act_next_prob = self.prob.sample(next_idx) 232 | return obs, feature, actions, act_prob, obs_next, feature_next, act_next_prob, rewards, dones, masks 233 | else: 234 | return obs, feature, obs_next, feature_next, dones, rewards, actions, masks 235 | 236 | def get_batch_num(self): 237 | print('\n[INFO] Length of buffer and new add:', len(self.obs0), self._new_add) 238 | res = self._new_add * 2 // self.batch_size 239 | self._new_add = 0 240 | return res 241 | 242 | @property 243 | def nb_entries(self): 244 | return len(self.obs0) 245 | 246 | 247 | 248 | class Runner(object): 249 | def __init__(self, env, handles, max_steps, models, 250 | play_handle, render_every=None, save_every=None, tau=None, log_name=None, log_dir=None, model_dir=None, render_dir=None, train=False, cuda=True): 251 | """Initialize runner 252 | 253 | Parameters 254 | ---------- 255 | env: magent.GridWorld 256 | environment handle 257 | handles: list 258 | group handles 259 | max_steps: int 260 | the maximum of stages in a episode 261 | render_every: int 262 | render environment interval 263 | save_every: int 264 | states the interval of evaluation for self-play update 265 | models: list 266 | contains models 267 | play_handle: method like 268 | run game 269 | tau: float 270 | tau index for self-play update 271 | log_name: str 272 | define the name of log dir 273 | log_dir: str 274 | donates the directory of logs 275 | model_dir: str 276 | donates the dircetory of models 277 | """ 278 | self.env = env 279 | self.models = models 280 | self.max_steps = max_steps 281 | self.handles = handles 282 | self.render_every = render_every 283 | self.save_every = save_every 284 | self.play = play_handle 285 | self.model_dir = model_dir 286 | self.render_dir = render_dir 287 | self.train = train 288 | self.tau = tau 289 | self.cuda = cuda 290 | 291 | os.makedirs(self.render_dir, exist_ok=True) 292 | 293 | def sp_op(self): 294 | l_vars, r_vars = self.models[0].get_all_params(), self.models[1].get_all_params() 295 | for l_var, r_var in zip(l_vars, r_vars): 296 | r_var.detach().copy_((1. - self.tau) * l_var + self.tau * r_var) 297 | 298 | def run(self, variant_eps, iteration, win_cnt=None): 299 | info = {'main': None, 'opponent': None} 300 | 301 | # pass 302 | info['main'] = {'ave_agent_reward': 0., 'total_reward': 0., 'kill': 0.} 303 | info['opponent'] = {'ave_agent_reward': 0., 'total_reward': 0., 'kill': 0.} 304 | 305 | max_nums, nums, agent_r_records, total_rewards, render_list = self.play(env=self.env, n_round=iteration, handles=self.handles, 306 | models=self.models, print_every=50, eps=variant_eps, render=(iteration + 1) % self.render_every == 0 if self.render_every > 0 else False, train=self.train, cuda=self.cuda) 307 | 308 | for i, tag in enumerate(['main', 'opponent']): 309 | info[tag]['total_reward'] = total_rewards[i] 310 | info[tag]['kill'] = max_nums[i] - nums[1 - i] 311 | info[tag]['ave_agent_reward'] = agent_r_records[i] 312 | 313 | if self.train: 314 | print('\n[INFO] {}'.format(info['main'])) 315 | 316 | # if self.save_every and (iteration + 1) % self.save_every == 0: 317 | if info['main']['total_reward'] > info['opponent']['total_reward']: 318 | print(Color.INFO.format('\n[INFO] Begin self-play Update ...')) 319 | self.sp_op() 320 | print(Color.INFO.format('[INFO] Self-play Updated!\n')) 321 | 322 | print(Color.INFO.format('[INFO] Saving model ...')) 323 | self.models[0].save(self.model_dir + '-0', iteration) 324 | self.models[1].save(self.model_dir + '-1', iteration) 325 | 326 | else: 327 | print('\n[INFO] {0} \n {1}'.format(info['main'], info['opponent'])) 328 | if info['main']['kill'] > info['opponent']['kill']: 329 | win_cnt['main'] += 1 330 | elif info['main']['kill'] < info['opponent']['kill']: 331 | win_cnt['opponent'] += 1 332 | else: 333 | win_cnt['main'] += 1 334 | win_cnt['opponent'] += 1 335 | 336 | if len(render_list) > 0: 337 | print('[*] Saving Render') 338 | clip = ImageSequenceClip(render_list, fps=20) 339 | clip.write_gif('{}/replay_{}.gif'.format(self.render_dir, iteration+1), fps=20, verbose=False) 340 | print('[*] Saved Render') 341 | 342 | -------------------------------------------------------------------------------- /replay.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deligentfool/mfrl_pytorch/c492d5f8d7f42c35a6864d6f1306752398878422/replay.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | moviepy==1.0.3 2 | numpy==1.21.2 3 | PettingZoo==1.14.0 4 | magent==0.1.14 5 | torch 6 | torchaudio 7 | torchvision 8 | -------------------------------------------------------------------------------- /senarios/senario_battle.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | def play(env, n_round, handles, models, print_every, eps=1.0, render=False, train=False, cuda=True): 10 | """play a ground and train""" 11 | env.reset() 12 | 13 | max_steps = env.unwrapped.max_cycles 14 | step_ct = 0 15 | done = False 16 | 17 | obs_list = [] 18 | if render: 19 | obs_list.append(env.render(mode='rgb_array')) 20 | 21 | n_group = len(handles) 22 | state = [None for _ in range(n_group)] 23 | acts = [None for _ in range(n_group)] 24 | ids = [None for _ in range(n_group)] 25 | 26 | alives = [None for _ in range(n_group)] 27 | rewards = [None for _ in range(n_group)] 28 | nums = [env.unwrapped.env.get_num(handle) for handle in handles] 29 | max_nums = nums.copy() 30 | 31 | loss = [None for _ in range(n_group)] 32 | eval_q = [None for _ in range(n_group)] 33 | n_action = [env.unwrapped.env.get_action_space(handles[0])[0], env.unwrapped.env.get_action_space(handles[1])[0]] 34 | 35 | print("\n\n[*] ROUND #{0}, EPS: {1:.2f} NUMBER: {2}".format(n_round, eps, nums)) 36 | mean_rewards = [[] for _ in range(n_group)] 37 | total_rewards = [[] for _ in range(n_group)] 38 | 39 | former_act_prob = [np.zeros((1, env.unwrapped.env.get_action_space(handles[0])[0])), np.zeros((1, env.unwrapped.env.get_action_space(handles[1])[0]))] 40 | 41 | while not done and step_ct < max_steps: 42 | # take actions for every model 43 | for i in range(n_group): 44 | state[i] = list(env.unwrapped.env.get_observation(handles[i])) 45 | ids[i] = env.unwrapped.env.get_agent_id(handles[i]) 46 | 47 | for i in range(n_group): 48 | former_act_prob[i] = np.tile(former_act_prob[i], (len(state[i][0]), 1)) 49 | if cuda: 50 | acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]).cuda(), feature=torch.FloatTensor(state[i][1]).cuda(), prob=torch.FloatTensor(former_act_prob[i]).cuda(), eps=eps) 51 | else: 52 | acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]), feature=torch.FloatTensor(state[i][1]), prob=torch.FloatTensor(former_act_prob[i]), eps=eps) 53 | 54 | 55 | for i in range(n_group): 56 | env.unwrapped.env.set_action(handles[i], acts[i].astype(np.int32)) 57 | 58 | # simulate one step 59 | done = env.unwrapped.env.step() 60 | 61 | for i in range(n_group): 62 | rewards[i] = env.unwrapped.env.get_reward(handles[i]) 63 | alives[i] = env.unwrapped.env.get_alive(handles[i]) 64 | 65 | buffer = { 66 | 'state': state[0], 'acts': acts[0], 'rewards': rewards[0], 67 | 'alives': alives[0], 'ids': ids[0] 68 | } 69 | 70 | buffer['prob'] = former_act_prob[0] 71 | 72 | for i in range(n_group): 73 | former_act_prob[i] = np.mean(list(map(lambda x: np.eye(n_action[i])[x], acts[i])), axis=0, keepdims=True) 74 | 75 | if train: 76 | models[0].flush_buffer(**buffer) 77 | 78 | # stat info 79 | nums = [env.unwrapped.env.get_num(handle) for handle in handles] 80 | 81 | for i in range(n_group): 82 | sum_reward = sum(rewards[i]) 83 | rewards[i] = sum_reward / nums[i] 84 | mean_rewards[i].append(rewards[i]) 85 | total_rewards[i].append(sum_reward) 86 | 87 | if render: 88 | obs_list.append(env.render(mode='rgb_array')) 89 | 90 | # clear dead agents 91 | env.unwrapped.env.clear_dead() 92 | 93 | info = {"Ave-Reward": np.round(rewards, decimals=6), "NUM": nums} 94 | 95 | step_ct += 1 96 | 97 | if step_ct % print_every == 0: 98 | print("> step #{}, info: {}".format(step_ct, info)) 99 | 100 | if train: 101 | models[0].train(cuda) 102 | 103 | for i in range(n_group): 104 | mean_rewards[i] = sum(mean_rewards[i]) / len(mean_rewards[i]) 105 | total_rewards[i] = sum(total_rewards[i]) 106 | 107 | return max_nums, nums, mean_rewards, total_rewards, obs_list 108 | 109 | 110 | def battle(env, n_round, handles, models, print_every, eps=1.0, render=False, train=False, cuda=True): 111 | """play a ground and train""" 112 | env.reset() 113 | 114 | max_steps = env.unwrapped.max_cycles 115 | step_ct = 0 116 | done = False 117 | 118 | obs_list = [] 119 | if render: 120 | obs_list.append(np.transpose(env.render(mode='rgb_array'), axes=(1, 0, 2))) 121 | 122 | n_group = len(handles) 123 | state = [None for _ in range(n_group)] 124 | acts = [None for _ in range(n_group)] 125 | ids = [None for _ in range(n_group)] 126 | 127 | alives = [None for _ in range(n_group)] 128 | rewards = [None for _ in range(n_group)] 129 | nums = [env.unwrapped.env.get_num(handle) for handle in handles] 130 | max_nums = nums.copy() 131 | 132 | n_action = [env.unwrapped.env.get_action_space(handles[0])[0], env.unwrapped.env.get_action_space(handles[1])[0]] 133 | 134 | print("\n\n[*] ROUND #{0}, EPS: {1:.2f} NUMBER: {2}".format(n_round, eps, nums)) 135 | mean_rewards = [[] for _ in range(n_group)] 136 | total_rewards = [[] for _ in range(n_group)] 137 | 138 | former_act_prob = [np.zeros((1, env.unwrapped.env.get_action_space(handles[0])[0])), np.zeros((1, env.unwrapped.env.get_action_space(handles[1])[0]))] 139 | 140 | while not done and step_ct < max_steps: 141 | # take actions for every model 142 | for i in range(n_group): 143 | state[i] = list(env.unwrapped.env.get_observation(handles[i])) 144 | ids[i] = env.unwrapped.env.get_agent_id(handles[i]) 145 | 146 | for i in range(n_group): 147 | former_act_prob[i] = np.tile(former_act_prob[i], (len(state[i][0]), 1)) 148 | if cuda: 149 | acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]).cuda(), feature=torch.FloatTensor(state[i][1]).cuda(), prob=torch.FloatTensor(former_act_prob[i]).cuda(), eps=eps) 150 | else: 151 | acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]), feature=torch.FloatTensor(state[i][1]), prob=torch.FloatTensor(former_act_prob[i]), eps=eps) 152 | 153 | for i in range(n_group): 154 | env.unwrapped.env.set_action(handles[i], acts[i].astype(np.int32)) 155 | 156 | # simulate one step 157 | done = env.unwrapped.env.step() 158 | 159 | for i in range(n_group): 160 | rewards[i] = env.unwrapped.env.get_reward(handles[i]) 161 | alives[i] = env.unwrapped.env.get_alive(handles[i]) 162 | 163 | for i in range(n_group): 164 | former_act_prob[i] = np.mean(list(map(lambda x: np.eye(n_action[i])[x], acts[i])), axis=0, keepdims=True) 165 | 166 | # stat info 167 | nums = [env.unwrapped.env.get_num(handle) for handle in handles] 168 | 169 | for i in range(n_group): 170 | sum_reward = sum(rewards[i]) 171 | rewards[i] = sum_reward / nums[i] 172 | mean_rewards[i].append(rewards[i]) 173 | total_rewards[i].append(sum_reward) 174 | 175 | if render: 176 | obs_list.append(np.transpose(env.render(mode='rgb_array'), axes=(1, 0, 2))) 177 | 178 | # clear dead agents 179 | env.unwrapped.env.clear_dead() 180 | 181 | info = {"Ave-Reward": np.round(rewards, decimals=6), "NUM": nums} 182 | 183 | step_ct += 1 184 | 185 | if step_ct % print_every == 0: 186 | print("> step #{}, info: {}".format(step_ct, info)) 187 | 188 | for i in range(n_group): 189 | mean_rewards[i] = sum(mean_rewards[i]) / len(mean_rewards[i]) 190 | total_rewards[i] = sum(total_rewards[i]) 191 | 192 | return max_nums, nums, mean_rewards, total_rewards, obs_list 193 | -------------------------------------------------------------------------------- /train_battle.py: -------------------------------------------------------------------------------- 1 | """Self Play 2 | """ 3 | 4 | import argparse 5 | import os 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import numpy as np 10 | from pettingzoo.magent import battle_v3 11 | 12 | from algo import spawn_ai 13 | from algo import tools 14 | from senarios.senario_battle import play 15 | 16 | 17 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 18 | os.makedirs('./data', exist_ok=True) 19 | 20 | def linear_decay(epoch, x, y): 21 | min_v, max_v = y[0], y[-1] 22 | start, end = x[0], x[-1] 23 | 24 | if epoch == start: 25 | return min_v 26 | 27 | eps = min_v 28 | 29 | for i, x_i in enumerate(x): 30 | if epoch <= x_i: 31 | interval = (y[i] - y[i - 1]) / (x_i - x[i - 1]) 32 | eps = interval * (epoch - x[i - 1]) + y[i - 1] 33 | break 34 | 35 | return eps 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--algo', type=str, choices={'ac', 'mfac', 'mfq', 'iql'}, help='choose an algorithm from the preset', required=True) 40 | parser.add_argument('--save_every', type=int, default=20, help='decide the self-play update interval') 41 | parser.add_argument('--update_every', type=int, default=5, help='decide the udpate interval for q-learning, optional') 42 | parser.add_argument('--n_round', type=int, default=2000, help='set the trainning round') 43 | parser.add_argument('--render', action='store_true', help='render or not (if true, will render every save)') 44 | parser.add_argument('--map_size', type=int, default=40, help='set the size of map') # then the amount of agents is 64 45 | parser.add_argument('--max_steps', type=int, default=400, help='set the max steps') 46 | parser.add_argument('--cuda', type=bool, default=True, help='use the cuda') 47 | 48 | args = parser.parse_args() 49 | 50 | # Initialize the environment 51 | env = battle_v3.env( 52 | map_size=args.map_size, 53 | minimap_mode=True, 54 | step_reward=-0.005, 55 | dead_penalty=-0.1, 56 | attack_penalty=-0.1, 57 | attack_opponent_reward=0.2, 58 | max_cycles=args.max_steps, 59 | extra_features=True 60 | ) 61 | handles = env.unwrapped.env.get_handles() 62 | 63 | log_dir = os.path.join(BASE_DIR, 'data/tmp/{}'.format(args.algo)) 64 | render_dir = os.path.join(BASE_DIR, 'data/render/{}'.format(args.algo)) 65 | model_dir = os.path.join(BASE_DIR, 'data/models/{}'.format(args.algo)) 66 | 67 | start_from = 0 68 | 69 | models = [spawn_ai(args.algo, env, handles[0], args.algo + '-me', args.max_steps, args.cuda), spawn_ai(args.algo, env, handles[1], args.algo + '-opponent', args.max_steps, args.cuda)] 70 | runner = tools.Runner(env, handles, args.max_steps, models, play, 71 | render_every=args.save_every if args.render else 0, save_every=args.save_every, tau=0.01, log_name=args.algo, 72 | log_dir=log_dir, model_dir=model_dir, render_dir=render_dir, train=True, cuda=args.cuda) 73 | 74 | for k in range(start_from, start_from + args.n_round): 75 | eps = linear_decay(k, [0, int(args.n_round * 0.8), args.n_round], [1, 0.2, 0.1]) 76 | runner.run(eps, k) 77 | --------------------------------------------------------------------------------