├── .gitignore
├── README.md
├── algo
    ├── __init__.py
    ├── ac.py
    ├── base.py
    ├── q_learning.py
    └── tools.py
├── replay.gif
├── requirements.txt
├── senarios
    └── senario_battle.py
└── train_battle.py


/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__
2 | /data


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pytorch Version for Mean Field Multi-Agent Reinforcement Learning
 2 | Pytorch implementation of MF-Q and MF-AC in the paper [Mean Field Multi-Agent Reinforcement Learning](https://arxiv.org/pdf/1802.05438.pdf).
 3 | 
 4 | The original code can be found in [mlii/mfrl](https://github.com/mlii/mfrl).
 5 | 
 6 | Please uncomment the following two lines of code in `base.py` if the algorithm occasionally fails to converge.
 7 | ```python
 8 |   #distribution = torch.distributions.Categorical(predict) 
 9 |   #actions = distribution.sample().detach().cpu().numpy()
10 | ```
11 | 
12 | ## Example
13 | ![image](https://github.com/deligentfool/mfrl_pytorch/blob/master/replay.gif)
14 | 


--------------------------------------------------------------------------------
/algo/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import q_learning
 2 | from . import ac
 3 | 
 4 | IQL = q_learning.DQN
 5 | MFQ = q_learning.MFQ
 6 | AC = ac.ActorCritic
 7 | MFAC = ac.MFAC
 8 | 
 9 | 
10 | def spawn_ai(algo_name, env, handle, human_name, max_steps, cuda=True):
11 |     if algo_name == 'mfq':
12 |         model = MFQ(env, human_name, handle, max_steps, memory_size=80000)
13 |     elif algo_name == 'iql':
14 |         model = IQL(env, human_name, handle, max_steps, memory_size=80000)
15 |     elif algo_name == 'ac':
16 |         model = AC(env, human_name, handle, use_cuda=cuda)
17 |     elif algo_name == 'mfac':
18 |         model = MFAC(env, human_name, handle, use_cuda=cuda)
19 |     if cuda:
20 |         model = model.cuda()
21 |     return model


--------------------------------------------------------------------------------
/algo/ac.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | import numpy as np
  6 | 
  7 | from . import tools
  8 | 
  9 | 
 10 | class ActorCritic(nn.Module):
 11 |     def __init__(self, env, name, handle, value_coef=0.1, ent_coef=0.08, gamma=0.95, batch_size=64, learning_rate=1e-4, use_cuda=False):
 12 |         super(ActorCritic, self).__init__()
 13 |         
 14 |         self.env = env
 15 |         self.name = name
 16 |         self.view_space = env.unwrapped.env.get_view_space(handle)
 17 |         assert len(self.view_space) == 3
 18 |         self.feature_space = env.unwrapped.env.get_feature_space(handle)[0]
 19 |         self.num_actions = env.unwrapped.env.get_action_space(handle)[0]
 20 |         self.gamma = gamma
 21 |         
 22 |         self.batch_size = batch_size
 23 |         self.learning_rate = learning_rate
 24 | 
 25 |         self.value_coef = value_coef  # coefficient of value in the total loss
 26 |         self.ent_coef = ent_coef  # coefficient of entropy in the total loss
 27 |         
 28 |         # init training buffers
 29 |         self.view_buf = np.empty([1,] + list(self.view_space))
 30 |         self.feature_buf = np.empty([1,] + [self.feature_space])
 31 |         self.action_buf = np.empty(1, dtype=np.int32)
 32 |         self.reward_buf = np.empty(1, dtype=np.float32)
 33 |         self.replay_buffer = tools.EpisodesBuffer()
 34 |         
 35 |         self.net = self._construct_net()
 36 |         self.optim = torch.optim.Adam(lr=self.learning_rate, params=self.get_all_params())
 37 |         self.use_cuda = use_cuda
 38 |         
 39 |     def get_all_params(self):
 40 |         params = []
 41 |         for k, v in self.net.items():
 42 |             params += list(v.parameters())
 43 |         return params
 44 |         
 45 |     def _construct_net(self):
 46 |         temp_dict = nn.ModuleDict()
 47 |         temp_dict['obs_linear'] = nn.Linear(np.prod(self.view_space), 256)
 48 |         temp_dict['emb_linear'] = nn.Linear(self.feature_space, 256)
 49 |         temp_dict['cat_linear'] = nn.Linear(256 * 2, 256 * 2)
 50 |         temp_dict['policy_linear'] = nn.Linear(256 * 2, self.num_actions)
 51 |         temp_dict['value_linear'] = nn.Linear(256 * 2, 1)
 52 |         return temp_dict
 53 |     
 54 |     def _calc_value(self, **kwargs):
 55 |         if self.use_cuda:
 56 |             obs = torch.FloatTensor(kwargs['obs']).cuda().unsqueeze(0)
 57 |             feature = torch.FloatTensor(kwargs['feature']).cuda().unsqueeze(0)
 58 |         else:
 59 |             obs = torch.FloatTensor(kwargs['obs']).unsqueeze(0)
 60 |             feature = torch.FloatTensor(kwargs['feature']).unsqueeze(0)
 61 |         flatten_view = obs.reshape(obs.size()[0], -1)
 62 |         h_view = F.relu(self.net['obs_linear'](flatten_view))
 63 |         h_emb = F.relu(self.net['emb_linear'](feature))
 64 |         dense = torch.cat([h_view, h_emb], dim=-1)
 65 |         dense = F.relu(self.net['cat_linear'](dense))
 66 |         value = self.net['value_linear'](dense)
 67 |         value = value.flatten()
 68 |         return value.detach().cpu().numpy()
 69 |     
 70 |     def train(self, cuda):
 71 |         # calc buffer size
 72 |         n = 0
 73 |         # batch_data = sample_buffer.episodes()
 74 |         batch_data = self.replay_buffer.episodes()
 75 |         self.replay_buffer = tools.EpisodesBuffer()
 76 | 
 77 |         for episode in batch_data:
 78 |             n += len(episode.rewards)
 79 | 
 80 |         self.view_buf.resize([n,] + list(self.view_space), refcheck=False)
 81 |         self.feature_buf.resize([n,] + [self.feature_space], refcheck=False)
 82 |         self.action_buf.resize(n, refcheck=False)
 83 |         self.reward_buf.resize(n, refcheck=False)
 84 |         view, feature = self.view_buf, self.feature_buf
 85 |         action, reward = self.action_buf, self.reward_buf
 86 |         
 87 |         ct = 0
 88 |         gamma = self.gamma
 89 |         # collect episodes from multiple separate buffers to a continuous buffer
 90 |         for episode in batch_data:
 91 |             v, f, a, r = episode.views, episode.features, episode.actions, episode.rewards
 92 |             m = len(episode.rewards)
 93 | 
 94 |             r = np.array(r)
 95 | 
 96 |             keep = self._calc_value(obs=v[-1], feature=f[-1])
 97 | 
 98 |             for i in reversed(range(m)):
 99 |                 keep = keep * gamma + r[i]
100 |                 r[i] = keep
101 | 
102 |             view[ct:ct + m] = v
103 |             feature[ct:ct + m] = f
104 |             action[ct:ct + m] = a
105 |             reward[ct:ct + m] = r
106 |             ct += m
107 | 
108 |         assert n == ct
109 |         
110 |         if self.use_cuda:
111 |             view = torch.FloatTensor(view).cuda()
112 |             feature = torch.FloatTensor(feature).cuda()
113 |             action = torch.LongTensor(action).cuda()
114 |             reward = torch.FloatTensor(reward).cuda()
115 |             action_mask = torch.zeros([action.size(0), self.num_actions]).cuda().scatter_(1, action.unsqueeze(-1), 1).float()
116 |         else:
117 |             view = torch.FloatTensor(view)
118 |             feature = torch.FloatTensor(feature)
119 |             action = torch.LongTensor(action)
120 |             reward = torch.FloatTensor(reward)
121 |             action_mask = torch.zeros([action.size(0), self.num_actions]).scatter_(1, action.unsqueeze(-1), 1).float()
122 | 
123 |         # train
124 |         flatten_view = view.flatten(1)
125 |         h_view = F.relu(self.net['obs_linear'](flatten_view))
126 |         h_emb = F.relu(self.net['emb_linear'](feature))
127 |         dense = torch.cat([h_view, h_emb], dim=-1)
128 |         dense = F.relu(self.net['cat_linear'](dense))
129 |         policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1)
130 |         policy = torch.clamp(policy, 1e-10, 1-1e-10)
131 |         value = self.net['value_linear'](dense)
132 |         value = value.flatten()
133 |         
134 |         advantage = (reward - value).detach()
135 |         log_policy = (policy + 1e-6).log()
136 |         log_prob = (log_policy * action_mask).sum(1)
137 |         
138 |         pg_loss = -(advantage * log_prob).mean()
139 |         vf_loss = self.value_coef * (reward - value).pow(2).mean()
140 |         neg_entropy = self.ent_coef * (policy * log_policy).sum(1).mean()
141 |         total_loss = pg_loss + vf_loss + neg_entropy
142 |         
143 |         # train op (clip gradient)
144 |         self.optim.zero_grad()
145 |         total_loss.backward()
146 |         grad_norm = torch.nn.utils.clip_grad_norm_(self.get_all_params(), 5.0)
147 |         self.optim.step()
148 |         
149 |         print('[*] PG_LOSS:', np.round(pg_loss.detach().cpu().item(), 6), '/ VF_LOSS:', np.round(vf_loss.detach().cpu().item(), 6), '/ ENT_LOSS:', np.round(neg_entropy.detach().cpu().item()), '/ Value:', np.mean(value.detach().cpu().numpy()))
150 |         
151 |     def act(self, **kwargs):
152 |         flatten_view = kwargs['obs'].reshape(kwargs['obs'].size()[0], -1)
153 |         h_view = F.relu(self.net['obs_linear'](flatten_view))
154 |         h_emb = F.relu(self.net['emb_linear'](kwargs['feature']))
155 |         dense = torch.cat([h_view, h_emb], dim=-1)
156 |         dense = F.relu(self.net['cat_linear'](dense))
157 |         policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1)
158 |         policy = torch.clamp(policy, 1e-10, 1-1e-10)
159 |         distribution = torch.distributions.Categorical(policy)
160 |         action = distribution.sample().detach().cpu().numpy()
161 |         return action.astype(np.int32).reshape((-1,))
162 |     
163 |     def flush_buffer(self, **kwargs):
164 |         self.replay_buffer.push(**kwargs)
165 |         
166 |     def save(self, dir_path, step=0):
167 |         os.makedirs(dir_path, exist_ok=True)
168 |         file_path = os.path.join(dir_path, "ac_{}".format(step))
169 |         torch.save(self.net.state_dict(), file_path)
170 |         print("[*] Model saved")
171 |         
172 |     def load(self, dir_path, step=0):
173 |         file_path = os.path.join(dir_path, "ac_{}".format(step))
174 | 
175 |         self.net.load_state_dict(torch.load(file_path))
176 |         print("[*] Loaded model")
177 |         
178 |         
179 |         
180 | class MFAC(nn.Module):
181 |     def __init__(self, env, name, handle, value_coef=0.1, ent_coef=0.08, gamma=0.95, batch_size=64, learning_rate=1e-4, use_cuda=False):
182 |         super(MFAC, self).__init__()
183 |         
184 |         self.env = env
185 |         self.name = name
186 |         self.view_space = env.unwrapped.env.get_view_space(handle)
187 |         assert len(self.view_space) == 3
188 |         self.feature_space = env.unwrapped.env.get_feature_space(handle)[0]
189 |         self.num_actions = env.unwrapped.env.get_action_space(handle)[0]
190 |         self.gamma = gamma
191 |         
192 |         self.batch_size = batch_size
193 |         self.learning_rate = learning_rate
194 | 
195 |         self.value_coef = value_coef  # coefficient of value in the total loss
196 |         self.ent_coef = ent_coef  # coefficient of entropy in the total loss
197 |         
198 |         # init training buffers
199 |         self.view_buf = np.empty([1,] + list(self.view_space))
200 |         self.feature_buf = np.empty([1,] + [self.feature_space])
201 |         self.action_buf = np.empty(1, dtype=np.int32)
202 |         self.reward_buf = np.empty(1, dtype=np.float32)
203 |         self.replay_buffer = tools.EpisodesBuffer(use_mean=True)
204 |         
205 |         self.net = self._construct_net()
206 |         self.optim = torch.optim.Adam(lr=self.learning_rate, params=self.get_all_params())
207 |         self.use_cuda = use_cuda
208 |         
209 |     def get_all_params(self):
210 |         params = []
211 |         for k, v in self.net.items():
212 |             params += list(v.parameters())
213 |         return params
214 |         
215 |     def _construct_net(self):
216 |         temp_dict = nn.ModuleDict()
217 |         temp_dict['obs_linear'] = nn.Linear(np.prod(self.view_space), 256)
218 |         temp_dict['emb_linear'] = nn.Linear(self.feature_space, 256)
219 |         # * use the action_prob
220 |         temp_dict['action_linear_1'] = nn.Linear(self.num_actions, 64)
221 |         temp_dict['action_linear_2'] = nn.Linear(64, 32)
222 |         temp_dict['act_obs_emb_linear'] = nn.Linear(32 + 256 * 2, 256)
223 |         temp_dict['value_linear'] = nn.Linear(256, 1)
224 | 
225 |         temp_dict['cat_linear'] = nn.Linear(256 * 2, 256 * 2)
226 |         temp_dict['policy_linear'] = nn.Linear(256 * 2, self.num_actions)
227 |         return temp_dict
228 |     
229 |     def _calc_value(self, **kwargs):
230 |         if self.use_cuda:
231 |             obs = torch.FloatTensor(kwargs['obs']).cuda().unsqueeze(0)
232 |             feature = torch.FloatTensor(kwargs['feature']).cuda().unsqueeze(0)
233 |             input_act_prob = torch.FloatTensor(kwargs['prob']).cuda().unsqueeze(0)
234 |         else:
235 |             obs = torch.FloatTensor(kwargs['obs']).unsqueeze(0)
236 |             feature = torch.FloatTensor(kwargs['feature']).unsqueeze(0)
237 |             input_act_prob = torch.FloatTensor(kwargs['prob']).unsqueeze(0)
238 |         flatten_view = obs.flatten(1)
239 |         h_view = F.relu(self.net['obs_linear'](flatten_view))
240 |         h_emb = F.relu(self.net['emb_linear'](feature))
241 |         cat_layer = torch.cat([h_view, h_emb], dim=-1)
242 |         action_dense = F.relu(self.net['action_linear_1'](input_act_prob))
243 |         action_dense = F.relu(self.net['action_linear_2'](action_dense))
244 |         cat_act_obs_emb = torch.cat([action_dense, cat_layer], dim=-1)
245 |         dense_act_obs_emb = F.relu(self.net['act_obs_emb_linear'](cat_act_obs_emb))
246 |         value = self.net['value_linear'](dense_act_obs_emb)
247 |         value = value.flatten()
248 |         return value.detach().cpu().numpy()
249 |     
250 |     def train(self, cuda):
251 |         # calc buffer size
252 |         n = 0
253 |         # batch_data = sample_buffer.episodes()
254 |         batch_data = self.replay_buffer.episodes()
255 |         self.replay_buffer = tools.EpisodesBuffer(use_mean=True)
256 | 
257 |         for episode in batch_data:
258 |             n += len(episode.rewards)
259 | 
260 |         self.view_buf.resize([n,] + list(self.view_space), refcheck=False)
261 |         self.feature_buf.resize([n,] + [self.feature_space], refcheck=False)
262 |         self.action_buf.resize(n, refcheck=False)
263 |         self.reward_buf.resize(n, refcheck=False)
264 |         view, feature = self.view_buf, self.feature_buf
265 |         action, reward = self.action_buf, self.reward_buf
266 |         act_prob_buff = np.zeros((n, self.num_actions), dtype=np.float32)
267 |         
268 |         ct = 0
269 |         gamma = self.gamma
270 |         # collect episodes from multiple separate buffers to a continuous buffer
271 |         for episode in batch_data:
272 |             v, f, a, r, prob = episode.views, episode.features, episode.actions, episode.rewards, episode.probs
273 |             m = len(episode.rewards)
274 |             
275 |             assert len(prob) > 0
276 | 
277 |             r = np.array(r)
278 | 
279 |             keep = self._calc_value(obs=v[-1], feature=f[-1], prob=prob[-1])
280 | 
281 |             for i in reversed(range(m)):
282 |                 keep = keep * gamma + r[i]
283 |                 r[i] = keep
284 | 
285 |             view[ct:ct + m] = v
286 |             feature[ct:ct + m] = f
287 |             action[ct:ct + m] = a
288 |             reward[ct:ct + m] = r
289 |             act_prob_buff[ct:ct + m] = prob
290 |             ct += m
291 | 
292 |         assert n == ct
293 |         
294 |         if self.use_cuda:
295 |             view = torch.FloatTensor(view).cuda()
296 |             feature = torch.FloatTensor(feature).cuda()
297 |             action = torch.LongTensor(action).cuda()
298 |             reward = torch.FloatTensor(reward).cuda()
299 |             act_prob_buff = torch.FloatTensor(act_prob_buff).cuda()
300 |             action_mask = torch.zeros([action.size(0), self.num_actions]).cuda().scatter_(1, action.unsqueeze(-1), 1).float()
301 |         else:
302 |             view = torch.FloatTensor(view)
303 |             feature = torch.FloatTensor(feature)
304 |             action = torch.LongTensor(action)
305 |             reward = torch.FloatTensor(reward)
306 |             act_prob_buff = torch.FloatTensor(act_prob_buff)
307 |             action_mask = torch.zeros([action.size(0), self.num_actions]).scatter_(1, action.unsqueeze(-1), 1).float()
308 | 
309 |         # train
310 |         flatten_view = view.flatten(1)
311 |         h_view = F.relu(self.net['obs_linear'](flatten_view))
312 |         h_emb = F.relu(self.net['emb_linear'](feature))
313 |         cat_layer = torch.cat([h_view, h_emb], dim=-1)
314 |         dense = F.relu(self.net['cat_linear'](cat_layer))
315 |         policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1)
316 |         policy = torch.clamp(policy, 1e-10, 1-1e-10)
317 |         action_dense = F.relu(self.net['action_linear_1'](act_prob_buff))
318 |         action_dense = F.relu(self.net['action_linear_2'](action_dense))
319 |         cat_act_obs_emb = torch.cat([action_dense, cat_layer], dim=-1)
320 |         dense_act_obs_emb = F.relu(self.net['act_obs_emb_linear'](cat_act_obs_emb))
321 |         value = self.net['value_linear'](dense_act_obs_emb)
322 |         value = value.flatten()
323 |         
324 |         advantage = (reward - value).detach()
325 |         log_policy = (policy + 1e-6).log()
326 |         log_prob = (log_policy * action_mask).sum(1)
327 |         
328 |         pg_loss = -(advantage * log_prob).mean()
329 |         vf_loss = self.value_coef * (reward - value).pow(2).mean()
330 |         neg_entropy = self.ent_coef * (policy * log_policy).sum(1).mean()
331 |         total_loss = pg_loss + vf_loss + neg_entropy
332 |         
333 |         # train op (clip gradient)
334 |         self.optim.zero_grad()
335 |         total_loss.backward()
336 |         grad_norm = torch.nn.utils.clip_grad_norm_(self.get_all_params(), 5.0)
337 |         self.optim.step()
338 |         
339 |         print('[*] PG_LOSS:', np.round(pg_loss.detach().cpu().item(), 6), '/ VF_LOSS:', np.round(vf_loss.detach().cpu().item(), 6), '/ ENT_LOSS:', np.round(neg_entropy.detach().cpu().item()), '/ Value:', np.mean(value.detach().cpu().numpy()))
340 |         
341 |     def act(self, **kwargs):
342 |         flatten_view = kwargs['obs'].reshape(kwargs['obs'].size()[0], -1)
343 |         h_view = F.relu(self.net['obs_linear'](flatten_view))
344 |         h_emb = F.relu(self.net['emb_linear'](kwargs['feature']))
345 |         cat_layer = torch.cat([h_view, h_emb], dim=-1)
346 |         dense = F.relu(self.net['cat_linear'](cat_layer))
347 |         policy = F.softmax(self.net['policy_linear'](dense / 0.1), dim=-1)
348 |         policy = torch.clamp(policy, 1e-10, 1-1e-10)
349 |         distribution = torch.distributions.Categorical(policy)
350 |         action = distribution.sample().detach().cpu().numpy()
351 |         return action.astype(np.int32).reshape((-1,))
352 |     
353 |     def flush_buffer(self, **kwargs):
354 |         self.replay_buffer.push(**kwargs)
355 |         
356 |     def save(self, dir_path, step=0):
357 |         os.makedirs(dir_path, exist_ok=True)
358 |         file_path = os.path.join(dir_path, "mfac_{}".format(step))
359 |         torch.save(self.net.state_dict(), file_path)
360 |         print("[*] Model saved")
361 |         
362 |     def load(self, dir_path, step=0):
363 |         file_path = os.path.join(dir_path, "mfac_{}".format(step))
364 | 
365 |         self.net.load_state_dict(torch.load(file_path))
366 |         print("[*] Loaded model")
367 | 


--------------------------------------------------------------------------------
/algo/base.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import numpy as np
  5 | 
  6 | 
  7 | class ValueNet(nn.Module):
  8 |     def __init__(self, env, name, handle, update_every=5, use_mf=False, learning_rate=1e-4, tau=0.005, gamma=0.95):
  9 |         super(ValueNet, self).__init__()
 10 |         self.env = env
 11 |         self.name = name
 12 |         self._saver = None
 13 |         
 14 |         self.view_space = env.unwrapped.env.get_view_space(handle)
 15 |         assert len(self.view_space) == 3
 16 |         self.feature_space = env.unwrapped.env.get_feature_space(handle)[0]
 17 |         self.num_actions = env.unwrapped.env.get_action_space(handle)[0]
 18 |         
 19 |         self.update_every = update_every
 20 |         self.use_mf = use_mf  # trigger of using mean field
 21 |         self.temperature = 0.1
 22 |         
 23 |         self.lr= learning_rate
 24 |         self.tau = tau
 25 |         self.gamma = gamma
 26 |         
 27 |         self.eval_net = self._construct_net()
 28 |         self.target_net = self._construct_net()
 29 |         
 30 |         self.optim = torch.optim.Adam(lr=self.lr, params=self.get_params(self.eval_net))
 31 |         
 32 |     def _construct_net(self):
 33 |         temp_dict = nn.ModuleDict()
 34 |         temp_dict['conv1'] = nn.Conv2d(in_channels=self.view_space[2], out_channels=32, kernel_size=3)
 35 |         temp_dict['conv2'] = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3)
 36 |         temp_dict['obs_linear'] = nn.Linear(self.get_flatten_dim(temp_dict), 256)
 37 |         temp_dict['emb_linear'] = nn.Linear(self.feature_space, 32)
 38 |         if self.use_mf:
 39 |             temp_dict['prob_emb_linear'] = nn.Sequential(
 40 |                 nn.Linear(self.num_actions, 64),
 41 |                 nn.ReLU(),
 42 |                 nn.Linear(64, 32)
 43 |             )
 44 |         temp_dict['final_linear'] = nn.Sequential(
 45 |             nn.Linear(320 if self.use_mf else 288, 128),
 46 |             nn.ReLU(),
 47 |             nn.Linear(128, 64),
 48 |             nn.ReLU(),
 49 |             nn.Linear(64, self.num_actions)
 50 |         )
 51 |         return temp_dict
 52 |         
 53 |     def get_flatten_dim(self, dict):
 54 |         return dict['conv2'](dict['conv1'](torch.zeros(1, self.view_space[2], self.view_space[0], self.view_space[1]))).flatten().size()[0]
 55 |     
 56 |     def get_params(self, dict):
 57 |         params = []
 58 |         for k, v in dict.items():
 59 |             params += list(v.parameters())
 60 |         return params
 61 |     
 62 |     def get_all_params(self):
 63 |         params = []
 64 |         eval_params = self.get_params(self.eval_net)
 65 |         target_params = self.get_params(self.target_net)
 66 |         params += eval_params
 67 |         params += target_params
 68 |         return params
 69 |     
 70 |     def calc_target_q(self, obs, feature, dones, rewards, prob=None):
 71 |         t_h = F.relu(self.target_net['conv2'](F.relu(self.target_net['conv1'](obs)))).flatten(start_dim=1)
 72 |         t_h = torch.cat([self.target_net['obs_linear'](t_h), self.target_net['emb_linear'](feature)], -1)
 73 |         if self.use_mf:
 74 |             t_h = torch.cat([t_h, self.target_net['prob_emb_linear'](prob)], -1)
 75 |         t_q = self.target_net['final_linear'](t_h)
 76 | 
 77 |         e_h = F.relu(self.eval_net['conv2'](F.relu(self.eval_net['conv1'](obs)))).flatten(start_dim=1)
 78 |         e_h = torch.cat([self.eval_net['obs_linear'](e_h), self.eval_net['emb_linear'](feature)], -1)
 79 |         if self.use_mf:
 80 |             e_h = torch.cat([e_h, self.eval_net['prob_emb_linear'](prob)], -1)
 81 |         e_q = self.eval_net['final_linear'](e_h)
 82 |         
 83 |         act_idx = e_q.max(1)[1]
 84 |         q_values = torch.gather(t_q, 1, act_idx.unsqueeze(-1))
 85 |         target_q_value = rewards + (1. - dones) * q_values.reshape(-1) * self.gamma
 86 |         return target_q_value
 87 |     
 88 |     def update(self):
 89 |         for k, v in self.target_net.items():
 90 |             for param, target_param in zip(self.eval_net[k].parameters(), self.target_net[k].parameters()):
 91 |                 target_param.detach().copy_(self.tau * param.detach() + (1. - self.tau) * target_param.detach())
 92 |                 
 93 |     def act(self, obs, feature, prob=None, eps=None):
 94 |         if eps is not None:
 95 |             self.temperature = eps
 96 |             
 97 |         e_h = F.relu(self.eval_net['conv2'](F.relu(self.eval_net['conv1'](obs)))).flatten(start_dim=1)
 98 |         e_h = torch.cat([self.eval_net['obs_linear'](e_h), self.eval_net['emb_linear'](feature)], -1)
 99 |         if self.use_mf:
100 |             e_h = torch.cat([e_h, self.eval_net['prob_emb_linear'](prob)], -1)
101 |         e_q = self.eval_net['final_linear'](e_h)
102 |         predict = F.softmax(e_q / self.temperature, dim=-1)
103 |         #distribution = torch.distributions.Categorical(predict)
104 |         #actions = distribution.sample().detach().cpu().numpy()
105 |         actions = predict.max(1)[1].detach().cpu().numpy()
106 |         return actions
107 |     
108 |     def train(self, obs, feature, target_q, acts, prob=None, mask=None):
109 |         e_h = F.relu(self.eval_net['conv2'](F.relu(self.eval_net['conv1'](obs)))).flatten(start_dim=1)
110 |         e_h = torch.cat([self.eval_net['obs_linear'](e_h), self.eval_net['emb_linear'](feature)], -1)
111 |         if self.use_mf:
112 |             e_h = torch.cat([e_h, self.eval_net['prob_emb_linear'](prob)], -1)
113 |         e_q = self.eval_net['final_linear'](e_h)
114 |         
115 |         e_q = torch.gather(e_q, 1, acts.unsqueeze(-1)).squeeze()
116 |         if mask is not None:
117 |             loss = ((e_q - target_q.detach()).pow(2) * mask).sum() / mask.sum()
118 |         else:
119 |             loss = (e_q - target_q.detach()).pow(2).mean()
120 | 
121 |         self.optim.zero_grad()
122 |         loss.backward()
123 |         self.optim.step()
124 |         return loss.item(), {'Eval-Q': np.round(np.mean(e_q.detach().cpu().numpy()), 6), 'Target-Q': np.round(np.mean(target_q.detach().cpu().numpy()), 6)}
125 |         


--------------------------------------------------------------------------------
/algo/q_learning.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | import numpy as np
  6 | 
  7 | from . import base
  8 | from . import tools
  9 | 
 10 | 
 11 | class DQN(base.ValueNet):
 12 |     def __init__(self, env, name, handle, sub_len, memory_size=2**10, batch_size=64, update_every=5, use_mf=False, learning_rate=0.0001, tau=0.005, gamma=0.95):
 13 |         super().__init__(env, name, handle, update_every=update_every, use_mf=use_mf, learning_rate=learning_rate, tau=tau, gamma=gamma)
 14 |         
 15 |         self.replay_buffer = tools.MemoryGroup(self.view_space, self.feature_space, self.num_actions, memory_size, batch_size, sub_len)
 16 |         
 17 |     def flush_buffer(self, **kwargs):
 18 |         self.replay_buffer.push(**kwargs)
 19 |         
 20 |     def train(self, cuda):
 21 |         self.replay_buffer.tight()
 22 |         batch_num = self.replay_buffer.get_batch_num()
 23 | 
 24 |         for i in range(batch_num):
 25 |             obs, feat, obs_next, feat_next, dones, rewards, acts, masks = self.replay_buffer.sample()
 26 |             
 27 |             obs = torch.FloatTensor(obs).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs).permute([0, 3, 1, 2])
 28 |             obs_next = torch.FloatTensor(obs_next).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs_next).permute([0, 3, 1, 2])
 29 |             feat = torch.FloatTensor(feat).cuda() if cuda else torch.FloatTensor(feat)
 30 |             feat_next = torch.FloatTensor(feat_next).cuda() if cuda else torch.FloatTensor(feat_next)
 31 |             acts = torch.LongTensor(acts).cuda() if cuda else torch.LongTensor(acts)
 32 |             rewards = torch.FloatTensor(rewards).cuda() if cuda else torch.FloatTensor(rewards)
 33 |             dones = torch.FloatTensor(dones).cuda() if cuda else torch.FloatTensor(dones)
 34 |             masks = torch.FloatTensor(masks).cuda() if cuda else torch.FloatTensor(masks)
 35 |             
 36 |             target_q = self.calc_target_q(obs=obs_next, feature=feat_next, rewards=rewards, dones=dones)
 37 |             loss, q = super().train(obs=obs, feature=feat, target_q=target_q, acts=acts, mask=masks)
 38 |             
 39 |             self.update()
 40 | 
 41 |             if i % 50 == 0:
 42 |                 print('[*] LOSS:', loss, '/ Q:', q)
 43 |     
 44 |     def save(self, dir_path, step=0):
 45 |         os.makedirs(dir_path, exist_ok=True)
 46 |         eval_file_path = os.path.join(dir_path, "dqn_eval_{}".format(step))
 47 |         target_file_path = os.path.join(dir_path, "dqn_target_{}".format(step))
 48 |         torch.save(self.eval_net.state_dict(), eval_file_path)
 49 |         torch.save(self.target_net.state_dict(), target_file_path)
 50 |         print("[*] Model saved")
 51 |         
 52 |     def load(self, dir_path, step=0):
 53 |         eval_file_path = os.path.join(dir_path, "dqn_eval_{}".format(step))
 54 |         target_file_path = os.path.join(dir_path, "dqn_target_{}".format(step))
 55 | 
 56 |         self.target_net.load_state_dict(torch.load(target_file_path))
 57 |         self.eval_net.load_state_dict(torch.load(eval_file_path))
 58 |         print("[*] Loaded model")
 59 |         
 60 |         
 61 | 
 62 | class MFQ(base.ValueNet):
 63 |     def __init__(self, env, name, handle, sub_len, eps=1.0, memory_size=2**10, batch_size=64, update_every=5, use_mf=True, learning_rate=0.0001, tau=0.005, gamma=0.95):
 64 |         super().__init__(env, name, handle, update_every=update_every, use_mf=use_mf, learning_rate=learning_rate, tau=tau, gamma=gamma)
 65 |         
 66 |         config = {
 67 |             'max_len': memory_size,
 68 |             'batch_size': batch_size,
 69 |             'obs_shape': self.view_space,
 70 |             'feat_shape': self.feature_space,
 71 |             'act_n': self.num_actions,
 72 |             'use_mean': True,
 73 |             'sub_len': sub_len
 74 |         }
 75 | 
 76 |         self.train_ct = 0
 77 |         self.replay_buffer = tools.MemoryGroup(**config)
 78 |         self.update_every = update_every
 79 |         
 80 |     def flush_buffer(self, **kwargs):
 81 |         self.replay_buffer.push(**kwargs)
 82 | 
 83 |     def train(self, cuda):
 84 |         self.replay_buffer.tight()
 85 |         batch_name = self.replay_buffer.get_batch_num()
 86 | 
 87 |         for i in range(batch_name):
 88 |             obs, feat, acts, act_prob, obs_next, feat_next, act_prob_next, rewards, dones, masks = self.replay_buffer.sample()
 89 |             
 90 |             obs = torch.FloatTensor(obs).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs).permute([0, 3, 1, 2])
 91 |             obs_next = torch.FloatTensor(obs_next).permute([0, 3, 1, 2]).cuda() if cuda else torch.FloatTensor(obs_next).permute([0, 3, 1, 2])
 92 |             feat = torch.FloatTensor(feat).cuda() if cuda else torch.FloatTensor(feat)
 93 |             feat_next = torch.FloatTensor(feat_next).cuda() if cuda else torch.FloatTensor(feat_next)
 94 |             acts = torch.LongTensor(acts).cuda() if cuda else torch.LongTensor(acts)
 95 |             act_prob = torch.FloatTensor(act_prob).cuda() if cuda else torch.FloatTensor(act_prob)
 96 |             act_prob_next = torch.FloatTensor(act_prob_next).cuda() if cuda else torch.FloatTensor(act_prob_next)
 97 |             rewards = torch.FloatTensor(rewards).cuda() if cuda else torch.FloatTensor(rewards)
 98 |             dones = torch.FloatTensor(dones).cuda() if cuda else torch.FloatTensor(dones)
 99 |             masks = torch.FloatTensor(masks).cuda() if cuda else torch.FloatTensor(masks)
100 |             
101 |             target_q = self.calc_target_q(obs=obs_next, feature=feat_next, rewards=rewards, dones=dones, prob=act_prob_next)
102 |             loss, q = super().train(obs=obs, feature=feat, target_q=target_q, prob=act_prob, acts=acts, mask=masks)
103 | 
104 |             self.update()
105 | 
106 |             if i % 50 == 0:
107 |                 print('[*] LOSS:', loss, '/ Q:', q)
108 | 
109 |     def save(self, dir_path, step=0):
110 |         os.makedirs(dir_path, exist_ok=True)
111 |         eval_file_path = os.path.join(dir_path, "mfq_eval_{}".format(step))
112 |         target_file_path = os.path.join(dir_path, "mfq_target_{}".format(step))
113 |         torch.save(self.eval_net.state_dict(), eval_file_path)
114 |         torch.save(self.target_net.state_dict(), target_file_path)
115 |         print("[*] Model saved")
116 |         
117 |     def load(self, dir_path, step=0):
118 |         eval_file_path = os.path.join(dir_path, "mfq_eval_{}".format(step))
119 |         target_file_path = os.path.join(dir_path, "mfq_target_{}".format(step))
120 | 
121 |         self.target_net.load_state_dict(torch.load(target_file_path))
122 |         self.eval_net.load_state_dict(torch.load(eval_file_path))
123 |         print("[*] Loaded model")


--------------------------------------------------------------------------------
/algo/tools.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | import os
  5 | import numpy as np
  6 | from moviepy.editor import ImageSequenceClip
  7 | 
  8 | 
  9 | class Color:
 10 |     INFO = '\033[1;34m{}\033[0m'
 11 |     WARNING = '\033[1;33m{}\033[0m'
 12 |     ERROR = '\033[1;31m{}\033[0m'
 13 |     
 14 | 
 15 | class Buffer:
 16 |     def __init__(self):
 17 |         pass
 18 | 
 19 |     def push(self, **kwargs):
 20 |         raise NotImplementedError
 21 |     
 22 | 
 23 | class MetaBuffer(object):
 24 |     def __init__(self, shape, max_len, dtype='float32'):
 25 |         self.max_len = max_len
 26 |         self.data = np.zeros([max_len] + list(shape if isinstance(shape, tuple) else [shape])).astype(dtype)
 27 |         self.start = 0
 28 |         self.length = 0
 29 |         self._flag = 0
 30 | 
 31 |     def __len__(self):
 32 |         return self.length
 33 | 
 34 |     def __getitem__(self, idx):
 35 |         if idx < 0 or idx >= self.length:
 36 |             raise KeyError()
 37 |         return self.data[idx]
 38 | 
 39 |     def sample(self, idx):
 40 |         return self.data[idx % self.length]
 41 | 
 42 |     def pull(self):
 43 |         return self.data[:self.length]
 44 | 
 45 |     def append(self, value):
 46 |         start = 0
 47 |         num = len(value)
 48 | 
 49 |         if self._flag + num > self.max_len:
 50 |             tail = self.max_len - self._flag
 51 |             self.data[self._flag:] = value[:tail]
 52 |             num -= tail
 53 |             start = tail
 54 |             self._flag = 0
 55 | 
 56 |         self.data[self._flag:self._flag + num] = value[start:]
 57 |         self._flag += num
 58 |         self.length = min(self.length + len(value), self.max_len)
 59 | 
 60 |     def reset_new(self, start, value):
 61 |         self.data[start:] = value
 62 |         
 63 |         
 64 | class EpisodesBufferEntry:
 65 |     """Entry for episode buffer"""
 66 |     def __init__(self):
 67 |         self.views = []
 68 |         self.features = []
 69 |         self.actions = []
 70 |         self.rewards = []
 71 |         self.probs = []
 72 |         self.terminal = False
 73 | 
 74 |     def append(self, view, feature, action, reward, alive, probs=None):
 75 |         self.views.append(view.copy())
 76 |         self.features.append(feature.copy())
 77 |         self.actions.append(action)
 78 |         self.rewards.append(reward)
 79 |         if probs is not None:
 80 |             self.probs.append(probs)
 81 |         if not alive:
 82 |             self.terminal = True
 83 |             
 84 |             
 85 | class EpisodesBuffer(Buffer):
 86 |     """Replay buffer to store a whole episode for all agents
 87 |        one entry for one agent
 88 |     """
 89 |     def __init__(self, use_mean=False):
 90 |         super().__init__()
 91 |         self.buffer = {}
 92 |         self.use_mean = use_mean
 93 | 
 94 |     def push(self, **kwargs):
 95 |         view, feature = kwargs['state']
 96 |         acts = kwargs['acts']
 97 |         rewards = kwargs['rewards']
 98 |         alives = kwargs['alives']
 99 |         ids = kwargs['ids']
100 | 
101 |         if self.use_mean:
102 |             probs = kwargs['prob']
103 | 
104 |         buffer = self.buffer
105 |         index = np.random.permutation(len(view))
106 | 
107 |         for i in range(len(ids)):
108 |             i = index[i]
109 |             entry = buffer.get(ids[i])
110 |             if entry is None:
111 |                 entry = EpisodesBufferEntry()
112 |                 buffer[ids[i]] = entry
113 | 
114 |             if self.use_mean:
115 |                 entry.append(view[i], feature[i], acts[i], rewards[i], alives[i], probs=probs[i])
116 |             else:
117 |                 entry.append(view[i], feature[i], acts[i], rewards[i], alives[i])
118 | 
119 |     def reset(self):
120 |         """ clear replay buffer """
121 |         self.buffer = {}
122 | 
123 |     def episodes(self):
124 |         """ get episodes """
125 |         return self.buffer.values()
126 | 
127 | 
128 | class AgentMemory(object):
129 |     def __init__(self, obs_shape, feat_shape, act_n, max_len, use_mean=False):
130 |         self.obs0 = MetaBuffer(obs_shape, max_len)
131 |         self.feat0 = MetaBuffer(feat_shape, max_len)
132 |         self.actions = MetaBuffer((), max_len, dtype='int32')
133 |         self.rewards = MetaBuffer((), max_len)
134 |         self.terminals = MetaBuffer((), max_len, dtype='bool')
135 |         self.use_mean = use_mean
136 | 
137 |         if self.use_mean:
138 |             self.prob = MetaBuffer((act_n,), max_len)
139 | 
140 |     def append(self, obs0, feat0, act, reward, alive, prob=None):
141 |         self.obs0.append(np.array([obs0]))
142 |         self.feat0.append(np.array([feat0]))
143 |         self.actions.append(np.array([act], dtype=np.int32))
144 |         self.rewards.append(np.array([reward]))
145 |         self.terminals.append(np.array([not alive], dtype=np.bool))
146 | 
147 |         if self.use_mean:
148 |             self.prob.append(np.array([prob]))
149 | 
150 |     def pull(self):
151 |         res = {
152 |             'obs0': self.obs0.pull(),
153 |             'feat0': self.feat0.pull(),
154 |             'act': self.actions.pull(),
155 |             'rewards': self.rewards.pull(),
156 |             'terminals': self.terminals.pull(),
157 |             'prob': None if not self.use_mean else self.prob.pull()
158 |         }
159 | 
160 |         return res
161 | 
162 | 
163 | class MemoryGroup(object):
164 |     def __init__(self, obs_shape, feat_shape, act_n, max_len, batch_size, sub_len, use_mean=False):
165 |         self.agent = dict()
166 |         self.max_len = max_len
167 |         self.batch_size = batch_size
168 |         self.obs_shape = obs_shape
169 |         self.feat_shape = feat_shape
170 |         self.sub_len = sub_len
171 |         self.use_mean = use_mean
172 |         self.act_n = act_n
173 | 
174 |         self.obs0 = MetaBuffer(obs_shape, max_len)
175 |         self.feat0 = MetaBuffer(feat_shape, max_len)
176 |         self.actions = MetaBuffer((), max_len, dtype='int32')
177 |         self.rewards = MetaBuffer((), max_len)
178 |         self.terminals = MetaBuffer((), max_len, dtype='bool')
179 |         self.masks = MetaBuffer((), max_len, dtype='bool')
180 |         if use_mean:
181 |             self.prob = MetaBuffer((act_n,), max_len)
182 |         self._new_add = 0
183 | 
184 |     def _flush(self, **kwargs):
185 |         self.obs0.append(kwargs['obs0'])
186 |         self.feat0.append(kwargs['feat0'])
187 |         self.actions.append(kwargs['act'])
188 |         self.rewards.append(kwargs['rewards'])
189 |         self.terminals.append(kwargs['terminals'])
190 | 
191 |         if self.use_mean:
192 |             self.prob.append(kwargs['prob'])
193 | 
194 |         mask = np.where(kwargs['terminals'] == True, False, True)
195 |         mask[-1] = False
196 |         self.masks.append(mask)
197 | 
198 |     def push(self, **kwargs):
199 |         for i, _id in enumerate(kwargs['ids']):
200 |             if self.agent.get(_id) is None:
201 |                 self.agent[_id] = AgentMemory(self.obs_shape, self.feat_shape, self.act_n, self.sub_len, use_mean=self.use_mean)
202 |             if self.use_mean:
203 |                 self.agent[_id].append(obs0=kwargs['state'][0][i], feat0=kwargs['state'][1][i], act=kwargs['acts'][i], reward=kwargs['rewards'][i], alive=kwargs['alives'][i], prob=kwargs['prob'][i])
204 |             else:
205 |                 self.agent[_id].append(obs0=kwargs['state'][0][i], feat0=kwargs['state'][1][i], act=kwargs['acts'][i], reward=kwargs['rewards'][i], alive=kwargs['alives'][i])
206 | 
207 |     def tight(self):
208 |         ids = list(self.agent.keys())
209 |         np.random.shuffle(ids)
210 |         for ele in ids:
211 |             tmp = self.agent[ele].pull()
212 |             self._new_add += len(tmp['obs0'])
213 |             self._flush(**tmp)
214 |         self.agent = dict()  # clear
215 | 
216 |     def sample(self):
217 |         idx = np.random.choice(self.nb_entries, size=self.batch_size)
218 |         next_idx = (idx + 1) % self.nb_entries
219 | 
220 |         obs = self.obs0.sample(idx)
221 |         obs_next = self.obs0.sample(next_idx)
222 |         feature = self.feat0.sample(idx)
223 |         feature_next = self.feat0.sample(next_idx)
224 |         actions = self.actions.sample(idx)
225 |         rewards = self.rewards.sample(idx)
226 |         dones = self.terminals.sample(idx)
227 |         masks = self.masks.sample(idx)
228 | 
229 |         if self.use_mean:
230 |             act_prob = self.prob.sample(idx)
231 |             act_next_prob = self.prob.sample(next_idx)
232 |             return obs, feature, actions, act_prob, obs_next, feature_next, act_next_prob, rewards, dones, masks
233 |         else:
234 |             return obs, feature, obs_next, feature_next, dones, rewards, actions, masks
235 | 
236 |     def get_batch_num(self):
237 |         print('\n[INFO] Length of buffer and new add:', len(self.obs0), self._new_add)
238 |         res = self._new_add * 2 // self.batch_size
239 |         self._new_add = 0
240 |         return res
241 | 
242 |     @property
243 |     def nb_entries(self):
244 |         return len(self.obs0)
245 | 
246 | 
247 | 
248 | class Runner(object):
249 |     def __init__(self, env, handles, max_steps, models,
250 |                 play_handle, render_every=None, save_every=None, tau=None, log_name=None, log_dir=None, model_dir=None, render_dir=None, train=False, cuda=True):
251 |         """Initialize runner
252 | 
253 |         Parameters
254 |         ----------
255 |         env: magent.GridWorld
256 |             environment handle
257 |         handles: list
258 |             group handles
259 |         max_steps: int
260 |             the maximum of stages in a episode
261 |         render_every: int
262 |             render environment interval
263 |         save_every: int
264 |             states the interval of evaluation for self-play update
265 |         models: list
266 |             contains models
267 |         play_handle: method like
268 |             run game
269 |         tau: float
270 |             tau index for self-play update
271 |         log_name: str
272 |             define the name of log dir
273 |         log_dir: str
274 |             donates the directory of logs
275 |         model_dir: str
276 |             donates the dircetory of models
277 |         """
278 |         self.env = env
279 |         self.models = models
280 |         self.max_steps = max_steps
281 |         self.handles = handles
282 |         self.render_every = render_every
283 |         self.save_every = save_every
284 |         self.play = play_handle
285 |         self.model_dir = model_dir
286 |         self.render_dir = render_dir
287 |         self.train = train
288 |         self.tau = tau
289 |         self.cuda = cuda
290 |         
291 |         os.makedirs(self.render_dir, exist_ok=True)
292 |                 
293 |     def sp_op(self):
294 |         l_vars, r_vars = self.models[0].get_all_params(), self.models[1].get_all_params()
295 |         for l_var, r_var in zip(l_vars, r_vars):
296 |             r_var.detach().copy_((1. - self.tau) * l_var + self.tau * r_var)
297 | 
298 |     def run(self, variant_eps, iteration, win_cnt=None):
299 |         info = {'main': None, 'opponent': None}
300 | 
301 |         # pass
302 |         info['main'] = {'ave_agent_reward': 0., 'total_reward': 0., 'kill': 0.}
303 |         info['opponent'] = {'ave_agent_reward': 0., 'total_reward': 0., 'kill': 0.}
304 | 
305 |         max_nums, nums, agent_r_records, total_rewards, render_list = self.play(env=self.env, n_round=iteration, handles=self.handles,
306 |                     models=self.models, print_every=50, eps=variant_eps, render=(iteration + 1) % self.render_every == 0 if self.render_every > 0 else False, train=self.train, cuda=self.cuda)
307 | 
308 |         for i, tag in enumerate(['main', 'opponent']):
309 |             info[tag]['total_reward'] = total_rewards[i]
310 |             info[tag]['kill'] = max_nums[i] - nums[1 - i]
311 |             info[tag]['ave_agent_reward'] = agent_r_records[i]
312 | 
313 |         if self.train:
314 |             print('\n[INFO] {}'.format(info['main']))
315 | 
316 |             # if self.save_every and (iteration + 1) % self.save_every == 0:
317 |             if info['main']['total_reward'] > info['opponent']['total_reward']:
318 |                 print(Color.INFO.format('\n[INFO] Begin self-play Update ...'))
319 |                 self.sp_op()
320 |                 print(Color.INFO.format('[INFO] Self-play Updated!\n'))
321 | 
322 |                 print(Color.INFO.format('[INFO] Saving model ...'))
323 |                 self.models[0].save(self.model_dir + '-0', iteration)
324 |                 self.models[1].save(self.model_dir + '-1', iteration)
325 | 
326 |         else:
327 |             print('\n[INFO] {0} \n {1}'.format(info['main'], info['opponent']))
328 |             if info['main']['kill'] > info['opponent']['kill']:
329 |                 win_cnt['main'] += 1
330 |             elif info['main']['kill'] < info['opponent']['kill']:
331 |                 win_cnt['opponent'] += 1
332 |             else:
333 |                 win_cnt['main'] += 1
334 |                 win_cnt['opponent'] += 1
335 |                 
336 |         if len(render_list) > 0:
337 |             print('[*] Saving Render')
338 |             clip = ImageSequenceClip(render_list, fps=20)
339 |             clip.write_gif('{}/replay_{}.gif'.format(self.render_dir, iteration+1), fps=20, verbose=False)
340 |             print('[*] Saved Render')
341 |             
342 | 


--------------------------------------------------------------------------------
/replay.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deligentfool/mfrl_pytorch/c492d5f8d7f42c35a6864d6f1306752398878422/replay.gif


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | moviepy==1.0.3
2 | numpy==1.21.2
3 | PettingZoo==1.14.0
4 | magent==0.1.14
5 | torch
6 | torchaudio
7 | torchvision
8 | 


--------------------------------------------------------------------------------
/senarios/senario_battle.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import math
  3 | import numpy as np
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.nn.functional as F
  7 | 
  8 | 
  9 | def play(env, n_round, handles, models, print_every, eps=1.0, render=False, train=False, cuda=True):
 10 |     """play a ground and train"""
 11 |     env.reset()
 12 | 
 13 |     max_steps = env.unwrapped.max_cycles
 14 |     step_ct = 0
 15 |     done = False
 16 |     
 17 |     obs_list = []
 18 |     if render:
 19 |         obs_list.append(env.render(mode='rgb_array'))
 20 |     
 21 |     n_group = len(handles)
 22 |     state = [None for _ in range(n_group)]
 23 |     acts = [None for _ in range(n_group)]
 24 |     ids = [None for _ in range(n_group)]
 25 | 
 26 |     alives = [None for _ in range(n_group)]
 27 |     rewards = [None for _ in range(n_group)]
 28 |     nums = [env.unwrapped.env.get_num(handle) for handle in handles]
 29 |     max_nums = nums.copy()
 30 | 
 31 |     loss = [None for _ in range(n_group)]
 32 |     eval_q = [None for _ in range(n_group)]
 33 |     n_action = [env.unwrapped.env.get_action_space(handles[0])[0], env.unwrapped.env.get_action_space(handles[1])[0]]
 34 | 
 35 |     print("\n\n[*] ROUND #{0}, EPS: {1:.2f} NUMBER: {2}".format(n_round, eps, nums))
 36 |     mean_rewards = [[] for _ in range(n_group)]
 37 |     total_rewards = [[] for _ in range(n_group)]
 38 | 
 39 |     former_act_prob = [np.zeros((1, env.unwrapped.env.get_action_space(handles[0])[0])), np.zeros((1, env.unwrapped.env.get_action_space(handles[1])[0]))]
 40 | 
 41 |     while not done and step_ct < max_steps:
 42 |         # take actions for every model
 43 |         for i in range(n_group):
 44 |             state[i] = list(env.unwrapped.env.get_observation(handles[i]))
 45 |             ids[i] = env.unwrapped.env.get_agent_id(handles[i])
 46 | 
 47 |         for i in range(n_group):
 48 |             former_act_prob[i] = np.tile(former_act_prob[i], (len(state[i][0]), 1))
 49 |             if cuda:
 50 |                 acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]).cuda(), feature=torch.FloatTensor(state[i][1]).cuda(), prob=torch.FloatTensor(former_act_prob[i]).cuda(), eps=eps)
 51 |             else:
 52 |                 acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]), feature=torch.FloatTensor(state[i][1]), prob=torch.FloatTensor(former_act_prob[i]), eps=eps)
 53 |                 
 54 |                 
 55 |         for i in range(n_group):
 56 |             env.unwrapped.env.set_action(handles[i], acts[i].astype(np.int32))
 57 | 
 58 |         # simulate one step
 59 |         done = env.unwrapped.env.step()
 60 | 
 61 |         for i in range(n_group):
 62 |             rewards[i] = env.unwrapped.env.get_reward(handles[i])
 63 |             alives[i] = env.unwrapped.env.get_alive(handles[i])
 64 | 
 65 |         buffer = {
 66 |             'state': state[0], 'acts': acts[0], 'rewards': rewards[0],
 67 |             'alives': alives[0], 'ids': ids[0]
 68 |         }
 69 | 
 70 |         buffer['prob'] = former_act_prob[0]
 71 | 
 72 |         for i in range(n_group):
 73 |             former_act_prob[i] = np.mean(list(map(lambda x: np.eye(n_action[i])[x], acts[i])), axis=0, keepdims=True)
 74 | 
 75 |         if train:
 76 |             models[0].flush_buffer(**buffer)
 77 | 
 78 |         # stat info
 79 |         nums = [env.unwrapped.env.get_num(handle) for handle in handles]
 80 | 
 81 |         for i in range(n_group):
 82 |             sum_reward = sum(rewards[i])
 83 |             rewards[i] = sum_reward / nums[i]
 84 |             mean_rewards[i].append(rewards[i])
 85 |             total_rewards[i].append(sum_reward)
 86 | 
 87 |         if render:
 88 |             obs_list.append(env.render(mode='rgb_array'))
 89 |             
 90 |         # clear dead agents
 91 |         env.unwrapped.env.clear_dead()
 92 | 
 93 |         info = {"Ave-Reward": np.round(rewards, decimals=6), "NUM": nums}
 94 | 
 95 |         step_ct += 1
 96 | 
 97 |         if step_ct % print_every == 0:
 98 |             print("> step #{}, info: {}".format(step_ct, info))
 99 | 
100 |     if train:
101 |         models[0].train(cuda)
102 | 
103 |     for i in range(n_group):
104 |         mean_rewards[i] = sum(mean_rewards[i]) / len(mean_rewards[i])
105 |         total_rewards[i] = sum(total_rewards[i])
106 | 
107 |     return max_nums, nums, mean_rewards, total_rewards, obs_list
108 | 
109 | 
110 | def battle(env, n_round, handles, models, print_every, eps=1.0, render=False, train=False, cuda=True):
111 |     """play a ground and train"""
112 |     env.reset()
113 | 
114 |     max_steps = env.unwrapped.max_cycles
115 |     step_ct = 0
116 |     done = False
117 | 
118 |     obs_list = []
119 |     if render:
120 |         obs_list.append(np.transpose(env.render(mode='rgb_array'), axes=(1, 0, 2)))
121 | 
122 |     n_group = len(handles)
123 |     state = [None for _ in range(n_group)]
124 |     acts = [None for _ in range(n_group)]
125 |     ids = [None for _ in range(n_group)]
126 | 
127 |     alives = [None for _ in range(n_group)]
128 |     rewards = [None for _ in range(n_group)]
129 |     nums = [env.unwrapped.env.get_num(handle) for handle in handles]
130 |     max_nums = nums.copy()
131 | 
132 |     n_action = [env.unwrapped.env.get_action_space(handles[0])[0], env.unwrapped.env.get_action_space(handles[1])[0]]
133 | 
134 |     print("\n\n[*] ROUND #{0}, EPS: {1:.2f} NUMBER: {2}".format(n_round, eps, nums))
135 |     mean_rewards = [[] for _ in range(n_group)]
136 |     total_rewards = [[] for _ in range(n_group)]
137 | 
138 |     former_act_prob = [np.zeros((1, env.unwrapped.env.get_action_space(handles[0])[0])), np.zeros((1, env.unwrapped.env.get_action_space(handles[1])[0]))]
139 | 
140 |     while not done and step_ct < max_steps:
141 |         # take actions for every model
142 |         for i in range(n_group):
143 |             state[i] = list(env.unwrapped.env.get_observation(handles[i]))
144 |             ids[i] = env.unwrapped.env.get_agent_id(handles[i])
145 | 
146 |         for i in range(n_group):
147 |             former_act_prob[i] = np.tile(former_act_prob[i], (len(state[i][0]), 1))
148 |             if cuda:
149 |                 acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]).cuda(), feature=torch.FloatTensor(state[i][1]).cuda(), prob=torch.FloatTensor(former_act_prob[i]).cuda(), eps=eps)
150 |             else:
151 |                 acts[i] = models[i].act(obs=torch.FloatTensor(state[i][0]).permute([0, 3, 1, 2]), feature=torch.FloatTensor(state[i][1]), prob=torch.FloatTensor(former_act_prob[i]), eps=eps)
152 |             
153 |         for i in range(n_group):
154 |             env.unwrapped.env.set_action(handles[i], acts[i].astype(np.int32))
155 | 
156 |         # simulate one step
157 |         done = env.unwrapped.env.step()
158 | 
159 |         for i in range(n_group):
160 |             rewards[i] = env.unwrapped.env.get_reward(handles[i])
161 |             alives[i] = env.unwrapped.env.get_alive(handles[i])
162 | 
163 |         for i in range(n_group):
164 |             former_act_prob[i] = np.mean(list(map(lambda x: np.eye(n_action[i])[x], acts[i])), axis=0, keepdims=True)
165 | 
166 |         # stat info
167 |         nums = [env.unwrapped.env.get_num(handle) for handle in handles]
168 | 
169 |         for i in range(n_group):
170 |             sum_reward = sum(rewards[i])
171 |             rewards[i] = sum_reward / nums[i]
172 |             mean_rewards[i].append(rewards[i])
173 |             total_rewards[i].append(sum_reward)
174 | 
175 |         if render:
176 |             obs_list.append(np.transpose(env.render(mode='rgb_array'), axes=(1, 0, 2)))
177 |             
178 |         # clear dead agents
179 |         env.unwrapped.env.clear_dead()
180 | 
181 |         info = {"Ave-Reward": np.round(rewards, decimals=6), "NUM": nums}
182 | 
183 |         step_ct += 1
184 | 
185 |         if step_ct % print_every == 0:
186 |             print("> step #{}, info: {}".format(step_ct, info))
187 | 
188 |     for i in range(n_group):
189 |         mean_rewards[i] = sum(mean_rewards[i]) / len(mean_rewards[i])
190 |         total_rewards[i] = sum(total_rewards[i])
191 | 
192 |     return max_nums, nums, mean_rewards, total_rewards, obs_list
193 | 


--------------------------------------------------------------------------------
/train_battle.py:
--------------------------------------------------------------------------------
 1 | """Self Play
 2 | """
 3 | 
 4 | import argparse
 5 | import os
 6 | import torch
 7 | import torch.nn as nn
 8 | import torch.nn.functional as F
 9 | import numpy as np
10 | from pettingzoo.magent import battle_v3
11 | 
12 | from algo import spawn_ai
13 | from algo import tools
14 | from senarios.senario_battle import play
15 | 
16 | 
17 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
18 | os.makedirs('./data', exist_ok=True)
19 | 
20 | def linear_decay(epoch, x, y):
21 |     min_v, max_v = y[0], y[-1]
22 |     start, end = x[0], x[-1]
23 | 
24 |     if epoch == start:
25 |         return min_v
26 | 
27 |     eps = min_v
28 | 
29 |     for i, x_i in enumerate(x):
30 |         if epoch <= x_i:
31 |             interval = (y[i] - y[i - 1]) / (x_i - x[i - 1])
32 |             eps = interval * (epoch - x[i - 1]) + y[i - 1]
33 |             break
34 | 
35 |     return eps
36 | 
37 | if __name__ == '__main__':
38 |     parser = argparse.ArgumentParser()
39 |     parser.add_argument('--algo', type=str, choices={'ac', 'mfac', 'mfq', 'iql'}, help='choose an algorithm from the preset', required=True)
40 |     parser.add_argument('--save_every', type=int, default=20, help='decide the self-play update interval')
41 |     parser.add_argument('--update_every', type=int, default=5, help='decide the udpate interval for q-learning, optional')
42 |     parser.add_argument('--n_round', type=int, default=2000, help='set the trainning round')
43 |     parser.add_argument('--render', action='store_true', help='render or not (if true, will render every save)')
44 |     parser.add_argument('--map_size', type=int, default=40, help='set the size of map')  # then the amount of agents is 64
45 |     parser.add_argument('--max_steps', type=int, default=400, help='set the max steps')
46 |     parser.add_argument('--cuda', type=bool, default=True, help='use the cuda')
47 | 
48 |     args = parser.parse_args()
49 | 
50 |     # Initialize the environment
51 |     env = battle_v3.env(
52 |         map_size=args.map_size,
53 |         minimap_mode=True,
54 |         step_reward=-0.005,
55 |         dead_penalty=-0.1,
56 |         attack_penalty=-0.1,
57 |         attack_opponent_reward=0.2,
58 |         max_cycles=args.max_steps,
59 |         extra_features=True
60 |     )
61 |     handles = env.unwrapped.env.get_handles()
62 | 
63 |     log_dir = os.path.join(BASE_DIR, 'data/tmp/{}'.format(args.algo))
64 |     render_dir = os.path.join(BASE_DIR, 'data/render/{}'.format(args.algo))
65 |     model_dir = os.path.join(BASE_DIR, 'data/models/{}'.format(args.algo))
66 | 
67 |     start_from = 0
68 | 
69 |     models = [spawn_ai(args.algo, env, handles[0], args.algo + '-me', args.max_steps, args.cuda), spawn_ai(args.algo, env, handles[1], args.algo + '-opponent', args.max_steps, args.cuda)]
70 |     runner = tools.Runner(env, handles, args.max_steps, models, play,
71 |                             render_every=args.save_every if args.render else 0, save_every=args.save_every, tau=0.01, log_name=args.algo,
72 |                             log_dir=log_dir, model_dir=model_dir, render_dir=render_dir, train=True, cuda=args.cuda)
73 | 
74 |     for k in range(start_from, start_from + args.n_round):
75 |         eps = linear_decay(k, [0, int(args.n_round * 0.8), args.n_round], [1, 0.2, 0.1])
76 |         runner.run(eps, k)
77 | 


--------------------------------------------------------------------------------