├── .gitignore ├── matd3 ├── common │ ├── distributions.py │ └── tf_util.py ├── maddpg │ ├── __init__.py │ └── trainer │ │ ├── maddpg.py │ │ └── replay_buffer.py ├── matd3 │ ├── __init__.py │ └── trainer │ │ ├── matd3.py │ │ └── replay_buffer.py ├── multiagent │ ├── __init__.py │ ├── core.py │ ├── environment.py │ ├── multi_discrete.py │ ├── policy.py │ ├── rendering.py │ ├── scenario.py │ └── scenarios │ │ ├── __init__.py │ │ ├── simple.py │ │ ├── simple_adversary.py │ │ ├── simple_crypto.py │ │ ├── simple_push.py │ │ ├── simple_reference.py │ │ ├── simple_speaker_listener.py │ │ ├── simple_spread.py │ │ ├── simple_spread_two_ag.py │ │ ├── simple_tag.py │ │ └── simple_world_comm.py └── train.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.npy 6 | #Documentation 7 | *.html 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # PyCharm 77 | .idea 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | logdir 111 | logdirMaddpg 112 | leraning_curves 113 | parallelRunStarter.py 114 | learning_curves/ 115 | *.pkl 116 | *.pdf 117 | data/* 118 | -------------------------------------------------------------------------------- /matd3/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | 
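# Illustrative sketch (not part of the original file) of how the Pd interface above
# and the PdType factories defined below are used together: a PdType builds the
# action placeholder and wraps the flat policy-network output into a concrete
# distribution, e.g.
#   pdtype = SoftCategoricalPdType(5)            # 5 discrete action choices
#   act_ph = pdtype.sample_placeholder([None], name="action")
#   pd     = pdtype.pdfromflat(policy_logits)    # policy_logits: [batch, 5] tensor (hypothetical name)
#   sample = pd.sample()                         # Gumbel-softmax style action sample
#   logp   = pd.logp(act_ph)                     # log-probability of a given action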
class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return [] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - 
tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 | ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 203 | def sample(self): 204 | u = tf.random_uniform(tf.shape(self.logits)) 205 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 206 | @classmethod 207 | def fromflat(cls, flat): 208 | return cls(flat) 209 | 210 | class MultiCategoricalPd(Pd): 211 | def __init__(self, low, high, flat): 212 | self.flat = flat 213 | self.low = tf.constant(low, dtype=tf.int32) 214 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 215 | def flatparam(self): 216 | return self.flat 217 | def mode(self): 218 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 219 | def logp(self, x): 220 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 221 | def kl(self, other): 222 | return tf.add_n([ 223 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 224 | ]) 225 | def entropy(self): 226 | return tf.add_n([p.entropy() for p in self.categoricals]) 227 | def sample(self): 228 | return self.low + tf.cast(tf.stack([p.sample() for p in 
self.categoricals], axis=-1), tf.int32) 229 | @classmethod 230 | def fromflat(cls, flat): 231 | return cls(flat) 232 | 233 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 234 | def __init__(self, low, high, flat): 235 | self.flat = flat 236 | self.low = tf.constant(low, dtype=tf.float32) 237 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 238 | def flatparam(self): 239 | return self.flat 240 | def mode(self): 241 | x = [] 242 | for i in range(len(self.categoricals)): 243 | x.append(self.low[i] + self.categoricals[i].mode()) 244 | return tf.concat(x, axis=-1) 245 | def logp(self, x): 246 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 247 | def kl(self, other): 248 | return tf.add_n([ 249 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 250 | ]) 251 | def entropy(self): 252 | return tf.add_n([p.entropy() for p in self.categoricals]) 253 | def sample(self): 254 | x = [] 255 | for i in range(len(self.categoricals)): 256 | x.append(self.low[i] + self.categoricals[i].sample()) 257 | return tf.concat(x, axis=-1) 258 | @classmethod 259 | def fromflat(cls, flat): 260 | return cls(flat) 261 | 262 | class DiagGaussianPd(Pd): 263 | def __init__(self, flat): 264 | self.flat = flat 265 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 266 | self.mean = mean 267 | self.logstd = logstd 268 | self.std = tf.exp(logstd) 269 | def flatparam(self): 270 | return self.flat 271 | def mode(self): 272 | return self.mean 273 | def logp(self, x): 274 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 275 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 276 | - U.sum(self.logstd, axis=1) 277 | def kl(self, other): 278 | assert isinstance(other, DiagGaussianPd) 279 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 280 | def entropy(self): 281 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 282 | def sample(self): 283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | def make_pdtype(ac_space): 311 | from gym import spaces 312 | if isinstance(ac_space, spaces.Box): 313 | assert len(ac_space.shape) == 1 314 | return DiagGaussianPdType(ac_space.shape[0]) 315 | elif isinstance(ac_space, spaces.Discrete): 316 | # return CategoricalPdType(ac_space.n) 317 | return 
SoftCategoricalPdType(ac_space.n) 318 | elif isinstance(ac_space, MultiDiscrete): 319 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 320 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 321 | elif isinstance(ac_space, spaces.MultiBinary): 322 | return BernoulliPdType(ac_space.n) 323 | else: 324 | raise NotImplementedError 325 | 326 | def shape_el(v, i): 327 | maybe = v.get_shape()[i] 328 | if maybe is not None: 329 | return maybe 330 | else: 331 | return tf.shape(v)[i] 332 | -------------------------------------------------------------------------------- /matd3/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, dim=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 
50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimizes `objective` using `optimizer` w.r.t. 
variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2,allow_growth=True) 164 | tf_config = tf.ConfigProto( 165 | inter_op_parallelism_threads=num_cpu, 166 | intra_op_parallelism_threads=num_cpu, 167 | gpu_options=gpu_options) 168 | return tf.Session(config=tf_config) 169 | 170 | 171 | def single_threaded_session(): 172 | """Returns a session which will only use a single CPU""" 173 | return make_session(1) 174 | 175 | 176 | ALREADY_INITIALIZED = set() 177 | 178 | 179 | def initialize(): 180 | """Initialize all the uninitialized variables in the global scope.""" 181 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 182 | get_session().run(tf.variables_initializer(new_variables)) 183 | ALREADY_INITIALIZED.update(new_variables) 184 | 185 | 186 | # ================================================================ 187 | # Scopes 188 | # ================================================================ 189 | 190 | 191 | def scope_vars(scope, trainable_only=False): 192 | """ 193 | Get variables inside a scope 194 | The scope can be specified as a string 195 | 196 | Parameters 197 | ---------- 198 | scope: str or VariableScope 199 | scope in which the variables reside. 200 | trainable_only: bool 201 | whether or not to return only the variables that were marked as trainable. 202 | 203 | Returns 204 | ------- 205 | vars: [tf.Variable] 206 | list of variables in `scope`. 207 | """ 208 | return tf.get_collection( 209 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 210 | scope=scope if isinstance(scope, str) else scope.name 211 | ) 212 | 213 | 214 | def scope_name(): 215 | """Returns the name of current scope as a string, e.g. 
deepq/q_func""" 216 | return tf.get_variable_scope().name 217 | 218 | 219 | def absolute_scope_name(relative_scope_name): 220 | """Appends parent scope name to `relative_scope_name`""" 221 | return scope_name() + "/" + relative_scope_name 222 | 223 | # ================================================================ 224 | # Saving variables 225 | # ================================================================ 226 | 227 | 228 | def load_state(fname, saver=None): 229 | """Load all the variables to the current session from the location """ 230 | if saver is None: 231 | saver = tf.train.Saver() 232 | saver.restore(get_session(), fname) 233 | return saver 234 | 235 | 236 | def save_state(fname, saver=None, global_step=None): 237 | """Save all the variables in the current session to the location """ 238 | os.makedirs(os.path.dirname(fname), exist_ok=True) 239 | if saver is None: 240 | saver = tf.train.Saver(max_to_keep=None) 241 | if global_step is not None: 242 | saver.save(get_session(), fname, global_step) 243 | else: 244 | saver.save(get_session(), fname) 245 | return saver 246 | 247 | # ================================================================ 248 | # Theano-like Function 249 | # ================================================================ 250 | 251 | 252 | def function(inputs, outputs, updates=None, givens=None): 253 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 254 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 255 | values to be feed to the inputs placeholders and produces the values of the experessions 256 | in outputs. 257 | 258 | Input values can be passed in the same order as inputs or can be provided as kwargs based 259 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 260 | 261 | Example: 262 | x = tf.placeholder(tf.int32, (), name="x") 263 | y = tf.placeholder(tf.int32, (), name="y") 264 | z = 3 * x + 2 * y 265 | lin = function([x, y], z, givens={y: 0}) 266 | 267 | with single_threaded_session(): 268 | initialize() 269 | 270 | assert lin(2) == 6 271 | assert lin(x=3) == 9 272 | assert lin(2, 2) == 10 273 | assert lin(x=2, y=3) == 12 274 | 275 | Parameters 276 | ---------- 277 | inputs: [tf.placeholder or TfInput] 278 | list of input arguments 279 | outputs: [tf.Variable] or tf.Variable 280 | list of outputs or a single output to be returned from function. Returned 281 | value will also have the same shape. 
282 | """ 283 | if isinstance(outputs, list): 284 | return _Function(inputs, outputs, updates, givens=givens) 285 | elif isinstance(outputs, (dict, collections.OrderedDict)): 286 | f = _Function(inputs, outputs.values(), updates, givens=givens) 287 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 288 | else: 289 | f = _Function(inputs, [outputs], updates, givens=givens) 290 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 291 | 292 | 293 | class _Function(object): 294 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 295 | for inpt in inputs: 296 | if not issubclass(type(inpt), TfInput): 297 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 298 | self.inputs = inputs 299 | updates = updates or [] 300 | self.update_group = tf.group(*updates) 301 | self.outputs_update = list(outputs) + [self.update_group] 302 | self.givens = {} if givens is None else givens 303 | self.check_nan = check_nan 304 | 305 | def _feed_input(self, feed_dict, inpt, value): 306 | if issubclass(type(inpt), TfInput): 307 | feed_dict.update(inpt.make_feed_dict(value)) 308 | elif is_placeholder(inpt): 309 | feed_dict[inpt] = value 310 | 311 | def __call__(self, *args, **kwargs): 312 | assert len(args) <= len(self.inputs), "Too many arguments provided" 313 | feed_dict = {} 314 | # Update the args 315 | for inpt, value in zip(self.inputs, args): 316 | self._feed_input(feed_dict, inpt, value) 317 | # Update the kwargs 318 | kwargs_passed_inpt_names = set() 319 | for inpt in self.inputs[len(args):]: 320 | inpt_name = inpt.name.split(':')[0] 321 | inpt_name = inpt_name.split('/')[-1] 322 | assert inpt_name not in kwargs_passed_inpt_names, \ 323 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 324 | if inpt_name in kwargs: 325 | kwargs_passed_inpt_names.add(inpt_name) 326 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 327 | else: 328 | assert inpt in self.givens, "Missing argument " + inpt_name 329 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 330 | # Update feed dict with givens. 
331 | for inpt in self.givens: 332 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 333 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 334 | if self.check_nan: 335 | if any(np.isnan(r).any() for r in results): 336 | raise RuntimeError("Nan detected") 337 | return results 338 | -------------------------------------------------------------------------------- /matd3/maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /matd3/maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from IPython import embed 4 | 5 | import common.tf_util as U 6 | 7 | from maddpg import AgentTrainer 8 | from common.distributions import make_pdtype 9 | from maddpg.trainer.replay_buffer import ReplayBuffer 10 | 11 | 12 | def make_update_exp(vals, target_vals): 13 | polyak = 1.0 - 1e-2 14 | expression = [] 15 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 16 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 17 | expression = tf.group(*expression) 18 | return U.function([], [], updates=[expression]) 19 | 20 | 21 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, 22 | local_q_func=False, num_units=64, scope="trainer", reuse=None): 23 | """ 24 | 25 | :param make_obs_ph_n: 26 | :param act_space_n: 27 | :param p_index: 28 | :param p_func: in base maddpg code = mlp_model 29 | :param q_func: in base maddpg code = mlp_model 30 | :param optimizer: 31 | :param grad_norm_clipping: 32 | :param local_q_func: 33 | :param num_units: 34 | :param scope: 35 | :param reuse: 36 | :return: 37 | """ 38 | with tf.variable_scope(scope, reuse=reuse): 39 | # create distribtuions 40 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 41 | 42 | # set up placeholders 43 | obs_ph_n = make_obs_ph_n 44 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 45 | 46 | p_input = obs_ph_n[p_index] 47 | 48 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) 49 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 50 | 51 | # wrap parameters in distribution 52 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 53 | 54 | act_sample = act_pd.sample() 55 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 56 | 57 | act_input_n = act_ph_n + [] 58 | act_input_n[p_index] = act_pd.sample() 59 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 60 | 61 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0] 62 | pg_loss = -tf.reduce_mean(q) 63 | 64 | loss = pg_loss + p_reg * 1e-3 65 | 66 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 67 | 68 | # Create callable functions 69 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, 
updates=[optimize_expr]) 70 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) 71 | p_values = U.function([obs_ph_n[p_index]], p) 72 | 73 | # target network 74 | target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) 75 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 76 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 77 | 78 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 79 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) 80 | 81 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 82 | 83 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 84 | with tf.variable_scope(scope, reuse=reuse): 85 | # create distribtuions 86 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 87 | 88 | # set up placeholders 89 | obs_ph_n = make_obs_ph_n 90 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 91 | target_ph = tf.placeholder(tf.float32, [None], name="target") 92 | 93 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 94 | if local_q_func: 95 | q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) 96 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0] 97 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 98 | 99 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 100 | 101 | # viscosity solution to Bellman differential equation in place of an initial condition 102 | q_reg = tf.reduce_mean(tf.square(q)) 103 | loss = q_loss #+ 1e-3 * q_reg 104 | 105 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 106 | 107 | # Create callable functions 108 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 109 | q_values = U.function(obs_ph_n + act_ph_n, q) 110 | 111 | action_grad = optimizer.compute_gradients(q, act_ph_n) 112 | action_grad_func = U.function(inputs=obs_ph_n + act_ph_n, outputs=action_grad) 113 | 114 | # target network 115 | target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0] 116 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 117 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 118 | 119 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 120 | 121 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values, 'action_grad': action_grad_func} 122 | 123 | class MADDPGAgentTrainer(AgentTrainer): 124 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 125 | self.name = name 126 | self.n = len(obs_shape_n) 127 | self.agent_index = agent_index 128 | self.args = args 129 | obs_ph_n = [] 130 | for i in range(self.n): 131 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 132 | 133 | # Create all the functions necessary to train the model 134 | self.q_train, self.q_update, self.q_debug = q_train( 135 | scope=self.name, 136 | make_obs_ph_n=obs_ph_n, 137 | act_space_n=act_space_n, 138 | q_index=agent_index, 139 | q_func=model, 140 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 141 | grad_norm_clipping=0.5, 142 | local_q_func=local_q_func, 143 | num_units=args.num_units 144 | ) 145 | self.act, 
self.p_train, self.p_update, self.p_debug = p_train( 146 | scope=self.name, 147 | make_obs_ph_n=obs_ph_n, 148 | act_space_n=act_space_n, 149 | p_index=agent_index, 150 | p_func=model, 151 | q_func=model, 152 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 153 | grad_norm_clipping=0.5, 154 | local_q_func=local_q_func, 155 | num_units=args.num_units 156 | ) 157 | # Create experience buffer 158 | self.replay_buffer = ReplayBuffer(1e6) 159 | self.min_replay_buffer_len = args.batch_size * args.max_episode_len 160 | self.replay_sample_index = None 161 | a = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph()) 162 | a.flush() 163 | a.close() 164 | 165 | def action(self, obs): 166 | return self.act(obs[None])[0] 167 | 168 | def experience(self, obs, act, rew, new_obs, done, terminal): 169 | # Store transition in the replay buffer. 170 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 171 | 172 | def preupdate(self): 173 | self.replay_sample_index = None 174 | 175 | def update(self, agents, t): 176 | if len(self.replay_buffer) < self.min_replay_buffer_len: # replay buffer is not large enough 177 | return 178 | if not t % self.args.update_rate == 0: # only update every 100 steps 179 | return 180 | 181 | self.replay_sample_index = self.replay_buffer.generate_sample_indices(self.args.batch_size) 182 | # collect replay sample from all agents 183 | obs_n = [] 184 | obs_next_n = [] 185 | act_n = [] 186 | index = self.replay_sample_index 187 | for i in range(self.n): 188 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 189 | obs_n.append(obs) 190 | obs_next_n.append(obs_next) 191 | act_n.append(act) 192 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 193 | 194 | 195 | # train q network 196 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 197 | 198 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 199 | if self.args.critic_zero_if_done: 200 | done_cond = done == True 201 | target_q_next[done_cond] = 0 202 | 203 | target_q = rew + self.args.gamma * target_q_next 204 | q_loss = self.q_train(*(obs_n + act_n + [target_q])) 205 | # print('Action gradient = ') 206 | 207 | # train p network 208 | p_loss = self.p_train(*(obs_n + act_n)) 209 | 210 | self.p_update() 211 | self.q_update() 212 | #embed() 213 | 214 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 215 | -------------------------------------------------------------------------------- /matd3/maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
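        Note: sampling from this buffer is uniform at random; transitions are
        not prioritized.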
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 45 | 46 | def generate_sample_indices(self, batch_size): 47 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 48 | 49 | def make_latest_index(self, batch_size): 50 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 51 | np.random.shuffle(idx) 52 | return idx 53 | 54 | def sample_index(self, idxes): 55 | return self._encode_sample(idxes) 56 | 57 | def sample(self, batch_size): 58 | """Sample a batch of experiences. 59 | 60 | Parameters 61 | ---------- 62 | batch_size: int 63 | How many transitions to sample. 64 | 65 | Returns 66 | ------- 67 | obs_batch: np.array 68 | batch of observations 69 | act_batch: np.array 70 | batch of actions executed given obs_batch 71 | rew_batch: np.array 72 | rewards received as results of executing act_batch 73 | next_obs_batch: np.array 74 | next set of observations seen after executing act_batch 75 | done_mask: np.array 76 | done_mask[i] = 1 if executing act_batch[i] resulted in 77 | the end of an episode and 0 otherwise. 
78 | """ 79 | if batch_size > 0: 80 | idxes = self.generate_sample_indices(batch_size) 81 | else: 82 | idxes = range(0, len(self._storage)) 83 | return self._encode_sample(idxes) 84 | 85 | def collect(self): 86 | return self.sample(-1) 87 | -------------------------------------------------------------------------------- /matd3/matd3/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /matd3/matd3/trainer/matd3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | import common.tf_util as U 5 | from maddpg import AgentTrainer 6 | from common.distributions import make_pdtype 7 | from maddpg.trainer.replay_buffer import ReplayBuffer 8 | 9 | 10 | def make_update_exp(vals, target_vals): 11 | polyak = 1.0 - 1e-2 12 | expression = [] 13 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 14 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 15 | expression = tf.group(*expression) 16 | return U.function([], [], updates=[expression]) 17 | 18 | 19 | def p_train(make_obs_ph_n, act_space_n, agent_idx, p_func, q_func, optimizer, grad_norm_clipping=None, 20 | local_q_func=False, num_units=64, scope="trainer", reuse=None): 21 | """ 22 | 23 | :param make_obs_ph_n: 24 | :param act_space_n: 25 | :param agent_idx: 26 | :param p_func: in base maddpg code = mlp_model 27 | :param q_func: in base maddpg code = mlp_model 28 | :param optimizer: 29 | :param grad_norm_clipping: 30 | :param local_q_func: 31 | :param num_units: 32 | :param scope: 33 | :param reuse: 34 | :return: 35 | """ 36 | with tf.variable_scope(scope, reuse=reuse): 37 | # create distribtuions 38 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 39 | 40 | # set up placeholders 41 | obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n] 42 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 43 | 44 | p_input = obs_ph_n[agent_idx] 45 | 46 | p = p_func(p_input, int(act_pdtype_n[agent_idx].param_shape()[0]), scope="p_func", num_units=num_units) 47 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 48 | 49 | # wrap parameters in distribution 50 | act_pd = act_pdtype_n[agent_idx].pdfromflat(p) 51 | 52 | act_sample = act_pd.sample() 53 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 54 | 55 | act_input_n = act_ph_n + [] 56 | act_input_n[agent_idx] = act_pd.sample() #act_pd.mode() # 57 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 58 | 59 | q = q_func(q_input, 1, scope="q_func" + str(1), reuse=True, num_units=num_units)[:,0] 60 | 61 | loss = -tf.reduce_mean(q) + p_reg * 1e-3 62 | 63 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 64 | 65 | # Create callable functions 66 | train = U.function(inputs=make_obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 67 | act = 
U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=act_sample) 68 | p_values = U.function([make_obs_ph_n[agent_idx]], p) 69 | 70 | # target network 71 | target_p = p_func(p_input, int(act_pdtype_n[agent_idx].param_shape()[0]), scope="target_p_func", num_units=num_units) 72 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 73 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 74 | 75 | target_act_sample = act_pdtype_n[agent_idx].pdfromflat(target_p).sample() 76 | target_act = U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=target_act_sample) 77 | 78 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 79 | 80 | def q_train(make_obs_ph_n, act_space_n, agent_idx, q_func, q_function_idx, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 81 | with tf.variable_scope(scope, reuse=reuse): 82 | # create distribtuions 83 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 84 | 85 | # set up placeholders 86 | obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n] 87 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 88 | target_ph = tf.placeholder(tf.float32, [None], name="target") 89 | 90 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 91 | if local_q_func: 92 | q_input = tf.concat([obs_ph_n[agent_idx], act_ph_n[agent_idx]], 1) 93 | q = q_func(q_input, 1, scope="q_func" + str(q_function_idx), num_units=num_units)[:,0] 94 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func" + str(q_function_idx))) 95 | 96 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 97 | 98 | # viscosity solution to Bellman differential equation in place of an initial condition 99 | q_reg = tf.reduce_mean(tf.square(q)) 100 | loss = q_loss #+ 1e-3 * q_reg 101 | 102 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 103 | 104 | # Create callable functions 105 | train = U.function(inputs=make_obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 106 | q_values = U.function(make_obs_ph_n + act_ph_n, q) 107 | 108 | # target network 109 | target_q = q_func(q_input, 1, scope="target_q_func" + str(q_function_idx), num_units=num_units)[:,0] 110 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func" + str(q_function_idx))) 111 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 112 | 113 | target_q_values = U.function(make_obs_ph_n + act_ph_n, target_q) 114 | 115 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 116 | 117 | class MATD3AgentTrainer(AgentTrainer): 118 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 119 | self.name = name 120 | self.n = len(obs_shape_n) 121 | self.agent_index = agent_index 122 | self.args = args 123 | obs_ph_n = [] 124 | for i in range(self.n): 125 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 126 | 127 | # Create all the functions necessary to train the model 128 | self.q_train1, self.q_update1, self.q_debug1 = q_train( 129 | scope=self.name, 130 | make_obs_ph_n=obs_ph_n, 131 | act_space_n=act_space_n, 132 | agent_idx=agent_index, 133 | q_function_idx=1, 134 | q_func=model, 135 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 136 | grad_norm_clipping=0.5, 137 | local_q_func=local_q_func, 138 | num_units=args.num_units 139 | ) 140 | self.q_train2, 
self.q_update2, self.q_debug2 = q_train( 141 | scope=self.name, 142 | make_obs_ph_n=obs_ph_n, 143 | act_space_n=act_space_n, 144 | agent_idx=agent_index, 145 | q_func=model, 146 | q_function_idx=2, 147 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 148 | grad_norm_clipping=0.5, 149 | local_q_func=local_q_func, 150 | num_units=args.num_units 151 | ) 152 | 153 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 154 | scope=self.name, 155 | make_obs_ph_n=obs_ph_n, 156 | act_space_n=act_space_n, 157 | agent_idx=agent_index, 158 | p_func=model, 159 | q_func=model, #MLPmodel() 160 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 161 | grad_norm_clipping=0.5, 162 | local_q_func=local_q_func, 163 | num_units=args.num_units 164 | ) 165 | # Create experience buffer 166 | self.replay_buffer = ReplayBuffer(1e6) 167 | self.min_replay_buffer_len = args.batch_size * args.max_episode_len 168 | self.replay_sample_index = None 169 | a = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph()) 170 | a.flush() 171 | a.close() 172 | 173 | def action(self, obs): 174 | return self.act(obs[None])[0] 175 | 176 | def experience(self, obs, act, rew, new_obs, done, terminal): 177 | # Store transition in the replay buffer. 178 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 179 | 180 | def preupdate(self): 181 | self.replay_sample_index = None 182 | 183 | @property 184 | def q_debug(self): 185 | return self.q_debug1 186 | 187 | def update(self, agents, train_step): 188 | if len(self.replay_buffer) < self.min_replay_buffer_len: # replay buffer is not large enough 189 | return 190 | 191 | if not train_step % self.args.update_rate == 0: 192 | return 193 | 194 | 195 | self.replay_sample_index = self.replay_buffer.generate_sample_indices(self.args.batch_size) 196 | # collect replay sample from all agents 197 | obs_n = [] 198 | obs_next_n = [] 199 | act_n = [] 200 | index = self.replay_sample_index 201 | for i in range(self.n): 202 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 203 | obs_n.append(obs) 204 | obs_next_n.append(obs_next) 205 | act_n.append(act) 206 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 207 | 208 | # train q network 209 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 210 | if self.args.use_critic_noise: 211 | for agent_idx in range(self.n): 212 | noise = np.random.normal(0, self.args.critic_action_noise_stddev, size=target_act_next_n[agent_idx].shape) 213 | clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) 214 | target_act_next_n[agent_idx] = (target_act_next_n[agent_idx] + clipped_noise).tolist() 215 | elif self.args.use_critic_noise_self: 216 | noise = np.random.normal(0, self.args.critic_action_noise_stddev, 217 | size=target_act_next_n[self.agent_index].shape) 218 | clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) 219 | target_act_next_n[self.agent_index] = target_act_next_n[self.agent_index] + clipped_noise 220 | target_act_next_n = target_act_next_n.tolist() 221 | else: 222 | target_act_next_n = target_act_next_n 223 | target_q_next1 = self.q_debug1['target_q_values'](*(obs_next_n + target_act_next_n)) 224 | target_q_next2 = self.q_debug2['target_q_values'](*(obs_next_n + target_act_next_n)) 225 | target_q_next = np.min([target_q_next1, target_q_next2], 0) 226 | if self.args.critic_zero_if_done: 227 | done_cond = done == True 228 | target_q_next[done_cond] = 0 
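        # TD3 ingredients at work here: the Bellman target built below uses the
        # element-wise minimum of the two target critics (clipped double-Q),
        # target actions were optionally smoothed with clipped noise above, and
        # the actor/target-network updates further down are delayed relative to
        # the critic updates.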
229 | 230 | target_q = rew + self.args.gamma * target_q_next 231 | q_loss = self.q_train1(*(obs_n + act_n + [target_q])) 232 | q_loss = self.q_train2(*(obs_n + act_n + [target_q])) 233 | 234 | # train p network 235 | if train_step % (self.args.update_rate * self.args.policy_update_rate) == 0: 236 | p_loss = self.p_train(*(obs_n + act_n)) 237 | self.p_update() 238 | self.q_update1() 239 | self.q_update2() 240 | 241 | # print('Agent' + str(self.agent_index) + ' Qloss = ' + str(q_loss) + ' Ploss = ' + str(p_loss)) 242 | # print('Replay buffer size:' + str(len(self.replay_buffer))) 243 | 244 | 245 | return [q_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 246 | -------------------------------------------------------------------------------- /matd3/matd3/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size): 7 | """Create Prioritized Replay buffer. 8 | 9 | Parameters 10 | ---------- 11 | size: int 12 | Max number of transitions to store in the buffer. When the buffer 13 | overflows the old memories are dropped. 14 | """ 15 | self._storage = [] 16 | self._maxsize = int(size) 17 | self._next_idx = 0 18 | 19 | def __len__(self): 20 | return len(self._storage) 21 | 22 | def clear(self): 23 | self._storage = [] 24 | self._next_idx = 0 25 | 26 | def add(self, obs_t, action, reward, obs_tp1, done): 27 | data = (obs_t, action, reward, obs_tp1, done) 28 | 29 | if self._next_idx >= len(self._storage): 30 | self._storage.append(data) 31 | else: 32 | self._storage[self._next_idx] = data 33 | self._next_idx = (self._next_idx + 1) % self._maxsize 34 | 35 | def _encode_sample(self, idxes): 36 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 37 | for i in idxes: 38 | data = self._storage[i] 39 | obs_t, action, reward, obs_tp1, done = data 40 | obses_t.append(np.array(obs_t, copy=False)) 41 | actions.append(np.array(action, copy=False)) 42 | rewards.append(reward) 43 | obses_tp1.append(np.array(obs_tp1, copy=False)) 44 | dones.append(done) 45 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 46 | 47 | def generate_sample_indices(self, batch_size): 48 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 49 | 50 | def make_latest_index(self, batch_size): 51 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 52 | np.random.shuffle(idx) 53 | return idx 54 | 55 | def sample_index(self, idxes): 56 | return self._encode_sample(idxes) 57 | 58 | def sample(self, batch_size): 59 | """Sample a batch of experiences. 60 | 61 | Parameters 62 | ---------- 63 | batch_size: int 64 | How many transitions to sample. 65 | 66 | Returns 67 | ------- 68 | obs_batch: np.array 69 | batch of observations 70 | act_batch: np.array 71 | batch of actions executed given obs_batch 72 | rew_batch: np.array 73 | rewards received as results of executing act_batch 74 | next_obs_batch: np.array 75 | next set of observations seen after executing act_batch 76 | done_mask: np.array 77 | done_mask[i] = 1 if executing act_batch[i] resulted in 78 | the end of an episode and 0 otherwise. 
79 | """ 80 | if batch_size > 0: 81 | idxes = self.generate_sample_indices(batch_size) 82 | else: 83 | idxes = range(0, len(self._storage)) 84 | return self._encode_sample(idxes) 85 | 86 | def collect(self): 87 | return self.sample(-1) 88 | -------------------------------------------------------------------------------- /matd3/multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /matd3/multiagent/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # physical/external base state of all entites 4 | class EntityState(object): 5 | def __init__(self): 6 | # physical position 7 | self.p_pos = None 8 | # physical velocity 9 | self.p_vel = None 10 | 11 | # state of agents (including communication and internal/mental state) 12 | class AgentState(EntityState): 13 | def __init__(self): 14 | super(AgentState, self).__init__() 15 | # communication utterance 16 | self.c = None 17 | 18 | # action of the agent 19 | class Action(object): 20 | def __init__(self): 21 | # physical action 22 | self.u = None 23 | # communication action 24 | self.c = None 25 | 26 | # properties and state of physical world entity 27 | class Entity(object): 28 | def __init__(self): 29 | # name 30 | self.name = '' 31 | # properties: 32 | self.size = 0.050 33 | # entity can move / be pushed 34 | self.movable = False 35 | # entity collides with others 36 | self.collide = True 37 | # material density (affects mass) 38 | self.density = 25.0 39 | # color 40 | self.color = None 41 | # max speed and accel 42 | self.max_speed = None 43 | self.accel = None 44 | # state 45 | self.state = EntityState() 46 | # mass 47 | self.initial_mass = 1.0 48 | 49 | @property 50 | def mass(self): 51 | return self.initial_mass 52 | 53 | # properties of landmark entities 54 | class Landmark(Entity): 55 | def __init__(self): 56 | super(Landmark, self).__init__() 57 | 58 | # properties of agent entities 59 | class Agent(Entity): 60 | def __init__(self): 61 | super(Agent, self).__init__() 62 | # agents are movable by default 63 | self.movable = True 64 | # cannot send communication signals 65 | self.silent = False 66 | # cannot observe the world 67 | self.blind = False 68 | # physical motor noise amount 69 | self.u_noise = None 70 | # communication noise amount 71 | self.c_noise = None 72 | # control range 73 | self.u_range = 1.0 74 | # state 75 | self.state = AgentState() 76 | # action 77 | self.action = Action() 78 | # script behavior to execute 79 | self.action_callback = None 80 | 81 | # multi-agent world 82 | class World(object): 83 | def __init__(self): 84 | # list of agents and entities (can change at execution-time!) 
85 | self.agents = [] 86 | self.landmarks = [] 87 | # communication channel dimensionality 88 | self.dim_c = 0 89 | # position dimensionality 90 | self.dim_p = 2 91 | # color dimensionality 92 | self.dim_color = 3 93 | # simulation timestep 94 | self.dt = 0.1 95 | # physical damping 96 | self.damping = 0.25 97 | # contact response parameters 98 | self.contact_force = 1e+2 99 | self.contact_margin = 1e-3 100 | 101 | # return all entities in the world 102 | @property 103 | def entities(self): 104 | return self.agents + self.landmarks 105 | 106 | # return all agents controllable by external policies 107 | @property 108 | def policy_agents(self): 109 | return [agent for agent in self.agents if agent.action_callback is None] 110 | 111 | # return all agents controlled by world scripts 112 | @property 113 | def scripted_agents(self): 114 | return [agent for agent in self.agents if agent.action_callback is not None] 115 | 116 | # update state of the world 117 | def step(self): 118 | # set actions for scripted agents 119 | for agent in self.scripted_agents: 120 | agent.action = agent.action_callback(agent, self) 121 | # gather forces applied to entities 122 | p_force = [None] * len(self.entities) 123 | # apply agent physical controls 124 | p_force = self.apply_action_force(p_force) 125 | # apply environment forces 126 | p_force = self.apply_environment_force(p_force) 127 | # integrate physical state 128 | self.integrate_state(p_force) 129 | # update agent state 130 | for agent in self.agents: 131 | self.update_agent_state(agent) 132 | 133 | # gather agent action forces 134 | def apply_action_force(self, p_force): 135 | # set applied forces 136 | for i,agent in enumerate(self.agents): 137 | if agent.movable: 138 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 139 | p_force[i] = agent.action.u + noise 140 | return p_force 141 | 142 | # gather physical forces acting on entities 143 | def apply_environment_force(self, p_force): 144 | # simple (but inefficient) collision response 145 | for a,entity_a in enumerate(self.entities): 146 | for b,entity_b in enumerate(self.entities): 147 | if(b <= a): continue 148 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b) 149 | if(f_a is not None): 150 | if(p_force[a] is None): p_force[a] = 0.0 151 | p_force[a] = f_a + p_force[a] 152 | if(f_b is not None): 153 | if(p_force[b] is None): p_force[b] = 0.0 154 | p_force[b] = f_b + p_force[b] 155 | return p_force 156 | 157 | # integrate physical state 158 | def integrate_state(self, p_force): 159 | for i,entity in enumerate(self.entities): 160 | if not entity.movable: continue 161 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 162 | if (p_force[i] is not None): 163 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 164 | if entity.max_speed is not None: 165 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 166 | if speed > entity.max_speed: 167 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 168 | np.square(entity.state.p_vel[1])) * entity.max_speed 169 | entity.state.p_pos += entity.state.p_vel * self.dt 170 | 171 | def update_agent_state(self, agent): 172 | # set communication state (directly for now) 173 | if agent.silent: 174 | agent.state.c = np.zeros(self.dim_c) 175 | else: 176 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 177 | agent.state.c = agent.action.c + noise 178 | 179 | # get collision forces for any contact 
between two entities 180 | def get_collision_force(self, entity_a, entity_b): 181 | if (not entity_a.collide) or (not entity_b.collide): 182 | return [None, None] # not a collider 183 | if (entity_a is entity_b): 184 | return [None, None] # don't collide against itself 185 | # compute actual distance between entities 186 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 187 | dist = np.sqrt(np.sum(np.square(delta_pos))) 188 | # minimum allowable distance 189 | dist_min = entity_a.size + entity_b.size 190 | # softmax penetration 191 | k = self.contact_margin 192 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 193 | force = self.contact_force * delta_pos / dist * penetration 194 | force_a = +force if entity_a.movable else None 195 | force_b = -force if entity_b.movable else None 196 | return [force_a, force_b] -------------------------------------------------------------------------------- /matd3/multiagent/environment.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.envs.registration import EnvSpec 4 | import numpy as np 5 | from multiagent.multi_discrete import MultiDiscrete 6 | 7 | # environment for all agents in the multiagent world 8 | # currently code assumes that no agents will be created/destroyed at runtime! 9 | class MultiAgentEnv(gym.Env): 10 | metadata = { 11 | 'render.modes' : ['human', 'rgb_array'] 12 | } 13 | 14 | def __init__(self, world, reset_callback=None, reward_callback=None, 15 | observation_callback=None, info_callback=None, 16 | done_callback=None, shared_viewer=True): 17 | 18 | self.world = world 19 | self.agents = self.world.policy_agents 20 | # set required vectorized gym env property 21 | self.n = len(world.policy_agents) 22 | # scenario callbacks 23 | self.reset_callback = reset_callback 24 | self.reward_callback = reward_callback 25 | self.observation_callback = observation_callback 26 | self.info_callback = info_callback 27 | self.done_callback = done_callback 28 | # environment parameters 29 | self.discrete_action_space = True 30 | # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector 31 | self.discrete_action_input = False 32 | # if true, even the action is continuous, action will be performed discretely 33 | self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False 34 | # if true, every agent has the same reward 35 | self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False 36 | self.time = 0 37 | 38 | # configure spaces 39 | self.action_space = [] 40 | self.observation_space = [] 41 | for agent in self.agents: 42 | total_action_space = [] 43 | # physical action space 44 | if self.discrete_action_space: 45 | u_action_space = spaces.Discrete(world.dim_p * 2 + 1) 46 | else: 47 | u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32) 48 | if agent.movable: 49 | total_action_space.append(u_action_space) 50 | # communication action space 51 | if self.discrete_action_space: 52 | c_action_space = spaces.Discrete(world.dim_c) 53 | else: 54 | c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32) 55 | if not agent.silent: 56 | total_action_space.append(c_action_space) 57 | # total action space 58 | if len(total_action_space) > 1: 59 | # all action spaces are discrete, so simplify to MultiDiscrete action space 60 | if all([isinstance(act_space, spaces.Discrete) for act_space 
in total_action_space]): 61 | act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space]) 62 | else: 63 | act_space = spaces.Tuple(total_action_space) 64 | self.action_space.append(act_space) 65 | else: 66 | self.action_space.append(total_action_space[0]) 67 | # observation space 68 | obs_dim = len(observation_callback(agent, self.world)) 69 | self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) 70 | agent.action.c = np.zeros(self.world.dim_c) 71 | 72 | # rendering 73 | self.shared_viewer = shared_viewer 74 | if self.shared_viewer: 75 | self.viewers = [None] 76 | else: 77 | self.viewers = [None] * self.n 78 | self._reset_render() 79 | 80 | def step(self, action_n): 81 | obs_n = [] 82 | reward_n = [] 83 | done_n = [] 84 | info_n = {'n': []} 85 | self.agents = self.world.policy_agents 86 | # set action for each agent 87 | for i, agent in enumerate(self.agents): 88 | self._set_action(action_n[i], agent, self.action_space[i]) 89 | # advance world state 90 | self.world.step() 91 | # record observation for each agent 92 | for agent in self.agents: 93 | obs_n.append(self._get_obs(agent)) 94 | reward_n.append(self._get_reward(agent)) 95 | done_n.append(self._get_done(agent)) 96 | 97 | info_n['n'].append(self._get_info(agent)) 98 | 99 | # all agents get total reward in cooperative case 100 | reward = np.sum(reward_n) 101 | if self.shared_reward: 102 | reward_n = [reward] * self.n 103 | 104 | return obs_n, reward_n, done_n, info_n 105 | 106 | def reset(self): 107 | # reset world 108 | self.reset_callback(self.world) 109 | # reset renderer 110 | self._reset_render() 111 | # record observations for each agent 112 | obs_n = [] 113 | self.agents = self.world.policy_agents 114 | for agent in self.agents: 115 | obs_n.append(self._get_obs(agent)) 116 | return obs_n 117 | 118 | # get info used for benchmarking 119 | def _get_info(self, agent): 120 | if self.info_callback is None: 121 | return {} 122 | return self.info_callback(agent, self.world) 123 | 124 | # get observation for a particular agent 125 | def _get_obs(self, agent): 126 | if self.observation_callback is None: 127 | return np.zeros(0) 128 | return self.observation_callback(agent, self.world) 129 | 130 | # get dones for a particular agent 131 | # unused right now -- agents are allowed to go beyond the viewing screen 132 | def _get_done(self, agent): 133 | if self.done_callback is None: 134 | return False 135 | return self.done_callback(agent, self.world) 136 | 137 | # get reward for a particular agent 138 | def _get_reward(self, agent): 139 | if self.reward_callback is None: 140 | return 0.0 141 | return self.reward_callback(agent, self.world) 142 | 143 | # set env action for a particular agent 144 | def _set_action(self, action, agent, action_space, time=None): 145 | agent.action.u = np.zeros(self.world.dim_p) 146 | agent.action.c = np.zeros(self.world.dim_c) 147 | # process action 148 | if isinstance(action_space, MultiDiscrete): 149 | act = [] 150 | size = action_space.high - action_space.low + 1 151 | index = 0 152 | for s in size: 153 | act.append(action[index:(index+s)]) 154 | index += s 155 | action = act 156 | else: 157 | action = [action] 158 | 159 | if agent.movable: 160 | # physical action 161 | if self.discrete_action_input: 162 | agent.action.u = np.zeros(self.world.dim_p) 163 | # process discrete action 164 | if action[0] == 1: agent.action.u[0] = -1.0 165 | if action[0] == 2: agent.action.u[0] = +1.0 166 | if action[0] == 3: agent.action.u[1] = -1.0 167 
| if action[0] == 4: agent.action.u[1] = +1.0 168 | else: 169 | if self.force_discrete_action: 170 | d = np.argmax(action[0]) 171 | action[0][:] = 0.0 172 | action[0][d] = 1.0 173 | if self.discrete_action_space: 174 | agent.action.u[0] += action[0][1] - action[0][2] 175 | agent.action.u[1] += action[0][3] - action[0][4] 176 | else: 177 | agent.action.u = action[0] 178 | sensitivity = 5.0 179 | if agent.accel is not None: 180 | sensitivity = agent.accel 181 | agent.action.u *= sensitivity 182 | action = action[1:] 183 | if not agent.silent: 184 | # communication action 185 | if self.discrete_action_input: 186 | agent.action.c = np.zeros(self.world.dim_c) 187 | agent.action.c[action[0]] = 1.0 188 | else: 189 | agent.action.c = action[0] 190 | action = action[1:] 191 | # make sure we used all elements of action 192 | assert len(action) == 0 193 | 194 | # reset rendering assets 195 | def _reset_render(self): 196 | self.render_geoms = None 197 | self.render_geoms_xform = None 198 | 199 | # render environment 200 | def render(self, mode='human'): 201 | if mode == 'human': 202 | alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 203 | message = '' 204 | for agent in self.world.agents: 205 | comm = [] 206 | for other in self.world.agents: 207 | if other is agent: continue 208 | if np.all(other.state.c == 0): 209 | word = '_' 210 | else: 211 | word = alphabet[np.argmax(other.state.c)] 212 | message += (other.name + ' to ' + agent.name + ': ' + word + ' ') 213 | print(message) 214 | 215 | for i in range(len(self.viewers)): 216 | # create viewers (if necessary) 217 | if self.viewers[i] is None: 218 | # import rendering only if we need it (and don't import for headless machines) 219 | #from gym.envs.classic_control import rendering 220 | from multiagent import rendering 221 | self.viewers[i] = rendering.Viewer(700,700) 222 | 223 | # create rendering geometry 224 | if self.render_geoms is None: 225 | # import rendering only if we need it (and don't import for headless machines) 226 | #from gym.envs.classic_control import rendering 227 | from multiagent import rendering 228 | self.render_geoms = [] 229 | self.render_geoms_xform = [] 230 | for entity in self.world.entities: 231 | geom = rendering.make_circle(entity.size) 232 | xform = rendering.Transform() 233 | if 'agent' in entity.name: 234 | geom.set_color(*entity.color) 235 | else: 236 | geom.set_color(*entity.color) 237 | geom.add_attr(xform) 238 | self.render_geoms.append(geom) 239 | self.render_geoms_xform.append(xform) 240 | 241 | # add geoms to viewer 242 | for viewer in self.viewers: 243 | viewer.geoms = [] 244 | for geom in self.render_geoms: 245 | viewer.add_geom(geom) 246 | 247 | results = [] 248 | for i in range(len(self.viewers)): 249 | from multiagent import rendering 250 | # update bounds to center around agent 251 | cam_range = 1 252 | if self.shared_viewer: 253 | pos = np.zeros(self.world.dim_p) 254 | else: 255 | pos = self.agents[i].state.p_pos 256 | self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range) 257 | # update geometry positions 258 | for e, entity in enumerate(self.world.entities): 259 | self.render_geoms_xform[e].set_translation(*entity.state.p_pos) 260 | # render to display or array 261 | results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array')) 262 | 263 | return results 264 | 265 | # create receptor field locations in local coordinate frame 266 | def _make_receptor_locations(self, agent): 267 | receptor_type = 'polar' 268 | range_min = 0.05 * 2.0 269 | range_max = 1.00 
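        # dx collects 2-D offsets (in the agent's local frame) at which receptors are
        # placed: in 'polar' mode, 8 angles x 3 distances plus the origin (25 points);
        # in 'grid' mode, a 5 x 5 lattice spanning [-range_max, +range_max].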
270 | dx = [] 271 | # circular receptive field 272 | if receptor_type == 'polar': 273 | for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False): 274 | for distance in np.linspace(range_min, range_max, 3): 275 | dx.append(distance * np.array([np.cos(angle), np.sin(angle)])) 276 | # add origin 277 | dx.append(np.array([0.0, 0.0])) 278 | # grid receptive field 279 | if receptor_type == 'grid': 280 | for x in np.linspace(-range_max, +range_max, 5): 281 | for y in np.linspace(-range_max, +range_max, 5): 282 | dx.append(np.array([x,y])) 283 | return dx 284 | 285 | 286 | # vectorized wrapper for a batch of multi-agent environments 287 | # assumes all environments have the same observation and action space 288 | class BatchMultiAgentEnv(gym.Env): 289 | metadata = { 290 | 'runtime.vectorized': True, 291 | 'render.modes' : ['human', 'rgb_array'] 292 | } 293 | 294 | def __init__(self, env_batch): 295 | self.env_batch = env_batch 296 | 297 | @property 298 | def n(self): 299 | return np.sum([env.n for env in self.env_batch]) 300 | 301 | @property 302 | def action_space(self): 303 | return self.env_batch[0].action_space 304 | 305 | @property 306 | def observation_space(self): 307 | return self.env_batch[0].observation_space 308 | 309 | def step(self, action_n, time): 310 | obs_n = [] 311 | reward_n = [] 312 | done_n = [] 313 | info_n = {'n': []} 314 | i = 0 315 | for env in self.env_batch: 316 | obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time) 317 | i += env.n 318 | obs_n += obs 319 | # reward = [r / len(self.env_batch) for r in reward] 320 | reward_n += reward 321 | done_n += done 322 | return obs_n, reward_n, done_n, info_n 323 | 324 | def reset(self): 325 | obs_n = [] 326 | for env in self.env_batch: 327 | obs_n += env.reset() 328 | return obs_n 329 | 330 | # render environment 331 | def render(self, mode='human', close=True): 332 | results_n = [] 333 | for env in self.env_batch: 334 | results_n += env.render(mode, close) 335 | return results_n 336 | -------------------------------------------------------------------------------- /matd3/multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. 
Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | random_array = prng.np_random.rand(self.num_discrete_space) 34 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 35 | def contains(self, x): 36 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 37 | 38 | @property 39 | def shape(self): 40 | return self.num_discrete_space 41 | def __repr__(self): 42 | return "MultiDiscrete" + str(self.num_discrete_space) 43 | def __eq__(self, other): 44 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /matd3/multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.LEFT: self.move[0] = True 45 | if k==key.RIGHT: self.move[1] = True 46 | if k==key.UP: self.move[2] = True 47 | if k==key.DOWN: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.LEFT: self.move[0] = False 50 | if k==key.RIGHT: self.move[1] = False 51 | if k==key.UP: self.move[2] = False 52 | if k==key.DOWN: self.move[3] = False 53 | -------------------------------------------------------------------------------- /matd3/multiagent/rendering.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | def get_display(spec): 33 | """Convert a display specification (such as :0) into an actual Display 34 | object. 35 | 36 | Pyglet only supports multiple Displays on Linux. 37 | """ 38 | if spec is None: 39 | return None 40 | elif isinstance(spec, six.string_types): 41 | return pyglet.canvas.Display(spec) 42 | else: 43 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 44 | 45 | class Viewer(object): 46 | def __init__(self, width, height, display=None): 47 | display = get_display(display) 48 | 49 | self.width = width 50 | self.height = height 51 | 52 | self.window = pyglet.window.Window(width=width, height=height, display=display) 53 | self.window.on_close = self.window_closed_by_user 54 | self.geoms = [] 55 | self.onetime_geoms = [] 56 | self.transform = Transform() 57 | 58 | glEnable(GL_BLEND) 59 | # glEnable(GL_MULTISAMPLE) 60 | glEnable(GL_LINE_SMOOTH) 61 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 62 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 63 | glLineWidth(2.0) 64 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 65 | 66 | def close(self): 67 | self.window.close() 68 | 69 | def window_closed_by_user(self): 70 | self.close() 71 | 72 | def set_bounds(self, left, right, bottom, top): 73 | assert right > left and top > bottom 74 | scalex = self.width/(right-left) 75 | scaley = self.height/(top-bottom) 76 | self.transform = Transform( 77 | translation=(-left*scalex, -bottom*scaley), 78 | scale=(scalex, scaley)) 79 | 80 | def add_geom(self, geom): 81 | self.geoms.append(geom) 82 | 83 | def add_onetime(self, geom): 84 | self.onetime_geoms.append(geom) 85 | 86 | def render(self, return_rgb_array=False): 87 | glClearColor(1,1,1,1) 88 | self.window.clear() 89 | self.window.switch_to() 90 | self.window.dispatch_events() 91 | self.transform.enable() 92 | for geom in self.geoms: 93 | geom.render() 94 | for geom in self.onetime_geoms: 95 | geom.render() 96 | self.transform.disable() 97 | arr = None 98 | if return_rgb_array: 99 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 100 | image_data = buffer.get_image_data() 101 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 102 | # In https://github.com/openai/gym-http-api/issues/2, we 103 | # discovered 
that someone using Xmonad on Arch was having 104 | # a window of size 598 x 398, though a 600 x 400 window 105 | # was requested. (Guess Xmonad was preserving a pixel for 106 | # the boundary.) So we use the buffer height/width rather 107 | # than the requested one. 108 | arr = arr.reshape(buffer.height, buffer.width, 4) 109 | arr = arr[::-1,:,0:3] 110 | self.window.flip() 111 | self.onetime_geoms = [] 112 | return arr 113 | 114 | # Convenience 115 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 116 | geom = make_circle(radius=radius, res=res, filled=filled) 117 | _add_attrs(geom, attrs) 118 | self.add_onetime(geom) 119 | return geom 120 | 121 | def draw_polygon(self, v, filled=True, **attrs): 122 | geom = make_polygon(v=v, filled=filled) 123 | _add_attrs(geom, attrs) 124 | self.add_onetime(geom) 125 | return geom 126 | 127 | def draw_polyline(self, v, **attrs): 128 | geom = make_polyline(v=v) 129 | _add_attrs(geom, attrs) 130 | self.add_onetime(geom) 131 | return geom 132 | 133 | def draw_line(self, start, end, **attrs): 134 | geom = Line(start, end) 135 | _add_attrs(geom, attrs) 136 | self.add_onetime(geom) 137 | return geom 138 | 139 | def get_array(self): 140 | self.window.flip() 141 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 142 | self.window.flip() 143 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 144 | arr = arr.reshape(self.height, self.width, 4) 145 | return arr[::-1,:,0:3] 146 | 147 | def _add_attrs(geom, attrs): 148 | if "color" in attrs: 149 | geom.set_color(*attrs["color"]) 150 | if "linewidth" in attrs: 151 | geom.set_linewidth(attrs["linewidth"]) 152 | 153 | class Geom(object): 154 | def __init__(self): 155 | self._color=Color((0, 0, 0, 1.0)) 156 | self.attrs = [self._color] 157 | def render(self): 158 | for attr in reversed(self.attrs): 159 | attr.enable() 160 | self.render1() 161 | for attr in self.attrs: 162 | attr.disable() 163 | def render1(self): 164 | raise NotImplementedError 165 | def add_attr(self, attr): 166 | self.attrs.append(attr) 167 | def set_color(self, r, g, b, alpha=1): 168 | self._color.vec4 = (r, g, b, alpha) 169 | 170 | class Attr(object): 171 | def enable(self): 172 | raise NotImplementedError 173 | def disable(self): 174 | pass 175 | 176 | class Transform(Attr): 177 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 178 | self.set_translation(*translation) 179 | self.set_rotation(rotation) 180 | self.set_scale(*scale) 181 | def enable(self): 182 | glPushMatrix() 183 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 184 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 185 | glScalef(self.scale[0], self.scale[1], 1) 186 | def disable(self): 187 | glPopMatrix() 188 | def set_translation(self, newx, newy): 189 | self.translation = (float(newx), float(newy)) 190 | def set_rotation(self, new): 191 | self.rotation = float(new) 192 | def set_scale(self, newx, newy): 193 | self.scale = (float(newx), float(newy)) 194 | 195 | class Color(Attr): 196 | def __init__(self, vec4): 197 | self.vec4 = vec4 198 | def enable(self): 199 | glColor4f(*self.vec4) 200 | 201 | class LineStyle(Attr): 202 | def __init__(self, style): 203 | self.style = style 204 | def enable(self): 205 | glEnable(GL_LINE_STIPPLE) 206 | glLineStipple(1, self.style) 207 | def disable(self): 208 | glDisable(GL_LINE_STIPPLE) 209 | 210 | class LineWidth(Attr): 211 | def __init__(self, stroke): 212 | self.stroke = stroke 213 | def enable(self): 214 | 
glLineWidth(self.stroke) 215 | 216 | class Point(Geom): 217 | def __init__(self): 218 | Geom.__init__(self) 219 | def render1(self): 220 | glBegin(GL_POINTS) # draw point 221 | glVertex3f(0.0, 0.0, 0.0) 222 | glEnd() 223 | 224 | class FilledPolygon(Geom): 225 | def __init__(self, v): 226 | Geom.__init__(self) 227 | self.v = v 228 | def render1(self): 229 | if len(self.v) == 4 : glBegin(GL_QUADS) 230 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 231 | else: glBegin(GL_TRIANGLES) 232 | for p in self.v: 233 | glVertex3f(p[0], p[1],0) # draw each vertex 234 | glEnd() 235 | 236 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 237 | glColor4f(*color) 238 | glBegin(GL_LINE_LOOP) 239 | for p in self.v: 240 | glVertex3f(p[0], p[1],0) # draw each vertex 241 | glEnd() 242 | 243 | def make_circle(radius=10, res=30, filled=True): 244 | points = [] 245 | for i in range(res): 246 | ang = 2*math.pi*i / res 247 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 248 | if filled: 249 | return FilledPolygon(points) 250 | else: 251 | return PolyLine(points, True) 252 | 253 | def make_polygon(v, filled=True): 254 | if filled: return FilledPolygon(v) 255 | else: return PolyLine(v, True) 256 | 257 | def make_polyline(v): 258 | return PolyLine(v, False) 259 | 260 | def make_capsule(length, width): 261 | l, r, t, b = 0, length, width/2, -width/2 262 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 263 | circ0 = make_circle(width/2) 264 | circ1 = make_circle(width/2) 265 | circ1.add_attr(Transform(translation=(length, 0))) 266 | geom = Compound([box, circ0, circ1]) 267 | return geom 268 | 269 | class Compound(Geom): 270 | def __init__(self, gs): 271 | Geom.__init__(self) 272 | self.gs = gs 273 | for g in self.gs: 274 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 275 | def render1(self): 276 | for g in self.gs: 277 | g.render() 278 | 279 | class PolyLine(Geom): 280 | def __init__(self, v, close): 281 | Geom.__init__(self) 282 | self.v = v 283 | self.close = close 284 | self.linewidth = LineWidth(1) 285 | self.add_attr(self.linewidth) 286 | def render1(self): 287 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 288 | for p in self.v: 289 | glVertex3f(p[0], p[1],0) # draw each vertex 290 | glEnd() 291 | def set_linewidth(self, x): 292 | self.linewidth.stroke = x 293 | 294 | class Line(Geom): 295 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 296 | Geom.__init__(self) 297 | self.start = start 298 | self.end = end 299 | self.linewidth = LineWidth(1) 300 | self.add_attr(self.linewidth) 301 | 302 | def render1(self): 303 | glBegin(GL_LINES) 304 | glVertex2f(*self.start) 305 | glVertex2f(*self.end) 306 | glEnd() 307 | 308 | class Image(Geom): 309 | def __init__(self, fname, width, height): 310 | Geom.__init__(self) 311 | self.width = width 312 | self.height = height 313 | img = pyglet.image.load(fname) 314 | self.img = img 315 | self.flip = False 316 | def render1(self): 317 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 318 | 319 | # ================================================================ 320 | 321 | class SimpleImageViewer(object): 322 | def __init__(self, display=None): 323 | self.window = None 324 | self.isopen = False 325 | self.display = display 326 | def imshow(self, arr): 327 | if self.window is None: 328 | height, width, channels = arr.shape 329 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 330 | self.width = width 
331 | self.height = height 332 | self.isopen = True 333 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 334 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 335 | self.window.clear() 336 | self.window.switch_to() 337 | self.window.dispatch_events() 338 | image.blit(0,0) 339 | self.window.flip() 340 | def close(self): 341 | if self.isopen: 342 | self.window.close() 343 | self.isopen = False 344 | def __del__(self): 345 | self.close() -------------------------------------------------------------------------------- /matd3/multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # add agents 9 | world.agents = [Agent() for i in range(1)] 10 | for i, agent in enumerate(world.agents): 11 | agent.name = 'agent %d' % i 12 | agent.collide = False 13 | agent.silent = True 14 | # add landmarks 15 | world.landmarks = [Landmark() for i in range(1)] 16 | for i, landmark in enumerate(world.landmarks): 17 | landmark.name = 'landmark %d' % i 18 | landmark.collide = False 19 | landmark.movable = False 20 | # make initial conditions 21 | self.reset_world(world) 22 | return world 23 | 24 | def reset_world(self, world): 25 | # random properties for agents 26 | for i, agent in enumerate(world.agents): 27 | agent.color = np.array([0.25,0.25,0.25]) 28 | # random properties for landmarks 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.color = np.array([0.75,0.75,0.75]) 31 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 32 | # set random initial states 33 | for agent in world.agents: 34 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 35 | agent.state.p_vel = np.zeros(world.dim_p) 36 | agent.state.c = np.zeros(world.dim_c) 37 | for i, landmark in enumerate(world.landmarks): 38 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 39 | landmark.state.p_vel = np.zeros(world.dim_p) 40 | 41 | def reward(self, agent, world): 42 | dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) 43 | return -dist2 44 | 45 | def observation(self, agent, world): 46 | # get positions of all entities in this agent's reference frame 47 | entity_pos = [] 48 | for entity in world.landmarks: 49 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 50 | return np.concatenate([agent.state.p_vel] + entity_pos) 51 | 
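The scenario files above and below are only definitions; they do nothing on their own. The following is a minimal usage sketch, assuming the same wiring that train.py presumably uses: a scenario module is loaded by file name, builds a World, and its bound methods become the callbacks of MultiAgentEnv. The choice of 'simple_spread.py' and the random one-hot action are purely illustrative.

import numpy as np
import multiagent.scenarios as scenarios
from multiagent.environment import MultiAgentEnv

# load a scenario module by file name and build its world
scenario = scenarios.load('simple_spread.py').Scenario()
world = scenario.make_world()
# the scenario's bound methods serve as the environment callbacks
env = MultiAgentEnv(world,
                    reset_callback=scenario.reset_world,
                    reward_callback=scenario.reward,
                    observation_callback=scenario.observation)
obs_n = env.reset()
# with the default discrete action space, each agent expects a 5-d
# one-hot-like vector (indices: no-op, +x, -x, +y, -y); sample one at random
act_n = [np.eye(space.n)[np.random.randint(space.n)] for space in env.action_space]
obs_n, rew_n, done_n, info_n = env.step(act_n)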
-------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_adversary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | 8 | def make_world(self): 9 | world = World() 10 | # set any world properties first 11 | world.dim_c = 2 12 | num_agents = 3 13 | world.num_agents = num_agents 14 | num_adversaries = 1 15 | num_landmarks = num_agents - 1 16 | # add agents 17 | world.agents = [Agent() for i in range(num_agents)] 18 | for i, agent in enumerate(world.agents): 19 | agent.name = 'agent %d' % i 20 | agent.collide = False 21 | agent.silent = True 22 | agent.adversary = True if i < num_adversaries else False 23 | agent.size = 0.15 24 | # add landmarks 25 | world.landmarks = [Landmark() for i in range(num_landmarks)] 26 | for i, landmark in enumerate(world.landmarks): 27 | landmark.name = 'landmark %d' % i 28 | landmark.collide = False 29 | landmark.movable = False 30 | landmark.size = 0.08 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 38 | for i in range(1, world.num_agents): 39 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 40 | # random properties for landmarks 41 | for i, landmark in enumerate(world.landmarks): 42 | landmark.color = np.array([0.15, 0.15, 0.15]) 43 | # set goal landmark 44 | goal = np.random.choice(world.landmarks) 45 | goal.color = np.array([0.15, 0.65, 0.15]) 46 | for agent in world.agents: 47 | agent.goal_a = goal 48 | # set random initial states 49 | for agent in world.agents: 50 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 51 | agent.state.p_vel = np.zeros(world.dim_p) 52 | agent.state.c = np.zeros(world.dim_c) 53 | for i, landmark in enumerate(world.landmarks): 54 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 55 | landmark.state.p_vel = np.zeros(world.dim_p) 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 61 | else: 62 | dists = [] 63 | for l in world.landmarks: 64 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 65 | dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 66 | return tuple(dists) 67 | 68 | # return all agents that are not adversaries 69 | def good_agents(self, world): 70 | return [agent for agent in world.agents if not agent.adversary] 71 | 72 | # return all adversarial agents 73 | def adversaries(self, world): 74 | return [agent for agent in world.agents if agent.adversary] 75 | 76 | def reward(self, agent, world): 77 | # Agents are rewarded based on minimum agent distance to each landmark 78 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 79 | 80 | def agent_reward(self, agent, world): 81 | # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it 82 | shaped_reward = True 83 | shaped_adv_reward = True 84 | 85 | # Calculate negative reward for adversary 86 | adversary_agents = self.adversaries(world) 87 | if shaped_adv_reward: # distance-based adversary reward 88 | adv_rew = 
sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) 89 | else: # proximity-based adversary reward (binary) 90 | adv_rew = 0 91 | for a in adversary_agents: 92 | if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: 93 | adv_rew -= 5 94 | 95 | # Calculate positive reward for agents 96 | good_agents = self.good_agents(world) 97 | if shaped_reward: # distance-based agent reward 98 | pos_rew = -min( 99 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 100 | else: # proximity-based agent reward (binary) 101 | pos_rew = 0 102 | if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ 103 | < 2 * agent.goal_a.size: 104 | pos_rew += 5 105 | pos_rew -= min( 106 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 107 | return pos_rew + adv_rew 108 | 109 | def adversary_reward(self, agent, world): 110 | # Rewarded based on proximity to the goal landmark 111 | shaped_reward = True 112 | if shaped_reward: # distance-based reward 113 | return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 114 | else: # proximity-based reward (binary) 115 | adv_rew = 0 116 | if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: 117 | adv_rew += 5 118 | return adv_rew 119 | 120 | 121 | def observation(self, agent, world): 122 | # get positions of all entities in this agent's reference frame 123 | entity_pos = [] 124 | for entity in world.landmarks: 125 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 126 | # entity colors 127 | entity_color = [] 128 | for entity in world.landmarks: 129 | entity_color.append(entity.color) 130 | # communication of all other agents 131 | other_pos = [] 132 | for other in world.agents: 133 | if other is agent: continue 134 | other_pos.append(other.state.p_pos - agent.state.p_pos) 135 | 136 | if not agent.adversary: 137 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) 138 | else: 139 | return np.concatenate(entity_pos + other_pos) 140 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_crypto.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scenario: 3 | 1 speaker, 2 listeners (one of which is an adversary). Good agents rewarded for proximity to goal, and distance from 4 | adversary to goal. Adversary is rewarded for its distance to the goal. 
5 | """ 6 | 7 | 8 | import numpy as np 9 | from multiagent.core import World, Agent, Landmark 10 | from multiagent.scenario import BaseScenario 11 | import random 12 | 13 | 14 | class CryptoAgent(Agent): 15 | def __init__(self): 16 | super(CryptoAgent, self).__init__() 17 | self.key = None 18 | 19 | class Scenario(BaseScenario): 20 | 21 | def make_world(self): 22 | world = World() 23 | # set any world properties first 24 | num_agents = 3 25 | num_adversaries = 1 26 | num_landmarks = 2 27 | world.dim_c = 4 28 | # add agents 29 | world.agents = [CryptoAgent() for i in range(num_agents)] 30 | for i, agent in enumerate(world.agents): 31 | agent.name = 'agent %d' % i 32 | agent.collide = False 33 | agent.adversary = True if i < num_adversaries else False 34 | agent.speaker = True if i == 2 else False 35 | agent.movable = False 36 | # add landmarks 37 | world.landmarks = [Landmark() for i in range(num_landmarks)] 38 | for i, landmark in enumerate(world.landmarks): 39 | landmark.name = 'landmark %d' % i 40 | landmark.collide = False 41 | landmark.movable = False 42 | # make initial conditions 43 | self.reset_world(world) 44 | return world 45 | 46 | 47 | def reset_world(self, world): 48 | # random properties for agents 49 | for i, agent in enumerate(world.agents): 50 | agent.color = np.array([0.25, 0.25, 0.25]) 51 | if agent.adversary: 52 | agent.color = np.array([0.75, 0.25, 0.25]) 53 | agent.key = None 54 | # random properties for landmarks 55 | color_list = [np.zeros(world.dim_c) for i in world.landmarks] 56 | for i, color in enumerate(color_list): 57 | color[i] += 1 58 | for color, landmark in zip(color_list, world.landmarks): 59 | landmark.color = color 60 | # set goal landmark 61 | goal = np.random.choice(world.landmarks) 62 | world.agents[1].color = goal.color 63 | world.agents[2].key = np.random.choice(world.landmarks).color 64 | 65 | for agent in world.agents: 66 | agent.goal_a = goal 67 | 68 | # set random initial states 69 | for agent in world.agents: 70 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 71 | agent.state.p_vel = np.zeros(world.dim_p) 72 | agent.state.c = np.zeros(world.dim_c) 73 | for i, landmark in enumerate(world.landmarks): 74 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 75 | landmark.state.p_vel = np.zeros(world.dim_p) 76 | 77 | 78 | def benchmark_data(self, agent, world): 79 | # returns data for benchmarking purposes 80 | return (agent.state.c, agent.goal_a.color) 81 | 82 | # return all agents that are not adversaries 83 | def good_listeners(self, world): 84 | return [agent for agent in world.agents if not agent.adversary and not agent.speaker] 85 | 86 | # return all agents that are not adversaries 87 | def good_agents(self, world): 88 | return [agent for agent in world.agents if not agent.adversary] 89 | 90 | # return all adversarial agents 91 | def adversaries(self, world): 92 | return [agent for agent in world.agents if agent.adversary] 93 | 94 | def reward(self, agent, world): 95 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 96 | 97 | def agent_reward(self, agent, world): 98 | # Agents rewarded if Bob can reconstruct message, but adversary (Eve) cannot 99 | good_listeners = self.good_listeners(world) 100 | adversaries = self.adversaries(world) 101 | good_rew = 0 102 | adv_rew = 0 103 | for a in good_listeners: 104 | if (a.state.c == np.zeros(world.dim_c)).all(): 105 | continue 106 | else: 107 | good_rew -= np.sum(np.square(a.state.c - agent.goal_a.color)) 108 | for a in 
adversaries: 109 | if (a.state.c == np.zeros(world.dim_c)).all(): 110 | continue 111 | else: 112 | adv_l1 = np.sum(np.square(a.state.c - agent.goal_a.color)) 113 | adv_rew += adv_l1 114 | return adv_rew + good_rew 115 | 116 | def adversary_reward(self, agent, world): 117 | # Adversary (Eve) is rewarded if it can reconstruct original goal 118 | rew = 0 119 | if not (agent.state.c == np.zeros(world.dim_c)).all(): 120 | rew -= np.sum(np.square(agent.state.c - agent.goal_a.color)) 121 | return rew 122 | 123 | 124 | def observation(self, agent, world): 125 | # goal color 126 | goal_color = np.zeros(world.dim_color) 127 | if agent.goal_a is not None: 128 | goal_color = agent.goal_a.color 129 | 130 | # get positions of all entities in this agent's reference frame 131 | entity_pos = [] 132 | for entity in world.landmarks: 133 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 134 | # communication of all other agents 135 | comm = [] 136 | for other in world.agents: 137 | if other is agent or (other.state.c is None) or not other.speaker: continue 138 | comm.append(other.state.c) 139 | 140 | confer = np.array([0]) 141 | 142 | if world.agents[2].key is None: 143 | confer = np.array([1]) 144 | key = np.zeros(world.dim_c) 145 | goal_color = np.zeros(world.dim_c) 146 | else: 147 | key = world.agents[2].key 148 | 149 | prnt = False 150 | # speaker 151 | if agent.speaker: 152 | if prnt: 153 | print('speaker') 154 | print(agent.state.c) 155 | print(np.concatenate([goal_color] + [key] + [confer] + [np.random.randn(1)])) 156 | return np.concatenate([goal_color] + [key]) 157 | # listener 158 | if not agent.speaker and not agent.adversary: 159 | if prnt: 160 | print('listener') 161 | print(agent.state.c) 162 | print(np.concatenate([key] + comm + [confer])) 163 | return np.concatenate([key] + comm) 164 | if not agent.speaker and agent.adversary: 165 | if prnt: 166 | print('adversary') 167 | print(agent.state.c) 168 | print(np.concatenate(comm + [confer])) 169 | return np.concatenate(comm) 170 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_push.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 2 10 | num_agents = 2 11 | num_adversaries = 1 12 | num_landmarks = 2 13 | # add agents 14 | world.agents = [Agent() for i in range(num_agents)] 15 | for i, agent in enumerate(world.agents): 16 | agent.name = 'agent %d' % i 17 | agent.collide = True 18 | agent.silent = True 19 | if i < num_adversaries: 20 | agent.adversary = True 21 | else: 22 | agent.adversary = False 23 | # add landmarks 24 | world.landmarks = [Landmark() for i in range(num_landmarks)] 25 | for i, landmark in enumerate(world.landmarks): 26 | landmark.name = 'landmark %d' % i 27 | landmark.collide = False 28 | landmark.movable = False 29 | # make initial conditions 30 | self.reset_world(world) 31 | return world 32 | 33 | def reset_world(self, world): 34 | # random properties for landmarks 35 | for i, landmark in enumerate(world.landmarks): 36 | landmark.color = np.array([0.1, 0.1, 0.1]) 37 | landmark.color[i + 1] += 0.8 38 | landmark.index = i 39 | # set goal landmark 40 | goal = np.random.choice(world.landmarks) 41 | for i, agent in enumerate(world.agents): 42 | agent.goal_a = 
goal 43 | agent.color = np.array([0.25, 0.25, 0.25]) 44 | if agent.adversary: 45 | agent.color = np.array([0.75, 0.25, 0.25]) 46 | else: 47 | j = goal.index 48 | agent.color[j + 1] += 0.5 49 | # set random initial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | for i, landmark in enumerate(world.landmarks): 55 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 56 | landmark.state.p_vel = np.zeros(world.dim_p) 57 | 58 | def reward(self, agent, world): 59 | # Agents are rewarded based on minimum agent distance to each landmark 60 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 61 | 62 | def agent_reward(self, agent, world): 63 | # the distance to the goal 64 | return -np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 65 | 66 | def adversary_reward(self, agent, world): 67 | # keep the nearest good agents away from the goal 68 | agent_dist = [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in world.agents if not a.adversary] 69 | pos_rew = min(agent_dist) 70 | #nearest_agent = world.good_agents[np.argmin(agent_dist)] 71 | #neg_rew = np.sqrt(np.sum(np.square(nearest_agent.state.p_pos - agent.state.p_pos))) 72 | neg_rew = np.sqrt(np.sum(np.square(agent.goal_a.state.p_pos - agent.state.p_pos))) 73 | #neg_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in world.good_agents]) 74 | return pos_rew - neg_rew 75 | 76 | def observation(self, agent, world): 77 | # get positions of all entities in this agent's reference frame 78 | entity_pos = [] 79 | for entity in world.landmarks: # world.entities: 80 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 81 | # entity colors 82 | entity_color = [] 83 | for entity in world.landmarks: # world.entities: 84 | entity_color.append(entity.color) 85 | # communication of all other agents 86 | comm = [] 87 | other_pos = [] 88 | for other in world.agents: 89 | if other is agent: continue 90 | comm.append(other.state.c) 91 | other_pos.append(other.state.p_pos - agent.state.p_pos) 92 | if not agent.adversary: 93 | return np.concatenate([agent.state.p_vel] + [agent.goal_a.state.p_pos - agent.state.p_pos] + [agent.color] + entity_pos + entity_color + other_pos) 94 | else: 95 | #other_pos = list(reversed(other_pos)) if random.uniform(0,1) > 0.5 else other_pos # randomize position of other agents in adversary network 96 | return np.concatenate([agent.state.p_vel] + entity_pos + other_pos) 97 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_reference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 10 10 | world.collaborative = True # whether agents share rewards 11 | # add agents 12 | world.agents = [Agent() for i in range(2)] 13 | for i, agent in enumerate(world.agents): 14 | agent.name = 'agent %d' % i 15 | agent.collide = False 16 | # add landmarks 17 | world.landmarks = [Landmark() for i in range(3)] 18 | for i, landmark in enumerate(world.landmarks): 19 | landmark.name = 'landmark %d' % i 20 | 
landmark.collide = False 21 | landmark.movable = False 22 | # make initial conditions 23 | self.reset_world(world) 24 | return world 25 | 26 | def reset_world(self, world): 27 | # assign goals to agents 28 | for agent in world.agents: 29 | agent.goal_a = None 30 | agent.goal_b = None 31 | # want other agent to go to the goal landmark 32 | world.agents[0].goal_a = world.agents[1] 33 | world.agents[0].goal_b = np.random.choice(world.landmarks) 34 | world.agents[1].goal_a = world.agents[0] 35 | world.agents[1].goal_b = np.random.choice(world.landmarks) 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.25,0.25,0.25]) 39 | # random properties for landmarks 40 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 41 | world.landmarks[1].color = np.array([0.25,0.75,0.25]) 42 | world.landmarks[2].color = np.array([0.25,0.25,0.75]) 43 | # special colors for goals 44 | world.agents[0].goal_a.color = world.agents[0].goal_b.color 45 | world.agents[1].goal_a.color = world.agents[1].goal_b.color 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 | landmark.state.p_vel = np.zeros(world.dim_p) 54 | 55 | def reward(self, agent, world): 56 | if agent.goal_a is None or agent.goal_b is None: 57 | return 0.0 58 | dist2 = np.sum(np.square(agent.goal_a.state.p_pos - agent.goal_b.state.p_pos)) 59 | return -dist2 60 | 61 | def observation(self, agent, world): 62 | # goal color 63 | goal_color = [np.zeros(world.dim_color), np.zeros(world.dim_color)] 64 | if agent.goal_b is not None: 65 | goal_color[1] = agent.goal_b.color 66 | 67 | # get positions of all entities in this agent's reference frame 68 | entity_pos = [] 69 | for entity in world.landmarks: 70 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 71 | # entity colors 72 | entity_color = [] 73 | for entity in world.landmarks: 74 | entity_color.append(entity.color) 75 | # communication of all other agents 76 | comm = [] 77 | for other in world.agents: 78 | if other is agent: continue 79 | comm.append(other.state.c) 80 | return np.concatenate([agent.state.p_vel] + entity_pos + [goal_color[1]] + comm) 81 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_speaker_listener.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 3 10 | num_landmarks = 3 11 | world.collaborative = True 12 | # add agents 13 | world.agents = [Agent() for i in range(2)] 14 | for i, agent in enumerate(world.agents): 15 | agent.name = 'agent %d' % i 16 | agent.collide = False 17 | agent.size = 0.075 18 | # speaker 19 | world.agents[0].movable = False 20 | # listener 21 | world.agents[1].silent = True 22 | # add landmarks 23 | world.landmarks = [Landmark() for i in range(num_landmarks)] 24 | for i, landmark in enumerate(world.landmarks): 25 | landmark.name = 'landmark %d' % i 26 | landmark.collide = False 27 | landmark.movable = False 28 | landmark.size = 0.04 29 | # make 
initial conditions 30 | self.reset_world(world) 31 | return world 32 | 33 | def reset_world(self, world): 34 | # assign goals to agents 35 | for agent in world.agents: 36 | agent.goal_a = None 37 | agent.goal_b = None 38 | # want listener to go to the goal landmark 39 | world.agents[0].goal_a = world.agents[1] 40 | world.agents[0].goal_b = np.random.choice(world.landmarks) 41 | # random properties for agents 42 | for i, agent in enumerate(world.agents): 43 | agent.color = np.array([0.25,0.25,0.25]) 44 | # random properties for landmarks 45 | world.landmarks[0].color = np.array([0.65,0.15,0.15]) 46 | world.landmarks[1].color = np.array([0.15,0.65,0.15]) 47 | world.landmarks[2].color = np.array([0.15,0.15,0.65]) 48 | # special colors for goals 49 | world.agents[0].goal_a.color = world.agents[0].goal_b.color + np.array([0.45, 0.45, 0.45]) 50 | # set random initial states 51 | for agent in world.agents: 52 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 | agent.state.p_vel = np.zeros(world.dim_p) 54 | agent.state.c = np.zeros(world.dim_c) 55 | for i, landmark in enumerate(world.landmarks): 56 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 57 | landmark.state.p_vel = np.zeros(world.dim_p) 58 | 59 | def benchmark_data(self, agent, world): 60 | # returns data for benchmarking purposes 61 | a = world.agents[0] 62 | distance = np.sqrt(np.square(a.goal_a.state.p_pos)) 63 | return self.reward(agent, world) 64 | 65 | def reward(self, agent, world): 66 | # squared distance from listener to landmark 67 | a = world.agents[0] 68 | dist2 = np.sum(np.square(a.goal_a.state.p_pos - a.goal_b.state.p_pos)) 69 | return -dist2 70 | 71 | def observation(self, agent, world): 72 | # goal color 73 | goal_color = np.zeros(world.dim_color) 74 | if agent.goal_b is not None: 75 | goal_color = agent.goal_b.color 76 | 77 | # get positions of all entities in this agent's reference frame 78 | entity_pos = [] 79 | for entity in world.landmarks: 80 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 81 | 82 | # communication of all other agents 83 | comm = [] 84 | for other in world.agents: 85 | if other is agent or (other.state.c is None): continue 86 | comm.append(other.state.c) 87 | 88 | # speaker 89 | if not agent.movable: 90 | return np.concatenate([goal_color]) 91 | # listener 92 | if agent.silent: 93 | return np.concatenate([agent.state.p_vel] + entity_pos + comm) 94 | 95 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 3 12 | num_landmarks = 3 13 | world.collaborative = True 14 | # add agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = True 19 | agent.silent = True 20 | agent.size = 0.15 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | # make initial conditions 28 | self.reset_world(world) 29 | return world 30 | 31 | def 
reset_world(self, world): 32 | # random properties for agents 33 | for i, agent in enumerate(world.agents): 34 | agent.color = np.array([0.35, 0.35, 0.85]) 35 | # random properties for landmarks 36 | for i, landmark in enumerate(world.landmarks): 37 | landmark.color = np.array([0.25, 0.25, 0.25]) 38 | # set random initial states 39 | for agent in world.agents: 40 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 41 | agent.state.p_vel = np.zeros(world.dim_p) 42 | agent.state.c = np.zeros(world.dim_c) 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 45 | landmark.state.p_vel = np.zeros(world.dim_p) 46 | 47 | def benchmark_data(self, agent, world): 48 | rew = 0 49 | collisions = 0 50 | occupied_landmarks = 0 51 | dists = [np.sqrt(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) for l in world.landmarks] 52 | min_dist = min(dists) 53 | rew -= min(dists) 54 | if min(dists) < 0.1: 55 | occupied_landmarks += 1 56 | if agent.collide: 57 | for a in world.agents: 58 | if self.is_collision(a, agent): 59 | rew -= 1 60 | collisions += 1 61 | return (rew, collisions, min_dist, occupied_landmarks) 62 | 63 | 64 | def is_collision(self, agent1, agent2): 65 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 66 | dist = np.sqrt(np.sum(np.square(delta_pos))) 67 | dist_min = agent1.size + agent2.size 68 | return True if dist < dist_min else False 69 | 70 | def reward(self, agent, world): 71 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 72 | rew = 0 73 | for l in world.landmarks: 74 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 75 | rew -= min(dists) 76 | if agent.collide: 77 | for a in world.agents: 78 | if self.is_collision(a, agent): 79 | rew -= 1 80 | return rew 81 | 82 | def observation(self, agent, world): 83 | # get positions of all entities in this agent's reference frame 84 | entity_pos = [] 85 | for entity in world.landmarks: # world.entities: 86 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 87 | # entity colors 88 | entity_color = [] 89 | for entity in world.landmarks: # world.entities: 90 | entity_color.append(entity.color) 91 | # communication of all other agents 92 | comm = [] 93 | other_pos = [] 94 | for other in world.agents: 95 | if other is agent: continue 96 | comm.append(other.state.c) 97 | other_pos.append(other.state.p_pos - agent.state.p_pos) 98 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 99 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_spread_two_ag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 2 12 | num_landmarks = 2 13 | world.collaborative = True 14 | # add agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = True 19 | agent.silent = True 20 | agent.size = 0.15 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark 
%d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | # make initial conditions 28 | self.reset_world(world) 29 | return world 30 | 31 | def reset_world(self, world): 32 | # random properties for agents 33 | for i, agent in enumerate(world.agents): 34 | agent.color = np.array([0.35, 0.35, 0.85]) 35 | # random properties for landmarks 36 | for i, landmark in enumerate(world.landmarks): 37 | landmark.color = np.array([0.25, 0.25, 0.25]) 38 | # set random initial states 39 | for agent in world.agents: 40 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 41 | agent.state.p_vel = np.zeros(world.dim_p) 42 | agent.state.c = np.zeros(world.dim_c) 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 45 | landmark.state.p_vel = np.zeros(world.dim_p) 46 | 47 | def benchmark_data(self, agent, world): 48 | rew = 0 49 | collisions = 0 50 | occupied_landmarks = 0 51 | min_dists = 0 52 | for l in world.landmarks: 53 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 54 | min_dists += min(dists) 55 | rew -= min(dists) 56 | if min(dists) < 0.1: 57 | occupied_landmarks += 1 58 | if agent.collide: 59 | for a in world.agents: 60 | if self.is_collision(a, agent): 61 | rew -= 1 62 | collisions += 1 63 | return (rew, collisions, min_dists, occupied_landmarks) 64 | 65 | 66 | def is_collision(self, agent1, agent2): 67 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 68 | dist = np.sqrt(np.sum(np.square(delta_pos))) 69 | dist_min = agent1.size + agent2.size 70 | return True if dist < dist_min else False 71 | 72 | def reward(self, agent, world): 73 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 74 | rew = 0 75 | for l in world.landmarks: 76 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 77 | rew -= min(dists) 78 | if agent.collide: 79 | for a in world.agents: 80 | if self.is_collision(a, agent): 81 | rew -= 1 82 | return rew 83 | 84 | def observation(self, agent, world): 85 | # get positions of all entities in this agent's reference frame 86 | entity_pos = [] 87 | for entity in world.landmarks: # world.entities: 88 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 89 | # entity colors 90 | entity_color = [] 91 | for entity in world.landmarks: # world.entities: 92 | entity_color.append(entity.color) 93 | # communication of all other agents 94 | comm = [] 95 | other_pos = [] 96 | for other in world.agents: 97 | if other is agent: continue 98 | comm.append(other.state.c) 99 | other_pos.append(other.state.p_pos - agent.state.p_pos) 100 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 101 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents = 1 12 | num_adversaries = 3 13 | num_agents = num_adversaries + num_good_agents 14 | num_landmarks = 2 15 | # add agents 16 | world.agents = [Agent() for i in range(num_agents)] 17 | for i, agent in enumerate(world.agents): 18 | agent.name = 'agent %d' 
% i 19 | agent.collide = True 20 | agent.silent = True 21 | agent.adversary = True if i < num_adversaries else False 22 | agent.size = 0.075 if agent.adversary else 0.05 23 | agent.accel = 3.0 if agent.adversary else 4.0 24 | #agent.accel = 20.0 if agent.adversary else 25.0 25 | agent.max_speed = 1.0 if agent.adversary else 1.3 26 | # add landmarks 27 | world.landmarks = [Landmark() for i in range(num_landmarks)] 28 | for i, landmark in enumerate(world.landmarks): 29 | landmark.name = 'landmark %d' % i 30 | landmark.collide = True 31 | landmark.movable = False 32 | landmark.size = 0.2 33 | landmark.boundary = False 34 | # make initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | 39 | def reset_world(self, world): 40 | # random properties for agents 41 | for i, agent in enumerate(world.agents): 42 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 43 | # random properties for landmarks 44 | for i, landmark in enumerate(world.landmarks): 45 | landmark.color = np.array([0.25, 0.25, 0.25]) 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | if not landmark.boundary: 53 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 54 | landmark.state.p_vel = np.zeros(world.dim_p) 55 | 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | collisions = 0 61 | for a in self.good_agents(world): 62 | if self.is_collision(a, agent): 63 | collisions += 1 64 | return collisions 65 | else: 66 | return 0 67 | 68 | 69 | def is_collision(self, agent1, agent2): 70 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 71 | dist = np.sqrt(np.sum(np.square(delta_pos))) 72 | dist_min = agent1.size + agent2.size 73 | return True if dist < dist_min else False 74 | 75 | # return all agents that are not adversaries 76 | def good_agents(self, world): 77 | return [agent for agent in world.agents if not agent.adversary] 78 | 79 | # return all adversarial agents 80 | def adversaries(self, world): 81 | return [agent for agent in world.agents if agent.adversary] 82 | 83 | 84 | def reward(self, agent, world): 85 | # Agents are rewarded based on minimum agent distance to each landmark 86 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 87 | return main_reward 88 | 89 | def agent_reward(self, agent, world): 90 | # Agents are negatively rewarded if caught by adversaries 91 | rew = 0 92 | shape = False 93 | adversaries = self.adversaries(world) 94 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary) 95 | for adv in adversaries: 96 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 97 | if agent.collide: 98 | for a in adversaries: 99 | if self.is_collision(a, agent): 100 | rew -= 10 101 | 102 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 103 | def bound(x): 104 | if x < 0.9: 105 | return 0 106 | if x < 1.0: 107 | return (x - 0.9) * 10 108 | return min(np.exp(2 * x - 2), 10) 109 | for p in range(world.dim_p): 110 | x = abs(agent.state.p_pos[p]) 111 | rew -= bound(x) 112 | 113 | return rew 114 | 115 | def adversary_reward(self, agent, world): 116 | # Adversaries are rewarded 
for collisions with agents 117 | rew = 0 118 | shape = False 119 | agents = self.good_agents(world) 120 | adversaries = self.adversaries(world) 121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 122 | for adv in adversaries: 123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 124 | if agent.collide: 125 | for ag in agents: 126 | for adv in adversaries: 127 | if self.is_collision(ag, adv): 128 | rew += 10 129 | return rew 130 | 131 | def observation(self, agent, world): 132 | # get positions of all entities in this agent's reference frame 133 | entity_pos = [] 134 | for entity in world.landmarks: 135 | if not entity.boundary: 136 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 137 | # communication of all other agents 138 | comm = [] 139 | other_pos = [] 140 | other_vel = [] 141 | for other in world.agents: 142 | if other is agent: continue 143 | comm.append(other.state.c) 144 | other_pos.append(other.state.p_pos - agent.state.p_pos) 145 | if not other.adversary: 146 | other_vel.append(other.state.p_vel) 147 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 148 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_world_comm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 4 11 | #world.damping = 1 12 | num_good_agents = 2 13 | num_adversaries = 4 14 | num_agents = num_adversaries + num_good_agents 15 | num_landmarks = 1 16 | num_food = 2 17 | num_forests = 2 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.leader = True if i == 0 else False 24 | agent.silent = True if i > 0 else False 25 | agent.adversary = True if i < num_adversaries else False 26 | agent.size = 0.075 if agent.adversary else 0.045 27 | agent.accel = 3.0 if agent.adversary else 4.0 28 | #agent.accel = 20.0 if agent.adversary else 25.0 29 | agent.max_speed = 1.0 if agent.adversary else 1.3 30 | # add landmarks 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmark %d' % i 34 | landmark.collide = True 35 | landmark.movable = False 36 | landmark.size = 0.2 37 | landmark.boundary = False 38 | world.food = [Landmark() for i in range(num_food)] 39 | for i, landmark in enumerate(world.food): 40 | landmark.name = 'food %d' % i 41 | landmark.collide = False 42 | landmark.movable = False 43 | landmark.size = 0.03 44 | landmark.boundary = False 45 | world.forests = [Landmark() for i in range(num_forests)] 46 | for i, landmark in enumerate(world.forests): 47 | landmark.name = 'forest %d' % i 48 | landmark.collide = False 49 | landmark.movable = False 50 | landmark.size = 0.3 51 | landmark.boundary = False 52 | world.landmarks += world.food 53 | world.landmarks += world.forests 54 | #world.landmarks += self.set_boundaries(world) # world boundaries now penalized with negative reward 55 | # make initial conditions 56 | self.reset_world(world) 57 | return world 58 | 59 | def 
set_boundaries(self, world): 60 | boundary_list = [] 61 | landmark_size = 1 62 | edge = 1 + landmark_size 63 | num_landmarks = int(edge * 2 / landmark_size) 64 | for x_pos in [-edge, edge]: 65 | for i in range(num_landmarks): 66 | l = Landmark() 67 | l.state.p_pos = np.array([x_pos, -1 + i * landmark_size]) 68 | boundary_list.append(l) 69 | 70 | for y_pos in [-edge, edge]: 71 | for i in range(num_landmarks): 72 | l = Landmark() 73 | l.state.p_pos = np.array([-1 + i * landmark_size, y_pos]) 74 | boundary_list.append(l) 75 | 76 | for i, l in enumerate(boundary_list): 77 | l.name = 'boundary %d' % i 78 | l.collide = True 79 | l.movable = False 80 | l.boundary = True 81 | l.color = np.array([0.75, 0.75, 0.75]) 82 | l.size = landmark_size 83 | l.state.p_vel = np.zeros(world.dim_p) 84 | 85 | return boundary_list 86 | 87 | 88 | def reset_world(self, world): 89 | # random properties for agents 90 | for i, agent in enumerate(world.agents): 91 | agent.color = np.array([0.45, 0.95, 0.45]) if not agent.adversary else np.array([0.95, 0.45, 0.45]) 92 | agent.color -= np.array([0.3, 0.3, 0.3]) if agent.leader else np.array([0, 0, 0]) 93 | # random properties for landmarks 94 | for i, landmark in enumerate(world.landmarks): 95 | landmark.color = np.array([0.25, 0.25, 0.25]) 96 | for i, landmark in enumerate(world.food): 97 | landmark.color = np.array([0.15, 0.15, 0.65]) 98 | for i, landmark in enumerate(world.forests): 99 | landmark.color = np.array([0.6, 0.9, 0.6]) 100 | # set random initial states 101 | for agent in world.agents: 102 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 103 | agent.state.p_vel = np.zeros(world.dim_p) 104 | agent.state.c = np.zeros(world.dim_c) 105 | for i, landmark in enumerate(world.landmarks): 106 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 107 | landmark.state.p_vel = np.zeros(world.dim_p) 108 | for i, landmark in enumerate(world.food): 109 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 110 | landmark.state.p_vel = np.zeros(world.dim_p) 111 | for i, landmark in enumerate(world.forests): 112 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 113 | landmark.state.p_vel = np.zeros(world.dim_p) 114 | 115 | def benchmark_data(self, agent, world): 116 | if agent.adversary: 117 | collisions = 0 118 | for a in self.good_agents(world): 119 | if self.is_collision(a, agent): 120 | collisions += 1 121 | return collisions 122 | else: 123 | return 0 124 | 125 | 126 | def is_collision(self, agent1, agent2): 127 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 128 | dist = np.sqrt(np.sum(np.square(delta_pos))) 129 | dist_min = agent1.size + agent2.size 130 | return True if dist < dist_min else False 131 | 132 | 133 | # return all agents that are not adversaries 134 | def good_agents(self, world): 135 | return [agent for agent in world.agents if not agent.adversary] 136 | 137 | # return all adversarial agents 138 | def adversaries(self, world): 139 | return [agent for agent in world.agents if agent.adversary] 140 | 141 | 142 | def reward(self, agent, world): 143 | # Agents are rewarded based on minimum agent distance to each landmark 144 | #boundary_reward = -10 if self.outside_boundary(agent) else 0 145 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 146 | return main_reward 147 | 148 | def outside_boundary(self, agent): 149 | if agent.state.p_pos[0] > 1 or agent.state.p_pos[0] < -1 or agent.state.p_pos[1] > 1 or agent.state.p_pos[1] < -1: 150 | 
return True 151 | else: 152 | return False 153 | 154 | 155 | def agent_reward(self, agent, world): 156 | # Agents are rewarded based on minimum agent distance to each landmark 157 | rew = 0 158 | shape = False 159 | adversaries = self.adversaries(world) 160 | if shape: 161 | for adv in adversaries: 162 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 163 | if agent.collide: 164 | for a in adversaries: 165 | if self.is_collision(a, agent): 166 | rew -= 5 167 | def bound(x): 168 | if x < 0.9: 169 | return 0 170 | if x < 1.0: 171 | return (x - 0.9) * 10 172 | return min(np.exp(2 * x - 2), 10) # 1 + (x - 1) * (x - 1) 173 | 174 | for p in range(world.dim_p): 175 | x = abs(agent.state.p_pos[p]) 176 | rew -= 2 * bound(x) 177 | 178 | for food in world.food: 179 | if self.is_collision(agent, food): 180 | rew += 2 181 | rew += 0.05 * min([np.sqrt(np.sum(np.square(food.state.p_pos - agent.state.p_pos))) for food in world.food]) 182 | 183 | return rew 184 | 185 | def adversary_reward(self, agent, world): 186 | # Agents are rewarded based on minimum agent distance to each landmark 187 | rew = 0 188 | shape = True 189 | agents = self.good_agents(world) 190 | adversaries = self.adversaries(world) 191 | if shape: 192 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 193 | if agent.collide: 194 | for ag in agents: 195 | for adv in adversaries: 196 | if self.is_collision(ag, adv): 197 | rew += 5 198 | return rew 199 | 200 | 201 | def observation2(self, agent, world): 202 | # get positions of all entities in this agent's reference frame 203 | entity_pos = [] 204 | for entity in world.landmarks: 205 | if not entity.boundary: 206 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 207 | 208 | food_pos = [] 209 | for entity in world.food: 210 | if not entity.boundary: 211 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 212 | # communication of all other agents 213 | comm = [] 214 | other_pos = [] 215 | other_vel = [] 216 | for other in world.agents: 217 | if other is agent: continue 218 | comm.append(other.state.c) 219 | other_pos.append(other.state.p_pos - agent.state.p_pos) 220 | if not other.adversary: 221 | other_vel.append(other.state.p_vel) 222 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 223 | 224 | def observation(self, agent, world): 225 | # get positions of all entities in this agent's reference frame 226 | entity_pos = [] 227 | for entity in world.landmarks: 228 | if not entity.boundary: 229 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 230 | 231 | in_forest = [np.array([-1]), np.array([-1])] 232 | inf1 = False 233 | inf2 = False 234 | if self.is_collision(agent, world.forests[0]): 235 | in_forest[0] = np.array([1]) 236 | inf1= True 237 | if self.is_collision(agent, world.forests[1]): 238 | in_forest[1] = np.array([1]) 239 | inf2 = True 240 | 241 | food_pos = [] 242 | for entity in world.food: 243 | if not entity.boundary: 244 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 245 | # communication of all other agents 246 | comm = [] 247 | other_pos = [] 248 | other_vel = [] 249 | for other in world.agents: 250 | if other is agent: continue 251 | comm.append(other.state.c) 252 | oth_f1 = self.is_collision(other, world.forests[0]) 253 | oth_f2 = self.is_collision(other, world.forests[1]) 254 | if (inf1 and oth_f1) or (inf2 and oth_f2) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #without forest vis 
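                # Visibility masking: the other agent's true relative position (and, for
                # non-adversaries, its velocity) is only observed when both agents share
                # forest status (same forest, or both outside every forest) or when the
                # observing agent is the leader; otherwise zeros are appended in the
                # else branch below.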
255 | other_pos.append(other.state.p_pos - agent.state.p_pos) 256 | if not other.adversary: 257 | other_vel.append(other.state.p_vel) 258 | else: 259 | other_pos.append([0, 0]) 260 | if not other.adversary: 261 | other_vel.append([0, 0]) 262 | 263 | # to tell the pred when the prey are in the forest 264 | prey_forest = [] 265 | ga = self.good_agents(world) 266 | for a in ga: 267 | if any([self.is_collision(a, f) for f in world.forests]): 268 | prey_forest.append(np.array([1])) 269 | else: 270 | prey_forest.append(np.array([-1])) 271 | # to tell leader when pred are in forest 272 | prey_forest_lead = [] 273 | for f in world.forests: 274 | if any([self.is_collision(a, f) for a in ga]): 275 | prey_forest_lead.append(np.array([1])) 276 | else: 277 | prey_forest_lead.append(np.array([-1])) 278 | 279 | comm = [world.agents[0].state.c] 280 | 281 | if agent.adversary and not agent.leader: 282 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 283 | if agent.leader: 284 | return np.concatenate( 285 | [agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 286 | else: 287 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel) 288 | 289 | 290 | -------------------------------------------------------------------------------- /matd3/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | import time 4 | from copy import deepcopy 5 | 6 | import gym 7 | import numpy as np 8 | import tensorflow as tf 9 | import tensorflow.contrib.layers as layers 10 | 11 | import common.tf_util as U 12 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 13 | from matd3.trainer.matd3 import MATD3AgentTrainer 14 | from multiagent.environment import MultiAgentEnv 15 | 16 | logger = None 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 20 | # Environment 21 | parser.add_argument("--scenario", type=str, default="simple", help="name of the scenario script") 22 | parser.add_argument("--max-episode-len", type=int, default=25, help="maximum episode length") 23 | parser.add_argument("--num-episodes", type=int, default=60000, help="number of episodes") 24 | parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries") 25 | parser.add_argument("--good-policy", type=str, default="matd3", help="policy for good agents (matd3 or maddpg)") 26 | parser.add_argument("--adv-policy", type=str, default="matd3", help="policy of adversaries (matd3 or maddpg)") 27 | 28 | # Core training parameters 29 | parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer") 30 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 31 | parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time") 32 | parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp") 33 | parser.add_argument("--update-rate", type=int, default=100, help="after this many steps the critic is trained") 34 | parser.add_argument("--policy-update-rate", type=int, default=2, 35 | help="after this many critic updates the target networks and policy are trained") 36 | parser.add_argument("--use-critic-noise", action="store_true", default=False, help="use noise in critic update next 
action") 37 | parser.add_argument("--use-critic-noise-self", action="store_true", default=False, help="use noise in critic update next action") 38 | parser.add_argument("--critic-action-noise-stddev", type=float, default=0.2) 39 | parser.add_argument("--action-noise-clip", type=float, default=0.5) 40 | parser.add_argument("--critic-zero-if-done", action="store_true", default=False, help="set q value to zero in critic update after done") 41 | 42 | # Checkpointing 43 | parser.add_argument("--exp-name", type=str, default='def_exp_name', help="name of the experiment") 44 | parser.add_argument("--save-dir", type=str, default="/tmp/policy/", help="directory in which training state and model should be saved") 45 | parser.add_argument("--save-rate", type=int, default=1000, help="save model once every time this many episodes are completed") 46 | parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded") 47 | # Evaluation 48 | parser.add_argument("--real-q-log", action="store_true", default=False,help="Evaluates approx. real q value after every 5 save-rates") 49 | parser.add_argument("--q-log-ep-len", type=int, default=200, help="Number of steps per state in q_eval") 50 | parser.add_argument("--restore", action="store_true", default=False) 51 | parser.add_argument("--display", action="store_true", default=False) 52 | parser.add_argument("--benchmark", action="store_true", default=False, help="Saves all locations and termination locations") 53 | parser.add_argument("--benchmark-iters", type=int, default=10000, help="number of iterations run for benchmarking") 54 | parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/", help="directory where benchmark data is saved") 55 | parser.add_argument("--plots-dir", type=str, default="./learning_curves/", help="directory where plot data is saved") 56 | parser.add_argument("--record-episodes", action="store_true", default=False, help="save rgb arrays of episodes") 57 | return parser.parse_args() 58 | 59 | 60 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None): 61 | # This model takes as input an observation and returns values of all actions 62 | with tf.variable_scope(scope, reuse=reuse): 63 | out = input 64 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 65 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 66 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None) 67 | return out 68 | 69 | def make_env(scenario_name, arglist, benchmark=False): 70 | from multiagent.environment import MultiAgentEnv 71 | import multiagent.scenarios as scenarios 72 | 73 | # load scenario from script 74 | scenario = scenarios.load(scenario_name + ".py").Scenario() 75 | # create world 76 | world = scenario.make_world() 77 | # create multiagent environment 78 | if benchmark: 79 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) 80 | else: 81 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 82 | return env 83 | 84 | 85 | def calculate_real_q_value(env: MultiAgentEnv, agents, world_state_buffer, action_n_buffer, start_episode_step_buffer, 86 | obs_n_buffer, num_start_states, args): 87 | """ 88 | 89 | :param env: 90 | :param agents: 91 | :param world_state_buffer: buffer of world states, from which we randomly sample 92 | :param action_n_buffer: buffer of action chosen in the 
world_state of same index 93 | :param num_start_states: 94 | :param len_eval: 95 | :return: 96 | """ 97 | world_sample_indexes = np.random.choice(range(len(world_state_buffer)), num_start_states) 98 | discounted_run_rewards_n = [] 99 | q_values_n = [] 100 | for start_idx, world_idx in enumerate(world_sample_indexes): 101 | env.world = deepcopy(world_state_buffer[world_idx]) 102 | episode_reward_n = [] 103 | action_n = action_n_buffer[world_idx] 104 | obs_n, reward_n, done_n, info_n = env.step(action_n) 105 | episode_reward_n.append(reward_n) 106 | # if arglist.q_log_full_episodes: 107 | episode_step = 0 108 | # else: 109 | # episode_step = start_episode_step_buffer[world_idx] 110 | 111 | terminal = False 112 | obs_n_reshaped = [] 113 | action_n_reshaped = [] 114 | for ag_idx in range(len(obs_n)): 115 | obs_n_reshaped.append([obs_n[ag_idx]]) 116 | action_n_reshaped.append([action_n[ag_idx]]) 117 | q_values_n.append([agent.q_debug['q_values'](*(obs_n_reshaped + action_n_reshaped)) for agent in agents]) 118 | 119 | while not (all(done_n) or terminal): 120 | action_n = [agent.action(obs) for agent, obs in zip(agents, obs_n)] 121 | obs_n, reward_n, done_n, info_n = env.step(action_n) 122 | episode_reward_n.append(reward_n) 123 | 124 | terminal = episode_step >= arglist.q_log_ep_len 125 | episode_step += 1 126 | 127 | discount_factors = np.power(args.gamma, np.arange(0, len(episode_reward_n), dtype=np.int)) 128 | discounted_run_rewards_n.append(np.dot(discount_factors, np.array(episode_reward_n))) 129 | 130 | q_mean = np.mean(q_values_n, 0)[:,0] 131 | real_mean = np.mean(discounted_run_rewards_n, 0) 132 | return q_mean, real_mean 133 | 134 | 135 | 136 | 137 | def get_trainers(env, num_adversaries, obs_shape_n, arglist, good_agent_mode='matd3', adv_agent_mode='matd3'): 138 | trainers = [] 139 | model = mlp_model 140 | if good_agent_mode=='matd3': 141 | good_trainer = MATD3AgentTrainer 142 | elif good_agent_mode=='maddpg': 143 | good_trainer = MADDPGAgentTrainer 144 | else: 145 | raise RuntimeError('Unknown agent mode specified' + str(good_agent_mode)) 146 | if adv_agent_mode== 'matd3': 147 | adv_trainer = MATD3AgentTrainer 148 | elif adv_agent_mode== 'maddpg': 149 | adv_trainer= MADDPGAgentTrainer 150 | else: 151 | raise RuntimeError('Unknown agent mode specified' + str(adv_agent_mode)) 152 | 153 | for i in range(num_adversaries): 154 | trainers.append(adv_trainer( 155 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 156 | local_q_func=(arglist.adv_policy == 'ddpg'))) 157 | for i in range(num_adversaries, env.n): 158 | trainers.append(good_trainer( 159 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 160 | local_q_func=(arglist.good_policy == 'ddpg'))) 161 | return trainers 162 | 163 | 164 | def train_maddpg(arglist): 165 | with U.single_threaded_session(): 166 | # Create environment 167 | env = make_env(arglist.scenario, arglist, arglist.benchmark) 168 | # Create agent trainers 169 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 170 | num_adversaries = min(env.n, arglist.num_adversaries) 171 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist, 172 | good_agent_mode=arglist.good_policy, adv_agent_mode=arglist.adv_policy) 173 | print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) 174 | 175 | # Initialize 176 | U.initialize() 177 | 178 | # Load previous results, if necessary 179 | if arglist.load_dir == "": 180 | arglist.load_dir = arglist.save_dir 181 | if arglist.display or 
arglist.restore or arglist.benchmark: 182 | print('Loading previous state...') 183 | U.load_state(arglist.load_dir) 184 | 185 | episode_rewards = [0.0] # sum of rewards for all agents 186 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 187 | final_ep_rewards = [] # sum of rewards for training curve 188 | final_ep_ag_rewards = [] # agent rewards for training curve 189 | agent_info = [[[]]] # placeholder for benchmarking info 190 | saver = tf.train.Saver(max_to_keep=None) 191 | obs_n = env.reset() 192 | episode_step = 0 193 | train_step = 0 194 | t_start = time.time() 195 | 196 | if arglist.real_q_log: 197 | world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], [] 198 | q_means, real_means = [], [] 199 | 200 | print('Starting iterations...') 201 | while True: 202 | # get action 203 | action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)] 204 | # environment step 205 | new_obs_n, rew_n, done_n, info_n = env.step(action_n) 206 | episode_step += 1 207 | done = all(done_n) # note: unused, never happens 208 | terminal = (episode_step >= arglist.max_episode_len) 209 | done = done or terminal 210 | 211 | if arglist.real_q_log: 212 | world_state_buffer.append(deepcopy(env.world)) 213 | obs_n_buffer.append(obs_n) 214 | action_n_buffer.append(action_n) 215 | start_episode_step_buffer.append(episode_step) 216 | 217 | # collect experience 218 | for i, agent in enumerate(trainers): 219 | agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done, terminal) 220 | obs_n = new_obs_n 221 | 222 | for i, rew in enumerate(rew_n): 223 | episode_rewards[-1] += rew 224 | agent_rewards[i][-1] += rew 225 | 226 | 227 | 228 | if done or terminal: 229 | obs_n = env.reset() 230 | episode_step = 0 231 | episode_rewards.append(0) # add element for next episode 232 | for a in agent_rewards: 233 | a.append(0) 234 | agent_info.append([[]]) 235 | 236 | # increment global step counter 237 | train_step += 1 238 | 239 | # for benchmarking learned policies 240 | if arglist.benchmark: 241 | for i, info in enumerate(info_n): 242 | agent_info[-1][i].append(info_n['n']) 243 | if train_step > arglist.benchmark_iters and (done or terminal): 244 | file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' 245 | print('Finished benchmarking, now saving...') 246 | with open(file_name, 'wb') as fp: 247 | pickle.dump(agent_info[:-1], fp) 248 | break 249 | continue 250 | 251 | # for displaying learned policies 252 | if arglist.display: 253 | time.sleep(0.1) 254 | env.render() 255 | continue 256 | 257 | for agent in trainers: 258 | loss = agent.update(trainers, train_step) 259 | 260 | 261 | # save model, display training output 262 | if terminal and (len(episode_rewards) % arglist.save_rate == 0): 263 | if arglist.save_dir != '/tmp/policy/': 264 | U.save_state(arglist.save_dir + arglist.exp_name, saver=saver, global_step=len(episode_rewards)) 265 | else: 266 | U.save_state(arglist.save_dir, saver=saver) # print statement depends on whether or not there are adversaries 267 | if num_adversaries == 0: 268 | print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 269 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:-1]), round(time.time()-t_start, 3))) 270 | else: 271 | print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 272 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:-1]), 273 | [np.mean(rew[-arglist.save_rate:]) for rew in 
agent_rewards], round(time.time()-t_start, 3))) 274 | t_start = time.time() 275 | # Keep track of final episode reward 276 | final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:-1])) 277 | for rew in agent_rewards: 278 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:-1])) 279 | 280 | if arglist.real_q_log and (len(episode_rewards) % (5 * arglist.save_rate) == 0): 281 | q_mean, real_mean = calculate_real_q_value(deepcopy(env), trainers, 282 | world_state_buffer=world_state_buffer, 283 | action_n_buffer=action_n_buffer, 284 | obs_n_buffer=obs_n_buffer, 285 | start_episode_step_buffer=start_episode_step_buffer, 286 | num_start_states=200, 287 | args=arglist) 288 | world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], [] 289 | q_means.append(q_mean) 290 | real_means.append(real_mean) 291 | print('Q-mean: ' + str(q_mean) + ' Real mean: ' + str(real_mean)) 292 | 293 | 294 | 295 | 296 | # saves final episode reward for plotting training curve later 297 | if len(episode_rewards) > arglist.num_episodes: 298 | rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' 299 | with open(rew_file_name, 'wb') as fp: 300 | pickle.dump(final_ep_rewards, fp) 301 | agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' 302 | with open(agrew_file_name, 'wb') as fp: 303 | pickle.dump(final_ep_ag_rewards, fp) 304 | args_file_name = arglist.plots_dir + arglist.exp_name + '_args.pkl' 305 | with open(args_file_name, 'wb') as fp: 306 | pickle.dump(arglist, fp) 307 | if arglist.real_q_log: 308 | real_q_path = arglist.plots_dir + arglist.exp_name + '_q_values.pkl' 309 | with open(real_q_path, 'wb') as fp: 310 | pickle.dump({'q_means': q_means, 'real_means': real_means}, fp) 311 | print('...Finished total of {} episodes.'.format(len(episode_rewards))) 312 | break 313 | 314 | 315 | if __name__ == '__main__': 316 | arglist = parse_args() 317 | train_maddpg(arglist) 318 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | * Use this code to replicate the results from the paper; for a more readable TF 2.x implementation, check out [tf2multiagentrl](https://github.com/JohannesAck/tf2multiagentrl). 2 | 3 | # Implementation of Multi-Agent TD3 4 | 5 | This is the implementation of MATD3, presented in our paper [Reducing Overestimation Bias in Multi-Agent Domains Using Double Centralized Critics](https://arxiv.org/pdf/1910.01465.pdf). 6 | Multi-Agent TD3 is an algorithm for multi-agent reinforcement learning that combines the improvements of [TD3](https://arxiv.org/pdf/1802.09477.pdf) with [MADDPG](https://arxiv.org/pdf/1706.02275.pdf). 7 | 8 | The implementation here is closely based on [maddpg from Ryan Lowe / OpenAI](https://github.com/openai/maddpg), to enable a fair comparison. The environments used are from [multiagent-particle-envs from OpenAI](https://github.com/openai/multiagent-particle-envs). 
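For orientation, here is a minimal NumPy sketch of the two TD3 ingredients MATD3 adds on top of MADDPG: target policy smoothing and a clipped double-Q target for the centralized critics. The function and argument names are illustrative only (they are not this repository's API); the default values mirror the `--critic-action-noise-stddev`, `--action-noise-clip` and `--gamma` defaults in `train.py`, where the actual TensorFlow update is implemented by the trainer classes.

```
import numpy as np

def smooth_target_action(target_action, noise_stddev=0.2, noise_clip=0.5):
    """TD3-style target policy smoothing: add clipped Gaussian noise to the
    target policy's next action before it is fed to the target critics."""
    noise = np.clip(np.random.normal(0.0, noise_stddev, size=np.shape(target_action)),
                    -noise_clip, noise_clip)
    return target_action + noise

def clipped_double_q_target(reward, done, target_q1, target_q2, gamma=0.95):
    """One-step TD target that bootstraps from the minimum of the two target
    critic estimates, which is how MATD3 reduces overestimation bias."""
    min_q = np.minimum(target_q1, target_q2)
    return reward + gamma * (1.0 - float(done)) * min_q

# Example for a single transition of one agent:
y = clipped_double_q_target(reward=1.0, done=False, target_q1=4.2, target_q2=3.9)
# y == 1.0 + 0.95 * 3.9
```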
9 | 10 | 11 | ### Requirements 12 | - ```python == 3.6``` 13 | - ```TF == 1.12.0``` (any TF 1.x release should work) 14 | - ```Gym == 0.10.5``` (*this exact version is important*) 15 | - ```Numpy >= 1.16.2``` 16 | 17 | ### Example Usage 18 | To start training on simple_crypto, with an MATD3 team of agents and an MADDPG adversary, use 19 | ``` 20 | python train.py --scenario simple_crypto --good-policy matd3 --adv-policy maddpg 21 | ``` 22 | 23 | 24 | ### Reference 25 | If you use our implementation, please also cite our paper with 26 | ``` 27 | @misc{ackermann2019reducing, 28 | title={Reducing Overestimation Bias in Multi-Agent Domains Using Double Centralized Critics}, 29 | author={Johannes Ackermann and Volker Gabler and Takayuki Osa and Masashi Sugiyama}, 30 | year={2019}, 31 | eprint={1910.01465}, 32 | archivePrefix={arXiv}, 33 | primaryClass={cs.LG} 34 | } 35 | 36 | ``` --------------------------------------------------------------------------------