├── .gitignore ├── matd3 ├── common │ ├── distributions.py │ └── tf_util.py ├── maddpg │ ├── __init__.py │ └── trainer │ │ ├── maddpg.py │ │ └── replay_buffer.py ├── matd3 │ ├── __init__.py │ └── trainer │ │ ├── matd3.py │ │ └── replay_buffer.py ├── multiagent │ ├── __init__.py │ ├── core.py │ ├── environment.py │ ├── multi_discrete.py │ ├── policy.py │ ├── rendering.py │ ├── scenario.py │ └── scenarios │ │ ├── __init__.py │ │ ├── simple.py │ │ ├── simple_adversary.py │ │ ├── simple_crypto.py │ │ ├── simple_push.py │ │ ├── simple_reference.py │ │ ├── simple_speaker_listener.py │ │ ├── simple_spread.py │ │ ├── simple_spread_two_ag.py │ │ ├── simple_tag.py │ │ └── simple_world_comm.py └── train.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.npy 6 | #Documentation 7 | *.html 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # PyCharm 77 | .idea 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | /site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | logdir 111 | logdirMaddpg 112 | leraning_curves 113 | parallelRunStarter.py 114 | learning_curves/ 115 | *.pkl 116 | *.pdf 117 | data/* 118 | -------------------------------------------------------------------------------- /matd3/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | 
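# Illustrative sketch (not part of the original file) of how the Pd interface above
# and the PdType factories defined below are used together: a PdType builds the
# action placeholder and wraps the flat policy-network output into a concrete
# distribution, e.g.
#   pdtype = SoftCategoricalPdType(5)            # 5 discrete action choices
#   act_ph = pdtype.sample_placeholder([None], name="action")
#   pd     = pdtype.pdfromflat(policy_logits)    # policy_logits: [batch, 5] tensor (hypothetical name)
#   sample = pd.sample()                         # Gumbel-softmax style action sample
#   logp   = pd.logp(act_ph)                     # log-probability of a given action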
class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return [] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - 
tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 | ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 203 | def sample(self): 204 | u = tf.random_uniform(tf.shape(self.logits)) 205 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 206 | @classmethod 207 | def fromflat(cls, flat): 208 | return cls(flat) 209 | 210 | class MultiCategoricalPd(Pd): 211 | def __init__(self, low, high, flat): 212 | self.flat = flat 213 | self.low = tf.constant(low, dtype=tf.int32) 214 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 215 | def flatparam(self): 216 | return self.flat 217 | def mode(self): 218 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 219 | def logp(self, x): 220 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 221 | def kl(self, other): 222 | return tf.add_n([ 223 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 224 | ]) 225 | def entropy(self): 226 | return tf.add_n([p.entropy() for p in self.categoricals]) 227 | def sample(self): 228 | return self.low + tf.cast(tf.stack([p.sample() for p in 
self.categoricals], axis=-1), tf.int32) 229 | @classmethod 230 | def fromflat(cls, flat): 231 | return cls(flat) 232 | 233 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 234 | def __init__(self, low, high, flat): 235 | self.flat = flat 236 | self.low = tf.constant(low, dtype=tf.float32) 237 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 238 | def flatparam(self): 239 | return self.flat 240 | def mode(self): 241 | x = [] 242 | for i in range(len(self.categoricals)): 243 | x.append(self.low[i] + self.categoricals[i].mode()) 244 | return tf.concat(x, axis=-1) 245 | def logp(self, x): 246 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 247 | def kl(self, other): 248 | return tf.add_n([ 249 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 250 | ]) 251 | def entropy(self): 252 | return tf.add_n([p.entropy() for p in self.categoricals]) 253 | def sample(self): 254 | x = [] 255 | for i in range(len(self.categoricals)): 256 | x.append(self.low[i] + self.categoricals[i].sample()) 257 | return tf.concat(x, axis=-1) 258 | @classmethod 259 | def fromflat(cls, flat): 260 | return cls(flat) 261 | 262 | class DiagGaussianPd(Pd): 263 | def __init__(self, flat): 264 | self.flat = flat 265 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 266 | self.mean = mean 267 | self.logstd = logstd 268 | self.std = tf.exp(logstd) 269 | def flatparam(self): 270 | return self.flat 271 | def mode(self): 272 | return self.mean 273 | def logp(self, x): 274 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 275 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 276 | - U.sum(self.logstd, axis=1) 277 | def kl(self, other): 278 | assert isinstance(other, DiagGaussianPd) 279 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 280 | def entropy(self): 281 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 282 | def sample(self): 283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | def make_pdtype(ac_space): 311 | from gym import spaces 312 | if isinstance(ac_space, spaces.Box): 313 | assert len(ac_space.shape) == 1 314 | return DiagGaussianPdType(ac_space.shape[0]) 315 | elif isinstance(ac_space, spaces.Discrete): 316 | # return CategoricalPdType(ac_space.n) 317 | return 
SoftCategoricalPdType(ac_space.n) 318 | elif isinstance(ac_space, MultiDiscrete): 319 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 320 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 321 | elif isinstance(ac_space, spaces.MultiBinary): 322 | return BernoulliPdType(ac_space.n) 323 | else: 324 | raise NotImplementedError 325 | 326 | def shape_el(v, i): 327 | maybe = v.get_shape()[i] 328 | if maybe is not None: 329 | return maybe 330 | else: 331 | return tf.shape(v)[i] 332 | -------------------------------------------------------------------------------- /matd3/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, dim=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 
50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimizes `objective` using `optimizer` w.r.t. 
variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2,allow_growth=True) 164 | tf_config = tf.ConfigProto( 165 | inter_op_parallelism_threads=num_cpu, 166 | intra_op_parallelism_threads=num_cpu, 167 | gpu_options=gpu_options) 168 | return tf.Session(config=tf_config) 169 | 170 | 171 | def single_threaded_session(): 172 | """Returns a session which will only use a single CPU""" 173 | return make_session(1) 174 | 175 | 176 | ALREADY_INITIALIZED = set() 177 | 178 | 179 | def initialize(): 180 | """Initialize all the uninitialized variables in the global scope.""" 181 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 182 | get_session().run(tf.variables_initializer(new_variables)) 183 | ALREADY_INITIALIZED.update(new_variables) 184 | 185 | 186 | # ================================================================ 187 | # Scopes 188 | # ================================================================ 189 | 190 | 191 | def scope_vars(scope, trainable_only=False): 192 | """ 193 | Get variables inside a scope 194 | The scope can be specified as a string 195 | 196 | Parameters 197 | ---------- 198 | scope: str or VariableScope 199 | scope in which the variables reside. 200 | trainable_only: bool 201 | whether or not to return only the variables that were marked as trainable. 202 | 203 | Returns 204 | ------- 205 | vars: [tf.Variable] 206 | list of variables in `scope`. 207 | """ 208 | return tf.get_collection( 209 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 210 | scope=scope if isinstance(scope, str) else scope.name 211 | ) 212 | 213 | 214 | def scope_name(): 215 | """Returns the name of current scope as a string, e.g. 
deepq/q_func""" 216 | return tf.get_variable_scope().name 217 | 218 | 219 | def absolute_scope_name(relative_scope_name): 220 | """Appends parent scope name to `relative_scope_name`""" 221 | return scope_name() + "/" + relative_scope_name 222 | 223 | # ================================================================ 224 | # Saving variables 225 | # ================================================================ 226 | 227 | 228 | def load_state(fname, saver=None): 229 | """Load all the variables to the current session from the location """ 230 | if saver is None: 231 | saver = tf.train.Saver() 232 | saver.restore(get_session(), fname) 233 | return saver 234 | 235 | 236 | def save_state(fname, saver=None, global_step=None): 237 | """Save all the variables in the current session to the location """ 238 | os.makedirs(os.path.dirname(fname), exist_ok=True) 239 | if saver is None: 240 | saver = tf.train.Saver(max_to_keep=None) 241 | if global_step is not None: 242 | saver.save(get_session(), fname, global_step) 243 | else: 244 | saver.save(get_session(), fname) 245 | return saver 246 | 247 | # ================================================================ 248 | # Theano-like Function 249 | # ================================================================ 250 | 251 | 252 | def function(inputs, outputs, updates=None, givens=None): 253 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 254 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 255 | values to be feed to the inputs placeholders and produces the values of the experessions 256 | in outputs. 257 | 258 | Input values can be passed in the same order as inputs or can be provided as kwargs based 259 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 260 | 261 | Example: 262 | x = tf.placeholder(tf.int32, (), name="x") 263 | y = tf.placeholder(tf.int32, (), name="y") 264 | z = 3 * x + 2 * y 265 | lin = function([x, y], z, givens={y: 0}) 266 | 267 | with single_threaded_session(): 268 | initialize() 269 | 270 | assert lin(2) == 6 271 | assert lin(x=3) == 9 272 | assert lin(2, 2) == 10 273 | assert lin(x=2, y=3) == 12 274 | 275 | Parameters 276 | ---------- 277 | inputs: [tf.placeholder or TfInput] 278 | list of input arguments 279 | outputs: [tf.Variable] or tf.Variable 280 | list of outputs or a single output to be returned from function. Returned 281 | value will also have the same shape. 
282 | """ 283 | if isinstance(outputs, list): 284 | return _Function(inputs, outputs, updates, givens=givens) 285 | elif isinstance(outputs, (dict, collections.OrderedDict)): 286 | f = _Function(inputs, outputs.values(), updates, givens=givens) 287 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 288 | else: 289 | f = _Function(inputs, [outputs], updates, givens=givens) 290 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 291 | 292 | 293 | class _Function(object): 294 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 295 | for inpt in inputs: 296 | if not issubclass(type(inpt), TfInput): 297 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 298 | self.inputs = inputs 299 | updates = updates or [] 300 | self.update_group = tf.group(*updates) 301 | self.outputs_update = list(outputs) + [self.update_group] 302 | self.givens = {} if givens is None else givens 303 | self.check_nan = check_nan 304 | 305 | def _feed_input(self, feed_dict, inpt, value): 306 | if issubclass(type(inpt), TfInput): 307 | feed_dict.update(inpt.make_feed_dict(value)) 308 | elif is_placeholder(inpt): 309 | feed_dict[inpt] = value 310 | 311 | def __call__(self, *args, **kwargs): 312 | assert len(args) <= len(self.inputs), "Too many arguments provided" 313 | feed_dict = {} 314 | # Update the args 315 | for inpt, value in zip(self.inputs, args): 316 | self._feed_input(feed_dict, inpt, value) 317 | # Update the kwargs 318 | kwargs_passed_inpt_names = set() 319 | for inpt in self.inputs[len(args):]: 320 | inpt_name = inpt.name.split(':')[0] 321 | inpt_name = inpt_name.split('/')[-1] 322 | assert inpt_name not in kwargs_passed_inpt_names, \ 323 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 324 | if inpt_name in kwargs: 325 | kwargs_passed_inpt_names.add(inpt_name) 326 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 327 | else: 328 | assert inpt in self.givens, "Missing argument " + inpt_name 329 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 330 | # Update feed dict with givens. 
331 | for inpt in self.givens: 332 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 333 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 334 | if self.check_nan: 335 | if any(np.isnan(r).any() for r in results): 336 | raise RuntimeError("Nan detected") 337 | return results 338 | -------------------------------------------------------------------------------- /matd3/maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /matd3/maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from IPython import embed 4 | 5 | import common.tf_util as U 6 | 7 | from maddpg import AgentTrainer 8 | from common.distributions import make_pdtype 9 | from maddpg.trainer.replay_buffer import ReplayBuffer 10 | 11 | 12 | def make_update_exp(vals, target_vals): 13 | polyak = 1.0 - 1e-2 14 | expression = [] 15 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 16 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 17 | expression = tf.group(*expression) 18 | return U.function([], [], updates=[expression]) 19 | 20 | 21 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, 22 | local_q_func=False, num_units=64, scope="trainer", reuse=None): 23 | """ 24 | 25 | :param make_obs_ph_n: 26 | :param act_space_n: 27 | :param p_index: 28 | :param p_func: in base maddpg code = mlp_model 29 | :param q_func: in base maddpg code = mlp_model 30 | :param optimizer: 31 | :param grad_norm_clipping: 32 | :param local_q_func: 33 | :param num_units: 34 | :param scope: 35 | :param reuse: 36 | :return: 37 | """ 38 | with tf.variable_scope(scope, reuse=reuse): 39 | # create distribtuions 40 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 41 | 42 | # set up placeholders 43 | obs_ph_n = make_obs_ph_n 44 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 45 | 46 | p_input = obs_ph_n[p_index] 47 | 48 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) 49 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 50 | 51 | # wrap parameters in distribution 52 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 53 | 54 | act_sample = act_pd.sample() 55 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 56 | 57 | act_input_n = act_ph_n + [] 58 | act_input_n[p_index] = act_pd.sample() 59 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 60 | 61 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0] 62 | pg_loss = -tf.reduce_mean(q) 63 | 64 | loss = pg_loss + p_reg * 1e-3 65 | 66 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 67 | 68 | # Create callable functions 69 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, 
updates=[optimize_expr]) 70 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) 71 | p_values = U.function([obs_ph_n[p_index]], p) 72 | 73 | # target network 74 | target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) 75 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 76 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 77 | 78 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 79 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) 80 | 81 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 82 | 83 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 84 | with tf.variable_scope(scope, reuse=reuse): 85 | # create distribtuions 86 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 87 | 88 | # set up placeholders 89 | obs_ph_n = make_obs_ph_n 90 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 91 | target_ph = tf.placeholder(tf.float32, [None], name="target") 92 | 93 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 94 | if local_q_func: 95 | q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) 96 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0] 97 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 98 | 99 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 100 | 101 | # viscosity solution to Bellman differential equation in place of an initial condition 102 | q_reg = tf.reduce_mean(tf.square(q)) 103 | loss = q_loss #+ 1e-3 * q_reg 104 | 105 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 106 | 107 | # Create callable functions 108 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 109 | q_values = U.function(obs_ph_n + act_ph_n, q) 110 | 111 | action_grad = optimizer.compute_gradients(q, act_ph_n) 112 | action_grad_func = U.function(inputs=obs_ph_n + act_ph_n, outputs=action_grad) 113 | 114 | # target network 115 | target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0] 116 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 117 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 118 | 119 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 120 | 121 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values, 'action_grad': action_grad_func} 122 | 123 | class MADDPGAgentTrainer(AgentTrainer): 124 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 125 | self.name = name 126 | self.n = len(obs_shape_n) 127 | self.agent_index = agent_index 128 | self.args = args 129 | obs_ph_n = [] 130 | for i in range(self.n): 131 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 132 | 133 | # Create all the functions necessary to train the model 134 | self.q_train, self.q_update, self.q_debug = q_train( 135 | scope=self.name, 136 | make_obs_ph_n=obs_ph_n, 137 | act_space_n=act_space_n, 138 | q_index=agent_index, 139 | q_func=model, 140 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 141 | grad_norm_clipping=0.5, 142 | local_q_func=local_q_func, 143 | num_units=args.num_units 144 | ) 145 | self.act, 
self.p_train, self.p_update, self.p_debug = p_train( 146 | scope=self.name, 147 | make_obs_ph_n=obs_ph_n, 148 | act_space_n=act_space_n, 149 | p_index=agent_index, 150 | p_func=model, 151 | q_func=model, 152 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 153 | grad_norm_clipping=0.5, 154 | local_q_func=local_q_func, 155 | num_units=args.num_units 156 | ) 157 | # Create experience buffer 158 | self.replay_buffer = ReplayBuffer(1e6) 159 | self.min_replay_buffer_len = args.batch_size * args.max_episode_len 160 | self.replay_sample_index = None 161 | a = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph()) 162 | a.flush() 163 | a.close() 164 | 165 | def action(self, obs): 166 | return self.act(obs[None])[0] 167 | 168 | def experience(self, obs, act, rew, new_obs, done, terminal): 169 | # Store transition in the replay buffer. 170 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 171 | 172 | def preupdate(self): 173 | self.replay_sample_index = None 174 | 175 | def update(self, agents, t): 176 | if len(self.replay_buffer) < self.min_replay_buffer_len: # replay buffer is not large enough 177 | return 178 | if not t % self.args.update_rate == 0: # only update every 100 steps 179 | return 180 | 181 | self.replay_sample_index = self.replay_buffer.generate_sample_indices(self.args.batch_size) 182 | # collect replay sample from all agents 183 | obs_n = [] 184 | obs_next_n = [] 185 | act_n = [] 186 | index = self.replay_sample_index 187 | for i in range(self.n): 188 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 189 | obs_n.append(obs) 190 | obs_next_n.append(obs_next) 191 | act_n.append(act) 192 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 193 | 194 | 195 | # train q network 196 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 197 | 198 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 199 | if self.args.critic_zero_if_done: 200 | done_cond = done == True 201 | target_q_next[done_cond] = 0 202 | 203 | target_q = rew + self.args.gamma * target_q_next 204 | q_loss = self.q_train(*(obs_n + act_n + [target_q])) 205 | # print('Action gradient = ') 206 | 207 | # train p network 208 | p_loss = self.p_train(*(obs_n + act_n)) 209 | 210 | self.p_update() 211 | self.q_update() 212 | #embed() 213 | 214 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 215 | -------------------------------------------------------------------------------- /matd3/maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
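        Note: sampling from this buffer is uniform at random; transitions are
        not prioritized.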
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 45 | 46 | def generate_sample_indices(self, batch_size): 47 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 48 | 49 | def make_latest_index(self, batch_size): 50 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 51 | np.random.shuffle(idx) 52 | return idx 53 | 54 | def sample_index(self, idxes): 55 | return self._encode_sample(idxes) 56 | 57 | def sample(self, batch_size): 58 | """Sample a batch of experiences. 59 | 60 | Parameters 61 | ---------- 62 | batch_size: int 63 | How many transitions to sample. 64 | 65 | Returns 66 | ------- 67 | obs_batch: np.array 68 | batch of observations 69 | act_batch: np.array 70 | batch of actions executed given obs_batch 71 | rew_batch: np.array 72 | rewards received as results of executing act_batch 73 | next_obs_batch: np.array 74 | next set of observations seen after executing act_batch 75 | done_mask: np.array 76 | done_mask[i] = 1 if executing act_batch[i] resulted in 77 | the end of an episode and 0 otherwise. 
78 | """ 79 | if batch_size > 0: 80 | idxes = self.generate_sample_indices(batch_size) 81 | else: 82 | idxes = range(0, len(self._storage)) 83 | return self._encode_sample(idxes) 84 | 85 | def collect(self): 86 | return self.sample(-1) 87 | -------------------------------------------------------------------------------- /matd3/matd3/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() 16 | -------------------------------------------------------------------------------- /matd3/matd3/trainer/matd3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | import common.tf_util as U 5 | from maddpg import AgentTrainer 6 | from common.distributions import make_pdtype 7 | from maddpg.trainer.replay_buffer import ReplayBuffer 8 | 9 | 10 | def make_update_exp(vals, target_vals): 11 | polyak = 1.0 - 1e-2 12 | expression = [] 13 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 14 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 15 | expression = tf.group(*expression) 16 | return U.function([], [], updates=[expression]) 17 | 18 | 19 | def p_train(make_obs_ph_n, act_space_n, agent_idx, p_func, q_func, optimizer, grad_norm_clipping=None, 20 | local_q_func=False, num_units=64, scope="trainer", reuse=None): 21 | """ 22 | 23 | :param make_obs_ph_n: 24 | :param act_space_n: 25 | :param agent_idx: 26 | :param p_func: in base maddpg code = mlp_model 27 | :param q_func: in base maddpg code = mlp_model 28 | :param optimizer: 29 | :param grad_norm_clipping: 30 | :param local_q_func: 31 | :param num_units: 32 | :param scope: 33 | :param reuse: 34 | :return: 35 | """ 36 | with tf.variable_scope(scope, reuse=reuse): 37 | # create distribtuions 38 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 39 | 40 | # set up placeholders 41 | obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n] 42 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 43 | 44 | p_input = obs_ph_n[agent_idx] 45 | 46 | p = p_func(p_input, int(act_pdtype_n[agent_idx].param_shape()[0]), scope="p_func", num_units=num_units) 47 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 48 | 49 | # wrap parameters in distribution 50 | act_pd = act_pdtype_n[agent_idx].pdfromflat(p) 51 | 52 | act_sample = act_pd.sample() 53 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 54 | 55 | act_input_n = act_ph_n + [] 56 | act_input_n[agent_idx] = act_pd.sample() #act_pd.mode() # 57 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 58 | 59 | q = q_func(q_input, 1, scope="q_func" + str(1), reuse=True, num_units=num_units)[:,0] 60 | 61 | loss = -tf.reduce_mean(q) + p_reg * 1e-3 62 | 63 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 64 | 65 | # Create callable functions 66 | train = U.function(inputs=make_obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 67 | act = 
U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=act_sample) 68 | p_values = U.function([make_obs_ph_n[agent_idx]], p) 69 | 70 | # target network 71 | target_p = p_func(p_input, int(act_pdtype_n[agent_idx].param_shape()[0]), scope="target_p_func", num_units=num_units) 72 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 73 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 74 | 75 | target_act_sample = act_pdtype_n[agent_idx].pdfromflat(target_p).sample() 76 | target_act = U.function(inputs=[make_obs_ph_n[agent_idx]], outputs=target_act_sample) 77 | 78 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 79 | 80 | def q_train(make_obs_ph_n, act_space_n, agent_idx, q_func, q_function_idx, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 81 | with tf.variable_scope(scope, reuse=reuse): 82 | # create distribtuions 83 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 84 | 85 | # set up placeholders 86 | obs_ph_n = [tf.layers.flatten(obs_ph) for obs_ph in make_obs_ph_n] 87 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 88 | target_ph = tf.placeholder(tf.float32, [None], name="target") 89 | 90 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 91 | if local_q_func: 92 | q_input = tf.concat([obs_ph_n[agent_idx], act_ph_n[agent_idx]], 1) 93 | q = q_func(q_input, 1, scope="q_func" + str(q_function_idx), num_units=num_units)[:,0] 94 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func" + str(q_function_idx))) 95 | 96 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 97 | 98 | # viscosity solution to Bellman differential equation in place of an initial condition 99 | q_reg = tf.reduce_mean(tf.square(q)) 100 | loss = q_loss #+ 1e-3 * q_reg 101 | 102 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 103 | 104 | # Create callable functions 105 | train = U.function(inputs=make_obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 106 | q_values = U.function(make_obs_ph_n + act_ph_n, q) 107 | 108 | # target network 109 | target_q = q_func(q_input, 1, scope="target_q_func" + str(q_function_idx), num_units=num_units)[:,0] 110 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func" + str(q_function_idx))) 111 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 112 | 113 | target_q_values = U.function(make_obs_ph_n + act_ph_n, target_q) 114 | 115 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 116 | 117 | class MATD3AgentTrainer(AgentTrainer): 118 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 119 | self.name = name 120 | self.n = len(obs_shape_n) 121 | self.agent_index = agent_index 122 | self.args = args 123 | obs_ph_n = [] 124 | for i in range(self.n): 125 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 126 | 127 | # Create all the functions necessary to train the model 128 | self.q_train1, self.q_update1, self.q_debug1 = q_train( 129 | scope=self.name, 130 | make_obs_ph_n=obs_ph_n, 131 | act_space_n=act_space_n, 132 | agent_idx=agent_index, 133 | q_function_idx=1, 134 | q_func=model, 135 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 136 | grad_norm_clipping=0.5, 137 | local_q_func=local_q_func, 138 | num_units=args.num_units 139 | ) 140 | self.q_train2, 
self.q_update2, self.q_debug2 = q_train( 141 | scope=self.name, 142 | make_obs_ph_n=obs_ph_n, 143 | act_space_n=act_space_n, 144 | agent_idx=agent_index, 145 | q_func=model, 146 | q_function_idx=2, 147 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 148 | grad_norm_clipping=0.5, 149 | local_q_func=local_q_func, 150 | num_units=args.num_units 151 | ) 152 | 153 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 154 | scope=self.name, 155 | make_obs_ph_n=obs_ph_n, 156 | act_space_n=act_space_n, 157 | agent_idx=agent_index, 158 | p_func=model, 159 | q_func=model, #MLPmodel() 160 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 161 | grad_norm_clipping=0.5, 162 | local_q_func=local_q_func, 163 | num_units=args.num_units 164 | ) 165 | # Create experience buffer 166 | self.replay_buffer = ReplayBuffer(1e6) 167 | self.min_replay_buffer_len = args.batch_size * args.max_episode_len 168 | self.replay_sample_index = None 169 | a = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph()) 170 | a.flush() 171 | a.close() 172 | 173 | def action(self, obs): 174 | return self.act(obs[None])[0] 175 | 176 | def experience(self, obs, act, rew, new_obs, done, terminal): 177 | # Store transition in the replay buffer. 178 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 179 | 180 | def preupdate(self): 181 | self.replay_sample_index = None 182 | 183 | @property 184 | def q_debug(self): 185 | return self.q_debug1 186 | 187 | def update(self, agents, train_step): 188 | if len(self.replay_buffer) < self.min_replay_buffer_len: # replay buffer is not large enough 189 | return 190 | 191 | if not train_step % self.args.update_rate == 0: 192 | return 193 | 194 | 195 | self.replay_sample_index = self.replay_buffer.generate_sample_indices(self.args.batch_size) 196 | # collect replay sample from all agents 197 | obs_n = [] 198 | obs_next_n = [] 199 | act_n = [] 200 | index = self.replay_sample_index 201 | for i in range(self.n): 202 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 203 | obs_n.append(obs) 204 | obs_next_n.append(obs_next) 205 | act_n.append(act) 206 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 207 | 208 | # train q network 209 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 210 | if self.args.use_critic_noise: 211 | for agent_idx in range(self.n): 212 | noise = np.random.normal(0, self.args.critic_action_noise_stddev, size=target_act_next_n[agent_idx].shape) 213 | clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) 214 | target_act_next_n[agent_idx] = (target_act_next_n[agent_idx] + clipped_noise).tolist() 215 | elif self.args.use_critic_noise_self: 216 | noise = np.random.normal(0, self.args.critic_action_noise_stddev, 217 | size=target_act_next_n[self.agent_index].shape) 218 | clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) 219 | target_act_next_n[self.agent_index] = target_act_next_n[self.agent_index] + clipped_noise 220 | target_act_next_n = target_act_next_n.tolist() 221 | else: 222 | target_act_next_n = target_act_next_n 223 | target_q_next1 = self.q_debug1['target_q_values'](*(obs_next_n + target_act_next_n)) 224 | target_q_next2 = self.q_debug2['target_q_values'](*(obs_next_n + target_act_next_n)) 225 | target_q_next = np.min([target_q_next1, target_q_next2], 0) 226 | if self.args.critic_zero_if_done: 227 | done_cond = done == True 228 | target_q_next[done_cond] = 0 
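        # TD3 ingredients at work here: the Bellman target built below uses the
        # element-wise minimum of the two target critics (clipped double-Q),
        # target actions were optionally smoothed with clipped noise above, and
        # the actor/target-network updates further down are delayed relative to
        # the critic updates.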
229 | 230 | target_q = rew + self.args.gamma * target_q_next 231 | q_loss = self.q_train1(*(obs_n + act_n + [target_q])) 232 | q_loss = self.q_train2(*(obs_n + act_n + [target_q])) 233 | 234 | # train p network 235 | if train_step % (self.args.update_rate * self.args.policy_update_rate) == 0: 236 | p_loss = self.p_train(*(obs_n + act_n)) 237 | self.p_update() 238 | self.q_update1() 239 | self.q_update2() 240 | 241 | # print('Agent' + str(self.agent_index) + ' Qloss = ' + str(q_loss) + ' Ploss = ' + str(p_loss)) 242 | # print('Replay buffer size:' + str(len(self.replay_buffer))) 243 | 244 | 245 | return [q_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 246 | -------------------------------------------------------------------------------- /matd3/matd3/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size): 7 | """Create Prioritized Replay buffer. 8 | 9 | Parameters 10 | ---------- 11 | size: int 12 | Max number of transitions to store in the buffer. When the buffer 13 | overflows the old memories are dropped. 14 | """ 15 | self._storage = [] 16 | self._maxsize = int(size) 17 | self._next_idx = 0 18 | 19 | def __len__(self): 20 | return len(self._storage) 21 | 22 | def clear(self): 23 | self._storage = [] 24 | self._next_idx = 0 25 | 26 | def add(self, obs_t, action, reward, obs_tp1, done): 27 | data = (obs_t, action, reward, obs_tp1, done) 28 | 29 | if self._next_idx >= len(self._storage): 30 | self._storage.append(data) 31 | else: 32 | self._storage[self._next_idx] = data 33 | self._next_idx = (self._next_idx + 1) % self._maxsize 34 | 35 | def _encode_sample(self, idxes): 36 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 37 | for i in idxes: 38 | data = self._storage[i] 39 | obs_t, action, reward, obs_tp1, done = data 40 | obses_t.append(np.array(obs_t, copy=False)) 41 | actions.append(np.array(action, copy=False)) 42 | rewards.append(reward) 43 | obses_tp1.append(np.array(obs_tp1, copy=False)) 44 | dones.append(done) 45 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 46 | 47 | def generate_sample_indices(self, batch_size): 48 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 49 | 50 | def make_latest_index(self, batch_size): 51 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 52 | np.random.shuffle(idx) 53 | return idx 54 | 55 | def sample_index(self, idxes): 56 | return self._encode_sample(idxes) 57 | 58 | def sample(self, batch_size): 59 | """Sample a batch of experiences. 60 | 61 | Parameters 62 | ---------- 63 | batch_size: int 64 | How many transitions to sample. 65 | 66 | Returns 67 | ------- 68 | obs_batch: np.array 69 | batch of observations 70 | act_batch: np.array 71 | batch of actions executed given obs_batch 72 | rew_batch: np.array 73 | rewards received as results of executing act_batch 74 | next_obs_batch: np.array 75 | next set of observations seen after executing act_batch 76 | done_mask: np.array 77 | done_mask[i] = 1 if executing act_batch[i] resulted in 78 | the end of an episode and 0 otherwise. 
79 | """ 80 | if batch_size > 0: 81 | idxes = self.generate_sample_indices(batch_size) 82 | else: 83 | idxes = range(0, len(self._storage)) 84 | return self._encode_sample(idxes) 85 | 86 | def collect(self): 87 | return self.sample(-1) 88 | -------------------------------------------------------------------------------- /matd3/multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /matd3/multiagent/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # physical/external base state of all entites 4 | class EntityState(object): 5 | def __init__(self): 6 | # physical position 7 | self.p_pos = None 8 | # physical velocity 9 | self.p_vel = None 10 | 11 | # state of agents (including communication and internal/mental state) 12 | class AgentState(EntityState): 13 | def __init__(self): 14 | super(AgentState, self).__init__() 15 | # communication utterance 16 | self.c = None 17 | 18 | # action of the agent 19 | class Action(object): 20 | def __init__(self): 21 | # physical action 22 | self.u = None 23 | # communication action 24 | self.c = None 25 | 26 | # properties and state of physical world entity 27 | class Entity(object): 28 | def __init__(self): 29 | # name 30 | self.name = '' 31 | # properties: 32 | self.size = 0.050 33 | # entity can move / be pushed 34 | self.movable = False 35 | # entity collides with others 36 | self.collide = True 37 | # material density (affects mass) 38 | self.density = 25.0 39 | # color 40 | self.color = None 41 | # max speed and accel 42 | self.max_speed = None 43 | self.accel = None 44 | # state 45 | self.state = EntityState() 46 | # mass 47 | self.initial_mass = 1.0 48 | 49 | @property 50 | def mass(self): 51 | return self.initial_mass 52 | 53 | # properties of landmark entities 54 | class Landmark(Entity): 55 | def __init__(self): 56 | super(Landmark, self).__init__() 57 | 58 | # properties of agent entities 59 | class Agent(Entity): 60 | def __init__(self): 61 | super(Agent, self).__init__() 62 | # agents are movable by default 63 | self.movable = True 64 | # cannot send communication signals 65 | self.silent = False 66 | # cannot observe the world 67 | self.blind = False 68 | # physical motor noise amount 69 | self.u_noise = None 70 | # communication noise amount 71 | self.c_noise = None 72 | # control range 73 | self.u_range = 1.0 74 | # state 75 | self.state = AgentState() 76 | # action 77 | self.action = Action() 78 | # script behavior to execute 79 | self.action_callback = None 80 | 81 | # multi-agent world 82 | class World(object): 83 | def __init__(self): 84 | # list of agents and entities (can change at execution-time!) 
85 | self.agents = [] 86 | self.landmarks = [] 87 | # communication channel dimensionality 88 | self.dim_c = 0 89 | # position dimensionality 90 | self.dim_p = 2 91 | # color dimensionality 92 | self.dim_color = 3 93 | # simulation timestep 94 | self.dt = 0.1 95 | # physical damping 96 | self.damping = 0.25 97 | # contact response parameters 98 | self.contact_force = 1e+2 99 | self.contact_margin = 1e-3 100 | 101 | # return all entities in the world 102 | @property 103 | def entities(self): 104 | return self.agents + self.landmarks 105 | 106 | # return all agents controllable by external policies 107 | @property 108 | def policy_agents(self): 109 | return [agent for agent in self.agents if agent.action_callback is None] 110 | 111 | # return all agents controlled by world scripts 112 | @property 113 | def scripted_agents(self): 114 | return [agent for agent in self.agents if agent.action_callback is not None] 115 | 116 | # update state of the world 117 | def step(self): 118 | # set actions for scripted agents 119 | for agent in self.scripted_agents: 120 | agent.action = agent.action_callback(agent, self) 121 | # gather forces applied to entities 122 | p_force = [None] * len(self.entities) 123 | # apply agent physical controls 124 | p_force = self.apply_action_force(p_force) 125 | # apply environment forces 126 | p_force = self.apply_environment_force(p_force) 127 | # integrate physical state 128 | self.integrate_state(p_force) 129 | # update agent state 130 | for agent in self.agents: 131 | self.update_agent_state(agent) 132 | 133 | # gather agent action forces 134 | def apply_action_force(self, p_force): 135 | # set applied forces 136 | for i,agent in enumerate(self.agents): 137 | if agent.movable: 138 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 139 | p_force[i] = agent.action.u + noise 140 | return p_force 141 | 142 | # gather physical forces acting on entities 143 | def apply_environment_force(self, p_force): 144 | # simple (but inefficient) collision response 145 | for a,entity_a in enumerate(self.entities): 146 | for b,entity_b in enumerate(self.entities): 147 | if(b <= a): continue 148 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b) 149 | if(f_a is not None): 150 | if(p_force[a] is None): p_force[a] = 0.0 151 | p_force[a] = f_a + p_force[a] 152 | if(f_b is not None): 153 | if(p_force[b] is None): p_force[b] = 0.0 154 | p_force[b] = f_b + p_force[b] 155 | return p_force 156 | 157 | # integrate physical state 158 | def integrate_state(self, p_force): 159 | for i,entity in enumerate(self.entities): 160 | if not entity.movable: continue 161 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 162 | if (p_force[i] is not None): 163 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 164 | if entity.max_speed is not None: 165 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 166 | if speed > entity.max_speed: 167 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 168 | np.square(entity.state.p_vel[1])) * entity.max_speed 169 | entity.state.p_pos += entity.state.p_vel * self.dt 170 | 171 | def update_agent_state(self, agent): 172 | # set communication state (directly for now) 173 | if agent.silent: 174 | agent.state.c = np.zeros(self.dim_c) 175 | else: 176 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 177 | agent.state.c = agent.action.c + noise 178 | 179 | # get collision forces for any contact 
between two entities 180 | def get_collision_force(self, entity_a, entity_b): 181 | if (not entity_a.collide) or (not entity_b.collide): 182 | return [None, None] # not a collider 183 | if (entity_a is entity_b): 184 | return [None, None] # don't collide against itself 185 | # compute actual distance between entities 186 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 187 | dist = np.sqrt(np.sum(np.square(delta_pos))) 188 | # minimum allowable distance 189 | dist_min = entity_a.size + entity_b.size 190 | # softmax penetration 191 | k = self.contact_margin 192 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 193 | force = self.contact_force * delta_pos / dist * penetration 194 | force_a = +force if entity_a.movable else None 195 | force_b = -force if entity_b.movable else None 196 | return [force_a, force_b] -------------------------------------------------------------------------------- /matd3/multiagent/environment.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.envs.registration import EnvSpec 4 | import numpy as np 5 | from multiagent.multi_discrete import MultiDiscrete 6 | 7 | # environment for all agents in the multiagent world 8 | # currently code assumes that no agents will be created/destroyed at runtime! 9 | class MultiAgentEnv(gym.Env): 10 | metadata = { 11 | 'render.modes' : ['human', 'rgb_array'] 12 | } 13 | 14 | def __init__(self, world, reset_callback=None, reward_callback=None, 15 | observation_callback=None, info_callback=None, 16 | done_callback=None, shared_viewer=True): 17 | 18 | self.world = world 19 | self.agents = self.world.policy_agents 20 | # set required vectorized gym env property 21 | self.n = len(world.policy_agents) 22 | # scenario callbacks 23 | self.reset_callback = reset_callback 24 | self.reward_callback = reward_callback 25 | self.observation_callback = observation_callback 26 | self.info_callback = info_callback 27 | self.done_callback = done_callback 28 | # environment parameters 29 | self.discrete_action_space = True 30 | # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector 31 | self.discrete_action_input = False 32 | # if true, even the action is continuous, action will be performed discretely 33 | self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False 34 | # if true, every agent has the same reward 35 | self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False 36 | self.time = 0 37 | 38 | # configure spaces 39 | self.action_space = [] 40 | self.observation_space = [] 41 | for agent in self.agents: 42 | total_action_space = [] 43 | # physical action space 44 | if self.discrete_action_space: 45 | u_action_space = spaces.Discrete(world.dim_p * 2 + 1) 46 | else: 47 | u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32) 48 | if agent.movable: 49 | total_action_space.append(u_action_space) 50 | # communication action space 51 | if self.discrete_action_space: 52 | c_action_space = spaces.Discrete(world.dim_c) 53 | else: 54 | c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32) 55 | if not agent.silent: 56 | total_action_space.append(c_action_space) 57 | # total action space 58 | if len(total_action_space) > 1: 59 | # all action spaces are discrete, so simplify to MultiDiscrete action space 60 | if all([isinstance(act_space, spaces.Discrete) for act_space 
in total_action_space]): 61 | act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space]) 62 | else: 63 | act_space = spaces.Tuple(total_action_space) 64 | self.action_space.append(act_space) 65 | else: 66 | self.action_space.append(total_action_space[0]) 67 | # observation space 68 | obs_dim = len(observation_callback(agent, self.world)) 69 | self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) 70 | agent.action.c = np.zeros(self.world.dim_c) 71 | 72 | # rendering 73 | self.shared_viewer = shared_viewer 74 | if self.shared_viewer: 75 | self.viewers = [None] 76 | else: 77 | self.viewers = [None] * self.n 78 | self._reset_render() 79 | 80 | def step(self, action_n): 81 | obs_n = [] 82 | reward_n = [] 83 | done_n = [] 84 | info_n = {'n': []} 85 | self.agents = self.world.policy_agents 86 | # set action for each agent 87 | for i, agent in enumerate(self.agents): 88 | self._set_action(action_n[i], agent, self.action_space[i]) 89 | # advance world state 90 | self.world.step() 91 | # record observation for each agent 92 | for agent in self.agents: 93 | obs_n.append(self._get_obs(agent)) 94 | reward_n.append(self._get_reward(agent)) 95 | done_n.append(self._get_done(agent)) 96 | 97 | info_n['n'].append(self._get_info(agent)) 98 | 99 | # all agents get total reward in cooperative case 100 | reward = np.sum(reward_n) 101 | if self.shared_reward: 102 | reward_n = [reward] * self.n 103 | 104 | return obs_n, reward_n, done_n, info_n 105 | 106 | def reset(self): 107 | # reset world 108 | self.reset_callback(self.world) 109 | # reset renderer 110 | self._reset_render() 111 | # record observations for each agent 112 | obs_n = [] 113 | self.agents = self.world.policy_agents 114 | for agent in self.agents: 115 | obs_n.append(self._get_obs(agent)) 116 | return obs_n 117 | 118 | # get info used for benchmarking 119 | def _get_info(self, agent): 120 | if self.info_callback is None: 121 | return {} 122 | return self.info_callback(agent, self.world) 123 | 124 | # get observation for a particular agent 125 | def _get_obs(self, agent): 126 | if self.observation_callback is None: 127 | return np.zeros(0) 128 | return self.observation_callback(agent, self.world) 129 | 130 | # get dones for a particular agent 131 | # unused right now -- agents are allowed to go beyond the viewing screen 132 | def _get_done(self, agent): 133 | if self.done_callback is None: 134 | return False 135 | return self.done_callback(agent, self.world) 136 | 137 | # get reward for a particular agent 138 | def _get_reward(self, agent): 139 | if self.reward_callback is None: 140 | return 0.0 141 | return self.reward_callback(agent, self.world) 142 | 143 | # set env action for a particular agent 144 | def _set_action(self, action, agent, action_space, time=None): 145 | agent.action.u = np.zeros(self.world.dim_p) 146 | agent.action.c = np.zeros(self.world.dim_c) 147 | # process action 148 | if isinstance(action_space, MultiDiscrete): 149 | act = [] 150 | size = action_space.high - action_space.low + 1 151 | index = 0 152 | for s in size: 153 | act.append(action[index:(index+s)]) 154 | index += s 155 | action = act 156 | else: 157 | action = [action] 158 | 159 | if agent.movable: 160 | # physical action 161 | if self.discrete_action_input: 162 | agent.action.u = np.zeros(self.world.dim_p) 163 | # process discrete action 164 | if action[0] == 1: agent.action.u[0] = -1.0 165 | if action[0] == 2: agent.action.u[0] = +1.0 166 | if action[0] == 3: agent.action.u[1] = -1.0 167 
| if action[0] == 4: agent.action.u[1] = +1.0 168 | else: 169 | if self.force_discrete_action: 170 | d = np.argmax(action[0]) 171 | action[0][:] = 0.0 172 | action[0][d] = 1.0 173 | if self.discrete_action_space: 174 | agent.action.u[0] += action[0][1] - action[0][2] 175 | agent.action.u[1] += action[0][3] - action[0][4] 176 | else: 177 | agent.action.u = action[0] 178 | sensitivity = 5.0 179 | if agent.accel is not None: 180 | sensitivity = agent.accel 181 | agent.action.u *= sensitivity 182 | action = action[1:] 183 | if not agent.silent: 184 | # communication action 185 | if self.discrete_action_input: 186 | agent.action.c = np.zeros(self.world.dim_c) 187 | agent.action.c[action[0]] = 1.0 188 | else: 189 | agent.action.c = action[0] 190 | action = action[1:] 191 | # make sure we used all elements of action 192 | assert len(action) == 0 193 | 194 | # reset rendering assets 195 | def _reset_render(self): 196 | self.render_geoms = None 197 | self.render_geoms_xform = None 198 | 199 | # render environment 200 | def render(self, mode='human'): 201 | if mode == 'human': 202 | alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 203 | message = '' 204 | for agent in self.world.agents: 205 | comm = [] 206 | for other in self.world.agents: 207 | if other is agent: continue 208 | if np.all(other.state.c == 0): 209 | word = '_' 210 | else: 211 | word = alphabet[np.argmax(other.state.c)] 212 | message += (other.name + ' to ' + agent.name + ': ' + word + ' ') 213 | print(message) 214 | 215 | for i in range(len(self.viewers)): 216 | # create viewers (if necessary) 217 | if self.viewers[i] is None: 218 | # import rendering only if we need it (and don't import for headless machines) 219 | #from gym.envs.classic_control import rendering 220 | from multiagent import rendering 221 | self.viewers[i] = rendering.Viewer(700,700) 222 | 223 | # create rendering geometry 224 | if self.render_geoms is None: 225 | # import rendering only if we need it (and don't import for headless machines) 226 | #from gym.envs.classic_control import rendering 227 | from multiagent import rendering 228 | self.render_geoms = [] 229 | self.render_geoms_xform = [] 230 | for entity in self.world.entities: 231 | geom = rendering.make_circle(entity.size) 232 | xform = rendering.Transform() 233 | if 'agent' in entity.name: 234 | geom.set_color(*entity.color) 235 | else: 236 | geom.set_color(*entity.color) 237 | geom.add_attr(xform) 238 | self.render_geoms.append(geom) 239 | self.render_geoms_xform.append(xform) 240 | 241 | # add geoms to viewer 242 | for viewer in self.viewers: 243 | viewer.geoms = [] 244 | for geom in self.render_geoms: 245 | viewer.add_geom(geom) 246 | 247 | results = [] 248 | for i in range(len(self.viewers)): 249 | from multiagent import rendering 250 | # update bounds to center around agent 251 | cam_range = 1 252 | if self.shared_viewer: 253 | pos = np.zeros(self.world.dim_p) 254 | else: 255 | pos = self.agents[i].state.p_pos 256 | self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range) 257 | # update geometry positions 258 | for e, entity in enumerate(self.world.entities): 259 | self.render_geoms_xform[e].set_translation(*entity.state.p_pos) 260 | # render to display or array 261 | results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array')) 262 | 263 | return results 264 | 265 | # create receptor field locations in local coordinate frame 266 | def _make_receptor_locations(self, agent): 267 | receptor_type = 'polar' 268 | range_min = 0.05 * 2.0 269 | range_max = 1.00 
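        # dx collects 2-D offsets (in the agent's local frame) at which receptors are
        # placed: in 'polar' mode, 8 angles x 3 distances plus the origin (25 points);
        # in 'grid' mode, a 5 x 5 lattice spanning [-range_max, +range_max].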
270 | dx = [] 271 | # circular receptive field 272 | if receptor_type == 'polar': 273 | for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False): 274 | for distance in np.linspace(range_min, range_max, 3): 275 | dx.append(distance * np.array([np.cos(angle), np.sin(angle)])) 276 | # add origin 277 | dx.append(np.array([0.0, 0.0])) 278 | # grid receptive field 279 | if receptor_type == 'grid': 280 | for x in np.linspace(-range_max, +range_max, 5): 281 | for y in np.linspace(-range_max, +range_max, 5): 282 | dx.append(np.array([x,y])) 283 | return dx 284 | 285 | 286 | # vectorized wrapper for a batch of multi-agent environments 287 | # assumes all environments have the same observation and action space 288 | class BatchMultiAgentEnv(gym.Env): 289 | metadata = { 290 | 'runtime.vectorized': True, 291 | 'render.modes' : ['human', 'rgb_array'] 292 | } 293 | 294 | def __init__(self, env_batch): 295 | self.env_batch = env_batch 296 | 297 | @property 298 | def n(self): 299 | return np.sum([env.n for env in self.env_batch]) 300 | 301 | @property 302 | def action_space(self): 303 | return self.env_batch[0].action_space 304 | 305 | @property 306 | def observation_space(self): 307 | return self.env_batch[0].observation_space 308 | 309 | def step(self, action_n, time): 310 | obs_n = [] 311 | reward_n = [] 312 | done_n = [] 313 | info_n = {'n': []} 314 | i = 0 315 | for env in self.env_batch: 316 | obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time) 317 | i += env.n 318 | obs_n += obs 319 | # reward = [r / len(self.env_batch) for r in reward] 320 | reward_n += reward 321 | done_n += done 322 | return obs_n, reward_n, done_n, info_n 323 | 324 | def reset(self): 325 | obs_n = [] 326 | for env in self.env_batch: 327 | obs_n += env.reset() 328 | return obs_n 329 | 330 | # render environment 331 | def render(self, mode='human', close=True): 332 | results_n = [] 333 | for env in self.env_batch: 334 | results_n += env.render(mode, close) 335 | return results_n 336 | -------------------------------------------------------------------------------- /matd3/multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. 
Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | random_array = prng.np_random.rand(self.num_discrete_space) 34 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 35 | def contains(self, x): 36 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 37 | 38 | @property 39 | def shape(self): 40 | return self.num_discrete_space 41 | def __repr__(self): 42 | return "MultiDiscrete" + str(self.num_discrete_space) 43 | def __eq__(self, other): 44 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /matd3/multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.LEFT: self.move[0] = True 45 | if k==key.RIGHT: self.move[1] = True 46 | if k==key.UP: self.move[2] = True 47 | if k==key.DOWN: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.LEFT: self.move[0] = False 50 | if k==key.RIGHT: self.move[1] = False 51 | if k==key.UP: self.move[2] = False 52 | if k==key.DOWN: self.move[3] = False 53 | -------------------------------------------------------------------------------- /matd3/multiagent/rendering.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | def get_display(spec): 33 | """Convert a display specification (such as :0) into an actual Display 34 | object. 35 | 36 | Pyglet only supports multiple Displays on Linux. 37 | """ 38 | if spec is None: 39 | return None 40 | elif isinstance(spec, six.string_types): 41 | return pyglet.canvas.Display(spec) 42 | else: 43 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 44 | 45 | class Viewer(object): 46 | def __init__(self, width, height, display=None): 47 | display = get_display(display) 48 | 49 | self.width = width 50 | self.height = height 51 | 52 | self.window = pyglet.window.Window(width=width, height=height, display=display) 53 | self.window.on_close = self.window_closed_by_user 54 | self.geoms = [] 55 | self.onetime_geoms = [] 56 | self.transform = Transform() 57 | 58 | glEnable(GL_BLEND) 59 | # glEnable(GL_MULTISAMPLE) 60 | glEnable(GL_LINE_SMOOTH) 61 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 62 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 63 | glLineWidth(2.0) 64 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 65 | 66 | def close(self): 67 | self.window.close() 68 | 69 | def window_closed_by_user(self): 70 | self.close() 71 | 72 | def set_bounds(self, left, right, bottom, top): 73 | assert right > left and top > bottom 74 | scalex = self.width/(right-left) 75 | scaley = self.height/(top-bottom) 76 | self.transform = Transform( 77 | translation=(-left*scalex, -bottom*scaley), 78 | scale=(scalex, scaley)) 79 | 80 | def add_geom(self, geom): 81 | self.geoms.append(geom) 82 | 83 | def add_onetime(self, geom): 84 | self.onetime_geoms.append(geom) 85 | 86 | def render(self, return_rgb_array=False): 87 | glClearColor(1,1,1,1) 88 | self.window.clear() 89 | self.window.switch_to() 90 | self.window.dispatch_events() 91 | self.transform.enable() 92 | for geom in self.geoms: 93 | geom.render() 94 | for geom in self.onetime_geoms: 95 | geom.render() 96 | self.transform.disable() 97 | arr = None 98 | if return_rgb_array: 99 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 100 | image_data = buffer.get_image_data() 101 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 102 | # In https://github.com/openai/gym-http-api/issues/2, we 103 | # discovered 
that someone using Xmonad on Arch was having 104 | # a window of size 598 x 398, though a 600 x 400 window 105 | # was requested. (Guess Xmonad was preserving a pixel for 106 | # the boundary.) So we use the buffer height/width rather 107 | # than the requested one. 108 | arr = arr.reshape(buffer.height, buffer.width, 4) 109 | arr = arr[::-1,:,0:3] 110 | self.window.flip() 111 | self.onetime_geoms = [] 112 | return arr 113 | 114 | # Convenience 115 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 116 | geom = make_circle(radius=radius, res=res, filled=filled) 117 | _add_attrs(geom, attrs) 118 | self.add_onetime(geom) 119 | return geom 120 | 121 | def draw_polygon(self, v, filled=True, **attrs): 122 | geom = make_polygon(v=v, filled=filled) 123 | _add_attrs(geom, attrs) 124 | self.add_onetime(geom) 125 | return geom 126 | 127 | def draw_polyline(self, v, **attrs): 128 | geom = make_polyline(v=v) 129 | _add_attrs(geom, attrs) 130 | self.add_onetime(geom) 131 | return geom 132 | 133 | def draw_line(self, start, end, **attrs): 134 | geom = Line(start, end) 135 | _add_attrs(geom, attrs) 136 | self.add_onetime(geom) 137 | return geom 138 | 139 | def get_array(self): 140 | self.window.flip() 141 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 142 | self.window.flip() 143 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 144 | arr = arr.reshape(self.height, self.width, 4) 145 | return arr[::-1,:,0:3] 146 | 147 | def _add_attrs(geom, attrs): 148 | if "color" in attrs: 149 | geom.set_color(*attrs["color"]) 150 | if "linewidth" in attrs: 151 | geom.set_linewidth(attrs["linewidth"]) 152 | 153 | class Geom(object): 154 | def __init__(self): 155 | self._color=Color((0, 0, 0, 1.0)) 156 | self.attrs = [self._color] 157 | def render(self): 158 | for attr in reversed(self.attrs): 159 | attr.enable() 160 | self.render1() 161 | for attr in self.attrs: 162 | attr.disable() 163 | def render1(self): 164 | raise NotImplementedError 165 | def add_attr(self, attr): 166 | self.attrs.append(attr) 167 | def set_color(self, r, g, b, alpha=1): 168 | self._color.vec4 = (r, g, b, alpha) 169 | 170 | class Attr(object): 171 | def enable(self): 172 | raise NotImplementedError 173 | def disable(self): 174 | pass 175 | 176 | class Transform(Attr): 177 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 178 | self.set_translation(*translation) 179 | self.set_rotation(rotation) 180 | self.set_scale(*scale) 181 | def enable(self): 182 | glPushMatrix() 183 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 184 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 185 | glScalef(self.scale[0], self.scale[1], 1) 186 | def disable(self): 187 | glPopMatrix() 188 | def set_translation(self, newx, newy): 189 | self.translation = (float(newx), float(newy)) 190 | def set_rotation(self, new): 191 | self.rotation = float(new) 192 | def set_scale(self, newx, newy): 193 | self.scale = (float(newx), float(newy)) 194 | 195 | class Color(Attr): 196 | def __init__(self, vec4): 197 | self.vec4 = vec4 198 | def enable(self): 199 | glColor4f(*self.vec4) 200 | 201 | class LineStyle(Attr): 202 | def __init__(self, style): 203 | self.style = style 204 | def enable(self): 205 | glEnable(GL_LINE_STIPPLE) 206 | glLineStipple(1, self.style) 207 | def disable(self): 208 | glDisable(GL_LINE_STIPPLE) 209 | 210 | class LineWidth(Attr): 211 | def __init__(self, stroke): 212 | self.stroke = stroke 213 | def enable(self): 214 | 
glLineWidth(self.stroke) 215 | 216 | class Point(Geom): 217 | def __init__(self): 218 | Geom.__init__(self) 219 | def render1(self): 220 | glBegin(GL_POINTS) # draw point 221 | glVertex3f(0.0, 0.0, 0.0) 222 | glEnd() 223 | 224 | class FilledPolygon(Geom): 225 | def __init__(self, v): 226 | Geom.__init__(self) 227 | self.v = v 228 | def render1(self): 229 | if len(self.v) == 4 : glBegin(GL_QUADS) 230 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 231 | else: glBegin(GL_TRIANGLES) 232 | for p in self.v: 233 | glVertex3f(p[0], p[1],0) # draw each vertex 234 | glEnd() 235 | 236 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 237 | glColor4f(*color) 238 | glBegin(GL_LINE_LOOP) 239 | for p in self.v: 240 | glVertex3f(p[0], p[1],0) # draw each vertex 241 | glEnd() 242 | 243 | def make_circle(radius=10, res=30, filled=True): 244 | points = [] 245 | for i in range(res): 246 | ang = 2*math.pi*i / res 247 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 248 | if filled: 249 | return FilledPolygon(points) 250 | else: 251 | return PolyLine(points, True) 252 | 253 | def make_polygon(v, filled=True): 254 | if filled: return FilledPolygon(v) 255 | else: return PolyLine(v, True) 256 | 257 | def make_polyline(v): 258 | return PolyLine(v, False) 259 | 260 | def make_capsule(length, width): 261 | l, r, t, b = 0, length, width/2, -width/2 262 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 263 | circ0 = make_circle(width/2) 264 | circ1 = make_circle(width/2) 265 | circ1.add_attr(Transform(translation=(length, 0))) 266 | geom = Compound([box, circ0, circ1]) 267 | return geom 268 | 269 | class Compound(Geom): 270 | def __init__(self, gs): 271 | Geom.__init__(self) 272 | self.gs = gs 273 | for g in self.gs: 274 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 275 | def render1(self): 276 | for g in self.gs: 277 | g.render() 278 | 279 | class PolyLine(Geom): 280 | def __init__(self, v, close): 281 | Geom.__init__(self) 282 | self.v = v 283 | self.close = close 284 | self.linewidth = LineWidth(1) 285 | self.add_attr(self.linewidth) 286 | def render1(self): 287 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 288 | for p in self.v: 289 | glVertex3f(p[0], p[1],0) # draw each vertex 290 | glEnd() 291 | def set_linewidth(self, x): 292 | self.linewidth.stroke = x 293 | 294 | class Line(Geom): 295 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 296 | Geom.__init__(self) 297 | self.start = start 298 | self.end = end 299 | self.linewidth = LineWidth(1) 300 | self.add_attr(self.linewidth) 301 | 302 | def render1(self): 303 | glBegin(GL_LINES) 304 | glVertex2f(*self.start) 305 | glVertex2f(*self.end) 306 | glEnd() 307 | 308 | class Image(Geom): 309 | def __init__(self, fname, width, height): 310 | Geom.__init__(self) 311 | self.width = width 312 | self.height = height 313 | img = pyglet.image.load(fname) 314 | self.img = img 315 | self.flip = False 316 | def render1(self): 317 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 318 | 319 | # ================================================================ 320 | 321 | class SimpleImageViewer(object): 322 | def __init__(self, display=None): 323 | self.window = None 324 | self.isopen = False 325 | self.display = display 326 | def imshow(self, arr): 327 | if self.window is None: 328 | height, width, channels = arr.shape 329 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 330 | self.width = width 
331 | self.height = height 332 | self.isopen = True 333 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 334 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 335 | self.window.clear() 336 | self.window.switch_to() 337 | self.window.dispatch_events() 338 | image.blit(0,0) 339 | self.window.flip() 340 | def close(self): 341 | if self.isopen: 342 | self.window.close() 343 | self.isopen = False 344 | def __del__(self): 345 | self.close() -------------------------------------------------------------------------------- /matd3/multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # add agents 9 | world.agents = [Agent() for i in range(1)] 10 | for i, agent in enumerate(world.agents): 11 | agent.name = 'agent %d' % i 12 | agent.collide = False 13 | agent.silent = True 14 | # add landmarks 15 | world.landmarks = [Landmark() for i in range(1)] 16 | for i, landmark in enumerate(world.landmarks): 17 | landmark.name = 'landmark %d' % i 18 | landmark.collide = False 19 | landmark.movable = False 20 | # make initial conditions 21 | self.reset_world(world) 22 | return world 23 | 24 | def reset_world(self, world): 25 | # random properties for agents 26 | for i, agent in enumerate(world.agents): 27 | agent.color = np.array([0.25,0.25,0.25]) 28 | # random properties for landmarks 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.color = np.array([0.75,0.75,0.75]) 31 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 32 | # set random initial states 33 | for agent in world.agents: 34 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 35 | agent.state.p_vel = np.zeros(world.dim_p) 36 | agent.state.c = np.zeros(world.dim_c) 37 | for i, landmark in enumerate(world.landmarks): 38 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 39 | landmark.state.p_vel = np.zeros(world.dim_p) 40 | 41 | def reward(self, agent, world): 42 | dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) 43 | return -dist2 44 | 45 | def observation(self, agent, world): 46 | # get positions of all entities in this agent's reference frame 47 | entity_pos = [] 48 | for entity in world.landmarks: 49 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 50 | return np.concatenate([agent.state.p_vel] + entity_pos) 51 | 
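The scenario files above and below are only definitions; they do nothing on their own. The following is a minimal usage sketch, assuming the same wiring that train.py presumably uses: a scenario module is loaded by file name, builds a World, and its bound methods become the callbacks of MultiAgentEnv. The choice of 'simple_spread.py' and the random one-hot action are purely illustrative.

import numpy as np
import multiagent.scenarios as scenarios
from multiagent.environment import MultiAgentEnv

# load a scenario module by file name and build its world
scenario = scenarios.load('simple_spread.py').Scenario()
world = scenario.make_world()
# the scenario's bound methods serve as the environment callbacks
env = MultiAgentEnv(world,
                    reset_callback=scenario.reset_world,
                    reward_callback=scenario.reward,
                    observation_callback=scenario.observation)
obs_n = env.reset()
# with the default discrete action space, each agent expects a 5-d
# one-hot-like vector (indices: no-op, +x, -x, +y, -y); sample one at random
act_n = [np.eye(space.n)[np.random.randint(space.n)] for space in env.action_space]
obs_n, rew_n, done_n, info_n = env.step(act_n)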
-------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_adversary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | 8 | def make_world(self): 9 | world = World() 10 | # set any world properties first 11 | world.dim_c = 2 12 | num_agents = 3 13 | world.num_agents = num_agents 14 | num_adversaries = 1 15 | num_landmarks = num_agents - 1 16 | # add agents 17 | world.agents = [Agent() for i in range(num_agents)] 18 | for i, agent in enumerate(world.agents): 19 | agent.name = 'agent %d' % i 20 | agent.collide = False 21 | agent.silent = True 22 | agent.adversary = True if i < num_adversaries else False 23 | agent.size = 0.15 24 | # add landmarks 25 | world.landmarks = [Landmark() for i in range(num_landmarks)] 26 | for i, landmark in enumerate(world.landmarks): 27 | landmark.name = 'landmark %d' % i 28 | landmark.collide = False 29 | landmark.movable = False 30 | landmark.size = 0.08 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 38 | for i in range(1, world.num_agents): 39 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 40 | # random properties for landmarks 41 | for i, landmark in enumerate(world.landmarks): 42 | landmark.color = np.array([0.15, 0.15, 0.15]) 43 | # set goal landmark 44 | goal = np.random.choice(world.landmarks) 45 | goal.color = np.array([0.15, 0.65, 0.15]) 46 | for agent in world.agents: 47 | agent.goal_a = goal 48 | # set random initial states 49 | for agent in world.agents: 50 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 51 | agent.state.p_vel = np.zeros(world.dim_p) 52 | agent.state.c = np.zeros(world.dim_c) 53 | for i, landmark in enumerate(world.landmarks): 54 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 55 | landmark.state.p_vel = np.zeros(world.dim_p) 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 61 | else: 62 | dists = [] 63 | for l in world.landmarks: 64 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 65 | dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 66 | return tuple(dists) 67 | 68 | # return all agents that are not adversaries 69 | def good_agents(self, world): 70 | return [agent for agent in world.agents if not agent.adversary] 71 | 72 | # return all adversarial agents 73 | def adversaries(self, world): 74 | return [agent for agent in world.agents if agent.adversary] 75 | 76 | def reward(self, agent, world): 77 | # Agents are rewarded based on minimum agent distance to each landmark 78 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 79 | 80 | def agent_reward(self, agent, world): 81 | # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it 82 | shaped_reward = True 83 | shaped_adv_reward = True 84 | 85 | # Calculate negative reward for adversary 86 | adversary_agents = self.adversaries(world) 87 | if shaped_adv_reward: # distance-based adversary reward 88 | adv_rew = 
sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) 89 | else: # proximity-based adversary reward (binary) 90 | adv_rew = 0 91 | for a in adversary_agents: 92 | if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: 93 | adv_rew -= 5 94 | 95 | # Calculate positive reward for agents 96 | good_agents = self.good_agents(world) 97 | if shaped_reward: # distance-based agent reward 98 | pos_rew = -min( 99 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 100 | else: # proximity-based agent reward (binary) 101 | pos_rew = 0 102 | if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ 103 | < 2 * agent.goal_a.size: 104 | pos_rew += 5 105 | pos_rew -= min( 106 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 107 | return pos_rew + adv_rew 108 | 109 | def adversary_reward(self, agent, world): 110 | # Rewarded based on proximity to the goal landmark 111 | shaped_reward = True 112 | if shaped_reward: # distance-based reward 113 | return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 114 | else: # proximity-based reward (binary) 115 | adv_rew = 0 116 | if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: 117 | adv_rew += 5 118 | return adv_rew 119 | 120 | 121 | def observation(self, agent, world): 122 | # get positions of all entities in this agent's reference frame 123 | entity_pos = [] 124 | for entity in world.landmarks: 125 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 126 | # entity colors 127 | entity_color = [] 128 | for entity in world.landmarks: 129 | entity_color.append(entity.color) 130 | # communication of all other agents 131 | other_pos = [] 132 | for other in world.agents: 133 | if other is agent: continue 134 | other_pos.append(other.state.p_pos - agent.state.p_pos) 135 | 136 | if not agent.adversary: 137 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) 138 | else: 139 | return np.concatenate(entity_pos + other_pos) 140 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_crypto.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scenario: 3 | 1 speaker, 2 listeners (one of which is an adversary). Good agents rewarded for proximity to goal, and distance from 4 | adversary to goal. Adversary is rewarded for its distance to the goal. 
5 | """ 6 | 7 | 8 | import numpy as np 9 | from multiagent.core import World, Agent, Landmark 10 | from multiagent.scenario import BaseScenario 11 | import random 12 | 13 | 14 | class CryptoAgent(Agent): 15 | def __init__(self): 16 | super(CryptoAgent, self).__init__() 17 | self.key = None 18 | 19 | class Scenario(BaseScenario): 20 | 21 | def make_world(self): 22 | world = World() 23 | # set any world properties first 24 | num_agents = 3 25 | num_adversaries = 1 26 | num_landmarks = 2 27 | world.dim_c = 4 28 | # add agents 29 | world.agents = [CryptoAgent() for i in range(num_agents)] 30 | for i, agent in enumerate(world.agents): 31 | agent.name = 'agent %d' % i 32 | agent.collide = False 33 | agent.adversary = True if i < num_adversaries else False 34 | agent.speaker = True if i == 2 else False 35 | agent.movable = False 36 | # add landmarks 37 | world.landmarks = [Landmark() for i in range(num_landmarks)] 38 | for i, landmark in enumerate(world.landmarks): 39 | landmark.name = 'landmark %d' % i 40 | landmark.collide = False 41 | landmark.movable = False 42 | # make initial conditions 43 | self.reset_world(world) 44 | return world 45 | 46 | 47 | def reset_world(self, world): 48 | # random properties for agents 49 | for i, agent in enumerate(world.agents): 50 | agent.color = np.array([0.25, 0.25, 0.25]) 51 | if agent.adversary: 52 | agent.color = np.array([0.75, 0.25, 0.25]) 53 | agent.key = None 54 | # random properties for landmarks 55 | color_list = [np.zeros(world.dim_c) for i in world.landmarks] 56 | for i, color in enumerate(color_list): 57 | color[i] += 1 58 | for color, landmark in zip(color_list, world.landmarks): 59 | landmark.color = color 60 | # set goal landmark 61 | goal = np.random.choice(world.landmarks) 62 | world.agents[1].color = goal.color 63 | world.agents[2].key = np.random.choice(world.landmarks).color 64 | 65 | for agent in world.agents: 66 | agent.goal_a = goal 67 | 68 | # set random initial states 69 | for agent in world.agents: 70 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 71 | agent.state.p_vel = np.zeros(world.dim_p) 72 | agent.state.c = np.zeros(world.dim_c) 73 | for i, landmark in enumerate(world.landmarks): 74 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 75 | landmark.state.p_vel = np.zeros(world.dim_p) 76 | 77 | 78 | def benchmark_data(self, agent, world): 79 | # returns data for benchmarking purposes 80 | return (agent.state.c, agent.goal_a.color) 81 | 82 | # return all agents that are not adversaries 83 | def good_listeners(self, world): 84 | return [agent for agent in world.agents if not agent.adversary and not agent.speaker] 85 | 86 | # return all agents that are not adversaries 87 | def good_agents(self, world): 88 | return [agent for agent in world.agents if not agent.adversary] 89 | 90 | # return all adversarial agents 91 | def adversaries(self, world): 92 | return [agent for agent in world.agents if agent.adversary] 93 | 94 | def reward(self, agent, world): 95 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 96 | 97 | def agent_reward(self, agent, world): 98 | # Agents rewarded if Bob can reconstruct message, but adversary (Eve) cannot 99 | good_listeners = self.good_listeners(world) 100 | adversaries = self.adversaries(world) 101 | good_rew = 0 102 | adv_rew = 0 103 | for a in good_listeners: 104 | if (a.state.c == np.zeros(world.dim_c)).all(): 105 | continue 106 | else: 107 | good_rew -= np.sum(np.square(a.state.c - agent.goal_a.color)) 108 | for a in 
adversaries: 109 | if (a.state.c == np.zeros(world.dim_c)).all(): 110 | continue 111 | else: 112 | adv_l1 = np.sum(np.square(a.state.c - agent.goal_a.color)) 113 | adv_rew += adv_l1 114 | return adv_rew + good_rew 115 | 116 | def adversary_reward(self, agent, world): 117 | # Adversary (Eve) is rewarded if it can reconstruct original goal 118 | rew = 0 119 | if not (agent.state.c == np.zeros(world.dim_c)).all(): 120 | rew -= np.sum(np.square(agent.state.c - agent.goal_a.color)) 121 | return rew 122 | 123 | 124 | def observation(self, agent, world): 125 | # goal color 126 | goal_color = np.zeros(world.dim_color) 127 | if agent.goal_a is not None: 128 | goal_color = agent.goal_a.color 129 | 130 | # get positions of all entities in this agent's reference frame 131 | entity_pos = [] 132 | for entity in world.landmarks: 133 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 134 | # communication of all other agents 135 | comm = [] 136 | for other in world.agents: 137 | if other is agent or (other.state.c is None) or not other.speaker: continue 138 | comm.append(other.state.c) 139 | 140 | confer = np.array([0]) 141 | 142 | if world.agents[2].key is None: 143 | confer = np.array([1]) 144 | key = np.zeros(world.dim_c) 145 | goal_color = np.zeros(world.dim_c) 146 | else: 147 | key = world.agents[2].key 148 | 149 | prnt = False 150 | # speaker 151 | if agent.speaker: 152 | if prnt: 153 | print('speaker') 154 | print(agent.state.c) 155 | print(np.concatenate([goal_color] + [key] + [confer] + [np.random.randn(1)])) 156 | return np.concatenate([goal_color] + [key]) 157 | # listener 158 | if not agent.speaker and not agent.adversary: 159 | if prnt: 160 | print('listener') 161 | print(agent.state.c) 162 | print(np.concatenate([key] + comm + [confer])) 163 | return np.concatenate([key] + comm) 164 | if not agent.speaker and agent.adversary: 165 | if prnt: 166 | print('adversary') 167 | print(agent.state.c) 168 | print(np.concatenate(comm + [confer])) 169 | return np.concatenate(comm) 170 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_push.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 2 10 | num_agents = 2 11 | num_adversaries = 1 12 | num_landmarks = 2 13 | # add agents 14 | world.agents = [Agent() for i in range(num_agents)] 15 | for i, agent in enumerate(world.agents): 16 | agent.name = 'agent %d' % i 17 | agent.collide = True 18 | agent.silent = True 19 | if i < num_adversaries: 20 | agent.adversary = True 21 | else: 22 | agent.adversary = False 23 | # add landmarks 24 | world.landmarks = [Landmark() for i in range(num_landmarks)] 25 | for i, landmark in enumerate(world.landmarks): 26 | landmark.name = 'landmark %d' % i 27 | landmark.collide = False 28 | landmark.movable = False 29 | # make initial conditions 30 | self.reset_world(world) 31 | return world 32 | 33 | def reset_world(self, world): 34 | # random properties for landmarks 35 | for i, landmark in enumerate(world.landmarks): 36 | landmark.color = np.array([0.1, 0.1, 0.1]) 37 | landmark.color[i + 1] += 0.8 38 | landmark.index = i 39 | # set goal landmark 40 | goal = np.random.choice(world.landmarks) 41 | for i, agent in enumerate(world.agents): 42 | agent.goal_a = 
goal 43 | agent.color = np.array([0.25, 0.25, 0.25]) 44 | if agent.adversary: 45 | agent.color = np.array([0.75, 0.25, 0.25]) 46 | else: 47 | j = goal.index 48 | agent.color[j + 1] += 0.5 49 | # set random initial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | for i, landmark in enumerate(world.landmarks): 55 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 56 | landmark.state.p_vel = np.zeros(world.dim_p) 57 | 58 | def reward(self, agent, world): 59 | # Agents are rewarded based on minimum agent distance to each landmark 60 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 61 | 62 | def agent_reward(self, agent, world): 63 | # the distance to the goal 64 | return -np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 65 | 66 | def adversary_reward(self, agent, world): 67 | # keep the nearest good agents away from the goal 68 | agent_dist = [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in world.agents if not a.adversary] 69 | pos_rew = min(agent_dist) 70 | #nearest_agent = world.good_agents[np.argmin(agent_dist)] 71 | #neg_rew = np.sqrt(np.sum(np.square(nearest_agent.state.p_pos - agent.state.p_pos))) 72 | neg_rew = np.sqrt(np.sum(np.square(agent.goal_a.state.p_pos - agent.state.p_pos))) 73 | #neg_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in world.good_agents]) 74 | return pos_rew - neg_rew 75 | 76 | def observation(self, agent, world): 77 | # get positions of all entities in this agent's reference frame 78 | entity_pos = [] 79 | for entity in world.landmarks: # world.entities: 80 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 81 | # entity colors 82 | entity_color = [] 83 | for entity in world.landmarks: # world.entities: 84 | entity_color.append(entity.color) 85 | # communication of all other agents 86 | comm = [] 87 | other_pos = [] 88 | for other in world.agents: 89 | if other is agent: continue 90 | comm.append(other.state.c) 91 | other_pos.append(other.state.p_pos - agent.state.p_pos) 92 | if not agent.adversary: 93 | return np.concatenate([agent.state.p_vel] + [agent.goal_a.state.p_pos - agent.state.p_pos] + [agent.color] + entity_pos + entity_color + other_pos) 94 | else: 95 | #other_pos = list(reversed(other_pos)) if random.uniform(0,1) > 0.5 else other_pos # randomize position of other agents in adversary network 96 | return np.concatenate([agent.state.p_vel] + entity_pos + other_pos) 97 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_reference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 10 10 | world.collaborative = True # whether agents share rewards 11 | # add agents 12 | world.agents = [Agent() for i in range(2)] 13 | for i, agent in enumerate(world.agents): 14 | agent.name = 'agent %d' % i 15 | agent.collide = False 16 | # add landmarks 17 | world.landmarks = [Landmark() for i in range(3)] 18 | for i, landmark in enumerate(world.landmarks): 19 | landmark.name = 'landmark %d' % i 20 | 
landmark.collide = False 21 | landmark.movable = False 22 | # make initial conditions 23 | self.reset_world(world) 24 | return world 25 | 26 | def reset_world(self, world): 27 | # assign goals to agents 28 | for agent in world.agents: 29 | agent.goal_a = None 30 | agent.goal_b = None 31 | # want other agent to go to the goal landmark 32 | world.agents[0].goal_a = world.agents[1] 33 | world.agents[0].goal_b = np.random.choice(world.landmarks) 34 | world.agents[1].goal_a = world.agents[0] 35 | world.agents[1].goal_b = np.random.choice(world.landmarks) 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.25,0.25,0.25]) 39 | # random properties for landmarks 40 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 41 | world.landmarks[1].color = np.array([0.25,0.75,0.25]) 42 | world.landmarks[2].color = np.array([0.25,0.25,0.75]) 43 | # special colors for goals 44 | world.agents[0].goal_a.color = world.agents[0].goal_b.color 45 | world.agents[1].goal_a.color = world.agents[1].goal_b.color 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 | landmark.state.p_vel = np.zeros(world.dim_p) 54 | 55 | def reward(self, agent, world): 56 | if agent.goal_a is None or agent.goal_b is None: 57 | return 0.0 58 | dist2 = np.sum(np.square(agent.goal_a.state.p_pos - agent.goal_b.state.p_pos)) 59 | return -dist2 60 | 61 | def observation(self, agent, world): 62 | # goal color 63 | goal_color = [np.zeros(world.dim_color), np.zeros(world.dim_color)] 64 | if agent.goal_b is not None: 65 | goal_color[1] = agent.goal_b.color 66 | 67 | # get positions of all entities in this agent's reference frame 68 | entity_pos = [] 69 | for entity in world.landmarks: 70 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 71 | # entity colors 72 | entity_color = [] 73 | for entity in world.landmarks: 74 | entity_color.append(entity.color) 75 | # communication of all other agents 76 | comm = [] 77 | for other in world.agents: 78 | if other is agent: continue 79 | comm.append(other.state.c) 80 | return np.concatenate([agent.state.p_vel] + entity_pos + [goal_color[1]] + comm) 81 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_speaker_listener.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 3 10 | num_landmarks = 3 11 | world.collaborative = True 12 | # add agents 13 | world.agents = [Agent() for i in range(2)] 14 | for i, agent in enumerate(world.agents): 15 | agent.name = 'agent %d' % i 16 | agent.collide = False 17 | agent.size = 0.075 18 | # speaker 19 | world.agents[0].movable = False 20 | # listener 21 | world.agents[1].silent = True 22 | # add landmarks 23 | world.landmarks = [Landmark() for i in range(num_landmarks)] 24 | for i, landmark in enumerate(world.landmarks): 25 | landmark.name = 'landmark %d' % i 26 | landmark.collide = False 27 | landmark.movable = False 28 | landmark.size = 0.04 29 | # make 
initial conditions 30 | self.reset_world(world) 31 | return world 32 | 33 | def reset_world(self, world): 34 | # assign goals to agents 35 | for agent in world.agents: 36 | agent.goal_a = None 37 | agent.goal_b = None 38 | # want listener to go to the goal landmark 39 | world.agents[0].goal_a = world.agents[1] 40 | world.agents[0].goal_b = np.random.choice(world.landmarks) 41 | # random properties for agents 42 | for i, agent in enumerate(world.agents): 43 | agent.color = np.array([0.25,0.25,0.25]) 44 | # random properties for landmarks 45 | world.landmarks[0].color = np.array([0.65,0.15,0.15]) 46 | world.landmarks[1].color = np.array([0.15,0.65,0.15]) 47 | world.landmarks[2].color = np.array([0.15,0.15,0.65]) 48 | # special colors for goals 49 | world.agents[0].goal_a.color = world.agents[0].goal_b.color + np.array([0.45, 0.45, 0.45]) 50 | # set random initial states 51 | for agent in world.agents: 52 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 | agent.state.p_vel = np.zeros(world.dim_p) 54 | agent.state.c = np.zeros(world.dim_c) 55 | for i, landmark in enumerate(world.landmarks): 56 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 57 | landmark.state.p_vel = np.zeros(world.dim_p) 58 | 59 | def benchmark_data(self, agent, world): 60 | # returns data for benchmarking purposes 61 | a = world.agents[0] 62 | distance = np.sqrt(np.square(a.goal_a.state.p_pos)) 63 | return self.reward(agent, world) 64 | 65 | def reward(self, agent, world): 66 | # squared distance from listener to landmark 67 | a = world.agents[0] 68 | dist2 = np.sum(np.square(a.goal_a.state.p_pos - a.goal_b.state.p_pos)) 69 | return -dist2 70 | 71 | def observation(self, agent, world): 72 | # goal color 73 | goal_color = np.zeros(world.dim_color) 74 | if agent.goal_b is not None: 75 | goal_color = agent.goal_b.color 76 | 77 | # get positions of all entities in this agent's reference frame 78 | entity_pos = [] 79 | for entity in world.landmarks: 80 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 81 | 82 | # communication of all other agents 83 | comm = [] 84 | for other in world.agents: 85 | if other is agent or (other.state.c is None): continue 86 | comm.append(other.state.c) 87 | 88 | # speaker 89 | if not agent.movable: 90 | return np.concatenate([goal_color]) 91 | # listener 92 | if agent.silent: 93 | return np.concatenate([agent.state.p_vel] + entity_pos + comm) 94 | 95 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 3 12 | num_landmarks = 3 13 | world.collaborative = True 14 | # add agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = True 19 | agent.silent = True 20 | agent.size = 0.15 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | # make initial conditions 28 | self.reset_world(world) 29 | return world 30 | 31 | def 
reset_world(self, world): 32 | # random properties for agents 33 | for i, agent in enumerate(world.agents): 34 | agent.color = np.array([0.35, 0.35, 0.85]) 35 | # random properties for landmarks 36 | for i, landmark in enumerate(world.landmarks): 37 | landmark.color = np.array([0.25, 0.25, 0.25]) 38 | # set random initial states 39 | for agent in world.agents: 40 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 41 | agent.state.p_vel = np.zeros(world.dim_p) 42 | agent.state.c = np.zeros(world.dim_c) 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 45 | landmark.state.p_vel = np.zeros(world.dim_p) 46 | 47 | def benchmark_data(self, agent, world): 48 | rew = 0 49 | collisions = 0 50 | occupied_landmarks = 0 51 | dists = [np.sqrt(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) for l in world.landmarks] 52 | min_dist = min(dists) 53 | rew -= min(dists) 54 | if min(dists) < 0.1: 55 | occupied_landmarks += 1 56 | if agent.collide: 57 | for a in world.agents: 58 | if self.is_collision(a, agent): 59 | rew -= 1 60 | collisions += 1 61 | return (rew, collisions, min_dist, occupied_landmarks) 62 | 63 | 64 | def is_collision(self, agent1, agent2): 65 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 66 | dist = np.sqrt(np.sum(np.square(delta_pos))) 67 | dist_min = agent1.size + agent2.size 68 | return True if dist < dist_min else False 69 | 70 | def reward(self, agent, world): 71 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 72 | rew = 0 73 | for l in world.landmarks: 74 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 75 | rew -= min(dists) 76 | if agent.collide: 77 | for a in world.agents: 78 | if self.is_collision(a, agent): 79 | rew -= 1 80 | return rew 81 | 82 | def observation(self, agent, world): 83 | # get positions of all entities in this agent's reference frame 84 | entity_pos = [] 85 | for entity in world.landmarks: # world.entities: 86 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 87 | # entity colors 88 | entity_color = [] 89 | for entity in world.landmarks: # world.entities: 90 | entity_color.append(entity.color) 91 | # communication of all other agents 92 | comm = [] 93 | other_pos = [] 94 | for other in world.agents: 95 | if other is agent: continue 96 | comm.append(other.state.c) 97 | other_pos.append(other.state.p_pos - agent.state.p_pos) 98 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 99 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_spread_two_ag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 2 12 | num_landmarks = 2 13 | world.collaborative = True 14 | # add agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = True 19 | agent.silent = True 20 | agent.size = 0.15 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark 
%d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | # make initial conditions 28 | self.reset_world(world) 29 | return world 30 | 31 | def reset_world(self, world): 32 | # random properties for agents 33 | for i, agent in enumerate(world.agents): 34 | agent.color = np.array([0.35, 0.35, 0.85]) 35 | # random properties for landmarks 36 | for i, landmark in enumerate(world.landmarks): 37 | landmark.color = np.array([0.25, 0.25, 0.25]) 38 | # set random initial states 39 | for agent in world.agents: 40 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 41 | agent.state.p_vel = np.zeros(world.dim_p) 42 | agent.state.c = np.zeros(world.dim_c) 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 45 | landmark.state.p_vel = np.zeros(world.dim_p) 46 | 47 | def benchmark_data(self, agent, world): 48 | rew = 0 49 | collisions = 0 50 | occupied_landmarks = 0 51 | min_dists = 0 52 | for l in world.landmarks: 53 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 54 | min_dists += min(dists) 55 | rew -= min(dists) 56 | if min(dists) < 0.1: 57 | occupied_landmarks += 1 58 | if agent.collide: 59 | for a in world.agents: 60 | if self.is_collision(a, agent): 61 | rew -= 1 62 | collisions += 1 63 | return (rew, collisions, min_dists, occupied_landmarks) 64 | 65 | 66 | def is_collision(self, agent1, agent2): 67 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 68 | dist = np.sqrt(np.sum(np.square(delta_pos))) 69 | dist_min = agent1.size + agent2.size 70 | return True if dist < dist_min else False 71 | 72 | def reward(self, agent, world): 73 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 74 | rew = 0 75 | for l in world.landmarks: 76 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 77 | rew -= min(dists) 78 | if agent.collide: 79 | for a in world.agents: 80 | if self.is_collision(a, agent): 81 | rew -= 1 82 | return rew 83 | 84 | def observation(self, agent, world): 85 | # get positions of all entities in this agent's reference frame 86 | entity_pos = [] 87 | for entity in world.landmarks: # world.entities: 88 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 89 | # entity colors 90 | entity_color = [] 91 | for entity in world.landmarks: # world.entities: 92 | entity_color.append(entity.color) 93 | # communication of all other agents 94 | comm = [] 95 | other_pos = [] 96 | for other in world.agents: 97 | if other is agent: continue 98 | comm.append(other.state.c) 99 | other_pos.append(other.state.p_pos - agent.state.p_pos) 100 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 101 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents = 1 12 | num_adversaries = 3 13 | num_agents = num_adversaries + num_good_agents 14 | num_landmarks = 2 15 | # add agents 16 | world.agents = [Agent() for i in range(num_agents)] 17 | for i, agent in enumerate(world.agents): 18 | agent.name = 'agent %d' 
% i 19 | agent.collide = True 20 | agent.silent = True 21 | agent.adversary = True if i < num_adversaries else False 22 | agent.size = 0.075 if agent.adversary else 0.05 23 | agent.accel = 3.0 if agent.adversary else 4.0 24 | #agent.accel = 20.0 if agent.adversary else 25.0 25 | agent.max_speed = 1.0 if agent.adversary else 1.3 26 | # add landmarks 27 | world.landmarks = [Landmark() for i in range(num_landmarks)] 28 | for i, landmark in enumerate(world.landmarks): 29 | landmark.name = 'landmark %d' % i 30 | landmark.collide = True 31 | landmark.movable = False 32 | landmark.size = 0.2 33 | landmark.boundary = False 34 | # make initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | 39 | def reset_world(self, world): 40 | # random properties for agents 41 | for i, agent in enumerate(world.agents): 42 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 43 | # random properties for landmarks 44 | for i, landmark in enumerate(world.landmarks): 45 | landmark.color = np.array([0.25, 0.25, 0.25]) 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | if not landmark.boundary: 53 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 54 | landmark.state.p_vel = np.zeros(world.dim_p) 55 | 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | collisions = 0 61 | for a in self.good_agents(world): 62 | if self.is_collision(a, agent): 63 | collisions += 1 64 | return collisions 65 | else: 66 | return 0 67 | 68 | 69 | def is_collision(self, agent1, agent2): 70 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 71 | dist = np.sqrt(np.sum(np.square(delta_pos))) 72 | dist_min = agent1.size + agent2.size 73 | return True if dist < dist_min else False 74 | 75 | # return all agents that are not adversaries 76 | def good_agents(self, world): 77 | return [agent for agent in world.agents if not agent.adversary] 78 | 79 | # return all adversarial agents 80 | def adversaries(self, world): 81 | return [agent for agent in world.agents if agent.adversary] 82 | 83 | 84 | def reward(self, agent, world): 85 | # Agents are rewarded based on minimum agent distance to each landmark 86 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 87 | return main_reward 88 | 89 | def agent_reward(self, agent, world): 90 | # Agents are negatively rewarded if caught by adversaries 91 | rew = 0 92 | shape = False 93 | adversaries = self.adversaries(world) 94 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary) 95 | for adv in adversaries: 96 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 97 | if agent.collide: 98 | for a in adversaries: 99 | if self.is_collision(a, agent): 100 | rew -= 10 101 | 102 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 103 | def bound(x): 104 | if x < 0.9: 105 | return 0 106 | if x < 1.0: 107 | return (x - 0.9) * 10 108 | return min(np.exp(2 * x - 2), 10) 109 | for p in range(world.dim_p): 110 | x = abs(agent.state.p_pos[p]) 111 | rew -= bound(x) 112 | 113 | return rew 114 | 115 | def adversary_reward(self, agent, world): 116 | # Adversaries are rewarded 
for collisions with agents 117 | rew = 0 118 | shape = False 119 | agents = self.good_agents(world) 120 | adversaries = self.adversaries(world) 121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 122 | for adv in adversaries: 123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 124 | if agent.collide: 125 | for ag in agents: 126 | for adv in adversaries: 127 | if self.is_collision(ag, adv): 128 | rew += 10 129 | return rew 130 | 131 | def observation(self, agent, world): 132 | # get positions of all entities in this agent's reference frame 133 | entity_pos = [] 134 | for entity in world.landmarks: 135 | if not entity.boundary: 136 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 137 | # communication of all other agents 138 | comm = [] 139 | other_pos = [] 140 | other_vel = [] 141 | for other in world.agents: 142 | if other is agent: continue 143 | comm.append(other.state.c) 144 | other_pos.append(other.state.p_pos - agent.state.p_pos) 145 | if not other.adversary: 146 | other_vel.append(other.state.p_vel) 147 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 148 | -------------------------------------------------------------------------------- /matd3/multiagent/scenarios/simple_world_comm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 4 11 | #world.damping = 1 12 | num_good_agents = 2 13 | num_adversaries = 4 14 | num_agents = num_adversaries + num_good_agents 15 | num_landmarks = 1 16 | num_food = 2 17 | num_forests = 2 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.leader = True if i == 0 else False 24 | agent.silent = True if i > 0 else False 25 | agent.adversary = True if i < num_adversaries else False 26 | agent.size = 0.075 if agent.adversary else 0.045 27 | agent.accel = 3.0 if agent.adversary else 4.0 28 | #agent.accel = 20.0 if agent.adversary else 25.0 29 | agent.max_speed = 1.0 if agent.adversary else 1.3 30 | # add landmarks 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmark %d' % i 34 | landmark.collide = True 35 | landmark.movable = False 36 | landmark.size = 0.2 37 | landmark.boundary = False 38 | world.food = [Landmark() for i in range(num_food)] 39 | for i, landmark in enumerate(world.food): 40 | landmark.name = 'food %d' % i 41 | landmark.collide = False 42 | landmark.movable = False 43 | landmark.size = 0.03 44 | landmark.boundary = False 45 | world.forests = [Landmark() for i in range(num_forests)] 46 | for i, landmark in enumerate(world.forests): 47 | landmark.name = 'forest %d' % i 48 | landmark.collide = False 49 | landmark.movable = False 50 | landmark.size = 0.3 51 | landmark.boundary = False 52 | world.landmarks += world.food 53 | world.landmarks += world.forests 54 | #world.landmarks += self.set_boundaries(world) # world boundaries now penalized with negative reward 55 | # make initial conditions 56 | self.reset_world(world) 57 | return world 58 | 59 | def 
set_boundaries(self, world): 60 | boundary_list = [] 61 | landmark_size = 1 62 | edge = 1 + landmark_size 63 | num_landmarks = int(edge * 2 / landmark_size) 64 | for x_pos in [-edge, edge]: 65 | for i in range(num_landmarks): 66 | l = Landmark() 67 | l.state.p_pos = np.array([x_pos, -1 + i * landmark_size]) 68 | boundary_list.append(l) 69 | 70 | for y_pos in [-edge, edge]: 71 | for i in range(num_landmarks): 72 | l = Landmark() 73 | l.state.p_pos = np.array([-1 + i * landmark_size, y_pos]) 74 | boundary_list.append(l) 75 | 76 | for i, l in enumerate(boundary_list): 77 | l.name = 'boundary %d' % i 78 | l.collide = True 79 | l.movable = False 80 | l.boundary = True 81 | l.color = np.array([0.75, 0.75, 0.75]) 82 | l.size = landmark_size 83 | l.state.p_vel = np.zeros(world.dim_p) 84 | 85 | return boundary_list 86 | 87 | 88 | def reset_world(self, world): 89 | # random properties for agents 90 | for i, agent in enumerate(world.agents): 91 | agent.color = np.array([0.45, 0.95, 0.45]) if not agent.adversary else np.array([0.95, 0.45, 0.45]) 92 | agent.color -= np.array([0.3, 0.3, 0.3]) if agent.leader else np.array([0, 0, 0]) 93 | # random properties for landmarks 94 | for i, landmark in enumerate(world.landmarks): 95 | landmark.color = np.array([0.25, 0.25, 0.25]) 96 | for i, landmark in enumerate(world.food): 97 | landmark.color = np.array([0.15, 0.15, 0.65]) 98 | for i, landmark in enumerate(world.forests): 99 | landmark.color = np.array([0.6, 0.9, 0.6]) 100 | # set random initial states 101 | for agent in world.agents: 102 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 103 | agent.state.p_vel = np.zeros(world.dim_p) 104 | agent.state.c = np.zeros(world.dim_c) 105 | for i, landmark in enumerate(world.landmarks): 106 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 107 | landmark.state.p_vel = np.zeros(world.dim_p) 108 | for i, landmark in enumerate(world.food): 109 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 110 | landmark.state.p_vel = np.zeros(world.dim_p) 111 | for i, landmark in enumerate(world.forests): 112 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 113 | landmark.state.p_vel = np.zeros(world.dim_p) 114 | 115 | def benchmark_data(self, agent, world): 116 | if agent.adversary: 117 | collisions = 0 118 | for a in self.good_agents(world): 119 | if self.is_collision(a, agent): 120 | collisions += 1 121 | return collisions 122 | else: 123 | return 0 124 | 125 | 126 | def is_collision(self, agent1, agent2): 127 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 128 | dist = np.sqrt(np.sum(np.square(delta_pos))) 129 | dist_min = agent1.size + agent2.size 130 | return True if dist < dist_min else False 131 | 132 | 133 | # return all agents that are not adversaries 134 | def good_agents(self, world): 135 | return [agent for agent in world.agents if not agent.adversary] 136 | 137 | # return all adversarial agents 138 | def adversaries(self, world): 139 | return [agent for agent in world.agents if agent.adversary] 140 | 141 | 142 | def reward(self, agent, world): 143 | # Agents are rewarded based on minimum agent distance to each landmark 144 | #boundary_reward = -10 if self.outside_boundary(agent) else 0 145 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 146 | return main_reward 147 | 148 | def outside_boundary(self, agent): 149 | if agent.state.p_pos[0] > 1 or agent.state.p_pos[0] < -1 or agent.state.p_pos[1] > 1 or agent.state.p_pos[1] < -1: 150 | 
return True 151 | else: 152 | return False 153 | 154 | 155 | def agent_reward(self, agent, world): 156 | # Agents are rewarded based on minimum agent distance to each landmark 157 | rew = 0 158 | shape = False 159 | adversaries = self.adversaries(world) 160 | if shape: 161 | for adv in adversaries: 162 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 163 | if agent.collide: 164 | for a in adversaries: 165 | if self.is_collision(a, agent): 166 | rew -= 5 167 | def bound(x): 168 | if x < 0.9: 169 | return 0 170 | if x < 1.0: 171 | return (x - 0.9) * 10 172 | return min(np.exp(2 * x - 2), 10) # 1 + (x - 1) * (x - 1) 173 | 174 | for p in range(world.dim_p): 175 | x = abs(agent.state.p_pos[p]) 176 | rew -= 2 * bound(x) 177 | 178 | for food in world.food: 179 | if self.is_collision(agent, food): 180 | rew += 2 181 | rew += 0.05 * min([np.sqrt(np.sum(np.square(food.state.p_pos - agent.state.p_pos))) for food in world.food]) 182 | 183 | return rew 184 | 185 | def adversary_reward(self, agent, world): 186 | # Agents are rewarded based on minimum agent distance to each landmark 187 | rew = 0 188 | shape = True 189 | agents = self.good_agents(world) 190 | adversaries = self.adversaries(world) 191 | if shape: 192 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 193 | if agent.collide: 194 | for ag in agents: 195 | for adv in adversaries: 196 | if self.is_collision(ag, adv): 197 | rew += 5 198 | return rew 199 | 200 | 201 | def observation2(self, agent, world): 202 | # get positions of all entities in this agent's reference frame 203 | entity_pos = [] 204 | for entity in world.landmarks: 205 | if not entity.boundary: 206 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 207 | 208 | food_pos = [] 209 | for entity in world.food: 210 | if not entity.boundary: 211 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 212 | # communication of all other agents 213 | comm = [] 214 | other_pos = [] 215 | other_vel = [] 216 | for other in world.agents: 217 | if other is agent: continue 218 | comm.append(other.state.c) 219 | other_pos.append(other.state.p_pos - agent.state.p_pos) 220 | if not other.adversary: 221 | other_vel.append(other.state.p_vel) 222 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 223 | 224 | def observation(self, agent, world): 225 | # get positions of all entities in this agent's reference frame 226 | entity_pos = [] 227 | for entity in world.landmarks: 228 | if not entity.boundary: 229 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 230 | 231 | in_forest = [np.array([-1]), np.array([-1])] 232 | inf1 = False 233 | inf2 = False 234 | if self.is_collision(agent, world.forests[0]): 235 | in_forest[0] = np.array([1]) 236 | inf1= True 237 | if self.is_collision(agent, world.forests[1]): 238 | in_forest[1] = np.array([1]) 239 | inf2 = True 240 | 241 | food_pos = [] 242 | for entity in world.food: 243 | if not entity.boundary: 244 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 245 | # communication of all other agents 246 | comm = [] 247 | other_pos = [] 248 | other_vel = [] 249 | for other in world.agents: 250 | if other is agent: continue 251 | comm.append(other.state.c) 252 | oth_f1 = self.is_collision(other, world.forests[0]) 253 | oth_f2 = self.is_collision(other, world.forests[1]) 254 | if (inf1 and oth_f1) or (inf2 and oth_f2) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #without forest vis 
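                # Visibility masking: the other agent's true relative position (and, for
                # non-adversaries, its velocity) is only observed when both agents share
                # forest status (same forest, or both outside every forest) or when the
                # observing agent is the leader; otherwise zeros are appended in the
                # else branch below.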
255 | other_pos.append(other.state.p_pos - agent.state.p_pos) 256 | if not other.adversary: 257 | other_vel.append(other.state.p_vel) 258 | else: 259 | other_pos.append([0, 0]) 260 | if not other.adversary: 261 | other_vel.append([0, 0]) 262 | 263 | # to tell the pred when the prey are in the forest 264 | prey_forest = [] 265 | ga = self.good_agents(world) 266 | for a in ga: 267 | if any([self.is_collision(a, f) for f in world.forests]): 268 | prey_forest.append(np.array([1])) 269 | else: 270 | prey_forest.append(np.array([-1])) 271 | # to tell leader when pred are in forest 272 | prey_forest_lead = [] 273 | for f in world.forests: 274 | if any([self.is_collision(a, f) for a in ga]): 275 | prey_forest_lead.append(np.array([1])) 276 | else: 277 | prey_forest_lead.append(np.array([-1])) 278 | 279 | comm = [world.agents[0].state.c] 280 | 281 | if agent.adversary and not agent.leader: 282 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 283 | if agent.leader: 284 | return np.concatenate( 285 | [agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 286 | else: 287 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel) 288 | 289 | 290 | -------------------------------------------------------------------------------- /matd3/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pickle 3 | import time 4 | from copy import deepcopy 5 | 6 | import gym 7 | import numpy as np 8 | import tensorflow as tf 9 | import tensorflow.contrib.layers as layers 10 | 11 | import common.tf_util as U 12 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 13 | from matd3.trainer.matd3 import MATD3AgentTrainer 14 | from multiagent.environment import MultiAgentEnv 15 | 16 | logger = None 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 20 | # Environment 21 | parser.add_argument("--scenario", type=str, default="simple", help="name of the scenario script") 22 | parser.add_argument("--max-episode-len", type=int, default=25, help="maximum episode length") 23 | parser.add_argument("--num-episodes", type=int, default=60000, help="number of episodes") 24 | parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries") 25 | parser.add_argument("--good-policy", type=str, default="matd3", help="policy for good agents (matd3 or maddpg)") 26 | parser.add_argument("--adv-policy", type=str, default="matd3", help="policy of adversaries (matd3 or maddpg)") 27 | 28 | # Core training parameters 29 | parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer") 30 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 31 | parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time") 32 | parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp") 33 | parser.add_argument("--update-rate", type=int, default=100, help="after this many steps the critic is trained") 34 | parser.add_argument("--policy-update-rate", type=int, default=2, 35 | help="after this many critic updates the target networks and policy are trained") 36 | parser.add_argument("--use-critic-noise", action="store_true", default=False, help="use noise in critic update next 
action") 37 | parser.add_argument("--use-critic-noise-self", action="store_true", default=False, help="use noise in critic update next action") 38 | parser.add_argument("--critic-action-noise-stddev", type=float, default=0.2) 39 | parser.add_argument("--action-noise-clip", type=float, default=0.5) 40 | parser.add_argument("--critic-zero-if-done", action="store_true", default=False, help="set q value to zero in critic update after done") 41 | 42 | # Checkpointing 43 | parser.add_argument("--exp-name", type=str, default='def_exp_name', help="name of the experiment") 44 | parser.add_argument("--save-dir", type=str, default="/tmp/policy/", help="directory in which training state and model should be saved") 45 | parser.add_argument("--save-rate", type=int, default=1000, help="save model once every time this many episodes are completed") 46 | parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded") 47 | # Evaluation 48 | parser.add_argument("--real-q-log", action="store_true", default=False,help="Evaluates approx. real q value after every 5 save-rates") 49 | parser.add_argument("--q-log-ep-len", type=int, default=200, help="Number of steps per state in q_eval") 50 | parser.add_argument("--restore", action="store_true", default=False) 51 | parser.add_argument("--display", action="store_true", default=False) 52 | parser.add_argument("--benchmark", action="store_true", default=False, help="Saves all locations and termination locations") 53 | parser.add_argument("--benchmark-iters", type=int, default=10000, help="number of iterations run for benchmarking") 54 | parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/", help="directory where benchmark data is saved") 55 | parser.add_argument("--plots-dir", type=str, default="./learning_curves/", help="directory where plot data is saved") 56 | parser.add_argument("--record-episodes", action="store_true", default=False, help="save rgb arrays of episodes") 57 | return parser.parse_args() 58 | 59 | 60 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None): 61 | # This model takes as input an observation and returns values of all actions 62 | with tf.variable_scope(scope, reuse=reuse): 63 | out = input 64 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 65 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 66 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None) 67 | return out 68 | 69 | def make_env(scenario_name, arglist, benchmark=False): 70 | from multiagent.environment import MultiAgentEnv 71 | import multiagent.scenarios as scenarios 72 | 73 | # load scenario from script 74 | scenario = scenarios.load(scenario_name + ".py").Scenario() 75 | # create world 76 | world = scenario.make_world() 77 | # create multiagent environment 78 | if benchmark: 79 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) 80 | else: 81 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 82 | return env 83 | 84 | 85 | def calculate_real_q_value(env: MultiAgentEnv, agents, world_state_buffer, action_n_buffer, start_episode_step_buffer, 86 | obs_n_buffer, num_start_states, args): 87 | """ 88 | 89 | :param env: 90 | :param agents: 91 | :param world_state_buffer: buffer of world states, from which we randomly sample 92 | :param action_n_buffer: buffer of action chosen in the 
world_state of same index 93 | :param num_start_states: 94 | :param len_eval: 95 | :return: 96 | """ 97 | world_sample_indexes = np.random.choice(range(len(world_state_buffer)), num_start_states) 98 | discounted_run_rewards_n = [] 99 | q_values_n = [] 100 | for start_idx, world_idx in enumerate(world_sample_indexes): 101 | env.world = deepcopy(world_state_buffer[world_idx]) 102 | episode_reward_n = [] 103 | action_n = action_n_buffer[world_idx] 104 | obs_n, reward_n, done_n, info_n = env.step(action_n) 105 | episode_reward_n.append(reward_n) 106 | # if arglist.q_log_full_episodes: 107 | episode_step = 0 108 | # else: 109 | # episode_step = start_episode_step_buffer[world_idx] 110 | 111 | terminal = False 112 | obs_n_reshaped = [] 113 | action_n_reshaped = [] 114 | for ag_idx in range(len(obs_n)): 115 | obs_n_reshaped.append([obs_n[ag_idx]]) 116 | action_n_reshaped.append([action_n[ag_idx]]) 117 | q_values_n.append([agent.q_debug['q_values'](*(obs_n_reshaped + action_n_reshaped)) for agent in agents]) 118 | 119 | while not (all(done_n) or terminal): 120 | action_n = [agent.action(obs) for agent, obs in zip(agents, obs_n)] 121 | obs_n, reward_n, done_n, info_n = env.step(action_n) 122 | episode_reward_n.append(reward_n) 123 | 124 | terminal = episode_step >= arglist.q_log_ep_len 125 | episode_step += 1 126 | 127 | discount_factors = np.power(args.gamma, np.arange(0, len(episode_reward_n), dtype=np.int)) 128 | discounted_run_rewards_n.append(np.dot(discount_factors, np.array(episode_reward_n))) 129 | 130 | q_mean = np.mean(q_values_n, 0)[:,0] 131 | real_mean = np.mean(discounted_run_rewards_n, 0) 132 | return q_mean, real_mean 133 | 134 | 135 | 136 | 137 | def get_trainers(env, num_adversaries, obs_shape_n, arglist, good_agent_mode='matd3', adv_agent_mode='matd3'): 138 | trainers = [] 139 | model = mlp_model 140 | if good_agent_mode=='matd3': 141 | good_trainer = MATD3AgentTrainer 142 | elif good_agent_mode=='maddpg': 143 | good_trainer = MADDPGAgentTrainer 144 | else: 145 | raise RuntimeError('Unknown agent mode specified' + str(good_agent_mode)) 146 | if adv_agent_mode== 'matd3': 147 | adv_trainer = MATD3AgentTrainer 148 | elif adv_agent_mode== 'maddpg': 149 | adv_trainer= MADDPGAgentTrainer 150 | else: 151 | raise RuntimeError('Unknown agent mode specified' + str(adv_agent_mode)) 152 | 153 | for i in range(num_adversaries): 154 | trainers.append(adv_trainer( 155 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 156 | local_q_func=(arglist.adv_policy == 'ddpg'))) 157 | for i in range(num_adversaries, env.n): 158 | trainers.append(good_trainer( 159 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 160 | local_q_func=(arglist.good_policy == 'ddpg'))) 161 | return trainers 162 | 163 | 164 | def train_maddpg(arglist): 165 | with U.single_threaded_session(): 166 | # Create environment 167 | env = make_env(arglist.scenario, arglist, arglist.benchmark) 168 | # Create agent trainers 169 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 170 | num_adversaries = min(env.n, arglist.num_adversaries) 171 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist, 172 | good_agent_mode=arglist.good_policy, adv_agent_mode=arglist.adv_policy) 173 | print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) 174 | 175 | # Initialize 176 | U.initialize() 177 | 178 | # Load previous results, if necessary 179 | if arglist.load_dir == "": 180 | arglist.load_dir = arglist.save_dir 181 | if arglist.display or 
arglist.restore or arglist.benchmark: 182 | print('Loading previous state...') 183 | U.load_state(arglist.load_dir) 184 | 185 | episode_rewards = [0.0] # sum of rewards for all agents 186 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 187 | final_ep_rewards = [] # sum of rewards for training curve 188 | final_ep_ag_rewards = [] # agent rewards for training curve 189 | agent_info = [[[]]] # placeholder for benchmarking info 190 | saver = tf.train.Saver(max_to_keep=None) 191 | obs_n = env.reset() 192 | episode_step = 0 193 | train_step = 0 194 | t_start = time.time() 195 | 196 | if arglist.real_q_log: 197 | world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], [] 198 | q_means, real_means = [], [] 199 | 200 | print('Starting iterations...') 201 | while True: 202 | # get action 203 | action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)] 204 | # environment step 205 | new_obs_n, rew_n, done_n, info_n = env.step(action_n) 206 | episode_step += 1 207 | done = all(done_n) # note: unused, never happens 208 | terminal = (episode_step >= arglist.max_episode_len) 209 | done = done or terminal 210 | 211 | if arglist.real_q_log: 212 | world_state_buffer.append(deepcopy(env.world)) 213 | obs_n_buffer.append(obs_n) 214 | action_n_buffer.append(action_n) 215 | start_episode_step_buffer.append(episode_step) 216 | 217 | # collect experience 218 | for i, agent in enumerate(trainers): 219 | agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done, terminal) 220 | obs_n = new_obs_n 221 | 222 | for i, rew in enumerate(rew_n): 223 | episode_rewards[-1] += rew 224 | agent_rewards[i][-1] += rew 225 | 226 | 227 | 228 | if done or terminal: 229 | obs_n = env.reset() 230 | episode_step = 0 231 | episode_rewards.append(0) # add element for next episode 232 | for a in agent_rewards: 233 | a.append(0) 234 | agent_info.append([[]]) 235 | 236 | # increment global step counter 237 | train_step += 1 238 | 239 | # for benchmarking learned policies 240 | if arglist.benchmark: 241 | for i, info in enumerate(info_n): 242 | agent_info[-1][i].append(info_n['n']) 243 | if train_step > arglist.benchmark_iters and (done or terminal): 244 | file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' 245 | print('Finished benchmarking, now saving...') 246 | with open(file_name, 'wb') as fp: 247 | pickle.dump(agent_info[:-1], fp) 248 | break 249 | continue 250 | 251 | # for displaying learned policies 252 | if arglist.display: 253 | time.sleep(0.1) 254 | env.render() 255 | continue 256 | 257 | for agent in trainers: 258 | loss = agent.update(trainers, train_step) 259 | 260 | 261 | # save model, display training output 262 | if terminal and (len(episode_rewards) % arglist.save_rate == 0): 263 | if arglist.save_dir != '/tmp/policy/': 264 | U.save_state(arglist.save_dir + arglist.exp_name, saver=saver, global_step=len(episode_rewards)) 265 | else: 266 | U.save_state(arglist.save_dir, saver=saver) # print statement depends on whether or not there are adversaries 267 | if num_adversaries == 0: 268 | print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 269 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:-1]), round(time.time()-t_start, 3))) 270 | else: 271 | print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 272 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:-1]), 273 | [np.mean(rew[-arglist.save_rate:]) for rew in 
agent_rewards], round(time.time()-t_start, 3))) 274 | t_start = time.time() 275 | # Keep track of final episode reward 276 | final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:-1])) 277 | for rew in agent_rewards: 278 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:-1])) 279 | 280 | if arglist.real_q_log and (len(episode_rewards) % (5 * arglist.save_rate) == 0): 281 | q_mean, real_mean = calculate_real_q_value(deepcopy(env), trainers, 282 | world_state_buffer=world_state_buffer, 283 | action_n_buffer=action_n_buffer, 284 | obs_n_buffer=obs_n_buffer, 285 | start_episode_step_buffer=start_episode_step_buffer, 286 | num_start_states=200, 287 | args=arglist) 288 | world_state_buffer, action_n_buffer, start_episode_step_buffer, obs_n_buffer = [], [], [], [] 289 | q_means.append(q_mean) 290 | real_means.append(real_mean) 291 | print('Q-mean: ' + str(q_mean) + ' Real mean: ' + str(real_mean)) 292 | 293 | 294 | 295 | 296 | # saves final episode reward for plotting training curve later 297 | if len(episode_rewards) > arglist.num_episodes: 298 | rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' 299 | with open(rew_file_name, 'wb') as fp: 300 | pickle.dump(final_ep_rewards, fp) 301 | agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' 302 | with open(agrew_file_name, 'wb') as fp: 303 | pickle.dump(final_ep_ag_rewards, fp) 304 | args_file_name = arglist.plots_dir + arglist.exp_name + '_args.pkl' 305 | with open(args_file_name, 'wb') as fp: 306 | pickle.dump(arglist, fp) 307 | if arglist.real_q_log: 308 | real_q_path = arglist.plots_dir + arglist.exp_name + '_q_values.pkl' 309 | with open(real_q_path, 'wb') as fp: 310 | pickle.dump({'q_means': q_means, 'real_means': real_means}, fp) 311 | print('...Finished total of {} episodes.'.format(len(episode_rewards))) 312 | break 313 | 314 | 315 | if __name__ == '__main__': 316 | arglist = parse_args() 317 | train_maddpg(arglist) 318 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | * Use this code to replicate the results from the paper; for a more readable TF 2.x implementation, check out [tf2multiagentrl](https://github.com/JohannesAck/tf2multiagentrl). 2 | 3 | # Implementation of Multi-Agent TD3 4 | 5 | This is the implementation of MATD3, presented in our paper [Reducing Overestimation Bias in Multi-Agent Domains Using Double Centralized Critics](https://arxiv.org/pdf/1910.01465.pdf). 6 | Multi-Agent TD3 is an algorithm for multi-agent reinforcement learning that combines the improvements of [TD3](https://arxiv.org/pdf/1802.09477.pdf) with [MADDPG](https://arxiv.org/pdf/1706.02275.pdf). 7 | 8 | The implementation here is closely based on [maddpg from Ryan Lowe / OpenAI](https://github.com/openai/maddpg), to enable a fair comparison. The environments used are from [multiagent-particle-envs from OpenAI](https://github.com/openai/multiagent-particle-envs). 
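For orientation, here is a minimal NumPy sketch of the two TD3 ingredients MATD3 adds on top of MADDPG: target policy smoothing and a clipped double-Q target for the centralized critics. The function and argument names are illustrative only (they are not this repository's API); the default values mirror the `--critic-action-noise-stddev`, `--action-noise-clip` and `--gamma` defaults in `train.py`, where the actual TensorFlow update is implemented by the trainer classes.

```
import numpy as np

def smooth_target_action(target_action, noise_stddev=0.2, noise_clip=0.5):
    """TD3-style target policy smoothing: add clipped Gaussian noise to the
    target policy's next action before it is fed to the target critics."""
    noise = np.clip(np.random.normal(0.0, noise_stddev, size=np.shape(target_action)),
                    -noise_clip, noise_clip)
    return target_action + noise

def clipped_double_q_target(reward, done, target_q1, target_q2, gamma=0.95):
    """One-step TD target that bootstraps from the minimum of the two target
    critic estimates, which is how MATD3 reduces overestimation bias."""
    min_q = np.minimum(target_q1, target_q2)
    return reward + gamma * (1.0 - float(done)) * min_q

# Example for a single transition of one agent:
y = clipped_double_q_target(reward=1.0, done=False, target_q1=4.2, target_q2=3.9)
# y == 1.0 + 0.95 * 3.9
```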
9 | 10 | 11 | ### Requirements 12 | - ```python == 3.6``` 13 | - ```TF == 1.12.0``` (any TF 1.x release should work) 14 | - ```Gym == 0.10.5``` (*this exact version is important*) 15 | - ```Numpy >= 1.16.2``` 16 | 17 | ### Example Usage 18 | To start training on simple_crypto, with an MATD3 team of agents and an MADDPG adversary, use 19 | ``` 20 | python train.py --scenario simple_crypto --good-policy matd3 --adv-policy maddpg 21 | ``` 22 | 23 | 24 | ### Reference 25 | If you use our implementation, please also cite our paper with 26 | ``` 27 | @misc{ackermann2019reducing, 28 | title={Reducing Overestimation Bias in Multi-Agent Domains Using Double Centralized Critics}, 29 | author={Johannes Ackermann and Volker Gabler and Takayuki Osa and Masashi Sugiyama}, 30 | year={2019}, 31 | eprint={1910.01465}, 32 | archivePrefix={arXiv}, 33 | primaryClass={cs.LG} 34 | } 35 | 36 | ``` --------------------------------------------------------------------------------