├── .gitignore
├── LICENSE.txt
├── README.md
├── maddpg.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── not-zip-safe
│   ├── requires.txt
│   └── top_level.txt
├── maddpg
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-36.pyc
│   ├── common
│   │   ├── __pycache__
│   │   │   ├── distributions.cpython-36.pyc
│   │   │   └── tf_util.cpython-36.pyc
│   │   ├── distributions.py
│   │   └── tf_util.py
│   └── trainer
│       ├── __pycache__
│       │   ├── maddpg.cpython-36.pyc
│       │   └── replay_buffer.cpython-36.pyc
│       ├── maddpg.py
│       └── replay_buffer.py
├── multiagent
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   ├── core.cpython-36.pyc
│   │   ├── environment.cpython-36.pyc
│   │   ├── multi_discrete.cpython-36.pyc
│   │   ├── rendering.cpython-36.pyc
│   │   └── scenario.cpython-36.pyc
│   ├── core.py
│   ├── environment.py
│   ├── multi_discrete.py
│   ├── policy.py
│   ├── rendering.py
│   ├── scenario.py
│   └── scenarios
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   ├── formation.cpython-36.pyc
│       │   ├── simple.cpython-36.pyc
│       │   ├── simple_adversary.cpython-36.pyc
│       │   ├── simple_crypto.cpython-36.pyc
│       │   ├── simple_push.cpython-36.pyc
│       │   ├── simple_reference.cpython-36.pyc
│       │   ├── simple_speaker_listener.cpython-36.pyc
│       │   ├── simple_spread.cpython-36.pyc
│       │   ├── simple_tag.cpython-36.pyc
│       │   └── simple_world_comm.cpython-36.pyc
│       ├── formation.py
│       ├── simple.py
│       ├── simple_adversary.py
│       ├── simple_crypto.py
│       ├── simple_push.py
│       ├── simple_reference.py
│       ├── simple_speaker_listener.py
│       ├── simple_spread.py
│       ├── simple_tag.py
│       └── simple_world_comm.py
├── setup.py
└── train.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | .static_storage/ 57 | .media/ 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning 2 | 3 | 4 | In this project, deep reinforcement learning is used to train multi-agent robotic systems to perform leader-follower formation control. OpenAI's MADDPG implementation and its particle environment are used, with some modifications, for agent training. 5 | 6 | ## Framework 7 | 8 | The framework used in this project is Python 3.6.9 installed on Ubuntu 18.04 LTS, alongside NumPy and TensorFlow. 9 | 10 |

11 | 12 |

13 | 14 | 15 | ## Environment 16 | 17 | The environment is the single most important element in the Reinforcement Learning process, since it represents the physical world that the agent interacts with. In this project the environment used is [Multi-Agent Particle Environments (MPE)](https://github.com/openai/multiagent-particle-envs), 18 | based on OpenAI's work. OpenAI is an artificial intelligence research laboratory that develops free, open-source tools and libraries that help the artificial intelligence developer community in research and industry. The original environment is a 2D world with a continuous observation space and a discrete action space, along with some basic simulated physics. The agents are divided into two groups, good agents and adversary agents, where the good agents try to cooperate to cover certain goal landmarks so that the adversary agents cannot cover these goals. 19 |
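The snippet below is a minimal sketch of how such an MPE scenario is typically wrapped in the `MultiAgentEnv` class defined in `multiagent/environment.py`. The `scenarios.load()` helper and the `make_world`/`reset_world`/`reward`/`observation` callbacks follow the standard multiagent-particle-envs API and are assumed here rather than copied from this repository's `train.py`:

```python
# Assumed usage sketch: build the modified leader-follower environment from the
# formation scenario and interact with it through the gym-style interface.
import multiagent.scenarios as scenarios
from multiagent.environment import MultiAgentEnv

# load the scenario module and build the world it describes
scenario = scenarios.load("formation.py").Scenario()
world = scenario.make_world()

# wrap the world in the multi-agent environment with the scenario callbacks
env = MultiAgentEnv(world,
                    reset_callback=scenario.reset_world,
                    reward_callback=scenario.reward,
                    observation_callback=scenario.observation)

obs_n = env.reset()  # list with one observation vector per agent
# env.step(act_n) takes a list with one action per agent and returns the
# per-agent lists (obs_n, reward_n, done_n, info_n)
```

This is the same construction pattern the original OpenAI MADDPG reference code uses for its training environments.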

20 | 21 |
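To make the modifications described in the next paragraph more concrete, here is a hypothetical sketch of what the scenario's world construction and reset could look like. The actual implementation lives in `multiagent/scenarios/formation.py` (not reproduced in this listing); only the `World`, `Agent` and `Landmark` classes from `multiagent/core.py` are taken from the repository, everything else is illustrative:

```python
# Hypothetical sketch of a leader-follower formation scenario (illustrative only).
import numpy as np
from multiagent.core import World, Agent, Landmark

def make_world(num_followers=2, num_obstacles=2):
    world = World()
    world.dim_c = 0  # no communication channel needed
    # one leader plus two followers
    world.agents = [Agent() for _ in range(1 + num_followers)]
    for i, agent in enumerate(world.agents):
        agent.name = 'agent %d' % i
        agent.silent = True
        agent.max_speed = 0.2            # reduced maximum speed
    # the first landmark is the fixed goal, the rest are obstacles
    world.landmarks = [Landmark() for _ in range(1 + num_obstacles)]
    for i, landmark in enumerate(world.landmarks):
        landmark.name = 'landmark %d' % i
        landmark.movable = False
    return world

def reset_world(world):
    # goal fixed in the lower-left corner of the 2D plane
    world.landmarks[0].state.p_pos = np.array([-0.8, -0.8])
    world.landmarks[0].state.p_vel = np.zeros(world.dim_p)
    # obstacles placed at random positions
    for landmark in world.landmarks[1:]:
        landmark.state.p_pos = np.random.uniform(-1.0, +1.0, world.dim_p)
        landmark.state.p_vel = np.zeros(world.dim_p)
    # agents spawn at random positions restricted to the first quadrant
    for agent in world.agents:
        agent.state.p_pos = np.random.uniform(0.0, +1.0, world.dim_p)
        agent.state.p_vel = np.zeros(world.dim_p)
        agent.state.c = np.zeros(world.dim_c)
```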

22 | 23 | Many modifications are made to this environment in this project so that it can be used for leader-follower formation control, including: 24 | 1. The agents are divided into one leader agent and two follower agents. 25 | 2. The goal is a single landmark whose location is fixed (rather than assigned randomly as in the original environment) in the lower-left corner of the environment, specifically at (-0.8, -0.8) with respect to the coordinate plane, whose origin (0, 0) lies at the middle of the screen. 26 | 3. The landmarks representing obstacles are placed randomly. 27 | 4. All of the agents' initial positions are assigned randomly but constrained to the first quadrant of the coordinate system, instead of being completely random as in the original environment. 28 | 5. The maximum speed of the agents is set to 0.2, unlike in the original environment. The size of the agents is reduced and the size of the landmarks is magnified compared to the original environment. 29 | -------------------------------------------------------------------------------- /maddpg.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: maddpg 3 | Version: 0.0.1 4 | Summary: Multi-Agent Deep Deterministic Policy Gradient 5 | Home-page: https://github.com/openai/maddpg 6 | Author: Igor Mordatch 7 | Author-email: mordatch@openai.com 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /maddpg.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | maddpg/__init__.py 4 | maddpg.egg-info/PKG-INFO 5 | maddpg.egg-info/SOURCES.txt 6 | maddpg.egg-info/dependency_links.txt 7 | maddpg.egg-info/not-zip-safe 8 | maddpg.egg-info/requires.txt 9 | maddpg.egg-info/top_level.txt 10 | multiagent/__init__.py 11 | multiagent/core.py 12 | multiagent/environment.py 13 | multiagent/multi_discrete.py 14 | multiagent/policy.py 15 | multiagent/rendering.py 16 | multiagent/scenario.py 17 | multiagent/scenarios/__init__.py 18 | multiagent/scenarios/simple.py 19 | multiagent/scenarios/simple_adversary.py 20 | multiagent/scenarios/simple_crypto.py 21 | multiagent/scenarios/simple_push.py 22 | multiagent/scenarios/simple_reference.py 23 | multiagent/scenarios/simple_speaker_listener.py 24 | multiagent/scenarios/simple_spread.py 25 | multiagent/scenarios/simple_tag.py 26 | multiagent/scenarios/simple_world_comm.py -------------------------------------------------------------------------------- /maddpg.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /maddpg.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /maddpg.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | gym 2 | numpy-stl 3 | -------------------------------------------------------------------------------- /maddpg.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | maddpg 2 | multiagent 3 | -------------------------------------------------------------------------------- /maddpg/__init__.py:
-------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() -------------------------------------------------------------------------------- /maddpg/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/__pycache__/distributions.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/common/__pycache__/distributions.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/__pycache__/tf_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/common/__pycache__/tf_util.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return 
[] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - 
tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 | ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 203 | def sample(self): 204 | u = tf.random_uniform(tf.shape(self.logits)) 205 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 206 | @classmethod 207 | def fromflat(cls, flat): 208 | return cls(flat) 209 | 210 | class MultiCategoricalPd(Pd): 211 | def __init__(self, low, high, flat): 212 | self.flat = flat 213 | self.low = tf.constant(low, dtype=tf.int32) 214 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 215 | def flatparam(self): 216 | return self.flat 217 | def mode(self): 218 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 219 | def logp(self, x): 220 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 221 | def kl(self, other): 222 | return tf.add_n([ 223 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 224 | ]) 225 | def entropy(self): 226 | return tf.add_n([p.entropy() for p in self.categoricals]) 227 | def sample(self): 228 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 229 | @classmethod 230 | def fromflat(cls, flat): 231 | return cls(flat) 232 | 233 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 234 | def __init__(self, low, high, flat): 235 | self.flat = flat 236 | self.low = tf.constant(low, dtype=tf.float32) 237 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 238 | def flatparam(self): 239 | return self.flat 240 | def mode(self): 241 | x = [] 242 | for i in range(len(self.categoricals)): 243 | x.append(self.low[i] + self.categoricals[i].mode()) 244 | return tf.concat(x, axis=-1) 245 | def logp(self, x): 246 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 247 | def kl(self, other): 248 | return tf.add_n([ 249 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 250 | ]) 251 | def entropy(self): 252 | return tf.add_n([p.entropy() for p in self.categoricals]) 
253 | def sample(self): 254 | x = [] 255 | for i in range(len(self.categoricals)): 256 | x.append(self.low[i] + self.categoricals[i].sample()) 257 | return tf.concat(x, axis=-1) 258 | @classmethod 259 | def fromflat(cls, flat): 260 | return cls(flat) 261 | 262 | class DiagGaussianPd(Pd): 263 | def __init__(self, flat): 264 | self.flat = flat 265 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 266 | self.mean = mean 267 | self.logstd = logstd 268 | self.std = tf.exp(logstd) 269 | def flatparam(self): 270 | return self.flat 271 | def mode(self): 272 | return self.mean 273 | def logp(self, x): 274 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 275 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 276 | - U.sum(self.logstd, axis=1) 277 | def kl(self, other): 278 | assert isinstance(other, DiagGaussianPd) 279 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 280 | def entropy(self): 281 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 282 | def sample(self): 283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | def make_pdtype(ac_space): 311 | from gym import spaces 312 | if isinstance(ac_space, spaces.Box): 313 | assert len(ac_space.shape) == 1 314 | return DiagGaussianPdType(ac_space.shape[0]) 315 | elif isinstance(ac_space, spaces.Discrete): 316 | # return CategoricalPdType(ac_space.n) 317 | return SoftCategoricalPdType(ac_space.n) 318 | elif isinstance(ac_space, MultiDiscrete): 319 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 320 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 321 | elif isinstance(ac_space, spaces.MultiBinary): 322 | return BernoulliPdType(ac_space.n) 323 | else: 324 | raise NotImplementedError 325 | 326 | def shape_el(v, i): 327 | maybe = v.get_shape()[i] 328 | if maybe is not None: 329 | return maybe 330 | else: 331 | return tf.shape(v)[i] 332 | -------------------------------------------------------------------------------- /maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis 
is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, axis=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 
98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimized `objective` using `optimizer` w.r.t. variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | tf_config = tf.ConfigProto( 164 | inter_op_parallelism_threads=num_cpu, 165 | intra_op_parallelism_threads=num_cpu) 166 | return tf.Session(config=tf_config) 167 | 168 | 169 | def single_threaded_session(): 170 | """Returns a session which will only use a single CPU""" 171 | return make_session(1) 172 | 173 | 174 | ALREADY_INITIALIZED = set() 175 | 176 | 177 | def initialize(): 178 | """Initialize all the uninitialized variables in the global scope.""" 179 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 180 | get_session().run(tf.variables_initializer(new_variables)) 181 | ALREADY_INITIALIZED.update(new_variables) 182 | 183 | 184 | # ================================================================ 185 | # Scopes 186 | # ================================================================ 187 | 188 | 189 | def scope_vars(scope, trainable_only=False): 190 | """ 191 | Get variables inside a scope 192 | The scope can be specified as a string 193 | 194 | Parameters 195 | ---------- 196 | scope: str or VariableScope 197 | scope in which the variables reside. 198 | trainable_only: bool 199 | whether or not to return only the variables that were marked as trainable. 200 | 201 | Returns 202 | ------- 203 | vars: [tf.Variable] 204 | list of variables in `scope`. 
205 | """ 206 | return tf.get_collection( 207 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 208 | scope=scope if isinstance(scope, str) else scope.name 209 | ) 210 | 211 | 212 | def scope_name(): 213 | """Returns the name of current scope as a string, e.g. deepq/q_func""" 214 | return tf.get_variable_scope().name 215 | 216 | 217 | def absolute_scope_name(relative_scope_name): 218 | """Appends parent scope name to `relative_scope_name`""" 219 | return scope_name() + "/" + relative_scope_name 220 | 221 | # ================================================================ 222 | # Saving variables 223 | # ================================================================ 224 | 225 | 226 | def load_state(fname, saver=None): 227 | """Load all the variables to the current session from the location """ 228 | if saver is None: 229 | saver = tf.train.Saver() 230 | saver.restore(get_session(), fname) 231 | return saver 232 | 233 | 234 | def save_state(fname, saver=None): 235 | """Save all the variables in the current session to the location """ 236 | os.makedirs(os.path.dirname(fname), exist_ok=True) 237 | if saver is None: 238 | saver = tf.train.Saver() 239 | saver.save(get_session(), fname) 240 | return saver 241 | 242 | # ================================================================ 243 | # Theano-like Function 244 | # ================================================================ 245 | 246 | 247 | def function(inputs, outputs, updates=None, givens=None): 248 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 249 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 250 | values to be feed to the inputs placeholders and produces the values of the experessions 251 | in outputs. 252 | 253 | Input values can be passed in the same order as inputs or can be provided as kwargs based 254 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 255 | 256 | Example: 257 | x = tf.placeholder(tf.int32, (), name="x") 258 | y = tf.placeholder(tf.int32, (), name="y") 259 | z = 3 * x + 2 * y 260 | lin = function([x, y], z, givens={y: 0}) 261 | 262 | with single_threaded_session(): 263 | initialize() 264 | 265 | assert lin(2) == 6 266 | assert lin(x=3) == 9 267 | assert lin(2, 2) == 10 268 | assert lin(x=2, y=3) == 12 269 | 270 | Parameters 271 | ---------- 272 | inputs: [tf.placeholder or TfInput] 273 | list of input arguments 274 | outputs: [tf.Variable] or tf.Variable 275 | list of outputs or a single output to be returned from function. Returned 276 | value will also have the same shape. 
277 | """ 278 | if isinstance(outputs, list): 279 | return _Function(inputs, outputs, updates, givens=givens) 280 | elif isinstance(outputs, (dict, collections.OrderedDict)): 281 | f = _Function(inputs, outputs.values(), updates, givens=givens) 282 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 283 | else: 284 | f = _Function(inputs, [outputs], updates, givens=givens) 285 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 286 | 287 | 288 | class _Function(object): 289 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 290 | for inpt in inputs: 291 | if not issubclass(type(inpt), TfInput): 292 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 293 | self.inputs = inputs 294 | updates = updates or [] 295 | self.update_group = tf.group(*updates) 296 | self.outputs_update = list(outputs) + [self.update_group] 297 | self.givens = {} if givens is None else givens 298 | self.check_nan = check_nan 299 | 300 | def _feed_input(self, feed_dict, inpt, value): 301 | if issubclass(type(inpt), TfInput): 302 | feed_dict.update(inpt.make_feed_dict(value)) 303 | elif is_placeholder(inpt): 304 | feed_dict[inpt] = value 305 | 306 | def __call__(self, *args, **kwargs): 307 | assert len(args) <= len(self.inputs), "Too many arguments provided" 308 | feed_dict = {} 309 | # Update the args 310 | for inpt, value in zip(self.inputs, args): 311 | self._feed_input(feed_dict, inpt, value) 312 | # Update the kwargs 313 | kwargs_passed_inpt_names = set() 314 | for inpt in self.inputs[len(args):]: 315 | inpt_name = inpt.name.split(':')[0] 316 | inpt_name = inpt_name.split('/')[-1] 317 | assert inpt_name not in kwargs_passed_inpt_names, \ 318 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 319 | if inpt_name in kwargs: 320 | kwargs_passed_inpt_names.add(inpt_name) 321 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 322 | else: 323 | assert inpt in self.givens, "Missing argument " + inpt_name 324 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 325 | # Update feed dict with givens. 
326 | for inpt in self.givens: 327 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 328 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 329 | if self.check_nan: 330 | if any(np.isnan(r).any() for r in results): 331 | raise RuntimeError("Nan detected") 332 | return results 333 | -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/maddpg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/trainer/__pycache__/maddpg.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.replay_buffer import ReplayBuffer 9 | 10 | 11 | def discount_with_dones(rewards, dones, gamma): 12 | discounted = [] 13 | r = 0 14 | for reward, done in zip(rewards[::-1], dones[::-1]): 15 | r = reward + gamma*r 16 | r = r*(1.-done) 17 | discounted.append(r) 18 | return discounted[::-1] 19 | 20 | def make_update_exp(vals, target_vals): 21 | polyak = 1.0 - 1e-2 22 | expression = [] 23 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 24 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 25 | expression = tf.group(*expression) 26 | return U.function([], [], updates=[expression]) 27 | 28 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): 29 | with tf.variable_scope(scope, reuse=reuse): 30 | # create distribtuions 31 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 32 | 33 | # set up placeholders 34 | obs_ph_n = make_obs_ph_n 35 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 36 | 37 | p_input = obs_ph_n[p_index] 38 | 39 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) 40 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 41 | 42 | # wrap parameters in distribution 43 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 44 | 45 | act_sample = act_pd.sample() 46 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 47 | 48 | act_input_n = act_ph_n + [] 49 | act_input_n[p_index] = act_pd.sample() 50 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 51 | if local_q_func: 52 | q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) 53 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0] 54 | pg_loss = -tf.reduce_mean(q) 55 | 56 | loss = pg_loss + p_reg * 1e-3 57 | 58 
| optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 59 | 60 | # Create callable functions 61 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 62 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) 63 | p_values = U.function([obs_ph_n[p_index]], p) 64 | 65 | # target network 66 | target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) 67 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 68 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 69 | 70 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 71 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) 72 | 73 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 74 | 75 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 76 | with tf.variable_scope(scope, reuse=reuse): 77 | # create distribtuions 78 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 79 | 80 | # set up placeholders 81 | obs_ph_n = make_obs_ph_n 82 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 83 | target_ph = tf.placeholder(tf.float32, [None], name="target") 84 | 85 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 86 | if local_q_func: 87 | q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) 88 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0] 89 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 90 | 91 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 92 | 93 | # viscosity solution to Bellman differential equation in place of an initial condition 94 | q_reg = tf.reduce_mean(tf.square(q)) 95 | loss = q_loss #+ 1e-3 * q_reg 96 | 97 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 98 | 99 | # Create callable functions 100 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 101 | q_values = U.function(obs_ph_n + act_ph_n, q) 102 | 103 | # target network 104 | target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0] 105 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 106 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 107 | 108 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 109 | 110 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 111 | 112 | class MADDPGAgentTrainer(AgentTrainer): 113 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 114 | self.name = name 115 | self.n = len(obs_shape_n) 116 | self.agent_index = agent_index 117 | self.args = args 118 | obs_ph_n = [] 119 | for i in range(self.n): 120 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 121 | 122 | # Create all the functions necessary to train the model 123 | self.q_train, self.q_update, self.q_debug = q_train( 124 | scope=self.name, 125 | make_obs_ph_n=obs_ph_n, 126 | act_space_n=act_space_n, 127 | q_index=agent_index, 128 | q_func=model, 129 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 130 | grad_norm_clipping=0.5, 131 | local_q_func=local_q_func, 132 | num_units=args.num_units 133 | ) 134 | self.act, 
self.p_train, self.p_update, self.p_debug = p_train( 135 | scope=self.name, 136 | make_obs_ph_n=obs_ph_n, 137 | act_space_n=act_space_n, 138 | p_index=agent_index, 139 | p_func=model, 140 | q_func=model, 141 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 142 | grad_norm_clipping=0.5, 143 | local_q_func=local_q_func, 144 | num_units=args.num_units 145 | ) 146 | # Create experience buffer 147 | self.replay_buffer = ReplayBuffer(1e6) 148 | self.max_replay_buffer_len = args.batch_size * args.max_episode_len 149 | self.replay_sample_index = None 150 | 151 | def action(self, obs): 152 | return self.act(obs[None])[0] 153 | 154 | def experience(self, obs, act, rew, new_obs, done, terminal): 155 | # Store transition in the replay buffer. 156 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 157 | 158 | def preupdate(self): 159 | self.replay_sample_index = None 160 | 161 | def update(self, agents, t): 162 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough 163 | return 164 | if not t % 100 == 0: # only update every 100 steps 165 | return 166 | 167 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) 168 | # collect replay sample from all agents 169 | obs_n = [] 170 | obs_next_n = [] 171 | act_n = [] 172 | index = self.replay_sample_index 173 | for i in range(self.n): 174 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 175 | obs_n.append(obs) 176 | obs_next_n.append(obs_next) 177 | act_n.append(act) 178 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 179 | 180 | # train q network 181 | num_sample = 1 182 | target_q = 0.0 183 | for i in range(num_sample): 184 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 185 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 186 | target_q += rew + self.args.gamma * (1.0 - done) * target_q_next 187 | target_q /= num_sample 188 | q_loss = self.q_train(*(obs_n + act_n + [target_q])) 189 | 190 | # train p network 191 | p_loss = self.p_train(*(obs_n + act_n)) 192 | 193 | self.p_update() 194 | self.q_update() 195 | 196 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 197 | -------------------------------------------------------------------------------- /maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 45 | 46 | def make_index(self, batch_size): 47 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 48 | 49 | def make_latest_index(self, batch_size): 50 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 51 | np.random.shuffle(idx) 52 | return idx 53 | 54 | def sample_index(self, idxes): 55 | return self._encode_sample(idxes) 56 | 57 | def sample(self, batch_size): 58 | """Sample a batch of experiences. 59 | 60 | Parameters 61 | ---------- 62 | batch_size: int 63 | How many transitions to sample. 64 | 65 | Returns 66 | ------- 67 | obs_batch: np.array 68 | batch of observations 69 | act_batch: np.array 70 | batch of actions executed given obs_batch 71 | rew_batch: np.array 72 | rewards received as results of executing act_batch 73 | next_obs_batch: np.array 74 | next set of observations seen after executing act_batch 75 | done_mask: np.array 76 | done_mask[i] = 1 if executing act_batch[i] resulted in 77 | the end of an episode and 0 otherwise. 
78 | """ 79 | if batch_size > 0: 80 | idxes = self.make_index(batch_size) 81 | else: 82 | idxes = range(0, len(self._storage)) 83 | return self._encode_sample(idxes) 84 | 85 | def collect(self): 86 | return self.sample(-1) 87 | -------------------------------------------------------------------------------- /multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /multiagent/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/core.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/core.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/environment.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/environment.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/multi_discrete.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/multi_discrete.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/rendering.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/rendering.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/scenario.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/scenario.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/core.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # physical/external base state of all entites 4 | class EntityState(object): 5 | def __init__(self): 6 | # physical position 7 | self.p_pos = None 8 | # physical velocity 9 | self.p_vel = None 10 | 11 | # state of agents (including communication and internal/mental state) 12 | class AgentState(EntityState): 13 | def __init__(self): 14 | super(AgentState, self).__init__() 15 | # communication utterance 16 | self.c = None 17 | 18 | # action of the agent 19 | class Action(object): 20 | def __init__(self): 21 | # physical action 22 | self.u = None 23 | # communication action 24 | self.c = None 25 | 26 | # properties and state of physical world entity 27 | class Entity(object): 28 | def __init__(self): 29 | # name 30 | self.name = '' 31 | # properties: 32 | self.size = 0.050 33 | # entity can move / be pushed 34 | self.movable = False 35 | # entity collides with others 36 | self.collide = True 37 | # material density (affects mass) 38 | self.density = 25.0 39 | # color 40 | self.color = None 41 | # max speed and accel 42 | self.max_speed = 0.2 43 | self.accel = None 44 | # state 45 | self.state = EntityState() 46 | # mass 47 | self.initial_mass = 1.0 48 | 49 | @property 50 | def mass(self): 51 | return self.initial_mass 52 | 53 | # properties of landmark entities 54 | class Landmark(Entity): 55 | def __init__(self): 56 | super(Landmark, self).__init__() 57 | 58 | # properties of agent entities 59 | class Agent(Entity): 60 | def __init__(self): 61 | super(Agent, self).__init__() 62 | # agents are movable by default 63 | self.movable = True 64 | # cannot send communication signals 65 | self.silent = False 66 | # cannot observe the world 67 | self.blind = False 68 | # physical motor noise amount 69 | self.u_noise = None 70 | # communication noise amount 71 | self.c_noise = None 72 | # control range 73 | self.u_range = 1.0 74 | # state 75 | self.state = AgentState() 76 | # action 77 | self.action = Action() 78 | # script behavior to execute 79 | self.action_callback = None 80 | 81 | # multi-agent world 82 | class World(object): 83 | def __init__(self): 84 | # list of agents and entities (can change at execution-time!) 
85 | self.agents = [] 86 | self.landmarks = [] 87 | # communication channel dimensionality 88 | self.dim_c = 0 89 | # position dimensionality 90 | self.dim_p = 2 91 | # color dimensionality 92 | self.dim_color = 3 93 | # simulation timestep 94 | self.dt = 0.1 95 | # physical damping 96 | self.damping = 0.25 97 | # contact response parameters 98 | self.contact_force = 1e+2 99 | self.contact_margin = 1e-3 100 | 101 | # return all entities in the world 102 | @property 103 | def entities(self): 104 | return self.agents + self.landmarks 105 | 106 | # return all agents controllable by external policies 107 | @property 108 | def policy_agents(self): 109 | return [agent for agent in self.agents if agent.action_callback is None] 110 | 111 | # return all agents controlled by world scripts 112 | @property 113 | def scripted_agents(self): 114 | return [agent for agent in self.agents if agent.action_callback is not None] 115 | 116 | # update state of the world 117 | def step(self): 118 | # set actions for scripted agents 119 | for agent in self.scripted_agents: 120 | agent.action = agent.action_callback(agent, self) 121 | # gather forces applied to entities 122 | p_force = [None] * len(self.entities) 123 | # apply agent physical controls 124 | p_force = self.apply_action_force(p_force) 125 | # apply environment forces 126 | p_force = self.apply_environment_force(p_force) 127 | # integrate physical state 128 | self.integrate_state(p_force) 129 | # update agent state 130 | for agent in self.agents: 131 | self.update_agent_state(agent) 132 | 133 | # gather agent action forces 134 | def apply_action_force(self, p_force): 135 | # set applied forces 136 | for i,agent in enumerate(self.agents): 137 | if agent.movable: 138 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 139 | p_force[i] = agent.action.u + noise 140 | return p_force 141 | 142 | # gather physical forces acting on entities 143 | def apply_environment_force(self, p_force): 144 | # simple (but inefficient) collision response 145 | for a,entity_a in enumerate(self.entities): 146 | for b,entity_b in enumerate(self.entities): 147 | if(b <= a): continue 148 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b) 149 | if(f_a is not None): 150 | if(p_force[a] is None): p_force[a] = 0.0 151 | p_force[a] = f_a + p_force[a] 152 | if(f_b is not None): 153 | if(p_force[b] is None): p_force[b] = 0.0 154 | p_force[b] = f_b + p_force[b] 155 | return p_force 156 | 157 | # integrate physical state 158 | def integrate_state(self, p_force): 159 | for i,entity in enumerate(self.entities): 160 | if not entity.movable: continue 161 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 162 | if (p_force[i] is not None): 163 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 164 | if entity.max_speed is not None: 165 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 166 | if speed > entity.max_speed: 167 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 168 | np.square(entity.state.p_vel[1])) * entity.max_speed 169 | entity.state.p_pos += entity.state.p_vel * self.dt 170 | 171 | def update_agent_state(self, agent): 172 | # set communication state (directly for now) 173 | if agent.silent: 174 | agent.state.c = np.zeros(self.dim_c) 175 | else: 176 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 177 | agent.state.c = agent.action.c + noise 178 | 179 | # get collision forces for any contact 
between two entities 180 | def get_collision_force(self, entity_a, entity_b): 181 | if (not entity_a.collide) or (not entity_b.collide): 182 | return [None, None] # not a collider 183 | if (entity_a is entity_b): 184 | return [None, None] # don't collide against itself 185 | # compute actual distance between entities 186 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 187 | dist = np.sqrt(np.sum(np.square(delta_pos))) 188 | # minimum allowable distance 189 | dist_min = entity_a.size + entity_b.size 190 | # softmax penetration 191 | k = self.contact_margin 192 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 193 | force = self.contact_force * delta_pos / dist * penetration 194 | force_a = +force if entity_a.movable else None 195 | force_b = -force if entity_b.movable else None 196 | return [force_a, force_b] -------------------------------------------------------------------------------- /multiagent/environment.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.envs.registration import EnvSpec 4 | import numpy as np 5 | from multiagent.multi_discrete import MultiDiscrete 6 | 7 | # environment for all agents in the multiagent world 8 | # currently code assumes that no agents will be created/destroyed at runtime! 9 | class MultiAgentEnv(gym.Env): 10 | metadata = { 11 | 'render.modes' : ['human', 'rgb_array'] 12 | } 13 | 14 | def __init__(self, world, reset_callback=None, reward_callback=None, 15 | observation_callback=None, info_callback=None, 16 | done_callback=None, shared_viewer=True): 17 | 18 | self.world = world 19 | self.agents = self.world.policy_agents 20 | # set required vectorized gym env property 21 | self.n = len(world.policy_agents) 22 | # scenario callbacks 23 | self.reset_callback = reset_callback 24 | self.reward_callback = reward_callback 25 | self.observation_callback = observation_callback 26 | self.info_callback = info_callback 27 | self.done_callback = done_callback 28 | # environment parameters 29 | self.discrete_action_space = True 30 | # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector 31 | self.discrete_action_input = False 32 | # if true, even the action is continuous, action will be performed discretely 33 | self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False 34 | # if true, every agent has the same reward 35 | self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False 36 | self.time = 0 37 | 38 | # configure spaces 39 | self.action_space = [] 40 | self.observation_space = [] 41 | for agent in self.agents: 42 | total_action_space = [] 43 | # physical action space 44 | if self.discrete_action_space: 45 | u_action_space = spaces.Discrete(world.dim_p * 2 + 1) 46 | else: 47 | u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32) 48 | if agent.movable: 49 | total_action_space.append(u_action_space) 50 | # communication action space 51 | if self.discrete_action_space: 52 | c_action_space = spaces.Discrete(world.dim_c) 53 | else: 54 | c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32) 55 | if not agent.silent: 56 | total_action_space.append(c_action_space) 57 | # total action space 58 | if len(total_action_space) > 1: 59 | # all action spaces are discrete, so simplify to MultiDiscrete action space 60 | if all([isinstance(act_space, spaces.Discrete) for act_space in 
total_action_space]): 61 | act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space]) 62 | else: 63 | act_space = spaces.Tuple(total_action_space) 64 | self.action_space.append(act_space) 65 | else: 66 | self.action_space.append(total_action_space[0]) 67 | # observation space 68 | obs_dim = len(observation_callback(agent, self.world)) 69 | self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) 70 | agent.action.c = np.zeros(self.world.dim_c) 71 | 72 | # rendering 73 | self.shared_viewer = shared_viewer 74 | if self.shared_viewer: 75 | self.viewers = [None] 76 | else: 77 | self.viewers = [None] * self.n 78 | self._reset_render() 79 | 80 | def step(self, action_n): 81 | obs_n = [] 82 | reward_n = [] 83 | done_n = [] 84 | info_n = {'n': []} 85 | self.agents = self.world.policy_agents 86 | # set action for each agent 87 | for i, agent in enumerate(self.agents): 88 | self._set_action(action_n[i], agent, self.action_space[i]) 89 | # advance world state 90 | self.world.step() 91 | # record observation for each agent 92 | for agent in self.agents: 93 | obs_n.append(self._get_obs(agent)) 94 | reward_n.append(self._get_reward(agent)) 95 | done_n.append(self._get_done(agent)) 96 | 97 | info_n['n'].append(self._get_info(agent)) 98 | 99 | # all agents get total reward in cooperative case 100 | reward = np.sum(reward_n) 101 | if self.shared_reward: 102 | reward_n = [reward] * self.n 103 | 104 | return obs_n, reward_n, done_n, info_n 105 | 106 | def reset(self): 107 | # reset world 108 | self.reset_callback(self.world) 109 | # reset renderer 110 | self._reset_render() 111 | # record observations for each agent 112 | obs_n = [] 113 | self.agents = self.world.policy_agents 114 | for agent in self.agents: 115 | obs_n.append(self._get_obs(agent)) 116 | return obs_n 117 | 118 | # get info used for benchmarking 119 | def _get_info(self, agent): 120 | if self.info_callback is None: 121 | return {} 122 | return self.info_callback(agent, self.world) 123 | 124 | # get observation for a particular agent 125 | def _get_obs(self, agent): 126 | if self.observation_callback is None: 127 | return np.zeros(0) 128 | return self.observation_callback(agent, self.world) 129 | 130 | # get dones for a particular agent 131 | # unused right now -- agents are allowed to go beyond the viewing screen 132 | def _get_done(self, agent): 133 | if self.done_callback is None: 134 | return False 135 | return self.done_callback(agent, self.world) 136 | 137 | # get reward for a particular agent 138 | def _get_reward(self, agent): 139 | if self.reward_callback is None: 140 | return 0.0 141 | return self.reward_callback(agent, self.world) 142 | 143 | # set env action for a particular agent 144 | def _set_action(self, action, agent, action_space, time=None): 145 | agent.action.u = np.zeros(self.world.dim_p) 146 | agent.action.c = np.zeros(self.world.dim_c) 147 | # process action 148 | if isinstance(action_space, MultiDiscrete): 149 | act = [] 150 | size = action_space.high - action_space.low + 1 151 | index = 0 152 | for s in size: 153 | act.append(action[index:(index+s)]) 154 | index += s 155 | action = act 156 | else: 157 | action = [action] 158 | 159 | if agent.movable: 160 | # physical action 161 | if self.discrete_action_input: 162 | agent.action.u = np.zeros(self.world.dim_p) 163 | # process discrete action 164 | if action[0] == 1: agent.action.u[0] = -1.0 165 | if action[0] == 2: agent.action.u[0] = +1.0 166 | if action[0] == 3: agent.action.u[1] = -1.0 167 | 
if action[0] == 4: agent.action.u[1] = +1.0 168 | else: 169 | if self.force_discrete_action: 170 | d = np.argmax(action[0]) 171 | action[0][:] = 0.0 172 | action[0][d] = 1.0 173 | if self.discrete_action_space: 174 | agent.action.u[0] += action[0][1] - action[0][2] 175 | agent.action.u[1] += action[0][3] - action[0][4] 176 | else: 177 | agent.action.u = action[0] 178 | sensitivity = 5.0 179 | if agent.accel is not None: 180 | sensitivity = agent.accel 181 | agent.action.u *= sensitivity 182 | action = action[1:] 183 | if not agent.silent: 184 | # communication action 185 | if self.discrete_action_input: 186 | agent.action.c = np.zeros(self.world.dim_c) 187 | agent.action.c[action[0]] = 1.0 188 | else: 189 | agent.action.c = action[0] 190 | action = action[1:] 191 | # make sure we used all elements of action 192 | assert len(action) == 0 193 | 194 | # reset rendering assets 195 | def _reset_render(self): 196 | self.render_geoms = None 197 | self.render_geoms_xform = None 198 | 199 | # render environment 200 | def render(self, mode='human'): 201 | if mode == 'human': 202 | alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 203 | message = '' 204 | for agent in self.world.agents: 205 | comm = [] 206 | for other in self.world.agents: 207 | if other is agent: continue 208 | if np.all(other.state.c == 0): 209 | word = '_' 210 | else: 211 | word = alphabet[np.argmax(other.state.c)] 212 | message += (other.name + ' to ' + agent.name + ': ' + word + ' ') 213 | print(message) 214 | 215 | for i in range(len(self.viewers)): 216 | # create viewers (if necessary) 217 | if self.viewers[i] is None: 218 | # import rendering only if we need it (and don't import for headless machines) 219 | #from gym.envs.classic_control import rendering 220 | from multiagent import rendering 221 | self.viewers[i] = rendering.Viewer(700,700) 222 | 223 | # create rendering geometry 224 | if self.render_geoms is None: 225 | # import rendering only if we need it (and don't import for headless machines) 226 | #from gym.envs.classic_control import rendering 227 | from multiagent import rendering 228 | self.render_geoms = [] 229 | self.render_geoms_xform = [] 230 | for entity in self.world.entities: 231 | geom = rendering.make_circle(entity.size) 232 | xform = rendering.Transform() 233 | if 'agent' in entity.name: 234 | geom.set_color(*entity.color, alpha=0.5) 235 | else: 236 | geom.set_color(*entity.color) 237 | geom.add_attr(xform) 238 | self.render_geoms.append(geom) 239 | self.render_geoms_xform.append(xform) 240 | 241 | # add geoms to viewer 242 | for viewer in self.viewers: 243 | viewer.geoms = [] 244 | for geom in self.render_geoms: 245 | viewer.add_geom(geom) 246 | 247 | results = [] 248 | for i in range(len(self.viewers)): 249 | from multiagent import rendering 250 | # update bounds to center around agent 251 | cam_range = 1 252 | if self.shared_viewer: 253 | pos = np.zeros(self.world.dim_p) 254 | else: 255 | pos = self.agents[i].state.p_pos 256 | self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range) 257 | # update geometry positions 258 | for e, entity in enumerate(self.world.entities): 259 | self.render_geoms_xform[e].set_translation(*entity.state.p_pos) 260 | # render to display or array 261 | results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array')) 262 | 263 | return results 264 | 265 | # create receptor field locations in local coordinate frame 266 | def _make_receptor_locations(self, agent): 267 | receptor_type = 'polar' 268 | range_min = 0.05 * 2.0 269 | range_max 
= 1.00 270 | dx = [] 271 | # circular receptive field 272 | if receptor_type == 'polar': 273 | for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False): 274 | for distance in np.linspace(range_min, range_max, 3): 275 | dx.append(distance * np.array([np.cos(angle), np.sin(angle)])) 276 | # add origin 277 | dx.append(np.array([0.0, 0.0])) 278 | # grid receptive field 279 | if receptor_type == 'grid': 280 | for x in np.linspace(-range_max, +range_max, 5): 281 | for y in np.linspace(-range_max, +range_max, 5): 282 | dx.append(np.array([x,y])) 283 | return dx 284 | 285 | 286 | # vectorized wrapper for a batch of multi-agent environments 287 | # assumes all environments have the same observation and action space 288 | class BatchMultiAgentEnv(gym.Env): 289 | metadata = { 290 | 'runtime.vectorized': True, 291 | 'render.modes' : ['human', 'rgb_array'] 292 | } 293 | 294 | def __init__(self, env_batch): 295 | self.env_batch = env_batch 296 | 297 | @property 298 | def n(self): 299 | return np.sum([env.n for env in self.env_batch]) 300 | 301 | @property 302 | def action_space(self): 303 | return self.env_batch[0].action_space 304 | 305 | @property 306 | def observation_space(self): 307 | return self.env_batch[0].observation_space 308 | 309 | def step(self, action_n, time): 310 | obs_n = [] 311 | reward_n = [] 312 | done_n = [] 313 | info_n = {'n': []} 314 | i = 0 315 | for env in self.env_batch: 316 | obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time) 317 | i += env.n 318 | obs_n += obs 319 | # reward = [r / len(self.env_batch) for r in reward] 320 | reward_n += reward 321 | done_n += done 322 | return obs_n, reward_n, done_n, info_n 323 | 324 | def reset(self): 325 | obs_n = [] 326 | for env in self.env_batch: 327 | obs_n += env.reset() 328 | return obs_n 329 | 330 | # render environment 331 | def render(self, mode='human', close=True): 332 | results_n = [] 333 | for env in self.env_batch: 334 | results_n += env.render(mode, close) 335 | return results_n 336 | -------------------------------------------------------------------------------- /multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. 
Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | random_array = prng.np_random.rand(self.num_discrete_space) 34 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 35 | def contains(self, x): 36 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 37 | 38 | @property 39 | def shape(self): 40 | return self.num_discrete_space 41 | def __repr__(self): 42 | return "MultiDiscrete" + str(self.num_discrete_space) 43 | def __eq__(self, other): 44 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.LEFT: self.move[0] = True 45 | if k==key.RIGHT: self.move[1] = True 46 | if k==key.UP: self.move[2] = True 47 | if k==key.DOWN: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.LEFT: self.move[0] = False 50 | if k==key.RIGHT: self.move[1] = False 51 | if k==key.UP: self.move[2] = False 52 | if k==key.DOWN: self.move[3] = False 53 | -------------------------------------------------------------------------------- /multiagent/rendering.py: 
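The InteractivePolicy in policy.py above only handles keyboard-driven movement, so it is mainly useful for manually driving an environment. Below is a minimal sketch of how it could be wired up; the scenario file name, the shared_viewer=False choice, and the render-before-policy ordering are illustrative assumptions, not taken from this repository's train.py.

from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

# load a scenario and build its world (scenario file name is illustrative)
scenario = scenarios.load('simple.py').Scenario()
world = scenario.make_world()
# one viewer per agent so each InteractivePolicy can attach its own key callbacks
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation,
                    shared_viewer=False)
env.render()   # create the viewer windows before InteractivePolicy registers key handlers
policies = [InteractivePolicy(env, i) for i in range(env.n)]
obs_n = env.reset()
while True:
    # each policy ignores its observation and acts on the current keyboard state
    act_n = [policy.action(obs) for policy, obs in zip(policies, obs_n)]
    obs_n, reward_n, done_n, _ = env.step(act_n)
    env.render()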
-------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | def get_display(spec): 33 | """Convert a display specification (such as :0) into an actual Display 34 | object. 35 | 36 | Pyglet only supports multiple Displays on Linux. 37 | """ 38 | if spec is None: 39 | return None 40 | elif isinstance(spec, six.string_types): 41 | return pyglet.canvas.Display(spec) 42 | else: 43 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 44 | 45 | class Viewer(object): 46 | def __init__(self, width, height, display=None): 47 | display = get_display(display) 48 | 49 | self.width = width 50 | self.height = height 51 | 52 | self.window = pyglet.window.Window(width=width, height=height, display=display) 53 | self.window.on_close = self.window_closed_by_user 54 | self.geoms = [] 55 | self.onetime_geoms = [] 56 | self.transform = Transform() 57 | 58 | glEnable(GL_BLEND) 59 | # glEnable(GL_MULTISAMPLE) 60 | glEnable(GL_LINE_SMOOTH) 61 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 62 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 63 | glLineWidth(2.0) 64 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 65 | 66 | def close(self): 67 | self.window.close() 68 | 69 | def window_closed_by_user(self): 70 | self.close() 71 | 72 | def set_bounds(self, left, right, bottom, top): 73 | assert right > left and top > bottom 74 | scalex = self.width/(right-left) 75 | scaley = self.height/(top-bottom) 76 | self.transform = Transform( 77 | translation=(-left*scalex, -bottom*scaley), 78 | scale=(scalex, scaley)) 79 | 80 | def add_geom(self, geom): 81 | self.geoms.append(geom) 82 | 83 | def add_onetime(self, geom): 84 | self.onetime_geoms.append(geom) 85 | 86 | def render(self, return_rgb_array=False): 87 | glClearColor(1,1,1,1) 88 | self.window.clear() 89 | self.window.switch_to() 90 | self.window.dispatch_events() 91 | self.transform.enable() 92 | for geom in self.geoms: 93 | geom.render() 94 | for geom in self.onetime_geoms: 95 | geom.render() 96 | self.transform.disable() 97 | arr = None 98 | if return_rgb_array: 99 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 100 | image_data = buffer.get_image_data() 101 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 102 | # In https://github.com/openai/gym-http-api/issues/2, we 103 | # discovered 
that someone using Xmonad on Arch was having 104 | # a window of size 598 x 398, though a 600 x 400 window 105 | # was requested. (Guess Xmonad was preserving a pixel for 106 | # the boundary.) So we use the buffer height/width rather 107 | # than the requested one. 108 | arr = arr.reshape(buffer.height, buffer.width, 4) 109 | arr = arr[::-1,:,0:3] 110 | self.window.flip() 111 | self.onetime_geoms = [] 112 | return arr 113 | 114 | # Convenience 115 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 116 | geom = make_circle(radius=radius, res=res, filled=filled) 117 | _add_attrs(geom, attrs) 118 | self.add_onetime(geom) 119 | return geom 120 | 121 | def draw_polygon(self, v, filled=True, **attrs): 122 | geom = make_polygon(v=v, filled=filled) 123 | _add_attrs(geom, attrs) 124 | self.add_onetime(geom) 125 | return geom 126 | 127 | def draw_polyline(self, v, **attrs): 128 | geom = make_polyline(v=v) 129 | _add_attrs(geom, attrs) 130 | self.add_onetime(geom) 131 | return geom 132 | 133 | def draw_line(self, start, end, **attrs): 134 | geom = Line(start, end) 135 | _add_attrs(geom, attrs) 136 | self.add_onetime(geom) 137 | return geom 138 | 139 | def get_array(self): 140 | self.window.flip() 141 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 142 | self.window.flip() 143 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 144 | arr = arr.reshape(self.height, self.width, 4) 145 | return arr[::-1,:,0:3] 146 | 147 | def _add_attrs(geom, attrs): 148 | if "color" in attrs: 149 | geom.set_color(*attrs["color"]) 150 | if "linewidth" in attrs: 151 | geom.set_linewidth(attrs["linewidth"]) 152 | 153 | class Geom(object): 154 | def __init__(self): 155 | self._color=Color((0, 0, 0, 1.0)) 156 | self.attrs = [self._color] 157 | def render(self): 158 | for attr in reversed(self.attrs): 159 | attr.enable() 160 | self.render1() 161 | for attr in self.attrs: 162 | attr.disable() 163 | def render1(self): 164 | raise NotImplementedError 165 | def add_attr(self, attr): 166 | self.attrs.append(attr) 167 | def set_color(self, r, g, b, alpha=1): 168 | self._color.vec4 = (r, g, b, alpha) 169 | 170 | class Attr(object): 171 | def enable(self): 172 | raise NotImplementedError 173 | def disable(self): 174 | pass 175 | 176 | class Transform(Attr): 177 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 178 | self.set_translation(*translation) 179 | self.set_rotation(rotation) 180 | self.set_scale(*scale) 181 | def enable(self): 182 | glPushMatrix() 183 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 184 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 185 | glScalef(self.scale[0], self.scale[1], 1) 186 | def disable(self): 187 | glPopMatrix() 188 | def set_translation(self, newx, newy): 189 | self.translation = (float(newx), float(newy)) 190 | def set_rotation(self, new): 191 | self.rotation = float(new) 192 | def set_scale(self, newx, newy): 193 | self.scale = (float(newx), float(newy)) 194 | 195 | class Color(Attr): 196 | def __init__(self, vec4): 197 | self.vec4 = vec4 198 | def enable(self): 199 | glColor4f(*self.vec4) 200 | 201 | class LineStyle(Attr): 202 | def __init__(self, style): 203 | self.style = style 204 | def enable(self): 205 | glEnable(GL_LINE_STIPPLE) 206 | glLineStipple(1, self.style) 207 | def disable(self): 208 | glDisable(GL_LINE_STIPPLE) 209 | 210 | class LineWidth(Attr): 211 | def __init__(self, stroke): 212 | self.stroke = stroke 213 | def enable(self): 214 | 
glLineWidth(self.stroke) 215 | 216 | class Point(Geom): 217 | def __init__(self): 218 | Geom.__init__(self) 219 | def render1(self): 220 | glBegin(GL_POINTS) # draw point 221 | glVertex3f(0.0, 0.0, 0.0) 222 | glEnd() 223 | 224 | class FilledPolygon(Geom): 225 | def __init__(self, v): 226 | Geom.__init__(self) 227 | self.v = v 228 | def render1(self): 229 | if len(self.v) == 4 : glBegin(GL_QUADS) 230 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 231 | else: glBegin(GL_TRIANGLES) 232 | for p in self.v: 233 | glVertex3f(p[0], p[1],0) # draw each vertex 234 | glEnd() 235 | 236 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 237 | glColor4f(*color) 238 | glBegin(GL_LINE_LOOP) 239 | for p in self.v: 240 | glVertex3f(p[0], p[1],0) # draw each vertex 241 | glEnd() 242 | 243 | def make_circle(radius=10, res=30, filled=True): 244 | points = [] 245 | for i in range(res): 246 | ang = 2*math.pi*i / res 247 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 248 | if filled: 249 | return FilledPolygon(points) 250 | else: 251 | return PolyLine(points, True) 252 | 253 | def make_polygon(v, filled=True): 254 | if filled: return FilledPolygon(v) 255 | else: return PolyLine(v, True) 256 | 257 | def make_polyline(v): 258 | return PolyLine(v, False) 259 | 260 | def make_capsule(length, width): 261 | l, r, t, b = 0, length, width/2, -width/2 262 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 263 | circ0 = make_circle(width/2) 264 | circ1 = make_circle(width/2) 265 | circ1.add_attr(Transform(translation=(length, 0))) 266 | geom = Compound([box, circ0, circ1]) 267 | return geom 268 | 269 | class Compound(Geom): 270 | def __init__(self, gs): 271 | Geom.__init__(self) 272 | self.gs = gs 273 | for g in self.gs: 274 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 275 | def render1(self): 276 | for g in self.gs: 277 | g.render() 278 | 279 | class PolyLine(Geom): 280 | def __init__(self, v, close): 281 | Geom.__init__(self) 282 | self.v = v 283 | self.close = close 284 | self.linewidth = LineWidth(1) 285 | self.add_attr(self.linewidth) 286 | def render1(self): 287 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 288 | for p in self.v: 289 | glVertex3f(p[0], p[1],0) # draw each vertex 290 | glEnd() 291 | def set_linewidth(self, x): 292 | self.linewidth.stroke = x 293 | 294 | class Line(Geom): 295 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 296 | Geom.__init__(self) 297 | self.start = start 298 | self.end = end 299 | self.linewidth = LineWidth(1) 300 | self.add_attr(self.linewidth) 301 | 302 | def render1(self): 303 | glBegin(GL_LINES) 304 | glVertex2f(*self.start) 305 | glVertex2f(*self.end) 306 | glEnd() 307 | 308 | class Image(Geom): 309 | def __init__(self, fname, width, height): 310 | Geom.__init__(self) 311 | self.width = width 312 | self.height = height 313 | img = pyglet.image.load(fname) 314 | self.img = img 315 | self.flip = False 316 | def render1(self): 317 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 318 | 319 | # ================================================================ 320 | 321 | class SimpleImageViewer(object): 322 | def __init__(self, display=None): 323 | self.window = None 324 | self.isopen = False 325 | self.display = display 326 | def imshow(self, arr): 327 | if self.window is None: 328 | height, width, channels = arr.shape 329 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 330 | self.width = width 
331 | self.height = height 332 | self.isopen = True 333 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 334 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 335 | self.window.clear() 336 | self.window.switch_to() 337 | self.window.dispatch_events() 338 | image.blit(0,0) 339 | self.window.flip() 340 | def close(self): 341 | if self.isopen: 342 | self.window.close() 343 | self.isopen = False 344 | def __del__(self): 345 | self.close() -------------------------------------------------------------------------------- /multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /multiagent/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/formation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/formation.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_adversary.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_adversary.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_crypto.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_crypto.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_push.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_push.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_reference.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_reference.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_speaker_listener.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_speaker_listener.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_spread.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_spread.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_tag.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_tag.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_world_comm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_world_comm.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/formation.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from multiagent.core import World, Agent, Landmark 4 | from multiagent.scenario import BaseScenario 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # world characteristics 10 | world.dim_c = 2 11 | num_agents = 3 12 | world.num_agents = num_agents 13 | num_landmarks = num_agents + 1 14 | # adding agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in 
enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = False 19 | agent.silent = True 20 | agent.size = 0.05 21 | # adding landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | landmark.size = 0.07 28 | # Initial Conditions 29 | self.reset_world(world) 30 | return world 31 | 32 | def reset_world(self, world): 33 | # Landmarks characteristics 34 | for landmark in world.landmarks: 35 | landmark.color = np.array([0.15, 0.15, 0.15]) 36 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 37 | landmark.state.p_vel = np.zeros(world.dim_p) 38 | goal = world.landmarks[0] 39 | goal.color = np.array([0.15, 0.65, 0.15]) 40 | goal.state.p_pos = [-0.8, -0.8] 41 | # Leader characteristics 42 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 43 | world.agents[0].adversary = True 44 | world.agents[0].goal_a = goal 45 | # Followers 46 | for i in range(1, world.num_agents): 47 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 48 | world.agents[i].adversary = False 49 | # Random intial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(0.1, 0.9, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | 55 | def benchmark_data(self, agent, world): 56 | # returning data for benchmark purposes 57 | if agent.adversary: 58 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 59 | else: 60 | dists = [] 61 | for l in world.landmarks: 62 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 63 | dists.append(np.sum(np.square(agent.state.p_pos - world.agents[0].state.p_pos))) 64 | return tuple(dists) 65 | 66 | def reward(self, agent, world): 67 | reward = self.outside(agent, world) + self.collosion(agent, world) 68 | if agent.adversary: 69 | reward -= np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 70 | else: 71 | reward -= np.sqrt(np.sum(np.square(agent.state.p_pos - world.agents[0].state.p_pos))) 72 | return reward 73 | 74 | def collosion(self, agent, world): 75 | col_rew = 0 76 | for ag in world.agents: 77 | if not ag.name == agent.name: 78 | if np.sqrt(np.sum(np.square(agent.state.p_pos - ag.state.p_pos))) < 2* agent.size: 79 | col_rew -= 15 80 | for i in range(1, len(world.landmarks)): 81 | if np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[i].state.p_pos))) < 2* agent.size: 82 | col_rew -= 15 83 | return col_rew 84 | 85 | def outside(self, agent, world): 86 | out_rew = 0 87 | if np.sum(np.absolute(agent.state.p_pos)) > 2: 88 | out_rew -= 20 89 | return out_rew 90 | 91 | def observation(self, agent, world): 92 | # position of the landmarks w.r.t the agent 93 | landmark_pos = [] 94 | for landmark in world.landmarks: 95 | landmark_pos.append(landmark.state.p_pos - agent.state.p_pos) 96 | # position of the other agents w.r.t this agent 97 | other_pos = [] 98 | for other in world.agents: 99 | if other is agent: continue 100 | other_pos.append(other.state.p_pos - agent.state.p_pos) 101 | 102 | if not agent.adversary: 103 | return np.concatenate([agent.state.p_pos - world.agents[0].state.p_pos] + landmark_pos + other_pos) 104 | else: 105 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + landmark_pos) -------------------------------------------------------------------------------- /multiagent/scenarios/simple.py: 
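For the formation scenario above, the per-agent observation sizes follow directly from observation(): with 3 agents and 4 landmarks, the leader sees its offset to the goal plus 4 landmark offsets (10 values), while each follower sees its offset to the leader, 4 landmark offsets, and 2 other-agent offsets (14 values). The sketch below checks this by wrapping the scenario in MultiAgentEnv; how the repository's train.py actually builds the environment is not shown in this dump, so this wiring is an assumption.

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios

# build the leader-follower formation world defined above
scenario = scenarios.load('formation.py').Scenario()
world = scenario.make_world()
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
obs_n = env.reset()
print([obs.shape for obs in obs_n])   # expected: [(10,), (14,), (14,)]
print(env.action_space)               # Discrete(5) per agent: no-op, +/-x, +/-y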
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # add agents 9 | world.agents = [Agent() for i in range(3)] 10 | for i, agent in enumerate(world.agents): 11 | agent.name = 'agent %d' % i 12 | agent.collide = False 13 | agent.silent = True 14 | # add landmarks 15 | world.landmarks = [Landmark() for i in range(1)] 16 | for i, landmark in enumerate(world.landmarks): 17 | landmark.name = 'landmark %d' % i 18 | landmark.collide = False 19 | landmark.movable = False 20 | # make initial conditions 21 | self.reset_world(world) 22 | return world 23 | 24 | def reset_world(self, world): 25 | # random properties for agents 26 | for i, agent in enumerate(world.agents): 27 | agent.color = np.array([0.25,0.25,0.25]) 28 | # random properties for landmarks 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.color = np.array([0.75,0.75,0.75]) 31 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 32 | # set random initial states 33 | for agent in world.agents: 34 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 35 | agent.state.p_vel = np.zeros(world.dim_p) 36 | agent.state.c = np.zeros(world.dim_c) 37 | for i, landmark in enumerate(world.landmarks): 38 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 39 | landmark.state.p_vel = np.zeros(world.dim_p) 40 | 41 | def reward(self, agent, world): 42 | dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) 43 | return -dist2 44 | 45 | def observation(self, agent, world): 46 | # get positions of all entities in this agent's reference frame 47 | entity_pos = [] 48 | for entity in world.landmarks: 49 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 50 | return np.concatenate([agent.state.p_vel] + entity_pos) 51 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_adversary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | 8 | def make_world(self): 9 | world = World() 10 | # set any world properties first 11 | world.dim_c = 2 12 | num_agents = 3 13 | world.num_agents = num_agents 14 | num_adversaries = 1 15 | num_landmarks = num_agents - 1 16 | # add agents 17 | world.agents = [Agent() for i in range(num_agents)] 18 | for i, agent in enumerate(world.agents): 19 | agent.name = 'agent %d' % i 20 | agent.collide = False 21 | agent.silent = True 22 | agent.adversary = True if i < num_adversaries else False 23 | agent.size = 0.15 24 | # add landmarks 25 | world.landmarks = [Landmark() for i in range(num_landmarks)] 26 | for i, landmark in enumerate(world.landmarks): 27 | landmark.name = 'landmark %d' % i 28 | landmark.collide = False 29 | landmark.movable = False 30 | landmark.size = 0.08 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 38 | for i in range(1, world.num_agents): 39 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 40 | # random properties for landmarks 41 | for i, landmark in enumerate(world.landmarks): 42 | 
landmark.color = np.array([0.15, 0.15, 0.15]) 43 | # set goal landmark 44 | goal = np.random.choice(world.landmarks) 45 | goal.color = np.array([0.15, 0.65, 0.15]) 46 | for agent in world.agents: 47 | agent.goal_a = goal 48 | # set random initial states 49 | for agent in world.agents: 50 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 51 | agent.state.p_vel = np.zeros(world.dim_p) 52 | agent.state.c = np.zeros(world.dim_c) 53 | for i, landmark in enumerate(world.landmarks): 54 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 55 | landmark.state.p_vel = np.zeros(world.dim_p) 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 61 | else: 62 | dists = [] 63 | for l in world.landmarks: 64 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 65 | dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 66 | return tuple(dists) 67 | 68 | # return all agents that are not adversaries 69 | def good_agents(self, world): 70 | return [agent for agent in world.agents if not agent.adversary] 71 | 72 | # return all adversarial agents 73 | def adversaries(self, world): 74 | return [agent for agent in world.agents if agent.adversary] 75 | 76 | def reward(self, agent, world): 77 | # Agents are rewarded based on minimum agent distance to each landmark 78 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 79 | 80 | def agent_reward(self, agent, world): 81 | # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it 82 | shaped_reward = True 83 | shaped_adv_reward = True 84 | 85 | # Calculate negative reward for adversary 86 | adversary_agents = self.adversaries(world) 87 | if shaped_adv_reward: # distance-based adversary reward 88 | adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) 89 | else: # proximity-based adversary reward (binary) 90 | adv_rew = 0 91 | for a in adversary_agents: 92 | if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: 93 | adv_rew -= 5 94 | 95 | # Calculate positive reward for agents 96 | good_agents = self.good_agents(world) 97 | if shaped_reward: # distance-based agent reward 98 | pos_rew = -min( 99 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 100 | else: # proximity-based agent reward (binary) 101 | pos_rew = 0 102 | if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ 103 | < 2 * agent.goal_a.size: 104 | pos_rew += 5 105 | pos_rew -= min( 106 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 107 | return pos_rew + adv_rew 108 | 109 | def adversary_reward(self, agent, world): 110 | # Rewarded based on proximity to the goal landmark 111 | shaped_reward = True 112 | if shaped_reward: # distance-based reward 113 | return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 114 | else: # proximity-based reward (binary) 115 | adv_rew = 0 116 | if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: 117 | adv_rew += 5 118 | return adv_rew 119 | 120 | 121 | def observation(self, agent, world): 122 | # get positions of all entities in this agent's reference frame 123 | entity_pos = [] 124 | for entity in 
world.landmarks: 125 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 126 | # entity colors 127 | entity_color = [] 128 | for entity in world.landmarks: 129 | entity_color.append(entity.color) 130 | # communication of all other agents 131 | other_pos = [] 132 | for other in world.agents: 133 | if other is agent: continue 134 | other_pos.append(other.state.p_pos - agent.state.p_pos) 135 | 136 | if not agent.adversary: 137 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) 138 | else: 139 | return np.concatenate(entity_pos + other_pos) 140 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_crypto.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scenario: 3 | 1 speaker, 2 listeners (one of which is an adversary). Good agents rewarded for proximity to goal, and distance from 4 | adversary to goal. Adversary is rewarded for its distance to the goal. 5 | """ 6 | 7 | 8 | import numpy as np 9 | from multiagent.core import World, Agent, Landmark 10 | from multiagent.scenario import BaseScenario 11 | import random 12 | 13 | 14 | class CryptoAgent(Agent): 15 | def __init__(self): 16 | super(CryptoAgent, self).__init__() 17 | self.key = None 18 | 19 | class Scenario(BaseScenario): 20 | 21 | def make_world(self): 22 | world = World() 23 | # set any world properties first 24 | num_agents = 3 25 | num_adversaries = 1 26 | num_landmarks = 2 27 | world.dim_c = 4 28 | # add agents 29 | world.agents = [CryptoAgent() for i in range(num_agents)] 30 | for i, agent in enumerate(world.agents): 31 | agent.name = 'agent %d' % i 32 | agent.collide = False 33 | agent.adversary = True if i < num_adversaries else False 34 | agent.speaker = True if i == 2 else False 35 | agent.movable = False 36 | # add landmarks 37 | world.landmarks = [Landmark() for i in range(num_landmarks)] 38 | for i, landmark in enumerate(world.landmarks): 39 | landmark.name = 'landmark %d' % i 40 | landmark.collide = False 41 | landmark.movable = False 42 | # make initial conditions 43 | self.reset_world(world) 44 | return world 45 | 46 | 47 | def reset_world(self, world): 48 | # random properties for agents 49 | for i, agent in enumerate(world.agents): 50 | agent.color = np.array([0.25, 0.25, 0.25]) 51 | if agent.adversary: 52 | agent.color = np.array([0.75, 0.25, 0.25]) 53 | agent.key = None 54 | # random properties for landmarks 55 | color_list = [np.zeros(world.dim_c) for i in world.landmarks] 56 | for i, color in enumerate(color_list): 57 | color[i] += 1 58 | for color, landmark in zip(color_list, world.landmarks): 59 | landmark.color = color 60 | # set goal landmark 61 | goal = np.random.choice(world.landmarks) 62 | world.agents[1].color = goal.color 63 | world.agents[2].key = np.random.choice(world.landmarks).color 64 | 65 | for agent in world.agents: 66 | agent.goal_a = goal 67 | 68 | # set random initial states 69 | for agent in world.agents: 70 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 71 | agent.state.p_vel = np.zeros(world.dim_p) 72 | agent.state.c = np.zeros(world.dim_c) 73 | for i, landmark in enumerate(world.landmarks): 74 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 75 | landmark.state.p_vel = np.zeros(world.dim_p) 76 | 77 | 78 | def benchmark_data(self, agent, world): 79 | # returns data for benchmarking purposes 80 | return (agent.state.c, agent.goal_a.color) 81 | 82 | # return all agents that are not adversaries 83 | def 
good_listeners(self, world): 84 | return [agent for agent in world.agents if not agent.adversary and not agent.speaker] 85 | 86 | # return all agents that are not adversaries 87 | def good_agents(self, world): 88 | return [agent for agent in world.agents if not agent.adversary] 89 | 90 | # return all adversarial agents 91 | def adversaries(self, world): 92 | return [agent for agent in world.agents if agent.adversary] 93 | 94 | def reward(self, agent, world): 95 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 96 | 97 | def agent_reward(self, agent, world): 98 | # Agents rewarded if Bob can reconstruct message, but adversary (Eve) cannot 99 | good_listeners = self.good_listeners(world) 100 | adversaries = self.adversaries(world) 101 | good_rew = 0 102 | adv_rew = 0 103 | for a in good_listeners: 104 | if (a.state.c == np.zeros(world.dim_c)).all(): 105 | continue 106 | else: 107 | good_rew -= np.sum(np.square(a.state.c - agent.goal_a.color)) 108 | for a in adversaries: 109 | if (a.state.c == np.zeros(world.dim_c)).all(): 110 | continue 111 | else: 112 | adv_l1 = np.sum(np.square(a.state.c - agent.goal_a.color)) 113 | adv_rew += adv_l1 114 | return adv_rew + good_rew 115 | 116 | def adversary_reward(self, agent, world): 117 | # Adversary (Eve) is rewarded if it can reconstruct original goal 118 | rew = 0 119 | if not (agent.state.c == np.zeros(world.dim_c)).all(): 120 | rew -= np.sum(np.square(agent.state.c - agent.goal_a.color)) 121 | return rew 122 | 123 | 124 | def observation(self, agent, world): 125 | # goal color 126 | goal_color = np.zeros(world.dim_color) 127 | if agent.goal_a is not None: 128 | goal_color = agent.goal_a.color 129 | 130 | # get positions of all entities in this agent's reference frame 131 | entity_pos = [] 132 | for entity in world.landmarks: 133 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 134 | # communication of all other agents 135 | comm = [] 136 | for other in world.agents: 137 | if other is agent or (other.state.c is None) or not other.speaker: continue 138 | comm.append(other.state.c) 139 | 140 | confer = np.array([0]) 141 | 142 | if world.agents[2].key is None: 143 | confer = np.array([1]) 144 | key = np.zeros(world.dim_c) 145 | goal_color = np.zeros(world.dim_c) 146 | else: 147 | key = world.agents[2].key 148 | 149 | prnt = False 150 | # speaker 151 | if agent.speaker: 152 | if prnt: 153 | print('speaker') 154 | print(agent.state.c) 155 | print(np.concatenate([goal_color] + [key] + [confer] + [np.random.randn(1)])) 156 | return np.concatenate([goal_color] + [key]) 157 | # listener 158 | if not agent.speaker and not agent.adversary: 159 | if prnt: 160 | print('listener') 161 | print(agent.state.c) 162 | print(np.concatenate([key] + comm + [confer])) 163 | return np.concatenate([key] + comm) 164 | if not agent.speaker and agent.adversary: 165 | if prnt: 166 | print('adversary') 167 | print(agent.state.c) 168 | print(np.concatenate(comm + [confer])) 169 | return np.concatenate(comm) 170 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_push.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 2 10 | num_agents = 2 11 | num_adversaries = 1 12 
| num_landmarks = 2 13 | # add agents 14 | world.agents = [Agent() for i in range(num_agents)] 15 | for i, agent in enumerate(world.agents): 16 | agent.name = 'agent %d' % i 17 | agent.collide = True 18 | agent.silent = True 19 | if i < num_adversaries: 20 | agent.adversary = True 21 | else: 22 | agent.adversary = False 23 | # add landmarks 24 | world.landmarks = [Landmark() for i in range(num_landmarks)] 25 | for i, landmark in enumerate(world.landmarks): 26 | landmark.name = 'landmark %d' % i 27 | landmark.collide = False 28 | landmark.movable = False 29 | # make initial conditions 30 | self.reset_world(world) 31 | return world 32 | 33 | def reset_world(self, world): 34 | # random properties for landmarks 35 | for i, landmark in enumerate(world.landmarks): 36 | landmark.color = np.array([0.1, 0.1, 0.1]) 37 | landmark.color[i + 1] += 0.8 38 | landmark.index = i 39 | # set goal landmark 40 | goal = np.random.choice(world.landmarks) 41 | for i, agent in enumerate(world.agents): 42 | agent.goal_a = goal 43 | agent.color = np.array([0.25, 0.25, 0.25]) 44 | if agent.adversary: 45 | agent.color = np.array([0.75, 0.25, 0.25]) 46 | else: 47 | j = goal.index 48 | agent.color[j + 1] += 0.5 49 | # set random initial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | for i, landmark in enumerate(world.landmarks): 55 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 56 | landmark.state.p_vel = np.zeros(world.dim_p) 57 | 58 | def reward(self, agent, world): 59 | # Agents are rewarded based on minimum agent distance to each landmark 60 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 61 | 62 | def agent_reward(self, agent, world): 63 | # the distance to the goal 64 | return -np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 65 | 66 | def adversary_reward(self, agent, world): 67 | # keep the nearest good agents away from the goal 68 | agent_dist = [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in world.agents if not a.adversary] 69 | pos_rew = min(agent_dist) 70 | #nearest_agent = world.good_agents[np.argmin(agent_dist)] 71 | #neg_rew = np.sqrt(np.sum(np.square(nearest_agent.state.p_pos - agent.state.p_pos))) 72 | neg_rew = np.sqrt(np.sum(np.square(agent.goal_a.state.p_pos - agent.state.p_pos))) 73 | #neg_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in world.good_agents]) 74 | return pos_rew - neg_rew 75 | 76 | def observation(self, agent, world): 77 | # get positions of all entities in this agent's reference frame 78 | entity_pos = [] 79 | for entity in world.landmarks: # world.entities: 80 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 81 | # entity colors 82 | entity_color = [] 83 | for entity in world.landmarks: # world.entities: 84 | entity_color.append(entity.color) 85 | # communication of all other agents 86 | comm = [] 87 | other_pos = [] 88 | for other in world.agents: 89 | if other is agent: continue 90 | comm.append(other.state.c) 91 | other_pos.append(other.state.p_pos - agent.state.p_pos) 92 | if not agent.adversary: 93 | return np.concatenate([agent.state.p_vel] + [agent.goal_a.state.p_pos - agent.state.p_pos] + [agent.color] + entity_pos + entity_color + other_pos) 94 | else: 95 | #other_pos = list(reversed(other_pos)) if random.uniform(0,1) > 0.5 else other_pos # randomize position of 
other agents in adversary network 96 | return np.concatenate([agent.state.p_vel] + entity_pos + other_pos) 97 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_reference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 10 10 | world.collaborative = True # whether agents share rewards 11 | # add agents 12 | world.agents = [Agent() for i in range(2)] 13 | for i, agent in enumerate(world.agents): 14 | agent.name = 'agent %d' % i 15 | agent.collide = False 16 | # add landmarks 17 | world.landmarks = [Landmark() for i in range(3)] 18 | for i, landmark in enumerate(world.landmarks): 19 | landmark.name = 'landmark %d' % i 20 | landmark.collide = False 21 | landmark.movable = False 22 | # make initial conditions 23 | self.reset_world(world) 24 | return world 25 | 26 | def reset_world(self, world): 27 | # assign goals to agents 28 | for agent in world.agents: 29 | agent.goal_a = None 30 | agent.goal_b = None 31 | # want other agent to go to the goal landmark 32 | world.agents[0].goal_a = world.agents[1] 33 | world.agents[0].goal_b = np.random.choice(world.landmarks) 34 | world.agents[1].goal_a = world.agents[0] 35 | world.agents[1].goal_b = np.random.choice(world.landmarks) 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.25,0.25,0.25]) 39 | # random properties for landmarks 40 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 41 | world.landmarks[1].color = np.array([0.25,0.75,0.25]) 42 | world.landmarks[2].color = np.array([0.25,0.25,0.75]) 43 | # special colors for goals 44 | world.agents[0].goal_a.color = world.agents[0].goal_b.color 45 | world.agents[1].goal_a.color = world.agents[1].goal_b.color 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 | landmark.state.p_vel = np.zeros(world.dim_p) 54 | 55 | def reward(self, agent, world): 56 | if agent.goal_a is None or agent.goal_b is None: 57 | return 0.0 58 | dist2 = np.sum(np.square(agent.goal_a.state.p_pos - agent.goal_b.state.p_pos)) 59 | return -dist2 60 | 61 | def observation(self, agent, world): 62 | # goal color 63 | goal_color = [np.zeros(world.dim_color), np.zeros(world.dim_color)] 64 | if agent.goal_b is not None: 65 | goal_color[1] = agent.goal_b.color 66 | 67 | # get positions of all entities in this agent's reference frame 68 | entity_pos = [] 69 | for entity in world.landmarks: 70 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 71 | # entity colors 72 | entity_color = [] 73 | for entity in world.landmarks: 74 | entity_color.append(entity.color) 75 | # communication of all other agents 76 | comm = [] 77 | for other in world.agents: 78 | if other is agent: continue 79 | comm.append(other.state.c) 80 | return np.concatenate([agent.state.p_vel] + entity_pos + [goal_color[1]] + comm) 81 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_speaker_listener.py: 
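simple_reference.py sets world.collaborative = True, which MultiAgentEnv.__init__ turns into shared_reward; in step() every agent then receives the summed team reward. A small illustration of that sharing step, with hypothetical per-agent values:

# reward sharing as done in MultiAgentEnv.step (the numbers are made up)
reward_n = [-1.2, -0.7]                  # what reward_callback returned per agent
shared_reward = True                     # because world.collaborative is True here
reward = sum(reward_n)                   # -1.9
if shared_reward:
    reward_n = [reward] * len(reward_n)  # both agents receive -1.9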
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 |     def make_world(self): 7 |         world = World() 8 |         # set any world properties first 9 |         world.dim_c = 3 10 |         num_landmarks = 3 11 |         world.collaborative = True 12 |         # add agents 13 |         world.agents = [Agent() for i in range(2)] 14 |         for i, agent in enumerate(world.agents): 15 |             agent.name = 'agent %d' % i 16 |             agent.collide = False 17 |             agent.size = 0.075 18 |         # speaker 19 |         world.agents[0].movable = False 20 |         # listener 21 |         world.agents[1].silent = True 22 |         # add landmarks 23 |         world.landmarks = [Landmark() for i in range(num_landmarks)] 24 |         for i, landmark in enumerate(world.landmarks): 25 |             landmark.name = 'landmark %d' % i 26 |             landmark.collide = False 27 |             landmark.movable = False 28 |             landmark.size = 0.04 29 |         # make initial conditions 30 |         self.reset_world(world) 31 |         return world 32 | 33 |     def reset_world(self, world): 34 |         # assign goals to agents 35 |         for agent in world.agents: 36 |             agent.goal_a = None 37 |             agent.goal_b = None 38 |         # want listener to go to the goal landmark 39 |         world.agents[0].goal_a = world.agents[1] 40 |         world.agents[0].goal_b = np.random.choice(world.landmarks) 41 |         # random properties for agents 42 |         for i, agent in enumerate(world.agents): 43 |             agent.color = np.array([0.25,0.25,0.25]) 44 |         # random properties for landmarks 45 |         world.landmarks[0].color = np.array([0.65,0.15,0.15]) 46 |         world.landmarks[1].color = np.array([0.15,0.65,0.15]) 47 |         world.landmarks[2].color = np.array([0.15,0.15,0.65]) 48 |         # special colors for goals 49 |         world.agents[0].goal_a.color = world.agents[0].goal_b.color + np.array([0.45, 0.45, 0.45]) 50 |         # set random initial states 51 |         for agent in world.agents: 52 |             agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 |             agent.state.p_vel = np.zeros(world.dim_p) 54 |             agent.state.c = np.zeros(world.dim_c) 55 |         for i, landmark in enumerate(world.landmarks): 56 |             landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 57 |             landmark.state.p_vel = np.zeros(world.dim_p) 58 | 59 |     def benchmark_data(self, agent, world): 60 |         # returns data for benchmarking purposes 61 |         return self.reward(agent, world) 62 | 63 |     def reward(self, agent, world): 64 |         # squared distance from listener to landmark 65 |         a = world.agents[0] 66 |         dist2 = np.sum(np.square(a.goal_a.state.p_pos - a.goal_b.state.p_pos)) 67 |         return -dist2 68 | 69 |     def observation(self, agent, world): 70 |         # goal color 71 |         goal_color = np.zeros(world.dim_color) 72 |         if agent.goal_b is not None: 73 |             goal_color = agent.goal_b.color 74 | 75 |         # get positions of all entities in this agent's reference frame 76 |         entity_pos = [] 77 |         for entity in world.landmarks: 78 |             entity_pos.append(entity.state.p_pos - agent.state.p_pos) 79 | 80 |         # communication of all other agents 81 |         comm = [] 82 |         for other in world.agents: 83 |             if other is agent or (other.state.c is None): continue 84 |             comm.append(other.state.c) 85 | 86 |         # speaker 87 |         if not agent.movable: 88 |             return np.concatenate([goal_color]) 89 |         # listener 90 |         if agent.silent: 91 |             return np.concatenate([agent.state.p_vel] + entity_pos + comm) 92 | 93 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 |
from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 3 12 | num_landmarks = 3 13 | world.collaborative = True 14 | # add agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = True 19 | agent.silent = True 20 | agent.size = 0.15 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | # make initial conditions 28 | self.reset_world(world) 29 | return world 30 | 31 | def reset_world(self, world): 32 | # random properties for agents 33 | for i, agent in enumerate(world.agents): 34 | agent.color = np.array([0.35, 0.35, 0.85]) 35 | # random properties for landmarks 36 | for i, landmark in enumerate(world.landmarks): 37 | landmark.color = np.array([0.25, 0.25, 0.25]) 38 | # set random initial states 39 | for agent in world.agents: 40 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 41 | agent.state.p_vel = np.zeros(world.dim_p) 42 | agent.state.c = np.zeros(world.dim_c) 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 45 | landmark.state.p_vel = np.zeros(world.dim_p) 46 | 47 | def benchmark_data(self, agent, world): 48 | rew = 0 49 | collisions = 0 50 | occupied_landmarks = 0 51 | min_dists = 0 52 | for l in world.landmarks: 53 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 54 | min_dists += min(dists) 55 | rew -= min(dists) 56 | if min(dists) < 0.1: 57 | occupied_landmarks += 1 58 | if agent.collide: 59 | for a in world.agents: 60 | if self.is_collision(a, agent): 61 | rew -= 1 62 | collisions += 1 63 | return (rew, collisions, min_dists, occupied_landmarks) 64 | 65 | 66 | def is_collision(self, agent1, agent2): 67 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 68 | dist = np.sqrt(np.sum(np.square(delta_pos))) 69 | dist_min = agent1.size + agent2.size 70 | return True if dist < dist_min else False 71 | 72 | def reward(self, agent, world): 73 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 74 | rew = 0 75 | for l in world.landmarks: 76 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 77 | rew -= min(dists) 78 | if agent.collide: 79 | for a in world.agents: 80 | if self.is_collision(a, agent): 81 | rew -= 1 82 | return rew 83 | 84 | def observation(self, agent, world): 85 | # get positions of all entities in this agent's reference frame 86 | entity_pos = [] 87 | for entity in world.landmarks: # world.entities: 88 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 89 | # entity colors 90 | entity_color = [] 91 | for entity in world.landmarks: # world.entities: 92 | entity_color.append(entity.color) 93 | # communication of all other agents 94 | comm = [] 95 | other_pos = [] 96 | for other in world.agents: 97 | if other is agent: continue 98 | comm.append(other.state.c) 99 | other_pos.append(other.state.p_pos - agent.state.p_pos) 100 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 101 | 
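Each scenario file above implements the same small interface: make_world() builds a World populated with agents and landmarks, reset_world() randomises it, and reward()/observation() are queried once per agent at every step. The sketch below (not part of the repository; it only assumes the multiagent package is importable) drives the simple_spread scenario directly, without the MADDPG trainer, to show how these hooks are consumed; make_env() in train.py further below performs the same wiring and then wraps everything in MultiAgentEnv.

from multiagent.scenarios.simple_spread import Scenario

scenario = Scenario()
world = scenario.make_world()    # 3 agents, 3 landmarks, world.collaborative = True
scenario.reset_world(world)      # random positions, zero velocities and comm channels
for agent in world.agents:
    obs = scenario.observation(agent, world)
    # vel(2) + pos(2) + 3 landmark offsets(6) + 2 other-agent offsets(4) + 2 comm vectors(4) = 18 dims
    print(agent.name, obs.shape, scenario.reward(agent, world))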
-------------------------------------------------------------------------------- /multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents = 1 12 | num_adversaries = 3 13 | num_agents = num_adversaries + num_good_agents 14 | num_landmarks = 2 15 | # add agents 16 | world.agents = [Agent() for i in range(num_agents)] 17 | for i, agent in enumerate(world.agents): 18 | agent.name = 'agent %d' % i 19 | agent.collide = True 20 | agent.silent = True 21 | agent.adversary = True if i < num_adversaries else False 22 | agent.size = 0.075 if agent.adversary else 0.05 23 | agent.accel = 3.0 if agent.adversary else 4.0 24 | #agent.accel = 20.0 if agent.adversary else 25.0 25 | agent.max_speed = 1.0 if agent.adversary else 1.3 26 | # add landmarks 27 | world.landmarks = [Landmark() for i in range(num_landmarks)] 28 | for i, landmark in enumerate(world.landmarks): 29 | landmark.name = 'landmark %d' % i 30 | landmark.collide = True 31 | landmark.movable = False 32 | landmark.size = 0.2 33 | landmark.boundary = False 34 | # make initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | 39 | def reset_world(self, world): 40 | # random properties for agents 41 | for i, agent in enumerate(world.agents): 42 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 43 | # random properties for landmarks 44 | for i, landmark in enumerate(world.landmarks): 45 | landmark.color = np.array([0.25, 0.25, 0.25]) 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | if not landmark.boundary: 53 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 54 | landmark.state.p_vel = np.zeros(world.dim_p) 55 | 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | collisions = 0 61 | for a in self.good_agents(world): 62 | if self.is_collision(a, agent): 63 | collisions += 1 64 | return collisions 65 | else: 66 | return 0 67 | 68 | 69 | def is_collision(self, agent1, agent2): 70 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 71 | dist = np.sqrt(np.sum(np.square(delta_pos))) 72 | dist_min = agent1.size + agent2.size 73 | return True if dist < dist_min else False 74 | 75 | # return all agents that are not adversaries 76 | def good_agents(self, world): 77 | return [agent for agent in world.agents if not agent.adversary] 78 | 79 | # return all adversarial agents 80 | def adversaries(self, world): 81 | return [agent for agent in world.agents if agent.adversary] 82 | 83 | 84 | def reward(self, agent, world): 85 | # Agents are rewarded based on minimum agent distance to each landmark 86 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 87 | return main_reward 88 | 89 | def agent_reward(self, agent, world): 90 | # Agents are negatively rewarded if caught by adversaries 91 | rew = 0 92 | shape = False 93 | adversaries = self.adversaries(world) 94 | if shape: # 
reward can optionally be shaped (increased reward for increased distance from adversary) 95 | for adv in adversaries: 96 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 97 | if agent.collide: 98 | for a in adversaries: 99 | if self.is_collision(a, agent): 100 | rew -= 10 101 | 102 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 103 | def bound(x): 104 | if x < 0.9: 105 | return 0 106 | if x < 1.0: 107 | return (x - 0.9) * 10 108 | return min(np.exp(2 * x - 2), 10) 109 | for p in range(world.dim_p): 110 | x = abs(agent.state.p_pos[p]) 111 | rew -= bound(x) 112 | 113 | return rew 114 | 115 | def adversary_reward(self, agent, world): 116 | # Adversaries are rewarded for collisions with agents 117 | rew = 0 118 | shape = False 119 | agents = self.good_agents(world) 120 | adversaries = self.adversaries(world) 121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 122 | for adv in adversaries: 123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 124 | if agent.collide: 125 | for ag in agents: 126 | for adv in adversaries: 127 | if self.is_collision(ag, adv): 128 | rew += 10 129 | return rew 130 | 131 | def observation(self, agent, world): 132 | # get positions of all entities in this agent's reference frame 133 | entity_pos = [] 134 | for entity in world.landmarks: 135 | if not entity.boundary: 136 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 137 | # communication of all other agents 138 | comm = [] 139 | other_pos = [] 140 | other_vel = [] 141 | for other in world.agents: 142 | if other is agent: continue 143 | comm.append(other.state.c) 144 | other_pos.append(other.state.p_pos - agent.state.p_pos) 145 | if not other.adversary: 146 | other_vel.append(other.state.p_vel) 147 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 148 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_world_comm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 4 11 | #world.damping = 1 12 | num_good_agents = 2 13 | num_adversaries = 4 14 | num_agents = num_adversaries + num_good_agents 15 | num_landmarks = 1 16 | num_food = 2 17 | num_forests = 2 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.leader = True if i == 0 else False 24 | agent.silent = True if i > 0 else False 25 | agent.adversary = True if i < num_adversaries else False 26 | agent.size = 0.075 if agent.adversary else 0.045 27 | agent.accel = 3.0 if agent.adversary else 4.0 28 | #agent.accel = 20.0 if agent.adversary else 25.0 29 | agent.max_speed = 1.0 if agent.adversary else 1.3 30 | # add landmarks 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmark %d' % i 34 | landmark.collide = True 35 | landmark.movable = False 36 | landmark.size = 0.2 37 | landmark.boundary = False 38 | world.food = [Landmark() for i in 
range(num_food)] 39 | for i, landmark in enumerate(world.food): 40 | landmark.name = 'food %d' % i 41 | landmark.collide = False 42 | landmark.movable = False 43 | landmark.size = 0.03 44 | landmark.boundary = False 45 | world.forests = [Landmark() for i in range(num_forests)] 46 | for i, landmark in enumerate(world.forests): 47 | landmark.name = 'forest %d' % i 48 | landmark.collide = False 49 | landmark.movable = False 50 | landmark.size = 0.3 51 | landmark.boundary = False 52 | world.landmarks += world.food 53 | world.landmarks += world.forests 54 | #world.landmarks += self.set_boundaries(world) # world boundaries now penalized with negative reward 55 | # make initial conditions 56 | self.reset_world(world) 57 | return world 58 | 59 | def set_boundaries(self, world): 60 | boundary_list = [] 61 | landmark_size = 1 62 | edge = 1 + landmark_size 63 | num_landmarks = int(edge * 2 / landmark_size) 64 | for x_pos in [-edge, edge]: 65 | for i in range(num_landmarks): 66 | l = Landmark() 67 | l.state.p_pos = np.array([x_pos, -1 + i * landmark_size]) 68 | boundary_list.append(l) 69 | 70 | for y_pos in [-edge, edge]: 71 | for i in range(num_landmarks): 72 | l = Landmark() 73 | l.state.p_pos = np.array([-1 + i * landmark_size, y_pos]) 74 | boundary_list.append(l) 75 | 76 | for i, l in enumerate(boundary_list): 77 | l.name = 'boundary %d' % i 78 | l.collide = True 79 | l.movable = False 80 | l.boundary = True 81 | l.color = np.array([0.75, 0.75, 0.75]) 82 | l.size = landmark_size 83 | l.state.p_vel = np.zeros(world.dim_p) 84 | 85 | return boundary_list 86 | 87 | 88 | def reset_world(self, world): 89 | # random properties for agents 90 | for i, agent in enumerate(world.agents): 91 | agent.color = np.array([0.45, 0.95, 0.45]) if not agent.adversary else np.array([0.95, 0.45, 0.45]) 92 | agent.color -= np.array([0.3, 0.3, 0.3]) if agent.leader else np.array([0, 0, 0]) 93 | # random properties for landmarks 94 | for i, landmark in enumerate(world.landmarks): 95 | landmark.color = np.array([0.25, 0.25, 0.25]) 96 | for i, landmark in enumerate(world.food): 97 | landmark.color = np.array([0.15, 0.15, 0.65]) 98 | for i, landmark in enumerate(world.forests): 99 | landmark.color = np.array([0.6, 0.9, 0.6]) 100 | # set random initial states 101 | for agent in world.agents: 102 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 103 | agent.state.p_vel = np.zeros(world.dim_p) 104 | agent.state.c = np.zeros(world.dim_c) 105 | for i, landmark in enumerate(world.landmarks): 106 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 107 | landmark.state.p_vel = np.zeros(world.dim_p) 108 | for i, landmark in enumerate(world.food): 109 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 110 | landmark.state.p_vel = np.zeros(world.dim_p) 111 | for i, landmark in enumerate(world.forests): 112 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 113 | landmark.state.p_vel = np.zeros(world.dim_p) 114 | 115 | def benchmark_data(self, agent, world): 116 | if agent.adversary: 117 | collisions = 0 118 | for a in self.good_agents(world): 119 | if self.is_collision(a, agent): 120 | collisions += 1 121 | return collisions 122 | else: 123 | return 0 124 | 125 | 126 | def is_collision(self, agent1, agent2): 127 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 128 | dist = np.sqrt(np.sum(np.square(delta_pos))) 129 | dist_min = agent1.size + agent2.size 130 | return True if dist < dist_min else False 131 | 132 | 133 | # return all agents that are not adversaries 134 | def 
good_agents(self, world): 135 | return [agent for agent in world.agents if not agent.adversary] 136 | 137 | # return all adversarial agents 138 | def adversaries(self, world): 139 | return [agent for agent in world.agents if agent.adversary] 140 | 141 | 142 | def reward(self, agent, world): 143 | # Agents are rewarded based on minimum agent distance to each landmark 144 | #boundary_reward = -10 if self.outside_boundary(agent) else 0 145 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 146 | return main_reward 147 | 148 | def outside_boundary(self, agent): 149 | if agent.state.p_pos[0] > 1 or agent.state.p_pos[0] < -1 or agent.state.p_pos[1] > 1 or agent.state.p_pos[1] < -1: 150 | return True 151 | else: 152 | return False 153 | 154 | 155 | def agent_reward(self, agent, world): 156 | # Agents are rewarded based on minimum agent distance to each landmark 157 | rew = 0 158 | shape = False 159 | adversaries = self.adversaries(world) 160 | if shape: 161 | for adv in adversaries: 162 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 163 | if agent.collide: 164 | for a in adversaries: 165 | if self.is_collision(a, agent): 166 | rew -= 5 167 | def bound(x): 168 | if x < 0.9: 169 | return 0 170 | if x < 1.0: 171 | return (x - 0.9) * 10 172 | return min(np.exp(2 * x - 2), 10) # 1 + (x - 1) * (x - 1) 173 | 174 | for p in range(world.dim_p): 175 | x = abs(agent.state.p_pos[p]) 176 | rew -= 2 * bound(x) 177 | 178 | for food in world.food: 179 | if self.is_collision(agent, food): 180 | rew += 2 181 | rew += 0.05 * min([np.sqrt(np.sum(np.square(food.state.p_pos - agent.state.p_pos))) for food in world.food]) 182 | 183 | return rew 184 | 185 | def adversary_reward(self, agent, world): 186 | # Agents are rewarded based on minimum agent distance to each landmark 187 | rew = 0 188 | shape = True 189 | agents = self.good_agents(world) 190 | adversaries = self.adversaries(world) 191 | if shape: 192 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 193 | if agent.collide: 194 | for ag in agents: 195 | for adv in adversaries: 196 | if self.is_collision(ag, adv): 197 | rew += 5 198 | return rew 199 | 200 | 201 | def observation2(self, agent, world): 202 | # get positions of all entities in this agent's reference frame 203 | entity_pos = [] 204 | for entity in world.landmarks: 205 | if not entity.boundary: 206 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 207 | 208 | food_pos = [] 209 | for entity in world.food: 210 | if not entity.boundary: 211 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 212 | # communication of all other agents 213 | comm = [] 214 | other_pos = [] 215 | other_vel = [] 216 | for other in world.agents: 217 | if other is agent: continue 218 | comm.append(other.state.c) 219 | other_pos.append(other.state.p_pos - agent.state.p_pos) 220 | if not other.adversary: 221 | other_vel.append(other.state.p_vel) 222 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 223 | 224 | def observation(self, agent, world): 225 | # get positions of all entities in this agent's reference frame 226 | entity_pos = [] 227 | for entity in world.landmarks: 228 | if not entity.boundary: 229 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 230 | 231 | in_forest = [np.array([-1]), np.array([-1])] 232 | inf1 = False 233 | inf2 = False 234 | if self.is_collision(agent, world.forests[0]): 235 | 
in_forest[0] = np.array([1]) 236 | inf1= True 237 | if self.is_collision(agent, world.forests[1]): 238 | in_forest[1] = np.array([1]) 239 | inf2 = True 240 | 241 | food_pos = [] 242 | for entity in world.food: 243 | if not entity.boundary: 244 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 245 | # communication of all other agents 246 | comm = [] 247 | other_pos = [] 248 | other_vel = [] 249 | for other in world.agents: 250 | if other is agent: continue 251 | comm.append(other.state.c) 252 | oth_f1 = self.is_collision(other, world.forests[0]) 253 | oth_f2 = self.is_collision(other, world.forests[1]) 254 | if (inf1 and oth_f1) or (inf2 and oth_f2) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #without forest vis 255 | other_pos.append(other.state.p_pos - agent.state.p_pos) 256 | if not other.adversary: 257 | other_vel.append(other.state.p_vel) 258 | else: 259 | other_pos.append([0, 0]) 260 | if not other.adversary: 261 | other_vel.append([0, 0]) 262 | 263 | # to tell the pred when the prey are in the forest 264 | prey_forest = [] 265 | ga = self.good_agents(world) 266 | for a in ga: 267 | if any([self.is_collision(a, f) for f in world.forests]): 268 | prey_forest.append(np.array([1])) 269 | else: 270 | prey_forest.append(np.array([-1])) 271 | # to tell leader when pred are in forest 272 | prey_forest_lead = [] 273 | for f in world.forests: 274 | if any([self.is_collision(a, f) for a in ga]): 275 | prey_forest_lead.append(np.array([1])) 276 | else: 277 | prey_forest_lead.append(np.array([-1])) 278 | 279 | comm = [world.agents[0].state.c] 280 | 281 | if agent.adversary and not agent.leader: 282 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 283 | if agent.leader: 284 | return np.concatenate( 285 | [agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 286 | else: 287 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel) 288 | 289 | 290 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='maddpg', 4 | version='0.0.1', 5 | description='Multi-Agent Deep Deterministic Policy Gradient', 6 | url='https://github.com/openai/maddpg', 7 | author='Igor Mordatch', 8 | author_email='mordatch@openai.com', 9 | packages=find_packages(), 10 | include_package_data=True, 11 | zip_safe=False, 12 | install_requires=['gym', 'numpy-stl'] 13 | ) 14 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import tensorflow as tf 4 | import time 5 | import pickle 6 | 7 | import maddpg.common.tf_util as U 8 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 9 | import tensorflow.contrib.layers as layers 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 13 | # Environment 14 | parser.add_argument("--scenario", type=str, default="formation", help="name of the scenario script") 15 | parser.add_argument("--max-episode-len", type=int, default=120, help="maximum episode length") 16 | parser.add_argument("--num-episodes", type=int, default=50000, help="number of episodes") 17 
| parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries") 18 | parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents") 19 | parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries") 20 | # Core training parameters 21 | parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer") 22 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 23 | parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time") 24 | parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp") 25 | # Checkpointing 26 | parser.add_argument("--exp-name", type=str, default="formation", help="name of the experiment") 27 | parser.add_argument("--save-dir", type=str, default="/home/islam/training/policy/", help="directory in which training state and model should be saved") 28 | parser.add_argument("--save-rate", type=int, default=100, help="save model once every time this many episodes are completed") 29 | parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded") 30 | # Evaluation 31 | parser.add_argument("--restore", action="store_true", default=False) 32 | parser.add_argument("--display", action="store_true", default=True) 33 | parser.add_argument("--benchmark", action="store_true", default=False) 34 | parser.add_argument("--benchmark-iters", type=int, default=100000, help="number of iterations run for benchmarking") 35 | parser.add_argument("--benchmark-dir", type=str, default="/home/islam/training/benchmark/", help="directory where benchmark data is saved") 36 | parser.add_argument("--plots-dir", type=str, default="/home/islam/training/curves/", help="directory where plot data is saved") 37 | return parser.parse_args() 38 | 39 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None): 40 | # This model takes as input an observation and returns values of all actions 41 | with tf.variable_scope(scope, reuse=reuse): 42 | out = input 43 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 44 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 45 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None) 46 | return out 47 | 48 | def make_env(scenario_name, arglist, benchmark=False): 49 | from multiagent.environment import MultiAgentEnv 50 | import multiagent.scenarios as scenarios 51 | 52 | # load scenario from script 53 | scenario = scenarios.load(scenario_name + ".py").Scenario() 54 | # create world 55 | world = scenario.make_world() 56 | # create multiagent environment 57 | if benchmark: 58 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) 59 | else: 60 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 61 | return env 62 | 63 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 64 | trainers = [] 65 | model = mlp_model 66 | trainer = MADDPGAgentTrainer 67 | for i in range(num_adversaries): 68 | trainers.append(trainer( 69 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 70 | local_q_func=(arglist.adv_policy=='ddpg'))) 71 | for i in range(num_adversaries, env.n): 72 | trainers.append(trainer( 73 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 
74 | local_q_func=(arglist.good_policy=='ddpg'))) 75 | return trainers 76 | 77 | 78 | def train(arglist): 79 | with U.single_threaded_session(): 80 | # Create environment 81 | env = make_env(arglist.scenario, arglist, arglist.benchmark) 82 | # Create agent trainers 83 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 84 | num_adversaries = min(env.n, arglist.num_adversaries) 85 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) 86 | print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) 87 | 88 | # Initialize 89 | U.initialize() 90 | 91 | # Load previous results, if necessary 92 | if arglist.load_dir == "": 93 | arglist.load_dir = arglist.save_dir 94 | if arglist.display or arglist.restore or arglist.benchmark: 95 | print('Loading previous state...') 96 | U.load_state(arglist.load_dir) 97 | 98 | episode_rewards = [0.0] # sum of rewards for all agents 99 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 100 | final_ep_rewards = [] # sum of rewards for training curve 101 | final_ep_ag_rewards = [] # agent rewards for training curve 102 | agent_info = [[[]]] # placeholder for benchmarking info 103 | saver = tf.train.Saver() 104 | obs_n = env.reset() 105 | episode_step = 0 106 | train_step = 0 107 | t_start = time.time() 108 | 109 | print('Starting iterations...') 110 | while True: 111 | # get action 112 | action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)] 113 | # environment step 114 | new_obs_n, rew_n, done_n, info_n = env.step(action_n) 115 | episode_step += 1 116 | done = all(done_n) 117 | terminal = (episode_step >= arglist.max_episode_len) 118 | # collect experience 119 | for i, agent in enumerate(trainers): 120 | agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) 121 | obs_n = new_obs_n 122 | 123 | for i, rew in enumerate(rew_n): 124 | episode_rewards[-1] += rew 125 | agent_rewards[i][-1] += rew 126 | 127 | if done or terminal: 128 | obs_n = env.reset() 129 | episode_step = 0 130 | episode_rewards.append(0) 131 | for a in agent_rewards: 132 | a.append(0) 133 | agent_info.append([[]]) 134 | 135 | # increment global step counter 136 | train_step += 1 137 | 138 | # for benchmarking learned policies 139 | if arglist.benchmark: 140 | for i, info in enumerate(info_n): 141 | agent_info[-1][i].append(info_n['n']) 142 | if train_step > arglist.benchmark_iters and (done or terminal): 143 | file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' 144 | print('Finished benchmarking, now saving...') 145 | with open(file_name, 'wb') as fp: 146 | pickle.dump(agent_info[:-1], fp) 147 | break 148 | continue 149 | 150 | # for displaying learned policies 151 | if arglist.display: 152 | time.sleep(0.1) 153 | env.render() 154 | continue 155 | 156 | # update all trainers, if not in display or benchmark mode 157 | loss = None 158 | for agent in trainers: 159 | agent.preupdate() 160 | for agent in trainers: 161 | loss = agent.update(trainers, train_step) 162 | 163 | # save model, display training output 164 | if terminal and (len(episode_rewards) % arglist.save_rate == 0): 165 | U.save_state(arglist.save_dir, saver=saver) 166 | # print statement depends on whether or not there are adversaries 167 | if num_adversaries == 0: 168 | print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 169 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3))) 170 | else: 171 | print("steps: 
{}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 172 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), 173 | [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3))) 174 | t_start = time.time() 175 | # Keep track of final episode reward 176 | final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:])) 177 | for rew in agent_rewards: 178 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) 179 | 180 | # saves final episode reward for plotting training curve later 181 | if len(episode_rewards) > arglist.num_episodes: 182 | rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' 183 | with open(rew_file_name, 'wb') as fp: 184 | pickle.dump(final_ep_rewards, fp) 185 | agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' 186 | with open(agrew_file_name, 'wb') as fp: 187 | pickle.dump(final_ep_ag_rewards, fp) 188 | print('...Finished total of {} episodes.'.format(len(episode_rewards))) 189 | break 190 | 191 | if __name__ == '__main__': 192 | arglist = parse_args() 193 | train(arglist) --------------------------------------------------------------------------------
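Training is driven by train.py; with the defaults in parse_args it loads the formation scenario, runs 50000 episodes of at most 120 steps, and saves a checkpoint every 100 episodes. Note that in this copy --display is a store_true flag whose default is True, so the main loop only renders a previously saved policy and never reaches the update and save code; to train from scratch that default has to be set back to False (as in the upstream MADDPG train.py). After installing the packages (e.g. pip install -e ., which picks up both maddpg and multiagent via find_packages), a run can be started with python train.py --scenario formation. When training finishes, the mean episode rewards collected every --save-rate episodes are pickled to <plots-dir>/<exp-name>_rewards.pkl; the sketch below (not part of the repository; it assumes matplotlib is installed and that the default paths from parse_args were used) plots that curve.

import pickle
import matplotlib.pyplot as plt

# Default location: plots_dir + exp_name + '_rewards.pkl' (see parse_args in train.py).
with open('/home/islam/training/curves/formation_rewards.pkl', 'rb') as fp:
    rewards = pickle.load(fp)    # one mean-reward point per --save-rate (default 100) episodes

plt.plot([(i + 1) * 100 for i in range(len(rewards))], rewards)
plt.xlabel('episode')
plt.ylabel('mean episode reward over the last 100 episodes')
plt.title('MADDPG formation-control training curve')
plt.show()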