├── .gitignore
├── LICENSE.txt
├── README.md
├── maddpg.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── not-zip-safe
│   ├── requires.txt
│   └── top_level.txt
├── maddpg
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-36.pyc
│   ├── common
│   │   ├── __pycache__
│   │   │   ├── distributions.cpython-36.pyc
│   │   │   └── tf_util.cpython-36.pyc
│   │   ├── distributions.py
│   │   └── tf_util.py
│   └── trainer
│       ├── __pycache__
│       │   ├── maddpg.cpython-36.pyc
│       │   └── replay_buffer.cpython-36.pyc
│       ├── maddpg.py
│       └── replay_buffer.py
├── multiagent
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   ├── core.cpython-36.pyc
│   │   ├── environment.cpython-36.pyc
│   │   ├── multi_discrete.cpython-36.pyc
│   │   ├── rendering.cpython-36.pyc
│   │   └── scenario.cpython-36.pyc
│   ├── core.py
│   ├── environment.py
│   ├── multi_discrete.py
│   ├── policy.py
│   ├── rendering.py
│   ├── scenario.py
│   └── scenarios
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   ├── formation.cpython-36.pyc
│       │   ├── simple.cpython-36.pyc
│       │   ├── simple_adversary.cpython-36.pyc
│       │   ├── simple_crypto.cpython-36.pyc
│       │   ├── simple_push.cpython-36.pyc
│       │   ├── simple_reference.cpython-36.pyc
│       │   ├── simple_speaker_listener.cpython-36.pyc
│       │   ├── simple_spread.cpython-36.pyc
│       │   ├── simple_tag.cpython-36.pyc
│       │   └── simple_world_comm.cpython-36.pyc
│       ├── formation.py
│       ├── simple.py
│       ├── simple_adversary.py
│       ├── simple_crypto.py
│       ├── simple_push.py
│       ├── simple_reference.py
│       ├── simple_speaker_listener.py
│       ├── simple_spread.py
│       ├── simple_tag.py
│       └── simple_world_comm.py
├── setup.py
└── train.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | .static_storage/ 57 | .media/ 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning 2 | 3 | 4 | In this project, deep reinforcement learning is used to train multi-agent robotic systems to perform leader-follower formation control. OpenAI's MADDPG implementation and its particle environment are used, with some modifications, for agent training. 5 | 6 | ## Framework 7 | 8 | The framework used in this project is Python 3.6.9 installed on Ubuntu 18.04 LTS, alongside NumPy and TensorFlow. 9 | 10 |

11 | 12 |

13 | 14 | 15 | ## Environment 16 | 17 | The environment is the single most important element in the Reinforcement Learning process, since it represents the physical world that the agent interacts with. In this project the environment used is [Multi-Agent Particle Environments (MPE)](https://github.com/openai/multiagent-particle-envs), 18 | based on OpenAI's work. OpenAI is an artificial intelligence research laboratory that develops free, open-source tools and libraries that help the artificial intelligence developer community in research and industry. The original environment is a 2D world with a continuous observation space and a discrete action space, along with some basic simulated physics. The agents are divided into two groups, good agents and adversary agents, where the good agents try to cooperate to cover certain goal landmarks so that the adversary agents cannot cover these goals. 19 |
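The snippet below is a minimal sketch of how such an MPE scenario is typically wrapped in the `MultiAgentEnv` class defined in `multiagent/environment.py`. The `scenarios.load()` helper and the `make_world`/`reset_world`/`reward`/`observation` callbacks follow the standard multiagent-particle-envs API and are assumed here rather than copied from this repository's `train.py`:

```python
# Assumed usage sketch: build the modified leader-follower environment from the
# formation scenario and interact with it through the gym-style interface.
import multiagent.scenarios as scenarios
from multiagent.environment import MultiAgentEnv

# load the scenario module and build the world it describes
scenario = scenarios.load("formation.py").Scenario()
world = scenario.make_world()

# wrap the world in the multi-agent environment with the scenario callbacks
env = MultiAgentEnv(world,
                    reset_callback=scenario.reset_world,
                    reward_callback=scenario.reward,
                    observation_callback=scenario.observation)

obs_n = env.reset()  # list with one observation vector per agent
# env.step(act_n) takes a list with one action per agent and returns the
# per-agent lists (obs_n, reward_n, done_n, info_n)
```

This is the same construction pattern the original OpenAI MADDPG reference code uses for its training environments.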

20 | 21 |
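To make the modifications described in the next paragraph more concrete, here is a hypothetical sketch of what the scenario's world construction and reset could look like. The actual implementation lives in `multiagent/scenarios/formation.py` (not reproduced in this listing); only the `World`, `Agent` and `Landmark` classes from `multiagent/core.py` are taken from the repository, everything else is illustrative:

```python
# Hypothetical sketch of a leader-follower formation scenario (illustrative only).
import numpy as np
from multiagent.core import World, Agent, Landmark

def make_world(num_followers=2, num_obstacles=2):
    world = World()
    world.dim_c = 0  # no communication channel needed
    # one leader plus two followers
    world.agents = [Agent() for _ in range(1 + num_followers)]
    for i, agent in enumerate(world.agents):
        agent.name = 'agent %d' % i
        agent.silent = True
        agent.max_speed = 0.2            # reduced maximum speed
    # the first landmark is the fixed goal, the rest are obstacles
    world.landmarks = [Landmark() for _ in range(1 + num_obstacles)]
    for i, landmark in enumerate(world.landmarks):
        landmark.name = 'landmark %d' % i
        landmark.movable = False
    return world

def reset_world(world):
    # goal fixed in the lower-left corner of the 2D plane
    world.landmarks[0].state.p_pos = np.array([-0.8, -0.8])
    world.landmarks[0].state.p_vel = np.zeros(world.dim_p)
    # obstacles placed at random positions
    for landmark in world.landmarks[1:]:
        landmark.state.p_pos = np.random.uniform(-1.0, +1.0, world.dim_p)
        landmark.state.p_vel = np.zeros(world.dim_p)
    # agents spawn at random positions restricted to the first quadrant
    for agent in world.agents:
        agent.state.p_pos = np.random.uniform(0.0, +1.0, world.dim_p)
        agent.state.p_vel = np.zeros(world.dim_p)
        agent.state.c = np.zeros(world.dim_c)
```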

22 | 23 | Many modifications are made to this environment in this project so that it can be used for leader-follower formation control, including: 24 | 1. The agents are divided into one leader agent and two follower agents. 25 | 2. The goal is a single landmark whose location is fixed (rather than assigned randomly as in the original environment) in the lower-left corner of the environment, specifically at (-0.8, -0.8) with respect to the coordinate plane, whose origin (0, 0) lies at the middle of the screen. 26 | 3. The landmarks representing obstacles are placed randomly. 27 | 4. All of the agents' initial positions are assigned randomly but constrained to the first quadrant of the coordinate system, instead of being completely random as in the original environment. 28 | 5. The maximum speed of the agents is set to 0.2, unlike in the original environment. The size of the agents is reduced and the size of the landmarks is magnified compared to the original environment. 29 | -------------------------------------------------------------------------------- /maddpg.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: maddpg 3 | Version: 0.0.1 4 | Summary: Multi-Agent Deep Deterministic Policy Gradient 5 | Home-page: https://github.com/openai/maddpg 6 | Author: Igor Mordatch 7 | Author-email: mordatch@openai.com 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /maddpg.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | maddpg/__init__.py 4 | maddpg.egg-info/PKG-INFO 5 | maddpg.egg-info/SOURCES.txt 6 | maddpg.egg-info/dependency_links.txt 7 | maddpg.egg-info/not-zip-safe 8 | maddpg.egg-info/requires.txt 9 | maddpg.egg-info/top_level.txt 10 | multiagent/__init__.py 11 | multiagent/core.py 12 | multiagent/environment.py 13 | multiagent/multi_discrete.py 14 | multiagent/policy.py 15 | multiagent/rendering.py 16 | multiagent/scenario.py 17 | multiagent/scenarios/__init__.py 18 | multiagent/scenarios/simple.py 19 | multiagent/scenarios/simple_adversary.py 20 | multiagent/scenarios/simple_crypto.py 21 | multiagent/scenarios/simple_push.py 22 | multiagent/scenarios/simple_reference.py 23 | multiagent/scenarios/simple_speaker_listener.py 24 | multiagent/scenarios/simple_spread.py 25 | multiagent/scenarios/simple_tag.py 26 | multiagent/scenarios/simple_world_comm.py -------------------------------------------------------------------------------- /maddpg.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /maddpg.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /maddpg.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | gym 2 | numpy-stl 3 | -------------------------------------------------------------------------------- /maddpg.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | maddpg 2 | multiagent 3 | -------------------------------------------------------------------------------- /maddpg/__init__.py:
-------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() -------------------------------------------------------------------------------- /maddpg/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/__pycache__/distributions.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/common/__pycache__/distributions.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/__pycache__/tf_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/common/__pycache__/tf_util.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return 
[] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - 
tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 | ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 203 | def sample(self): 204 | u = tf.random_uniform(tf.shape(self.logits)) 205 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 206 | @classmethod 207 | def fromflat(cls, flat): 208 | return cls(flat) 209 | 210 | class MultiCategoricalPd(Pd): 211 | def __init__(self, low, high, flat): 212 | self.flat = flat 213 | self.low = tf.constant(low, dtype=tf.int32) 214 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 215 | def flatparam(self): 216 | return self.flat 217 | def mode(self): 218 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 219 | def logp(self, x): 220 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 221 | def kl(self, other): 222 | return tf.add_n([ 223 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 224 | ]) 225 | def entropy(self): 226 | return tf.add_n([p.entropy() for p in self.categoricals]) 227 | def sample(self): 228 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 229 | @classmethod 230 | def fromflat(cls, flat): 231 | return cls(flat) 232 | 233 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 234 | def __init__(self, low, high, flat): 235 | self.flat = flat 236 | self.low = tf.constant(low, dtype=tf.float32) 237 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 238 | def flatparam(self): 239 | return self.flat 240 | def mode(self): 241 | x = [] 242 | for i in range(len(self.categoricals)): 243 | x.append(self.low[i] + self.categoricals[i].mode()) 244 | return tf.concat(x, axis=-1) 245 | def logp(self, x): 246 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 247 | def kl(self, other): 248 | return tf.add_n([ 249 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 250 | ]) 251 | def entropy(self): 252 | return tf.add_n([p.entropy() for p in self.categoricals]) 
253 | def sample(self): 254 | x = [] 255 | for i in range(len(self.categoricals)): 256 | x.append(self.low[i] + self.categoricals[i].sample()) 257 | return tf.concat(x, axis=-1) 258 | @classmethod 259 | def fromflat(cls, flat): 260 | return cls(flat) 261 | 262 | class DiagGaussianPd(Pd): 263 | def __init__(self, flat): 264 | self.flat = flat 265 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 266 | self.mean = mean 267 | self.logstd = logstd 268 | self.std = tf.exp(logstd) 269 | def flatparam(self): 270 | return self.flat 271 | def mode(self): 272 | return self.mean 273 | def logp(self, x): 274 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 275 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 276 | - U.sum(self.logstd, axis=1) 277 | def kl(self, other): 278 | assert isinstance(other, DiagGaussianPd) 279 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 280 | def entropy(self): 281 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 282 | def sample(self): 283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | def make_pdtype(ac_space): 311 | from gym import spaces 312 | if isinstance(ac_space, spaces.Box): 313 | assert len(ac_space.shape) == 1 314 | return DiagGaussianPdType(ac_space.shape[0]) 315 | elif isinstance(ac_space, spaces.Discrete): 316 | # return CategoricalPdType(ac_space.n) 317 | return SoftCategoricalPdType(ac_space.n) 318 | elif isinstance(ac_space, MultiDiscrete): 319 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 320 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 321 | elif isinstance(ac_space, spaces.MultiBinary): 322 | return BernoulliPdType(ac_space.n) 323 | else: 324 | raise NotImplementedError 325 | 326 | def shape_el(v, i): 327 | maybe = v.get_shape()[i] 328 | if maybe is not None: 329 | return maybe 330 | else: 331 | return tf.shape(v)[i] 332 | -------------------------------------------------------------------------------- /maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis 
is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, axis=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 
98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimized `objective` using `optimizer` w.r.t. variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | tf_config = tf.ConfigProto( 164 | inter_op_parallelism_threads=num_cpu, 165 | intra_op_parallelism_threads=num_cpu) 166 | return tf.Session(config=tf_config) 167 | 168 | 169 | def single_threaded_session(): 170 | """Returns a session which will only use a single CPU""" 171 | return make_session(1) 172 | 173 | 174 | ALREADY_INITIALIZED = set() 175 | 176 | 177 | def initialize(): 178 | """Initialize all the uninitialized variables in the global scope.""" 179 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 180 | get_session().run(tf.variables_initializer(new_variables)) 181 | ALREADY_INITIALIZED.update(new_variables) 182 | 183 | 184 | # ================================================================ 185 | # Scopes 186 | # ================================================================ 187 | 188 | 189 | def scope_vars(scope, trainable_only=False): 190 | """ 191 | Get variables inside a scope 192 | The scope can be specified as a string 193 | 194 | Parameters 195 | ---------- 196 | scope: str or VariableScope 197 | scope in which the variables reside. 198 | trainable_only: bool 199 | whether or not to return only the variables that were marked as trainable. 200 | 201 | Returns 202 | ------- 203 | vars: [tf.Variable] 204 | list of variables in `scope`. 
205 | """ 206 | return tf.get_collection( 207 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 208 | scope=scope if isinstance(scope, str) else scope.name 209 | ) 210 | 211 | 212 | def scope_name(): 213 | """Returns the name of current scope as a string, e.g. deepq/q_func""" 214 | return tf.get_variable_scope().name 215 | 216 | 217 | def absolute_scope_name(relative_scope_name): 218 | """Appends parent scope name to `relative_scope_name`""" 219 | return scope_name() + "/" + relative_scope_name 220 | 221 | # ================================================================ 222 | # Saving variables 223 | # ================================================================ 224 | 225 | 226 | def load_state(fname, saver=None): 227 | """Load all the variables to the current session from the location """ 228 | if saver is None: 229 | saver = tf.train.Saver() 230 | saver.restore(get_session(), fname) 231 | return saver 232 | 233 | 234 | def save_state(fname, saver=None): 235 | """Save all the variables in the current session to the location """ 236 | os.makedirs(os.path.dirname(fname), exist_ok=True) 237 | if saver is None: 238 | saver = tf.train.Saver() 239 | saver.save(get_session(), fname) 240 | return saver 241 | 242 | # ================================================================ 243 | # Theano-like Function 244 | # ================================================================ 245 | 246 | 247 | def function(inputs, outputs, updates=None, givens=None): 248 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 249 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 250 | values to be feed to the inputs placeholders and produces the values of the experessions 251 | in outputs. 252 | 253 | Input values can be passed in the same order as inputs or can be provided as kwargs based 254 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 255 | 256 | Example: 257 | x = tf.placeholder(tf.int32, (), name="x") 258 | y = tf.placeholder(tf.int32, (), name="y") 259 | z = 3 * x + 2 * y 260 | lin = function([x, y], z, givens={y: 0}) 261 | 262 | with single_threaded_session(): 263 | initialize() 264 | 265 | assert lin(2) == 6 266 | assert lin(x=3) == 9 267 | assert lin(2, 2) == 10 268 | assert lin(x=2, y=3) == 12 269 | 270 | Parameters 271 | ---------- 272 | inputs: [tf.placeholder or TfInput] 273 | list of input arguments 274 | outputs: [tf.Variable] or tf.Variable 275 | list of outputs or a single output to be returned from function. Returned 276 | value will also have the same shape. 
277 | """ 278 | if isinstance(outputs, list): 279 | return _Function(inputs, outputs, updates, givens=givens) 280 | elif isinstance(outputs, (dict, collections.OrderedDict)): 281 | f = _Function(inputs, outputs.values(), updates, givens=givens) 282 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 283 | else: 284 | f = _Function(inputs, [outputs], updates, givens=givens) 285 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 286 | 287 | 288 | class _Function(object): 289 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 290 | for inpt in inputs: 291 | if not issubclass(type(inpt), TfInput): 292 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 293 | self.inputs = inputs 294 | updates = updates or [] 295 | self.update_group = tf.group(*updates) 296 | self.outputs_update = list(outputs) + [self.update_group] 297 | self.givens = {} if givens is None else givens 298 | self.check_nan = check_nan 299 | 300 | def _feed_input(self, feed_dict, inpt, value): 301 | if issubclass(type(inpt), TfInput): 302 | feed_dict.update(inpt.make_feed_dict(value)) 303 | elif is_placeholder(inpt): 304 | feed_dict[inpt] = value 305 | 306 | def __call__(self, *args, **kwargs): 307 | assert len(args) <= len(self.inputs), "Too many arguments provided" 308 | feed_dict = {} 309 | # Update the args 310 | for inpt, value in zip(self.inputs, args): 311 | self._feed_input(feed_dict, inpt, value) 312 | # Update the kwargs 313 | kwargs_passed_inpt_names = set() 314 | for inpt in self.inputs[len(args):]: 315 | inpt_name = inpt.name.split(':')[0] 316 | inpt_name = inpt_name.split('/')[-1] 317 | assert inpt_name not in kwargs_passed_inpt_names, \ 318 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 319 | if inpt_name in kwargs: 320 | kwargs_passed_inpt_names.add(inpt_name) 321 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 322 | else: 323 | assert inpt in self.givens, "Missing argument " + inpt_name 324 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 325 | # Update feed dict with givens. 
326 | for inpt in self.givens: 327 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 328 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 329 | if self.check_nan: 330 | if any(np.isnan(r).any() for r in results): 331 | raise RuntimeError("Nan detected") 332 | return results 333 | -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/maddpg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/trainer/__pycache__/maddpg.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.replay_buffer import ReplayBuffer 9 | 10 | 11 | def discount_with_dones(rewards, dones, gamma): 12 | discounted = [] 13 | r = 0 14 | for reward, done in zip(rewards[::-1], dones[::-1]): 15 | r = reward + gamma*r 16 | r = r*(1.-done) 17 | discounted.append(r) 18 | return discounted[::-1] 19 | 20 | def make_update_exp(vals, target_vals): 21 | polyak = 1.0 - 1e-2 22 | expression = [] 23 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 24 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 25 | expression = tf.group(*expression) 26 | return U.function([], [], updates=[expression]) 27 | 28 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): 29 | with tf.variable_scope(scope, reuse=reuse): 30 | # create distribtuions 31 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 32 | 33 | # set up placeholders 34 | obs_ph_n = make_obs_ph_n 35 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 36 | 37 | p_input = obs_ph_n[p_index] 38 | 39 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) 40 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 41 | 42 | # wrap parameters in distribution 43 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 44 | 45 | act_sample = act_pd.sample() 46 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 47 | 48 | act_input_n = act_ph_n + [] 49 | act_input_n[p_index] = act_pd.sample() 50 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 51 | if local_q_func: 52 | q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) 53 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0] 54 | pg_loss = -tf.reduce_mean(q) 55 | 56 | loss = pg_loss + p_reg * 1e-3 57 | 58 
| optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 59 | 60 | # Create callable functions 61 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 62 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) 63 | p_values = U.function([obs_ph_n[p_index]], p) 64 | 65 | # target network 66 | target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) 67 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 68 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 69 | 70 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 71 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) 72 | 73 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 74 | 75 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 76 | with tf.variable_scope(scope, reuse=reuse): 77 | # create distribtuions 78 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 79 | 80 | # set up placeholders 81 | obs_ph_n = make_obs_ph_n 82 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 83 | target_ph = tf.placeholder(tf.float32, [None], name="target") 84 | 85 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 86 | if local_q_func: 87 | q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) 88 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0] 89 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 90 | 91 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 92 | 93 | # viscosity solution to Bellman differential equation in place of an initial condition 94 | q_reg = tf.reduce_mean(tf.square(q)) 95 | loss = q_loss #+ 1e-3 * q_reg 96 | 97 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 98 | 99 | # Create callable functions 100 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 101 | q_values = U.function(obs_ph_n + act_ph_n, q) 102 | 103 | # target network 104 | target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0] 105 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 106 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 107 | 108 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 109 | 110 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 111 | 112 | class MADDPGAgentTrainer(AgentTrainer): 113 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 114 | self.name = name 115 | self.n = len(obs_shape_n) 116 | self.agent_index = agent_index 117 | self.args = args 118 | obs_ph_n = [] 119 | for i in range(self.n): 120 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 121 | 122 | # Create all the functions necessary to train the model 123 | self.q_train, self.q_update, self.q_debug = q_train( 124 | scope=self.name, 125 | make_obs_ph_n=obs_ph_n, 126 | act_space_n=act_space_n, 127 | q_index=agent_index, 128 | q_func=model, 129 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 130 | grad_norm_clipping=0.5, 131 | local_q_func=local_q_func, 132 | num_units=args.num_units 133 | ) 134 | self.act, 
self.p_train, self.p_update, self.p_debug = p_train( 135 | scope=self.name, 136 | make_obs_ph_n=obs_ph_n, 137 | act_space_n=act_space_n, 138 | p_index=agent_index, 139 | p_func=model, 140 | q_func=model, 141 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 142 | grad_norm_clipping=0.5, 143 | local_q_func=local_q_func, 144 | num_units=args.num_units 145 | ) 146 | # Create experience buffer 147 | self.replay_buffer = ReplayBuffer(1e6) 148 | self.max_replay_buffer_len = args.batch_size * args.max_episode_len 149 | self.replay_sample_index = None 150 | 151 | def action(self, obs): 152 | return self.act(obs[None])[0] 153 | 154 | def experience(self, obs, act, rew, new_obs, done, terminal): 155 | # Store transition in the replay buffer. 156 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 157 | 158 | def preupdate(self): 159 | self.replay_sample_index = None 160 | 161 | def update(self, agents, t): 162 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough 163 | return 164 | if not t % 100 == 0: # only update every 100 steps 165 | return 166 | 167 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) 168 | # collect replay sample from all agents 169 | obs_n = [] 170 | obs_next_n = [] 171 | act_n = [] 172 | index = self.replay_sample_index 173 | for i in range(self.n): 174 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 175 | obs_n.append(obs) 176 | obs_next_n.append(obs_next) 177 | act_n.append(act) 178 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 179 | 180 | # train q network 181 | num_sample = 1 182 | target_q = 0.0 183 | for i in range(num_sample): 184 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 185 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 186 | target_q += rew + self.args.gamma * (1.0 - done) * target_q_next 187 | target_q /= num_sample 188 | q_loss = self.q_train(*(obs_n + act_n + [target_q])) 189 | 190 | # train p network 191 | p_loss = self.p_train(*(obs_n + act_n)) 192 | 193 | self.p_update() 194 | self.q_update() 195 | 196 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 197 | -------------------------------------------------------------------------------- /maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 
13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 45 | 46 | def make_index(self, batch_size): 47 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 48 | 49 | def make_latest_index(self, batch_size): 50 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 51 | np.random.shuffle(idx) 52 | return idx 53 | 54 | def sample_index(self, idxes): 55 | return self._encode_sample(idxes) 56 | 57 | def sample(self, batch_size): 58 | """Sample a batch of experiences. 59 | 60 | Parameters 61 | ---------- 62 | batch_size: int 63 | How many transitions to sample. 64 | 65 | Returns 66 | ------- 67 | obs_batch: np.array 68 | batch of observations 69 | act_batch: np.array 70 | batch of actions executed given obs_batch 71 | rew_batch: np.array 72 | rewards received as results of executing act_batch 73 | next_obs_batch: np.array 74 | next set of observations seen after executing act_batch 75 | done_mask: np.array 76 | done_mask[i] = 1 if executing act_batch[i] resulted in 77 | the end of an episode and 0 otherwise. 
78 | """ 79 | if batch_size > 0: 80 | idxes = self.make_index(batch_size) 81 | else: 82 | idxes = range(0, len(self._storage)) 83 | return self._encode_sample(idxes) 84 | 85 | def collect(self): 86 | return self.sample(-1) 87 | -------------------------------------------------------------------------------- /multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /multiagent/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/core.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/core.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/environment.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/environment.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/multi_discrete.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/multi_discrete.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/rendering.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/rendering.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/__pycache__/scenario.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/__pycache__/scenario.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/core.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # physical/external base state of all entites 4 | class EntityState(object): 5 | def __init__(self): 6 | # physical position 7 | self.p_pos = None 8 | # physical velocity 9 | self.p_vel = None 10 | 11 | # state of agents (including communication and internal/mental state) 12 | class AgentState(EntityState): 13 | def __init__(self): 14 | super(AgentState, self).__init__() 15 | # communication utterance 16 | self.c = None 17 | 18 | # action of the agent 19 | class Action(object): 20 | def __init__(self): 21 | # physical action 22 | self.u = None 23 | # communication action 24 | self.c = None 25 | 26 | # properties and state of physical world entity 27 | class Entity(object): 28 | def __init__(self): 29 | # name 30 | self.name = '' 31 | # properties: 32 | self.size = 0.050 33 | # entity can move / be pushed 34 | self.movable = False 35 | # entity collides with others 36 | self.collide = True 37 | # material density (affects mass) 38 | self.density = 25.0 39 | # color 40 | self.color = None 41 | # max speed and accel 42 | self.max_speed = 0.2 43 | self.accel = None 44 | # state 45 | self.state = EntityState() 46 | # mass 47 | self.initial_mass = 1.0 48 | 49 | @property 50 | def mass(self): 51 | return self.initial_mass 52 | 53 | # properties of landmark entities 54 | class Landmark(Entity): 55 | def __init__(self): 56 | super(Landmark, self).__init__() 57 | 58 | # properties of agent entities 59 | class Agent(Entity): 60 | def __init__(self): 61 | super(Agent, self).__init__() 62 | # agents are movable by default 63 | self.movable = True 64 | # cannot send communication signals 65 | self.silent = False 66 | # cannot observe the world 67 | self.blind = False 68 | # physical motor noise amount 69 | self.u_noise = None 70 | # communication noise amount 71 | self.c_noise = None 72 | # control range 73 | self.u_range = 1.0 74 | # state 75 | self.state = AgentState() 76 | # action 77 | self.action = Action() 78 | # script behavior to execute 79 | self.action_callback = None 80 | 81 | # multi-agent world 82 | class World(object): 83 | def __init__(self): 84 | # list of agents and entities (can change at execution-time!) 
85 | self.agents = [] 86 | self.landmarks = [] 87 | # communication channel dimensionality 88 | self.dim_c = 0 89 | # position dimensionality 90 | self.dim_p = 2 91 | # color dimensionality 92 | self.dim_color = 3 93 | # simulation timestep 94 | self.dt = 0.1 95 | # physical damping 96 | self.damping = 0.25 97 | # contact response parameters 98 | self.contact_force = 1e+2 99 | self.contact_margin = 1e-3 100 | 101 | # return all entities in the world 102 | @property 103 | def entities(self): 104 | return self.agents + self.landmarks 105 | 106 | # return all agents controllable by external policies 107 | @property 108 | def policy_agents(self): 109 | return [agent for agent in self.agents if agent.action_callback is None] 110 | 111 | # return all agents controlled by world scripts 112 | @property 113 | def scripted_agents(self): 114 | return [agent for agent in self.agents if agent.action_callback is not None] 115 | 116 | # update state of the world 117 | def step(self): 118 | # set actions for scripted agents 119 | for agent in self.scripted_agents: 120 | agent.action = agent.action_callback(agent, self) 121 | # gather forces applied to entities 122 | p_force = [None] * len(self.entities) 123 | # apply agent physical controls 124 | p_force = self.apply_action_force(p_force) 125 | # apply environment forces 126 | p_force = self.apply_environment_force(p_force) 127 | # integrate physical state 128 | self.integrate_state(p_force) 129 | # update agent state 130 | for agent in self.agents: 131 | self.update_agent_state(agent) 132 | 133 | # gather agent action forces 134 | def apply_action_force(self, p_force): 135 | # set applied forces 136 | for i,agent in enumerate(self.agents): 137 | if agent.movable: 138 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 139 | p_force[i] = agent.action.u + noise 140 | return p_force 141 | 142 | # gather physical forces acting on entities 143 | def apply_environment_force(self, p_force): 144 | # simple (but inefficient) collision response 145 | for a,entity_a in enumerate(self.entities): 146 | for b,entity_b in enumerate(self.entities): 147 | if(b <= a): continue 148 | [f_a, f_b] = self.get_collision_force(entity_a, entity_b) 149 | if(f_a is not None): 150 | if(p_force[a] is None): p_force[a] = 0.0 151 | p_force[a] = f_a + p_force[a] 152 | if(f_b is not None): 153 | if(p_force[b] is None): p_force[b] = 0.0 154 | p_force[b] = f_b + p_force[b] 155 | return p_force 156 | 157 | # integrate physical state 158 | def integrate_state(self, p_force): 159 | for i,entity in enumerate(self.entities): 160 | if not entity.movable: continue 161 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 162 | if (p_force[i] is not None): 163 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 164 | if entity.max_speed is not None: 165 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 166 | if speed > entity.max_speed: 167 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 168 | np.square(entity.state.p_vel[1])) * entity.max_speed 169 | entity.state.p_pos += entity.state.p_vel * self.dt 170 | 171 | def update_agent_state(self, agent): 172 | # set communication state (directly for now) 173 | if agent.silent: 174 | agent.state.c = np.zeros(self.dim_c) 175 | else: 176 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 177 | agent.state.c = agent.action.c + noise 178 | 179 | # get collision forces for any contact 
between two entities 180 | def get_collision_force(self, entity_a, entity_b): 181 | if (not entity_a.collide) or (not entity_b.collide): 182 | return [None, None] # not a collider 183 | if (entity_a is entity_b): 184 | return [None, None] # don't collide against itself 185 | # compute actual distance between entities 186 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 187 | dist = np.sqrt(np.sum(np.square(delta_pos))) 188 | # minimum allowable distance 189 | dist_min = entity_a.size + entity_b.size 190 | # softmax penetration 191 | k = self.contact_margin 192 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 193 | force = self.contact_force * delta_pos / dist * penetration 194 | force_a = +force if entity_a.movable else None 195 | force_b = -force if entity_b.movable else None 196 | return [force_a, force_b] -------------------------------------------------------------------------------- /multiagent/environment.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.envs.registration import EnvSpec 4 | import numpy as np 5 | from multiagent.multi_discrete import MultiDiscrete 6 | 7 | # environment for all agents in the multiagent world 8 | # currently code assumes that no agents will be created/destroyed at runtime! 9 | class MultiAgentEnv(gym.Env): 10 | metadata = { 11 | 'render.modes' : ['human', 'rgb_array'] 12 | } 13 | 14 | def __init__(self, world, reset_callback=None, reward_callback=None, 15 | observation_callback=None, info_callback=None, 16 | done_callback=None, shared_viewer=True): 17 | 18 | self.world = world 19 | self.agents = self.world.policy_agents 20 | # set required vectorized gym env property 21 | self.n = len(world.policy_agents) 22 | # scenario callbacks 23 | self.reset_callback = reset_callback 24 | self.reward_callback = reward_callback 25 | self.observation_callback = observation_callback 26 | self.info_callback = info_callback 27 | self.done_callback = done_callback 28 | # environment parameters 29 | self.discrete_action_space = True 30 | # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector 31 | self.discrete_action_input = False 32 | # if true, even the action is continuous, action will be performed discretely 33 | self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False 34 | # if true, every agent has the same reward 35 | self.shared_reward = world.collaborative if hasattr(world, 'collaborative') else False 36 | self.time = 0 37 | 38 | # configure spaces 39 | self.action_space = [] 40 | self.observation_space = [] 41 | for agent in self.agents: 42 | total_action_space = [] 43 | # physical action space 44 | if self.discrete_action_space: 45 | u_action_space = spaces.Discrete(world.dim_p * 2 + 1) 46 | else: 47 | u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,), dtype=np.float32) 48 | if agent.movable: 49 | total_action_space.append(u_action_space) 50 | # communication action space 51 | if self.discrete_action_space: 52 | c_action_space = spaces.Discrete(world.dim_c) 53 | else: 54 | c_action_space = spaces.Box(low=0.0, high=1.0, shape=(world.dim_c,), dtype=np.float32) 55 | if not agent.silent: 56 | total_action_space.append(c_action_space) 57 | # total action space 58 | if len(total_action_space) > 1: 59 | # all action spaces are discrete, so simplify to MultiDiscrete action space 60 | if all([isinstance(act_space, spaces.Discrete) for act_space in 
total_action_space]): 61 | act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space]) 62 | else: 63 | act_space = spaces.Tuple(total_action_space) 64 | self.action_space.append(act_space) 65 | else: 66 | self.action_space.append(total_action_space[0]) 67 | # observation space 68 | obs_dim = len(observation_callback(agent, self.world)) 69 | self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) 70 | agent.action.c = np.zeros(self.world.dim_c) 71 | 72 | # rendering 73 | self.shared_viewer = shared_viewer 74 | if self.shared_viewer: 75 | self.viewers = [None] 76 | else: 77 | self.viewers = [None] * self.n 78 | self._reset_render() 79 | 80 | def step(self, action_n): 81 | obs_n = [] 82 | reward_n = [] 83 | done_n = [] 84 | info_n = {'n': []} 85 | self.agents = self.world.policy_agents 86 | # set action for each agent 87 | for i, agent in enumerate(self.agents): 88 | self._set_action(action_n[i], agent, self.action_space[i]) 89 | # advance world state 90 | self.world.step() 91 | # record observation for each agent 92 | for agent in self.agents: 93 | obs_n.append(self._get_obs(agent)) 94 | reward_n.append(self._get_reward(agent)) 95 | done_n.append(self._get_done(agent)) 96 | 97 | info_n['n'].append(self._get_info(agent)) 98 | 99 | # all agents get total reward in cooperative case 100 | reward = np.sum(reward_n) 101 | if self.shared_reward: 102 | reward_n = [reward] * self.n 103 | 104 | return obs_n, reward_n, done_n, info_n 105 | 106 | def reset(self): 107 | # reset world 108 | self.reset_callback(self.world) 109 | # reset renderer 110 | self._reset_render() 111 | # record observations for each agent 112 | obs_n = [] 113 | self.agents = self.world.policy_agents 114 | for agent in self.agents: 115 | obs_n.append(self._get_obs(agent)) 116 | return obs_n 117 | 118 | # get info used for benchmarking 119 | def _get_info(self, agent): 120 | if self.info_callback is None: 121 | return {} 122 | return self.info_callback(agent, self.world) 123 | 124 | # get observation for a particular agent 125 | def _get_obs(self, agent): 126 | if self.observation_callback is None: 127 | return np.zeros(0) 128 | return self.observation_callback(agent, self.world) 129 | 130 | # get dones for a particular agent 131 | # unused right now -- agents are allowed to go beyond the viewing screen 132 | def _get_done(self, agent): 133 | if self.done_callback is None: 134 | return False 135 | return self.done_callback(agent, self.world) 136 | 137 | # get reward for a particular agent 138 | def _get_reward(self, agent): 139 | if self.reward_callback is None: 140 | return 0.0 141 | return self.reward_callback(agent, self.world) 142 | 143 | # set env action for a particular agent 144 | def _set_action(self, action, agent, action_space, time=None): 145 | agent.action.u = np.zeros(self.world.dim_p) 146 | agent.action.c = np.zeros(self.world.dim_c) 147 | # process action 148 | if isinstance(action_space, MultiDiscrete): 149 | act = [] 150 | size = action_space.high - action_space.low + 1 151 | index = 0 152 | for s in size: 153 | act.append(action[index:(index+s)]) 154 | index += s 155 | action = act 156 | else: 157 | action = [action] 158 | 159 | if agent.movable: 160 | # physical action 161 | if self.discrete_action_input: 162 | agent.action.u = np.zeros(self.world.dim_p) 163 | # process discrete action 164 | if action[0] == 1: agent.action.u[0] = -1.0 165 | if action[0] == 2: agent.action.u[0] = +1.0 166 | if action[0] == 3: agent.action.u[1] = -1.0 167 | 
if action[0] == 4: agent.action.u[1] = +1.0 168 | else: 169 | if self.force_discrete_action: 170 | d = np.argmax(action[0]) 171 | action[0][:] = 0.0 172 | action[0][d] = 1.0 173 | if self.discrete_action_space: 174 | agent.action.u[0] += action[0][1] - action[0][2] 175 | agent.action.u[1] += action[0][3] - action[0][4] 176 | else: 177 | agent.action.u = action[0] 178 | sensitivity = 5.0 179 | if agent.accel is not None: 180 | sensitivity = agent.accel 181 | agent.action.u *= sensitivity 182 | action = action[1:] 183 | if not agent.silent: 184 | # communication action 185 | if self.discrete_action_input: 186 | agent.action.c = np.zeros(self.world.dim_c) 187 | agent.action.c[action[0]] = 1.0 188 | else: 189 | agent.action.c = action[0] 190 | action = action[1:] 191 | # make sure we used all elements of action 192 | assert len(action) == 0 193 | 194 | # reset rendering assets 195 | def _reset_render(self): 196 | self.render_geoms = None 197 | self.render_geoms_xform = None 198 | 199 | # render environment 200 | def render(self, mode='human'): 201 | if mode == 'human': 202 | alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 203 | message = '' 204 | for agent in self.world.agents: 205 | comm = [] 206 | for other in self.world.agents: 207 | if other is agent: continue 208 | if np.all(other.state.c == 0): 209 | word = '_' 210 | else: 211 | word = alphabet[np.argmax(other.state.c)] 212 | message += (other.name + ' to ' + agent.name + ': ' + word + ' ') 213 | print(message) 214 | 215 | for i in range(len(self.viewers)): 216 | # create viewers (if necessary) 217 | if self.viewers[i] is None: 218 | # import rendering only if we need it (and don't import for headless machines) 219 | #from gym.envs.classic_control import rendering 220 | from multiagent import rendering 221 | self.viewers[i] = rendering.Viewer(700,700) 222 | 223 | # create rendering geometry 224 | if self.render_geoms is None: 225 | # import rendering only if we need it (and don't import for headless machines) 226 | #from gym.envs.classic_control import rendering 227 | from multiagent import rendering 228 | self.render_geoms = [] 229 | self.render_geoms_xform = [] 230 | for entity in self.world.entities: 231 | geom = rendering.make_circle(entity.size) 232 | xform = rendering.Transform() 233 | if 'agent' in entity.name: 234 | geom.set_color(*entity.color, alpha=0.5) 235 | else: 236 | geom.set_color(*entity.color) 237 | geom.add_attr(xform) 238 | self.render_geoms.append(geom) 239 | self.render_geoms_xform.append(xform) 240 | 241 | # add geoms to viewer 242 | for viewer in self.viewers: 243 | viewer.geoms = [] 244 | for geom in self.render_geoms: 245 | viewer.add_geom(geom) 246 | 247 | results = [] 248 | for i in range(len(self.viewers)): 249 | from multiagent import rendering 250 | # update bounds to center around agent 251 | cam_range = 1 252 | if self.shared_viewer: 253 | pos = np.zeros(self.world.dim_p) 254 | else: 255 | pos = self.agents[i].state.p_pos 256 | self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range) 257 | # update geometry positions 258 | for e, entity in enumerate(self.world.entities): 259 | self.render_geoms_xform[e].set_translation(*entity.state.p_pos) 260 | # render to display or array 261 | results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array')) 262 | 263 | return results 264 | 265 | # create receptor field locations in local coordinate frame 266 | def _make_receptor_locations(self, agent): 267 | receptor_type = 'polar' 268 | range_min = 0.05 * 2.0 269 | range_max 
= 1.00 270 | dx = [] 271 | # circular receptive field 272 | if receptor_type == 'polar': 273 | for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False): 274 | for distance in np.linspace(range_min, range_max, 3): 275 | dx.append(distance * np.array([np.cos(angle), np.sin(angle)])) 276 | # add origin 277 | dx.append(np.array([0.0, 0.0])) 278 | # grid receptive field 279 | if receptor_type == 'grid': 280 | for x in np.linspace(-range_max, +range_max, 5): 281 | for y in np.linspace(-range_max, +range_max, 5): 282 | dx.append(np.array([x,y])) 283 | return dx 284 | 285 | 286 | # vectorized wrapper for a batch of multi-agent environments 287 | # assumes all environments have the same observation and action space 288 | class BatchMultiAgentEnv(gym.Env): 289 | metadata = { 290 | 'runtime.vectorized': True, 291 | 'render.modes' : ['human', 'rgb_array'] 292 | } 293 | 294 | def __init__(self, env_batch): 295 | self.env_batch = env_batch 296 | 297 | @property 298 | def n(self): 299 | return np.sum([env.n for env in self.env_batch]) 300 | 301 | @property 302 | def action_space(self): 303 | return self.env_batch[0].action_space 304 | 305 | @property 306 | def observation_space(self): 307 | return self.env_batch[0].observation_space 308 | 309 | def step(self, action_n, time): 310 | obs_n = [] 311 | reward_n = [] 312 | done_n = [] 313 | info_n = {'n': []} 314 | i = 0 315 | for env in self.env_batch: 316 | obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time) 317 | i += env.n 318 | obs_n += obs 319 | # reward = [r / len(self.env_batch) for r in reward] 320 | reward_n += reward 321 | done_n += done 322 | return obs_n, reward_n, done_n, info_n 323 | 324 | def reset(self): 325 | obs_n = [] 326 | for env in self.env_batch: 327 | obs_n += env.reset() 328 | return obs_n 329 | 330 | # render environment 331 | def render(self, mode='human', close=True): 332 | results_n = [] 333 | for env in self.env_batch: 334 | results_n += env.render(mode, close) 335 | return results_n 336 | -------------------------------------------------------------------------------- /multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. 
Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | random_array = prng.np_random.rand(self.num_discrete_space) 34 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 35 | def contains(self, x): 36 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 37 | 38 | @property 39 | def shape(self): 40 | return self.num_discrete_space 41 | def __repr__(self): 42 | return "MultiDiscrete" + str(self.num_discrete_space) 43 | def __eq__(self, other): 44 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.LEFT: self.move[0] = True 45 | if k==key.RIGHT: self.move[1] = True 46 | if k==key.UP: self.move[2] = True 47 | if k==key.DOWN: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.LEFT: self.move[0] = False 50 | if k==key.RIGHT: self.move[1] = False 51 | if k==key.UP: self.move[2] = False 52 | if k==key.DOWN: self.move[3] = False 53 | -------------------------------------------------------------------------------- /multiagent/rendering.py: 
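The InteractivePolicy in policy.py above only handles keyboard-driven movement, so it is mainly useful for manually driving an environment. Below is a minimal sketch of how it could be wired up; the scenario file name, the shared_viewer=False choice, and the render-before-policy ordering are illustrative assumptions, not taken from this repository's train.py.

from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

# load a scenario and build its world (scenario file name is illustrative)
scenario = scenarios.load('simple.py').Scenario()
world = scenario.make_world()
# one viewer per agent so each InteractivePolicy can attach its own key callbacks
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation,
                    shared_viewer=False)
env.render()   # create the viewer windows before InteractivePolicy registers key handlers
policies = [InteractivePolicy(env, i) for i in range(env.n)]
obs_n = env.reset()
while True:
    # each policy ignores its observation and acts on the current keyboard state
    act_n = [policy.action(obs) for policy, obs in zip(policies, obs_n)]
    obs_n, reward_n, done_n, _ = env.step(act_n)
    env.render()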
-------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | def get_display(spec): 33 | """Convert a display specification (such as :0) into an actual Display 34 | object. 35 | 36 | Pyglet only supports multiple Displays on Linux. 37 | """ 38 | if spec is None: 39 | return None 40 | elif isinstance(spec, six.string_types): 41 | return pyglet.canvas.Display(spec) 42 | else: 43 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 44 | 45 | class Viewer(object): 46 | def __init__(self, width, height, display=None): 47 | display = get_display(display) 48 | 49 | self.width = width 50 | self.height = height 51 | 52 | self.window = pyglet.window.Window(width=width, height=height, display=display) 53 | self.window.on_close = self.window_closed_by_user 54 | self.geoms = [] 55 | self.onetime_geoms = [] 56 | self.transform = Transform() 57 | 58 | glEnable(GL_BLEND) 59 | # glEnable(GL_MULTISAMPLE) 60 | glEnable(GL_LINE_SMOOTH) 61 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 62 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 63 | glLineWidth(2.0) 64 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 65 | 66 | def close(self): 67 | self.window.close() 68 | 69 | def window_closed_by_user(self): 70 | self.close() 71 | 72 | def set_bounds(self, left, right, bottom, top): 73 | assert right > left and top > bottom 74 | scalex = self.width/(right-left) 75 | scaley = self.height/(top-bottom) 76 | self.transform = Transform( 77 | translation=(-left*scalex, -bottom*scaley), 78 | scale=(scalex, scaley)) 79 | 80 | def add_geom(self, geom): 81 | self.geoms.append(geom) 82 | 83 | def add_onetime(self, geom): 84 | self.onetime_geoms.append(geom) 85 | 86 | def render(self, return_rgb_array=False): 87 | glClearColor(1,1,1,1) 88 | self.window.clear() 89 | self.window.switch_to() 90 | self.window.dispatch_events() 91 | self.transform.enable() 92 | for geom in self.geoms: 93 | geom.render() 94 | for geom in self.onetime_geoms: 95 | geom.render() 96 | self.transform.disable() 97 | arr = None 98 | if return_rgb_array: 99 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 100 | image_data = buffer.get_image_data() 101 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 102 | # In https://github.com/openai/gym-http-api/issues/2, we 103 | # discovered 
that someone using Xmonad on Arch was having 104 | # a window of size 598 x 398, though a 600 x 400 window 105 | # was requested. (Guess Xmonad was preserving a pixel for 106 | # the boundary.) So we use the buffer height/width rather 107 | # than the requested one. 108 | arr = arr.reshape(buffer.height, buffer.width, 4) 109 | arr = arr[::-1,:,0:3] 110 | self.window.flip() 111 | self.onetime_geoms = [] 112 | return arr 113 | 114 | # Convenience 115 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 116 | geom = make_circle(radius=radius, res=res, filled=filled) 117 | _add_attrs(geom, attrs) 118 | self.add_onetime(geom) 119 | return geom 120 | 121 | def draw_polygon(self, v, filled=True, **attrs): 122 | geom = make_polygon(v=v, filled=filled) 123 | _add_attrs(geom, attrs) 124 | self.add_onetime(geom) 125 | return geom 126 | 127 | def draw_polyline(self, v, **attrs): 128 | geom = make_polyline(v=v) 129 | _add_attrs(geom, attrs) 130 | self.add_onetime(geom) 131 | return geom 132 | 133 | def draw_line(self, start, end, **attrs): 134 | geom = Line(start, end) 135 | _add_attrs(geom, attrs) 136 | self.add_onetime(geom) 137 | return geom 138 | 139 | def get_array(self): 140 | self.window.flip() 141 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 142 | self.window.flip() 143 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 144 | arr = arr.reshape(self.height, self.width, 4) 145 | return arr[::-1,:,0:3] 146 | 147 | def _add_attrs(geom, attrs): 148 | if "color" in attrs: 149 | geom.set_color(*attrs["color"]) 150 | if "linewidth" in attrs: 151 | geom.set_linewidth(attrs["linewidth"]) 152 | 153 | class Geom(object): 154 | def __init__(self): 155 | self._color=Color((0, 0, 0, 1.0)) 156 | self.attrs = [self._color] 157 | def render(self): 158 | for attr in reversed(self.attrs): 159 | attr.enable() 160 | self.render1() 161 | for attr in self.attrs: 162 | attr.disable() 163 | def render1(self): 164 | raise NotImplementedError 165 | def add_attr(self, attr): 166 | self.attrs.append(attr) 167 | def set_color(self, r, g, b, alpha=1): 168 | self._color.vec4 = (r, g, b, alpha) 169 | 170 | class Attr(object): 171 | def enable(self): 172 | raise NotImplementedError 173 | def disable(self): 174 | pass 175 | 176 | class Transform(Attr): 177 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 178 | self.set_translation(*translation) 179 | self.set_rotation(rotation) 180 | self.set_scale(*scale) 181 | def enable(self): 182 | glPushMatrix() 183 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 184 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 185 | glScalef(self.scale[0], self.scale[1], 1) 186 | def disable(self): 187 | glPopMatrix() 188 | def set_translation(self, newx, newy): 189 | self.translation = (float(newx), float(newy)) 190 | def set_rotation(self, new): 191 | self.rotation = float(new) 192 | def set_scale(self, newx, newy): 193 | self.scale = (float(newx), float(newy)) 194 | 195 | class Color(Attr): 196 | def __init__(self, vec4): 197 | self.vec4 = vec4 198 | def enable(self): 199 | glColor4f(*self.vec4) 200 | 201 | class LineStyle(Attr): 202 | def __init__(self, style): 203 | self.style = style 204 | def enable(self): 205 | glEnable(GL_LINE_STIPPLE) 206 | glLineStipple(1, self.style) 207 | def disable(self): 208 | glDisable(GL_LINE_STIPPLE) 209 | 210 | class LineWidth(Attr): 211 | def __init__(self, stroke): 212 | self.stroke = stroke 213 | def enable(self): 214 | 
glLineWidth(self.stroke) 215 | 216 | class Point(Geom): 217 | def __init__(self): 218 | Geom.__init__(self) 219 | def render1(self): 220 | glBegin(GL_POINTS) # draw point 221 | glVertex3f(0.0, 0.0, 0.0) 222 | glEnd() 223 | 224 | class FilledPolygon(Geom): 225 | def __init__(self, v): 226 | Geom.__init__(self) 227 | self.v = v 228 | def render1(self): 229 | if len(self.v) == 4 : glBegin(GL_QUADS) 230 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 231 | else: glBegin(GL_TRIANGLES) 232 | for p in self.v: 233 | glVertex3f(p[0], p[1],0) # draw each vertex 234 | glEnd() 235 | 236 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 237 | glColor4f(*color) 238 | glBegin(GL_LINE_LOOP) 239 | for p in self.v: 240 | glVertex3f(p[0], p[1],0) # draw each vertex 241 | glEnd() 242 | 243 | def make_circle(radius=10, res=30, filled=True): 244 | points = [] 245 | for i in range(res): 246 | ang = 2*math.pi*i / res 247 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 248 | if filled: 249 | return FilledPolygon(points) 250 | else: 251 | return PolyLine(points, True) 252 | 253 | def make_polygon(v, filled=True): 254 | if filled: return FilledPolygon(v) 255 | else: return PolyLine(v, True) 256 | 257 | def make_polyline(v): 258 | return PolyLine(v, False) 259 | 260 | def make_capsule(length, width): 261 | l, r, t, b = 0, length, width/2, -width/2 262 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 263 | circ0 = make_circle(width/2) 264 | circ1 = make_circle(width/2) 265 | circ1.add_attr(Transform(translation=(length, 0))) 266 | geom = Compound([box, circ0, circ1]) 267 | return geom 268 | 269 | class Compound(Geom): 270 | def __init__(self, gs): 271 | Geom.__init__(self) 272 | self.gs = gs 273 | for g in self.gs: 274 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 275 | def render1(self): 276 | for g in self.gs: 277 | g.render() 278 | 279 | class PolyLine(Geom): 280 | def __init__(self, v, close): 281 | Geom.__init__(self) 282 | self.v = v 283 | self.close = close 284 | self.linewidth = LineWidth(1) 285 | self.add_attr(self.linewidth) 286 | def render1(self): 287 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 288 | for p in self.v: 289 | glVertex3f(p[0], p[1],0) # draw each vertex 290 | glEnd() 291 | def set_linewidth(self, x): 292 | self.linewidth.stroke = x 293 | 294 | class Line(Geom): 295 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 296 | Geom.__init__(self) 297 | self.start = start 298 | self.end = end 299 | self.linewidth = LineWidth(1) 300 | self.add_attr(self.linewidth) 301 | 302 | def render1(self): 303 | glBegin(GL_LINES) 304 | glVertex2f(*self.start) 305 | glVertex2f(*self.end) 306 | glEnd() 307 | 308 | class Image(Geom): 309 | def __init__(self, fname, width, height): 310 | Geom.__init__(self) 311 | self.width = width 312 | self.height = height 313 | img = pyglet.image.load(fname) 314 | self.img = img 315 | self.flip = False 316 | def render1(self): 317 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 318 | 319 | # ================================================================ 320 | 321 | class SimpleImageViewer(object): 322 | def __init__(self, display=None): 323 | self.window = None 324 | self.isopen = False 325 | self.display = display 326 | def imshow(self, arr): 327 | if self.window is None: 328 | height, width, channels = arr.shape 329 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 330 | self.width = width 
331 | self.height = height 332 | self.isopen = True 333 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 334 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 335 | self.window.clear() 336 | self.window.switch_to() 337 | self.window.dispatch_events() 338 | image.blit(0,0) 339 | self.window.flip() 340 | def close(self): 341 | if self.isopen: 342 | self.window.close() 343 | self.isopen = False 344 | def __del__(self): 345 | self.close() -------------------------------------------------------------------------------- /multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /multiagent/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/formation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/formation.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_adversary.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_adversary.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_crypto.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_crypto.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_push.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_push.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_reference.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_reference.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_speaker_listener.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_speaker_listener.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_spread.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_spread.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_tag.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_tag.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/__pycache__/simple_world_comm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/islambarakat99/Multi-Robot-Formation-Control-using-Deep-Reinforcement-Learning/4fde6a0a951394298a15a8e26bcdb57f819cbea8/multiagent/scenarios/__pycache__/simple_world_comm.cpython-36.pyc -------------------------------------------------------------------------------- /multiagent/scenarios/formation.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from multiagent.core import World, Agent, Landmark 4 | from multiagent.scenario import BaseScenario 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # world characteristics 10 | world.dim_c = 2 11 | num_agents = 3 12 | world.num_agents = num_agents 13 | num_landmarks = num_agents + 1 14 | # adding agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in 
enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = False 19 | agent.silent = True 20 | agent.size = 0.05 21 | # adding landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | landmark.size = 0.07 28 | # Initial Conditions 29 | self.reset_world(world) 30 | return world 31 | 32 | def reset_world(self, world): 33 | # Landmarks characteristics 34 | for landmark in world.landmarks: 35 | landmark.color = np.array([0.15, 0.15, 0.15]) 36 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 37 | landmark.state.p_vel = np.zeros(world.dim_p) 38 | goal = world.landmarks[0] 39 | goal.color = np.array([0.15, 0.65, 0.15]) 40 | goal.state.p_pos = [-0.8, -0.8] 41 | # Leader characteristics 42 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 43 | world.agents[0].adversary = True 44 | world.agents[0].goal_a = goal 45 | # Followers 46 | for i in range(1, world.num_agents): 47 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 48 | world.agents[i].adversary = False 49 | # Random intial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(0.1, 0.9, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | 55 | def benchmark_data(self, agent, world): 56 | # returning data for benchmark purposes 57 | if agent.adversary: 58 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 59 | else: 60 | dists = [] 61 | for l in world.landmarks: 62 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 63 | dists.append(np.sum(np.square(agent.state.p_pos - world.agents[0].state.p_pos))) 64 | return tuple(dists) 65 | 66 | def reward(self, agent, world): 67 | reward = self.outside(agent, world) + self.collosion(agent, world) 68 | if agent.adversary: 69 | reward -= np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 70 | else: 71 | reward -= np.sqrt(np.sum(np.square(agent.state.p_pos - world.agents[0].state.p_pos))) 72 | return reward 73 | 74 | def collosion(self, agent, world): 75 | col_rew = 0 76 | for ag in world.agents: 77 | if not ag.name == agent.name: 78 | if np.sqrt(np.sum(np.square(agent.state.p_pos - ag.state.p_pos))) < 2* agent.size: 79 | col_rew -= 15 80 | for i in range(1, len(world.landmarks)): 81 | if np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[i].state.p_pos))) < 2* agent.size: 82 | col_rew -= 15 83 | return col_rew 84 | 85 | def outside(self, agent, world): 86 | out_rew = 0 87 | if np.sum(np.absolute(agent.state.p_pos)) > 2: 88 | out_rew -= 20 89 | return out_rew 90 | 91 | def observation(self, agent, world): 92 | # position of the landmarks w.r.t the agent 93 | landmark_pos = [] 94 | for landmark in world.landmarks: 95 | landmark_pos.append(landmark.state.p_pos - agent.state.p_pos) 96 | # position of the other agents w.r.t this agent 97 | other_pos = [] 98 | for other in world.agents: 99 | if other is agent: continue 100 | other_pos.append(other.state.p_pos - agent.state.p_pos) 101 | 102 | if not agent.adversary: 103 | return np.concatenate([agent.state.p_pos - world.agents[0].state.p_pos] + landmark_pos + other_pos) 104 | else: 105 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + landmark_pos) -------------------------------------------------------------------------------- /multiagent/scenarios/simple.py: 
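For the formation scenario above, the per-agent observation sizes follow directly from observation(): with 3 agents and 4 landmarks, the leader sees its offset to the goal plus 4 landmark offsets (10 values), while each follower sees its offset to the leader, 4 landmark offsets, and 2 other-agent offsets (14 values). The sketch below checks this by wrapping the scenario in MultiAgentEnv; how the repository's train.py actually builds the environment is not shown in this dump, so this wiring is an assumption.

from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios

# build the leader-follower formation world defined above
scenario = scenarios.load('formation.py').Scenario()
world = scenario.make_world()
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
obs_n = env.reset()
print([obs.shape for obs in obs_n])   # expected: [(10,), (14,), (14,)]
print(env.action_space)               # Discrete(5) per agent: no-op, +/-x, +/-y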
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # add agents 9 | world.agents = [Agent() for i in range(3)] 10 | for i, agent in enumerate(world.agents): 11 | agent.name = 'agent %d' % i 12 | agent.collide = False 13 | agent.silent = True 14 | # add landmarks 15 | world.landmarks = [Landmark() for i in range(1)] 16 | for i, landmark in enumerate(world.landmarks): 17 | landmark.name = 'landmark %d' % i 18 | landmark.collide = False 19 | landmark.movable = False 20 | # make initial conditions 21 | self.reset_world(world) 22 | return world 23 | 24 | def reset_world(self, world): 25 | # random properties for agents 26 | for i, agent in enumerate(world.agents): 27 | agent.color = np.array([0.25,0.25,0.25]) 28 | # random properties for landmarks 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.color = np.array([0.75,0.75,0.75]) 31 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 32 | # set random initial states 33 | for agent in world.agents: 34 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 35 | agent.state.p_vel = np.zeros(world.dim_p) 36 | agent.state.c = np.zeros(world.dim_c) 37 | for i, landmark in enumerate(world.landmarks): 38 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 39 | landmark.state.p_vel = np.zeros(world.dim_p) 40 | 41 | def reward(self, agent, world): 42 | dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) 43 | return -dist2 44 | 45 | def observation(self, agent, world): 46 | # get positions of all entities in this agent's reference frame 47 | entity_pos = [] 48 | for entity in world.landmarks: 49 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 50 | return np.concatenate([agent.state.p_vel] + entity_pos) 51 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_adversary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | 8 | def make_world(self): 9 | world = World() 10 | # set any world properties first 11 | world.dim_c = 2 12 | num_agents = 3 13 | world.num_agents = num_agents 14 | num_adversaries = 1 15 | num_landmarks = num_agents - 1 16 | # add agents 17 | world.agents = [Agent() for i in range(num_agents)] 18 | for i, agent in enumerate(world.agents): 19 | agent.name = 'agent %d' % i 20 | agent.collide = False 21 | agent.silent = True 22 | agent.adversary = True if i < num_adversaries else False 23 | agent.size = 0.15 24 | # add landmarks 25 | world.landmarks = [Landmark() for i in range(num_landmarks)] 26 | for i, landmark in enumerate(world.landmarks): 27 | landmark.name = 'landmark %d' % i 28 | landmark.collide = False 29 | landmark.movable = False 30 | landmark.size = 0.08 31 | # make initial conditions 32 | self.reset_world(world) 33 | return world 34 | 35 | def reset_world(self, world): 36 | # random properties for agents 37 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 38 | for i in range(1, world.num_agents): 39 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 40 | # random properties for landmarks 41 | for i, landmark in enumerate(world.landmarks): 42 | 
landmark.color = np.array([0.15, 0.15, 0.15]) 43 | # set goal landmark 44 | goal = np.random.choice(world.landmarks) 45 | goal.color = np.array([0.15, 0.65, 0.15]) 46 | for agent in world.agents: 47 | agent.goal_a = goal 48 | # set random initial states 49 | for agent in world.agents: 50 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 51 | agent.state.p_vel = np.zeros(world.dim_p) 52 | agent.state.c = np.zeros(world.dim_c) 53 | for i, landmark in enumerate(world.landmarks): 54 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 55 | landmark.state.p_vel = np.zeros(world.dim_p) 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 61 | else: 62 | dists = [] 63 | for l in world.landmarks: 64 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 65 | dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 66 | return tuple(dists) 67 | 68 | # return all agents that are not adversaries 69 | def good_agents(self, world): 70 | return [agent for agent in world.agents if not agent.adversary] 71 | 72 | # return all adversarial agents 73 | def adversaries(self, world): 74 | return [agent for agent in world.agents if agent.adversary] 75 | 76 | def reward(self, agent, world): 77 | # Agents are rewarded based on minimum agent distance to each landmark 78 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 79 | 80 | def agent_reward(self, agent, world): 81 | # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it 82 | shaped_reward = True 83 | shaped_adv_reward = True 84 | 85 | # Calculate negative reward for adversary 86 | adversary_agents = self.adversaries(world) 87 | if shaped_adv_reward: # distance-based adversary reward 88 | adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) 89 | else: # proximity-based adversary reward (binary) 90 | adv_rew = 0 91 | for a in adversary_agents: 92 | if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: 93 | adv_rew -= 5 94 | 95 | # Calculate positive reward for agents 96 | good_agents = self.good_agents(world) 97 | if shaped_reward: # distance-based agent reward 98 | pos_rew = -min( 99 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 100 | else: # proximity-based agent reward (binary) 101 | pos_rew = 0 102 | if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ 103 | < 2 * agent.goal_a.size: 104 | pos_rew += 5 105 | pos_rew -= min( 106 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 107 | return pos_rew + adv_rew 108 | 109 | def adversary_reward(self, agent, world): 110 | # Rewarded based on proximity to the goal landmark 111 | shaped_reward = True 112 | if shaped_reward: # distance-based reward 113 | return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 114 | else: # proximity-based reward (binary) 115 | adv_rew = 0 116 | if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: 117 | adv_rew += 5 118 | return adv_rew 119 | 120 | 121 | def observation(self, agent, world): 122 | # get positions of all entities in this agent's reference frame 123 | entity_pos = [] 124 | for entity in 
world.landmarks: 125 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 126 | # entity colors 127 | entity_color = [] 128 | for entity in world.landmarks: 129 | entity_color.append(entity.color) 130 | # communication of all other agents 131 | other_pos = [] 132 | for other in world.agents: 133 | if other is agent: continue 134 | other_pos.append(other.state.p_pos - agent.state.p_pos) 135 | 136 | if not agent.adversary: 137 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) 138 | else: 139 | return np.concatenate(entity_pos + other_pos) 140 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_crypto.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scenario: 3 | 1 speaker, 2 listeners (one of which is an adversary). Good agents rewarded for proximity to goal, and distance from 4 | adversary to goal. Adversary is rewarded for its distance to the goal. 5 | """ 6 | 7 | 8 | import numpy as np 9 | from multiagent.core import World, Agent, Landmark 10 | from multiagent.scenario import BaseScenario 11 | import random 12 | 13 | 14 | class CryptoAgent(Agent): 15 | def __init__(self): 16 | super(CryptoAgent, self).__init__() 17 | self.key = None 18 | 19 | class Scenario(BaseScenario): 20 | 21 | def make_world(self): 22 | world = World() 23 | # set any world properties first 24 | num_agents = 3 25 | num_adversaries = 1 26 | num_landmarks = 2 27 | world.dim_c = 4 28 | # add agents 29 | world.agents = [CryptoAgent() for i in range(num_agents)] 30 | for i, agent in enumerate(world.agents): 31 | agent.name = 'agent %d' % i 32 | agent.collide = False 33 | agent.adversary = True if i < num_adversaries else False 34 | agent.speaker = True if i == 2 else False 35 | agent.movable = False 36 | # add landmarks 37 | world.landmarks = [Landmark() for i in range(num_landmarks)] 38 | for i, landmark in enumerate(world.landmarks): 39 | landmark.name = 'landmark %d' % i 40 | landmark.collide = False 41 | landmark.movable = False 42 | # make initial conditions 43 | self.reset_world(world) 44 | return world 45 | 46 | 47 | def reset_world(self, world): 48 | # random properties for agents 49 | for i, agent in enumerate(world.agents): 50 | agent.color = np.array([0.25, 0.25, 0.25]) 51 | if agent.adversary: 52 | agent.color = np.array([0.75, 0.25, 0.25]) 53 | agent.key = None 54 | # random properties for landmarks 55 | color_list = [np.zeros(world.dim_c) for i in world.landmarks] 56 | for i, color in enumerate(color_list): 57 | color[i] += 1 58 | for color, landmark in zip(color_list, world.landmarks): 59 | landmark.color = color 60 | # set goal landmark 61 | goal = np.random.choice(world.landmarks) 62 | world.agents[1].color = goal.color 63 | world.agents[2].key = np.random.choice(world.landmarks).color 64 | 65 | for agent in world.agents: 66 | agent.goal_a = goal 67 | 68 | # set random initial states 69 | for agent in world.agents: 70 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 71 | agent.state.p_vel = np.zeros(world.dim_p) 72 | agent.state.c = np.zeros(world.dim_c) 73 | for i, landmark in enumerate(world.landmarks): 74 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 75 | landmark.state.p_vel = np.zeros(world.dim_p) 76 | 77 | 78 | def benchmark_data(self, agent, world): 79 | # returns data for benchmarking purposes 80 | return (agent.state.c, agent.goal_a.color) 81 | 82 | # return all agents that are not adversaries 83 | def 
good_listeners(self, world): 84 | return [agent for agent in world.agents if not agent.adversary and not agent.speaker] 85 | 86 | # return all agents that are not adversaries 87 | def good_agents(self, world): 88 | return [agent for agent in world.agents if not agent.adversary] 89 | 90 | # return all adversarial agents 91 | def adversaries(self, world): 92 | return [agent for agent in world.agents if agent.adversary] 93 | 94 | def reward(self, agent, world): 95 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 96 | 97 | def agent_reward(self, agent, world): 98 | # Agents rewarded if Bob can reconstruct message, but adversary (Eve) cannot 99 | good_listeners = self.good_listeners(world) 100 | adversaries = self.adversaries(world) 101 | good_rew = 0 102 | adv_rew = 0 103 | for a in good_listeners: 104 | if (a.state.c == np.zeros(world.dim_c)).all(): 105 | continue 106 | else: 107 | good_rew -= np.sum(np.square(a.state.c - agent.goal_a.color)) 108 | for a in adversaries: 109 | if (a.state.c == np.zeros(world.dim_c)).all(): 110 | continue 111 | else: 112 | adv_l1 = np.sum(np.square(a.state.c - agent.goal_a.color)) 113 | adv_rew += adv_l1 114 | return adv_rew + good_rew 115 | 116 | def adversary_reward(self, agent, world): 117 | # Adversary (Eve) is rewarded if it can reconstruct original goal 118 | rew = 0 119 | if not (agent.state.c == np.zeros(world.dim_c)).all(): 120 | rew -= np.sum(np.square(agent.state.c - agent.goal_a.color)) 121 | return rew 122 | 123 | 124 | def observation(self, agent, world): 125 | # goal color 126 | goal_color = np.zeros(world.dim_color) 127 | if agent.goal_a is not None: 128 | goal_color = agent.goal_a.color 129 | 130 | # get positions of all entities in this agent's reference frame 131 | entity_pos = [] 132 | for entity in world.landmarks: 133 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 134 | # communication of all other agents 135 | comm = [] 136 | for other in world.agents: 137 | if other is agent or (other.state.c is None) or not other.speaker: continue 138 | comm.append(other.state.c) 139 | 140 | confer = np.array([0]) 141 | 142 | if world.agents[2].key is None: 143 | confer = np.array([1]) 144 | key = np.zeros(world.dim_c) 145 | goal_color = np.zeros(world.dim_c) 146 | else: 147 | key = world.agents[2].key 148 | 149 | prnt = False 150 | # speaker 151 | if agent.speaker: 152 | if prnt: 153 | print('speaker') 154 | print(agent.state.c) 155 | print(np.concatenate([goal_color] + [key] + [confer] + [np.random.randn(1)])) 156 | return np.concatenate([goal_color] + [key]) 157 | # listener 158 | if not agent.speaker and not agent.adversary: 159 | if prnt: 160 | print('listener') 161 | print(agent.state.c) 162 | print(np.concatenate([key] + comm + [confer])) 163 | return np.concatenate([key] + comm) 164 | if not agent.speaker and agent.adversary: 165 | if prnt: 166 | print('adversary') 167 | print(agent.state.c) 168 | print(np.concatenate(comm + [confer])) 169 | return np.concatenate(comm) 170 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_push.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 2 10 | num_agents = 2 11 | num_adversaries = 1 12 
| num_landmarks = 2 13 | # add agents 14 | world.agents = [Agent() for i in range(num_agents)] 15 | for i, agent in enumerate(world.agents): 16 | agent.name = 'agent %d' % i 17 | agent.collide = True 18 | agent.silent = True 19 | if i < num_adversaries: 20 | agent.adversary = True 21 | else: 22 | agent.adversary = False 23 | # add landmarks 24 | world.landmarks = [Landmark() for i in range(num_landmarks)] 25 | for i, landmark in enumerate(world.landmarks): 26 | landmark.name = 'landmark %d' % i 27 | landmark.collide = False 28 | landmark.movable = False 29 | # make initial conditions 30 | self.reset_world(world) 31 | return world 32 | 33 | def reset_world(self, world): 34 | # random properties for landmarks 35 | for i, landmark in enumerate(world.landmarks): 36 | landmark.color = np.array([0.1, 0.1, 0.1]) 37 | landmark.color[i + 1] += 0.8 38 | landmark.index = i 39 | # set goal landmark 40 | goal = np.random.choice(world.landmarks) 41 | for i, agent in enumerate(world.agents): 42 | agent.goal_a = goal 43 | agent.color = np.array([0.25, 0.25, 0.25]) 44 | if agent.adversary: 45 | agent.color = np.array([0.75, 0.25, 0.25]) 46 | else: 47 | j = goal.index 48 | agent.color[j + 1] += 0.5 49 | # set random initial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | for i, landmark in enumerate(world.landmarks): 55 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 56 | landmark.state.p_vel = np.zeros(world.dim_p) 57 | 58 | def reward(self, agent, world): 59 | # Agents are rewarded based on minimum agent distance to each landmark 60 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 61 | 62 | def agent_reward(self, agent, world): 63 | # the distance to the goal 64 | return -np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 65 | 66 | def adversary_reward(self, agent, world): 67 | # keep the nearest good agents away from the goal 68 | agent_dist = [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in world.agents if not a.adversary] 69 | pos_rew = min(agent_dist) 70 | #nearest_agent = world.good_agents[np.argmin(agent_dist)] 71 | #neg_rew = np.sqrt(np.sum(np.square(nearest_agent.state.p_pos - agent.state.p_pos))) 72 | neg_rew = np.sqrt(np.sum(np.square(agent.goal_a.state.p_pos - agent.state.p_pos))) 73 | #neg_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in world.good_agents]) 74 | return pos_rew - neg_rew 75 | 76 | def observation(self, agent, world): 77 | # get positions of all entities in this agent's reference frame 78 | entity_pos = [] 79 | for entity in world.landmarks: # world.entities: 80 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 81 | # entity colors 82 | entity_color = [] 83 | for entity in world.landmarks: # world.entities: 84 | entity_color.append(entity.color) 85 | # communication of all other agents 86 | comm = [] 87 | other_pos = [] 88 | for other in world.agents: 89 | if other is agent: continue 90 | comm.append(other.state.c) 91 | other_pos.append(other.state.p_pos - agent.state.p_pos) 92 | if not agent.adversary: 93 | return np.concatenate([agent.state.p_vel] + [agent.goal_a.state.p_pos - agent.state.p_pos] + [agent.color] + entity_pos + entity_color + other_pos) 94 | else: 95 | #other_pos = list(reversed(other_pos)) if random.uniform(0,1) > 0.5 else other_pos # randomize position of 
other agents in adversary network 96 | return np.concatenate([agent.state.p_vel] + entity_pos + other_pos) 97 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_reference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 10 10 | world.collaborative = True # whether agents share rewards 11 | # add agents 12 | world.agents = [Agent() for i in range(2)] 13 | for i, agent in enumerate(world.agents): 14 | agent.name = 'agent %d' % i 15 | agent.collide = False 16 | # add landmarks 17 | world.landmarks = [Landmark() for i in range(3)] 18 | for i, landmark in enumerate(world.landmarks): 19 | landmark.name = 'landmark %d' % i 20 | landmark.collide = False 21 | landmark.movable = False 22 | # make initial conditions 23 | self.reset_world(world) 24 | return world 25 | 26 | def reset_world(self, world): 27 | # assign goals to agents 28 | for agent in world.agents: 29 | agent.goal_a = None 30 | agent.goal_b = None 31 | # want other agent to go to the goal landmark 32 | world.agents[0].goal_a = world.agents[1] 33 | world.agents[0].goal_b = np.random.choice(world.landmarks) 34 | world.agents[1].goal_a = world.agents[0] 35 | world.agents[1].goal_b = np.random.choice(world.landmarks) 36 | # random properties for agents 37 | for i, agent in enumerate(world.agents): 38 | agent.color = np.array([0.25,0.25,0.25]) 39 | # random properties for landmarks 40 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 41 | world.landmarks[1].color = np.array([0.25,0.75,0.25]) 42 | world.landmarks[2].color = np.array([0.25,0.25,0.75]) 43 | # special colors for goals 44 | world.agents[0].goal_a.color = world.agents[0].goal_b.color 45 | world.agents[1].goal_a.color = world.agents[1].goal_b.color 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 | landmark.state.p_vel = np.zeros(world.dim_p) 54 | 55 | def reward(self, agent, world): 56 | if agent.goal_a is None or agent.goal_b is None: 57 | return 0.0 58 | dist2 = np.sum(np.square(agent.goal_a.state.p_pos - agent.goal_b.state.p_pos)) 59 | return -dist2 60 | 61 | def observation(self, agent, world): 62 | # goal color 63 | goal_color = [np.zeros(world.dim_color), np.zeros(world.dim_color)] 64 | if agent.goal_b is not None: 65 | goal_color[1] = agent.goal_b.color 66 | 67 | # get positions of all entities in this agent's reference frame 68 | entity_pos = [] 69 | for entity in world.landmarks: 70 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 71 | # entity colors 72 | entity_color = [] 73 | for entity in world.landmarks: 74 | entity_color.append(entity.color) 75 | # communication of all other agents 76 | comm = [] 77 | for other in world.agents: 78 | if other is agent: continue 79 | comm.append(other.state.c) 80 | return np.concatenate([agent.state.p_vel] + entity_pos + [goal_color[1]] + comm) 81 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_speaker_listener.py: 
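simple_reference.py sets world.collaborative = True, which MultiAgentEnv.__init__ turns into shared_reward; in step() every agent then receives the summed team reward. A small illustration of that sharing step, with hypothetical per-agent values:

# reward sharing as done in MultiAgentEnv.step (the numbers are made up)
reward_n = [-1.2, -0.7]                  # what reward_callback returned per agent
shared_reward = True                     # because world.collaborative is True here
reward = sum(reward_n)                   # -1.9
if shared_reward:
    reward_n = [reward] * len(reward_n)  # both agents receive -1.9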
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 |     def make_world(self): 7 |         world = World() 8 |         # set any world properties first 9 |         world.dim_c = 3 10 |         num_landmarks = 3 11 |         world.collaborative = True 12 |         # add agents 13 |         world.agents = [Agent() for i in range(2)] 14 |         for i, agent in enumerate(world.agents): 15 |             agent.name = 'agent %d' % i 16 |             agent.collide = False 17 |             agent.size = 0.075 18 |         # speaker 19 |         world.agents[0].movable = False 20 |         # listener 21 |         world.agents[1].silent = True 22 |         # add landmarks 23 |         world.landmarks = [Landmark() for i in range(num_landmarks)] 24 |         for i, landmark in enumerate(world.landmarks): 25 |             landmark.name = 'landmark %d' % i 26 |             landmark.collide = False 27 |             landmark.movable = False 28 |             landmark.size = 0.04 29 |         # make initial conditions 30 |         self.reset_world(world) 31 |         return world 32 | 33 |     def reset_world(self, world): 34 |         # assign goals to agents 35 |         for agent in world.agents: 36 |             agent.goal_a = None 37 |             agent.goal_b = None 38 |         # want listener to go to the goal landmark 39 |         world.agents[0].goal_a = world.agents[1] 40 |         world.agents[0].goal_b = np.random.choice(world.landmarks) 41 |         # random properties for agents 42 |         for i, agent in enumerate(world.agents): 43 |             agent.color = np.array([0.25,0.25,0.25]) 44 |         # random properties for landmarks 45 |         world.landmarks[0].color = np.array([0.65,0.15,0.15]) 46 |         world.landmarks[1].color = np.array([0.15,0.65,0.15]) 47 |         world.landmarks[2].color = np.array([0.15,0.15,0.65]) 48 |         # special colors for goals 49 |         world.agents[0].goal_a.color = world.agents[0].goal_b.color + np.array([0.45, 0.45, 0.45]) 50 |         # set random initial states 51 |         for agent in world.agents: 52 |             agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 53 |             agent.state.p_vel = np.zeros(world.dim_p) 54 |             agent.state.c = np.zeros(world.dim_c) 55 |         for i, landmark in enumerate(world.landmarks): 56 |             landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 57 |             landmark.state.p_vel = np.zeros(world.dim_p) 58 | 59 |     def benchmark_data(self, agent, world): 60 |         # returns data for benchmarking purposes 61 |         return self.reward(agent, world) 62 | 63 |     def reward(self, agent, world): 64 |         # squared distance from listener to landmark 65 |         a = world.agents[0] 66 |         dist2 = np.sum(np.square(a.goal_a.state.p_pos - a.goal_b.state.p_pos)) 67 |         return -dist2 68 | 69 |     def observation(self, agent, world): 70 |         # goal color 71 |         goal_color = np.zeros(world.dim_color) 72 |         if agent.goal_b is not None: 73 |             goal_color = agent.goal_b.color 74 | 75 |         # get positions of all entities in this agent's reference frame 76 |         entity_pos = [] 77 |         for entity in world.landmarks: 78 |             entity_pos.append(entity.state.p_pos - agent.state.p_pos) 79 | 80 |         # communication of all other agents 81 |         comm = [] 82 |         for other in world.agents: 83 |             if other is agent or (other.state.c is None): continue 84 |             comm.append(other.state.c) 85 | 86 |         # speaker 87 |         if not agent.movable: 88 |             return np.concatenate([goal_color]) 89 |         # listener 90 |         if agent.silent: 91 |             return np.concatenate([agent.state.p_vel] + entity_pos + comm) 92 | 93 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_spread.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 |
from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 3 12 | num_landmarks = 3 13 | world.collaborative = True 14 | # add agents 15 | world.agents = [Agent() for i in range(num_agents)] 16 | for i, agent in enumerate(world.agents): 17 | agent.name = 'agent %d' % i 18 | agent.collide = True 19 | agent.silent = True 20 | agent.size = 0.15 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | # make initial conditions 28 | self.reset_world(world) 29 | return world 30 | 31 | def reset_world(self, world): 32 | # random properties for agents 33 | for i, agent in enumerate(world.agents): 34 | agent.color = np.array([0.35, 0.35, 0.85]) 35 | # random properties for landmarks 36 | for i, landmark in enumerate(world.landmarks): 37 | landmark.color = np.array([0.25, 0.25, 0.25]) 38 | # set random initial states 39 | for agent in world.agents: 40 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 41 | agent.state.p_vel = np.zeros(world.dim_p) 42 | agent.state.c = np.zeros(world.dim_c) 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 45 | landmark.state.p_vel = np.zeros(world.dim_p) 46 | 47 | def benchmark_data(self, agent, world): 48 | rew = 0 49 | collisions = 0 50 | occupied_landmarks = 0 51 | min_dists = 0 52 | for l in world.landmarks: 53 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 54 | min_dists += min(dists) 55 | rew -= min(dists) 56 | if min(dists) < 0.1: 57 | occupied_landmarks += 1 58 | if agent.collide: 59 | for a in world.agents: 60 | if self.is_collision(a, agent): 61 | rew -= 1 62 | collisions += 1 63 | return (rew, collisions, min_dists, occupied_landmarks) 64 | 65 | 66 | def is_collision(self, agent1, agent2): 67 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 68 | dist = np.sqrt(np.sum(np.square(delta_pos))) 69 | dist_min = agent1.size + agent2.size 70 | return True if dist < dist_min else False 71 | 72 | def reward(self, agent, world): 73 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 74 | rew = 0 75 | for l in world.landmarks: 76 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 77 | rew -= min(dists) 78 | if agent.collide: 79 | for a in world.agents: 80 | if self.is_collision(a, agent): 81 | rew -= 1 82 | return rew 83 | 84 | def observation(self, agent, world): 85 | # get positions of all entities in this agent's reference frame 86 | entity_pos = [] 87 | for entity in world.landmarks: # world.entities: 88 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 89 | # entity colors 90 | entity_color = [] 91 | for entity in world.landmarks: # world.entities: 92 | entity_color.append(entity.color) 93 | # communication of all other agents 94 | comm = [] 95 | other_pos = [] 96 | for other in world.agents: 97 | if other is agent: continue 98 | comm.append(other.state.c) 99 | other_pos.append(other.state.p_pos - agent.state.p_pos) 100 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 101 | 
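Each scenario file above implements the same small interface: make_world() builds a World populated with agents and landmarks, reset_world() randomises it, and reward()/observation() are queried once per agent at every step. The sketch below (not part of the repository; it only assumes the multiagent package is importable) drives the simple_spread scenario directly, without the MADDPG trainer, to show how these hooks are consumed; make_env() in train.py further below performs the same wiring and then wraps everything in MultiAgentEnv.

from multiagent.scenarios.simple_spread import Scenario

scenario = Scenario()
world = scenario.make_world()    # 3 agents, 3 landmarks, world.collaborative = True
scenario.reset_world(world)      # random positions, zero velocities and comm channels
for agent in world.agents:
    obs = scenario.observation(agent, world)
    # vel(2) + pos(2) + 3 landmark offsets(6) + 2 other-agent offsets(4) + 2 comm vectors(4) = 18 dims
    print(agent.name, obs.shape, scenario.reward(agent, world))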
-------------------------------------------------------------------------------- /multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents = 1 12 | num_adversaries = 3 13 | num_agents = num_adversaries + num_good_agents 14 | num_landmarks = 2 15 | # add agents 16 | world.agents = [Agent() for i in range(num_agents)] 17 | for i, agent in enumerate(world.agents): 18 | agent.name = 'agent %d' % i 19 | agent.collide = True 20 | agent.silent = True 21 | agent.adversary = True if i < num_adversaries else False 22 | agent.size = 0.075 if agent.adversary else 0.05 23 | agent.accel = 3.0 if agent.adversary else 4.0 24 | #agent.accel = 20.0 if agent.adversary else 25.0 25 | agent.max_speed = 1.0 if agent.adversary else 1.3 26 | # add landmarks 27 | world.landmarks = [Landmark() for i in range(num_landmarks)] 28 | for i, landmark in enumerate(world.landmarks): 29 | landmark.name = 'landmark %d' % i 30 | landmark.collide = True 31 | landmark.movable = False 32 | landmark.size = 0.2 33 | landmark.boundary = False 34 | # make initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | 39 | def reset_world(self, world): 40 | # random properties for agents 41 | for i, agent in enumerate(world.agents): 42 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 43 | # random properties for landmarks 44 | for i, landmark in enumerate(world.landmarks): 45 | landmark.color = np.array([0.25, 0.25, 0.25]) 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | if not landmark.boundary: 53 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 54 | landmark.state.p_vel = np.zeros(world.dim_p) 55 | 56 | 57 | def benchmark_data(self, agent, world): 58 | # returns data for benchmarking purposes 59 | if agent.adversary: 60 | collisions = 0 61 | for a in self.good_agents(world): 62 | if self.is_collision(a, agent): 63 | collisions += 1 64 | return collisions 65 | else: 66 | return 0 67 | 68 | 69 | def is_collision(self, agent1, agent2): 70 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 71 | dist = np.sqrt(np.sum(np.square(delta_pos))) 72 | dist_min = agent1.size + agent2.size 73 | return True if dist < dist_min else False 74 | 75 | # return all agents that are not adversaries 76 | def good_agents(self, world): 77 | return [agent for agent in world.agents if not agent.adversary] 78 | 79 | # return all adversarial agents 80 | def adversaries(self, world): 81 | return [agent for agent in world.agents if agent.adversary] 82 | 83 | 84 | def reward(self, agent, world): 85 | # Agents are rewarded based on minimum agent distance to each landmark 86 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 87 | return main_reward 88 | 89 | def agent_reward(self, agent, world): 90 | # Agents are negatively rewarded if caught by adversaries 91 | rew = 0 92 | shape = False 93 | adversaries = self.adversaries(world) 94 | if shape: # 
reward can optionally be shaped (increased reward for increased distance from adversary) 95 | for adv in adversaries: 96 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 97 | if agent.collide: 98 | for a in adversaries: 99 | if self.is_collision(a, agent): 100 | rew -= 10 101 | 102 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 103 | def bound(x): 104 | if x < 0.9: 105 | return 0 106 | if x < 1.0: 107 | return (x - 0.9) * 10 108 | return min(np.exp(2 * x - 2), 10) 109 | for p in range(world.dim_p): 110 | x = abs(agent.state.p_pos[p]) 111 | rew -= bound(x) 112 | 113 | return rew 114 | 115 | def adversary_reward(self, agent, world): 116 | # Adversaries are rewarded for collisions with agents 117 | rew = 0 118 | shape = False 119 | agents = self.good_agents(world) 120 | adversaries = self.adversaries(world) 121 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 122 | for adv in adversaries: 123 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 124 | if agent.collide: 125 | for ag in agents: 126 | for adv in adversaries: 127 | if self.is_collision(ag, adv): 128 | rew += 10 129 | return rew 130 | 131 | def observation(self, agent, world): 132 | # get positions of all entities in this agent's reference frame 133 | entity_pos = [] 134 | for entity in world.landmarks: 135 | if not entity.boundary: 136 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 137 | # communication of all other agents 138 | comm = [] 139 | other_pos = [] 140 | other_vel = [] 141 | for other in world.agents: 142 | if other is agent: continue 143 | comm.append(other.state.c) 144 | other_pos.append(other.state.p_pos - agent.state.p_pos) 145 | if not other.adversary: 146 | other_vel.append(other.state.p_vel) 147 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 148 | -------------------------------------------------------------------------------- /multiagent/scenarios/simple_world_comm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 4 11 | #world.damping = 1 12 | num_good_agents = 2 13 | num_adversaries = 4 14 | num_agents = num_adversaries + num_good_agents 15 | num_landmarks = 1 16 | num_food = 2 17 | num_forests = 2 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.leader = True if i == 0 else False 24 | agent.silent = True if i > 0 else False 25 | agent.adversary = True if i < num_adversaries else False 26 | agent.size = 0.075 if agent.adversary else 0.045 27 | agent.accel = 3.0 if agent.adversary else 4.0 28 | #agent.accel = 20.0 if agent.adversary else 25.0 29 | agent.max_speed = 1.0 if agent.adversary else 1.3 30 | # add landmarks 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmark %d' % i 34 | landmark.collide = True 35 | landmark.movable = False 36 | landmark.size = 0.2 37 | landmark.boundary = False 38 | world.food = [Landmark() for i in 
range(num_food)] 39 | for i, landmark in enumerate(world.food): 40 | landmark.name = 'food %d' % i 41 | landmark.collide = False 42 | landmark.movable = False 43 | landmark.size = 0.03 44 | landmark.boundary = False 45 | world.forests = [Landmark() for i in range(num_forests)] 46 | for i, landmark in enumerate(world.forests): 47 | landmark.name = 'forest %d' % i 48 | landmark.collide = False 49 | landmark.movable = False 50 | landmark.size = 0.3 51 | landmark.boundary = False 52 | world.landmarks += world.food 53 | world.landmarks += world.forests 54 | #world.landmarks += self.set_boundaries(world) # world boundaries now penalized with negative reward 55 | # make initial conditions 56 | self.reset_world(world) 57 | return world 58 | 59 | def set_boundaries(self, world): 60 | boundary_list = [] 61 | landmark_size = 1 62 | edge = 1 + landmark_size 63 | num_landmarks = int(edge * 2 / landmark_size) 64 | for x_pos in [-edge, edge]: 65 | for i in range(num_landmarks): 66 | l = Landmark() 67 | l.state.p_pos = np.array([x_pos, -1 + i * landmark_size]) 68 | boundary_list.append(l) 69 | 70 | for y_pos in [-edge, edge]: 71 | for i in range(num_landmarks): 72 | l = Landmark() 73 | l.state.p_pos = np.array([-1 + i * landmark_size, y_pos]) 74 | boundary_list.append(l) 75 | 76 | for i, l in enumerate(boundary_list): 77 | l.name = 'boundary %d' % i 78 | l.collide = True 79 | l.movable = False 80 | l.boundary = True 81 | l.color = np.array([0.75, 0.75, 0.75]) 82 | l.size = landmark_size 83 | l.state.p_vel = np.zeros(world.dim_p) 84 | 85 | return boundary_list 86 | 87 | 88 | def reset_world(self, world): 89 | # random properties for agents 90 | for i, agent in enumerate(world.agents): 91 | agent.color = np.array([0.45, 0.95, 0.45]) if not agent.adversary else np.array([0.95, 0.45, 0.45]) 92 | agent.color -= np.array([0.3, 0.3, 0.3]) if agent.leader else np.array([0, 0, 0]) 93 | # random properties for landmarks 94 | for i, landmark in enumerate(world.landmarks): 95 | landmark.color = np.array([0.25, 0.25, 0.25]) 96 | for i, landmark in enumerate(world.food): 97 | landmark.color = np.array([0.15, 0.15, 0.65]) 98 | for i, landmark in enumerate(world.forests): 99 | landmark.color = np.array([0.6, 0.9, 0.6]) 100 | # set random initial states 101 | for agent in world.agents: 102 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 103 | agent.state.p_vel = np.zeros(world.dim_p) 104 | agent.state.c = np.zeros(world.dim_c) 105 | for i, landmark in enumerate(world.landmarks): 106 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 107 | landmark.state.p_vel = np.zeros(world.dim_p) 108 | for i, landmark in enumerate(world.food): 109 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 110 | landmark.state.p_vel = np.zeros(world.dim_p) 111 | for i, landmark in enumerate(world.forests): 112 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 113 | landmark.state.p_vel = np.zeros(world.dim_p) 114 | 115 | def benchmark_data(self, agent, world): 116 | if agent.adversary: 117 | collisions = 0 118 | for a in self.good_agents(world): 119 | if self.is_collision(a, agent): 120 | collisions += 1 121 | return collisions 122 | else: 123 | return 0 124 | 125 | 126 | def is_collision(self, agent1, agent2): 127 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 128 | dist = np.sqrt(np.sum(np.square(delta_pos))) 129 | dist_min = agent1.size + agent2.size 130 | return True if dist < dist_min else False 131 | 132 | 133 | # return all agents that are not adversaries 134 | def 
good_agents(self, world): 135 | return [agent for agent in world.agents if not agent.adversary] 136 | 137 | # return all adversarial agents 138 | def adversaries(self, world): 139 | return [agent for agent in world.agents if agent.adversary] 140 | 141 | 142 | def reward(self, agent, world): 143 | # Agents are rewarded based on minimum agent distance to each landmark 144 | #boundary_reward = -10 if self.outside_boundary(agent) else 0 145 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 146 | return main_reward 147 | 148 | def outside_boundary(self, agent): 149 | if agent.state.p_pos[0] > 1 or agent.state.p_pos[0] < -1 or agent.state.p_pos[1] > 1 or agent.state.p_pos[1] < -1: 150 | return True 151 | else: 152 | return False 153 | 154 | 155 | def agent_reward(self, agent, world): 156 | # Agents are rewarded based on minimum agent distance to each landmark 157 | rew = 0 158 | shape = False 159 | adversaries = self.adversaries(world) 160 | if shape: 161 | for adv in adversaries: 162 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 163 | if agent.collide: 164 | for a in adversaries: 165 | if self.is_collision(a, agent): 166 | rew -= 5 167 | def bound(x): 168 | if x < 0.9: 169 | return 0 170 | if x < 1.0: 171 | return (x - 0.9) * 10 172 | return min(np.exp(2 * x - 2), 10) # 1 + (x - 1) * (x - 1) 173 | 174 | for p in range(world.dim_p): 175 | x = abs(agent.state.p_pos[p]) 176 | rew -= 2 * bound(x) 177 | 178 | for food in world.food: 179 | if self.is_collision(agent, food): 180 | rew += 2 181 | rew += 0.05 * min([np.sqrt(np.sum(np.square(food.state.p_pos - agent.state.p_pos))) for food in world.food]) 182 | 183 | return rew 184 | 185 | def adversary_reward(self, agent, world): 186 | # Agents are rewarded based on minimum agent distance to each landmark 187 | rew = 0 188 | shape = True 189 | agents = self.good_agents(world) 190 | adversaries = self.adversaries(world) 191 | if shape: 192 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 193 | if agent.collide: 194 | for ag in agents: 195 | for adv in adversaries: 196 | if self.is_collision(ag, adv): 197 | rew += 5 198 | return rew 199 | 200 | 201 | def observation2(self, agent, world): 202 | # get positions of all entities in this agent's reference frame 203 | entity_pos = [] 204 | for entity in world.landmarks: 205 | if not entity.boundary: 206 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 207 | 208 | food_pos = [] 209 | for entity in world.food: 210 | if not entity.boundary: 211 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 212 | # communication of all other agents 213 | comm = [] 214 | other_pos = [] 215 | other_vel = [] 216 | for other in world.agents: 217 | if other is agent: continue 218 | comm.append(other.state.c) 219 | other_pos.append(other.state.p_pos - agent.state.p_pos) 220 | if not other.adversary: 221 | other_vel.append(other.state.p_vel) 222 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 223 | 224 | def observation(self, agent, world): 225 | # get positions of all entities in this agent's reference frame 226 | entity_pos = [] 227 | for entity in world.landmarks: 228 | if not entity.boundary: 229 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 230 | 231 | in_forest = [np.array([-1]), np.array([-1])] 232 | inf1 = False 233 | inf2 = False 234 | if self.is_collision(agent, world.forests[0]): 235 | 
in_forest[0] = np.array([1]) 236 | inf1= True 237 | if self.is_collision(agent, world.forests[1]): 238 | in_forest[1] = np.array([1]) 239 | inf2 = True 240 | 241 | food_pos = [] 242 | for entity in world.food: 243 | if not entity.boundary: 244 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 245 | # communication of all other agents 246 | comm = [] 247 | other_pos = [] 248 | other_vel = [] 249 | for other in world.agents: 250 | if other is agent: continue 251 | comm.append(other.state.c) 252 | oth_f1 = self.is_collision(other, world.forests[0]) 253 | oth_f2 = self.is_collision(other, world.forests[1]) 254 | if (inf1 and oth_f1) or (inf2 and oth_f2) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #without forest vis 255 | other_pos.append(other.state.p_pos - agent.state.p_pos) 256 | if not other.adversary: 257 | other_vel.append(other.state.p_vel) 258 | else: 259 | other_pos.append([0, 0]) 260 | if not other.adversary: 261 | other_vel.append([0, 0]) 262 | 263 | # to tell the pred when the prey are in the forest 264 | prey_forest = [] 265 | ga = self.good_agents(world) 266 | for a in ga: 267 | if any([self.is_collision(a, f) for f in world.forests]): 268 | prey_forest.append(np.array([1])) 269 | else: 270 | prey_forest.append(np.array([-1])) 271 | # to tell leader when pred are in forest 272 | prey_forest_lead = [] 273 | for f in world.forests: 274 | if any([self.is_collision(a, f) for a in ga]): 275 | prey_forest_lead.append(np.array([1])) 276 | else: 277 | prey_forest_lead.append(np.array([-1])) 278 | 279 | comm = [world.agents[0].state.c] 280 | 281 | if agent.adversary and not agent.leader: 282 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 283 | if agent.leader: 284 | return np.concatenate( 285 | [agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 286 | else: 287 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel) 288 | 289 | 290 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='maddpg', 4 | version='0.0.1', 5 | description='Multi-Agent Deep Deterministic Policy Gradient', 6 | url='https://github.com/openai/maddpg', 7 | author='Igor Mordatch', 8 | author_email='mordatch@openai.com', 9 | packages=find_packages(), 10 | include_package_data=True, 11 | zip_safe=False, 12 | install_requires=['gym', 'numpy-stl'] 13 | ) 14 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import tensorflow as tf 4 | import time 5 | import pickle 6 | 7 | import maddpg.common.tf_util as U 8 | from maddpg.trainer.maddpg import MADDPGAgentTrainer 9 | import tensorflow.contrib.layers as layers 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 13 | # Environment 14 | parser.add_argument("--scenario", type=str, default="formation", help="name of the scenario script") 15 | parser.add_argument("--max-episode-len", type=int, default=120, help="maximum episode length") 16 | parser.add_argument("--num-episodes", type=int, default=50000, help="number of episodes") 17 
| parser.add_argument("--num-adversaries", type=int, default=1, help="number of adversaries") 18 | parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents") 19 | parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries") 20 | # Core training parameters 21 | parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer") 22 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 23 | parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time") 24 | parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp") 25 | # Checkpointing 26 | parser.add_argument("--exp-name", type=str, default="formation", help="name of the experiment") 27 | parser.add_argument("--save-dir", type=str, default="/home/islam/training/policy/", help="directory in which training state and model should be saved") 28 | parser.add_argument("--save-rate", type=int, default=100, help="save model once every time this many episodes are completed") 29 | parser.add_argument("--load-dir", type=str, default="", help="directory in which training state and model are loaded") 30 | # Evaluation 31 | parser.add_argument("--restore", action="store_true", default=False) 32 | parser.add_argument("--display", action="store_true", default=True) 33 | parser.add_argument("--benchmark", action="store_true", default=False) 34 | parser.add_argument("--benchmark-iters", type=int, default=100000, help="number of iterations run for benchmarking") 35 | parser.add_argument("--benchmark-dir", type=str, default="/home/islam/training/benchmark/", help="directory where benchmark data is saved") 36 | parser.add_argument("--plots-dir", type=str, default="/home/islam/training/curves/", help="directory where plot data is saved") 37 | return parser.parse_args() 38 | 39 | def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None): 40 | # This model takes as input an observation and returns values of all actions 41 | with tf.variable_scope(scope, reuse=reuse): 42 | out = input 43 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 44 | out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu) 45 | out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None) 46 | return out 47 | 48 | def make_env(scenario_name, arglist, benchmark=False): 49 | from multiagent.environment import MultiAgentEnv 50 | import multiagent.scenarios as scenarios 51 | 52 | # load scenario from script 53 | scenario = scenarios.load(scenario_name + ".py").Scenario() 54 | # create world 55 | world = scenario.make_world() 56 | # create multiagent environment 57 | if benchmark: 58 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) 59 | else: 60 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 61 | return env 62 | 63 | def get_trainers(env, num_adversaries, obs_shape_n, arglist): 64 | trainers = [] 65 | model = mlp_model 66 | trainer = MADDPGAgentTrainer 67 | for i in range(num_adversaries): 68 | trainers.append(trainer( 69 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 70 | local_q_func=(arglist.adv_policy=='ddpg'))) 71 | for i in range(num_adversaries, env.n): 72 | trainers.append(trainer( 73 | "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist, 
74 | local_q_func=(arglist.good_policy=='ddpg'))) 75 | return trainers 76 | 77 | 78 | def train(arglist): 79 | with U.single_threaded_session(): 80 | # Create environment 81 | env = make_env(arglist.scenario, arglist, arglist.benchmark) 82 | # Create agent trainers 83 | obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] 84 | num_adversaries = min(env.n, arglist.num_adversaries) 85 | trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist) 86 | print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy)) 87 | 88 | # Initialize 89 | U.initialize() 90 | 91 | # Load previous results, if necessary 92 | if arglist.load_dir == "": 93 | arglist.load_dir = arglist.save_dir 94 | if arglist.display or arglist.restore or arglist.benchmark: 95 | print('Loading previous state...') 96 | U.load_state(arglist.load_dir) 97 | 98 | episode_rewards = [0.0] # sum of rewards for all agents 99 | agent_rewards = [[0.0] for _ in range(env.n)] # individual agent reward 100 | final_ep_rewards = [] # sum of rewards for training curve 101 | final_ep_ag_rewards = [] # agent rewards for training curve 102 | agent_info = [[[]]] # placeholder for benchmarking info 103 | saver = tf.train.Saver() 104 | obs_n = env.reset() 105 | episode_step = 0 106 | train_step = 0 107 | t_start = time.time() 108 | 109 | print('Starting iterations...') 110 | while True: 111 | # get action 112 | action_n = [agent.action(obs) for agent, obs in zip(trainers,obs_n)] 113 | # environment step 114 | new_obs_n, rew_n, done_n, info_n = env.step(action_n) 115 | episode_step += 1 116 | done = all(done_n) 117 | terminal = (episode_step >= arglist.max_episode_len) 118 | # collect experience 119 | for i, agent in enumerate(trainers): 120 | agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal) 121 | obs_n = new_obs_n 122 | 123 | for i, rew in enumerate(rew_n): 124 | episode_rewards[-1] += rew 125 | agent_rewards[i][-1] += rew 126 | 127 | if done or terminal: 128 | obs_n = env.reset() 129 | episode_step = 0 130 | episode_rewards.append(0) 131 | for a in agent_rewards: 132 | a.append(0) 133 | agent_info.append([[]]) 134 | 135 | # increment global step counter 136 | train_step += 1 137 | 138 | # for benchmarking learned policies 139 | if arglist.benchmark: 140 | for i, info in enumerate(info_n): 141 | agent_info[-1][i].append(info_n['n']) 142 | if train_step > arglist.benchmark_iters and (done or terminal): 143 | file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl' 144 | print('Finished benchmarking, now saving...') 145 | with open(file_name, 'wb') as fp: 146 | pickle.dump(agent_info[:-1], fp) 147 | break 148 | continue 149 | 150 | # for displaying learned policies 151 | if arglist.display: 152 | time.sleep(0.1) 153 | env.render() 154 | continue 155 | 156 | # update all trainers, if not in display or benchmark mode 157 | loss = None 158 | for agent in trainers: 159 | agent.preupdate() 160 | for agent in trainers: 161 | loss = agent.update(trainers, train_step) 162 | 163 | # save model, display training output 164 | if terminal and (len(episode_rewards) % arglist.save_rate == 0): 165 | U.save_state(arglist.save_dir, saver=saver) 166 | # print statement depends on whether or not there are adversaries 167 | if num_adversaries == 0: 168 | print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format( 169 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3))) 170 | else: 171 | print("steps: 
{}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 172 | train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]), 173 | [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3))) 174 | t_start = time.time() 175 | # Keep track of final episode reward 176 | final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:])) 177 | for rew in agent_rewards: 178 | final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:])) 179 | 180 | # saves final episode reward for plotting training curve later 181 | if len(episode_rewards) > arglist.num_episodes: 182 | rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' 183 | with open(rew_file_name, 'wb') as fp: 184 | pickle.dump(final_ep_rewards, fp) 185 | agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl' 186 | with open(agrew_file_name, 'wb') as fp: 187 | pickle.dump(final_ep_ag_rewards, fp) 188 | print('...Finished total of {} episodes.'.format(len(episode_rewards))) 189 | break 190 | 191 | if __name__ == '__main__': 192 | arglist = parse_args() 193 | train(arglist) --------------------------------------------------------------------------------
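Training is driven by train.py; with the defaults in parse_args it loads the formation scenario, runs 50000 episodes of at most 120 steps, and saves a checkpoint every 100 episodes. Note that in this copy --display is a store_true flag whose default is True, so the main loop only renders a previously saved policy and never reaches the update and save code; to train from scratch that default has to be set back to False (as in the upstream MADDPG train.py). After installing the packages (e.g. pip install -e ., which picks up both maddpg and multiagent via find_packages), a run can be started with python train.py --scenario formation. When training finishes, the mean episode rewards collected every --save-rate episodes are pickled to <plots-dir>/<exp-name>_rewards.pkl; the sketch below (not part of the repository; it assumes matplotlib is installed and that the default paths from parse_args were used) plots that curve.

import pickle
import matplotlib.pyplot as plt

# Default location: plots_dir + exp_name + '_rewards.pkl' (see parse_args in train.py).
with open('/home/islam/training/curves/formation_rewards.pkl', 'rb') as fp:
    rewards = pickle.load(fp)    # one mean-reward point per --save-rate (default 100) episodes

plt.plot([(i + 1) * 100 for i in range(len(rewards))], rewards)
plt.xlabel('episode')
plt.ylabel('mean episode reward over the last 100 episodes')
plt.title('MADDPG formation-control training curve')
plt.show()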