├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── multi-agent-irl ├── README.md ├── all_result.sh ├── compute_kl.sh ├── curve.sh ├── hyper_study.sh ├── irl │ ├── __init__.py │ ├── dataset.py │ ├── mack │ │ ├── __init__.py │ │ ├── airl.py │ │ ├── codail.py │ │ ├── gail.py │ │ ├── kfac_discriminator.py │ │ ├── kfac_discriminator_airl.py │ │ ├── kfac_discriminator_codail.py │ │ ├── kfac_discriminator_ncdail.py │ │ ├── ncdail.py │ │ ├── run_mack_airl.py │ │ ├── run_mack_codail.py │ │ ├── run_mack_gail.py │ │ ├── run_mack_ncdail.py │ │ └── tf_util.py │ └── render.py ├── plot_distribution.sh ├── requirements.txt ├── rl │ ├── __init__.py │ ├── acktr │ │ ├── README.md │ │ ├── __init__.py │ │ ├── filters.py │ │ ├── kfac.py │ │ ├── kfac_utils.py │ │ ├── running_stat.py │ │ └── utils.py │ ├── bench │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ └── monitor.py │ ├── common │ │ ├── __init__.py │ │ ├── atari_wrappers.py │ │ ├── console_util.py │ │ ├── dataset.py │ │ ├── distributions.py │ │ ├── ma_wrappers.py │ │ ├── math_util.py │ │ ├── misc_util.py │ │ ├── mpi_adam.py │ │ ├── mpi_fork.py │ │ ├── mpi_moments.py │ │ ├── mpi_running_mean_std.py │ │ ├── schedules.py │ │ ├── segment_tree.py │ │ ├── tf_util.py │ │ └── vec_env │ │ │ ├── __init__.py │ │ │ ├── dummy_vec_env.py │ │ │ ├── mpi_vec_env1.py │ │ │ ├── speedtest.py │ │ │ ├── subproc_vec_env.py │ │ │ ├── subproc_vec_env_walker.py │ │ │ ├── vec_frame_stack.py │ │ │ └── vec_normalize.py │ ├── envs │ │ ├── __init__.py │ │ ├── ant_og.xml │ │ ├── mujoco_env │ │ │ ├── __init__.py │ │ │ └── walker2d.py │ │ ├── multi_ant.py │ │ ├── multi_ant.xml │ │ └── multi_walker.py │ └── logger.py ├── sample.sh └── sandbox │ ├── __init__.py │ ├── imitation │ ├── __init__.py │ ├── crender.py │ ├── render.py │ ├── run_cmappo.py │ └── run_mujoco.py │ ├── mack │ ├── __init__.py │ ├── acktr_cont.py │ ├── acktr_disc.py │ ├── acktr_disc_om.py │ ├── acktr_multi_disc.py │ ├── opponent_policies.py │ ├── policies.py │ ├── policies_om.py │ ├── render.py │ ├── run_clone.py │ ├── run_simple.py │ ├── run_simple_om.py │ ├── run_walker.py │ └── run_walker_multi_disc.py │ └── mppo │ ├── __init__.py │ ├── policies.py │ ├── ppo2.py │ ├── run_simple_walker.py │ ├── run_sumo.py │ └── run_walker.py └── multi-agent-particle-envs ├── LICENSE.txt ├── README.md ├── bin ├── __init__.py └── interactive.py ├── make_env.py ├── multiagent ├── __init__.py ├── core.py ├── environment.py ├── multi_discrete.py ├── policy.py ├── rendering.py ├── scenario.py └── scenarios │ ├── __init__.py │ ├── simple.py │ ├── simple_adversary.py │ ├── simple_crypto.py │ ├── simple_push.py │ ├── simple_reference.py │ ├── simple_speaker_listener.py │ ├── simple_spread.py │ ├── simple_tag.py │ └── simple_world_comm.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # workspace 132 | .idea 133 | .vscode 134 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/ma-particle-envs"] 2 | path = thirdparty/ma-particle-envs 3 | url = https://github.com/openai/multiagent-particle-envs 4 | [submodule "thirdparty/ma-airl"] 5 | path = thirdparty/ma-airl 6 | url = https://github.com/ermongroup/ma-airl 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CoDAIL 2 | Implementation of CoDAIL from the paper [Multi-Agent Interactions Modeling with Correlated Policies](https://openreview.net/forum?id=B1gZV1HYvS), based on [MA-AIRL](https://github.com/ermongroup/ma-airl). 3 | 4 | ## Running the Code 5 | 6 | - For the code implementing CoDAIL, please visit the `multi-agent-irl` folder. 7 | - For the OpenAI particle environment code, please visit the `multi-agent-particle-envs` folder. 8 | 9 | 10 | **NOTE**: An early implementation can be found at [codailiclr2020/CoDAIL](https://github.com/codailiclr2020/CoDAIL) 11 | -------------------------------------------------------------------------------- /multi-agent-irl/README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent Adversarial Inverse Reinforcement Learning 2 | 3 | First, install the requirements. 4 | Run the following command in this folder: 5 | ``` 6 | pip install -r requirements.txt 7 | ``` 8 | Then cd into the particle environment folder and run: 9 | ``` 10 | pip install -e .
11 | ``` 12 | to install the env. 13 | 14 | Run Multi-Agent ACKTR to obtain experts: 15 | ``` 16 | python -m sandbox.mack.run_simple 17 | python -m sandbox.mack.run_simple_om 18 | ``` 19 | The former generates interactions in which agents do not consider others, while the latter generates interactions in which agents model others when making decisions. Note that you need to set the command-line args appropriately for the scripts to run successfully. 20 | 21 | To generate expert trajectories: 22 | ``` 23 | python -m irl.render 24 | ``` 25 | with the appropriate args. 26 | 27 | Run CoDAIL / NCDAIL / MA-GAIL / MA-AIRL: 28 | 29 | ``` 30 | python -m irl.mack.run_mack_gail 31 | python -m irl.mack.run_mack_airl 32 | python -m irl.mack.run_mack_ncdail 33 | python -m irl.mack.run_mack_codail 34 | ``` 35 | 36 | Render results (see './irl/render.py' for more information): 37 | 38 | ``` 39 | python -m irl.render 40 | ``` 41 | -------------------------------------------------------------------------------- /multi-agent-irl/all_result.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # source activate rl 3 | if [ $1 -eq 200 ] 4 | then 5 | python -m irl.render --env=simple_speaker_listener --all_exp --epoch=$2 --traj_limitation=200 6 | python -m irl.render --env=simple_spread --all_exp --epoch=$2 --traj_limitation=200 7 | python -m irl.render --env=simple_push --all_exp --epoch=$2 --traj_limitation=200 8 | python -m irl.render --env=simple_tag --all_exp --epoch=$2 --traj_limitation=200 9 | elif [ $1 -eq 100 ] 10 | then 11 | python -m irl.render --env=simple_speaker_listener --all_exp --epoch=$2 --traj_limitation=100 12 | python -m irl.render --env=simple_spread --all_exp --epoch=$2 --traj_limitation=100 13 | python -m irl.render --env=simple_push --all_exp --epoch=$2 --traj_limitation=100 14 | python -m irl.render --env=simple_tag --all_exp --epoch=$2 --traj_limitation=100 15 | fi 16 | -------------------------------------------------------------------------------- /multi-agent-irl/compute_kl.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | # source activate rl 3 | python -m irl.render --env=simple_speaker_listener --kl --num_trajs=$1 --epoch=$2 --seed=$3 4 | python -m irl.render --env=simple_spread --kl --num_trajs=$1 --epoch=$2 --seed=$3 5 | python -m irl.render --env=simple_push --kl --num_trajs=$1 --epoch=$2 --seed=$3 6 | python -m irl.render --env=simple_tag --kl --num_trajs=$1 --epoch=$2 --seed=$3 7 | 8 | -------------------------------------------------------------------------------- /multi-agent-irl/curve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # source activate rl 3 | if [ $1 -eq 200 ] 4 | then 5 | python -m irl.render --env=simple_speaker_listener --curve --epoch=$2 --traj_limitation=200 6 | python -m irl.render --env=simple_spread --curve --epoch=$2 --traj_limitation=200 7 | python -m irl.render --env=simple_push --curve --epoch=$2 --traj_limitation=200 8 | python -m irl.render --env=simple_tag --curve --epoch=$2 --traj_limitation=200 9 | elif [ $1 -eq 100 ] 10 | then 11 | python -m irl.render --env=simple_speaker_listener --curve --epoch=$2 --traj_limitation=100 12 | python -m irl.render --env=simple_spread --curve --epoch=$2 --traj_limitation=100 13 | python -m irl.render --env=simple_push --curve --epoch=$2 --traj_limitation=100 14 | python -m irl.render --env=simple_tag --curve --epoch=$2 --traj_limitation=100 15 | fi 16 | -------------------------------------------------------------------------------- /multi-agent-irl/hyper_study.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # source activate rl 3 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=2 --ent_coef=0 4 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=4 --ent_coef=0 5 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=2 --g=1 --ent_coef=0 6 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=4 --g=1 --ent_coef=0 7 | 8 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.2 9 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.4 10 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.6 11 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.8 12 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=1.0 13 | 14 | 15 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apexrl/CoDAIL/d6996698155677b51f5b844d848bf2bdce0f8a5f/multi-agent-irl/irl/__init__.py -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apexrl/CoDAIL/d6996698155677b51f5b844d848bf2bdce0f8a5f/multi-agent-irl/irl/mack/__init__.py -------------------------------------------------------------------------------- 
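The evaluation scripts above take positional arguments rather than named flags: `all_result.sh` and `curve.sh` expect the expert trajectory limit (100 or 200) followed by the checkpoint epoch, `compute_kl.sh` expects the number of trajectories, the epoch, and the seed, and `hyper_study.sh` expects only the epoch. A minimal usage sketch follows; the concrete values (200 trajectories, epoch 55000, seed 1) are illustrative only and assume the corresponding checkpoints already exist under `./results`.

```bash
# Illustrative values only: 200 expert trajectories, checkpoint epoch 55000, seed 1.
bash all_result.sh 200 55000      # $1 = traj_limitation (100 or 200), $2 = epoch
bash curve.sh 200 55000           # same argument order as all_result.sh
bash compute_kl.sh 200 55000 1    # $1 = num_trajs, $2 = epoch, $3 = seed
bash hyper_study.sh 55000         # $1 = epoch
```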
/multi-agent-irl/irl/mack/kfac_discriminator.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import joblib 4 | from rl.acktr.utils import Scheduler, find_trainable_variables 5 | from rl.acktr.utils import fc, mse 6 | from rl.acktr import kfac 7 | 8 | disc_types = ['decentralized', 'centralized', 'single'] 9 | 10 | 11 | class Discriminator(object): 12 | def __init__(self, sess, ob_spaces, ac_spaces, 13 | nstack, index, disc_type='decentralized', hidden_size=128, 14 | lr_rate=0.01, total_steps=50000, scope="discriminator", kfac_clip=0.001, max_grad_norm=0.5): 15 | self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear') 16 | self.disc_type = disc_type 17 | if disc_type not in disc_types: 18 | assert False 19 | self.scope = scope 20 | self.index = index 21 | self.sess = sess 22 | ob_space = ob_spaces[index] 23 | ac_space = ac_spaces[index] 24 | self.ob_shape = ob_space.shape[0] * nstack 25 | try: 26 | nact = ac_space.n 27 | except: 28 | nact = ac_space.shape[0] 29 | self.ac_shape = nact * nstack 30 | self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack 31 | try: 32 | self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack 33 | except: 34 | self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack 35 | self.hidden_size = hidden_size 36 | 37 | if disc_type == 'decentralized': 38 | input_shape = self.ob_shape + self.ac_shape 39 | elif disc_type == 'centralized': 40 | input_shape = self.all_ob_shape + self.all_ac_shape 41 | elif disc_type == 'single': 42 | input_shape = self.all_ob_shape + self.all_ac_shape 43 | else: 44 | assert False 45 | 46 | self.g = tf.placeholder(tf.float32, (None, input_shape)) 47 | self.e = tf.placeholder(tf.float32, (None, input_shape)) 48 | self.lr_rate = tf.placeholder(tf.float32, ()) 49 | self.adv = tf.placeholder(tf.float32, ()) 50 | 51 | num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1 52 | 53 | logits = self.build_graph(tf.concat([self.g, self.e], axis=0), num_outputs, reuse=False) 54 | labels = tf.concat([tf.zeros([tf.shape(self.g)[0], 1]), tf.ones([tf.shape(self.e)[0], 1])], axis=0) 55 | 56 | g_logits = self.build_graph(self.g, num_outputs, reuse=True) 57 | e_logits = self.build_graph(self.e, num_outputs, reuse=True) 58 | 59 | self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 60 | logits=g_logits, labels=tf.zeros_like(g_logits))) 61 | self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 62 | logits=e_logits, labels=tf.ones_like(e_logits))) 63 | 64 | self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 65 | fisher_loss = -self.total_loss 66 | 67 | # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1 68 | self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) 69 | 70 | # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits)) 71 | 72 | self.var_list = self.get_trainable_variables() 73 | params = find_trainable_variables(self.scope) 74 | grads = tf.gradients(self.total_loss, params) 75 | 76 | # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list) 77 | with tf.variable_scope(self.scope + '/d_optim'): 78 | # d_optim = kfac.KfacOptimizer( 79 | # learning_rate=self.lr_rate, clip_kl=kfac_clip, 80 | # momentum=0.9, kfac_update=1, epsilon=0.01, 81 | # stats_decay=0.99, async=0, cold_iter=10, 82 | # 
max_grad_norm=max_grad_norm) 83 | # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params) 84 | # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params))) 85 | # self.q_runner = q_runner 86 | d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate) 87 | train_op = d_optim.apply_gradients(list(zip(grads, params))) 88 | 89 | self.d_optim = train_op 90 | self.saver = tf.train.Saver(self.get_variables()) 91 | 92 | self.params_flat = self.get_trainable_variables() 93 | 94 | def build_graph(self, x, num_outputs=1, reuse=False): 95 | with tf.variable_scope(self.scope): 96 | if reuse: 97 | tf.get_variable_scope().reuse_variables() 98 | p_h1 = fc(x, 'fc1', nh=self.hidden_size) 99 | p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size) 100 | logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x) 101 | return logits 102 | 103 | def get_variables(self): 104 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 105 | 106 | def get_trainable_variables(self): 107 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 108 | 109 | def get_reward(self, obs, acs): 110 | if len(obs.shape) == 1: 111 | obs = np.expand_dims(obs, 0) 112 | if len(acs.shape) == 1: 113 | acs = np.expand_dims(acs, 0) 114 | feed_dict = {self.g: np.concatenate([obs, acs], axis=1)} 115 | return self.sess.run(self.reward_op, feed_dict) 116 | 117 | def train(self, g_obs, g_acs, e_obs, e_acs): 118 | feed_dict = {self.g: np.concatenate([g_obs, g_acs], axis=1), 119 | self.e: np.concatenate([e_obs, e_acs], axis=1), self.lr_rate: self.lr.value()} 120 | loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict) 121 | g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict) 122 | return g_loss, e_loss, None, None 123 | 124 | def restore(self, path): 125 | print('restoring from:' + path) 126 | self.saver.restore(self.sess, path) 127 | 128 | def save(self, save_path): 129 | ps = self.sess.run(self.params_flat) 130 | joblib.dump(ps, save_path) 131 | 132 | def load(self, load_path): 133 | loaded_params = joblib.load(load_path) 134 | restores = [] 135 | for p, loaded_p in zip(self.params_flat, loaded_params): 136 | restores.append(p.assign(loaded_p)) 137 | self.sess.run(restores) 138 | 139 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/kfac_discriminator_codail.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import joblib 4 | from rl.acktr.utils import Scheduler, find_trainable_variables 5 | from rl.acktr.utils import fc, mse 6 | from rl.acktr import kfac 7 | 8 | disc_types = ['decentralized', 'decentralized-all'] 9 | 10 | 11 | class Discriminator(object): 12 | def __init__(self, sess, ob_spaces, ac_spaces, 13 | nstack, index, disc_type='decentralized', hidden_size=128, 14 | lr_rate=0.01, total_steps=50000, scope="discriminator", kfac_clip=0.001, max_grad_norm=0.5): 15 | self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear') 16 | self.disc_type = disc_type 17 | if disc_type not in disc_types: 18 | assert False 19 | self.scope = scope 20 | self.index = index 21 | self.sess = sess 22 | ob_space = ob_spaces[index] 23 | ac_space = ac_spaces[index] 24 | self.ob_shape = ob_space.shape[0] * nstack 25 | try: 26 | nact = ac_space.n 27 | except: 28 | nact = ac_space.shape[0] 29 | self.ac_shape = nact * nstack 30 | self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack 31 | try: 32 |
self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack 33 | except: 34 | self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack 35 | self.hidden_size = hidden_size 36 | 37 | if disc_type == 'decentralized': 38 | input_shape = self.ob_shape + self.all_ac_shape 39 | elif disc_type == 'decentralized-all': 40 | input_shape = self.all_ob_shape + self.all_ac_shape 41 | else: 42 | assert False 43 | 44 | self.g = tf.placeholder(tf.float32, (None, input_shape)) 45 | self.e = tf.placeholder(tf.float32, (None, input_shape)) 46 | self.lr_rate = tf.placeholder(tf.float32, ()) 47 | self.adv = tf.placeholder(tf.float32, ()) 48 | 49 | num_outputs = 1 50 | 51 | logits = self.build_graph(tf.concat([self.g, self.e], axis=0), num_outputs, reuse=False) 52 | labels = tf.concat([tf.zeros([tf.shape(self.g)[0], 1]), tf.ones([tf.shape(self.e)[0], 1])], axis=0) 53 | 54 | g_logits = self.build_graph(self.g, num_outputs, reuse=True) 55 | e_logits = self.build_graph(self.e, num_outputs, reuse=True) 56 | 57 | self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 58 | logits=g_logits, labels=tf.zeros_like(g_logits))) 59 | self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 60 | logits=e_logits, labels=tf.ones_like(e_logits))) 61 | 62 | self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 63 | fisher_loss = -self.total_loss 64 | 65 | # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1 66 | # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) 67 | # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits)) 68 | self.reward_op = tf.log( tf.sigmoid(g_logits) + 1e-10) - tf.log(1-tf.sigmoid(g_logits) + 1e-10) 69 | 70 | self.var_list = self.get_trainable_variables() 71 | params = find_trainable_variables(self.scope) 72 | grads = tf.gradients(self.total_loss, params) 73 | 74 | # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list) 75 | with tf.variable_scope(self.scope + '/d_optim'): 76 | # d_optim = kfac.KfacOptimizer( 77 | # learning_rate=self.lr_rate, clip_kl=kfac_clip, 78 | # momentum=0.9, kfac_update=1, epsilon=0.01, 79 | # stats_decay=0.99, async=0, cold_iter=10, 80 | # max_grad_norm=max_grad_norm) 81 | # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params) 82 | # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params))) 83 | # self.q_runner = q_runner 84 | d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate) 85 | train_op = d_optim.apply_gradients(list(zip(grads, params))) 86 | 87 | self.d_optim = train_op 88 | self.saver = tf.train.Saver(self.get_variables()) 89 | 90 | self.params_flat = self.get_trainable_variables() 91 | 92 | def build_graph(self, x, num_outputs=1, reuse=False): 93 | with tf.variable_scope(self.scope): 94 | if reuse: 95 | tf.get_variable_scope().reuse_variables() 96 | p_h1 = fc(x, 'fc1', nh=self.hidden_size) 97 | p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size) 98 | logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x) 99 | return logits 100 | 101 | def get_variables(self): 102 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 103 | 104 | def get_trainable_variables(self): 105 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 106 | 107 | def get_reward(self, obs, acs): 108 | if len(obs.shape) == 1: 109 | obs = np.expand_dims(obs, 0) 110 | if len(acs.shape) == 1: 111 | acs = np.expand_dims(acs, 
0) 112 | feed_dict = {self.g: np.concatenate([obs, acs], axis=1)} 113 | return self.sess.run(self.reward_op, feed_dict) 114 | 115 | def train(self, g_obs, g_acs, e_obs, e_acs): 116 | feed_dict = {self.g: np.concatenate([g_obs, g_acs], axis=1), 117 | self.e: np.concatenate([e_obs, e_acs], axis=1), self.lr_rate: self.lr.value()} 118 | loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict) 119 | g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict) 120 | return g_loss, e_loss, None, None 121 | 122 | def restore(self, path): 123 | print('restoring from:' + path) 124 | self.saver.restore(self.sess, path) 125 | 126 | def save(self, save_path): 127 | ps = self.sess.run(self.params_flat) 128 | joblib.dump(ps, save_path) 129 | 130 | def load(self, load_path): 131 | loaded_params = joblib.load(load_path) 132 | restores = [] 133 | for p, loaded_p in zip(self.params_flat, loaded_params): 134 | restores.append(p.assign(loaded_p)) 135 | self.sess.run(restores) 136 | 137 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/kfac_discriminator_ncdail.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import joblib 4 | from rl.acktr.utils import Scheduler, find_trainable_variables 5 | from rl.acktr.utils import fc, mse 6 | from rl.acktr import kfac 7 | 8 | disc_types = ['decentralized', 'decentralized-all'] 9 | 10 | 11 | class Discriminator(object): 12 | def __init__(self, sess, ob_spaces, ac_spaces, 13 | nstack, index, disc_type='decentralized', hidden_size=128, 14 | lr_rate=0.01, total_steps=50000, scope="discriminator", kfac_clip=0.001, max_grad_norm=0.5): 15 | self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear') 16 | self.disc_type = disc_type 17 | if disc_type not in disc_types: 18 | assert False 19 | self.scope = scope 20 | self.index = index 21 | self.sess = sess 22 | ob_space = ob_spaces[index] 23 | ac_space = ac_spaces[index] 24 | self.ob_shape = ob_space.shape[0] * nstack 25 | try: 26 | nact = ac_space.n 27 | except: 28 | nact = ac_space.shape[0] 29 | self.ac_shape = nact * nstack 30 | self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack 31 | try: 32 | self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack 33 | except: 34 | self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack 35 | self.hidden_size = hidden_size 36 | 37 | if disc_type == 'decentralized': 38 | input_shape = self.ob_shape + self.all_ac_shape 39 | elif disc_type == 'decentralized-all': 40 | input_shape = self.all_ob_shape + self.all_ac_shape 41 | else: 42 | assert False 43 | 44 | self.g = tf.placeholder(tf.float32, (None, input_shape)) 45 | self.e = tf.placeholder(tf.float32, (None, input_shape)) 46 | self.lr_rate = tf.placeholder(tf.float32, ()) 47 | self.adv = tf.placeholder(tf.float32, ()) 48 | 49 | num_outputs = 1 50 | 51 | logits = self.build_graph(tf.concat([self.g, self.e], axis=0), num_outputs, reuse=False) 52 | labels = tf.concat([tf.zeros([tf.shape(self.g)[0], 1]), tf.ones([tf.shape(self.e)[0], 1])], axis=0) 53 | 54 | g_logits = self.build_graph(self.g, num_outputs, reuse=True) 55 | e_logits = self.build_graph(self.e, num_outputs, reuse=True) 56 | 57 | self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 58 | logits=g_logits, labels=tf.zeros_like(g_logits))) 59 | self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 60 | logits=e_logits, labels=tf.ones_like(e_logits)))
61 | 62 | self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 63 | fisher_loss = -self.total_loss 64 | 65 | # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1 66 | # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) 67 | # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits)) 68 | self.reward_op = tf.log( tf.sigmoid(g_logits) + 1e-10) - tf.log(1-tf.sigmoid(g_logits) + 1e-10) 69 | 70 | self.var_list = self.get_trainable_variables() 71 | params = find_trainable_variables(self.scope) 72 | grads = tf.gradients(self.total_loss, params) 73 | 74 | # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list) 75 | with tf.variable_scope(self.scope + '/d_optim'): 76 | # d_optim = kfac.KfacOptimizer( 77 | # learning_rate=self.lr_rate, clip_kl=kfac_clip, 78 | # momentum=0.9, kfac_update=1, epsilon=0.01, 79 | # stats_decay=0.99, async=0, cold_iter=10, 80 | # max_grad_norm=max_grad_norm) 81 | # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params) 82 | # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params))) 83 | # self.q_runner = q_runner 84 | d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate) 85 | train_op = d_optim.apply_gradients(list(zip(grads, params))) 86 | 87 | self.d_optim = train_op 88 | self.saver = tf.train.Saver(self.get_variables()) 89 | 90 | self.params_flat = self.get_trainable_variables() 91 | 92 | def build_graph(self, x, num_outputs=1, reuse=False): 93 | with tf.variable_scope(self.scope): 94 | if reuse: 95 | tf.get_variable_scope().reuse_variables() 96 | p_h1 = fc(x, 'fc1', nh=self.hidden_size) 97 | p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size) 98 | logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x) 99 | return logits 100 | 101 | def get_variables(self): 102 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 103 | 104 | def get_trainable_variables(self): 105 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 106 | 107 | def get_reward(self, obs, acs): 108 | if len(obs.shape) == 1: 109 | obs = np.expand_dims(obs, 0) 110 | if len(acs.shape) == 1: 111 | acs = np.expand_dims(acs, 0) 112 | feed_dict = {self.g: np.concatenate([obs, acs], axis=1)} 113 | return self.sess.run(self.reward_op, feed_dict) 114 | 115 | def train(self, g_obs, g_acs, e_obs, e_acs): 116 | feed_dict = {self.g: np.concatenate([g_obs, g_acs], axis=1), 117 | self.e: np.concatenate([e_obs, e_acs], axis=1), self.lr_rate: self.lr.value()} 118 | loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict) 119 | g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict) 120 | return g_loss, e_loss, None, None 121 | 122 | def restore(self, path): 123 | print('restoring from:' + path) 124 | self.saver.restore(self.sess, path) 125 | 126 | def save(self, save_path): 127 | ps = self.sess.run(self.params_flat) 128 | joblib.dump(ps, save_path) 129 | 130 | def load(self, load_path): 131 | loaded_params = joblib.load(load_path) 132 | restores = [] 133 | for p, loaded_p in zip(self.params_flat, loaded_params): 134 | restores.append(p.assign(loaded_p)) 135 | self.sess.run(restores) 136 | 137 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_airl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 
4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.airl import learn 15 | from sandbox.mack.policies import CategoricalPolicy 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 17 | 18 | 19 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 20 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500, l2=0.1, d_iters=1, 21 | rew_scale=0.1): 22 | def create_env(rank): 23 | def _thunk(): 24 | env = make_env.make_env(env_id) 25 | env.seed(seed + rank) 26 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 27 | allow_early_resets=True) 28 | gym.logger.setLevel(logging.WARN) 29 | return env 30 | return _thunk 31 | 32 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 33 | 34 | set_global_seeds(seed) 35 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 36 | print(num_cpu) 37 | policy_fn = CategoricalPolicy 38 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation, nobs_flag=True) 39 | learn(policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 40 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 41 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id), l2=l2, d_iters=d_iters, 42 | rew_scale=rew_scale) 43 | env.close() 44 | 45 | 46 | @click.command() 47 | @click.option('--logdir', type=click.STRING, default='./results') 48 | @click.option('--env', type=click.STRING, default='simple_spread') 49 | @click.option('--expert_path', type=click.STRING, 50 | default='./results/mack_om/simple_speaker_listener/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 51 | @click.option('--seed', type=click.INT, default=1) 52 | @click.option('--traj_limitation', type=click.INT, default=200) 53 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 54 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 55 | @click.option('--disc_type', type=click.Choice(['decentralized', 'decentralized-all']), 56 | default='decentralized') 57 | @click.option('--bc_iters', type=click.INT, default=500) 58 | @click.option('--l2', type=click.FLOAT, default=0.1) 59 | @click.option('--d_iters', type=click.INT, default=1) 60 | @click.option('--rew_scale', type=click.FLOAT, default=0) 61 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters, l2, d_iters, 62 | rew_scale): 63 | expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 64 | env_ids = [env] 65 | lrs = [0.1] 66 | seeds = [seed] 67 | batch_sizes = [1000] 68 | 69 | for env_id, seed, lr, batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 70 | train(logdir + '/airl/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}-l2-{}-iter-{}-r-{}/seed-{}'.format( 71 | traj_limitation, lr, batch_size, dis_lr, bc_iters, l2, d_iters, rew_scale, seed), 72 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 73 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters, l2=l2, d_iters=d_iters, 74 | rew_scale=rew_scale) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | 
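The click options defined above map directly to command-line flags. A hypothetical invocation is sketched below; the flag values are illustrative defaults, and note that `main()` rebuilds `expert_path` from `--env` and `--seed`, so the expert checkpoint must already exist under `./results/mack_om/<env>/l-0.1-b-1000/seed-1/`.

```bash
# Hypothetical invocation of the MA-AIRL trainer; flag names come from the click options above,
# and the values shown are simply the script's defaults.
python -m irl.mack.run_mack_airl --env=simple_spread --seed=1 --traj_limitation=200 \
    --disc_type=decentralized --dis_lr=0.1 --bc_iters=500 --l2=0.1 --d_iters=1 --rew_scale=0
```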
-------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_codail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.codail import learn 15 | from sandbox.mack.policies_om import CategoricalPolicy as CategoricalPolicy_om 16 | from sandbox.mack.opponent_policies import CategoricalPolicy as oppo_CategoricalPolicy 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 18 | 19 | 20 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 21 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500, d_iters=1, g_iters=1, ent_coef=0.0): 22 | def create_env(rank): 23 | def _thunk(): 24 | env = make_env.make_env(env_id) 25 | env.seed(seed + rank) 26 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 27 | allow_early_resets=True) 28 | gym.logger.setLevel(logging.WARN) 29 | return env 30 | return _thunk 31 | 32 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 33 | 34 | set_global_seeds(seed) 35 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 36 | print(num_cpu) 37 | policy_fn = CategoricalPolicy_om 38 | oppo_policy_fn = oppo_CategoricalPolicy 39 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation) 40 | learn(policy_fn, oppo_policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 41 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 42 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id), d_iters=d_iters, g_iters=g_iters) 43 | env.close() 44 | 45 | 46 | @click.command() 47 | @click.option('--logdir', type=click.STRING, default='./results') 48 | @click.option('--env', type=click.STRING, default='simple_spread') 49 | @click.option('--expert_path', type=click.STRING, 50 | default='./results/mack_om/simple_speaker_listener/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 51 | @click.option('--seed', type=click.INT, default=1) 52 | @click.option('--traj_limitation', type=click.INT, default=200) 53 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 54 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 55 | @click.option('--disc_type', type=click.Choice(['decentralized', 'decentralized-all']), default='decentralized') 56 | @click.option('--bc_iters', type=click.INT, default=500) 57 | @click.option('--d_iters', type=click.INT, default=1) 58 | @click.option('--g_iters', type=click.INT, default=1) 59 | @click.option('--ent_coef', type=click.FLOAT, default=0.0) 60 | @click.option('--hyper_study', is_flag=True, flag_value=True) 61 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters, d_iters, g_iters, ent_coef, hyper_study): 62 | expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 63 | print(expert_path) 64 | env_ids = [env] 65 | lrs = [0.1] 66 | seeds = [seed] 67 | batch_sizes = [1000] 68 | 69 | ldir = './results' 70 | 71 | for env_id, seed, lr, 
batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 72 | logdir = ldir + '/codail/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}/seed-{}'.format( 73 | traj_limitation, lr, batch_size, dis_lr, bc_iters, seed) 74 | if hyper_study: 75 | logdir = ldir + '/codail/' + env_id + '/' + disc_type + '/hyper_study/'+'s-{}/d-{}-g-{}-c-{}/seed-{}'.format( 76 | traj_limitation, d_iters, g_iters, ent_coef, seed) 77 | else: 78 | d_iters = g_iters = 1 79 | ent_coef = 0.0 80 | print(logdir) 81 | train(logdir, 82 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 83 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters, d_iters=d_iters, g_iters=g_iters, ent_coef=ent_coef) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_gail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.gail import learn 15 | from sandbox.mack.policies import CategoricalPolicy 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 17 | 18 | 19 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 20 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500): 21 | def create_env(rank): 22 | def _thunk(): 23 | env = make_env.make_env(env_id) 24 | env.seed(seed + rank) 25 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 26 | allow_early_resets=True) 27 | gym.logger.setLevel(logging.WARN) 28 | return env 29 | return _thunk 30 | 31 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 32 | 33 | set_global_seeds(seed) 34 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 35 | print(num_cpu) 36 | policy_fn = CategoricalPolicy 37 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation) 38 | learn(policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 39 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 40 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id)) 41 | env.close() 42 | 43 | 44 | @click.command() 45 | @click.option('--logdir', type=click.STRING, default='./results') 46 | @click.option('--env', type=click.STRING, default='simple_spread') 47 | @click.option('--expert_path', type=click.STRING, 48 | default='./results/mack_om/simple_speaker_listener/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 49 | @click.option('--seed', type=click.INT, default=1) 50 | @click.option('--traj_limitation', type=click.INT, default=200) 51 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 52 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 53 | @click.option('--disc_type', type=click.Choice(['decentralized', 'centralized', 'single']), default='decentralized') 54 | @click.option('--bc_iters', type=click.INT, default=500) 55 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters): 56 | 
expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 57 | env_ids = [env] 58 | lrs = [0.1] 59 | seeds = [seed] 60 | batch_sizes = [1000] 61 | 62 | logdir = './results' 63 | 64 | for env_id, seed, lr, batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 65 | train(logdir + '/gail/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}/seed-{}'.format( 66 | traj_limitation, lr, batch_size, dis_lr, bc_iters, seed), 67 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 68 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_ncdail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.ncdail import learn 15 | from sandbox.mack.policies import CategoricalPolicy 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 17 | 18 | 19 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 20 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500, d_iters=1): 21 | def create_env(rank): 22 | def _thunk(): 23 | env = make_env.make_env(env_id) 24 | env.seed(seed + rank) 25 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 26 | allow_early_resets=True) 27 | gym.logger.setLevel(logging.WARN) 28 | return env 29 | return _thunk 30 | 31 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 32 | 33 | set_global_seeds(seed) 34 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 35 | print(num_cpu) 36 | policy_fn = CategoricalPolicy 37 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation) 38 | learn(policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 39 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 40 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id), d_iters=d_iters) 41 | env.close() 42 | 43 | 44 | @click.command() 45 | @click.option('--logdir', type=click.STRING, default='./results') 46 | @click.option('--env', type=click.STRING, default='simple_spread') 47 | @click.option('--expert_path', type=click.STRING, 48 | default='./results/mack_om/simple_push/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 49 | @click.option('--seed', type=click.INT, default=1) 50 | @click.option('--traj_limitation', type=click.INT, default=200) 51 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 52 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 53 | @click.option('--disc_type', type=click.Choice(['decentralized', 'decentralized-all']), default='decentralized') 54 | @click.option('--bc_iters', type=click.INT, default=500) 55 | @click.option('--d_iters', type=click.INT, default=1) 56 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters, d_iters): 57 | 
expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 58 | env_ids = [env] 59 | lrs = [0.1] 60 | seeds = [seed] 61 | batch_sizes = [1000] 62 | 63 | ldir = './results' 64 | 65 | for env_id, seed, lr, batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 66 | logdir = ldir + '/ncdail/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}/seed-{}'.format( 67 | traj_limitation, lr, batch_size, dis_lr, bc_iters, seed) 68 | print(logdir) 69 | train(logdir, 70 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 71 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters, d_iters=d_iters) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/tf_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | REG_VARS = 'reg_vars' 5 | 6 | def linear(X, dout, name, bias=True): 7 | with tf.variable_scope(name): 8 | dX = int(X.get_shape()[-1]) 9 | W = tf.get_variable('W', shape=(dX, dout)) 10 | tf.add_to_collection(REG_VARS, W) 11 | if bias: 12 | b = tf.get_variable('b', initializer=tf.constant(np.zeros(dout).astype(np.float32))) 13 | else: 14 | b = 0 15 | return tf.matmul(X, W)+b 16 | 17 | def discounted_reduce_sum(X, discount, axis=-1): 18 | if discount != 1.0: 19 | disc = tf.cumprod(discount*tf.ones_like(X), axis=axis) 20 | else: 21 | disc = 1.0 22 | return tf.reduce_sum(X*disc, axis=axis) 23 | 24 | def assert_shape(tens, shape): 25 | assert tens.get_shape().is_compatible_with(shape) 26 | 27 | def relu_layer(X, dout, name): 28 | return tf.nn.relu(linear(X, dout, name)) 29 | 30 | def softplus_layer(X, dout, name): 31 | return tf.nn.softplus(linear(X, dout, name)) 32 | 33 | def tanh_layer(X, dout, name): 34 | return tf.nn.tanh(linear(X, dout, name)) 35 | 36 | def get_session_config(): 37 | session_config = tf.ConfigProto() 38 | session_config.gpu_options.allow_growth = True 39 | #session_config.gpu_options.per_process_gpu_memory_fraction = 0.2 40 | return session_config 41 | 42 | 43 | def load_prior_params(pkl_fname): 44 | import joblib 45 | with tf.Session(config=get_session_config()): 46 | params = joblib.load(pkl_fname) 47 | tf.reset_default_graph() 48 | #joblib.dump(params, file_name, compress=3) 49 | params = params['irl_params'] 50 | #print(params) 51 | assert params is not None 52 | return params 53 | -------------------------------------------------------------------------------- /multi-agent-irl/plot_distribution.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # source activate rl 3 | python -m irl.render --env=simple_speaker_listener --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 4 | python -m irl.render --env=simple_speaker_listener --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 5 | python -m irl.render --env=simple_speaker_listener --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 6 | python -m irl.render --env=simple_speaker_listener --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 7 | python -m irl.render --env=simple_speaker_listener --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 8 | python -m irl.render --env=simple_speaker_listener --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 9 | 10 | python -m irl.render --env=simple_spread --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 11 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 12 | python -m irl.render --env=simple_spread --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 13 | python -m irl.render --env=simple_spread --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 14 | python -m irl.render --env=simple_spread --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 15 | python -m irl.render --env=simple_spread --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 16 | 17 | python -m irl.render --env=simple_push --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 18 | python -m irl.render --env=simple_push --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 19 | python -m irl.render --env=simple_push --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 20 | python -m irl.render --env=simple_push --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 21 | python -m irl.render --env=simple_push --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 22 | python -m irl.render --env=simple_push --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 23 | 24 | python -m irl.render --env=simple_tag --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 25 | python -m irl.render --env=simple_tag --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 26 | python -m irl.render --env=simple_tag --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 27 | python -m irl.render --env=simple_tag --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 28 | python -m irl.render --env=simple_tag --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 29 | python -m irl.render --env=simple_tag --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 30 | 31 | -------------------------------------------------------------------------------- /multi-agent-irl/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow >= 1.2 2 | numpy 3 | scipy 4 | mpi4py 5 | ray 6 | click 7 | gym == 0.10.4 8 | tqdm 9 | joblib 10 | progressbar2 11 | zmq 12 | cloudpickle 13 | baselines 14 | box2d-py 15 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | from rl.envs.multi_walker import MultiWalkerEnv 3 | 4 | 5 | def make_env(env_id): 6 | if env_id == 'walker1': 7 | return MultiWalkerEnv(n_walkers=1, reward_mech='local') 8 | elif env_id == 'walker2': 9 | return MultiWalkerEnv(n_walkers=2, reward_mech='local') 10 | elif env_id == 'walker3': 11 | return 
MultiWalkerEnv(n_walkers=3, reward_mech='local') 12 | elif env_id == 'walker2c': 13 | return MultiWalkerEnv(n_walkers=2, reward_mech='local', competitive=True) 14 | elif env_id == 'walker4': 15 | return MultiWalkerEnv(n_walkers=4, reward_mech='local') 16 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/README.md: -------------------------------------------------------------------------------- 1 | Contains utilities for ACK (ACKTR but actually without Trust Region) -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apexrl/CoDAIL/d6996698155677b51f5b844d848bf2bdce0f8a5f/multi-agent-irl/rl/acktr/__init__.py -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/filters.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | 5 | from rl.acktr.running_stat import RunningStat 6 | 7 | 8 | class Filter(object): 9 | def __call__(self, x, update=True): 10 | raise NotImplementedError 11 | def reset(self): 12 | pass 13 | 14 | class IdentityFilter(Filter): 15 | def __call__(self, x, update=True): 16 | return x 17 | 18 | class CompositionFilter(Filter): 19 | def __init__(self, fs): 20 | self.fs = fs 21 | def __call__(self, x, update=True): 22 | for f in self.fs: 23 | x = f(x) 24 | return x 25 | def output_shape(self, input_space): 26 | out = input_space.shape 27 | for f in self.fs: 28 | out = f.output_shape(out) 29 | return out 30 | 31 | class ZFilter(Filter): 32 | """ 33 | y = (x-mean)/std 34 | using running estimates of mean,std 35 | """ 36 | 37 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 38 | self.demean = demean 39 | self.destd = destd 40 | self.clip = clip 41 | 42 | self.rs = RunningStat(shape) 43 | 44 | def __call__(self, x, update=True): 45 | if update: self.rs.push(x) 46 | if self.demean: 47 | x = x - self.rs.mean 48 | if self.destd: 49 | x = x / (self.rs.std+1e-8) 50 | if self.clip: 51 | x = np.clip(x, -self.clip, self.clip) 52 | return x 53 | def output_shape(self, input_space): 54 | return input_space.shape 55 | 56 | class AddClock(Filter): 57 | def __init__(self): 58 | self.count = 0 59 | def reset(self): 60 | self.count = 0 61 | def __call__(self, x, update=True): 62 | return np.append(x, self.count/100.0) 63 | def output_shape(self, input_space): 64 | return (input_space.shape[0]+1,) 65 | 66 | class FlattenFilter(Filter): 67 | def __call__(self, x, update=True): 68 | return x.ravel() 69 | def output_shape(self, input_space): 70 | return (int(np.prod(input_space.shape)),) 71 | 72 | class Ind2OneHotFilter(Filter): 73 | def __init__(self, n): 74 | self.n = n 75 | def __call__(self, x, update=True): 76 | out = np.zeros(self.n) 77 | out[x] = 1 78 | return out 79 | def output_shape(self, input_space): 80 | return (input_space.n,) 81 | 82 | class DivFilter(Filter): 83 | def __init__(self, divisor): 84 | self.divisor = divisor 85 | def __call__(self, x, update=True): 86 | return x / self.divisor 87 | def output_shape(self, input_space): 88 | return input_space.shape 89 | 90 | class StackFilter(Filter): 91 | def __init__(self, length): 92 | self.stack = deque(maxlen=length) 93 | def reset(self): 94 | self.stack.clear() 95 | def __call__(self, x, update=True): 96 | self.stack.append(x) 97 | while 
len(self.stack) < self.stack.maxlen: 98 | self.stack.append(x) 99 | return np.concatenate(self.stack, axis=-1) 100 | def output_shape(self, input_space): 101 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 102 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/kfac_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): 6 | if reduce_dim == None: 7 | # general batch matmul 8 | if len(a.get_shape()) == 3 and len(b.get_shape()) == 3: 9 | return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b) 10 | elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2: 11 | if transpose_b: 12 | N = b.get_shape()[0].value 13 | else: 14 | N = b.get_shape()[1].value 15 | B = a.get_shape()[0].value 16 | if transpose_a: 17 | K = a.get_shape()[1].value 18 | a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K]) 19 | else: 20 | K = a.get_shape()[-1].value 21 | a = tf.reshape(a, [-1, K]) 22 | result = tf.matmul(a, b, transpose_b=transpose_b) 23 | result = tf.reshape(result, [B, -1, N]) 24 | return result 25 | elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3: 26 | if transpose_a: 27 | M = a.get_shape()[1].value 28 | else: 29 | M = a.get_shape()[0].value 30 | B = b.get_shape()[0].value 31 | if transpose_b: 32 | K = b.get_shape()[-1].value 33 | b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0]) 34 | else: 35 | K = b.get_shape()[1].value 36 | b = tf.transpose(tf.reshape( 37 | tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0]) 38 | result = tf.matmul(a, b, transpose_a=transpose_a) 39 | result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2]) 40 | return result 41 | else: 42 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) 43 | else: 44 | # weird batch matmul 45 | if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: 46 | # reshape reduce_dim to the left most dim in b 47 | b_shape = b.get_shape() 48 | if reduce_dim != 0: 49 | b_dims = list(range(len(b_shape))) 50 | b_dims.remove(reduce_dim) 51 | b_dims.insert(0, reduce_dim) 52 | b = tf.transpose(b, b_dims) 53 | b_t_shape = b.get_shape() 54 | b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) 55 | result = tf.matmul(a, b, transpose_a=transpose_a, 56 | transpose_b=transpose_b) 57 | result = tf.reshape(result, b_t_shape) 58 | if reduce_dim != 0: 59 | b_dims = list(range(len(b_shape))) 60 | b_dims.remove(0) 61 | b_dims.insert(reduce_dim, 0) 62 | result = tf.transpose(result, b_dims) 63 | return result 64 | 65 | elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: 66 | # reshape reduce_dim to the right most dim in a 67 | a_shape = a.get_shape() 68 | outter_dim = len(a_shape) - 1 69 | reduce_dim = len(a_shape) - reduce_dim - 1 70 | if reduce_dim != outter_dim: 71 | a_dims = list(range(len(a_shape))) 72 | a_dims.remove(reduce_dim) 73 | a_dims.insert(outter_dim, reduce_dim) 74 | a = tf.transpose(a, a_dims) 75 | a_t_shape = a.get_shape() 76 | a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) 77 | result = tf.matmul(a, b, transpose_a=transpose_a, 78 | transpose_b=transpose_b) 79 | result = tf.reshape(result, a_t_shape) 80 | if reduce_dim != outter_dim: 81 | a_dims = list(range(len(a_shape))) 82 | a_dims.remove(outter_dim) 83 | a_dims.insert(reduce_dim, outter_dim) 84 | result = tf.transpose(result, a_dims) 85 | return result 86 | 87 | elif len(a.get_shape()) == 2 and 
len(b.get_shape()) == 2: 88 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) 89 | 90 | assert False, 'something went wrong' 91 | 92 | 93 | def clipoutNeg(vec, threshold=1e-6): 94 | mask = tf.cast(vec > threshold, tf.float32) 95 | return mask * vec 96 | 97 | 98 | def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): 99 | eigen_min = tf.reduce_min(input_mat) 100 | eigen_max = tf.reduce_max(input_mat) 101 | eigen_ratio = eigen_max / eigen_min 102 | input_mat_clipped = clipoutNeg(input_mat, threshold) 103 | 104 | if debug: 105 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( 106 | input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) 107 | 108 | return input_mat_clipped 109 | 110 | 111 | def factorReshape(Q, e, grad, facIndx=0, ftype='act'): 112 | grad_shape = grad.get_shape() 113 | if ftype == 'act': 114 | assert e.get_shape()[0] == grad_shape[facIndx] 115 | expanded_shape = [1, ] * len(grad_shape) 116 | expanded_shape[facIndx] = -1 117 | e = tf.reshape(e, expanded_shape) 118 | if ftype == 'grad': 119 | assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] 120 | expanded_shape = [1, ] * len(grad_shape) 121 | expanded_shape[len(grad_shape) - facIndx - 1] = -1 122 | e = tf.reshape(e, expanded_shape) 123 | 124 | return Q, e 125 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] 
= self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.bench.benchmarks import * 2 | from rl.bench.monitor import * 3 | 4 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/bench/benchmarks.py: -------------------------------------------------------------------------------- 1 | _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] 2 | _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] 3 | 4 | _BENCHMARKS = [] 5 | 6 | def register_benchmark(benchmark): 7 | for b in _BENCHMARKS: 8 | if b['name'] == benchmark['name']: 9 | raise ValueError('Benchmark with name %s already registered!'%b['name']) 10 | _BENCHMARKS.append(benchmark) 11 | 12 | def list_benchmarks(): 13 | return [b['name'] for b in _BENCHMARKS] 14 | 15 | def get_benchmark(benchmark_name): 16 | for b in _BENCHMARKS: 17 | if b['name'] == benchmark_name: 18 | return b 19 | raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) 20 | 21 | def get_task(benchmark, env_id): 22 | """Get a task by env_id. Return None if the benchmark doesn't have the env""" 23 | return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) 24 | 25 | def find_task_for_env_id_in_any_benchmark(env_id): 26 | for bm in _BENCHMARKS: 27 | for task in bm["tasks"]: 28 | if task["env_id"]==env_id: 29 | return bm, task 30 | return None, None 31 | 32 | _ATARI_SUFFIX = 'NoFrameskip-v4' 33 | 34 | register_benchmark({ 35 | 'name' : 'Atari200M', 36 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames', 37 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7] 38 | }) 39 | 40 | register_benchmark({ 41 | 'name' : 'Atari40M', 42 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', 43 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7] 44 | }) 45 | 46 | register_benchmark({ 47 | 'name' : 'Atari1Hr', 48 | 'description' :'7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', 49 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7] 50 | }) 51 | 52 | register_benchmark({ 53 | 'name' : 'AtariExploration40M', 54 | 'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames', 55 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7] 56 | }) 57 | 58 | 59 | # MuJoCo 60 | 61 | _mujocosmall = [ 62 | 'InvertedDoublePendulum-v1', 'InvertedPendulum-v1', 63 | 'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1', 64 | 'Reacher-v1', 'Swimmer-v1'] 65 | register_benchmark({ 66 | 'name' : 'Mujoco1M', 67 | 'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps', 68 | 'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall] 69 | }) 70 | register_benchmark({ 71 | 'name' : 'MujocoWalkers', 72 | 'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M', 73 | 'tasks' : [ 74 | {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 75 | {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 76 | {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, 77 | ] 78 | }) 79 | # To reproduce: 80 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8 81 | # (observation input filters necessary) 82 | 83 | 84 | # Roboschool 85 | 86 | register_benchmark({ 87 | 'name' : 'Roboschool8M', 88 | 'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores', 89 | 'tasks' : [ 90 | {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 }, 91 | {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 92 | {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 93 | {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 94 | {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 95 | ] 96 | }) 97 | register_benchmark({ 98 | 'name' : 'RoboschoolHarder', 99 | 'description' : 'Test your might!!! Up to 12 hours on 32 cores', 100 | 'tasks' : [ 101 | {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, 102 | {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 }, 103 | {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 }, 104 | ] 105 | }) 106 | # To reproduce: 107 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8 108 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536 109 | # (Large network, train on 65536 samples each iteration. 
Also, _large is really necessary only for Harder) 110 | 111 | 112 | # Other 113 | 114 | _atari50 = [ # actually 49 115 | 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 116 | 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 117 | 'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', 118 | 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', 119 | 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', 120 | 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 121 | 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', 122 | 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 123 | 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 124 | 'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon', 125 | ] 126 | 127 | register_benchmark({ 128 | 'name' : 'Atari50_40M', 129 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', 130 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50] 131 | }) 132 | 133 | def env_shortname(s): 134 | "Make typical names above shorter, while keeping recognizable" 135 | s = s.replace("NoFrameskip", "") 136 | if s[:10]=="Roboschool": s = s[10:] 137 | i = s.rfind("-v") 138 | if i!=-1: s = s[:i] 139 | 140 | return s.lower() 141 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.common.console_util import * 2 | from rl.common.dataset import Dataset 3 | from rl.common.math_util import * 4 | from rl.common.misc_util import * 5 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = "%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/dataset.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | for key in self.data_map: 18 | self.data_map[key] = self.data_map[key][perm] 19 | 20 | self._next_id = 0 21 | 22 | def next_batch(self, batch_size): 23 | if self._next_id >= self.n and self.enable_shuffle: 24 | self.shuffle() 25 | 26 | cur_id = self._next_id 27 | cur_batch_size = min(batch_size, self.n - self._next_id) 28 | self._next_id += cur_batch_size 29 | 30 | data_map = dict() 31 | for key in self.data_map: 32 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 33 | return data_map 34 | 35 | def iterate_once(self, batch_size): 36 | if self.enable_shuffle: self.shuffle() 37 | 38 | while self._next_id <= self.n - batch_size: 39 | yield self.next_batch(batch_size) 40 | self._next_id = 0 41 | 42 | def subset(self, num_elements, deterministic=True): 43 | data_map = dict() 44 | for key in self.data_map: 45 | data_map[key] = self.data_map[key][:num_elements] 46 | return Dataset(data_map, deterministic) 47 | 48 | 49 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 50 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 51 | arrays = tuple(map(np.asarray, arrays)) 52 | n = arrays[0].shape[0] 53 | assert all(a.shape[0] == n for a in arrays[1:]) 54 | inds = np.arange(n) 55 | if shuffle: np.random.shuffle(inds) 56 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 57 | for batch_inds in np.array_split(inds, sections): 58 | if include_final_partial_batch or len(batch_inds) == batch_size: 59 | yield tuple(a[batch_inds] for a in arrays) 60 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/ma_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | from PIL import Image 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class MAWrapper(gym.Wrapper): 9 | def __init__(self, env): 10 | gym.Wrapper.__init__(self, env) 11 | self.observation_space = [self.env.observation_space] 12 | self.action_space = [self.env.action_space] 13 | self.n = 1 14 | 15 | def step(self, action): 16 | obs, reward, done, info = self.env.step(action[0]) 17 | return [obs], [reward], [done], info 18 | 19 | def reset(self): 20 | return [self.env.reset()] 21 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred, axis=0)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import rl.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if
self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from rl.common import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | 
ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python