├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── multi-agent-irl ├── README.md ├── all_result.sh ├── compute_kl.sh ├── curve.sh ├── hyper_study.sh ├── irl │ ├── __init__.py │ ├── dataset.py │ ├── mack │ │ ├── __init__.py │ │ ├── airl.py │ │ ├── codail.py │ │ ├── gail.py │ │ ├── kfac_discriminator.py │ │ ├── kfac_discriminator_airl.py │ │ ├── kfac_discriminator_codail.py │ │ ├── kfac_discriminator_ncdail.py │ │ ├── ncdail.py │ │ ├── run_mack_airl.py │ │ ├── run_mack_codail.py │ │ ├── run_mack_gail.py │ │ ├── run_mack_ncdail.py │ │ └── tf_util.py │ └── render.py ├── plot_distribution.sh ├── requirements.txt ├── rl │ ├── __init__.py │ ├── acktr │ │ ├── README.md │ │ ├── __init__.py │ │ ├── filters.py │ │ ├── kfac.py │ │ ├── kfac_utils.py │ │ ├── running_stat.py │ │ └── utils.py │ ├── bench │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ └── monitor.py │ ├── common │ │ ├── __init__.py │ │ ├── atari_wrappers.py │ │ ├── console_util.py │ │ ├── dataset.py │ │ ├── distributions.py │ │ ├── ma_wrappers.py │ │ ├── math_util.py │ │ ├── misc_util.py │ │ ├── mpi_adam.py │ │ ├── mpi_fork.py │ │ ├── mpi_moments.py │ │ ├── mpi_running_mean_std.py │ │ ├── schedules.py │ │ ├── segment_tree.py │ │ ├── tf_util.py │ │ └── vec_env │ │ │ ├── __init__.py │ │ │ ├── dummy_vec_env.py │ │ │ ├── mpi_vec_env1.py │ │ │ ├── speedtest.py │ │ │ ├── subproc_vec_env.py │ │ │ ├── subproc_vec_env_walker.py │ │ │ ├── vec_frame_stack.py │ │ │ └── vec_normalize.py │ ├── envs │ │ ├── __init__.py │ │ ├── ant_og.xml │ │ ├── mujoco_env │ │ │ ├── __init__.py │ │ │ └── walker2d.py │ │ ├── multi_ant.py │ │ ├── multi_ant.xml │ │ └── multi_walker.py │ └── logger.py ├── sample.sh └── sandbox │ ├── __init__.py │ ├── imitation │ ├── __init__.py │ ├── crender.py │ ├── render.py │ ├── run_cmappo.py │ └── run_mujoco.py │ ├── mack │ ├── __init__.py │ ├── acktr_cont.py │ ├── acktr_disc.py │ ├── acktr_disc_om.py │ ├── acktr_multi_disc.py │ ├── opponent_policies.py │ ├── policies.py │ ├── policies_om.py │ ├── render.py │ ├── run_clone.py │ ├── run_simple.py │ ├── run_simple_om.py │ ├── run_walker.py │ └── run_walker_multi_disc.py │ └── mppo │ ├── __init__.py │ ├── policies.py │ ├── ppo2.py │ ├── run_simple_walker.py │ ├── run_sumo.py │ └── run_walker.py └── multi-agent-particle-envs ├── LICENSE.txt ├── README.md ├── bin ├── __init__.py └── interactive.py ├── make_env.py ├── multiagent ├── __init__.py ├── core.py ├── environment.py ├── multi_discrete.py ├── policy.py ├── rendering.py ├── scenario.py └── scenarios │ ├── __init__.py │ ├── simple.py │ ├── simple_adversary.py │ ├── simple_crypto.py │ ├── simple_push.py │ ├── simple_reference.py │ ├── simple_speaker_listener.py │ ├── simple_spread.py │ ├── simple_tag.py │ └── simple_world_comm.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # workspace 132 | .idea 133 | .vscode 134 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/ma-particle-envs"] 2 | path = thirdparty/ma-particle-envs 3 | url = https://github.com/openai/multiagent-particle-envs 4 | [submodule "thirdparty/ma-airl"] 5 | path = thirdparty/ma-airl 6 | url = https://github.com/ermongroup/ma-airl 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CoDAIL 2 | Implementation of CoDAIL from the paper [Multi-Agent Interactions Modeling with Correlated Policies](https://openreview.net/forum?id=B1gZV1HYvS), based on [MA-AIRL](https://github.com/ermongroup/ma-airl). 3 | 4 | ## Running the Code 5 | 6 | - For the code implementing CoDAIL, please visit the `multi-agent-irl` folder. 7 | - For the OpenAI particle environment code, please visit the `multi-agent-particle-envs` folder. 8 | 9 | 10 | **NOTE**: An early implementation can be found at [codailiclr2020/CoDAIL](https://github.com/codailiclr2020/CoDAIL) 11 | -------------------------------------------------------------------------------- /multi-agent-irl/README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent Adversarial Inverse Reinforcement Learning 2 | 3 | First, install the requirements. 4 | Run the following command in this folder: 5 | ``` 6 | pip install -r requirements.txt 7 | ``` 8 | Then cd into the particle environment folder and run: 9 | ``` 10 | pip install -e .
11 | ``` 12 | to install the env. 13 | 14 | Run Multi-Agent ACKTR to obtain experts: 15 | ``` 16 | python -m sandbox.mack.run_simple 17 | python -m sandbox.mack.run_simple_om 18 | ``` 19 | The former generates interactions in which agents do not consider others, while the latter generates interactions in which agents model others when making decisions. Note that you need to set the command-line args appropriately for the scripts to run successfully. 20 | 21 | To generate expert trajectories: 22 | ``` 23 | python -m irl.render 24 | ``` 25 | with the appropriate args. 26 | 27 | Run CoDAIL / NCDAIL / MA-GAIL / MA-AIRL: 28 | 29 | ``` 30 | python -m irl.mack.run_mack_gail 31 | python -m irl.mack.run_mack_airl 32 | python -m irl.mack.run_mack_ncdail 33 | python -m irl.mack.run_mack_codail 34 | ``` 35 | 36 | Render results (see './irl/render.py' for more information): 37 | 38 | ``` 39 | python -m irl.render 40 | ``` 41 | -------------------------------------------------------------------------------- /multi-agent-irl/all_result.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # source activate rl 3 | if [ $1 -eq 200 ] 4 | then 5 | python -m irl.render --env=simple_speaker_listener --all_exp --epoch=$2 --traj_limitation=200 6 | python -m irl.render --env=simple_spread --all_exp --epoch=$2 --traj_limitation=200 7 | python -m irl.render --env=simple_push --all_exp --epoch=$2 --traj_limitation=200 8 | python -m irl.render --env=simple_tag --all_exp --epoch=$2 --traj_limitation=200 9 | elif [ $1 -eq 100 ] 10 | then 11 | python -m irl.render --env=simple_speaker_listener --all_exp --epoch=$2 --traj_limitation=100 12 | python -m irl.render --env=simple_spread --all_exp --epoch=$2 --traj_limitation=100 13 | python -m irl.render --env=simple_push --all_exp --epoch=$2 --traj_limitation=100 14 | python -m irl.render --env=simple_tag --all_exp --epoch=$2 --traj_limitation=100 15 | fi 16 | -------------------------------------------------------------------------------- /multi-agent-irl/compute_kl.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | # source activate rl 3 | python -m irl.render --env=simple_speaker_listener --kl --num_trajs=$1 --epoch=$2 --seed=$3 4 | python -m irl.render --env=simple_spread --kl --num_trajs=$1 --epoch=$2 --seed=$3 5 | python -m irl.render --env=simple_push --kl --num_trajs=$1 --epoch=$2 --seed=$3 6 | python -m irl.render --env=simple_tag --kl --num_trajs=$1 --epoch=$2 --seed=$3 7 | 8 | -------------------------------------------------------------------------------- /multi-agent-irl/curve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # source activate rl 3 | if [ $1 -eq 200 ] 4 | then 5 | python -m irl.render --env=simple_speaker_listener --curve --epoch=$2 --traj_limitation=200 6 | python -m irl.render --env=simple_spread --curve --epoch=$2 --traj_limitation=200 7 | python -m irl.render --env=simple_push --curve --epoch=$2 --traj_limitation=200 8 | python -m irl.render --env=simple_tag --curve --epoch=$2 --traj_limitation=200 9 | elif [ $1 -eq 100 ] 10 | then 11 | python -m irl.render --env=simple_speaker_listener --curve --epoch=$2 --traj_limitation=100 12 | python -m irl.render --env=simple_spread --curve --epoch=$2 --traj_limitation=100 13 | python -m irl.render --env=simple_push --curve --epoch=$2 --traj_limitation=100 14 | python -m irl.render --env=simple_tag --curve --epoch=$2 --traj_limitation=100 15 | fi 16 | -------------------------------------------------------------------------------- /multi-agent-irl/hyper_study.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # source activate rl 3 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=2 --ent_coef=0 4 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=4 --ent_coef=0 5 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=2 --g=1 --ent_coef=0 6 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=4 --g=1 --ent_coef=0 7 | 8 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.2 9 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.4 10 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.6 11 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=0.8 12 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=200 --hyper_study --epoch=$1 --d=1 --g=1 --ent_coef=1.0 13 | 14 | 15 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apexrl/CoDAIL/d6996698155677b51f5b844d848bf2bdce0f8a5f/multi-agent-irl/irl/__init__.py -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apexrl/CoDAIL/d6996698155677b51f5b844d848bf2bdce0f8a5f/multi-agent-irl/irl/mack/__init__.py -------------------------------------------------------------------------------- 
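The evaluation scripts above take positional arguments rather than named flags: `all_result.sh` and `curve.sh` expect the expert trajectory limit (100 or 200) followed by the checkpoint epoch, `compute_kl.sh` expects the number of trajectories, the epoch, and the seed, and `hyper_study.sh` expects only the epoch. A minimal usage sketch follows; the concrete values (200 trajectories, epoch 55000, seed 1) are illustrative only and assume the corresponding checkpoints already exist under `./results`.

```bash
# Illustrative values only: 200 expert trajectories, checkpoint epoch 55000, seed 1.
bash all_result.sh 200 55000      # $1 = traj_limitation (100 or 200), $2 = epoch
bash curve.sh 200 55000           # same argument order as all_result.sh
bash compute_kl.sh 200 55000 1    # $1 = num_trajs, $2 = epoch, $3 = seed
bash hyper_study.sh 55000         # $1 = epoch
```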
/multi-agent-irl/irl/mack/kfac_discriminator.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import joblib 4 | from rl.acktr.utils import Scheduler, find_trainable_variables 5 | from rl.acktr.utils import fc, mse 6 | from rl.acktr import kfac 7 | 8 | disc_types = ['decentralized', 'centralized', 'single'] 9 | 10 | 11 | class Discriminator(object): 12 | def __init__(self, sess, ob_spaces, ac_spaces, 13 | nstack, index, disc_type='decentralized', hidden_size=128, 14 | lr_rate=0.01, total_steps=50000, scope="discriminator", kfac_clip=0.001, max_grad_norm=0.5): 15 | self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear') 16 | self.disc_type = disc_type 17 | if disc_type not in disc_types: 18 | assert False 19 | self.scope = scope 20 | self.index = index 21 | self.sess = sess 22 | ob_space = ob_spaces[index] 23 | ac_space = ac_spaces[index] 24 | self.ob_shape = ob_space.shape[0] * nstack 25 | try: 26 | nact = ac_space.n 27 | except: 28 | nact = ac_space.shape[0] 29 | self.ac_shape = nact * nstack 30 | self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack 31 | try: 32 | self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack 33 | except: 34 | self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack 35 | self.hidden_size = hidden_size 36 | 37 | if disc_type == 'decentralized': 38 | input_shape = self.ob_shape + self.ac_shape 39 | elif disc_type == 'centralized': 40 | input_shape = self.all_ob_shape + self.all_ac_shape 41 | elif disc_type == 'single': 42 | input_shape = self.all_ob_shape + self.all_ac_shape 43 | else: 44 | assert False 45 | 46 | self.g = tf.placeholder(tf.float32, (None, input_shape)) 47 | self.e = tf.placeholder(tf.float32, (None, input_shape)) 48 | self.lr_rate = tf.placeholder(tf.float32, ()) 49 | self.adv = tf.placeholder(tf.float32, ()) 50 | 51 | num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1 52 | 53 | logits = self.build_graph(tf.concat([self.g, self.e], axis=0), num_outputs, reuse=False) 54 | labels = tf.concat([tf.zeros([tf.shape(self.g)[0], 1]), tf.ones([tf.shape(self.e)[0], 1])], axis=0) 55 | 56 | g_logits = self.build_graph(self.g, num_outputs, reuse=True) 57 | e_logits = self.build_graph(self.e, num_outputs, reuse=True) 58 | 59 | self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 60 | logits=g_logits, labels=tf.zeros_like(g_logits))) 61 | self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 62 | logits=e_logits, labels=tf.ones_like(e_logits))) 63 | 64 | self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 65 | fisher_loss = -self.total_loss 66 | 67 | # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1 68 | self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) 69 | 70 | # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits)) 71 | 72 | self.var_list = self.get_trainable_variables() 73 | params = find_trainable_variables(self.scope) 74 | grads = tf.gradients(self.total_loss, params) 75 | 76 | # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list) 77 | with tf.variable_scope(self.scope + '/d_optim'): 78 | # d_optim = kfac.KfacOptimizer( 79 | # learning_rate=self.lr_rate, clip_kl=kfac_clip, 80 | # momentum=0.9, kfac_update=1, epsilon=0.01, 81 | # stats_decay=0.99, async=0, cold_iter=10, 82 | # 
max_grad_norm=max_grad_norm) 83 | # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params) 84 | # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params))) 85 | # self.q_runner = q_runner 86 | d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate) 87 | train_op = d_optim.apply_gradients(list(zip(grads, params))) 88 | 89 | self.d_optim = train_op 90 | self.saver = tf.train.Saver(self.get_variables()) 91 | 92 | self.params_flat = self.get_trainable_variables() 93 | 94 | def build_graph(self, x, num_outputs=1, reuse=False): 95 | with tf.variable_scope(self.scope): 96 | if reuse: 97 | tf.get_variable_scope().reuse_variables() 98 | p_h1 = fc(x, 'fc1', nh=self.hidden_size) 99 | p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size) 100 | logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x) 101 | return logits 102 | 103 | def get_variables(self): 104 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 105 | 106 | def get_trainable_variables(self): 107 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 108 | 109 | def get_reward(self, obs, acs): 110 | if len(obs.shape) == 1: 111 | obs = np.expand_dims(obs, 0) 112 | if len(acs.shape) == 1: 113 | acs = np.expand_dims(acs, 0) 114 | feed_dict = {self.g: np.concatenate([obs, acs], axis=1)} 115 | return self.sess.run(self.reward_op, feed_dict) 116 | 117 | def train(self, g_obs, g_acs, e_obs, e_acs): 118 | feed_dict = {self.g: np.concatenate([g_obs, g_acs], axis=1), 119 | self.e: np.concatenate([e_obs, e_acs], axis=1), self.lr_rate: self.lr.value()} 120 | loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict) 121 | g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict) 122 | return g_loss, e_loss, None, None 123 | 124 | def restore(self, path): 125 | print('restoring from:' + path) 126 | self.saver.restore(self.sess, path) 127 | 128 | def save(self, save_path): 129 | ps = self.sess.run(self.params_flat) 130 | joblib.dump(ps, save_path) 131 | 132 | def load(self, load_path): 133 | loaded_params = joblib.load(load_path) 134 | restores = [] 135 | for p, loaded_p in zip(self.params_flat, loaded_params): 136 | restores.append(p.assign(loaded_p)) 137 | self.sess.run(restores) 138 | 139 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/kfac_discriminator_codail.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import joblib 4 | from rl.acktr.utils import Scheduler, find_trainable_variables 5 | from rl.acktr.utils import fc, mse 6 | from rl.acktr import kfac 7 | 8 | disc_types = ['decentralized', 'decentralized-all'] 9 | 10 | 11 | class Discriminator(object): 12 | def __init__(self, sess, ob_spaces, ac_spaces, 13 | nstack, index, disc_type='decentralized', hidden_size=128, 14 | lr_rate=0.01, total_steps=50000, scope="discriminator", kfac_clip=0.001, max_grad_norm=0.5): 15 | self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear') 16 | self.disc_type = disc_type 17 | if disc_type not in disc_types: 18 | assert False 19 | self.scope = scope 20 | self.index = index 21 | self.sess = sess 22 | ob_space = ob_spaces[index] 23 | ac_space = ac_spaces[index] 24 | self.ob_shape = ob_space.shape[0] * nstack 25 | try: 26 | nact = ac_space.n 27 | except: 28 | nact = ac_space.shape[0] 29 | self.ac_shape = nact * nstack 30 | self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack 31 | try: 32 |
self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack 33 | except: 34 | self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack 35 | self.hidden_size = hidden_size 36 | 37 | if disc_type == 'decentralized': 38 | input_shape = self.ob_shape + self.all_ac_shape 39 | elif disc_type == 'decentralized-all': 40 | input_shape = self.all_ob_shape + self.all_ac_shape 41 | else: 42 | assert False 43 | 44 | self.g = tf.placeholder(tf.float32, (None, input_shape)) 45 | self.e = tf.placeholder(tf.float32, (None, input_shape)) 46 | self.lr_rate = tf.placeholder(tf.float32, ()) 47 | self.adv = tf.placeholder(tf.float32, ()) 48 | 49 | num_outputs = 1 50 | 51 | logits = self.build_graph(tf.concat([self.g, self.e], axis=0), num_outputs, reuse=False) 52 | labels = tf.concat([tf.zeros([tf.shape(self.g)[0], 1]), tf.ones([tf.shape(self.e)[0], 1])], axis=0) 53 | 54 | g_logits = self.build_graph(self.g, num_outputs, reuse=True) 55 | e_logits = self.build_graph(self.e, num_outputs, reuse=True) 56 | 57 | self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 58 | logits=g_logits, labels=tf.zeros_like(g_logits))) 59 | self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 60 | logits=e_logits, labels=tf.ones_like(e_logits))) 61 | 62 | self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 63 | fisher_loss = -self.total_loss 64 | 65 | # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1 66 | # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) 67 | # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits)) 68 | self.reward_op = tf.log( tf.sigmoid(g_logits) + 1e-10) - tf.log(1-tf.sigmoid(g_logits) + 1e-10) 69 | 70 | self.var_list = self.get_trainable_variables() 71 | params = find_trainable_variables(self.scope) 72 | grads = tf.gradients(self.total_loss, params) 73 | 74 | # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list) 75 | with tf.variable_scope(self.scope + '/d_optim'): 76 | # d_optim = kfac.KfacOptimizer( 77 | # learning_rate=self.lr_rate, clip_kl=kfac_clip, 78 | # momentum=0.9, kfac_update=1, epsilon=0.01, 79 | # stats_decay=0.99, async=0, cold_iter=10, 80 | # max_grad_norm=max_grad_norm) 81 | # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params) 82 | # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params))) 83 | # self.q_runner = q_runner 84 | d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate) 85 | train_op = d_optim.apply_gradients(list(zip(grads, params))) 86 | 87 | self.d_optim = train_op 88 | self.saver = tf.train.Saver(self.get_variables()) 89 | 90 | self.params_flat = self.get_trainable_variables() 91 | 92 | def build_graph(self, x, num_outputs=1, reuse=False): 93 | with tf.variable_scope(self.scope): 94 | if reuse: 95 | tf.get_variable_scope().reuse_variables() 96 | p_h1 = fc(x, 'fc1', nh=self.hidden_size) 97 | p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size) 98 | logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x) 99 | return logits 100 | 101 | def get_variables(self): 102 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 103 | 104 | def get_trainable_variables(self): 105 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 106 | 107 | def get_reward(self, obs, acs): 108 | if len(obs.shape) == 1: 109 | obs = np.expand_dims(obs, 0) 110 | if len(acs.shape) == 1: 111 | acs = np.expand_dims(acs, 
0) 112 | feed_dict = {self.g: np.concatenate([obs, acs], axis=1)} 113 | return self.sess.run(self.reward_op, feed_dict) 114 | 115 | def train(self, g_obs, g_acs, e_obs, e_acs): 116 | feed_dict = {self.g: np.concatenate([g_obs, g_acs], axis=1), 117 | self.e: np.concatenate([e_obs, e_acs], axis=1), self.lr_rate: self.lr.value()} 118 | loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict) 119 | g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict) 120 | return g_loss, e_loss, None, None 121 | 122 | def restore(self, path): 123 | print('restoring from:' + path) 124 | self.saver.restore(self.sess, path) 125 | 126 | def save(self, save_path): 127 | ps = self.sess.run(self.params_flat) 128 | joblib.dump(ps, save_path) 129 | 130 | def load(self, load_path): 131 | loaded_params = joblib.load(load_path) 132 | restores = [] 133 | for p, loaded_p in zip(self.params_flat, loaded_params): 134 | restores.append(p.assign(loaded_p)) 135 | self.sess.run(restores) 136 | 137 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/kfac_discriminator_ncdail.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import joblib 4 | from rl.acktr.utils import Scheduler, find_trainable_variables 5 | from rl.acktr.utils import fc, mse 6 | from rl.acktr import kfac 7 | 8 | disc_types = ['decentralized', 'decentralized-all'] 9 | 10 | 11 | class Discriminator(object): 12 | def __init__(self, sess, ob_spaces, ac_spaces, 13 | nstack, index, disc_type='decentralized', hidden_size=128, 14 | lr_rate=0.01, total_steps=50000, scope="discriminator", kfac_clip=0.001, max_grad_norm=0.5): 15 | self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear') 16 | self.disc_type = disc_type 17 | if disc_type not in disc_types: 18 | assert False 19 | self.scope = scope 20 | self.index = index 21 | self.sess = sess 22 | ob_space = ob_spaces[index] 23 | ac_space = ac_spaces[index] 24 | self.ob_shape = ob_space.shape[0] * nstack 25 | try: 26 | nact = ac_space.n 27 | except: 28 | nact = ac_space.shape[0] 29 | self.ac_shape = nact * nstack 30 | self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack 31 | try: 32 | self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack 33 | except: 34 | self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack 35 | self.hidden_size = hidden_size 36 | 37 | if disc_type == 'decentralized': 38 | input_shape = self.ob_shape + self.all_ac_shape 39 | elif disc_type == 'decentralized-all': 40 | input_shape = self.all_ob_shape + self.all_ac_shape 41 | else: 42 | assert False 43 | 44 | self.g = tf.placeholder(tf.float32, (None, input_shape)) 45 | self.e = tf.placeholder(tf.float32, (None, input_shape)) 46 | self.lr_rate = tf.placeholder(tf.float32, ()) 47 | self.adv = tf.placeholder(tf.float32, ()) 48 | 49 | num_outputs = 1 50 | 51 | logits = self.build_graph(tf.concat([self.g, self.e], axis=0), num_outputs, reuse=False) 52 | labels = tf.concat([tf.zeros([tf.shape(self.g)[0], 1]), tf.ones([tf.shape(self.e)[0], 1])], axis=0) 53 | 54 | g_logits = self.build_graph(self.g, num_outputs, reuse=True) 55 | e_logits = self.build_graph(self.e, num_outputs, reuse=True) 56 | 57 | self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 58 | logits=g_logits, labels=tf.zeros_like(g_logits))) 59 | self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( 60 | logits=e_logits, labels=tf.ones_like(e_logits)))
61 | 62 | self.total_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 63 | fisher_loss = -self.total_loss 64 | 65 | # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1 66 | # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10) 67 | # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits)) 68 | self.reward_op = tf.log( tf.sigmoid(g_logits) + 1e-10) - tf.log(1-tf.sigmoid(g_logits) + 1e-10) 69 | 70 | self.var_list = self.get_trainable_variables() 71 | params = find_trainable_variables(self.scope) 72 | grads = tf.gradients(self.total_loss, params) 73 | 74 | # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list) 75 | with tf.variable_scope(self.scope + '/d_optim'): 76 | # d_optim = kfac.KfacOptimizer( 77 | # learning_rate=self.lr_rate, clip_kl=kfac_clip, 78 | # momentum=0.9, kfac_update=1, epsilon=0.01, 79 | # stats_decay=0.99, async=0, cold_iter=10, 80 | # max_grad_norm=max_grad_norm) 81 | # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params) 82 | # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params))) 83 | # self.q_runner = q_runner 84 | d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate) 85 | train_op = d_optim.apply_gradients(list(zip(grads, params))) 86 | 87 | self.d_optim = train_op 88 | self.saver = tf.train.Saver(self.get_variables()) 89 | 90 | self.params_flat = self.get_trainable_variables() 91 | 92 | def build_graph(self, x, num_outputs=1, reuse=False): 93 | with tf.variable_scope(self.scope): 94 | if reuse: 95 | tf.get_variable_scope().reuse_variables() 96 | p_h1 = fc(x, 'fc1', nh=self.hidden_size) 97 | p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size) 98 | logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x) 99 | return logits 100 | 101 | def get_variables(self): 102 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 103 | 104 | def get_trainable_variables(self): 105 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 106 | 107 | def get_reward(self, obs, acs): 108 | if len(obs.shape) == 1: 109 | obs = np.expand_dims(obs, 0) 110 | if len(acs.shape) == 1: 111 | acs = np.expand_dims(acs, 0) 112 | feed_dict = {self.g: np.concatenate([obs, acs], axis=1)} 113 | return self.sess.run(self.reward_op, feed_dict) 114 | 115 | def train(self, g_obs, g_acs, e_obs, e_acs): 116 | feed_dict = {self.g: np.concatenate([g_obs, g_acs], axis=1), 117 | self.e: np.concatenate([e_obs, e_acs], axis=1), self.lr_rate: self.lr.value()} 118 | loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict) 119 | g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict) 120 | return g_loss, e_loss, None, None 121 | 122 | def restore(self, path): 123 | print('restoring from:' + path) 124 | self.saver.restore(self.sess, path) 125 | 126 | def save(self, save_path): 127 | ps = self.sess.run(self.params_flat) 128 | joblib.dump(ps, save_path) 129 | 130 | def load(self, load_path): 131 | loaded_params = joblib.load(load_path) 132 | restores = [] 133 | for p, loaded_p in zip(self.params_flat, loaded_params): 134 | restores.append(p.assign(loaded_p)) 135 | self.sess.run(restores) 136 | 137 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_airl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 
4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.airl import learn 15 | from sandbox.mack.policies import CategoricalPolicy 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 17 | 18 | 19 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 20 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500, l2=0.1, d_iters=1, 21 | rew_scale=0.1): 22 | def create_env(rank): 23 | def _thunk(): 24 | env = make_env.make_env(env_id) 25 | env.seed(seed + rank) 26 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 27 | allow_early_resets=True) 28 | gym.logger.setLevel(logging.WARN) 29 | return env 30 | return _thunk 31 | 32 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 33 | 34 | set_global_seeds(seed) 35 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 36 | print(num_cpu) 37 | policy_fn = CategoricalPolicy 38 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation, nobs_flag=True) 39 | learn(policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 40 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 41 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id), l2=l2, d_iters=d_iters, 42 | rew_scale=rew_scale) 43 | env.close() 44 | 45 | 46 | @click.command() 47 | @click.option('--logdir', type=click.STRING, default='./results') 48 | @click.option('--env', type=click.STRING, default='simple_spread') 49 | @click.option('--expert_path', type=click.STRING, 50 | default='./results/mack_om/simple_speaker_listener/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 51 | @click.option('--seed', type=click.INT, default=1) 52 | @click.option('--traj_limitation', type=click.INT, default=200) 53 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 54 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 55 | @click.option('--disc_type', type=click.Choice(['decentralized', 'decentralized-all']), 56 | default='decentralized') 57 | @click.option('--bc_iters', type=click.INT, default=500) 58 | @click.option('--l2', type=click.FLOAT, default=0.1) 59 | @click.option('--d_iters', type=click.INT, default=1) 60 | @click.option('--rew_scale', type=click.FLOAT, default=0) 61 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters, l2, d_iters, 62 | rew_scale): 63 | expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 64 | env_ids = [env] 65 | lrs = [0.1] 66 | seeds = [seed] 67 | batch_sizes = [1000] 68 | 69 | for env_id, seed, lr, batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 70 | train(logdir + '/airl/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}-l2-{}-iter-{}-r-{}/seed-{}'.format( 71 | traj_limitation, lr, batch_size, dis_lr, bc_iters, l2, d_iters, rew_scale, seed), 72 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 73 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters, l2=l2, d_iters=d_iters, 74 | rew_scale=rew_scale) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | 
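The click options defined above map directly to command-line flags. A hypothetical invocation is sketched below; the flag values are illustrative defaults, and note that `main()` rebuilds `expert_path` from `--env` and `--seed`, so the expert checkpoint must already exist under `./results/mack_om/<env>/l-0.1-b-1000/seed-1/`.

```bash
# Hypothetical invocation of the MA-AIRL trainer; flag names come from the click options above,
# and the values shown are simply the script's defaults.
python -m irl.mack.run_mack_airl --env=simple_spread --seed=1 --traj_limitation=200 \
    --disc_type=decentralized --dis_lr=0.1 --bc_iters=500 --l2=0.1 --d_iters=1 --rew_scale=0
```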
-------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_codail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.codail import learn 15 | from sandbox.mack.policies_om import CategoricalPolicy as CategoricalPolicy_om 16 | from sandbox.mack.opponent_policies import CategoricalPolicy as oppo_CategoricalPolicy 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 18 | 19 | 20 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 21 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500, d_iters=1, g_iters=1, ent_coef=0.0): 22 | def create_env(rank): 23 | def _thunk(): 24 | env = make_env.make_env(env_id) 25 | env.seed(seed + rank) 26 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 27 | allow_early_resets=True) 28 | gym.logger.setLevel(logging.WARN) 29 | return env 30 | return _thunk 31 | 32 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 33 | 34 | set_global_seeds(seed) 35 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 36 | print(num_cpu) 37 | policy_fn = CategoricalPolicy_om 38 | oppo_policy_fn = oppo_CategoricalPolicy 39 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation) 40 | learn(policy_fn, oppo_policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 41 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 42 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id), d_iters=d_iters, g_iters=g_iters) 43 | env.close() 44 | 45 | 46 | @click.command() 47 | @click.option('--logdir', type=click.STRING, default='./results') 48 | @click.option('--env', type=click.STRING, default='simple_spread') 49 | @click.option('--expert_path', type=click.STRING, 50 | default='./results/mack_om/simple_speaker_listener/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 51 | @click.option('--seed', type=click.INT, default=1) 52 | @click.option('--traj_limitation', type=click.INT, default=200) 53 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 54 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 55 | @click.option('--disc_type', type=click.Choice(['decentralized', 'decentralized-all']), default='decentralized') 56 | @click.option('--bc_iters', type=click.INT, default=500) 57 | @click.option('--d_iters', type=click.INT, default=1) 58 | @click.option('--g_iters', type=click.INT, default=1) 59 | @click.option('--ent_coef', type=click.FLOAT, default=0.0) 60 | @click.option('--hyper_study', is_flag=True, flag_value=True) 61 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters, d_iters, g_iters, ent_coef, hyper_study): 62 | expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 63 | print(expert_path) 64 | env_ids = [env] 65 | lrs = [0.1] 66 | seeds = [seed] 67 | batch_sizes = [1000] 68 | 69 | ldir = './results' 70 | 71 | for env_id, seed, lr, 
batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 72 | logdir = ldir + '/codail/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}/seed-{}'.format( 73 | traj_limitation, lr, batch_size, dis_lr, bc_iters, seed) 74 | if hyper_study: 75 | logdir = ldir + '/codail/' + env_id + '/' + disc_type + '/hyper_study/'+'s-{}/d-{}-g-{}-c-{}/seed-{}'.format( 76 | traj_limitation, d_iters, g_iters, ent_coef, seed) 77 | else: 78 | d_iters = g_iters = 1 79 | ent_coef = 0.0 80 | print(logdir) 81 | train(logdir, 82 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 83 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters, d_iters=d_iters, g_iters=g_iters, ent_coef=ent_coef) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_gail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.gail import learn 15 | from sandbox.mack.policies import CategoricalPolicy 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 17 | 18 | 19 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 20 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500): 21 | def create_env(rank): 22 | def _thunk(): 23 | env = make_env.make_env(env_id) 24 | env.seed(seed + rank) 25 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 26 | allow_early_resets=True) 27 | gym.logger.setLevel(logging.WARN) 28 | return env 29 | return _thunk 30 | 31 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 32 | 33 | set_global_seeds(seed) 34 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 35 | print(num_cpu) 36 | policy_fn = CategoricalPolicy 37 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation) 38 | learn(policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 39 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 40 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id)) 41 | env.close() 42 | 43 | 44 | @click.command() 45 | @click.option('--logdir', type=click.STRING, default='./results') 46 | @click.option('--env', type=click.STRING, default='simple_spread') 47 | @click.option('--expert_path', type=click.STRING, 48 | default='./results/mack_om/simple_speaker_listener/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 49 | @click.option('--seed', type=click.INT, default=1) 50 | @click.option('--traj_limitation', type=click.INT, default=200) 51 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 52 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 53 | @click.option('--disc_type', type=click.Choice(['decentralized', 'centralized', 'single']), default='decentralized') 54 | @click.option('--bc_iters', type=click.INT, default=500) 55 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters): 56 | 
expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 57 | env_ids = [env] 58 | lrs = [0.1] 59 | seeds = [seed] 60 | batch_sizes = [1000] 61 | 62 | logdir = './results' 63 | 64 | for env_id, seed, lr, batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 65 | train(logdir + '/gail/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}/seed-{}'.format( 66 | traj_limitation, lr, batch_size, dis_lr, bc_iters, seed), 67 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 68 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters) 69 | 70 | 71 | if __name__ == "__main__": 72 | main() 73 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/run_mack_ncdail.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import os 4 | import itertools 5 | import click 6 | import gym 7 | 8 | import make_env 9 | from rl import bench 10 | from rl import logger 11 | from rl.common import set_global_seeds 12 | from rl.common.vec_env.subproc_vec_env import SubprocVecEnv 13 | from irl.dataset import MADataSet 14 | from irl.mack.ncdail import learn 15 | from sandbox.mack.policies import CategoricalPolicy 16 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 17 | 18 | 19 | def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu, expert_path, 20 | traj_limitation, ret_threshold, dis_lr, disc_type='decentralized', bc_iters=500, d_iters=1): 21 | def create_env(rank): 22 | def _thunk(): 23 | env = make_env.make_env(env_id) 24 | env.seed(seed + rank) 25 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 26 | allow_early_resets=True) 27 | gym.logger.setLevel(logging.WARN) 28 | return env 29 | return _thunk 30 | 31 | logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard']) 32 | 33 | set_global_seeds(seed) 34 | env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True) 35 | print(num_cpu) 36 | policy_fn = CategoricalPolicy 37 | expert = MADataSet(expert_path, ret_threshold=ret_threshold, traj_limitation=traj_limitation) 38 | learn(policy_fn, expert, env, env_id, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu, 39 | nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0, dis_lr=dis_lr, 40 | disc_type=disc_type, bc_iters=bc_iters, identical=make_env.get_identical(env_id), d_iters=d_iters) 41 | env.close() 42 | 43 | 44 | @click.command() 45 | @click.option('--logdir', type=click.STRING, default='./results') 46 | @click.option('--env', type=click.STRING, default='simple_spread') 47 | @click.option('--expert_path', type=click.STRING, 48 | default='./results/mack_om/simple_push/l-0.1-b-1000/seed-1/checkpoint55000-200tra.pkl') 49 | @click.option('--seed', type=click.INT, default=1) 50 | @click.option('--traj_limitation', type=click.INT, default=200) 51 | @click.option('--ret_threshold', type=click.FLOAT, default=-10) 52 | @click.option('--dis_lr', type=click.FLOAT, default=0.1) 53 | @click.option('--disc_type', type=click.Choice(['decentralized', 'decentralized-all']), default='decentralized') 54 | @click.option('--bc_iters', type=click.INT, default=500) 55 | @click.option('--d_iters', type=click.INT, default=1) 56 | def main(logdir, env, expert_path, seed, traj_limitation, ret_threshold, dis_lr, disc_type, bc_iters, d_iters): 57 | 
expert_path='./results/mack_om/'+env+'/l-0.1-b-1000/seed-'+str(1)+'/checkpoint55000-200tra-{}.pkl'.format(seed) 58 | env_ids = [env] 59 | lrs = [0.1] 60 | seeds = [seed] 61 | batch_sizes = [1000] 62 | 63 | ldir = './results' 64 | 65 | for env_id, seed, lr, batch_size in itertools.product(env_ids, seeds, lrs, batch_sizes): 66 | logdir = ldir + '/ncdail/' + env_id + '/' + disc_type + '/s-{}/l-{}-b-{}-d-{}-c-{}/seed-{}'.format( 67 | traj_limitation, lr, batch_size, dis_lr, bc_iters, seed) 68 | print(logdir) 69 | train(logdir, 70 | env_id, 5e7, lr, batch_size, seed, batch_size // 250, expert_path, 71 | traj_limitation, ret_threshold, dis_lr, disc_type=disc_type, bc_iters=bc_iters, d_iters=d_iters) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /multi-agent-irl/irl/mack/tf_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | REG_VARS = 'reg_vars' 5 | 6 | def linear(X, dout, name, bias=True): 7 | with tf.variable_scope(name): 8 | dX = int(X.get_shape()[-1]) 9 | W = tf.get_variable('W', shape=(dX, dout)) 10 | tf.add_to_collection(REG_VARS, W) 11 | if bias: 12 | b = tf.get_variable('b', initializer=tf.constant(np.zeros(dout).astype(np.float32))) 13 | else: 14 | b = 0 15 | return tf.matmul(X, W)+b 16 | 17 | def discounted_reduce_sum(X, discount, axis=-1): 18 | if discount != 1.0: 19 | disc = tf.cumprod(discount*tf.ones_like(X), axis=axis) 20 | else: 21 | disc = 1.0 22 | return tf.reduce_sum(X*disc, axis=axis) 23 | 24 | def assert_shape(tens, shape): 25 | assert tens.get_shape().is_compatible_with(shape) 26 | 27 | def relu_layer(X, dout, name): 28 | return tf.nn.relu(linear(X, dout, name)) 29 | 30 | def softplus_layer(X, dout, name): 31 | return tf.nn.softplus(linear(X, dout, name)) 32 | 33 | def tanh_layer(X, dout, name): 34 | return tf.nn.tanh(linear(X, dout, name)) 35 | 36 | def get_session_config(): 37 | session_config = tf.ConfigProto() 38 | session_config.gpu_options.allow_growth = True 39 | #session_config.gpu_options.per_process_gpu_memory_fraction = 0.2 40 | return session_config 41 | 42 | 43 | def load_prior_params(pkl_fname): 44 | import joblib 45 | with tf.Session(config=get_session_config()): 46 | params = joblib.load(pkl_fname) 47 | tf.reset_default_graph() 48 | #joblib.dump(params, file_name, compress=3) 49 | params = params['irl_params'] 50 | #print(params) 51 | assert params is not None 52 | return params 53 | -------------------------------------------------------------------------------- /multi-agent-irl/plot_distribution.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | # source activate rl 3 | python -m irl.render --env=simple_speaker_listener --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 4 | python -m irl.render --env=simple_speaker_listener --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 5 | python -m irl.render --env=simple_speaker_listener --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 6 | python -m irl.render --env=simple_speaker_listener --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 7 | python -m irl.render --env=simple_speaker_listener --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 8 | python -m irl.render --env=simple_speaker_listener --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 9 | 10 | python -m irl.render --env=simple_spread --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 11 | python -m irl.render --env=simple_spread --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 12 | python -m irl.render --env=simple_spread --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 13 | python -m irl.render --env=simple_spread --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 14 | python -m irl.render --env=simple_spread --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 15 | python -m irl.render --env=simple_spread --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 16 | 17 | python -m irl.render --env=simple_push --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 18 | python -m irl.render --env=simple_push --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 19 | python -m irl.render --env=simple_push --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 20 | python -m irl.render --env=simple_push --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 21 | python -m irl.render --env=simple_push --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 22 | python -m irl.render --env=simple_push --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 23 | 24 | python -m irl.render --env=simple_tag --algo=mack_om --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 25 | python -m irl.render --env=simple_tag --algo=codail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 26 | python -m irl.render --env=simple_tag --algo=ncdail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 27 | python -m irl.render --env=simple_tag --algo=gail --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 28 | python -m irl.render --env=simple_tag --algo=airl --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 29 | python -m irl.render --env=simple_tag --algo=random --num_trajs=$1 --epoch=$2 --vis_dis --seed=$3 30 | 31 | -------------------------------------------------------------------------------- /multi-agent-irl/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow >= 1.2 2 | numpy 3 | scipy 4 | mpi4py 5 | ray 6 | click 7 | gym == 0.10.4 8 | tqdm 9 | joblib 10 | progressbar2 11 | zmq 12 | cloudpickle 13 | baselines 14 | box2d-py 15 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | from rl.envs.multi_walker import MultiWalkerEnv 3 | 4 | 5 | def make_env(env_id): 6 | if env_id == 'walker1': 7 | return MultiWalkerEnv(n_walkers=1, reward_mech='local') 8 | elif env_id == 'walker2': 9 | return MultiWalkerEnv(n_walkers=2, reward_mech='local') 10 | elif env_id == 'walker3': 11 | return 
MultiWalkerEnv(n_walkers=3, reward_mech='local') 12 | elif env_id == 'walker2c': 13 | return MultiWalkerEnv(n_walkers=2, reward_mech='local', competitive=True) 14 | elif env_id == 'walker4': 15 | return MultiWalkerEnv(n_walkers=4, reward_mech='local') 16 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/README.md: -------------------------------------------------------------------------------- 1 | Contains utilities for ACK (ACKTR but actually without Trust Region) -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apexrl/CoDAIL/d6996698155677b51f5b844d848bf2bdce0f8a5f/multi-agent-irl/rl/acktr/__init__.py -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/filters.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | 5 | from rl.acktr.running_stat import RunningStat 6 | 7 | 8 | class Filter(object): 9 | def __call__(self, x, update=True): 10 | raise NotImplementedError 11 | def reset(self): 12 | pass 13 | 14 | class IdentityFilter(Filter): 15 | def __call__(self, x, update=True): 16 | return x 17 | 18 | class CompositionFilter(Filter): 19 | def __init__(self, fs): 20 | self.fs = fs 21 | def __call__(self, x, update=True): 22 | for f in self.fs: 23 | x = f(x) 24 | return x 25 | def output_shape(self, input_space): 26 | out = input_space.shape 27 | for f in self.fs: 28 | out = f.output_shape(out) 29 | return out 30 | 31 | class ZFilter(Filter): 32 | """ 33 | y = (x-mean)/std 34 | using running estimates of mean,std 35 | """ 36 | 37 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 38 | self.demean = demean 39 | self.destd = destd 40 | self.clip = clip 41 | 42 | self.rs = RunningStat(shape) 43 | 44 | def __call__(self, x, update=True): 45 | if update: self.rs.push(x) 46 | if self.demean: 47 | x = x - self.rs.mean 48 | if self.destd: 49 | x = x / (self.rs.std+1e-8) 50 | if self.clip: 51 | x = np.clip(x, -self.clip, self.clip) 52 | return x 53 | def output_shape(self, input_space): 54 | return input_space.shape 55 | 56 | class AddClock(Filter): 57 | def __init__(self): 58 | self.count = 0 59 | def reset(self): 60 | self.count = 0 61 | def __call__(self, x, update=True): 62 | return np.append(x, self.count/100.0) 63 | def output_shape(self, input_space): 64 | return (input_space.shape[0]+1,) 65 | 66 | class FlattenFilter(Filter): 67 | def __call__(self, x, update=True): 68 | return x.ravel() 69 | def output_shape(self, input_space): 70 | return (int(np.prod(input_space.shape)),) 71 | 72 | class Ind2OneHotFilter(Filter): 73 | def __init__(self, n): 74 | self.n = n 75 | def __call__(self, x, update=True): 76 | out = np.zeros(self.n) 77 | out[x] = 1 78 | return out 79 | def output_shape(self, input_space): 80 | return (input_space.n,) 81 | 82 | class DivFilter(Filter): 83 | def __init__(self, divisor): 84 | self.divisor = divisor 85 | def __call__(self, x, update=True): 86 | return x / self.divisor 87 | def output_shape(self, input_space): 88 | return input_space.shape 89 | 90 | class StackFilter(Filter): 91 | def __init__(self, length): 92 | self.stack = deque(maxlen=length) 93 | def reset(self): 94 | self.stack.clear() 95 | def __call__(self, x, update=True): 96 | self.stack.append(x) 97 | while 
len(self.stack) < self.stack.maxlen: 98 | self.stack.append(x) 99 | return np.concatenate(self.stack, axis=-1) 100 | def output_shape(self, input_space): 101 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 102 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/kfac_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): 6 | if reduce_dim == None: 7 | # general batch matmul 8 | if len(a.get_shape()) == 3 and len(b.get_shape()) == 3: 9 | return tf.batch_matmul(a, b, adj_x=transpose_a, adj_y=transpose_b) 10 | elif len(a.get_shape()) == 3 and len(b.get_shape()) == 2: 11 | if transpose_b: 12 | N = b.get_shape()[0].value 13 | else: 14 | N = b.get_shape()[1].value 15 | B = a.get_shape()[0].value 16 | if transpose_a: 17 | K = a.get_shape()[1].value 18 | a = tf.reshape(tf.transpose(a, [0, 2, 1]), [-1, K]) 19 | else: 20 | K = a.get_shape()[-1].value 21 | a = tf.reshape(a, [-1, K]) 22 | result = tf.matmul(a, b, transpose_b=transpose_b) 23 | result = tf.reshape(result, [B, -1, N]) 24 | return result 25 | elif len(a.get_shape()) == 2 and len(b.get_shape()) == 3: 26 | if transpose_a: 27 | M = a.get_shape()[1].value 28 | else: 29 | M = a.get_shape()[0].value 30 | B = b.get_shape()[0].value 31 | if transpose_b: 32 | K = b.get_shape()[-1].value 33 | b = tf.transpose(tf.reshape(b, [-1, K]), [1, 0]) 34 | else: 35 | K = b.get_shape()[1].value 36 | b = tf.transpose(tf.reshape( 37 | tf.transpose(b, [0, 2, 1]), [-1, K]), [1, 0]) 38 | result = tf.matmul(a, b, transpose_a=transpose_a) 39 | result = tf.transpose(tf.reshape(result, [M, B, -1]), [1, 0, 2]) 40 | return result 41 | else: 42 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) 43 | else: 44 | # weird batch matmul 45 | if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: 46 | # reshape reduce_dim to the left most dim in b 47 | b_shape = b.get_shape() 48 | if reduce_dim != 0: 49 | b_dims = list(range(len(b_shape))) 50 | b_dims.remove(reduce_dim) 51 | b_dims.insert(0, reduce_dim) 52 | b = tf.transpose(b, b_dims) 53 | b_t_shape = b.get_shape() 54 | b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) 55 | result = tf.matmul(a, b, transpose_a=transpose_a, 56 | transpose_b=transpose_b) 57 | result = tf.reshape(result, b_t_shape) 58 | if reduce_dim != 0: 59 | b_dims = list(range(len(b_shape))) 60 | b_dims.remove(0) 61 | b_dims.insert(reduce_dim, 0) 62 | result = tf.transpose(result, b_dims) 63 | return result 64 | 65 | elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: 66 | # reshape reduce_dim to the right most dim in a 67 | a_shape = a.get_shape() 68 | outter_dim = len(a_shape) - 1 69 | reduce_dim = len(a_shape) - reduce_dim - 1 70 | if reduce_dim != outter_dim: 71 | a_dims = list(range(len(a_shape))) 72 | a_dims.remove(reduce_dim) 73 | a_dims.insert(outter_dim, reduce_dim) 74 | a = tf.transpose(a, a_dims) 75 | a_t_shape = a.get_shape() 76 | a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) 77 | result = tf.matmul(a, b, transpose_a=transpose_a, 78 | transpose_b=transpose_b) 79 | result = tf.reshape(result, a_t_shape) 80 | if reduce_dim != outter_dim: 81 | a_dims = list(range(len(a_shape))) 82 | a_dims.remove(outter_dim) 83 | a_dims.insert(reduce_dim, outter_dim) 84 | result = tf.transpose(result, a_dims) 85 | return result 86 | 87 | elif len(a.get_shape()) == 2 and 
len(b.get_shape()) == 2: 88 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) 89 | 90 | assert False, 'something went wrong' 91 | 92 | 93 | def clipoutNeg(vec, threshold=1e-6): 94 | mask = tf.cast(vec > threshold, tf.float32) 95 | return mask * vec 96 | 97 | 98 | def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): 99 | eigen_min = tf.reduce_min(input_mat) 100 | eigen_max = tf.reduce_max(input_mat) 101 | eigen_ratio = eigen_max / eigen_min 102 | input_mat_clipped = clipoutNeg(input_mat, threshold) 103 | 104 | if debug: 105 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( 106 | input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) 107 | 108 | return input_mat_clipped 109 | 110 | 111 | def factorReshape(Q, e, grad, facIndx=0, ftype='act'): 112 | grad_shape = grad.get_shape() 113 | if ftype == 'act': 114 | assert e.get_shape()[0] == grad_shape[facIndx] 115 | expanded_shape = [1, ] * len(grad_shape) 116 | expanded_shape[facIndx] = -1 117 | e = tf.reshape(e, expanded_shape) 118 | if ftype == 'grad': 119 | assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] 120 | expanded_shape = [1, ] * len(grad_shape) 121 | expanded_shape[len(grad_shape) - facIndx - 1] = -1 122 | e = tf.reshape(e, expanded_shape) 123 | 124 | return Q, e 125 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/acktr/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] 
= self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.bench.benchmarks import * 2 | from rl.bench.monitor import * 3 | 4 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/bench/benchmarks.py: -------------------------------------------------------------------------------- 1 | _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] 2 | _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] 3 | 4 | _BENCHMARKS = [] 5 | 6 | def register_benchmark(benchmark): 7 | for b in _BENCHMARKS: 8 | if b['name'] == benchmark['name']: 9 | raise ValueError('Benchmark with name %s already registered!'%b['name']) 10 | _BENCHMARKS.append(benchmark) 11 | 12 | def list_benchmarks(): 13 | return [b['name'] for b in _BENCHMARKS] 14 | 15 | def get_benchmark(benchmark_name): 16 | for b in _BENCHMARKS: 17 | if b['name'] == benchmark_name: 18 | return b 19 | raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) 20 | 21 | def get_task(benchmark, env_id): 22 | """Get a task by env_id. Return None if the benchmark doesn't have the env""" 23 | return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) 24 | 25 | def find_task_for_env_id_in_any_benchmark(env_id): 26 | for bm in _BENCHMARKS: 27 | for task in bm["tasks"]: 28 | if task["env_id"]==env_id: 29 | return bm, task 30 | return None, None 31 | 32 | _ATARI_SUFFIX = 'NoFrameskip-v4' 33 | 34 | register_benchmark({ 35 | 'name' : 'Atari200M', 36 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames', 37 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7] 38 | }) 39 | 40 | register_benchmark({ 41 | 'name' : 'Atari40M', 42 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', 43 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7] 44 | }) 45 | 46 | register_benchmark({ 47 | 'name' : 'Atari1Hr', 48 | 'description' :'7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', 49 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7] 50 | }) 51 | 52 | register_benchmark({ 53 | 'name' : 'AtariExploration40M', 54 | 'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames', 55 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7] 56 | }) 57 | 58 | 59 | # MuJoCo 60 | 61 | _mujocosmall = [ 62 | 'InvertedDoublePendulum-v1', 'InvertedPendulum-v1', 63 | 'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1', 64 | 'Reacher-v1', 'Swimmer-v1'] 65 | register_benchmark({ 66 | 'name' : 'Mujoco1M', 67 | 'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps', 68 | 'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall] 69 | }) 70 | register_benchmark({ 71 | 'name' : 'MujocoWalkers', 72 | 'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M', 73 | 'tasks' : [ 74 | {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 75 | {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 76 | {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, 77 | ] 78 | }) 79 | # To reproduce: 80 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8 81 | # (observation input filters necessary) 82 | 83 | 84 | # Roboschool 85 | 86 | register_benchmark({ 87 | 'name' : 'Roboschool8M', 88 | 'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores', 89 | 'tasks' : [ 90 | {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 }, 91 | {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 92 | {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 93 | {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 94 | {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 95 | ] 96 | }) 97 | register_benchmark({ 98 | 'name' : 'RoboschoolHarder', 99 | 'description' : 'Test your might!!! Up to 12 hours on 32 cores', 100 | 'tasks' : [ 101 | {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, 102 | {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 }, 103 | {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 }, 104 | ] 105 | }) 106 | # To reproduce: 107 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8 108 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536 109 | # (Large network, train on 65536 samples each iteration. 
Also, _large is really necessary only for Harder) 110 | 111 | 112 | # Other 113 | 114 | _atari50 = [ # actually 49 115 | 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 116 | 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 117 | 'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', 118 | 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', 119 | 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', 120 | 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 121 | 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', 122 | 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 123 | 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 124 | 'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon', 125 | ] 126 | 127 | register_benchmark({ 128 | 'name' : 'Atari50_40M', 129 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', 130 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50] 131 | }) 132 | 133 | def env_shortname(s): 134 | "Make typical names above shorter, while keeping recognizable" 135 | s = s.replace("NoFrameskip", "") 136 | if s[:10]=="Roboschool": s = s[10:] 137 | i = s.rfind("-v") 138 | if i!=-1: s = s[:i] 139 | 140 | return s.lower() 141 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.common.console_util import * 2 | from rl.common.dataset import Dataset 3 | from rl.common.math_util import * 4 | from rl.common.misc_util import * 5 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = "%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/dataset.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | for key in self.data_map: 18 | self.data_map[key] = self.data_map[key][perm] 19 | 20 | self._next_id = 0 21 | 22 | def next_batch(self, batch_size): 23 | if self._next_id >= self.n and self.enable_shuffle: 24 | self.shuffle() 25 | 26 | cur_id = self._next_id 27 | cur_batch_size = min(batch_size, self.n - self._next_id) 28 | self._next_id += cur_batch_size 29 | 30 | data_map = dict() 31 | for key in self.data_map: 32 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 33 | return data_map 34 | 35 | def iterate_once(self, batch_size): 36 | if self.enable_shuffle: self.shuffle() 37 | 38 | while self._next_id <= self.n - batch_size: 39 | yield self.next_batch(batch_size) 40 | self._next_id = 0 41 | 42 | def subset(self, num_elements, deterministic=True): 43 | data_map = dict() 44 | for key in self.data_map: 45 | data_map[key] = self.data_map[key][:num_elements] 46 | return Dataset(data_map, deterministic) 47 | 48 | 49 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 50 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 51 | arrays = tuple(map(np.asarray, arrays)) 52 | n = arrays[0].shape[0] 53 | assert all(a.shape[0] == n for a in arrays[1:]) 54 | inds = np.arange(n) 55 | if shuffle: np.random.shuffle(inds) 56 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 57 | for batch_inds in np.array_split(inds, sections): 58 | if include_final_partial_batch or len(batch_inds) == batch_size: 59 | yield tuple(a[batch_inds] for a in arrays) 60 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/ma_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | from PIL import Image 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class MAWrapper(gym.Wrapper): 9 | def __init__(self, env): 10 | gym.Wrapper.__init__(self, env) 11 | self.observation_space = [self.env.observation_space] 12 | self.action_space = [self.env.action_space] 13 | self.n = 1 14 | 15 | def step(self, action): 16 | obs, reward, done, info = self.env.step(action[0]) 17 | return [obs], [reward], [done], info 18 | 19 | def reset(self): 20 | return [self.env.reset()] 21 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred, axis=0)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import rl.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if
self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /multi-agent-irl/rl/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from rl.common import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | 
ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python