├── .gitignore ├── LICENSE ├── README.md ├── bin └── examine.py ├── examples ├── blueprint.jsonnet ├── blueprint.npz ├── hide_and_seek_full.jsonnet ├── hide_and_seek_full.npz ├── hide_and_seek_policy_phases │ ├── a_chasing.npz │ ├── b_forts.npz │ ├── c_ramps.npz │ ├── d_ramp_defense.npz │ └── e_box_surfing.npz ├── hide_and_seek_quadrant.jsonnet ├── hide_and_seek_quadrant.npz ├── hide_and_seek_quadrant_physics_exploits.npz ├── lock_and_return.jsonnet ├── lock_and_return.npz ├── sequential_lock.jsonnet ├── sequential_lock.npz ├── shelter.jsonnet ├── shelter.npz └── test_all_policies.py ├── ma_policy ├── graph_construct.py ├── layers.py ├── load_policy.py ├── ma_policy.py ├── normalizers.py ├── util.py └── variable_schema.py ├── mae_envs ├── __init__.py ├── envs │ ├── __init__.py │ ├── base.py │ ├── blueprint_construction.py │ ├── box_locking.py │ ├── hide_and_seek.py │ └── shelter_construction.py ├── modules │ ├── __init__.py │ ├── agents.py │ ├── construction_sites.py │ ├── food.py │ ├── module.py │ ├── objects.py │ ├── util.py │ ├── walls.py │ └── world.py ├── util │ ├── geometry.py │ ├── transforms.py │ └── vision.py ├── viewer │ ├── __init__.py │ ├── env_viewer.py │ └── policy_viewer.py └── wrappers │ ├── food.py │ ├── lidar.py │ ├── limit_mvmnt.py │ ├── line_of_sight.py │ ├── manipulation.py │ ├── multi_agent.py │ ├── prep_phase.py │ ├── team.py │ └── util.py ├── randomized_uncertain_social_preferences ├── rusp │ ├── README.md │ ├── __init__.py │ ├── abstract_base_env.py │ ├── env_indirect_reciprocity.py │ ├── env_ipd.py │ ├── env_oasis.py │ ├── env_prisoners_buddy.py │ ├── test_env_indirect_reciprocity.py │ ├── test_env_ipd.py │ ├── test_env_oasis.py │ ├── test_env_prisoners_buddy.py │ ├── test_wrapper_rusp.py │ ├── wrappers_rusp.py │ └── wrappers_util.py └── setup.py ├── requirements_ma_policy.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Status:** Archive (code is provided as-is, no updates expected) 2 | 3 | # Multiagent emergence environments 4 | Environment generation code for [Emergent Tool Use From Multi-Agent Autocurricula](https://arxiv.org/abs/1909.07528) ([blog](https://openai.com/blog/emergent-tool-use/)) 5 | 6 | ### Installation 7 | This repository depends on the [mujoco-worldgen](https://github.com/openai/mujoco-worldgen) package. 
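Note that mujoco-worldgen itself requires a working MuJoCo / mujoco-py installation; see the mujoco-worldgen README for setup details. The commands below assume both repositories are cloned side by side in the same parent directory.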
You will need to clone the mujoco-worldgen repository and install it and its dependencies: 8 | ``` 9 | pip install -r mujoco-worldgen/requirements.txt 10 | pip install -e mujoco-worldgen/ 11 | pip install -e multi-agent-emergence-environments/ 12 | ``` 13 | 14 | This repository has been tested only on Mac OS X and Ubuntu 16.04 with Python 3.6 15 | 16 | ### Use 17 | 18 | Environment construction works in the following way: You start from the `Base` environment (defined in `mae_envs/envs/base.py`) and then you add environment modules (e.g. `Boxes`, `Ramps`, `RandomWalls`, etc.) and then wrappers on top. You can see examples in the `mae_envs/envs` folder. 19 | 20 | If you want to construct a new environment, we highly recommend using the above paradigm in order to minimize code duplication. If you need new objects or game dynamics that don't already exist in this codebase, add them in via a new `EnvModule` class or a `gym.Wrapper` class rather than subclassing `Base` (or mujoco-worldgen's `Env` class). In general, `EnvModules` should be used for adding objects or sites to the environment, or otherwise modifying the mujoco simulator; wrappers should be used for everything else (e.g. adding rewards, additional observations, or implementing game mechanics like Lock and Grab). 21 | 22 | The environments defined in this repository are: \ 23 | *Hide and seek* - `mae_envs/envs/hide_and_seek.py` - The Hide and Seek environment described in the paper. This encompasses the *random rooms*, *quadrant* and *food* versions of the game (you can switch between them by changing the arguments given to the `make_env` function in the file) \ 24 | *Box locking* - `mae_envs/envs/box_locking.py` - Encompasses the *Lock and Return* and *Sequential Lock* transfer tasks described in the paper. \ 25 | *Blueprint Construction* - `mae_envs/envs/blueprint_construction.py` \ 26 | *Shelter Construction* - `mae_envs/envs/shelter_construction.py` 27 | 28 | You can test out environments by using the `bin/examine` script. Example usage: `bin/examine.py base`. \ 29 | You can also use `bin/examine` to play a saved policy on an environment. There are several environment jsonnets and policies in the `examples` folder. Example usage: 30 | 31 | ```bin/examine.py examples/hide_and_seek_quadrant.jsonnet examples/hide_and_seek_quadrant.npz``` 32 | 33 | Note that to be able to play saved policies, you will need to install a few additional packages. You can do this via 34 | 35 | `pip install -r multi-agent-emergence-environments/requirements_ma_policy.txt` 36 | -------------------------------------------------------------------------------- /bin/examine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import logging 3 | import click 4 | import numpy as np 5 | from os.path import abspath, dirname, join 6 | from gym.spaces import Tuple 7 | 8 | from mae_envs.viewer.env_viewer import EnvViewer 9 | from mae_envs.wrappers.multi_agent import JoinMultiAgentActions 10 | from mujoco_worldgen.util.envs import examine_env, load_env 11 | from mujoco_worldgen.util.types import extract_matching_arguments 12 | from mujoco_worldgen.util.parse_arguments import parse_arguments 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @click.command() 19 | @click.argument('argv', nargs=-1, required=False) 20 | def main(argv): 21 | ''' 22 | examine.py is used to display environments and run policies. 
23 | 24 | For an example environment jsonnet, see 25 | mujoco-worldgen/examples/example_env_examine.jsonnet 26 | You can find saved policies in the 'examples' folder together with the environment they were 27 | trained in and the hyperparameters used. The naming used is 'examples/<env_name>.jsonnet' for 28 | the environment jsonnet file and 'examples/<env_name>.npz' for the policy weights file. 29 | Example uses: 30 | bin/examine.py hide_and_seek 31 | bin/examine.py mae_envs/envs/base.py 32 | bin/examine.py base n_boxes=6 n_ramps=2 n_agents=3 33 | bin/examine.py my_env_jsonnet.jsonnet 34 | bin/examine.py my_env_jsonnet.jsonnet my_policy.npz 35 | bin/examine.py hide_and_seek my_policy.npz n_hiders=3 n_seekers=2 n_boxes=8 n_ramps=1 36 | ''' 37 | names, kwargs = parse_arguments(argv) 38 | 39 | env_name = names[0] 40 | core_dir = abspath(join(dirname(__file__), '..')) 41 | envs_dir = 'mae_envs/envs', 42 | xmls_dir = 'xmls', 43 | 44 | if len(names) == 1: # examine the environment 45 | examine_env(env_name, kwargs, 46 | core_dir=core_dir, envs_dir=envs_dir, xmls_dir=xmls_dir, 47 | env_viewer=EnvViewer) 48 | 49 | if len(names) >= 2: # run policies on the environment 50 | # importing PolicyViewer and load_policy here because they depend on several 51 | # packages which are only needed for playing policies, not for any of the 52 | # environments code. 53 | from mae_envs.viewer.policy_viewer import PolicyViewer 54 | from ma_policy.load_policy import load_policy 55 | policy_names = names[1:] 56 | env, args_remaining_env = load_env(env_name, core_dir=core_dir, 57 | envs_dir=envs_dir, xmls_dir=xmls_dir, 58 | return_args_remaining=True, **kwargs) 59 | 60 | if isinstance(env.action_space, Tuple): 61 | env = JoinMultiAgentActions(env) 62 | if env is None: 63 | raise Exception(f'Could not find environment based on pattern {env_name}') 64 | 65 | env.reset() # generate action and observation spaces 66 | assert np.all([name.endswith('.npz') for name in policy_names]) 67 | policies = [load_policy(name, env=env, scope=f'policy_{i}') 68 | for i, name in enumerate(policy_names)] 69 | 70 | 71 | args_remaining_policy = args_remaining_env 72 | 73 | if env is not None and policies is not None: 74 | args_to_pass, args_remaining_viewer = extract_matching_arguments(PolicyViewer, kwargs) 75 | args_remaining = set(args_remaining_env) 76 | args_remaining = args_remaining.intersection(set(args_remaining_policy)) 77 | args_remaining = args_remaining.intersection(set(args_remaining_viewer)) 78 | assert len(args_remaining) == 0, ( 79 | f"There are unused arguments left: {args_remaining}."
There shouldn't be any.") 80 | viewer = PolicyViewer(env, policies, **args_to_pass) 81 | viewer.run() 82 | 83 | 84 | print(main.__doc__) 85 | 86 | 87 | if __name__ == '__main__': 88 | logging.getLogger('').handlers = [] 89 | logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) 90 | 91 | main() 92 | -------------------------------------------------------------------------------- /examples/blueprint.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | make_env: { 3 | "function": "mae_envs.envs.blueprint_construction:make_env", 4 | args: { 5 | # Agent Actions 6 | grab_box: true, 7 | grab_out_of_vision: false, 8 | grab_selective: false, 9 | grab_exclusive: false, 10 | 11 | lock_box: true, 12 | lock_type: "all_lock_team_specific", 13 | lock_out_of_vision: false, 14 | 15 | 16 | # Scenario 17 | n_substeps: 15, 18 | horizon: 150, 19 | 20 | # Objects 21 | n_boxes: 8, 22 | box_only_z_rot: true, 23 | boxid_obs: false, 24 | boxsize_obs: true, 25 | 26 | # Observations 27 | n_lidar_per_agent: 30, 28 | additional_obs: { 29 | hider: [[0]], 30 | prep_obs: [[0]], 31 | ramp_obs: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 32 | mask_ar_obs: [[0]], 33 | }, 34 | 35 | # Blueprint 36 | n_sites: [1, 4], 37 | site_placement: 'uniform_away_from_walls', 38 | reward_infos: [ 39 | { 40 | type: 'construction_dense', 41 | alpha: -1.5, 42 | use_corners: true, 43 | reward_scale: 0.05, 44 | }, 45 | { 46 | type: 'construction_completed', 47 | site_activation_radius: 0.1, 48 | use_corners: true, 49 | reward_scale: 3, 50 | }, 51 | ], 52 | }, 53 | }, 54 | } -------------------------------------------------------------------------------- /examples/blueprint.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/blueprint.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_full.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | make_env: { 3 | "function": "mae_envs.envs.hide_and_seek:make_env", 4 | args: { 5 | # Agents 6 | n_hiders: 2, 7 | n_seekers: 2, 8 | # Agent Actions 9 | grab_box: true, 10 | grab_out_of_vision: false, 11 | grab_selective: false, 12 | grab_exclusive: false, 13 | 14 | lock_box: true, 15 | lock_type: "all_lock_team_specific", 16 | lock_out_of_vision: false, 17 | 18 | # Scenario 19 | n_substeps: 15, 20 | horizon: 240, 21 | scenario: "randomwalls", 22 | n_rooms: 4, 23 | random_room_number: true, 24 | prob_outside_walls: 0.5, 25 | prep_fraction: 0.4, 26 | rew_type: "joint_zero_sum", 27 | restrict_rect: [-6.0, -6.0, 12.0, 12.0], 28 | 29 | hiders_together_radius: 0.5, 30 | seekers_together_radius: 0.5, 31 | 32 | # Objects 33 | n_boxes: [3, 9], 34 | n_elongated_boxes: [3, 9], 35 | box_only_z_rot: true, 36 | boxid_obs: false, 37 | 38 | n_ramps: 2, 39 | 40 | # Food 41 | n_food: 0, 42 | max_food_health: 40, 43 | food_radius: 0.5, 44 | food_box_centered: true, 45 | food_together_radius: 0.25, 46 | food_respawn_time: 5, 47 | food_rew_type: "joint_mean", 48 | 49 | # Observations 50 | n_lidar_per_agent: 30, 51 | prep_obs: true, 52 | }, 53 | }, 54 | } 55 | -------------------------------------------------------------------------------- /examples/hide_and_seek_full.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_full.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_policy_phases/a_chasing.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_policy_phases/a_chasing.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_policy_phases/b_forts.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_policy_phases/b_forts.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_policy_phases/c_ramps.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_policy_phases/c_ramps.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_policy_phases/d_ramp_defense.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_policy_phases/d_ramp_defense.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_policy_phases/e_box_surfing.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_policy_phases/e_box_surfing.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_quadrant.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | make_env: { 3 | "function": "mae_envs.envs.hide_and_seek:make_env", 4 | args: { 5 | # Agents 6 | n_hiders: 2, 7 | n_seekers: 2, 8 | # Agent Actions 9 | grab_box: true, 10 | grab_out_of_vision: false, 11 | grab_selective: false, 12 | grab_exclusive: false, 13 | 14 | lock_box: true, 15 | lock_type: "all_lock_team_specific", 16 | lock_out_of_vision: false, 17 | 18 | # Scenario 19 | n_substeps: 15, 20 | horizon: 80, 21 | scenario: 'quadrant', 22 | prep_fraction: 0.4, 23 | rew_type: "joint_zero_sum", 24 | restrict_rect: [0.1, 0.1, 5.9, 5.9], 25 | p_door_dropout: 0.5, 26 | quadrant_game_hider_uniform_placement: true, 27 | 28 | # Objects 29 | n_boxes: 2, 30 | box_only_z_rot: true, 31 | boxid_obs: false, 32 | 33 | n_ramps: 1, 34 | lock_ramp: false, 35 | penalize_objects_out: true, 36 | 37 | # Food 38 | n_food: 0, 39 | 40 | # Observations 41 | n_lidar_per_agent: 30, 42 | prep_obs: true, 43 | }, 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /examples/hide_and_seek_quadrant.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_quadrant.npz -------------------------------------------------------------------------------- /examples/hide_and_seek_quadrant_physics_exploits.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/hide_and_seek_quadrant_physics_exploits.npz -------------------------------------------------------------------------------- /examples/lock_and_return.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | make_env: { 3 | "function": "mae_envs.envs.box_locking:make_env", 4 | args: { 5 | # Agents 6 | n_agents: 1, 7 | # Agent Actions 8 | grab_box: true, 9 | grab_out_of_vision: false, 10 | grab_selective: false, 11 | grab_exclusive: false, 12 | 13 | lock_box: true, 14 | lock_type: "all_lock_team_specific", 15 | lock_out_of_vision: false, 16 | 17 | # Scenario 18 | n_substeps: 15, 19 | horizon: 120, 20 | scenario: "randomwalls", 21 | n_rooms: 6, 22 | random_room_number: false, 23 | 24 | # Objects 25 | box_only_z_rot: true, 26 | boxid_obs: false, 27 | boxsize_obs: true, 28 | pad_ramp_size: true, 29 | 30 | # Observations 31 | n_lidar_per_agent: 30, 32 | additional_obs: { 33 | hider: [[1]], 34 | prep_obs: [[0]], 35 | ramp_obs: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 36 | ramp_obj_lock: [[0]], 37 | ramp_you_lock: [[[0]]], 38 | ramp_team_lock: [[[0]]], 39 | mask_ar_obs: [[0]], 40 | }, 41 | 42 | # Lock Box Task 43 | n_boxes: 1, 44 | task_type: 'all-return', 45 | lock_reward: 5.0, 46 | unlock_penalty: 5.0, 47 | shaped_reward_scale: 0.5, 48 | }, 49 | }, 50 | } -------------------------------------------------------------------------------- /examples/lock_and_return.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/lock_and_return.npz -------------------------------------------------------------------------------- /examples/sequential_lock.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | make_env: { 3 | "function": "mae_envs.envs.box_locking:make_env", 4 | args: { 5 | # Agents 6 | n_agents: 1, 7 | fixed_agent_spawn: false, 8 | 9 | # Agent Actions 10 | grab_box: true, 11 | grab_out_of_vision: false, 12 | grab_selective: false, 13 | grab_exclusive: false, 14 | 15 | lock_box: true, 16 | lock_type: "all_lock_team_specific", 17 | lock_out_of_vision: false, 18 | 19 | 20 | # Scenario 21 | n_substeps: 15, 22 | n_ramps: 3, 23 | horizon: 120, 24 | scenario: "var_tri_uniform", 25 | door_size: 0, 26 | 27 | # Objects 28 | box_only_z_rot: true, 29 | boxid_obs: false, 30 | boxsize_obs: true, 31 | pad_ramp_size: true, 32 | 33 | # Observations 34 | n_lidar_per_agent: 30, 35 | additional_obs: { 36 | hider: [[1]], 37 | prep_obs: [[0]], 38 | }, 39 | 40 | # Lock Box Task 41 | n_boxes: 4, 42 | task_type: 'order', 43 | lock_reward: 5.0, 44 | unlock_penalty: 5.0, 45 | shaped_reward_scale: 0.5, 46 | }, 47 | }, 48 | } -------------------------------------------------------------------------------- /examples/sequential_lock.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/sequential_lock.npz -------------------------------------------------------------------------------- /examples/shelter.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | make_env: { 3 | "function": "mae_envs.envs.shelter_construction:make_env", 4 | args: { 5 | # Agent Actions 6 | grab_box: true, 7 | grab_out_of_vision: false, 8 | grab_selective: false, 9 | grab_exclusive: false, 10 | 11 | lock_box: true, 12 | lock_type: "all_lock_team_specific", 13 | lock_out_of_vision: false, 14 | 15 | # Scenario 16 | n_substeps: 15, 17 | horizon: 240, 18 | 19 | # Objects 20 | n_boxes: 8, 21 | n_elongated_boxes: 3, 22 | box_only_z_rot: true, 23 | boxid_obs: false, 24 | 25 | # Observations 26 | n_lidar_per_agent: 30, 27 | additional_obs: { 28 | hider: [[0]], 29 | prep_obs: [[0]], 30 | ramp_obs: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 31 | mask_ar_obs: [[0]], 32 | }, 33 | 34 | # Shelter 35 | shelter_reward_scale: 0.001, 36 | objective_diameter: [1.5, 2], 37 | objective_placement: 'uniform_away_from_walls', 38 | }, 39 | }, 40 | } -------------------------------------------------------------------------------- /examples/shelter.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/examples/shelter.npz -------------------------------------------------------------------------------- /examples/test_all_policies.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | import pytest 4 | import os 5 | 6 | EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__)) 7 | EXAMINE_FILE_PATH = os.path.join(EXAMPLES_DIR, "../bin/examine.py") 8 | 9 | class ExamineTest(unittest.TestCase): 10 | def test_examine_env(self): 11 | envs = [ 12 | "hide_and_seek_full.jsonnet", 13 | "hide_and_seek_quadrant.jsonnet", 14 | "blueprint.jsonnet", 15 | "lock_and_return.jsonnet", 16 | "sequential_lock.jsonnet", 17 | "shelter.jsonnet", 18 | ] 19 | for env in envs: 20 | with self.assertRaises(subprocess.TimeoutExpired): 21 | subprocess.check_call( 22 | ["/usr/bin/env", "python", EXAMINE_FILE_PATH, os.path.join(EXAMPLES_DIR, env)], 23 | timeout=10) 24 | 25 | 26 | def test_examine_policies(self): 27 | envs_policies = [ 28 | ("hide_and_seek_full.jsonnet", "hide_and_seek_full.npz"), 29 | ("hide_and_seek_quadrant.jsonnet", "hide_and_seek_quadrant.npz"), 30 | ("blueprint.jsonnet", "blueprint.npz"), 31 | ("lock_and_return.jsonnet", "lock_and_return.npz"), 32 | ("sequential_lock.jsonnet", "sequential_lock.npz"), 33 | ("shelter.jsonnet", "shelter.npz"), 34 | ] 35 | for env, policy in envs_policies: 36 | with self.assertRaises(subprocess.TimeoutExpired): 37 | subprocess.check_call( 38 | ["/usr/bin/env", "python", EXAMINE_FILE_PATH, os.path.join(EXAMPLES_DIR, env), os.path.join(EXAMPLES_DIR, policy)], 39 | timeout=15) 40 | -------------------------------------------------------------------------------- /ma_policy/layers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from ma_policy.util import shape_list 4 | 5 | 6 | ################# 7 | # Pooling ####### 8 | ################# 9 | 10 | def entity_avg_pooling_masked(x, mask): 11 | ''' 12 | Masks and pools x along 
the second to last dimension. Arguments have dimensions: 13 | x: batch x time x n_entities x n_features 14 | mask: batch x time x n_entities 15 | ''' 16 | mask = tf.expand_dims(mask, -1) 17 | masked = x * mask 18 | summed = tf.reduce_sum(masked, -2) 19 | denom = tf.reduce_sum(mask, -2) + 1e-5 20 | return summed / denom 21 | 22 | 23 | def entity_max_pooling_masked(x, mask): 24 | ''' 25 | Masks and pools x along the second to last dimension. Arguments have dimensions: 26 | x: batch x time x n_entities x n_features 27 | mask: batch x time x n_entities 28 | ''' 29 | mask = tf.expand_dims(mask, -1) 30 | has_unmasked_entities = tf.sign(tf.reduce_sum(mask, axis=-2, keepdims=True)) 31 | offset = (mask - 1) * 1e9 32 | masked = (x + offset) * has_unmasked_entities 33 | return tf.reduce_max(masked, -2) 34 | 35 | 36 | ################# 37 | # Contat Ops #### 38 | ################# 39 | 40 | def entity_concat(inps): 41 | ''' 42 | Concat 4D tensors along the third dimension. If a 3D tensor is in the list 43 | then treat it as a single entity and expand the third dimension 44 | Args: 45 | inps (list of tensors): tensors to concatenate 46 | ''' 47 | with tf.variable_scope('concat_entities'): 48 | shapes = [shape_list(_x) for _x in inps] 49 | # For inputs that don't have entity dimension add one. 50 | inps = [_x if len(_shape) == 4 else tf.expand_dims(_x, 2) for _x, _shape in zip(inps, shapes)] 51 | shapes = [shape_list(_x) for _x in inps] 52 | assert np.all([_shape[-1] == shapes[0][-1] for _shape in shapes]),\ 53 | f"Some entities don't have the same outer or inner dimensions {shapes}" 54 | # Concatenate along entity dimension 55 | out = tf.concat(inps, -2) 56 | return out 57 | 58 | 59 | def concat_entity_masks(inps, masks): 60 | ''' 61 | Concats masks together. If mask is None, then it creates 62 | a tensor of 1's with shape (BS, T, NE). 63 | Args: 64 | inps (list of tensors): tensors that masks apply to 65 | masks (list of tensors): corresponding masks 66 | ''' 67 | assert len(inps) == len(masks), "There should be the same number of inputs as masks" 68 | with tf.variable_scope('concat_masks'): 69 | shapes = [shape_list(_x) for _x in inps] 70 | new_masks = [] 71 | for inp, mask in zip(inps, masks): 72 | if mask is None: 73 | inp_shape = shape_list(inp) 74 | if len(inp_shape) == 4: # this is an entity tensor 75 | new_masks.append(tf.ones(inp_shape[:3])) 76 | elif len(inp_shape) == 3: # this is a pooled or main tensor. Set NE (outer dimension) to 1 77 | new_masks.append(tf.ones(inp_shape[:2] + [1])) 78 | else: 79 | new_masks.append(mask) 80 | new_mask = tf.concat(new_masks, -1) 81 | return new_mask 82 | 83 | 84 | ################# 85 | # Transformer ### 86 | ################# 87 | 88 | 89 | def residual_sa_block(inp, mask, heads, n_embd, 90 | layer_norm=False, post_sa_layer_norm=False, 91 | n_mlp=1, qk_w=0.125, v_w=0.125, post_w=0.125, 92 | mlp_w1=0.125, mlp_w2=0.125, 93 | scope="residual_sa_block", reuse=False): 94 | ''' 95 | Residual self attention block for entities. 96 | Notation: 97 | T - Time 98 | NE - Number entities 99 | Args: 100 | inp (tf): (BS, T, NE, f) 101 | mask (tf): (BS, T, NE) 102 | heads (int) -- number of attention heads 103 | n_embd (int) -- dimension of queries, keys, and values will be n_embd / heads 104 | layer_norm (bool) -- normalize embedding prior to computing qkv 105 | n_mlp (int) -- number of mlp layers. If there are more than 1 mlp layers, we'll add a residual 106 | connection from after the first mlp to after the last mlp. 
107 | qk_w, v_w, post_w, mlp_w1, mlp_w2 (float) -- scale for gaussian init for keys/queries, values, mlp 108 | post self attention, second mlp, and third mlp, respectively. Std will be sqrt(scale/n_embd) 109 | scope (string) -- tf scope 110 | reuse (bool) -- tf reuse 111 | ''' 112 | with tf.variable_scope(scope, reuse=reuse): 113 | a = self_attention(inp, mask, heads, n_embd, layer_norm=layer_norm, qk_w=qk_w, v_w=v_w, 114 | scope='self_attention', reuse=reuse) 115 | post_scale = np.sqrt(post_w / n_embd) 116 | post_a_mlp = tf.layers.dense(a, 117 | n_embd, 118 | kernel_initializer=tf.random_normal_initializer(stddev=post_scale), 119 | name="mlp1") 120 | x = inp + post_a_mlp 121 | if post_sa_layer_norm: 122 | with tf.variable_scope('post_a_layernorm'): 123 | x = tf.contrib.layers.layer_norm(x, begin_norm_axis=3) 124 | if n_mlp > 1: 125 | mlp = x 126 | mlp2_scale = np.sqrt(mlp_w1 / n_embd) 127 | mlp = tf.layers.dense(mlp, 128 | n_embd, 129 | kernel_initializer=tf.random_normal_initializer(stddev=mlp2_scale), 130 | name="mlp2") 131 | if n_mlp > 2: 132 | mlp3_scale = np.sqrt(mlp_w2 / n_embd) 133 | mlp = tf.layers.dense(mlp, 134 | n_embd, 135 | kernel_initializer=tf.random_normal_initializer(stddev=mlp3_scale), 136 | name="mlp3") 137 | if n_mlp > 1: 138 | x = x + mlp 139 | return x 140 | 141 | 142 | def self_attention(inp, mask, heads, n_embd, layer_norm=False, qk_w=1.0, v_w=0.01, 143 | scope='', reuse=False): 144 | ''' 145 | Self attention over entities. 146 | Notation: 147 | T - Time 148 | NE - Number entities 149 | Args: 150 | inp (tf) -- tensor w/ shape (bs, T, NE, features) 151 | mask (tf) -- binary tensor with shape (bs, T, NE). For each batch x time, 152 | nner matrix represents entity i's ability to see entity j 153 | heads (int) -- number of attention heads 154 | n_embd (int) -- dimension of queries, keys, and values will be n_embd / heads 155 | layer_norm (bool) -- normalize embedding prior to computing qkv 156 | qk_w, v_w (float) -- scale for gaussian init for keys/queries and values 157 | Std will be sqrt(scale/n_embd) 158 | scope (string) -- tf scope 159 | reuse (bool) -- tf reuse 160 | ''' 161 | with tf.variable_scope(scope, reuse=reuse): 162 | bs, T, NE, features = shape_list(inp) 163 | # Put mask in format correct for logit matrix 164 | entity_mask = None 165 | if mask is not None: 166 | with tf.variable_scope('expand_mask'): 167 | assert np.all(np.array(mask.get_shape().as_list()) == np.array(inp.get_shape().as_list()[:3])),\ 168 | f"Mask and input should have the same first 3 dimensions. 
{shape_list(mask)} -- {shape_list(inp)}" 169 | entity_mask = mask 170 | mask = tf.expand_dims(mask, -2) # (BS, T, 1, NE) 171 | 172 | query, key, value = qkv_embed(inp, heads, n_embd, layer_norm=layer_norm, qk_w=qk_w, v_w=v_w, reuse=reuse) 173 | logits = tf.matmul(query, key, name="matmul_qk_parallel") # (bs, T, heads, NE, NE) 174 | logits /= np.sqrt(n_embd / heads) 175 | softmax = stable_masked_softmax(logits, mask) 176 | att_sum = tf.matmul(softmax, value, name="matmul_softmax_value") # (bs, T, heads, NE, features) 177 | with tf.variable_scope('flatten_heads'): 178 | out = tf.transpose(att_sum, (0, 1, 3, 2, 4)) # (bs, T, n_output_entities, heads, features) 179 | n_output_entities = shape_list(out)[2] 180 | out = tf.reshape(out, (bs, T, n_output_entities, n_embd)) # (bs, T, n_output_entities, n_embd) 181 | 182 | return out 183 | 184 | 185 | def stable_masked_softmax(logits, mask): 186 | ''' 187 | Args: 188 | logits (tf): tensor with shape (bs, T, heads, NE, NE) 189 | mask (tf): tensor with shape(bs, T, 1, NE) 190 | ''' 191 | with tf.variable_scope('stable_softmax'): 192 | # Subtract a big number from the masked logits so they don't interfere with computing the max value 193 | if mask is not None: 194 | mask = tf.expand_dims(mask, 2) 195 | logits -= (1.0 - mask) * 1e10 196 | 197 | # Subtract the max logit from everything so we don't overflow 198 | logits -= tf.reduce_max(logits, axis=-1, keepdims=True) 199 | unnormalized_p = tf.exp(logits) 200 | 201 | # Mask the unnormalized probibilities and then normalize and remask 202 | if mask is not None: 203 | unnormalized_p *= mask 204 | normalized_p = unnormalized_p / (tf.reduce_sum(unnormalized_p, axis=-1, keepdims=True) + 1e-10) 205 | if mask is not None: 206 | normalized_p *= mask 207 | return normalized_p 208 | 209 | 210 | def qkv_embed(inp, heads, n_embd, layer_norm=False, qk_w=1.0, v_w=0.01, reuse=False): 211 | ''' 212 | Compute queries, keys, and values 213 | Args: 214 | inp (tf) -- tensor w/ shape (bs, T, NE, features) 215 | heads (int) -- number of attention heads 216 | n_embd (int) -- dimension of queries, keys, and values will be n_embd / heads 217 | layer_norm (bool) -- normalize embedding prior to computing qkv 218 | qk_w (float) -- Initialization scale for keys and queries. Actual scale will be 219 | sqrt(qk_w / #input features) 220 | v_w (float) -- Initialization scale for values. 
Actual scale will be sqrt(v_w / #input features) 221 | reuse (bool) -- tf reuse 222 | ''' 223 | with tf.variable_scope('qkv_embed'): 224 | bs, T, NE, features = shape_list(inp) 225 | if layer_norm: 226 | with tf.variable_scope('pre_sa_layer_norm'): 227 | inp = tf.contrib.layers.layer_norm(inp, begin_norm_axis=3) 228 | 229 | # qk shape (bs x T x NE x h x n_embd/h) 230 | qk_scale = np.sqrt(qk_w / features) 231 | qk = tf.layers.dense(inp, 232 | n_embd * 2, 233 | kernel_initializer=tf.random_normal_initializer(stddev=qk_scale), 234 | reuse=reuse, 235 | name="qk_embed") # bs x T x n_embd*2 236 | qk = tf.reshape(qk, (bs, T, NE, heads, n_embd // heads, 2)) 237 | 238 | # (bs, T, NE, heads, features) 239 | query, key = [tf.squeeze(x, -1) for x in tf.split(qk, 2, -1)] 240 | 241 | v_scale = np.sqrt(v_w / features) 242 | value = tf.layers.dense(inp, 243 | n_embd, 244 | kernel_initializer=tf.random_normal_initializer(stddev=v_scale), 245 | reuse=reuse, 246 | name="v_embed") # bs x T x n_embd 247 | value = tf.reshape(value, (bs, T, NE, heads, n_embd // heads)) 248 | 249 | query = tf.transpose(query, (0, 1, 3, 2, 4), 250 | name="transpose_query") # (bs, T, heads, NE, n_embd / heads) 251 | key = tf.transpose(key, (0, 1, 3, 4, 2), 252 | name="transpose_key") # (bs, T, heads, n_embd / heads, NE) 253 | value = tf.transpose(value, (0, 1, 3, 2, 4), 254 | name="transpose_value") # (bs, T, heads, NE, n_embd / heads) 255 | 256 | return query, key, value 257 | 258 | 259 | ################## 260 | # 1D Convolution # 261 | ################## 262 | 263 | def circ_conv1d(inp, **conv_kwargs): 264 | valid_activations = {'relu': tf.nn.relu, 'tanh': tf.tanh, '': None} 265 | assert 'kernel_size' in conv_kwargs, f"Kernel size needs to be specified for circular convolution layer." 266 | conv_kwargs['activation'] = valid_activations[conv_kwargs['activation']] 267 | 268 | # concatenate input for circular convolution 269 | kernel_size = conv_kwargs['kernel_size'] 270 | num_pad = kernel_size // 2 271 | inp_shape = shape_list(inp) 272 | inp_rs = tf.reshape(inp, shape=[inp_shape[0] * inp_shape[1]] + inp_shape[2:]) # (BS * T, NE, feats) 273 | inp_padded = tf.concat([inp_rs[..., -num_pad:, :], inp_rs, inp_rs[..., :num_pad, :]], -2) 274 | out = tf.layers.conv1d(inp_padded, 275 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 276 | padding='valid', 277 | **conv_kwargs) 278 | 279 | out = tf.reshape(out, shape=inp_shape[:3] + [conv_kwargs['filters']]) 280 | return out 281 | 282 | ################## 283 | # Misc ########### 284 | ################## 285 | 286 | 287 | def layernorm(x, scope, epsilon=1e-5, reuse=False): 288 | ''' 289 | normalize state vector to be zero mean / unit variance + learned scale/shift 290 | ''' 291 | with tf.variable_scope(scope, reuse=reuse): 292 | n_state = x.get_shape()[-1] 293 | gain = tf.get_variable('gain', [n_state], initializer=tf.constant_initializer(1)) 294 | bias = tf.get_variable('bias', [n_state], initializer=tf.constant_initializer(0)) 295 | mean = tf.reduce_mean(x, axis=[-1], keep_dims=True) 296 | variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True) 297 | norm_x = (x - mean) * tf.rsqrt(variance + epsilon) 298 | return norm_x * gain + bias 299 | -------------------------------------------------------------------------------- /ma_policy/load_policy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import logging 5 | import sys 6 | import traceback 7 | import 
cloudpickle as pickle 8 | 9 | from ma_policy.ma_policy import MAPolicy 10 | 11 | 12 | def shape_list(x): 13 | ''' 14 | deal with dynamic shape in tensorflow cleanly 15 | ''' 16 | ps = x.get_shape().as_list() 17 | ts = tf.shape(x) 18 | return [ts[i] if ps[i] is None else ps[i] for i in range(len(ps))] 19 | 20 | 21 | def replace_base_scope(var_name, new_base_scope): 22 | split = var_name.split('/') 23 | split[0] = new_base_scope 24 | return os.path.normpath('/'.join(split)) 25 | 26 | 27 | def load_variables(policy, weights): 28 | weights = {os.path.normpath(key): value for key, value in weights.items()} 29 | weights = {replace_base_scope(key, policy.scope): value for key, value in weights.items()} 30 | assign_ops = [] 31 | for var in policy.get_variables(): 32 | var_name = os.path.normpath(var.name) 33 | if var_name not in weights: 34 | logging.warning(f"{var_name} was not found in weights dict. This will be reinitialized.") 35 | tf.get_default_session().run(var.initializer) 36 | else: 37 | try: 38 | assert np.all(np.array(shape_list(var)) == np.array(weights[var_name].shape)) 39 | assign_ops.append(var.assign(weights[var_name])) 40 | except Exception: 41 | traceback.print_exc(file=sys.stdout) 42 | print(f"Error assigning weights of shape {weights[var_name].shape} to {var}") 43 | sys.exit() 44 | tf.get_default_session().run(assign_ops) 45 | 46 | 47 | def load_policy(path, env=None, scope='policy'): 48 | ''' 49 | Load a policy. 50 | Args: 51 | path (string): policy path 52 | env (Gym.Env): This will update the observation space of the 53 | policy that is returned 54 | scope (string): The base scope for the policy variables 55 | ''' 56 | # TODO this will probably need to be changed when trying to run policy on GPU 57 | if tf.get_default_session() is None: 58 | tf_config = tf.ConfigProto( 59 | inter_op_parallelism_threads=1, 60 | intra_op_parallelism_threads=1) 61 | sess = tf.Session(config=tf_config) 62 | sess.__enter__() 63 | 64 | policy_dict = dict(np.load(path)) 65 | policy_fn_and_args_raw = pickle.loads(policy_dict['policy_fn_and_args']) 66 | policy_args = policy_fn_and_args_raw['args'] 67 | policy_args['scope'] = scope 68 | 69 | if env is not None: 70 | policy_args['ob_space'] = env.observation_space 71 | policy_args['ac_space'] = env.action_space 72 | 73 | policy = MAPolicy(**policy_args) 74 | del policy_dict['policy_fn_and_args'] 75 | 76 | load_variables(policy, policy_dict) 77 | return policy 78 | -------------------------------------------------------------------------------- /ma_policy/normalizers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def _mean_std_update_size(x, axes): 5 | x_shape = tf.shape(x) 6 | x_dims_to_reduce = tf.gather(x_shape, axes) 7 | size = tf.reduce_prod(x_dims_to_reduce) 8 | return size 9 | 10 | 11 | def _interpolate(old, new, old_weight, scaled_weight): 12 | return old * old_weight + new * scaled_weight 13 | 14 | 15 | def _std_from_mean_and_square(mean, square): 16 | var_est = tf.to_float(square) - tf.square(mean) 17 | return tf.sqrt(tf.maximum(var_est, 1e-2)) 18 | 19 | 20 | class EMAMeanStd(object): 21 | """ 22 | Calculates an Exponential Moving Average for each argument with 23 | exponential coefficient `beta`. 
The forward relation is: 24 | mean = beta * old_mean + (1.0 - beta) * observation 25 | The algorithm removes the bias introduced from setting ema[-1] = 0.0 26 | 27 | Note: `beta` parameter is defined with respect to a single observation within a batch 28 | if `per_element_update=True` (if a batch has 1000 elements of an observation, this is 29 | considered to be a 1000 updates), else it is considered to be the size of an update for a full 30 | batch (1 update if `per_element_update=False`). 31 | """ 32 | 33 | def __init__(self, beta, scope="ema", reuse=None, epsilon=1e-6, per_element_update=False, shape=(), version=1): 34 | self._version = version 35 | self._per_element_update = per_element_update 36 | with tf.variable_scope(scope, reuse=reuse): 37 | # Expected value of x 38 | self._biased_mean = tf.get_variable( 39 | dtype=tf.float32, 40 | shape=shape, 41 | initializer=tf.constant_initializer(0.0), 42 | name="mean", 43 | trainable=False) 44 | # Expected value of x^2 45 | self._biased_sq = tf.get_variable( 46 | dtype=tf.float32, 47 | shape=shape, 48 | initializer=tf.constant_initializer(0.0), 49 | name="sq", 50 | trainable=False) 51 | # How to integrate observations of x over time 52 | self._one_minus_beta = 1.0 - beta 53 | # Weight placed on ema[-1] == 0.0 which we divide out to debias 54 | self._debiasing_term = tf.get_variable( 55 | dtype=tf.float32, 56 | shape=shape, 57 | initializer=tf.constant_initializer(0.0), 58 | name="debiasing_term", 59 | trainable=False) 60 | self.shape = shape 61 | 62 | # the stored mean and square are biased due to setting ema[-1] = 0.0, 63 | # we correct for this by dividing by the debiasing term: 64 | self.mean = self._biased_mean / tf.maximum(self._debiasing_term, epsilon) 65 | self.std = _std_from_mean_and_square(mean=self.mean, square=self._biased_sq / tf.maximum(self._debiasing_term, epsilon)) 66 | 67 | def update_op(self, x, axes=(0,)): 68 | scaled_weight = tf.cast(self._one_minus_beta, tf.float64) 69 | if self._per_element_update: 70 | # many updates were done at once in a batch, so we figure out what power 71 | # to raise `1-beta` to. 72 | # using the fact that for small 1.0 - beta we have: 73 | # 1 - beta^N ~= (1.0 - beta) * N 74 | size = _mean_std_update_size(x, axes) 75 | scaled_weight *= tf.cast(size, tf.float64) 76 | one = tf.constant(1.0, dtype=tf.float64) 77 | old_weight = one - scaled_weight 78 | old_weight_fp32 = tf.to_float(old_weight) 79 | scaled_weight_fp32 = tf.to_float(scaled_weight) 80 | return tf.group( 81 | # increment the running debiasing term by the contribution of the initial ema[-1] == 0.0 observation 82 | # (e.g. 
boost the observed value by how much it was initially discounted on step 1) 83 | tf.assign(self._debiasing_term, tf.to_float(_interpolate(old=tf.cast(self._debiasing_term, tf.float64), new=one, old_weight=old_weight, scaled_weight=scaled_weight))), 84 | # do an interpolation on the expected value of X 85 | tf.assign(self._biased_mean, _interpolate(old=self._biased_mean, new=tf.reduce_mean(tf.to_float(x), axis=axes), old_weight=old_weight_fp32, scaled_weight=scaled_weight_fp32)), 86 | # do an interpolation on the expected value of X^2 87 | tf.assign(self._biased_sq, _interpolate(old=self._biased_sq, new=tf.reduce_mean(tf.square(tf.to_float(x)), axis=axes), old_weight=old_weight_fp32, scaled_weight=scaled_weight_fp32)), 88 | ) 89 | -------------------------------------------------------------------------------- /ma_policy/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def normc_initializer(std=1.0, axis=0): 6 | def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 7 | out = np.random.randn(*shape).astype(np.float32) 8 | out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) 9 | return tf.constant(out) 10 | return _initializer 11 | 12 | 13 | def listdict2dictnp(l, keepdims=False): 14 | ''' 15 | Convert a list of dicts of numpy arrays to a dict of numpy arrays. 16 | If keepdims is False the new outer dimension in each dict element will be 17 | the length of the list 18 | If keepdims is True, then the new outdimension in each dict will be the sum of the 19 | outer dimensions of each item in the list 20 | ''' 21 | if keepdims: 22 | return {k: np.concatenate([d[k] for d in l]) for k in l[0]} 23 | else: 24 | return {k: np.array([d[k] for d in l]) for k in l[0]} 25 | 26 | 27 | def shape_list(x): 28 | ''' 29 | deal with dynamic shape in tensorflow cleanly 30 | ''' 31 | ps = x.get_shape().as_list() 32 | ts = tf.shape(x) 33 | return [ts[i] if ps[i] is None else ps[i] for i in range(len(ps))] 34 | 35 | 36 | def l2_loss(pred, label, std, mask): 37 | ''' 38 | Masked L2 loss with a scaling paramter (std). We made the choice that 39 | the loss would scale with the number of unmasked data points rather 40 | than have the same magnitude regardless of how many samples came in. 41 | TODO: Revisit whether this is the right choice. 42 | ''' 43 | if mask is None: 44 | return 0.5 * tf.reduce_mean(tf.square((pred - label) / std)) 45 | else: 46 | return 0.5 * tf.reduce_mean(mask * tf.square((pred - label) / std)) 47 | -------------------------------------------------------------------------------- /ma_policy/variable_schema.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | BATCH = "batch" 5 | TIMESTEPS = "timesteps" 6 | 7 | 8 | class VariableSchema(object): 9 | def __init__(self, shape, dtype): 10 | """Creates a schema for a variable used in policy. 11 | Allows for symbolic definition of shape. Shape can consist of integers, as well as 12 | strings BATCH and TIMESTEPS. This is taken advantage of in the optimizers, to 13 | create placeholders or variables that asynchronously prefetch the inputs. 14 | 15 | Parameters 16 | ---------- 17 | shape: [int, np.int64, np.int32, or str] 18 | shape of the variable, e.g. [12, 4], [BATCH, 12], [BATCH, 'timestep'] 19 | dtype: 20 | tensorflow type of the variable, e.g. 
tf.float32, tf.int32 21 | """ 22 | assert all(isinstance(s, (int, np.int64, np.int32)) or s in [BATCH, TIMESTEPS] for s in shape), 'Bad shape %s' % shape 23 | self.shape = shape 24 | self.dtype = tf.as_dtype(dtype) 25 | 26 | def _substituted_shape(self, batch=None, timesteps=None): 27 | feeds = dict(batch=batch, timesteps=timesteps) 28 | return [feeds.get(v, v) for v in self.shape] 29 | 30 | def substitute(self, *, batch=BATCH, timesteps=TIMESTEPS): 31 | """Make a new VariableSchema with batch or timesteps optionally filled in.""" 32 | # Coerse None to default value. 33 | batch = batch or BATCH 34 | timesteps = timesteps or TIMESTEPS 35 | shape = self._substituted_shape(batch, timesteps) 36 | return VariableSchema(shape=shape, dtype=self.dtype) 37 | 38 | def placeholder(self, *, batch=None, timesteps=None, name=None): 39 | real_shape = self._substituted_shape(batch, timesteps) 40 | return tf.placeholder(self.dtype, real_shape, name=name) 41 | 42 | def variable(self, *, name, batch=None, timesteps=None, **kwargs): 43 | real_shape = self._substituted_shape(batch, timesteps) 44 | assert None not in real_shape 45 | return tf.get_variable(name, real_shape, self.dtype, **kwargs) 46 | 47 | def np_zeros(self, *, batch=None, timesteps=None, **kwargs): 48 | real_shape = self._substituted_shape(batch, timesteps) 49 | np_dtype = self.dtype.as_numpy_dtype 50 | return np.zeros(shape=real_shape, dtype=np_dtype, **kwargs) 51 | 52 | def match_shape(self, shape, *, batch=None, timesteps=None): 53 | expected = self._substituted_shape(batch, timesteps) 54 | if len(expected) != len(shape): 55 | return False 56 | for expected, actual in zip(expected, shape): 57 | if expected is not None and expected != actual: 58 | return False 59 | return True 60 | -------------------------------------------------------------------------------- /mae_envs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mae_envs/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mae_envs/envs/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | from mujoco_worldgen import Floor, WorldBuilder, WorldParams, Env 4 | from mae_envs.wrappers.multi_agent import (SplitMultiAgentActions, SplitObservations, 5 | SelectKeysWrapper) 6 | from mae_envs.wrappers.util import DiscretizeActionWrapper, DiscardMujocoExceptionEpisodes 7 | from mae_envs.wrappers.line_of_sight import AgentAgentObsMask2D 8 | from mae_envs.modules.agents import Agents 9 | from mae_envs.modules.walls import RandomWalls 10 | from mae_envs.modules.objects import Boxes, Ramps 11 | 12 | 13 | class Base(Env): 14 | ''' 15 | Multi-agent Base Environment. 16 | Args: 17 | horizon (int): Number of steps agent gets to act 18 | n_substeps (int): Number of internal mujoco steps per outer environment step; 19 | essentially this is action repeat. 20 | n_agents (int): number of agents in the environment 21 | floor_size (float or (float, float)): size of the floor. 
If a list of 2 floats, the floorsize 22 | will be randomized between them on each episode 23 | grid_size (int): size of the grid that we'll use to place objects on the floor 24 | action_lims (float tuple): lower and upper limit of mujoco actions 25 | deterministic_mode (bool): if True, seeds are incremented rather than randomly sampled. 26 | ''' 27 | def __init__(self, horizon=250, n_substeps=5, n_agents=2, 28 | floor_size=6., grid_size=30, 29 | action_lims=(-1.0, 1.0), deterministic_mode=False, 30 | **kwargs): 31 | super().__init__(get_sim=self._get_sim, 32 | get_obs=self._get_obs, 33 | action_space=tuple(action_lims), 34 | horizon=horizon, 35 | deterministic_mode=deterministic_mode) 36 | self.n_agents = n_agents 37 | self.metadata = {} 38 | self.metadata['n_actors'] = n_agents 39 | self.horizon = horizon 40 | self.n_substeps = n_substeps 41 | if not isinstance(floor_size, (tuple, list, np.ndarray)): 42 | self.floor_size_dist = [floor_size, floor_size] 43 | else: 44 | self.floor_size_dist = floor_size 45 | self.grid_size = grid_size 46 | self.kwargs = kwargs 47 | self.placement_grid = np.zeros((grid_size, grid_size)) 48 | self.modules = [] 49 | 50 | def add_module(self, module): 51 | self.modules.append(module) 52 | 53 | def _get_obs(self, sim): 54 | ''' 55 | Loops through modules, calls their observation_step functions, and 56 | adds the result to the observation dictionary. 57 | ''' 58 | obs = {} 59 | for module in self.modules: 60 | obs.update(module.observation_step(self, self.sim)) 61 | return obs 62 | 63 | def _get_sim(self, seed): 64 | ''' 65 | Calls build_world_step and then modify_sim_step for each module. If 66 | a build_world_step failed, then restarts. 67 | ''' 68 | self.floor_size = np.random.uniform(self.floor_size_dist[0], self.floor_size_dist[1]) 69 | self.metadata['floor_size'] = self.floor_size 70 | world_params = WorldParams(size=(self.floor_size, self.floor_size, 2.5), 71 | num_substeps=self.n_substeps) 72 | successful_placement = False 73 | failures = 0 74 | while not successful_placement: 75 | if (failures + 1) % 10 == 0: 76 | logging.warning(f"Failed {failures} times in creating environment") 77 | builder = WorldBuilder(world_params, seed) 78 | floor = Floor() 79 | 80 | builder.append(floor) 81 | 82 | self.placement_grid = np.zeros((self.grid_size, self.grid_size)) 83 | 84 | successful_placement = np.all([module.build_world_step(self, floor, self.floor_size) 85 | for module in self.modules]) 86 | failures += 1 87 | 88 | sim = builder.get_sim() 89 | 90 | for module in self.modules: 91 | module.modify_sim_step(self, sim) 92 | 93 | return sim 94 | 95 | 96 | def make_env(n_substeps=5, horizon=250, deterministic_mode=False, n_agents=2, 97 | n_boxes=2, n_ramps=1): 98 | ''' 99 | This make_env function is not used anywhere; it exists to provide a simple, bare-bones 100 | example of how to construct a multi-agent environment using the modules framework. 
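You can view this environment interactively by running `bin/examine.py base` (see the README for details).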
101 | ''' 102 | env = Base(n_agents=n_agents, n_substeps=n_substeps, horizon=horizon, 103 | deterministic_mode=deterministic_mode) 104 | env.add_module(RandomWalls(grid_size=30, num_rooms=4, min_room_size=6, door_size=2)) 105 | if n_boxes > 0: 106 | env.add_module(Boxes(n_boxes=n_boxes)) 107 | if n_ramps > 0: 108 | env.add_module(Ramps(n_ramps=n_ramps)) 109 | env.add_module(Agents(n_agents)) 110 | env.reset() 111 | keys_self = ['agent_qpos_qvel'] 112 | keys_mask_self = ['mask_aa_obs'] 113 | keys_external = ['agent_qpos_qvel'] 114 | keys_mask_external = [] 115 | env = SplitMultiAgentActions(env) 116 | env = DiscretizeActionWrapper(env, 'action_movement') 117 | env = AgentAgentObsMask2D(env) 118 | env = SplitObservations(env, keys_self + keys_mask_self) 119 | env = SelectKeysWrapper(env, keys_self=keys_self, 120 | keys_other=keys_external + keys_mask_self + keys_mask_external) 121 | env = DiscardMujocoExceptionEpisodes(env) 122 | return env 123 | -------------------------------------------------------------------------------- /mae_envs/envs/blueprint_construction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from mae_envs.wrappers.multi_agent import (SplitMultiAgentActions, SplitObservations, 4 | SelectKeysWrapper) 5 | from mae_envs.wrappers.util import (DiscretizeActionWrapper, MaskActionWrapper, 6 | DiscardMujocoExceptionEpisodes, SpoofEntityWrapper, 7 | AddConstantObservationsWrapper, 8 | ConcatenateObsWrapper, NumpyArrayRewardWrapper) 9 | from mae_envs.wrappers.manipulation import (GrabObjWrapper, GrabClosestWrapper, 10 | LockObjWrapper, LockAllWrapper) 11 | from mae_envs.wrappers.lidar import Lidar 12 | from mae_envs.wrappers.team import TeamMembership 13 | from mae_envs.wrappers.line_of_sight import AgentAgentObsMask2D, AgentGeomObsMask2D 14 | from mae_envs.envs.base import Base 15 | from mae_envs.modules.agents import Agents, AgentManipulation 16 | from mae_envs.modules.construction_sites import ConstructionSites 17 | from mae_envs.modules.walls import WallScenarios, RandomWalls 18 | from mae_envs.modules.objects import Boxes, LidarSites 19 | from mae_envs.modules.world import FloorAttributes, WorldConstants 20 | from mae_envs.modules.util import (uniform_placement, center_placement, 21 | uniform_placement_middle) 22 | 23 | 24 | class ConstructionDistancesWrapper(gym.ObservationWrapper): 25 | ''' 26 | Calculates the distance between every pair of boxes, between boxes and 27 | construction sites, and between box corners and construction site corners. 28 | This wrapper should be only be applied if the both the Boxes module (with 29 | mark_box_corners set to True) and the ConstructionSites module have been 30 | added to the environment. 
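The added observation keys are 'box_box_dist', 'box_site_dist' and 'boxcorner_sitecorner_dist'.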
31 | ''' 32 | def __init__(self, env): 33 | super().__init__(env) 34 | 35 | def observation(self, obs): 36 | box_xpos = obs['box_xpos'] 37 | boxcorner_pos = obs['box_corner_pos'] 38 | site_pos = obs['construction_site_pos'] 39 | sitecorner_pos = obs['construction_site_corner_pos'] 40 | 41 | box_box_dist = np.linalg.norm(box_xpos[..., None] - box_xpos.T[None, ...], axis=1) 42 | box_site_dist = np.linalg.norm(box_xpos[..., None] - site_pos.T[None, ...], axis=1) 43 | boxcorner_sitecorner_dist = ( 44 | np.linalg.norm(boxcorner_pos[..., None] - sitecorner_pos.T[None, ...], axis=1)) 45 | 46 | obs.update({'box_box_dist': box_box_dist, 47 | 'box_site_dist': box_site_dist, 48 | 'boxcorner_sitecorner_dist': boxcorner_sitecorner_dist}) 49 | 50 | return obs 51 | 52 | 53 | class ConstructionDenseRewardWrapper(gym.Wrapper): 54 | ''' 55 | Adds a dense reward for placing the boxes at the construction site locations. 56 | Reward is based on the smoothmin distance between each site and all the boxes. 57 | Args: 58 | use_corners (bool): Whether to calculate reward based solely on the distances 59 | between box centers and site centers, or also based on the distances 60 | between box corners and site corners. 61 | alpha (float): Smoothing parameter. Should be nonpositive. 62 | reward_scale (float): scales the reward by this factor 63 | ''' 64 | def __init__(self, env, use_corners=False, alpha=-8, reward_scale=1): 65 | super().__init__(env) 66 | assert alpha < 0, 'alpha must be negative for the SmoothMin function to work' 67 | self.alpha = alpha 68 | self.reward_scale = reward_scale 69 | self.use_corners = use_corners 70 | 71 | def step(self, action): 72 | obs, rew, done, info = self.env.step(action) 73 | box_site_dist = (obs['boxcorner_sitecorner_dist'] 74 | if self.use_corners 75 | else obs['box_site_dist']) 76 | scaling_factors = np.exp(self.alpha * box_site_dist) 77 | site_box_smoothmin_dists = (np.sum(box_site_dist * scaling_factors, axis=0) / 78 | np.sum(scaling_factors, axis=0)) 79 | rew -= np.mean(site_box_smoothmin_dists) * self.reward_scale 80 | return obs, rew, done, info 81 | 82 | 83 | class ConstructionCompletedRewardWrapper(gym.Wrapper): 84 | ''' 85 | Adds a sparse reward and ends the episode after all construction sites have been 86 | 'activated' by having a box within a certain distance of them. The reward is based 87 | on the number of construction sites in the episode. 88 | Args: 89 | use_corners (bool): Whether to calculate if construction is finished based 90 | solely on the distances between box centers and site centers, or also 91 | based on the distances between box corners and site corners. 92 | site_activation_radius (float): a site is considered 'activated' if there is 93 | at least one box within the site activation radius. 
94 | reward_scale (float): scales the reward by this factor 95 | ''' 96 | def __init__(self, env, use_corners=False, site_activation_radius=0.2, reward_scale=1): 97 | super().__init__(env) 98 | self.n_sites = self.metadata['curr_n_sites'] 99 | self.site_activation_radius = site_activation_radius 100 | self.reward_scale = reward_scale 101 | self.use_corners = use_corners 102 | 103 | def reset(self): 104 | obs = self.env.reset() 105 | self.n_sites = self.metadata['curr_n_sites'] 106 | return obs 107 | 108 | def step(self, action): 109 | obs, rew, done, info = self.env.step(action) 110 | site_dist_to_closest_box = obs['box_site_dist'].min(axis=0) 111 | sitecorner_dist_to_closest_boxcorner = obs['boxcorner_sitecorner_dist'].min(axis=0) 112 | activated_sites = site_dist_to_closest_box < self.site_activation_radius 113 | aligned_corners = sitecorner_dist_to_closest_boxcorner < self.site_activation_radius 114 | 115 | all_sites_activated = np.all(activated_sites) 116 | all_corners_aligned = np.all(aligned_corners) 117 | construction_completed = ((all_sites_activated and not self.use_corners) or 118 | (all_sites_activated and all_corners_aligned)) 119 | 120 | if construction_completed: 121 | rew += self.n_sites * self.reward_scale 122 | done = True 123 | 124 | return obs, rew, done, info 125 | 126 | 127 | def make_env(n_substeps=15, horizon=80, deterministic_mode=False, 128 | floor_size=6.0, grid_size=30, 129 | n_agents=1, 130 | n_rooms=4, random_room_number=True, scenario='empty', door_size=2, 131 | n_sites=3, n_elongated_sites=0, site_placement='uniform_away_from_walls', 132 | reward_infos=[{'type': 'construction_dense'}], 133 | n_boxes=2, n_elongated_boxes=0, 134 | n_min_boxes=None, box_size=0.5, box_only_z_rot=False, 135 | lock_box=True, grab_box=True, grab_selective=False, lock_grab_radius=0.25, 136 | lock_type='any_lock_specific', grab_exclusive=False, 137 | grab_out_of_vision=False, lock_out_of_vision=True, 138 | box_floor_friction=0.2, other_friction=0.01, gravity=[0, 0, -50], 139 | action_lims=(-0.9, 0.9), polar_obs=True, 140 | n_lidar_per_agent=0, visualize_lidar=False, compress_lidar_scale=None, 141 | boxid_obs=True, boxsize_obs=True, team_size_obs=False, additional_obs={}): 142 | 143 | grab_radius_multiplier = lock_grab_radius / box_size 144 | lock_radius_multiplier = lock_grab_radius / box_size 145 | 146 | if type(n_sites) not in [list, np.ndarray]: 147 | n_sites = [n_sites, n_sites] 148 | 149 | env = Base(n_agents=n_agents, n_substeps=n_substeps, horizon=horizon, 150 | floor_size=floor_size, grid_size=grid_size, 151 | action_lims=action_lims, deterministic_mode=deterministic_mode) 152 | 153 | if scenario == 'randomwalls': 154 | env.add_module(RandomWalls(grid_size=grid_size, num_rooms=n_rooms, 155 | random_room_number=random_room_number, min_room_size=6, 156 | door_size=door_size, gen_door_obs=False)) 157 | elif scenario == 'empty': 158 | env.add_module(WallScenarios(grid_size=grid_size, door_size=door_size, 159 | scenario='empty', 160 | friction=other_friction)) 161 | 162 | env.add_module(Agents(n_agents, 163 | placement_fn=uniform_placement, 164 | color=[np.array((66., 235., 244., 255.)) / 255] * n_agents, 165 | friction=other_friction, 166 | polar_obs=polar_obs)) 167 | if np.max(n_boxes) > 0: 168 | env.add_module(Boxes(n_boxes=n_boxes, placement_fn=uniform_placement, 169 | friction=box_floor_friction, polar_obs=polar_obs, 170 | n_elongated_boxes=n_elongated_boxes, 171 | boxid_obs=boxid_obs, boxsize_obs=boxsize_obs, 172 | box_size=box_size, 173 | box_only_z_rot=box_only_z_rot, 174 | 
mark_box_corners=True)) 175 | if n_sites[1] > 0: 176 | if site_placement == 'center': 177 | site_placement_fn = center_placement 178 | elif site_placement == 'uniform': 179 | site_placement_fn = uniform_placement 180 | elif site_placement == 'uniform_away_from_walls': 181 | site_placement_fn = uniform_placement_middle(0.85) 182 | else: 183 | raise ValueError(f'Site placement option: {site_placement} not implemented.' 184 | ' Please choose from center, uniform and uniform_away_from_walls.') 185 | 186 | env.add_module(ConstructionSites(n_sites, placement_fn=site_placement_fn, 187 | site_size=box_size, site_height=box_size / 2, 188 | n_elongated_sites=n_elongated_sites)) 189 | if n_lidar_per_agent > 0 and visualize_lidar: 190 | env.add_module(LidarSites(n_agents=n_agents, n_lidar_per_agent=n_lidar_per_agent)) 191 | if np.max(n_boxes) > 0 and grab_box: 192 | env.add_module(AgentManipulation()) 193 | if box_floor_friction is not None: 194 | env.add_module(FloorAttributes(friction=box_floor_friction)) 195 | env.add_module(WorldConstants(gravity=gravity)) 196 | env.reset() 197 | keys_self = ['agent_qpos_qvel', 'hider', 'prep_obs'] 198 | keys_mask_self = ['mask_aa_obs'] 199 | keys_external = ['agent_qpos_qvel', 'construction_site_obs'] 200 | keys_copy = ['you_lock', 'team_lock', 'ramp_you_lock', 'ramp_team_lock'] 201 | keys_mask_external = [] 202 | 203 | env = AddConstantObservationsWrapper(env, new_obs=additional_obs) 204 | keys_external += list(additional_obs) 205 | keys_mask_external += [ob for ob in additional_obs if 'mask' in ob] 206 | 207 | env = SplitMultiAgentActions(env) 208 | if team_size_obs: 209 | keys_self += ['team_size'] 210 | env = TeamMembership(env, np.zeros((n_agents,))) 211 | env = AgentAgentObsMask2D(env) 212 | env = DiscretizeActionWrapper(env, 'action_movement') 213 | if np.max(n_boxes) > 0: 214 | env = AgentGeomObsMask2D(env, pos_obs_key='box_pos', mask_obs_key='mask_ab_obs', 215 | geom_idxs_obs_key='box_geom_idxs') 216 | keys_external += ['mask_ab_obs', 'box_obs'] 217 | keys_mask_external.append('mask_ab_obs') 218 | if lock_box and np.max(n_boxes) > 0: 219 | agent_allowed_to_lock_keys = None if lock_out_of_vision else ["mask_ab_obs"] 220 | env = LockObjWrapper(env, body_names=[f'moveable_box{i}' for i in range(n_boxes)], 221 | agent_idx_allowed_to_lock=np.arange(n_agents), 222 | lock_type=lock_type, 223 | radius_multiplier=lock_radius_multiplier, 224 | obj_in_game_metadata_keys=["curr_n_boxes"], 225 | agent_allowed_to_lock_keys=agent_allowed_to_lock_keys) 226 | if grab_box and np.max(n_boxes) > 0: 227 | env = GrabObjWrapper(env, [f'moveable_box{i}' for i in range(n_boxes)], 228 | radius_multiplier=grab_radius_multiplier, 229 | grab_exclusive=grab_exclusive, 230 | obj_in_game_metadata_keys=['curr_n_boxes']) 231 | 232 | if n_lidar_per_agent > 0: 233 | env = Lidar(env, n_lidar_per_agent=n_lidar_per_agent, visualize_lidar=visualize_lidar, 234 | compress_lidar_scale=compress_lidar_scale) 235 | keys_copy += ['lidar'] 236 | keys_external += ['lidar'] 237 | 238 | env = ConstructionDistancesWrapper(env) 239 | env = NumpyArrayRewardWrapper(env) 240 | 241 | reward_wrappers = { 242 | 'construction_dense': ConstructionDenseRewardWrapper, 243 | 'construction_completed': ConstructionCompletedRewardWrapper, 244 | } 245 | 246 | for rew_info in reward_infos: 247 | rew_type = rew_info['type'] 248 | del rew_info['type'] 249 | env = reward_wrappers[rew_type](env, **rew_info) 250 | 251 | env = SplitObservations(env, keys_self + keys_mask_self, keys_copy=keys_copy) 252 | if n_agents == 1: 253 | 
env = SpoofEntityWrapper(env, 2, ['agent_qpos_qvel', 'hider', 'prep_obs'], ['mask_aa_obs']) 254 | env = SpoofEntityWrapper(env, n_boxes, 255 | ['box_obs', 'you_lock', 'team_lock', 'obj_lock'], 256 | ['mask_ab_obs']) 257 | env = SpoofEntityWrapper(env, n_sites[1], ['construction_site_obs'], ['mask_acs_obs']) 258 | keys_mask_external += ['mask_ab_obs_spoof', 'mask_acs_obs_spoof'] 259 | env = LockAllWrapper(env, remove_object_specific_lock=True) 260 | if not grab_out_of_vision and grab_box: 261 | env = MaskActionWrapper(env, 'action_pull', ['mask_ab_obs']) # Can only pull if in vision 262 | if not grab_selective and grab_box: 263 | env = GrabClosestWrapper(env) 264 | env = DiscardMujocoExceptionEpisodes(env) 265 | env = ConcatenateObsWrapper(env, {'agent_qpos_qvel': ['agent_qpos_qvel', 'hider', 'prep_obs'], 266 | 'box_obs': ['box_obs', 'you_lock', 'team_lock', 'obj_lock']}) 267 | env = SelectKeysWrapper(env, keys_self=keys_self, 268 | keys_other=keys_external + keys_mask_self + keys_mask_external) 269 | return env 270 | -------------------------------------------------------------------------------- /mae_envs/envs/shelter_construction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from mujoco_worldgen.util.types import store_args 4 | from mujoco_worldgen.util.geometry import raycast 5 | from mae_envs.wrappers.multi_agent import (SplitMultiAgentActions, SplitObservations, 6 | SelectKeysWrapper) 7 | from mae_envs.wrappers.util import (DiscretizeActionWrapper, MaskActionWrapper, 8 | DiscardMujocoExceptionEpisodes, SpoofEntityWrapper, 9 | AddConstantObservationsWrapper, 10 | ConcatenateObsWrapper) 11 | from mae_envs.wrappers.manipulation import (GrabObjWrapper, GrabClosestWrapper, 12 | LockObjWrapper, LockAllWrapper) 13 | from mae_envs.wrappers.lidar import Lidar 14 | from mae_envs.wrappers.team import TeamMembership 15 | from mae_envs.wrappers.line_of_sight import AgentAgentObsMask2D, AgentGeomObsMask2D 16 | from mae_envs.envs.base import Base 17 | from mae_envs.modules.agents import Agents, AgentManipulation 18 | from mae_envs.modules.walls import WallScenarios 19 | from mae_envs.modules.objects import Boxes, Cylinders, LidarSites 20 | from mae_envs.modules.world import FloorAttributes, WorldConstants 21 | from mae_envs.modules.util import (uniform_placement, center_placement, 22 | uniform_placement_middle) 23 | 24 | 25 | class ShelterRewardWrapper(gym.Wrapper): 26 | ''' 27 | Reward wrapper for the shelter construction task. There are invisible rays 28 | going from the edge of the playing area to the cylinder that needs to be 29 | guarded; at each timestep the agent receives negative reward proportional 30 | to the number of rays that make contact with the cylinder. 31 | Args: 32 | num_rays_per_side (int): Number of rays that shoot out of each side of the 33 | square playing area. The ray starting points are spaced out evenly. 
34 | reward_scale (float): scales the reward by this factor 35 | ''' 36 | @store_args 37 | def __init__(self, env, num_rays_per_side=30, reward_scale=1): 38 | super().__init__(env) 39 | self.ray_start_points = [] 40 | 41 | grid_cell_size = self.unwrapped.floor_size / self.unwrapped.grid_size 42 | # start points for the rays should not be exactly on the edge of the floor, 43 | # so that they do not hit the outside walls 44 | sp_min_xy = 1.01 * grid_cell_size 45 | sp_max_xy = self.unwrapped.floor_size - (1.01 * grid_cell_size) 46 | for i in range(num_rays_per_side): 47 | sp_offset = i / num_rays_per_side * (sp_max_xy - sp_min_xy) 48 | new_start_points = [(sp_min_xy + sp_offset, sp_min_xy, 0), 49 | (sp_max_xy, sp_min_xy + sp_offset, 0), 50 | (sp_max_xy - sp_offset, sp_max_xy, 0), 51 | (sp_min_xy, sp_max_xy - sp_offset, 0)] 52 | self.ray_start_points.extend(new_start_points) 53 | 54 | self.ray_start_points = np.array(self.ray_start_points) 55 | 56 | def reset(self): 57 | obs = self.env.reset() 58 | self.sim = self.unwrapped.sim 59 | return obs 60 | 61 | def step(self, action): 62 | obs, rew, done, info = self.env.step(action) 63 | target_geom = obs['static_cylinder_geom_idxs'][0, 0] 64 | rew = rew + np.zeros((self.unwrapped.n_agents, 1)) 65 | for pt in self.ray_start_points: 66 | _, collision_geom = raycast(self.sim, pt1=pt, geom2_id=target_geom) 67 | if collision_geom == target_geom: 68 | rew -= 1 69 | 70 | rew *= self.reward_scale 71 | return obs, rew, done, info 72 | 73 | 74 | def make_env(n_substeps=15, horizon=80, deterministic_mode=False, 75 | floor_size=6.0, grid_size=30, 76 | n_agents=1, 77 | objective_diameter=[1, 1], objective_placement='center', 78 | num_rays_per_side=25, shelter_reward_scale=1, 79 | n_boxes=2, n_elongated_boxes=0, 80 | box_size=0.5, box_only_z_rot=False, 81 | lock_box=True, grab_box=True, grab_selective=False, lock_grab_radius=0.25, 82 | lock_type='any_lock_specific', grab_exclusive=False, 83 | grab_out_of_vision=False, lock_out_of_vision=True, 84 | box_floor_friction=0.2, other_friction=0.01, gravity=[0, 0, -50], 85 | action_lims=(-0.9, 0.9), polar_obs=True, 86 | n_lidar_per_agent=0, visualize_lidar=False, compress_lidar_scale=None, 87 | boxid_obs=True, boxsize_obs=True, team_size_obs=False, additional_obs={}): 88 | 89 | grab_radius_multiplier = lock_grab_radius / box_size 90 | lock_radius_multiplier = lock_grab_radius / box_size 91 | 92 | env = Base(n_agents=n_agents, n_substeps=n_substeps, horizon=horizon, 93 | floor_size=floor_size, grid_size=grid_size, 94 | action_lims=action_lims, deterministic_mode=deterministic_mode) 95 | 96 | env.add_module(WallScenarios(grid_size=grid_size, door_size=2, scenario='empty', 97 | friction=other_friction)) 98 | 99 | if objective_placement == 'center': 100 | objective_placement_fn = center_placement 101 | elif objective_placement == 'uniform_away_from_walls': 102 | objective_placement_fn = uniform_placement_middle(0.7) 103 | 104 | env.add_module(Cylinders(1, diameter=objective_diameter, height=box_size, 105 | make_static=True, placement_fn=objective_placement_fn)) 106 | 107 | env.add_module(Agents(n_agents, 108 | placement_fn=uniform_placement, 109 | color=[np.array((66., 235., 244., 255.)) / 255] * n_agents, 110 | friction=other_friction, 111 | polar_obs=polar_obs)) 112 | if np.max(n_boxes) > 0: 113 | env.add_module(Boxes(n_boxes=n_boxes, placement_fn=uniform_placement, 114 | friction=box_floor_friction, polar_obs=polar_obs, 115 | n_elongated_boxes=n_elongated_boxes, 116 | boxid_obs=boxid_obs, boxsize_obs=boxsize_obs, 117 | 
box_size=box_size, 118 | box_only_z_rot=box_only_z_rot)) 119 | if n_lidar_per_agent > 0 and visualize_lidar: 120 | env.add_module(LidarSites(n_agents=n_agents, n_lidar_per_agent=n_lidar_per_agent)) 121 | 122 | env.add_module(AgentManipulation()) 123 | if box_floor_friction is not None: 124 | env.add_module(FloorAttributes(friction=box_floor_friction)) 125 | env.add_module(WorldConstants(gravity=gravity)) 126 | env.reset() 127 | keys_self = ['agent_qpos_qvel', 'hider', 'prep_obs'] 128 | keys_mask_self = ['mask_aa_obs'] 129 | keys_external = ['agent_qpos_qvel'] 130 | keys_copy = ['you_lock', 'team_lock', 'ramp_you_lock', 'ramp_team_lock'] 131 | keys_mask_external = [] 132 | 133 | env = AddConstantObservationsWrapper(env, new_obs=additional_obs) 134 | keys_external += list(additional_obs) 135 | keys_mask_external += [ob for ob in additional_obs if 'mask' in ob] 136 | 137 | env = ShelterRewardWrapper(env, num_rays_per_side=num_rays_per_side, 138 | reward_scale=shelter_reward_scale) 139 | env = SplitMultiAgentActions(env) 140 | 141 | if team_size_obs: 142 | keys_self += ['team_size'] 143 | env = TeamMembership(env, np.zeros((n_agents,))) 144 | env = AgentAgentObsMask2D(env) 145 | env = DiscretizeActionWrapper(env, 'action_movement') 146 | if np.max(n_boxes) > 0: 147 | env = AgentGeomObsMask2D(env, pos_obs_key='box_pos', mask_obs_key='mask_ab_obs', 148 | geom_idxs_obs_key='box_geom_idxs') 149 | keys_external += ['mask_ab_obs', 'box_obs'] 150 | keys_mask_external.append('mask_ab_obs') 151 | if lock_box and np.max(n_boxes) > 0: 152 | env = LockObjWrapper(env, body_names=[f'moveable_box{i}' for i in range(n_boxes)], 153 | agent_idx_allowed_to_lock=np.arange(n_agents), 154 | lock_type=lock_type, 155 | radius_multiplier=lock_radius_multiplier, 156 | obj_in_game_metadata_keys=["curr_n_boxes"], 157 | agent_allowed_to_lock_keys=None if lock_out_of_vision else ["mask_ab_obs"]) 158 | 159 | if grab_box and np.max(n_boxes) > 0: 160 | env = GrabObjWrapper(env, [f'moveable_box{i}' for i in range(n_boxes)], 161 | radius_multiplier=grab_radius_multiplier, 162 | grab_exclusive=grab_exclusive, 163 | obj_in_game_metadata_keys=['curr_n_boxes']) 164 | 165 | if n_lidar_per_agent > 0: 166 | env = Lidar(env, n_lidar_per_agent=n_lidar_per_agent, visualize_lidar=visualize_lidar, 167 | compress_lidar_scale=compress_lidar_scale) 168 | keys_copy += ['lidar'] 169 | keys_external += ['lidar'] 170 | 171 | env = SplitObservations(env, keys_self + keys_mask_self, keys_copy=keys_copy) 172 | if n_agents == 1: 173 | env = SpoofEntityWrapper(env, 2, ['agent_qpos_qvel', 'hider', 'prep_obs'], ['mask_aa_obs']) 174 | env = SpoofEntityWrapper(env, n_boxes, ['box_obs', 'you_lock', 'team_lock', 'obj_lock'], ['mask_ab_obs']) 175 | keys_mask_external += ['mask_ab_obs_spoof'] 176 | env = LockAllWrapper(env, remove_object_specific_lock=True) 177 | if not grab_out_of_vision and grab_box: 178 | env = MaskActionWrapper(env, 'action_pull', ['mask_ab_obs']) # Can only pull if in vision 179 | if not grab_selective and grab_box: 180 | env = GrabClosestWrapper(env) 181 | env = DiscardMujocoExceptionEpisodes(env) 182 | env = ConcatenateObsWrapper(env, {'agent_qpos_qvel': ['agent_qpos_qvel', 'hider', 'prep_obs'], 183 | 'box_obs': ['box_obs', 'you_lock', 'team_lock', 'obj_lock']}) 184 | env = SelectKeysWrapper(env, keys_self=keys_self, 185 | keys_other=keys_external + keys_mask_self + keys_mask_external) 186 | return env 187 | -------------------------------------------------------------------------------- /mae_envs/modules/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .module import * 2 | from .util import * 3 | -------------------------------------------------------------------------------- /mae_envs/modules/agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_worldgen.util.types import store_args 3 | from mujoco_worldgen.util.sim_funcs import (qpos_idxs_from_joint_prefix, 4 | qvel_idxs_from_joint_prefix) 5 | from mujoco_worldgen.transforms import set_geom_attr_transform 6 | from mujoco_worldgen.util.rotation import normalize_angles 7 | from mae_envs.util.transforms import (add_weld_equality_constraint_transform, 8 | set_joint_damping_transform) 9 | from mae_envs.modules import EnvModule, rejection_placement, get_size_from_xml 10 | from mujoco_worldgen import ObjFromXML 11 | 12 | 13 | class Agents(EnvModule): 14 | ''' 15 | Add Agents to the environment. 16 | Args: 17 | n_agents (int): number of agents 18 | placement_fn (fn or list of fns): See mae_envs.modules.util:rejection_placement for 19 | spec. If list of functions, then it is assumed there is one function given 20 | per agent 21 | color (tuple or list of tuples): rgba for agent. If list of tuples, then it is 22 | assumed there is one color given per agent 23 | friction (float): agent friction 24 | damp_z (bool): if False, reduce z damping to 1 25 | polar_obs (bool): Give observations about rotation in polar coordinates 26 | ''' 27 | @store_args 28 | def __init__(self, n_agents, placement_fn=None, color=None, friction=None, 29 | damp_z=False, polar_obs=True): 30 | pass 31 | 32 | def build_world_step(self, env, floor, floor_size): 33 | env.metadata['n_agents'] = self.n_agents 34 | successful_placement = True 35 | 36 | for i in range(self.n_agents): 37 | env.metadata.pop(f"agent{i}_initpos", None) 38 | 39 | for i in range(self.n_agents): 40 | obj = ObjFromXML("particle_hinge", name=f"agent{i}") 41 | if self.friction is not None: 42 | obj.add_transform(set_geom_attr_transform('friction', self.friction)) 43 | if self.color is not None: 44 | _color = (self.color[i] 45 | if isinstance(self.color[0], (list, tuple, np.ndarray)) 46 | else self.color) 47 | obj.add_transform(set_geom_attr_transform('rgba', _color)) 48 | if not self.damp_z: 49 | obj.add_transform(set_joint_damping_transform(1, 'tz')) 50 | 51 | if self.placement_fn is not None: 52 | _placement_fn = (self.placement_fn[i] 53 | if isinstance(self.placement_fn, list) 54 | else self.placement_fn) 55 | obj_size = get_size_from_xml(obj) 56 | pos, pos_grid = rejection_placement(env, _placement_fn, floor_size, obj_size) 57 | if pos is not None: 58 | floor.append(obj, placement_xy=pos) 59 | # store spawn position in metadata. 
This allows sampling subsequent agents 60 | # close to previous agents 61 | env.metadata[f"agent{i}_initpos"] = pos_grid 62 | else: 63 | successful_placement = False 64 | else: 65 | floor.append(obj) 66 | return successful_placement 67 | 68 | def modify_sim_step(self, env, sim): 69 | # Cache qpos, qvel idxs 70 | self.agent_qpos_idxs = np.array([qpos_idxs_from_joint_prefix(sim, f'agent{i}') 71 | for i in range(self.n_agents)]) 72 | self.agent_qvel_idxs = np.array([qvel_idxs_from_joint_prefix(sim, f'agent{i}') 73 | for i in range(self.n_agents)]) 74 | env.metadata['agent_geom_idxs'] = [sim.model.geom_name2id(f'agent{i}:agent') 75 | for i in range(self.n_agents)] 76 | 77 | def observation_step(self, env, sim): 78 | qpos = sim.data.qpos.copy() 79 | qvel = sim.data.qvel.copy() 80 | 81 | agent_qpos = qpos[self.agent_qpos_idxs] 82 | agent_qvel = qvel[self.agent_qvel_idxs] 83 | agent_angle = agent_qpos[:, [-1]] - np.pi / 2 # Rotate the angle to match visual front 84 | agent_qpos_qvel = np.concatenate([agent_qpos, agent_qvel], -1) 85 | polar_angle = np.concatenate([np.cos(agent_angle), np.sin(agent_angle)], -1) 86 | if self.polar_obs: 87 | agent_qpos = np.concatenate([agent_qpos[:, :-1], polar_angle], -1) 88 | agent_angle = normalize_angles(agent_angle) 89 | obs = { 90 | 'agent_qpos_qvel': agent_qpos_qvel, 91 | 'agent_angle': agent_angle, 92 | 'agent_pos': agent_qpos[:, :3]} 93 | 94 | return obs 95 | 96 | 97 | class AgentManipulation(EnvModule): 98 | ''' 99 | Adding this module is necessary for the grabbing mechanic implemented in GrabObjWrapper 100 | (found in mae_envs/wrappers/manipulation.py) to work correctly. 101 | ''' 102 | @store_args 103 | def __init__(self): 104 | pass 105 | 106 | def build_world_step(self, env, floor, floor_size): 107 | for i in range(env.n_agents): 108 | floor.add_transform(add_weld_equality_constraint_transform( 109 | f'agent{i}:gripper', f'agent{i}:particle', 'floor0')) 110 | return True 111 | 112 | def modify_sim_step(self, env, sim): 113 | sim.model.eq_active[:] = 0 114 | -------------------------------------------------------------------------------- /mae_envs/modules/construction_sites.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_worldgen.util.types import store_args 3 | from mae_envs.modules import EnvModule, rejection_placement 4 | 5 | 6 | class ConstructionSites(EnvModule): 7 | ''' 8 | Adds construction sites to the environment. A construction site consists of 5 9 | regular mujoco sites, with four of them (the 'corner' sites) forming a rectangle 10 | and the last site being placed in the center of the rectangle. 11 | Args: 12 | n_sites (int or (int, int)): Number of construction sites. If tuple of ints, every 13 | episode the number of sites is drawn uniformly from 14 | range(n_sites[0], n_sites[1] + 1) 15 | placement_fn (fn or list of fns): See mae_envs.modules.util:rejection_placement for spec 16 | If list of functions, then it is assumed there is one function given per agent 17 | site_name (str): Name for the sites. 18 | site_size (float): Site size 19 | site_height (float): Site height 20 | n_elongated_sites (int or (int, int)): Number of elongated sites. 
If tuple of ints, 21 | every episode the number of elongated sites is drawn uniformly from 22 | range(n_elongated_sites[0], n_elongated_sited[1] + 1) 23 | ''' 24 | @store_args 25 | def __init__(self, n_sites, placement_fn=None, site_name='construction_site', 26 | site_size=0.5, site_height=0.25, n_elongated_sites=0): 27 | if type(n_sites) not in [tuple, list, np.ndarray]: 28 | self.n_sites = [n_sites, n_sites] 29 | if type(n_elongated_sites) not in [tuple, list, np.ndarray]: 30 | self.n_elongated_sites = [n_elongated_sites, n_elongated_sites] 31 | 32 | def _mark_site_square(self, floor, floor_size, site_name, 33 | site_relative_xyz, site_dims): 34 | x, y, z = site_relative_xyz 35 | floor.mark(site_name, relative_xyz=(x, y, z), 36 | rgba=[1., 1., 1., 1.], size=0.1) 37 | 38 | corner_rel_offset_x, corner_rel_offset_y = (site_dims / floor_size) / 2 39 | corner_rel_xy = [[x - corner_rel_offset_x, y - corner_rel_offset_y], 40 | [x - corner_rel_offset_x, y + corner_rel_offset_y], 41 | [x + corner_rel_offset_x, y - corner_rel_offset_y], 42 | [x + corner_rel_offset_x, y + corner_rel_offset_y]] 43 | for i, (x_corner, y_corner) in enumerate(corner_rel_xy): 44 | floor.mark(f'{site_name}_corner{i}', 45 | relative_xyz=(x_corner, y_corner, z), 46 | size=0.05, rgba=[0.8, 0.8, 0.8, 1.]) 47 | 48 | def build_world_step(self, env, floor, floor_size): 49 | self.curr_n_sites = env._random_state.randint(self.n_sites[0], self.n_sites[1] + 1) 50 | self.curr_n_elongated_sites = env._random_state.randint( 51 | self.n_elongated_sites[0], self.n_elongated_sites[1] + 1) 52 | 53 | env.metadata['curr_n_sites'] = self.curr_n_sites 54 | env.metadata['curr_n_elongated_sites'] = self.curr_n_elongated_sites 55 | 56 | self.site_size_array = self.site_size * np.ones((self.curr_n_sites, 2)) 57 | if self.curr_n_elongated_sites > 0: 58 | n_xaligned = env._random_state.randint(self.curr_n_elongated_sites + 1) 59 | self.site_size_array[:n_xaligned, :] = self.site_size * np.array([3.3, 0.3]) 60 | self.site_size_array[n_xaligned:self.curr_n_elongated_sites, :] = ( 61 | self.site_size * np.array([0.3, 3.3])) 62 | 63 | successful_placement = True 64 | for i in range(self.curr_n_sites): 65 | if self.placement_fn is not None: 66 | _placement_fn = (self.placement_fn[i] 67 | if isinstance(self.placement_fn, list) 68 | else self.placement_fn) 69 | pos, _ = rejection_placement(env, _placement_fn, floor_size, 70 | self.site_size_array[i]) 71 | if pos is not None: 72 | self._mark_site_square(floor, floor_size, f'{self.site_name}{i}', 73 | (pos[0], pos[1], self.site_height), 74 | self.site_size_array[i]) 75 | else: 76 | successful_placement = False 77 | else: 78 | # place the site so that all the corners are still within the play area 79 | pos_min = self.site_size_array[i].max() / (floor_size * 1.1) / 2 80 | pos = env._random_state.uniform(pos_min, 1 - pos_min, 2) 81 | self._mark_site_square(floor, floor_size, f'{self.site_name}{i}', 82 | (pos[0], pos[1], self.site_height), 83 | self.site_size_array[i]) 84 | 85 | return successful_placement 86 | 87 | def modify_sim_step(self, env, sim): 88 | self.construction_site_idxs = np.array( 89 | [sim.model.site_name2id(f'{self.site_name}{i}') 90 | for i in range(self.curr_n_sites)] 91 | ) 92 | self.construction_site_corner_idxs = np.array( 93 | [sim.model.site_name2id(f'{self.site_name}{i}_corner{j}') 94 | for i in range(self.curr_n_sites) for j in range(4)] 95 | ) 96 | 97 | def observation_step(self, env, sim): 98 | site_pos = sim.data.site_xpos[self.construction_site_idxs] 99 | site_corner_pos = 
sim.data.site_xpos[self.construction_site_corner_idxs] 100 | site_obs = np.concatenate((site_pos, 101 | site_corner_pos.reshape((self.curr_n_sites, 12))), 102 | axis=-1) 103 | 104 | mask_site_obs = np.ones((env.n_agents, self.curr_n_sites)) 105 | 106 | obs = {'construction_site_pos': site_pos, 107 | 'construction_site_corner_pos': site_corner_pos, 108 | 'construction_site_obs': site_obs, 109 | 'mask_acs_obs': mask_site_obs} 110 | 111 | return obs 112 | -------------------------------------------------------------------------------- /mae_envs/modules/food.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_worldgen.util.types import store_args 3 | from mae_envs.modules import EnvModule, rejection_placement 4 | 5 | 6 | class Food(EnvModule): 7 | ''' 8 | Add food sites to the environment. 9 | Args: 10 | n_food (int or (int, int)): number of food items. If tuple of ints, every episode the 11 | number of food items is drawn uniformly from range(n_food[0], n_food[1] + 1) 12 | food_size (float): (visual) size of food items 13 | placement_fn (fn or list of fns): See mae_envs.modules.util:rejection_placement for spec 14 | If list of functions, then it is assumed there is one function given per food site 15 | ''' 16 | @store_args 17 | def __init__(self, n_food, food_size=0.1, placement_fn=None): 18 | if type(n_food) not in [tuple, list, np.ndarray]: 19 | self.n_food = [n_food, n_food] 20 | pass 21 | 22 | def build_world_step(self, env, floor, floor_size): 23 | env.metadata['food_size'] = self.food_size 24 | self.curr_n_food = env._random_state.randint(self.n_food[0], self.n_food[1] + 1) 25 | env.metadata['max_n_food'] = self.n_food[1] 26 | env.metadata['curr_n_food'] = self.curr_n_food 27 | successful_placement = True 28 | 29 | for i in range(self.curr_n_food): 30 | env.metadata.pop(f"food{i}_initpos", None) 31 | 32 | # Add food sites 33 | for i in range(self.curr_n_food): 34 | if self.placement_fn is not None: 35 | _placement_fn = (self.placement_fn[i] 36 | if isinstance(self.placement_fn, list) 37 | else self.placement_fn) 38 | pos, pos_grid = rejection_placement(env, _placement_fn, floor_size, 39 | np.array([self.food_size, self.food_size])) 40 | if pos is not None: 41 | floor.mark(f"food{i}", relative_xyz=np.append(pos, [self.food_size / 2]), 42 | size=(self.food_size, self.food_size, self.food_size), 43 | rgba=(0., 1., 0., 1.)) 44 | 45 | # store spawn position in metadata. This allows sampling subsequent food items 46 | # close to previous food items 47 | env.metadata[f"food{i}_initpos"] = pos_grid 48 | else: 49 | successful_placement = False 50 | else: 51 | floor.mark(f"food{i}", rgba=(0., 1., 0., 1.), 52 | size=(self.food_size, self.food_size, self.food_size)) 53 | return successful_placement 54 | 55 | def modify_sim_step(self, env, sim): 56 | self.food_site_ids = np.array([sim.model.site_name2id(f'food{i}') 57 | for i in range(self.curr_n_food)]) 58 | 59 | def observation_step(self, env, sim): 60 | if self.curr_n_food > 0: 61 | obs = {'food_pos': sim.data.site_xpos[self.food_site_ids]} 62 | else: 63 | obs = {'food_pos': np.zeros((0, 3))} 64 | return obs 65 | -------------------------------------------------------------------------------- /mae_envs/modules/module.py: -------------------------------------------------------------------------------- 1 | 2 | class EnvModule(): 3 | ''' 4 | Dummy class outline for "Environment Modules". 
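Concrete modules in this repo (e.g. Agents, ConstructionSites, Food) subclass this outline and override whichever of the hooks below they need.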
5 | NOTE: If in any function you are going to randomly sample a number, 6 | use env._random_state instead of numpy.random 7 | ''' 8 | def build_world_step(self, env, floor, floor_size): 9 | ''' 10 | This function allows you to add objects to worldgen floor object. 11 | You could also cache variables needed for observations or add 12 | information to the env.metadata dict 13 | Args: 14 | env (gym.Env): the environment 15 | floor (worldgen.Floor): square worldgen floor object 16 | floor_size (float): size of the worldgen floor object 17 | Returns: True if the build_world_step was successful, False if it failed 18 | e.g. your build_world_step might fail because no valid object placements 19 | were found. 20 | ''' 21 | return True 22 | 23 | def modify_sim_step(self, env, sim): 24 | ''' 25 | After an MJSim has been created, this function can be used to modify that sim 26 | and cache any variables you can only get after the sim is created 27 | Args: 28 | env (gym.env): the environment 29 | sim (mujoco_py.MJSim): mujoco simulation object 30 | Returns: None 31 | ''' 32 | pass 33 | 34 | def observation_step(self, env, sim): 35 | ''' 36 | Create any observations specific to this module. 37 | Args: 38 | env (gym.env): the environment 39 | sim (mujoco_py.MJSim): mujoco simulation object 40 | Returns: dict of observations 41 | ''' 42 | return {} 43 | -------------------------------------------------------------------------------- /mae_envs/modules/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_worldgen.parser import parse_file 3 | 4 | 5 | def get_size_from_xml(obj): 6 | ''' 7 | Args: 8 | obj (worldgen.Obj): worldgen object 9 | Returns: size of object annotation:outer_bound if it exists, None if it doesn't 10 | ''' 11 | outer_bound = None 12 | for body in parse_file(obj._generate_xml_path())['worldbody']['body']: 13 | if body.get('@name', '') == 'annotation:outer_bound': 14 | outer_bound = body 15 | if outer_bound is None: 16 | return None 17 | else: 18 | return outer_bound['geom'][0]['@size'][:2] * 2 19 | 20 | 21 | def rejection_placement(env, placement_fn, floor_size, obj_size, num_tries=10): 22 | ''' 23 | Args: 24 | env (gym.Env): environment 25 | placement_fn (function): Function that returns a position on a grid 26 | Args: 27 | grid (np.ndarray): 2D occupancy grid. 1's mean occupied 28 | obj_size_in_cells (int np.ndarray): number of cells in [x, y] 29 | that this object would occupy on the grid. Currently only supports 30 | rectangular object sizes (but so does worldgen) 31 | env.metadata (dict): environment metadata 32 | random_state (np.random.RandomState): numpy random state 33 | Returns: x, y placement position on grid 34 | floor_size (float): size of floor 35 | obj_size (float np.ndarray): [x, y] size of object 36 | num_tries (int): number of tries to place object 37 | Returns: a (placement, pos) tuple: the object's relative [x, y] placement on the floor (suitable for worldgen's placement_xy) and its integer [x, y] grid position, or (None, None) if no valid placement was found.
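Example (illustrative sketch, not part of the original source; obj and floor stand in for the worldgen object and floor available inside a module's build_world_step):
    placement, grid_pos = rejection_placement(env, uniform_placement,
                                               floor_size=6.0,
                                               obj_size=np.array([0.5, 0.5]))
    if placement is not None:
        floor.append(obj, placement_xy=placement)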
38 | ''' 39 | grid = env.placement_grid 40 | grid_size = len(grid) 41 | cell_size = floor_size / grid_size 42 | obj_size_in_cells = np.ceil(obj_size / cell_size).astype(int) 43 | 44 | for i in range(num_tries): 45 | if placement_fn is not None: 46 | pos = placement_fn(grid, obj_size_in_cells, env.metadata, env._random_state) 47 | else: 48 | # Assume that we'll always have boundary walls so don't sample there 49 | pos = np.array([env._random_state.randint(1, grid_size - obj_size_in_cells[0] - 1), 50 | env._random_state.randint(1, grid_size - obj_size_in_cells[1] - 1)]) 51 | if np.any(grid[pos[0]:pos[0] + obj_size_in_cells[0], pos[1]:pos[1] + obj_size_in_cells[1]]): 52 | continue 53 | else: 54 | extra_room = obj_size_in_cells * cell_size - obj_size 55 | pos_on_floor = pos / grid_size * floor_size 56 | pos_on_floor += env._random_state.uniform([0, 0], extra_room) 57 | placement = pos_on_floor / (floor_size - obj_size) 58 | grid[pos[0]:pos[0] + obj_size_in_cells[0], pos[1]:pos[1] + obj_size_in_cells[1]] = 1 59 | return placement, pos 60 | return None, None 61 | 62 | 63 | def uniform_placement(grid, obj_size, metadata, random_state): 64 | grid_size = len(grid) 65 | pos = np.array([random_state.randint(1, grid_size - obj_size[0] - 1), 66 | random_state.randint(1, grid_size - obj_size[1] - 1)]) 67 | 68 | return pos 69 | 70 | 71 | def close_to_other_object_placement(object_type, object_index, radius_key): 72 | def close_placement_fn(grid, obj_size, metadata, random_state): 73 | init_pos_key = f"{object_type}{object_index}_initpos" 74 | 75 | assert init_pos_key in metadata, \ 76 | f"First object position must be specified in metadata['{init_pos_key}']" 77 | assert radius_key in metadata, \ 78 | f"metadata['{radius_key}'] must be specified." 79 | 80 | grid_size = len(grid) 81 | 82 | anchor_obj_pos = metadata[f"{init_pos_key}"] 83 | rad_in_cells = metadata[radius_key] 84 | 85 | distr_limits_min = np.maximum(1, anchor_obj_pos - rad_in_cells) 86 | distr_limits_max = np.minimum(grid_size - 1, anchor_obj_pos + rad_in_cells) 87 | 88 | pos = np.array([random_state.randint(distr_limits_min[0], distr_limits_max[0]), 89 | random_state.randint(distr_limits_min[1], distr_limits_max[1])]) 90 | 91 | return pos 92 | 93 | return close_placement_fn 94 | 95 | 96 | def uniform_placement_middle(area_side_length_fraction): 97 | ''' 98 | Creates a sampling function that samples object position uniformly within the 99 | middle of the playing area. E.g. if the playing area is 100 | ------ 101 | |AAAA| 102 | |ABBA| 103 | |ABBA| 104 | |AAAA| 105 | ------ 106 | then uniform_placement_middle(0.5) will return a function that samples the object position 107 | from any of the B cells.
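For example, the blueprint construction env above uses uniform_placement_middle(0.85) as its 'uniform_away_from_walls' site placement option.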
108 | Args: 109 | area_side_length_fraction (float, between 0 and 1): Length of the sides of the middle 110 | square being sampled from, as fraction of the overall playing field 111 | ''' 112 | def uniform_placement_middle_fn(grid, obj_size, metadata, random_state): 113 | grid_size = len(grid) 114 | distr_limits_min = ((grid_size - obj_size) * (1 - area_side_length_fraction) / 2 + area_side_length_fraction).astype(int) 115 | distr_limits_max = ((grid_size - obj_size) * (1 + area_side_length_fraction) / 2 - area_side_length_fraction).astype(int) 116 | 117 | pos = np.array([random_state.randint(distr_limits_min[0], distr_limits_max[0]), 118 | random_state.randint(distr_limits_min[1], distr_limits_max[1])]) 119 | 120 | return pos 121 | 122 | return uniform_placement_middle_fn 123 | 124 | 125 | def center_placement(grid, obj_size_in_cells, metadata, random_state): 126 | half_grid_size = int(len(grid) / 2) 127 | pos = np.array([half_grid_size - int(obj_size_in_cells[0]/2), 128 | half_grid_size - int(obj_size_in_cells[1]/2)]) 129 | return pos 130 | -------------------------------------------------------------------------------- /mae_envs/modules/world.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from mujoco_worldgen.transforms import set_geom_attr_transform 3 | from mae_envs.modules import EnvModule 4 | 5 | 6 | class FloorAttributes(EnvModule): 7 | ''' 8 | For each (key, value) in kwargs, sets the floor geom attribute key to value. 9 | ''' 10 | def __init__(self, **kwargs): 11 | self.kwargs = kwargs 12 | 13 | def build_world_step(self, env, floor, floor_size): 14 | for k, v in self.kwargs.items(): 15 | floor.add_transform(set_geom_attr_transform(k, v)) 16 | return True 17 | 18 | 19 | class WorldConstants(EnvModule): 20 | ''' 21 | For each (key, value) in kwargs, sets sim.model.opt[key] = value 22 | ''' 23 | def __init__(self, **kwargs): 24 | self.kwargs = kwargs 25 | 26 | def modify_sim_step(self, env, sim): 27 | for k, v in self.kwargs.items(): 28 | if not hasattr(sim.model.opt, k): 29 | logging.warning(f"sim.model.opt does not have attribute {k}") 30 | else: 31 | getattr(sim.model.opt, k)[:] = v 32 | -------------------------------------------------------------------------------- /mae_envs/util/geometry.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_worldgen.util.rotation import quat_mul, quat_conjugate 3 | 4 | 5 | def dist_pt_to_cuboid(pt1, cuboid_center, cuboid_dims, cuboid_quat): 6 | ''' 7 | This function calculates the shortest distance between test points 8 | and cuboids at arbitrary locations, widths and rotations 9 | 10 | Args: 11 | pt1 (num points x 3): test point positions 12 | cuboid_center (num cuboids x 3): cuboid centers 13 | cuboid_dims (num cuboids x 3): cuboid half-width 14 | cuboid_quat (num cuboids x 4): cuboid quaternion 15 | 16 | Returns: 17 | Distance array of size num points x num cuboids 18 | ''' 19 | assert cuboid_center.shape[0] == cuboid_dims.shape[0] == cuboid_quat.shape[0], \ 20 | "First dimension of cuboid_center, cuboid_dims and cuboid_quat need to match, " + \ 21 | f"but were {cuboid_center.shape[0]}, {cuboid_dims.shape[0]} and {cuboid_quat.shape[0]}." 22 | assert pt1.shape[1] == cuboid_center.shape[1] == cuboid_dims.shape[1] == 3, \ 23 | "Second dimension of pt1, cuboid_center and cuboid_dims needs to be 3, " + \ 24 | f"but were {pt1.shape[1]}, {cuboid_center.shape[1]} and {cuboid_dims.shape[1]}." 
25 | assert cuboid_quat.shape[1] == 4, \ 26 | f"Second dimension of cuboid_quat needs to be 4, but was {cuboid_quat.shape[1]}." 27 | 28 | # calculate relative position of test points 29 | rel_pos = pt1[:, None, :] - cuboid_center[None, :, :] 30 | 31 | # convert into quaternion (leading dimension is zero) 32 | q_rel_pos = np.concatenate([np.zeros_like(rel_pos[:, :, [0]]), rel_pos], axis=-1) 33 | 34 | # broadcast cuboid_quat by hand 35 | cuboid_quat = np.repeat(cuboid_quat[None, :], pt1.shape[0], axis=0) 36 | 37 | # rotate relative position in cuboid frame 38 | # since cuboid_quat specifies how the cuboid is rotated wrt to the standard coordinate system, 39 | # we need to rotate the test points using the inverse rotation (i.e. conjugate quaternion) 40 | # 41 | # For rotation of vectors using quaternions see 42 | # https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation 43 | q_rel_pos = quat_mul(quat_conjugate(cuboid_quat), quat_mul(q_rel_pos, cuboid_quat)) 44 | 45 | # now we can pretend that the cuboid is aligned to x-axis 46 | # calculate vector to closest point on the cuboid 47 | # this can be done as described here: 48 | # https://gamedev.stackexchange.com/questions/44483/how-do-i-calculate-distance-between-a-point-and-an-axis-aligned-rectangle 49 | dist_vec = np.maximum(0, np.abs(q_rel_pos[:, :, 1:]) - cuboid_dims[None, :, :]) 50 | 51 | # distance is length of distance vector 52 | dist = np.linalg.norm(dist_vec, axis=-1) 53 | 54 | return dist 55 | -------------------------------------------------------------------------------- /mae_envs/util/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import OrderedDict 3 | from mujoco_worldgen.transforms import closure_transform 4 | 5 | 6 | def add_weld_equality_constraint_transform(name, body_name1, body_name2): 7 | ''' 8 | Creates a weld constraint that maintains relative position and orientation between 9 | two objects 10 | ''' 11 | def fun(xml_dict): 12 | if 'equality' not in xml_dict: 13 | xml_dict['equality'] = OrderedDict() 14 | xml_dict['equality']['weld'] = [] 15 | constraint = OrderedDict() 16 | constraint['@name'] = name 17 | constraint['@body1'] = body_name1 18 | constraint['@body2'] = body_name2 19 | constraint['@active'] = False 20 | xml_dict['equality']['weld'].append(constraint) 21 | return xml_dict 22 | 23 | return fun 24 | 25 | 26 | def set_joint_damping_transform(damping, joint_name): 27 | ''' Set joints damping to a single value. 28 | Args: 29 | damping (float): damping to set 30 | joint_name (string): partial name of joint. Any joint with joint_name 31 | as a substring will be affected. 32 | ''' 33 | def closure(node): 34 | for joint in node.get('joint', []): 35 | if joint_name in joint['@name']: 36 | joint['@damping'] = damping 37 | return closure_transform(closure) 38 | 39 | 40 | def remove_hinge_axis_transform(axis): 41 | ''' Removes specific hinge axis from the body. 
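Args: axis (array of length 3): hinge joints whose '@axis' matches this axis (within an L2 tolerance of 1e-5) are removed.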
''' 42 | def fun(xml_dict): 43 | def closure(node): 44 | if 'joint' in node: 45 | node["joint"] = [j for j in node["joint"] 46 | if j["@type"] != "hinge" 47 | or np.linalg.norm(j["@axis"] - axis) >= 1e-5] 48 | return closure_transform(closure)(xml_dict) 49 | return fun 50 | -------------------------------------------------------------------------------- /mae_envs/util/vision.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mujoco_worldgen.util.rotation import normalize_angles 3 | from mujoco_worldgen.util.geometry import raycast 4 | 5 | 6 | def in_cone2d(origin_pts, origin_angles, cone_angle, target_pts): 7 | ''' 8 | Computes whether 2D points target_pts are in the cones originating from 9 | origin_pts at angle origin_angles with cone spread angle cone_angle. 10 | Args: 11 | origin_pts (np.ndarray): array with shape (n_points, 2) of origin points 12 | origin_angles (np.ndarray): array with shape (n_points,) of origin angles 13 | cone_angle (float): cone angle width 14 | target_pts (np.ndarray): target points to check whether in cones 15 | Returns: 16 | np.ndarray of bools. Each row corresponds to origin cone, and columns to 17 | target points 18 | ''' 19 | assert isinstance(origin_pts, np.ndarray) 20 | assert isinstance(origin_angles, np.ndarray) 21 | assert isinstance(cone_angle, float) 22 | assert isinstance(target_pts, np.ndarray) 23 | assert origin_pts.shape[0] == origin_angles.shape[0] 24 | assert len(origin_angles.shape) == 1, "Angles should only have 1 dimension" 25 | np.seterr(divide='ignore', invalid='ignore') 26 | cone_vec = np.array([np.cos(origin_angles), np.sin(origin_angles)]).T 27 | # Compute normed vectors between all pairs of agents 28 | pos_diffs = target_pts[None, ...] - origin_pts[:, None, :] 29 | norms = np.sqrt(np.sum(np.square(pos_diffs), -1, keepdims=True)) 30 | unit_diffs = pos_diffs / norms 31 | # Dot product between unit vector in middle of cone and the vector 32 | dot_cone_diff = np.sum(unit_diffs * cone_vec[:, None, :], -1) 33 | angle_between = np.arccos(dot_cone_diff) 34 | # Right now the only thing that should be nan will be targets that are on the origin point 35 | # This can only happen for the origin looking at itself, so just make this always true 36 | angle_between[np.isnan(angle_between)] = 0. 37 | 38 | return np.abs(normalize_angles(angle_between)) <= cone_angle 39 | 40 | 41 | def insight(sim, geom1_id, geom2_id=None, pt2=None, dist_thresh=np.inf, check_body=True): 42 | ''' 43 | Check if geom2 or pt2 is in line of sight of geom1. 44 | Args: 45 | sim: Mujoco sim object 46 | geom1 (int): geom id 47 | geom2 (int): geom id 48 | pt2 (tuple): xy point 49 | dist_thresh (float): Adds a distance threshold for vision. Objects beyond the threshold 50 | are considered out of sight. 51 | check_body (bool): Check whether the raycast hit any geom in the body that geom2 is in 52 | rather than if it just hit geom2 53 | ''' 54 | dist, collision_geom = raycast(sim, geom1_id, geom2_id=geom2_id, pt2=pt2) 55 | if geom2_id is not None: 56 | if check_body: 57 | body2_id, collision_body_id = sim.model.geom_bodyid[[geom2_id, collision_geom]] 58 | return (collision_body_id == body2_id and dist < dist_thresh) 59 | else: 60 | return (collision_geom == geom2_id and dist < dist_thresh) 61 | else: 62 | pt1 = sim.data.geom_xpos[geom1_id] 63 | dist_pt2 = np.linalg.norm(pt2 - pt1) 64 | # if dist == -1 then we're raycasting from a geom to a point within itself, 65 | # and all objects have line of sight of themselves. 
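# otherwise the point is considered in sight when the ray's first collision lies beyond pt2 (so nothing occludes the segment) and pt2 itself is within dist_thresh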
66 | return (dist == -1.0 or dist > dist_pt2) and dist_pt2 < dist_thresh 67 | -------------------------------------------------------------------------------- /mae_envs/viewer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/mae_envs/viewer/__init__.py -------------------------------------------------------------------------------- /mae_envs/viewer/env_viewer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from mujoco_py import const, MjViewer 4 | import glfw 5 | from gym.spaces import Box, MultiDiscrete, Discrete 6 | 7 | 8 | class EnvViewer(MjViewer): 9 | 10 | def __init__(self, env): 11 | self.env = env 12 | self.elapsed = [0] 13 | self.seed = self.env.seed() 14 | super().__init__(self.env.unwrapped.sim) 15 | self.n_agents = self.env.metadata['n_actors'] 16 | self.action_types = list(self.env.action_space.spaces.keys()) 17 | self.num_action_types = len(self.env.action_space.spaces) 18 | self.num_action = self.num_actions(self.env.action_space) 19 | self.agent_mod_index = 0 20 | self.action_mod_index = 0 21 | self.action_type_mod_index = 0 22 | self.action = self.zero_action(self.env.action_space) 23 | self.env_reset() 24 | 25 | def num_actions(self, ac_space): 26 | n_actions = [] 27 | for k, tuple_space in ac_space.spaces.items(): 28 | s = tuple_space.spaces[0] 29 | if isinstance(s, Box): 30 | n_actions.append(s.shape[0]) 31 | elif isinstance(s, Discrete): 32 | n_actions.append(1) 33 | elif isinstance(s, MultiDiscrete): 34 | n_actions.append(s.nvec.shape[0]) 35 | else: 36 | raise NotImplementedError(f"not NotImplementedError") 37 | 38 | return n_actions 39 | 40 | def zero_action(self, ac_space): 41 | ac = {} 42 | for k, space in ac_space.spaces.items(): 43 | if isinstance(space.spaces[0], Box): 44 | ac[k] = np.zeros_like(space.sample()) 45 | elif isinstance(space.spaces[0], Discrete): 46 | ac[k] = np.ones_like(space.sample()) * (space.spaces[0].n // 2) 47 | elif isinstance(space.spaces[0], MultiDiscrete): 48 | ac[k] = np.ones_like(space.sample(), dtype=int) * (space.spaces[0].nvec // 2) 49 | else: 50 | raise NotImplementedError("MultiDiscrete not NotImplementedError") 51 | # return action_space.nvec // 2 # assume middle element is "no action" action 52 | return ac 53 | 54 | def env_reset(self): 55 | start = time.time() 56 | # get the seed before calling env.reset(), so we display the one 57 | # that was used for the reset. 
58 | self.seed = self.env.seed() 59 | self.env.reset() 60 | self.elapsed.append(time.time() - start) 61 | self.update_sim(self.env.unwrapped.sim) 62 | 63 | def key_callback(self, window, key, scancode, action, mods): 64 | # Trigger on keyup only: 65 | if action != glfw.RELEASE: 66 | return 67 | if key == glfw.KEY_ESCAPE: 68 | self.env.close() 69 | 70 | # Increment experiment seed 71 | elif key == glfw.KEY_N: 72 | self.seed[0] += 1 73 | self.env.seed(self.seed) 74 | self.env_reset() 75 | self.action = self.zero_action(self.env.action_space) 76 | # Decrement experiment trial 77 | elif key == glfw.KEY_P: 78 | self.seed = [max(self.seed[0] - 1, 0)] 79 | self.env.seed(self.seed) 80 | self.env_reset() 81 | self.action = self.zero_action(self.env.action_space) 82 | current_action_space = self.env.action_space.spaces[self.action_types[self.action_type_mod_index]].spaces[0] 83 | if key == glfw.KEY_A: 84 | if isinstance(current_action_space, Box): 85 | self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index][self.action_mod_index] -= 0.05 86 | elif isinstance(current_action_space, Discrete): 87 | self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index] = \ 88 | (self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index] - 1) % current_action_space.n 89 | elif isinstance(current_action_space, MultiDiscrete): 90 | self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index][self.action_mod_index] = \ 91 | (self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index][self.action_mod_index] - 1) \ 92 | % current_action_space.nvec[self.action_mod_index] 93 | elif key == glfw.KEY_Z: 94 | if isinstance(current_action_space, Box): 95 | self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index][self.action_mod_index] += 0.05 96 | elif isinstance(current_action_space, Discrete): 97 | self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index] = \ 98 | (self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index] + 1) % current_action_space.n 99 | elif isinstance(current_action_space, MultiDiscrete): 100 | self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index][self.action_mod_index] = \ 101 | (self.action[self.action_types[self.action_type_mod_index]][self.agent_mod_index][self.action_mod_index] + 1) \ 102 | % current_action_space.nvec[self.action_mod_index] 103 | elif key == glfw.KEY_K: 104 | self.action_mod_index = (self.action_mod_index + 1) % self.num_action[self.action_type_mod_index] 105 | elif key == glfw.KEY_J: 106 | self.action_mod_index = (self.action_mod_index - 1) % self.num_action[self.action_type_mod_index] 107 | elif key == glfw.KEY_Y: 108 | self.agent_mod_index = (self.agent_mod_index + 1) % self.n_agents 109 | elif key == glfw.KEY_U: 110 | self.agent_mod_index = (self.agent_mod_index - 1) % self.n_agents 111 | elif key == glfw.KEY_G: 112 | self.action_type_mod_index = (self.action_type_mod_index + 1) % self.num_action_types 113 | self.action_mod_index = 0 114 | elif key == glfw.KEY_B: 115 | self.action_type_mod_index = (self.action_type_mod_index - 1) % self.num_action_types 116 | self.action_mod_index = 0 117 | 118 | super().key_callback(window, key, scancode, action, mods) 119 | 120 | def run(self, once=False): 121 | while True: 122 | _, _, _, env_info = self.env.step(self.action) 123 | if env_info.get('discard_episode', False): 124 | self.env.reset() 125 | self.add_overlay(const.GRID_TOPRIGHT, 
"Reset env; (current seed: {})".format(self.seed), "N - next / P - previous ") 126 | self.add_overlay(const.GRID_TOPRIGHT, "Apply action", "A (-0.05) / Z (+0.05)") 127 | self.add_overlay(const.GRID_TOPRIGHT, "on agent index %d out %d" % (self.agent_mod_index, self.n_agents), "Y / U") 128 | self.add_overlay(const.GRID_TOPRIGHT, f"on action type {self.action_types[self.action_type_mod_index]}", "G / B") 129 | self.add_overlay(const.GRID_TOPRIGHT, "on action index %d out %d" % (self.action_mod_index, self.num_action[self.action_type_mod_index]), "J / K") 130 | self.add_overlay(const.GRID_BOTTOMRIGHT, "Reset took", "%.2f sec." % (sum(self.elapsed) / len(self.elapsed))) 131 | self.add_overlay(const.GRID_BOTTOMRIGHT, "Action", str(self.action)) 132 | self.render() 133 | if once: 134 | return 135 | -------------------------------------------------------------------------------- /mae_envs/viewer/policy_viewer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import time 3 | import glfw 4 | import numpy as np 5 | from operator import itemgetter 6 | from mujoco_py import const, MjViewer 7 | from mujoco_worldgen.util.types import store_args 8 | from ma_policy.util import listdict2dictnp 9 | 10 | 11 | def splitobs(obs, keepdims=True): 12 | ''' 13 | Split obs into list of single agent obs. 14 | Args: 15 | obs: dictionary of numpy arrays where first dim in each array is agent dim 16 | ''' 17 | n_agents = obs[list(obs.keys())[0]].shape[0] 18 | return [{k: v[[i]] if keepdims else v[i] for k, v in obs.items()} for i in range(n_agents)] 19 | 20 | 21 | class PolicyViewer(MjViewer): 22 | ''' 23 | PolicyViewer runs a policy with an environment and optionally displays it. 24 | env - environment to run policy in 25 | policy - policy object to run 26 | display_window - if true, show the graphical viewer 27 | seed - environment seed to view 28 | duration - time in seconds to run the policy, run forever if duration=None 29 | ''' 30 | @store_args 31 | def __init__(self, env, policies, display_window=True, seed=None, duration=None): 32 | if seed is None: 33 | self.seed = env.seed()[0] 34 | else: 35 | self.seed = seed 36 | env.seed(seed) 37 | self.total_rew = 0.0 38 | self.ob = env.reset() 39 | for policy in self.policies: 40 | policy.reset() 41 | assert env.metadata['n_actors'] % len(policies) == 0 42 | if hasattr(env, "reset_goal"): 43 | self.goal = env.reset_goal() 44 | super().__init__(self.env.unwrapped.sim) 45 | # TO DO: remove circular dependency on viewer object. It looks fishy. 
46 | self.env.unwrapped.viewer = self 47 | if self.render and self.display_window: 48 | self.env.render() 49 | 50 | def key_callback(self, window, key, scancode, action, mods): 51 | super().key_callback(window, key, scancode, action, mods) 52 | # Trigger on keyup only: 53 | if action != glfw.RELEASE: 54 | return 55 | # Increment experiment seed 56 | if key == glfw.KEY_N: 57 | self.reset_increment() 58 | # Decrement experiment trial 59 | elif key == glfw.KEY_P: 60 | print("Pressed P") 61 | self.seed = max(self.seed - 1, 0) 62 | self.env.seed(self.seed) 63 | self.ob = self.env.reset() 64 | for policy in self.policies: 65 | policy.reset() 66 | if hasattr(self.env, "reset_goal"): 67 | self.goal = self.env.reset_goal() 68 | self.update_sim(self.env.unwrapped.sim) 69 | 70 | def run(self): 71 | if self.duration is not None: 72 | self.end_time = time.time() + self.duration 73 | self.total_rew_avg = 0.0 74 | self.n_episodes = 0 75 | while self.duration is None or time.time() < self.end_time: 76 | if len(self.policies) == 1: 77 | action, _ = self.policies[0].act(self.ob) 78 | else: 79 | self.ob = splitobs(self.ob, keepdims=False) 80 | ob_policy_idx = np.split(np.arange(len(self.ob)), len(self.policies)) 81 | actions = [] 82 | for i, policy in enumerate(self.policies): 83 | inp = itemgetter(*ob_policy_idx[i])(self.ob) 84 | inp = listdict2dictnp([inp] if ob_policy_idx[i].shape[0] == 1 else inp) 85 | ac, info = policy.act(inp) 86 | actions.append(ac) 87 | action = listdict2dictnp(actions, keepdims=True) 88 | 89 | self.ob, rew, done, env_info = self.env.step(action) 90 | self.total_rew += rew 91 | 92 | if done or env_info.get('discard_episode', False): 93 | self.reset_increment() 94 | 95 | if self.display_window: 96 | self.add_overlay(const.GRID_TOPRIGHT, "Reset env; (current seed: {})".format(self.seed), "N - next / P - previous ") 97 | self.add_overlay(const.GRID_TOPRIGHT, "Reward", str(self.total_rew)) 98 | if hasattr(self.env.unwrapped, "viewer_stats"): 99 | for k, v in self.env.unwrapped.viewer_stats.items(): 100 | self.add_overlay(const.GRID_TOPRIGHT, k, str(v)) 101 | 102 | self.env.render() 103 | 104 | def reset_increment(self): 105 | self.total_rew_avg = (self.n_episodes * self.total_rew_avg + self.total_rew) / (self.n_episodes + 1) 106 | self.n_episodes += 1 107 | print(f"Reward: {self.total_rew} (rolling average: {self.total_rew_avg})") 108 | self.total_rew = 0.0 109 | self.seed += 1 110 | self.env.seed(self.seed) 111 | self.ob = self.env.reset() 112 | for policy in self.policies: 113 | policy.reset() 114 | if hasattr(self.env, "reset_goal"): 115 | self.goal = self.env.reset_goal() 116 | self.update_sim(self.env.unwrapped.sim) 117 | -------------------------------------------------------------------------------- /mae_envs/wrappers/food.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from mae_envs.wrappers.util import update_obs_space 4 | from mujoco_worldgen.util.types import store_args 5 | from gym.spaces import Tuple, MultiDiscrete 6 | 7 | 8 | class FoodHealthWrapper(gym.Wrapper): 9 | ''' 10 | Adds food health to underlying env. 11 | Manages food levels. 
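Exposes an 'action_eat_food' action (one binary flag per food item per agent) and adds 'food_obs', 'food_health' and 'food_eat' to the observations.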
12 | 13 | Args: 14 | eat_thresh (float): radius within which food items can be eaten 15 | max_food_health (int): number of times a food item can be eaten 16 | before it disappears 17 | respawn_time (int): Number of time steps after which food items 18 | that have been eaten reappear 19 | food_rew_type (string): can be 20 | 'selfish': each agent gets an inividual reward for the food they eat 21 | 'joint_mean': food rewards are averaged over teams 22 | reward_scale (float or (float, float)): scales the reward by this amount. If tuple of 23 | floats, the exact reward scaling is uniformly sampled from 24 | (reward_scale[0], reward_scale[1]) at the beginning of every episode. 25 | reward_scale_obs (bool): If true, adds the reward scale for the current 26 | episode to food_obs 27 | ''' 28 | @store_args 29 | def __init__(self, env, eat_thresh=0.5, max_food_health=10, respawn_time=np.inf, 30 | food_rew_type='selfish', reward_scale=1.0, reward_scale_obs=False, 31 | split_eat_between_agents=False): 32 | super().__init__(env) 33 | self.n_agents = self.metadata['n_agents'] 34 | 35 | if type(reward_scale) not in [list, tuple, np.ndarray]: 36 | self.reward_scale = [reward_scale, reward_scale] 37 | 38 | # Reset obs/action space to match 39 | self.max_n_food = self.metadata['max_n_food'] 40 | self.curr_n_food = self.metadata['curr_n_food'] 41 | self.max_food_size = self.metadata['food_size'] 42 | food_dim = 5 if self.reward_scale_obs else 4 43 | self.observation_space = update_obs_space(self.env, {'food_obs': (self.max_n_food, food_dim), 44 | 'food_health': (self.max_n_food, 1), 45 | 'food_eat': (self.max_n_food, 1)}) 46 | self.action_space.spaces['action_eat_food'] = Tuple([MultiDiscrete([2] * self.max_n_food) 47 | for _ in range(self.n_agents)]) 48 | 49 | def reset(self): 50 | obs = self.env.reset() 51 | sim = self.unwrapped.sim 52 | 53 | # Reset obs/action space to match 54 | self.curr_n_food = self.metadata['curr_n_food'] 55 | 56 | self.food_site_ids = np.array([sim.model.site_name2id(f'food{i}') 57 | for i in range(self.curr_n_food)]) 58 | # Reset food healths 59 | self.food_healths = np.ones((self.curr_n_food, 1)) * self.max_food_health 60 | self.eat_per_food = np.zeros((self.curr_n_food, 1)) 61 | 62 | # Reset food size 63 | self.respawn_counters = np.zeros((self.curr_n_food,)) 64 | 65 | self.curr_reward_scale = np.random.uniform(self.reward_scale[0], self.reward_scale[1]) 66 | 67 | return self.observation(obs) 68 | 69 | def observation(self, obs): 70 | # Add food position and healths to obersvations 71 | food_pos = obs['food_pos'] 72 | obs['food_health'] = self.food_healths 73 | obs['food_obs'] = np.concatenate([food_pos, self.food_healths], 1) 74 | if self.reward_scale_obs: 75 | obs['food_obs'] = np.concatenate([obs['food_obs'], np.ones((self.curr_n_food, 1)) * self.curr_reward_scale], 1) 76 | obs['food_eat'] = self.eat_per_food 77 | return obs 78 | 79 | def step(self, action): 80 | action_eat_food = action.pop('action_eat_food') 81 | obs, rew, done, info = self.env.step(action) 82 | 83 | if self.curr_n_food > 0: 84 | # Eat food that is close enough 85 | dist_to_food = np.linalg.norm(obs['agent_pos'][:, None] - obs['food_pos'][None], axis=-1) 86 | eat = np.logical_and(dist_to_food < self.eat_thresh, self.food_healths.T > 0) 87 | eat = np.logical_and(eat, action_eat_food).astype(np.float32) 88 | if self.split_eat_between_agents: 89 | eat_per_food = np.sum(eat, 0) 90 | eat[:, eat_per_food > 0] /= eat_per_food[eat_per_food > 0] 91 | eat_per_food = np.sum(eat, 0) 92 | 93 | # Make sure that all agents 
can't have the last bite of food. 94 | # At that point, food is split evenly 95 | over_eat = self.food_healths[:, 0] < eat_per_food 96 | eat[:, over_eat] *= (self.food_healths[over_eat, 0] / eat_per_food[over_eat]) 97 | eat_per_food = np.sum(eat, 0) 98 | self.eat_per_food = eat_per_food[:, None] 99 | 100 | # Update food healths and sizes 101 | self.food_healths -= eat_per_food[:, None] 102 | health_diff = eat_per_food[:, None] 103 | size_diff = health_diff * (self.max_food_size / self.max_food_health) 104 | size = self.unwrapped.sim.model.site_size[self.food_site_ids] - size_diff 105 | size = np.maximum(0, size) 106 | self.unwrapped.sim.model.site_size[self.food_site_ids] = size 107 | 108 | self.food_healths[self.respawn_counters == self.respawn_time] = self.max_food_health 109 | self.unwrapped.sim.model.site_size[self.food_site_ids[self.respawn_counters == self.respawn_time]] = self.max_food_size 110 | self.respawn_counters[self.food_healths[:, 0] == 0] += 1 111 | self.respawn_counters[self.food_healths[:, 0] != 0] = 0 112 | 113 | assert np.all(self.food_healths >= 0), \ 114 | f"There is a food health below 0: {self.food_healths}" 115 | 116 | # calculate food reward 117 | if self.food_rew_type == 'selfish': 118 | food_rew = np.sum(eat, axis=1) 119 | elif self.food_rew_type == 'joint_mean': 120 | food_rew = np.sum(eat, axis=1) 121 | team_index = self.metadata['team_index'] 122 | for team_index_number in np.unique(team_index): 123 | food_rew[team_index == team_index_number] = np.mean(food_rew[team_index == team_index_number]) 124 | else: 125 | raise ValueError(f"Food reward type {self.food_rew_type} unknown.") 126 | else: 127 | food_rew = 0.0 128 | 129 | info['agents_eat'] = eat 130 | rew += food_rew * self.curr_reward_scale 131 | return self.observation(obs), rew, done, info 132 | 133 | 134 | class AlwaysEatWrapper(gym.ActionWrapper): 135 | ''' 136 | Remove eat action and replace it with always eating. 137 | Args: 138 | agent_idx_allowed (ndarray): indicies of agents allowed to eat. 139 | ''' 140 | def __init__(self, env, agent_idx_allowed): 141 | super().__init__(env) 142 | self.action_space.spaces.pop('action_eat_food') 143 | self.agent_idx_allowed = agent_idx_allowed 144 | 145 | def action(self, action): 146 | action['action_eat_food'] = np.zeros((self.metadata['n_agents'], self.metadata['curr_n_food'])) 147 | action['action_eat_food'][self.agent_idx_allowed] = 1. 
148 | return action 149 | -------------------------------------------------------------------------------- /mae_envs/wrappers/lidar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from mujoco_worldgen.util.rotation import quat_from_angle_and_axis 4 | from mujoco_worldgen.util.geometry import raycast 5 | from mae_envs.wrappers.util import update_obs_space 6 | 7 | 8 | class Lidar(gym.ObservationWrapper): 9 | ''' 10 | Creates LIDAR-type observations based on Mujoco raycast 11 | 12 | Args: 13 | n_lidar_per_agent (int): Number of concentric lidar rays per agent 14 | lidar_range (float): Maximum range of lidar 15 | compress_lidar_scale (float): Scale for non-linear compression of 16 | lidar range 17 | visualize_lidar (bool): If true, visualize lidar using thin cylinders 18 | representing lidar rays (requires environment to create corresponding 19 | sites) 20 | ''' 21 | def __init__(self, env, n_lidar_per_agent=30, lidar_range=6.0, 22 | compress_lidar_scale=None, visualize_lidar=False): 23 | super().__init__(env) 24 | self.n_lidar_per_agent = n_lidar_per_agent 25 | self.lidar_range = lidar_range 26 | self.compress_lidar_scale = compress_lidar_scale 27 | self.visualize_lidar = visualize_lidar 28 | self.n_agents = self.unwrapped.n_agents 29 | 30 | self.observation_space = update_obs_space( 31 | env, {'lidar': (self.n_agents, self.n_lidar_per_agent, 1)}) 32 | 33 | # generate concentric lidar rays centered at origin 34 | self.lidar_angles = np.linspace(0, 2*np.pi, num=self.n_lidar_per_agent, endpoint=False) 35 | self.lidar_rays = self.lidar_range * np.array([np.cos(self.lidar_angles), 36 | np.sin(self.lidar_angles), 37 | np.zeros_like(self.lidar_angles)]).T 38 | self.lidar_rays = self.lidar_rays[None, :] 39 | 40 | def reset(self): 41 | obs = self.env.reset() 42 | 43 | sim = self.unwrapped.sim 44 | 45 | # Cache ids 46 | self.agent_body_ids = np.array([sim.model.body_name2id(f"agent{i}:particle") 47 | for i in range(self.n_agents)]) 48 | self.agent_geom_ids = np.array([sim.model.geom_name2id(f'agent{i}:agent') 49 | for i in range(self.n_agents)]) 50 | 51 | if self.visualize_lidar: 52 | self.lidar_ids = np.array([[sim.model.site_name2id(f"agent{i}:lidar{j}") 53 | for j in range(self.n_lidar_per_agent)] 54 | for i in range(self.n_agents)]) 55 | 56 | return self.observation(obs) 57 | 58 | def place_lidar_ray_markers(self, agent_pos, lidar_endpoints): 59 | sim = self.unwrapped.sim 60 | 61 | site_offset = sim.data.site_xpos[self.lidar_ids, :] - sim.model.site_pos[self.lidar_ids, :] 62 | 63 | # compute location of lidar rays 64 | sim.model.site_pos[self.lidar_ids, :] = .5 * (agent_pos[:, None, :] + lidar_endpoints) - site_offset 65 | 66 | # compute length of lidar rays 67 | rel_vec = lidar_endpoints - agent_pos[:, None, :] 68 | rel_vec_length = np.linalg.norm(rel_vec, axis=-1) 69 | sim.model.site_size[self.lidar_ids, 1] = rel_vec_length / 2 70 | 71 | # compute rotation of lidar rays 72 | # normalize relative vector 73 | rel_vec_norm = rel_vec / rel_vec_length[:, :, None] 74 | # set small relative vectors to zero instead 75 | rel_vec_norm[rel_vec_length <= 1e-8, :] = 0.0 76 | # start vector 77 | start_vec = np.array([0.0, 0.0, 1.0]) 78 | # calculate rotation axis: cross product between start and goal vector 79 | rot_axis = np.cross(start_vec, rel_vec_norm) 80 | norm_rot_axis = np.linalg.norm(rot_axis, axis=-1) 81 | 82 | # calculate rotation angle and quaternion 83 | rot_angle = np.arctan2(norm_rot_axis, np.dot(rel_vec_norm, 
start_vec)) 84 | quat = quat_from_angle_and_axis(rot_angle, rot_axis) 85 | 86 | # if norm of cross product is very small, set rotation to identity 87 | eps = 1e-3 88 | quat[norm_rot_axis <= eps, :] = np.array([1.0, 0.0, 0.0, 0.0]) 89 | 90 | sim.model.site_quat[self.lidar_ids, :] = quat 91 | 92 | def observation(self, obs): 93 | sim = self.unwrapped.sim 94 | agent_pos = sim.data.body_xpos[self.agent_body_ids] 95 | 96 | lidar_endpoints = agent_pos[:, None, :] + self.lidar_rays 97 | 98 | # Would be nice to vectorize in the future with better mujoco-py interface 99 | lidar = np.zeros((self.n_agents, self.n_lidar_per_agent)) 100 | for i in range(self.n_agents): 101 | for j in range(self.n_lidar_per_agent): 102 | lidar[i, j] = raycast(sim, geom1_id=self.agent_geom_ids[i], 103 | pt2=lidar_endpoints[i, j], geom_group=None)[0] 104 | 105 | lidar[lidar < 0.0] = self.lidar_range 106 | 107 | if self.compress_lidar_scale is not None: 108 | obs['lidar'] = (self.compress_lidar_scale * 109 | np.tanh(lidar[..., None] / self.compress_lidar_scale)) 110 | else: 111 | obs['lidar'] = lidar[..., None] 112 | 113 | if self.visualize_lidar: 114 | # recalculate lidar endpoints 115 | lidar_endpoints = agent_pos[:, None, :] + \ 116 | lidar[:, :, None] / self.lidar_range * self.lidar_rays 117 | self.place_lidar_ray_markers(agent_pos, lidar_endpoints) 118 | sim.model.site_rgba[self.lidar_ids, :] = np.array([0.0, 0.0, 1.0, 0.2]) 119 | sim.forward() 120 | 121 | return obs 122 | -------------------------------------------------------------------------------- /mae_envs/wrappers/limit_mvmnt.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | 5 | class RestrictAgentsRect(gym.RewardWrapper): 6 | ''' 7 | Give subset of agents negative reward if they leave a given area 8 | Args: 9 | restrict_rect (list of four floats): coordinates of rectangle 10 | defined as [x_min, y_min, x_max, y_max] 11 | reward_scale (float): reward for going out of bounds is -reward_scale at each 12 | timestep 13 | penalize_objects_out (bool): If true, penalizes all agents whenever an object is 14 | outside the specified area. 
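A minimal sketch of the bounds test behind the penalty (toy numbers, not taken from the wrapper itself): an agent is out of bounds when its xy position deviates from the rectangle's centre by more than half the rectangle's size along either axis.

    import numpy as np
    restrict_rect = np.array([-2.0, -2.0, 2.0, 2.0])             # [x_min, y_min, x_max, y_max]
    rect_middle = 0.5 * np.array([restrict_rect[0] + restrict_rect[2],
                                  restrict_rect[1] + restrict_rect[3]])
    rect_size = np.array([restrict_rect[2] - restrict_rect[0],
                          restrict_rect[3] - restrict_rect[1]])
    agent_pos = np.array([[0.0, 0.0], [3.5, 1.0]])               # second agent is outside in x
    outside = np.any(np.abs(agent_pos - rect_middle) > (rect_size / 2), axis=1)
    reward = np.zeros(2)
    reward[outside] = -10.0                                      # i.e. -reward_scale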
15 | ''' 16 | def __init__(self, env, restrict_rect, reward_scale=10., penalize_objects_out=False): 17 | super().__init__(env) 18 | self.n_agents = self.unwrapped.n_agents 19 | self.restrict_rect = np.array(restrict_rect) 20 | self.reward_scale = reward_scale 21 | self.penalize_objects_out = penalize_objects_out 22 | 23 | assert len(self.restrict_rect) == 4, \ 24 | "Restriction rectangle must be of format [x_min, y_min, x_max, y_max]" 25 | 26 | self.rect_middle = 0.5 * np.array([restrict_rect[0] + restrict_rect[2], 27 | restrict_rect[1] + restrict_rect[3]]) 28 | 29 | self.rect_size = np.array([restrict_rect[2] - restrict_rect[0], 30 | restrict_rect[3] - restrict_rect[1]]) 31 | 32 | def reset(self): 33 | obs = self.env.reset() 34 | sim = self.unwrapped.sim 35 | self.agent_body_idxs = np.array([sim.model.body_name2id(f"agent{i}:particle") 36 | for i in range(self.n_agents)]) 37 | if self.penalize_objects_out: 38 | obj_body_idxs = ([sim.model.body_name2id(f'moveable_box{i}') for i in np.where(self.metadata['curr_n_boxes'])[0]] + 39 | [sim.model.body_name2id(f'ramp{i}:ramp') for i in np.where(self.metadata['curr_n_ramps'])[0]]) 40 | self.obj_body_idxs = np.array(obj_body_idxs) 41 | 42 | return obs 43 | 44 | def reward(self, reward): 45 | sim = self.unwrapped.sim 46 | agent_pos = sim.data.body_xpos[self.agent_body_idxs, :2] 47 | outside_rect = np.any(np.abs(agent_pos - self.rect_middle) > (self.rect_size / 2), axis=1) 48 | if self.penalize_objects_out: 49 | obj_pos = sim.data.body_xpos[self.obj_body_idxs, :2] 50 | any_obj_outside_rect = np.any(np.abs(obj_pos - self.rect_middle) > (self.rect_size / 2)) 51 | if any_obj_outside_rect: 52 | reward[:] = - self.reward_scale 53 | reward[outside_rect] = - self.reward_scale 54 | 55 | return reward 56 | -------------------------------------------------------------------------------- /mae_envs/wrappers/line_of_sight.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from mae_envs.util.vision import insight, in_cone2d 4 | from mae_envs.wrappers.util import update_obs_space 5 | 6 | 7 | class AgentAgentObsMask2D(gym.ObservationWrapper): 8 | """ Adds an mask observation that states which agents are visible to which agents. 9 | Args: 10 | cone_angle: (float) the angle in radians btw the axis and edge of the observation cone 11 | """ 12 | def __init__(self, env, cone_angle=3/8 * np.pi): 13 | super().__init__(env) 14 | self.cone_angle = cone_angle 15 | self.n_agents = self.unwrapped.n_agents 16 | self.observation_space = update_obs_space(env, {'mask_aa_obs': (self.n_agents, self.n_agents)}) 17 | 18 | def observation(self, obs): 19 | # Agent to agent obs mask 20 | agent_pos2d = obs['agent_pos'][:, :-1] 21 | agent_angle = obs['agent_angle'] 22 | cone_mask = in_cone2d(agent_pos2d, np.squeeze(agent_angle, -1), self.cone_angle, agent_pos2d) 23 | # Make sure they are in line of sight 24 | for i, j in np.argwhere(cone_mask): 25 | if i != j: 26 | cone_mask[i, j] = insight(self.unwrapped.sim, 27 | self.metadata['agent_geom_idxs'][i], 28 | self.metadata['agent_geom_idxs'][j]) 29 | obs['mask_aa_obs'] = cone_mask 30 | return obs 31 | 32 | 33 | class AgentSiteObsMask2D(gym.ObservationWrapper): 34 | """ Adds an mask observation that states which sites are visible to which agents. 
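The visibility test has two parts: a 2D view-cone check and a raycast for line of sight. Below is a toy, scalar version of the cone check only (the real, vectorised check is in_cone2d from mae_envs/util/vision.py, followed by the insight raycast); it is an illustration, not the library function.

    import numpy as np
    cone_angle = 3 / 8 * np.pi
    agent_pos, agent_angle = np.array([0.0, 0.0]), 0.0           # agent at origin facing +x
    targets = np.array([[1.0, 0.2], [-1.0, 0.0]])                # one target in front, one behind
    to_target = targets - agent_pos
    bearing = np.arctan2(to_target[:, 1], to_target[:, 0])
    angle_diff = np.abs(np.arctan2(np.sin(bearing - agent_angle), np.cos(bearing - agent_angle)))
    in_cone = angle_diff < cone_angle                            # array([ True, False])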
35 | Args: 36 | pos_obs_key: (string) the name of the site position observation of shape (n_sites, 3) 37 | mask_obs_key: (string) the name of the mask observation to output 38 | cone_angle: (float) the angle in radians btw the axis and edge of the observation cone 39 | """ 40 | def __init__(self, env, pos_obs_key, mask_obs_key, cone_angle=3/8 * np.pi): 41 | super().__init__(env) 42 | self.cone_angle = cone_angle 43 | self.n_agents = self.unwrapped.n_agents 44 | assert(self.n_agents == self.observation_space.spaces['agent_pos'].shape[0]) 45 | self.n_objects = self.observation_space.spaces[pos_obs_key].shape[0] 46 | self.observation_space = update_obs_space(env, {mask_obs_key: (self.n_agents, self.n_objects)}) 47 | self.pos_obs_key = pos_obs_key 48 | self.mask_obs_key = mask_obs_key 49 | 50 | def observation(self, obs): 51 | agent_pos2d = obs['agent_pos'][:, :-1] 52 | agent_angle = obs['agent_angle'] 53 | pos2d = obs[self.pos_obs_key][:, :2] 54 | cone_mask = in_cone2d(agent_pos2d, np.squeeze(agent_angle, -1), self.cone_angle, pos2d) 55 | # Make sure they are in line of sight 56 | for i, j in np.argwhere(cone_mask): 57 | agent_geom_id = self.metadata['agent_geom_idxs'][i] 58 | pt2 = obs[self.pos_obs_key][j] 59 | cone_mask[i, j] = insight(self.unwrapped.sim, agent_geom_id, pt2=pt2) 60 | obs[self.mask_obs_key] = cone_mask 61 | return obs 62 | 63 | 64 | class AgentGeomObsMask2D(gym.ObservationWrapper): 65 | """ Adds an mask observation that states which geoms are visible to which agents. 66 | Args: 67 | pos_obs_key: (string) the name of the site position observation of shape (n_geoms, 3) 68 | geom_idxs_obs_key: (string) the name of an observation that, for each object to be 69 | masked, gives the Mujoco index of the geom (e.g. in sim.geom_names) 70 | as an array of shape (n_geoms, 1) 71 | mask_obs_key: (string) the name of the mask observation to output 72 | cone_angle: (float) the angle in radians btw the axis and edge of the observation cone 73 | """ 74 | def __init__(self, env, pos_obs_key, geom_idxs_obs_key, mask_obs_key, cone_angle=3/8 * np.pi): 75 | super().__init__(env) 76 | self.cone_angle = cone_angle 77 | self.n_agents = self.unwrapped.n_agents 78 | assert(self.n_agents == self.observation_space.spaces['agent_pos'].shape[0]) 79 | self.n_objects = self.observation_space.spaces[pos_obs_key].shape[0] 80 | self.observation_space = update_obs_space(env, {mask_obs_key: (self.n_agents, self.n_objects)}) 81 | self.pos_obs_key = pos_obs_key 82 | self.mask_obs_key = mask_obs_key 83 | self.geom_idxs_obs_key = geom_idxs_obs_key 84 | 85 | def observation(self, obs): 86 | agent_pos2d = obs['agent_pos'][:, :-1] 87 | agent_angle = obs['agent_angle'] 88 | pos2d = obs[self.pos_obs_key][:, :2] 89 | cone_mask = in_cone2d(agent_pos2d, np.squeeze(agent_angle, -1), self.cone_angle, pos2d) 90 | # Make sure they are in line of sight 91 | for i, j in np.argwhere(cone_mask): 92 | agent_geom_id = self.metadata['agent_geom_idxs'][i] 93 | geom_id = obs[self.geom_idxs_obs_key][j, 0] 94 | if geom_id == -1: 95 | # This option is helpful if the number of geoms varies between episodes 96 | # If geoms don't exists this wrapper expects that the geom idx is 97 | # set to -1 98 | cone_mask[i, j] = 0 99 | else: 100 | cone_mask[i, j] = insight(self.unwrapped.sim, agent_geom_id, geom2_id=geom_id) 101 | obs[self.mask_obs_key] = cone_mask 102 | return obs 103 | -------------------------------------------------------------------------------- /mae_envs/wrappers/multi_agent.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from scipy.linalg import circulant 4 | from gym.spaces import Tuple, Box, Dict 5 | from copy import deepcopy 6 | 7 | 8 | class SplitMultiAgentActions(gym.ActionWrapper): 9 | ''' 10 | Splits mujoco generated actions into a dict of tuple actions. 11 | ''' 12 | def __init__(self, env): 13 | super().__init__(env) 14 | self.n_agents = self.metadata['n_actors'] 15 | lows = np.split(self.action_space.low, self.n_agents) 16 | highs = np.split(self.action_space.high, self.n_agents) 17 | self.action_space = Dict({ 18 | 'action_movement': Tuple([Box(low=low, high=high, dtype=self.action_space.dtype) 19 | for low, high in zip(lows, highs)]) 20 | }) 21 | 22 | def action(self, action): 23 | return action['action_movement'].flatten() 24 | 25 | 26 | class JoinMultiAgentActions(gym.ActionWrapper): 27 | def __init__(self, env): 28 | super().__init__(env) 29 | self.n_agents = self.metadata['n_actors'] 30 | low = np.concatenate([space.low for space in self.action_space.spaces]) 31 | high = np.concatenate([space.high for space in self.action_space.spaces]) 32 | self.action_space = Box(low=low, high=high, dtype=self.action_space.spaces[0].dtype) 33 | 34 | def action(self, action): 35 | # action should be a tuple of different agent actions 36 | return np.split(action, self.n_agents) 37 | 38 | 39 | class SplitObservations(gym.ObservationWrapper): 40 | """ 41 | Split observations for each agent. 42 | Args: 43 | keys_self: list of observation names which are agent specific. E.g. this will 44 | permute qpos such that each agent sees its own qpos as the first numbers 45 | keys_copy: list of observation names that are just passed down as is 46 | keys_self_matrices: list of observation names that should be (n_agent, n_agent, dim) where 47 | each agent has a custom observation of another agent. This is different from self_keys 48 | in that self_keys we assume that observations are symmetric, whereas these can represent 49 | unique pairwise interactions/observations 50 | """ 51 | def __init__(self, env, keys_self, keys_copy=[], keys_self_matrices=[]): 52 | super().__init__(env) 53 | self.keys_self = sorted(keys_self) 54 | self.keys_copy = sorted(keys_copy) 55 | self.keys_self_matrices = sorted(keys_self_matrices) 56 | self.n_agents = self.metadata['n_agents'] 57 | new_spaces = {} 58 | for k, v in self.observation_space.spaces.items(): 59 | # If obs is a self obs, then we only want to include other agents obs, 60 | # as we will pass the self obs separately. 61 | assert len(v.shape) > 1, f'Obs {k} has shape {v.shape}' 62 | if 'mask' in k and k not in self.keys_self_matrices: 63 | new_spaces[k] = v 64 | elif k in self.keys_self_matrices: 65 | new_spaces[k] = Box(low=v.low[:, 1:], high=v.high[:, 1:], dtype=v.dtype) 66 | elif k in self.keys_self: 67 | assert v.shape[0] == self.n_agents, \ 68 | f"For self obs, obs dim 0 should equal number of agents. 
{k} has shape {v.shape}" 69 | obs_shape = (v.shape[0], self.n_agents - 1, v.shape[1]) 70 | lows = np.tile(v.low, self.n_agents - 1).reshape(obs_shape) 71 | highs = np.tile(v.high, self.n_agents - 1).reshape(obs_shape) 72 | new_spaces[k] = Box(low=lows, high=highs, dtype=v.dtype) 73 | elif k in self.keys_copy: 74 | new_spaces[k] = deepcopy(v) 75 | else: 76 | obs_shape = (v.shape[0], self.n_agents, v.shape[1]) 77 | lows = np.tile(v.low, self.n_agents).reshape(obs_shape).transpose((1, 0, 2)) 78 | highs = np.tile(v.high, self.n_agents).reshape(obs_shape).transpose((1, 0, 2)) 79 | new_spaces[k] = Box(low=lows, high=highs, dtype=v.dtype) 80 | 81 | for k in self.keys_self: 82 | new_spaces[k + '_self'] = self.observation_space.spaces[k] 83 | 84 | self.observation_space = Dict(new_spaces) 85 | 86 | def observation(self, obs): 87 | new_obs = {} 88 | for k, v in obs.items(): 89 | # Masks that aren't self matrices should just be copied 90 | if 'mask' in k and k not in self.keys_self_matrices: 91 | new_obs[k] = obs[k] 92 | # Circulant self matrices 93 | elif k in self.keys_self_matrices: 94 | new_obs[k] = self._process_self_matrix(obs[k]) 95 | # Circulant self keys 96 | elif k in self.keys_self: 97 | new_obs[k + '_self'] = obs[k] 98 | new_obs[k] = obs[k][circulant(np.arange(self.n_agents))] 99 | new_obs[k] = new_obs[k][:, 1:, :] # Remove self observation 100 | elif k in self.keys_copy: 101 | new_obs[k] = obs[k] 102 | # Everything else should just get copied for each agent (e.g. external obs) 103 | else: 104 | new_obs[k] = np.tile(v, self.n_agents).reshape([v.shape[0], self.n_agents, v.shape[1]]).transpose((1, 0, 2)) 105 | 106 | return new_obs 107 | 108 | def _process_self_matrix(self, self_matrix): 109 | ''' 110 | self_matrix will be a (n_agent, n_agent) boolean matrix. Permute each row such that the matrix is consistent with 111 | the circulant permutation used for self observations. E.g. this should be used for agent agent masks 112 | ''' 113 | assert np.all(self_matrix.shape[:2] == np.array((self.n_agents, self.n_agents))), \ 114 | f"The first two dimensions of {self_matrix} were not (n_agents, n_agents)" 115 | 116 | new_mat = self_matrix.copy() 117 | # Permute each row to the right by one more than the previous 118 | # E.g., [[1,2],[3,4]] -> [[1,2],[4,3]] 119 | idx = circulant(np.arange(self.n_agents)) 120 | new_mat = new_mat[np.arange(self.n_agents)[:, None], idx] 121 | new_mat = new_mat[:, 1:] # Remove self observation 122 | return new_mat 123 | 124 | 125 | class SelectKeysWrapper(gym.ObservationWrapper): 126 | """ 127 | Select keys for final observations. 128 | Expects that all observations come in shape (n_agents, n_objects, n_dims) 129 | Args: 130 | keys_self (list): observation names that are specific to an agent 131 | These will be concatenated into 'observation_self' observation 132 | keys_other (list): observation names that should be passed through 133 | flatten (bool): if true, internal and external observations 134 | """ 135 | 136 | def __init__(self, env, keys_self, keys_other, flatten=False): 137 | super().__init__(env) 138 | self.keys_self = sorted([k + '_self' for k in keys_self]) 139 | self.keys_other = sorted(keys_other) 140 | self.flatten = flatten 141 | 142 | # Change observation space to look like a single agent observation space. 
143 | # This makes constructing policies much easier 144 | if flatten: 145 | size_self = sum([np.prod(self.env.observation_space.spaces[k].shape[1:]) 146 | for k in self.keys_self + self.keys_other]) 147 | self.observation_space = Dict( 148 | {'observation_self': Box(-np.inf, np.inf, (size_self,), np.float32)}) 149 | else: 150 | size_self = sum([self.env.observation_space.spaces[k].shape[1] 151 | for k in self.keys_self]) 152 | obs_self = {'observation_self': Box(-np.inf, np.inf, (size_self,), np.float32)} 153 | obs_extern = {k: Box(-np.inf, np.inf, v.shape[1:], np.float32) 154 | for k, v in self.observation_space.spaces.items() 155 | if k in self.keys_other} 156 | obs_self.update(obs_extern) 157 | self.observation_space = Dict(obs_self) 158 | 159 | def observation(self, observation): 160 | if self.flatten: 161 | other_obs = [observation[k].reshape((observation[k].shape[0], -1)) 162 | for k in self.keys_other] 163 | obs = np.concatenate([observation[k] for k in self.keys_self] + other_obs, axis=-1) 164 | return {'observation_self': obs} 165 | else: 166 | obs = np.concatenate([observation[k] for k in self.keys_self], -1) 167 | obs = {'observation_self': obs} 168 | other_obs = {k: v for k, v in observation.items() if k in self.keys_other} 169 | obs.update(other_obs) 170 | return obs 171 | -------------------------------------------------------------------------------- /mae_envs/wrappers/prep_phase.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from copy import deepcopy 4 | from mae_envs.wrappers.util import update_obs_space 5 | 6 | 7 | class PreparationPhase(gym.Wrapper): 8 | ''' 9 | Rewards are switched off during preparation. 10 | 11 | Args: prep_fraction (float): Fraction of total time that is preparation time 12 | ''' 13 | def __init__(self, env, prep_fraction=.2): 14 | super().__init__(env) 15 | self.prep_fraction = prep_fraction 16 | self.prep_time = self.prep_fraction * self.unwrapped.horizon 17 | self.n_agents = self.metadata['n_agents'] 18 | self.step_counter = 0 19 | self.observation_space = update_obs_space(self, {'prep_obs': [self.n_agents, 1]}) 20 | 21 | def reset(self): 22 | self.step_counter = 0 23 | self.in_prep_phase = True 24 | return self.observation(self.env.reset()) 25 | 26 | def reward(self, reward): 27 | if self.in_prep_phase: 28 | reward = np.zeros_like(reward) 29 | 30 | return reward 31 | 32 | def observation(self, obs): 33 | obs['prep_obs'] = (np.ones((self.n_agents, 1)) * 34 | np.minimum(1.0, self.step_counter / (self.prep_time + 1e-5))) 35 | 36 | return obs 37 | 38 | def step(self, action): 39 | obs, rew, done, info = self.env.step(action) 40 | rew = self.reward(rew) 41 | self.step_counter += 1 42 | self.in_prep_phase = self.step_counter < self.prep_time 43 | info['in_prep_phase'] = self.in_prep_phase 44 | 45 | return self.observation(obs), rew, done, info 46 | 47 | 48 | class NoActionsInPrepPhase(gym.Wrapper): 49 | '''Agents have all actions turned off during preparation phase. 
50 | For MultiDiscrete and Discrete, assumes zero action is the rounded down middle action''' 51 | 52 | def __init__(self, env, agent_idxs): 53 | super().__init__(env) 54 | self.agent_idxs = np.array(agent_idxs) 55 | 56 | def reset(self): 57 | obs = self.env.reset() 58 | self.in_prep_phase = True 59 | return obs 60 | 61 | def step(self, action): 62 | obs, rew, done, info = self.env.step(self.action(action)) 63 | self.in_prep_phase = info['in_prep_phase'] 64 | return obs, rew, done, info 65 | 66 | def action(self, action): 67 | ac = deepcopy(action) 68 | if self.in_prep_phase: 69 | for k, space in self.action_space.spaces.items(): 70 | _space = space.spaces[0] 71 | if isinstance(_space, gym.spaces.MultiDiscrete): 72 | zero_ac = (_space.nvec - 1) // 2 73 | elif isinstance(_space, gym.spaces.Discrete): 74 | zero_ac = (_space.n - 1) // 2 75 | else: 76 | zero_ac = 0.0 77 | ac[k][self.agent_idxs] = zero_ac 78 | 79 | return ac 80 | 81 | 82 | class MaskPrepPhaseAction(gym.Wrapper): 83 | ''' 84 | Masks a (binary) action during preparation phase 85 | ''' 86 | def __init__(self, env, action_key): 87 | super().__init__(env) 88 | self.action_key = action_key 89 | 90 | def reset(self): 91 | obs = self.env.reset() 92 | self.in_prep_phase = True 93 | return obs 94 | 95 | def step(self, action): 96 | action[self.action_key] = (action[self.action_key] * (1 - self.in_prep_phase)).astype(bool) 97 | 98 | obs, rew, done, info = self.env.step(action) 99 | self.in_prep_phase = info['in_prep_phase'] 100 | 101 | return obs, rew, done, info 102 | -------------------------------------------------------------------------------- /mae_envs/wrappers/team.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from mae_envs.wrappers.util import update_obs_space 4 | 5 | 6 | class TeamMembership(gym.ObservationWrapper): 7 | ''' 8 | This wrapper just stores team membership information at initialization. 9 | The information is stored as a key in the self.metadata property, which ensures 10 | that it is available even if this wrapper is not on top of the wrapper 11 | hierarchy. 12 | 13 | Arguments: 14 | team_index: list/numpy vector of team membership index 15 | length must be equal to number of agents 16 | e.g. [0,0,0,1,1,1] means first 3 agents are in team 0, 17 | second 3 agents in team 1 18 | n_teams: if team_index is None, agents are split in n_teams number 19 | of teams, with as equal team sizes as possible. 20 | if team_index is set, this argument is ignored 21 | 22 | One planned use of this wrapper is to evaluate the "TrueSkill" score 23 | during training, which requires knowing which agent belongs to which team 24 | 25 | Note: This wrapper currently does not align the reward structure with the 26 | teams, but that could be easily implemented if desired. 
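For concreteness, the default splitting rule can be reproduced in a few lines (standalone sketch mirroring the np.array_split logic used in __init__ below):

    import numpy as np
    n_agents, n_teams = 5, 3
    parts = np.array_split(np.arange(n_agents), n_teams)
    team_index = np.concatenate([np.ones_like(p) * i for i, p in enumerate(parts)])
    # team_index -> array([0, 0, 1, 1, 2])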
27 | ''' 28 | def __init__(self, env, team_index=None, n_teams=2): 29 | super().__init__(env) 30 | self.n_agents = self.metadata['n_actors'] 31 | 32 | if team_index is None: 33 | assert n_teams >= 1, "Number of teams must be at least 1" 34 | # split teams: 5 agents and 3 teams will result in team_index = [0,0,1,1,2] 35 | team_index = np.array_split(np.arange(self.n_agents), n_teams) 36 | team_index = np.concatenate([np.ones_like(ar) * i for i, ar in enumerate(team_index)]) 37 | 38 | assert len(team_index) == self.n_agents, ( 39 | "team_index parameter length must be equal to number of agents") 40 | if isinstance(team_index, np.ndarray): 41 | assert team_index.ndim == 1, ( 42 | "team_index parameter must be numpy array of dimension 1") 43 | 44 | # store in metadata property that gets automatically inherited 45 | # make sure we copy value of team_index if it's a numpy array 46 | self.metadata['team_index'] = np.array(team_index) 47 | self.team_idx = np.array(team_index) 48 | self.observation_space = update_obs_space(env, {'team_size': (self.n_agents, 1)}) 49 | 50 | def observation(self, obs): 51 | obs['team_size'] = np.sum(self.team_idx[:, None] == self.team_idx[None, :], 52 | axis=1, keepdims=True) 53 | return obs 54 | -------------------------------------------------------------------------------- /mae_envs/wrappers/util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from mujoco_py import MujocoException 3 | from gym.spaces import Dict, Box 4 | import numpy as np 5 | from copy import deepcopy 6 | import logging 7 | 8 | 9 | def update_obs_space(env, delta): 10 | spaces = env.observation_space.spaces.copy() 11 | for key, shape in delta.items(): 12 | spaces[key] = Box(-np.inf, np.inf, shape, np.float32) 13 | return Dict(spaces) 14 | 15 | 16 | class NumpyArrayRewardWrapper(gym.RewardWrapper): 17 | """ 18 | Convenience wrapper that casts rewards to the multiagent format 19 | (numpy array of shape (n_agents,)) 20 | """ 21 | def __init__(self, env): 22 | super().__init__(env) 23 | 24 | def reward(self, rew): 25 | return np.zeros((self.unwrapped.n_agents,)) + rew 26 | 27 | 28 | class DiscretizeActionWrapper(gym.ActionWrapper): 29 | ''' 30 | Take a Box action and convert it to a MultiDiscrete Action through quantization 31 | Args: 32 | action_key: (string) action to discretize 33 | nbuckets: (int) number of discrete actions per dimension. It should be odd such 34 | that actions centered around 0 will have the middle action be 0. 
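A small worked example of the quantization (toy Box bounds, not tied to a specific env): with nbuckets=11 on a [-1, 1] dimension, discrete action 5 maps back to exactly 0, which is why an odd bucket count preserves a true "do nothing" action.

    import numpy as np
    low, high, nbuckets = -1.0, 1.0, 11
    action_map = np.linspace(low, high, nbuckets)       # [-1.0, -0.8, ..., 0.8, 1.0]
    continuous_action = action_map[5]                   # 0.0, the centre bucket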
35 | ''' 36 | def __init__(self, env, action_key, nbuckets=11): 37 | super().__init__(env) 38 | self.action_key = action_key 39 | self.discrete_to_continuous_act_map = [] 40 | for i, ac_space in enumerate(self.action_space.spaces[action_key].spaces): 41 | assert isinstance(ac_space, Box) 42 | action_map = np.array([np.linspace(low, high, nbuckets) 43 | for low, high in zip(ac_space.low, ac_space.high)]) 44 | _nbuckets = np.ones((len(action_map))) * nbuckets 45 | self.action_space.spaces[action_key].spaces[i] = gym.spaces.MultiDiscrete(_nbuckets) 46 | self.discrete_to_continuous_act_map.append(action_map) 47 | self.discrete_to_continuous_act_map = np.array(self.discrete_to_continuous_act_map) 48 | 49 | def action(self, action): 50 | action = deepcopy(action) 51 | ac = action[self.action_key] 52 | 53 | # helper variables for indexing the discrete-to-continuous action map 54 | agent_idxs = np.tile(np.arange(ac.shape[0])[:, None], ac.shape[1]) 55 | ac_idxs = np.tile(np.arange(ac.shape[1]), ac.shape[0]).reshape(ac.shape) 56 | 57 | action[self.action_key] = self.discrete_to_continuous_act_map[agent_idxs, ac_idxs, ac] 58 | return action 59 | 60 | 61 | class DiscardMujocoExceptionEpisodes(gym.Wrapper): 62 | ''' 63 | Catches Mujoco Exceptions. Sends signal to discard Episode. 64 | ''' 65 | def __init__(self, env): 66 | super().__init__(env) 67 | self.episode_error = False 68 | 69 | def step(self, action): 70 | assert not self.episode_error, "Won't Continue Episode After Mujoco Exception -- \ 71 | Please discard episode and reset. If info['discard_episode'] is True the episode\ 72 | should be discarded" 73 | try: 74 | obs, rew, done, info = self.env.step(action) 75 | info['discard_episode'] = False 76 | except MujocoException as e: 77 | self.episode_error = True 78 | # Done is set to False such that rollout workers do not accidently send data in 79 | # the event that timelimit is up in the same step as an error occured. 80 | obs, rew, done, info = {}, 0.0, False, {'discard_episode': True} 81 | logging.info(str(e)) 82 | logging.info("Encountered Mujoco Exception During Environment Step.\ 83 | Reset Episode Required") 84 | 85 | return obs, rew, done, info 86 | 87 | def reset(self): 88 | try: 89 | obs = self.env.reset() 90 | except MujocoException: 91 | logging.info("Encountered Mujoco Exception During Environment Reset.\ 92 | Trying Reset Again") 93 | obs = self.reset() 94 | self.episode_error = False 95 | return obs 96 | 97 | 98 | class MaskActionWrapper(gym.Wrapper): 99 | ''' 100 | For a boolean action, sets it to zero given a mask from the previous step. 101 | For example you could mask the grab action based on whether you can see the box 102 | Args: 103 | action_key (string): key in action dictionary to be masked 104 | mask_keys (string): keys in observation dictionary with which to mask. 
The shape 105 | of the concatenation of the masks (along the 1st dimension) should exactly 106 | match that of action_key 107 | ''' 108 | def __init__(self, env, action_key, mask_keys): 109 | super().__init__(env) 110 | self.action_key = action_key 111 | self.mask_keys = mask_keys 112 | 113 | def reset(self): 114 | self.prev_obs = self.env.reset() 115 | return deepcopy(self.prev_obs) 116 | 117 | def step(self, action): 118 | mask = np.concatenate([self.prev_obs[k] for k in self.mask_keys], -1) 119 | action[self.action_key] = np.logical_and(action[self.action_key], mask) 120 | self.prev_obs, rew, done, info = self.env.step(action) 121 | return deepcopy(self.prev_obs), rew, done, info 122 | 123 | 124 | class AddConstantObservationsWrapper(gym.ObservationWrapper): 125 | ''' 126 | Adds new constant observations to the environment. 127 | Args: 128 | new_obs: Dictionary with the new observations. 129 | ''' 130 | def __init__(self, env, new_obs): 131 | super().__init__(env) 132 | self.new_obs = new_obs 133 | for obs_key in self.new_obs: 134 | assert obs_key not in self.observation_space.spaces, ( 135 | f'Observation key {obs_key} exists in original observation space') 136 | if type(self.new_obs[obs_key]) in [list, tuple]: 137 | self.new_obs[obs_key] = np.array(self.new_obs[obs_key]) 138 | shape = self.new_obs[obs_key].shape 139 | self.observation_space = update_obs_space(self, {obs_key: shape}) 140 | 141 | def observation(self, obs): 142 | for key, val in self.new_obs.items(): 143 | obs[key] = val 144 | return obs 145 | 146 | 147 | class SpoofEntityWrapper(gym.ObservationWrapper): 148 | ''' 149 | Add extra entities along entity dimension such that shapes can match between 150 | environments with differing number of entities. This is meant to be used 151 | after SplitObservations and SelectKeysWrapper. This will also add masks that are 152 | 1 except along the new columns (which could be used by fully observed value function) 153 | Args: 154 | total_n_entities (int): total number of entities after spoofing (including spoofed ones) 155 | keys (list): observation keys with which to add entities along the second dimension 156 | mask_keys (list): mask keys with which to add columns. 
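A shape-only sketch of the padding behaviour (hypothetical sizes, not the wrapper's own code): real entities keep their values and a mask of ones, while spoofed entities are zero-padded and masked out.

    import numpy as np
    n_agents, n_real, n_total, dim = 2, 3, 5, 4
    entity_obs = np.random.randn(n_agents, n_real, dim)
    mask = np.ones((n_agents, n_real))
    n_to_spoof = n_total - n_real
    entity_obs = np.concatenate([entity_obs, np.zeros((n_agents, n_to_spoof, dim))], 1)    # (2, 5, 4)
    mask_spoof = np.concatenate([np.ones((n_agents, n_real)), np.zeros((n_agents, n_to_spoof))], -1)
    mask = np.concatenate([mask, np.zeros((n_agents, n_to_spoof))], -1)                    # (2, 5)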
157 | ''' 158 | def __init__(self, env, total_n_entities, keys, mask_keys): 159 | super().__init__(env) 160 | self.total_n_entities = total_n_entities 161 | self.keys = keys 162 | self.mask_keys = mask_keys 163 | for key in self.keys + self.mask_keys: 164 | shape = list(self.observation_space.spaces[key].shape) 165 | shape[1] = total_n_entities 166 | self.observation_space = update_obs_space(self, {key: shape}) 167 | for key in self.mask_keys: 168 | shape = list(self.observation_space.spaces[key].shape) 169 | self.observation_space = update_obs_space(self, {key + '_spoof': shape}) 170 | 171 | def observation(self, obs): 172 | for key in self.keys: 173 | n_to_spoof = self.total_n_entities - obs[key].shape[1] 174 | if n_to_spoof > 0: 175 | obs[key] = np.concatenate([obs[key], np.zeros((obs[key].shape[0], n_to_spoof, obs[key].shape[-1]))], 1) 176 | for key in self.mask_keys: 177 | n_to_spoof = self.total_n_entities - obs[key].shape[1] 178 | obs[key + '_spoof'] = np.concatenate([np.ones_like(obs[key]), np.zeros((obs[key].shape[0], n_to_spoof))], -1) 179 | if n_to_spoof > 0: 180 | obs[key] = np.concatenate([obs[key], np.zeros((obs[key].shape[0], n_to_spoof))], -1) 181 | 182 | return obs 183 | 184 | 185 | class ConcatenateObsWrapper(gym.ObservationWrapper): 186 | ''' 187 | Group multiple observations under the same key in the observation dictionary. 188 | Args: 189 | obs_groups: dict of {key_to_save: [keys to concat]} 190 | ''' 191 | def __init__(self, env, obs_groups): 192 | super().__init__(env) 193 | self.obs_groups = obs_groups 194 | for key_to_save, keys_to_concat in obs_groups.items(): 195 | assert np.all([np.array(self.observation_space.spaces[keys_to_concat[0]].shape[:-1]) == 196 | np.array(self.observation_space.spaces[k].shape[:-1]) 197 | for k in keys_to_concat]), \ 198 | f"Spaces were {[(k, v) for k, v in self.observation_space.spaces.items() if k in keys_to_concat]}" 199 | new_last_dim = sum([self.observation_space.spaces[k].shape[-1] for k in keys_to_concat]) 200 | new_shape = list(self.observation_space.spaces[keys_to_concat[0]].shape[:-1]) + [new_last_dim] 201 | self.observation_space = update_obs_space(self, {key_to_save: new_shape}) 202 | 203 | def observation(self, obs): 204 | for key_to_save, keys_to_concat in self.obs_groups.items(): 205 | obs[key_to_save] = np.concatenate([obs[k] for k in keys_to_concat], -1) 206 | return obs 207 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/README.md: -------------------------------------------------------------------------------- 1 | # Randomized Uncertain Social Preferences 2 | We share the environment code used in the work *Emergent Reciprocity and Team Formation from Randomized Uncertain Social Preferences* (TODO: ADD LINK). 3 | 4 | The relevant code for randomized uncertain social preferences are in wrappers in *wrappers_rusp.py* --- Here we define a wrapper that defines a random reward sharing relationship graph per episode and transforms agents' reward accordingly. Each agent is given an independent uncertainty and noisy sample around this relationship graph. Tests for making sure observations get routed properly are in *test_wrapper_rusp.py*. 
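To make the idea concrete, here is a toy, self-contained sketch of the kind of transform the wrapper applies (illustrative only; the exact sampling scheme, observation keys and shapes are defined in *wrappers_rusp.py*): each episode samples a reward-sharing matrix `T`, agents are trained on the shared reward `T @ r`, and each agent only observes its own noisy, uncertainty-weighted view of `T`.

```python
import numpy as np

n_agents = 4
rng = np.random.default_rng(0)

# Per-episode reward-sharing relationship graph (toy row-normalised version).
T = rng.uniform(size=(n_agents, n_agents))
T /= T.sum(axis=1, keepdims=True)

# Each agent gets an independent uncertainty level and a noisy sample of T.
uncertainty = rng.uniform(size=(n_agents, 1, 1))
noisy_T_per_agent = T[None] + uncertainty * rng.normal(size=(n_agents, n_agents, n_agents))

raw_reward = rng.normal(size=n_agents)        # what the base environment returns
shared_reward = T @ raw_reward                # what agents are actually trained on
```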
5 | 6 | ## Environments 7 | * *env_ipd.py*: 2-player infinite-horizon prisoner's dilemma. 8 | * *env_indirect_reciprocity.py*: n-player infinite-horizon prisoner's dilemma where at each step 2 agents are randomly chosen to play. 9 | * *env_prisoners_buddy.py*: an abstract game where agents must mutually choose each other and resist temptation to defect and change teams. 10 | * *env_oasis.py*: MuJoCo-based survival game where the environment is resource-constrained such that only a subset of agents can survive. -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/multi-agent-emergence-environments/bafaf1e11e6398624116761f91ae7c93b136f395/randomized_uncertain_social_preferences/rusp/__init__.py -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/abstract_base_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from gym.spaces import Dict 4 | from mujoco_worldgen.util.types import store_args 5 | 6 | 7 | class AbstractBaseEnv(gym.Env): 8 | ''' 9 | Barebones Gym Env that allows the game to be constructed solely in wrappers. 10 | ''' 11 | @store_args 12 | def __init__(self, n_agents): 13 | self.metadata = {} 14 | self.metadata['n_agents'] = n_agents 15 | self.metadata['n_actors'] = n_agents 16 | self.observation_space = Dict({}) 17 | self.action_space = Dict({}) 18 | 19 | def step(self, action): 20 | return {}, np.zeros(self.n_agents), False, {} 21 | 22 | def reset(self, **kwargs): 23 | return {} 24 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/env_ipd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym.spaces import Tuple, Discrete 4 | from mae_envs.wrappers.util import update_obs_space 5 | from mujoco_worldgen.util.types import store_args 6 | from rusp.abstract_base_env import AbstractBaseEnv 7 | from rusp.wrappers_util import RandomizedHorizonWrapper 8 | from rusp.wrappers_rusp import RUSPWrapper, add_rew_share_observation_keys 9 | from mae_envs.wrappers.util import ConcatenateObsWrapper 10 | from mae_envs.wrappers.multi_agent import (SplitObservations, SelectKeysWrapper) 11 | 12 | 13 | class IteratedMatrixGameWrapper(gym.Wrapper): 14 | ''' 15 | 2-player matrix game. Agents get a single binary action "action_defect". Agents 16 | get to observe the last action each agent took. Agents are rewarded according to 17 | payoff_matrix. 18 | Args: 19 | payoff_matrix (2x2x2 np.ndarray): the payoff matrix. We index into this payoff_matrix 20 | according to agent actions. 21 | Observations: 22 | prev_ac (n_agents, 1): previous action each agent took.
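Example of how the payoff matrix is indexed by the two agents' binary actions (the values here are the defaults used by make_env below):

    import numpy as np
    cc, cd, dc, dd = [2, 2], [-2, 4], [4, -2], [0, 0]
    payoff_matrix = np.array([[cc, cd],
                              [dc, dd]])                     # shape (2, 2, 2)
    action_defect = np.array([1, 0])                         # agent 0 defects, agent 1 cooperates
    rew = payoff_matrix[action_defect[0], action_defect[1]]  # array([ 4, -2])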
23 | ''' 24 | @store_args 25 | def __init__(self, env, payoff_matrix): 26 | super().__init__(env) 27 | self.n_agents = self.metadata['n_agents'] 28 | self.action_space.spaces['action_defect'] = Tuple([Discrete(n=2) for _ in range(self.n_agents)]) 29 | self.observation_space = update_obs_space(self, {'prev_ac': [self.n_agents, 1]}) 30 | 31 | def reset(self): 32 | self.previous_action = -1 * np.ones(self.n_agents) 33 | self.num_defects = np.zeros(self.n_agents) 34 | self.num_coops = np.zeros(self.n_agents) 35 | return self.observation(self.env.reset()) 36 | 37 | def step(self, action): 38 | self.previous_action = action['action_defect'].copy() 39 | obs, _, done, info = self.env.step(action) 40 | 41 | self.num_defects += action['action_defect'] 42 | self.num_coops += (1 - action['action_defect']) 43 | rew = self.payoff_matrix[action['action_defect'][0], action['action_defect'][1]] 44 | 45 | if done: 46 | info.update({f'actor{i}_n_defects': n_defects for i, n_defects in enumerate(self.num_defects)}) 47 | info.update({f'actor{i}_n_coops': n_coops for i, n_coops in enumerate(self.num_coops)}) 48 | return self.observation(obs), rew, done, info 49 | 50 | def observation(self, obs): 51 | obs['prev_ac'] = self.previous_action[:, None] 52 | return obs 53 | 54 | 55 | class LastAgentScripted(gym.Wrapper): 56 | ''' 57 | Replace the last agent with either a all-cooperate, all-defect, or tit-for-tat scripted policy. 58 | The last agent is considered part of the environment, so we remove them from the observation 59 | and action space. 60 | Args: 61 | policy_to_play (string): One of "allc", "alld", or "tft" 62 | ''' 63 | def __init__(self, env, policy_to_play): 64 | super().__init__(env) 65 | assert policy_to_play in ['allc', 'alld', 'tft'] 66 | self.policy_to_play = policy_to_play 67 | self.metadata['n_actors'] -= 1 68 | for k, v in self.action_space.spaces.items(): 69 | self.action_space.spaces[k] = Tuple(v.spaces[:-1]) 70 | 71 | def reset(self): 72 | self.previous_action = 0 73 | return self.observation(self.env.reset()) 74 | 75 | def step(self, action): 76 | if self.policy_to_play == 'allc': 77 | ac_to_play = 0 78 | elif self.policy_to_play == 'alld': 79 | ac_to_play = 1 80 | elif self.policy_to_play == 'tft': 81 | ac_to_play = self.previous_action 82 | 83 | self.previous_action = action['action_defect'][0] 84 | action['action_defect'] = np.concatenate([action['action_defect'], [ac_to_play]]) 85 | 86 | obs, rew, done, info = self.env.step(action) 87 | return self.observation(obs), rew[:-1], done, info 88 | 89 | def observation(self, obs): 90 | obs = {k: v[:-1] for k, v in obs.items()} 91 | return obs 92 | 93 | 94 | def make_env(horizon=10, horizon_lower=None, horizon_upper=None, 95 | prob_per_step_to_stop=0.1, # If set then we play the infinite game, 96 | mutual_cooperate=2, defected_against=-2, successful_defect=4, mutual_defect=0, 97 | # Evals 98 | against_all_c=False, against_all_d=False, against_tft=False, 99 | # Random Teams 100 | rusp_args={}): 101 | env = AbstractBaseEnv(2) 102 | 103 | env = RandomizedHorizonWrapper(env, lower_lim=horizon_lower or horizon, upper_lim=horizon_upper or horizon, 104 | prob_per_step_to_stop=prob_per_step_to_stop) 105 | # Construct Payoff Matrix 106 | cc = [mutual_cooperate, mutual_cooperate] 107 | cd = [defected_against, successful_defect] 108 | dc = list(reversed(cd)) 109 | dd = [mutual_defect, mutual_defect] 110 | payoff_matrix = np.array([[cc, cd], 111 | [dc, dd]]) 112 | env = IteratedMatrixGameWrapper(env, payoff_matrix=payoff_matrix) 113 | 114 | env = 
RUSPWrapper(env, **rusp_args) 115 | 116 | keys_self = ['prev_ac', 'timestep'] 117 | keys_additional_self_vf = ['fraction_episode_done', 'horizon'] 118 | 119 | keys_other_agents = ['prev_ac'] 120 | keys_additional_other_agents_vf = [] 121 | keys_self_matrices = [] 122 | add_rew_share_observation_keys(keys_self=keys_self, 123 | keys_additional_self_vf=keys_additional_self_vf, 124 | keys_other_agents=keys_other_agents, 125 | keys_additional_other_agents_vf=keys_additional_other_agents_vf, 126 | keys_self_matrices=keys_self_matrices, 127 | **rusp_args) 128 | keys_external = ['other_agents', 129 | 'other_agents_vf', 130 | 'additional_self_vf_obs'] 131 | 132 | env = SplitObservations(env, keys_self + keys_additional_self_vf, 133 | keys_copy=[], keys_self_matrices=keys_self_matrices) 134 | env = ConcatenateObsWrapper(env, {'other_agents': keys_other_agents, 135 | 'other_agents_vf': ['other_agents'] + keys_additional_other_agents_vf, 136 | 'additional_self_vf_obs': [k + '_self' for k in keys_additional_self_vf]}) 137 | env = SelectKeysWrapper(env, keys_self=keys_self, 138 | keys_other=keys_external) 139 | 140 | if against_all_c or against_all_d or against_tft: 141 | if against_all_c: 142 | policy_to_play = 'allc' 143 | elif against_all_d: 144 | policy_to_play = 'alld' 145 | elif against_tft: 146 | policy_to_play = 'tft' 147 | env = LastAgentScripted(env, policy_to_play) 148 | return env 149 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/env_prisoners_buddy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | from mujoco_worldgen.util.types import store_args 4 | from mae_envs.wrappers.util import update_obs_space 5 | from mae_envs.wrappers.util import ConcatenateObsWrapper 6 | from mae_envs.wrappers.multi_agent import (SplitObservations, SelectKeysWrapper) 7 | from rusp.wrappers_rusp import RUSPWrapper, add_rew_share_observation_keys 8 | from rusp.wrappers_util import RandomIdentityVector, RandomizedHorizonWrapper, OtherActorAttentionAction, ActionOptionsWrapper 9 | from rusp.abstract_base_env import AbstractBaseEnv 10 | 11 | 12 | class PrisonersBuddy(OtherActorAttentionAction): 13 | ''' 14 | Agents must mutually choose others to get reward (mutual_cooperate_rew). They can choose to sit out 15 | and receive zero reward. If they make an unreciprocated choice, the targeted agent will receive a defection 16 | reward (successful_defect_rew). We call it a defection reward since they could have reciprocated the choice. 17 | The agent making the unreciprocated choice receives a penalty (defected_against_rew). 18 | 19 | Agents get a chance to "communicate" in that their choices only elicit rewards every few timesteps (choosing_period). 20 | This gives them time to break symmetry.
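A toy sketch of the mutual-choice bookkeeping on a reward round (4 hypothetical agents; the wrapper's own update is _prisoners_buddy_reward_update below): mutual picks form a team, while a one-way pick rewards the target and penalises the chooser.

    import numpy as np
    n_agents = 4
    choices = np.array([1, 0, 3, -1])                 # agent i picked choices[i]; -1 = sit out
    chose_me = np.zeros((n_agents, n_agents), dtype=bool)
    for i, target in enumerate(choices):
        if target != -1:
            chose_me[target, i] = True
    both_chose = chose_me & chose_me.T                # agents 0 and 1 picked each other
    on_team = np.any(both_chose, axis=1)
    one_way = chose_me & ~both_chose                  # agent 2 picked agent 3, unreciprocated
    rew = np.zeros(n_agents)
    rew[on_team] += 2.0                               # mutual_cooperate_rew
    rew[np.any(one_way, axis=0)] += -1.0              # agent 2 pays defected_against_rew
    rew += one_way.sum(axis=1) * 1.0                  # agent 3 collects successful_defect_rew
    # rew -> array([ 2.,  2., -1.,  1.])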
21 | 22 | Observations: 23 | chose_me (n_agents, n_agents, 1): which other agents (column) chose me (row agent) last step 24 | i_chose (n_agents, n_agents, 1): which other agents (column) did I choose (row agent) last step 25 | chose_me_rew (n_agents, n_agents, 1): which other agents (column) chose me (row agent) last step reward was given 26 | i_chose_rew (n_agents, n_agents, 1): which other agents (column) did I choose (row agent) last step reward was given 27 | i_chose_any_rew (n_agents, 1): Did I choose to sitout or choose someone last timestep reward was given 28 | previous_choice_identity (n_agents, agent_identity_dim): ID of agent I previously chose 29 | next_choice_is_real (n_agents, 1): is the next timestep one in which reward will be given 30 | ''' 31 | @store_args 32 | def __init__(self, env, choosing_period, 33 | agent_identity_dim=4, 34 | mutual_cooperate_rew=2, 35 | defected_against_rew=-1, 36 | successful_defect_rew=1): 37 | super().__init__(env, 'action_choose_agent') 38 | self.observation_space = update_obs_space(self, { 39 | 'chose_me': [self.n_agents, self.n_agents, 1], 40 | 'i_chose': [self.n_agents, self.n_agents, 1], 41 | 'chose_me_rew': [self.n_agents, self.n_agents, 1], 42 | 'i_chose_rew': [self.n_agents, self.n_agents, 1], 43 | 'i_chose_any_rew': [self.n_agents, 1], 44 | 'previous_choice_identity': [self.n_agents, agent_identity_dim], 45 | 'next_choice_is_real': [self.n_agents, 1], 46 | }) 47 | 48 | def reset(self): 49 | self._t = 1 # Start t at 1 such that first round is not a reward round 50 | self._chose_me = np.zeros((self.n_agents, self.n_agents)) 51 | self._chose_me_rew = np.zeros((self.n_agents, self.n_agents)) 52 | self._n_times_not_chosen = np.zeros((self.n_agents)) 53 | self._n_times_team_changed = np.zeros((self.n_agents)) 54 | self._n_agents_on_team = [] 55 | self._previous_choice_identity = np.zeros((self.n_agents, self.agent_identity_dim)) 56 | self._i_chose_any_rew_obs = np.zeros((self.n_agents, 1)) 57 | self._team_lengths = [] 58 | self._n_successful_defections = 0 59 | self._current_team_lengths = defaultdict(lambda: 0) 60 | self._previous_teams = np.ones(self.n_agents, dtype=int) * -1 61 | self._both_chose = np.zeros((self.n_agents, self.n_agents), dtype=bool) 62 | self._perfect_game = True 63 | self._first_choice = True 64 | 65 | return self.observation(self.env.reset()) 66 | 67 | def step(self, action): 68 | obs, rew, done, info = self.env.step(action) 69 | self._chose_me = np.zeros((self.n_agents, self.n_agents), dtype=bool) 70 | targets = np.ones(self.n_agents, dtype=int) * -1 71 | for i in range(self.n_agents): 72 | target = self._get_target_actor(i, action) 73 | if len(target): 74 | targets[i] = target[0] 75 | self._chose_me[target[0], i] = 1 76 | 77 | self._previous_choice_identity = obs['agent_identity'][targets] 78 | self._previous_choice_identity[targets == -1] = 0 79 | 80 | # Reward rounds 81 | if self._t % self.choosing_period == 0: 82 | self._both_chose = self._chose_me * self._chose_me.T 83 | self._chose_me_rew = self._chose_me.copy() 84 | 85 | self._teams = np.argmax(self._both_chose, axis=1) # Indicies of teamate 86 | self._teams[np.all(self._both_chose == 0, axis=1)] = -1 # Make sure those without team are set to -1 instead of 0 87 | 88 | rew = self._prisoners_buddy_reward_update(rew) 89 | 90 | # Track stats 91 | self._n_times_not_chosen[np.sum(self._chose_me, 1) == 0] += 1 92 | # Since both_chose is symmetric, just get the index of nonzero entry in upper triangle 93 | current_team_indices = 
np.c_[np.nonzero(np.triu(self._both_chose))] 94 | current_team_tuples = list(map(tuple, current_team_indices)) 95 | teams_done = [k for k in self._current_team_lengths.keys() if k not in current_team_tuples] 96 | 97 | for team_done in teams_done: 98 | self._team_lengths.append(self._current_team_lengths[team_done]) 99 | del self._current_team_lengths[team_done] 100 | for current_team_tuple in current_team_tuples: 101 | self._current_team_lengths[current_team_tuple] += 1 102 | 103 | self._i_chose_any_rew_obs = np.any(self._chose_me_rew, 0)[:, None] 104 | 105 | if self._first_choice: 106 | self._first_choice = False 107 | else: 108 | all_teams_didnt_change = np.all(self._previous_teams == self._teams) 109 | max_number_of_teams_filled = np.sum(self._teams != -1) == ((self.n_agents // 2) * 2) 110 | self._perfect_game = self._perfect_game and all_teams_didnt_change and max_number_of_teams_filled 111 | 112 | self._previous_teams = self._teams 113 | 114 | self._t += 1 115 | 116 | if done: 117 | self._team_lengths += list(self._current_team_lengths.values()) 118 | info['average_team_length'] = np.mean(self._team_lengths) if len(self._team_lengths) else 0 119 | info['n_times_team_changed'] = np.sum(self._n_times_team_changed) 120 | info['n_agents_on_team_per_step'] = np.mean(self._n_agents_on_team) 121 | info['number_decisions'] = self._t / self.choosing_period 122 | info['n_unique_not_chosen'] = np.sum(self._n_times_not_chosen > 0) 123 | info['n_successful_defections'] = self._n_successful_defections 124 | info['perfect_game'] = self._perfect_game 125 | 126 | return self.observation(obs), rew, done, info 127 | 128 | def observation(self, obs): 129 | obs['chose_me'] = self._chose_me[:, :, None] 130 | obs['i_chose'] = self._chose_me.T[:, :, None] 131 | obs['chose_me_rew'] = self._chose_me_rew[:, :, None] 132 | obs['i_chose_rew'] = self._chose_me_rew.T[:, :, None] 133 | obs['i_chose_any_rew'] = self._i_chose_any_rew_obs 134 | obs['previous_choice_identity'] = self._previous_choice_identity 135 | # assumes this is called after t is increased 136 | obs['next_choice_is_real'] = np.ones((self.n_agents, 1)) if self._t % self.choosing_period == 0 else np.zeros((self.n_agents, 1)) 137 | return obs 138 | 139 | def _prisoners_buddy_reward_update(self, rew): 140 | on_team = np.any(self._both_chose, axis=1) 141 | chose_me_oneway = (self._chose_me & ~self._both_chose) 142 | num_chose_me_oneway = np.sum(chose_me_oneway, axis=1) 143 | i_chose_one_way = np.any(chose_me_oneway, axis=0) 144 | 145 | assert np.all(np.sum(chose_me_oneway, axis=0) <= 1) 146 | assert np.all((i_chose_one_way & on_team) == 0) 147 | 148 | previous_has_team = (self._previous_teams != -1) 149 | your_team_changed = (self._teams != self._previous_teams) 150 | 151 | rew[on_team] += self.mutual_cooperate_rew 152 | rew[i_chose_one_way] += self.defected_against_rew 153 | rew += num_chose_me_oneway * self.successful_defect_rew 154 | 155 | # Stats 156 | self._n_successful_defections += np.sum(i_chose_one_way) 157 | self._n_times_team_changed += (previous_has_team & your_team_changed) 158 | self._n_agents_on_team.append(np.sum(on_team)) 159 | 160 | return rew 161 | 162 | 163 | def make_env(n_agents=5, horizon=50, horizon_lower=None, horizon_upper=None, 164 | prob_per_step_to_stop=0.02, 165 | choosing_period=5, 166 | mutual_cooperate_rew=2, defected_against_rew=-2, successful_defect_rew=1, 167 | agent_identity_dim=16, 168 | rusp_args={}): 169 | env = AbstractBaseEnv(n_agents) 170 | env = RandomizedHorizonWrapper(env, lower_lim=horizon_lower or horizon, 
upper_lim=horizon_upper or horizon, 171 | prob_per_step_to_stop=prob_per_step_to_stop) 172 | env = RandomIdentityVector(env, vector_dim=agent_identity_dim) 173 | 174 | env = PrisonersBuddy(env, choosing_period=choosing_period, 175 | agent_identity_dim=agent_identity_dim, 176 | mutual_cooperate_rew=mutual_cooperate_rew, defected_against_rew=defected_against_rew, 177 | successful_defect_rew=successful_defect_rew) 178 | 179 | env = ActionOptionsWrapper(env, ['action_choose_agent'], {'action_choose_agent': -1}) 180 | 181 | env = RUSPWrapper(env, **rusp_args) 182 | 183 | keys_self = ['previous_choice', 184 | 'next_choice_is_real', 185 | 'i_chose_any_rew', 186 | 'agent_identity', 187 | 'previous_choice_identity', 188 | 'timestep'] 189 | keys_additional_self_vf = ['fraction_episode_done', 'horizon'] 190 | 191 | keys_other_agents = [ 192 | 'previous_choice', 193 | 'chose_me', 194 | 'i_chose', 195 | 'chose_me_rew', 196 | 'i_chose_rew', 197 | 'i_chose_any_rew', 198 | 'agent_identity', 199 | 'previous_choice_identity' 200 | ] 201 | keys_additional_other_agents_vf = [] 202 | keys_self_matrices = ['chose_me', 203 | 'i_chose', 204 | 'chose_me_rew', 205 | 'i_chose_rew'] 206 | 207 | keys_external = ['other_agents', 208 | 'other_agents_vf', 209 | 'additional_self_vf_obs'] 210 | 211 | add_rew_share_observation_keys(keys_self=keys_self, 212 | keys_additional_self_vf=keys_additional_self_vf, 213 | keys_other_agents=keys_other_agents, 214 | keys_additional_other_agents_vf=keys_additional_other_agents_vf, 215 | keys_self_matrices=keys_self_matrices, 216 | **rusp_args) 217 | 218 | env = SplitObservations(env, keys_self + keys_additional_self_vf, 219 | keys_copy=[], keys_self_matrices=keys_self_matrices) 220 | env = ConcatenateObsWrapper(env, {'other_agents': keys_other_agents, 221 | 'other_agents_vf': ['other_agents'] + keys_additional_other_agents_vf, 222 | 'additional_self_vf_obs': [k + '_self' for k in keys_additional_self_vf]}) 223 | env = SelectKeysWrapper(env, keys_self=keys_self, 224 | keys_other=keys_external) 225 | 226 | return env 227 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/test_env_indirect_reciprocity.py: -------------------------------------------------------------------------------- 1 | from rusp.env_indirect_reciprocity import make_env 2 | import numpy as np 3 | from copy import deepcopy 4 | 5 | 6 | def _test_fixed_policy(against_all_d=False, against_all_c=False): 7 | env = make_env(against_all_d=against_all_d, against_all_c=against_all_c, 8 | last_agent_always_plays=True) 9 | prev_obs = env.reset() 10 | for i in range(1000): 11 | currently_playing = np.squeeze(prev_obs['youre_playing_self']) 12 | ac = {'action_defect': np.random.randint(0, 2, size=(env.metadata['n_actors']))} 13 | 14 | obs, rew, done, info = env.step(ac) 15 | 16 | if against_all_d: 17 | assert np.all(rew[currently_playing & (ac['action_defect'] == 0)] == -2) 18 | assert np.all(rew[currently_playing & (ac['action_defect'] == 1)] == 0) 19 | elif against_all_c: 20 | assert np.all(rew[currently_playing & (ac['action_defect'] == 0)] == 2) 21 | assert np.all(rew[currently_playing & (ac['action_defect'] == 1)] == 4) 22 | else: 23 | assert False 24 | assert np.all(rew[~currently_playing] == 0) 25 | 26 | prev_obs = obs 27 | 28 | if done: 29 | prev_obs = env.reset() 30 | 31 | 32 | def test_all_d(): 33 | _test_fixed_policy(against_all_d=True) 34 | 35 | 36 | def test_all_c(): 37 | _test_fixed_policy(against_all_c=True) 38 | 39 | 40 | # Tests for play 
orderings 41 | def test_last_always_plays(): 42 | env = make_env(last_agent_always_plays=True) 43 | obs = env.reset() 44 | assert obs['youre_playing_self'][-1, 0] 45 | ac = {'action_defect': np.random.randint(0, 2, size=(env.metadata['n_actors']))} 46 | for i in range(1000): 47 | obs, _, done, _ = env.step(ac) 48 | assert obs['youre_playing_self'][-1, 0] 49 | 50 | if done: 51 | obs = env.reset() 52 | assert obs['youre_playing_self'][-1, 0] 53 | 54 | 55 | def test_last_first_versus_last(): 56 | env = make_env(last_step_first_agent_vs_last_agent=True) 57 | prev_obs = env.reset() 58 | ac = {'action_defect': np.random.randint(0, 2, size=(env.metadata['n_actors']))} 59 | for i in range(1000): 60 | obs, _, done, _ = env.step(ac) 61 | 62 | if done: 63 | assert prev_obs['youre_playing_self'][-1, 0] 64 | assert prev_obs['youre_playing_self'][0, 0] 65 | obs = env.reset() 66 | 67 | prev_obs = deepcopy(obs) 68 | 69 | 70 | def test_last_doesnt_play_until(): 71 | env = make_env(last_doesnt_play_until_t=5) 72 | ac = {'action_defect': np.random.randint(0, 2, size=(env.metadata['n_actors']))} 73 | obs = env.reset() 74 | done = False 75 | t = 0 76 | for i in range(1000): 77 | if t < 5: 78 | assert not obs['youre_playing_self'][-1, 0] 79 | obs, rew, done, info = env.step(ac) 80 | t += 1 81 | 82 | if done: 83 | obs = env.reset() 84 | done = False 85 | t = 0 86 | 87 | 88 | def test_last_doesnt_play_until_and_last_must_play_at_t(): 89 | env = make_env(last_doesnt_play_until_t=5, last_must_play_at_t=True) 90 | ac = {'action_defect': np.random.randint(0, 2, size=(env.metadata['n_actors']))} 91 | obs = env.reset() 92 | done = False 93 | t = 0 94 | for i in range(1000): 95 | if t < 5: 96 | assert not obs['youre_playing_self'][-1, 0] 97 | if t == 5: 98 | assert obs['youre_playing_self'][-1, 0] 99 | obs, rew, done, info = env.step(ac) 100 | t += 1 101 | 102 | if done: 103 | obs = env.reset() 104 | done = False 105 | t = 0 106 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/test_env_ipd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rusp.env_ipd import make_env 3 | 4 | 5 | def test_env_runs(): 6 | env = make_env() 7 | env.reset() 8 | 9 | action = {'action_defect': np.array([0, 0])} 10 | obs, rew, done, info = env.step(action) 11 | assert np.all(rew == np.array([2, 2])) 12 | 13 | action = {'action_defect': np.array([1, 0])} 14 | obs, rew, done, info = env.step(action) 15 | assert np.all(rew == np.array([4, -2])) 16 | 17 | action = {'action_defect': np.array([0, 1])} 18 | obs, rew, done, info = env.step(action) 19 | assert np.all(rew == np.array([-2, 4])) 20 | 21 | action = {'action_defect': np.array([1, 1])} 22 | obs, rew, done, info = env.step(action) 23 | assert np.all(rew == np.array([0, 0])) 24 | 25 | 26 | def test_env_against_all_c(): 27 | env = make_env(against_all_c=True) 28 | env.reset() 29 | 30 | action = {'action_defect': np.array([0])} 31 | obs, rew, done, info = env.step(action) 32 | assert np.all(rew == np.array([2])) 33 | 34 | action = {'action_defect': np.array([1])} 35 | obs, rew, done, info = env.step(action) 36 | assert np.all(rew == np.array([4])) 37 | 38 | 39 | def test_env_against_all_d(): 40 | env = make_env(against_all_d=True) 41 | env.reset() 42 | 43 | action = {'action_defect': np.array([0])} 44 | obs, rew, done, info = env.step(action) 45 | assert np.all(rew == np.array([-2])) 46 | 47 | action = {'action_defect': np.array([1])} 48 | obs, rew, 
done, info = env.step(action) 49 | assert np.all(rew == np.array([0])) 50 | 51 | 52 | def test_env_against_tft(): 53 | env = make_env(against_tft=True) 54 | env.reset() 55 | 56 | action = {'action_defect': np.array([0])} 57 | obs, rew, done, info = env.step(action) 58 | assert np.all(rew == np.array([2])) 59 | 60 | action = {'action_defect': np.array([1])} 61 | obs, rew, done, info = env.step(action) 62 | assert np.all(rew == np.array([4])) 63 | 64 | action = {'action_defect': np.array([1])} 65 | obs, rew, done, info = env.step(action) 66 | assert np.all(rew == np.array([0])) 67 | 68 | action = {'action_defect': np.array([0])} 69 | obs, rew, done, info = env.step(action) 70 | assert np.all(rew == np.array([-2])) 71 | 72 | action = {'action_defect': np.array([0])} 73 | obs, rew, done, info = env.step(action) 74 | assert np.all(rew == np.array([2])) 75 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/test_env_oasis.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import unittest 3 | import os 4 | 5 | EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__)) 6 | EXAMINE_FILE_PATH = os.path.join(EXAMPLES_DIR, "../../bin/examine.py") 7 | 8 | 9 | class ExamineTest(unittest.TestCase): 10 | def test_examine_env(self): 11 | envs = [ 12 | "env_oasis.py" 13 | ] 14 | for env in envs: 15 | with self.assertRaises(subprocess.TimeoutExpired): 16 | subprocess.check_call( 17 | ["/usr/bin/env", "python", EXAMINE_FILE_PATH, os.path.join(EXAMPLES_DIR, env)], 18 | timeout=10) 19 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/test_env_prisoners_buddy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rusp.env_prisoners_buddy import make_env 3 | 4 | 5 | def test_env_runs(): 6 | env = make_env() 7 | env.reset() 8 | 9 | action = {'action_choose_agent': [0, 0, 3, 0, 1], 'action_choose_option': [1, 0, 0, 0, 0]} 10 | for i in range(5): 11 | obs, rew, done, info = env.step(action) 12 | 13 | assert np.all(rew == [1, -2, 3, 2, -2]) 14 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/test_wrapper_rusp.py: -------------------------------------------------------------------------------- 1 | from rusp.wrappers_rusp import RUSPGenerator 2 | import _jsonnet 3 | import json 4 | import os 5 | import numpy as np 6 | 7 | 8 | def test_compute_observations(): 9 | N_AGENTS = 2 10 | FILE_PATH = os.path.dirname(os.path.abspath(__file__)) 11 | graph_generator = RUSPGenerator() 12 | 13 | graph_generator._generate_social_preferences(N_AGENTS) 14 | graph_generator._generate_uncertainty(N_AGENTS) 15 | 16 | graph_generator.noise_std = np.arange(1, N_AGENTS ** 3 + 1).reshape((N_AGENTS, N_AGENTS, N_AGENTS)) 17 | graph_generator.noise = np.arange(1, N_AGENTS ** 3 + 1).reshape((N_AGENTS, N_AGENTS, N_AGENTS)) * 10 18 | graph_generator.unnormalized_reward_xform_mat = graph_generator.reward_xform_mat = np.arange(1, N_AGENTS ** 2 + 1).reshape((N_AGENTS, N_AGENTS)) 19 | 20 | graph_generator._precompute_observations(N_AGENTS) 21 | 22 | assert np.all(graph_generator.precomputed_obs['self_rew_value'] == np.array([1, 4])) 23 | assert np.all(graph_generator.precomputed_obs['self_rew_value_noisy'] == np.array([1, 4]) + np.array([10, 80])) 24 | assert 
np.all(graph_generator.precomputed_obs['self_rew_value_noise_level'] == np.array([1, 8])) 25 | 26 | assert np.all(graph_generator.precomputed_obs['other_rew_value_s'] == np.array( 27 | [[1, 4], 28 | [1, 4]])) 29 | assert np.all(graph_generator.precomputed_obs['other_rew_value_s_noisy'] == np.array( 30 | [[1 + 10, 4 + 40], 31 | [1 + 50, 4 + 80]])) 32 | assert np.all(graph_generator.precomputed_obs['other_rew_value_s_noise_level'] == np.array( 33 | [[1, 4], 34 | [5, 8]])) 35 | 36 | assert np.all(graph_generator.precomputed_obs['rew_share_so_s'] == np.array( 37 | [[1, 2], 38 | [3, 4]])) 39 | assert np.all(graph_generator.precomputed_obs['rew_share_so_s_noisy'] == np.array( 40 | [[1 + 10, 2 + 20], 41 | [3 + 70, 4 + 80]])) 42 | assert np.all(graph_generator.precomputed_obs['rew_share_so_s_noise_level'] == np.array( 43 | [[1, 2], 44 | [7, 8]])) 45 | 46 | assert np.all(graph_generator.precomputed_obs['rew_share_os_o'] == np.array( 47 | [[1, 3], 48 | [2, 4]])) 49 | assert np.all(graph_generator.precomputed_obs['rew_share_os_o_noisy'] == np.array( 50 | [[1 + 10, 3 + 70], 51 | [2 + 20, 4 + 80]])) 52 | assert np.all(graph_generator.precomputed_obs['rew_share_os_o_noise_level'] == np.array( 53 | [[1, 7], 54 | [2, 8]])) 55 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/wrappers_rusp.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from typing import Tuple, List 4 | from mae_envs.wrappers.util import update_obs_space 5 | from mujoco_worldgen.util.types import store_args 6 | 7 | 8 | def get_all_integer_partitions(n, min_team_size=1, max_team_size=np.inf): 9 | ''' 10 | Yield all integer partitions of n (this is a generator, not a list). 11 | Args: 12 | n (int): number of entities. 13 | min_team_size (int): minimum number of entities in a partition 14 | max_team_size (int): maximum number of entities in a partition 15 | ''' 16 | if n <= max_team_size: 17 | yield (n,) 18 | for i in range(min_team_size, n // 2 + 1): 19 | for p in get_all_integer_partitions(n - i, i, max_team_size): 20 | yield (i,) + p 21 | 22 | 23 | class RUSPGenerator: 24 | ''' 25 | Helper class to generate the randomized uncertain relationship graph. Agents are first 26 | partitioned into groups. Within each group we randomize the amount each agent shares 27 | reward with everyone else in the group. We then sample independent noise such that each 28 | agent observes an independent noisy observation of the relationship graph. 29 | 30 | Reward sharing values are sampled from a beta distribution with parameters alpha and beta. For 31 | all results in the paper except where we experiment with team hardness, we set both 32 | alpha and beta to 1. 33 | 34 | To compute noise added to the relationship graphs, we first sample the noise level (standard deviation 35 | of a Gaussian) from a uniform distribution independently per relationship, per agent.
36 | We then sample a single value from this Gaussian (with the sampled standard deviation), centered around the true value. 37 | 38 | Args: 39 | min_team_size (int): minimum size of a group of agents with non-zero reward sharing amounts 40 | max_team_size (int): maximum size of a group of agents with non-zero reward sharing amounts 41 | alpha (float): reward sharing beta distribution parameter 42 | beta (float): reward sharing beta distribution parameter 43 | allow_diagonal_non_1 (bool): if True then diagonal elements of the reward sharing matrix (an agent's 44 | weight over its own reward) can be less than 1 (sampled from the same beta distribution as for other 45 | relationships) 46 | obs_noise_std_range (tuple of float): Range (minimum and maximum) that the noise standard deviation can be sampled 47 | from. 48 | ''' 49 | @store_args 50 | def __init__(self, *, 51 | # Prosociality Graph 52 | min_team_size: int = 1, 53 | max_team_size: int = 1, 54 | alpha: float = 1.0, 55 | beta: float = 1.0, 56 | allow_diagonal_non_1: bool = True, 57 | # Uncertainty 58 | obs_noise_std_range: Tuple[float] = [0.0, 1.0], 59 | **kwargs): 60 | assert min_team_size >= 1 61 | assert max_team_size >= 1 62 | assert max_team_size >= min_team_size 63 | assert alpha > 0 64 | assert beta > 0 65 | assert np.all(np.array(obs_noise_std_range) >= 0) 66 | self.cached_partitions = {} # Keys are (n_agents, min_team_size, max_team_size) 67 | 68 | def _partition_agents(self, n_agents, min_team_size, max_team_size): 69 | ''' 70 | Return a random partition from the set of all integer partitions 71 | ''' 72 | settings = (n_agents, min_team_size, max_team_size) 73 | if settings not in self.cached_partitions: 74 | self.cached_partitions[settings] = list(get_all_integer_partitions(n_agents, min_team_size, max_team_size)) 75 | all_partitions = self.cached_partitions[settings] 76 | random_partitions = all_partitions[np.random.randint(len(all_partitions))] 77 | 78 | return random_partitions 79 | 80 | def _generate_social_preferences(self, n_agents): 81 | ''' 82 | Generate the relationship graph (without uncertainty) 83 | ''' 84 | # Generate random partitions 85 | if self.max_team_size != self.min_team_size: 86 | random_partitions = self._partition_agents(n_agents, self.min_team_size, self.max_team_size) 87 | else: 88 | random_partitions = np.random.randint(self.min_team_size, self.max_team_size + 1, (n_agents)) 89 | random_partitions = np.cumsum(random_partitions) 90 | random_partitions = random_partitions[random_partitions <= n_agents] 91 | random_partitions = np.concatenate([[0], random_partitions, [n_agents]]) 92 | 93 | # Convert random partitions into a block diagonal matrix 94 | self.reward_xform_mat = np.zeros((n_agents, n_agents)) 95 | for i in range(len(random_partitions) - 1): 96 | block = slice(random_partitions[i], random_partitions[i + 1]) 97 | self.reward_xform_mat[block, block] = 1 98 | 99 | # Randomize reward sharing values in block diagonal matrix 100 | self.reward_xform_mat *= np.random.beta(a=self.alpha, b=self.beta, size=(n_agents, n_agents)) 101 | 102 | # Make sure off-diagonal is symmetric 103 | self.reward_xform_mat = np.tril(self.reward_xform_mat, -1) + np.tril(self.reward_xform_mat).T 104 | 105 | if not self.allow_diagonal_non_1: 106 | np.fill_diagonal(self.reward_xform_mat, 1.0) 107 | 108 | # Randomly shuffle agents so that agent indices do not matter 109 | random_shuffle_mat = np.eye(n_agents) 110 | np.random.shuffle(random_shuffle_mat) 111 | # Permute rows and columns consistently: rotate into the shuffled order, then unrotate 112 | self.reward_xform_mat =
np.matmul(np.matmul(random_shuffle_mat.T, self.reward_xform_mat), random_shuffle_mat) 113 | 114 | # Normalize rows 115 | self.unnormalized_reward_xform_mat = self.reward_xform_mat.copy() 116 | self.reward_xform_mat /= np.sum(self.reward_xform_mat, axis=1, keepdims=True) 117 | 118 | def _generate_uncertainty(self, n_agents): 119 | ''' 120 | Generate uncertainty levels and noise to be applied to the matrices 121 | ''' 122 | self.noise_std = np.random.uniform(low=self.obs_noise_std_range[0], 123 | high=self.obs_noise_std_range[1], 124 | size=(n_agents, n_agents, n_agents)) 125 | self.noise = np.random.normal(scale=self.noise_std) 126 | 127 | def _precompute_observations(self, n_agents): 128 | ''' 129 | Precompute observations since they are static per episode. 130 | ''' 131 | # We have independent noisy observations per agent, so we copy the reward matrix n_agents times and 132 | # then add the noise matrices 133 | rew_mats = np.repeat(np.expand_dims(self.unnormalized_reward_xform_mat, 0), n_agents, axis=0) 134 | noisy_rew_mats = rew_mats + self.noise 135 | self.precomputed_obs = {} 136 | 137 | def _index_into_mats(key, *indices): 138 | ''' 139 | Helper function to create 3 observation types with the same indices 140 | ''' 141 | self.precomputed_obs[key] = rew_mats[indices] # Non-noisy version of the reward matrix 142 | self.precomputed_obs[key + "_noisy"] = noisy_rew_mats[indices] # Noisy version of the reward matrix 143 | self.precomputed_obs[key + '_noise_level'] = self.noise_std[indices] # Noise level associated with each entry in the noisy reward matrices 144 | 145 | def _transpose_existing(new_key, existing_key): 146 | ''' 147 | Helper function to transpose all 3 observations for a key. This is useful if an agent policy 148 | or value function needs to observe what other agents observe about it. 149 | ''' 150 | self.precomputed_obs[new_key] = self.precomputed_obs[existing_key].T 151 | self.precomputed_obs[new_key + "_noisy"] = self.precomputed_obs[existing_key + "_noisy"].T 152 | self.precomputed_obs[new_key + '_noise_level'] = self.precomputed_obs[existing_key + '_noise_level'].T 153 | 154 | # Relationship variable of myself (what is the weight over my own reward) with my own noise variable. 155 | # This is in effect the 3D diagonal, so the output shape will be (n_agents,) 156 | _index_into_mats('self_rew_value', np.arange(n_agents), np.arange(n_agents), np.arange(n_agents)) 157 | 158 | # Relationship variable of other agents' weight over their own reward with my own noise variable (s) 159 | # Row i is the diagonal of the ith matrix 160 | _index_into_mats('other_rew_value_s', slice(None), np.arange(n_agents), np.arange(n_agents)) 161 | 162 | # My relationship variable with other agents (so) with my noise (s) 163 | # Row i is row i of the ith matrix 164 | _index_into_mats('rew_share_so_s', np.arange(n_agents), np.arange(n_agents), slice(None)) 165 | 166 | # Other agents' relationship variable with me (os) with their noise (o) 167 | # Should only be used in the value function 168 | _transpose_existing('rew_share_os_o', 'rew_share_so_s') 169 | 170 | 171 | class RUSPWrapper(RUSPGenerator, gym.Wrapper): 172 | ''' 173 | Gym wrapper for generating relationship graphs. Generates a new relationship graph and uncertainties on reset. 174 | Provides all observations necessary to agents and transforms reward according to the relationship graph.
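Example (a minimal usage sketch; base_env is an assumed multi-agent env that exposes metadata['n_agents'] and returns a per-agent reward vector, and the keyword arguments are the RUSPGenerator parameters documented above):
            env = RUSPWrapper(base_env, min_team_size=1, max_team_size=2, obs_noise_std_range=[0.0, 1.0])
            obs = env.reset()                     # samples a fresh relationship graph and per-agent noise
            obs, rew, done, info = env.step(act)  # rew is reward_xform_mat matmul'd with the underlying reward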
175 | 176 | Observations: 177 | Each observation has the true value, the noisy value "_noisy" and the uncertainty level "_noise_level" 178 | 179 | self_rew_value: Relationship variable of myself (what is the weight over my own reward) with my own noise variable. 180 | other_rew_value_s: Relationship variable of other agents' weight over their own reward with my own noise variable (s) 181 | rew_share_so_s: My relationship variable with other agents (so) with my noise (s) 182 | rew_share_os_o: Other agents' relationship variable with me (os) with their noise (o). Should only be used in the value function 183 | ''' 184 | @store_args 185 | def __init__(self, env, **graph_kwargs): 186 | RUSPGenerator.__init__(self, **graph_kwargs) 187 | gym.Wrapper.__init__(self, env) 188 | n_a = self.metadata['n_agents'] 189 | self.obs_keys_with_shapes = { 190 | 'self_rew_value': [n_a, 1], 191 | 'self_rew_value_noisy': [n_a, 1], 192 | 'self_rew_value_noise_level': [n_a, 1], 193 | 'other_rew_value_s': [n_a, n_a, 1], 194 | 'other_rew_value_s_noisy': [n_a, n_a, 1], 195 | 'other_rew_value_s_noise_level': [n_a, n_a, 1], 196 | 'rew_share_so_s': [n_a, n_a, 1], 197 | 'rew_share_so_s_noisy': [n_a, n_a, 1], 198 | 'rew_share_so_s_noise_level': [n_a, n_a, 1], 199 | 'rew_share_os_o': [n_a, n_a, 1], 200 | 'rew_share_os_o_noisy': [n_a, n_a, 1], 201 | 'rew_share_os_o_noise_level': [n_a, n_a, 1], 202 | } 203 | self.observation_space = update_obs_space(self, self.obs_keys_with_shapes) 204 | 205 | def reset(self): 206 | self._generate_social_preferences(self.metadata['n_agents']) 207 | self._generate_uncertainty(self.metadata['n_agents']) 208 | self._precompute_observations(self.metadata['n_agents']) 209 | return self.observation(self.env.reset()) 210 | 211 | def step(self, action): 212 | obs, rew, done, info = self.env.step(action) 213 | rew = np.matmul(self.reward_xform_mat, rew) 214 | return self.observation(obs), rew, done, info 215 | 216 | def observation(self, obs): 217 | for k in self.obs_keys_with_shapes: 218 | obs[k] = np.expand_dims(self.precomputed_obs[k], -1) 219 | return obs 220 | 221 | 222 | def add_rew_share_observation_keys(*, keys_self: List[str], 223 | keys_additional_self_vf: List[str], 224 | keys_other_agents: List[str], 225 | keys_additional_other_agents_vf: List[str], 226 | keys_self_matrices: List[str], 227 | **kwargs): 228 | ''' 229 | Determines how keys about the relationship graph should be observed. 230 | Args: 231 | keys_self: keys that the agent should observe about itself 232 | keys_additional_self_vf: keys about an agent that only the value function should observe 233 | keys_other_agents: keys about other agents 234 | keys_additional_other_agents_vf: keys about other agents that only the value function should observe 235 | keys_self_matrices: keys that are shaped (n_agents, n_agents, X).
These need to be dealt with differently 236 | ''' 237 | 238 | keys_self += [ 239 | 'self_rew_value_noisy', 240 | 'self_rew_value_noise_level', 241 | ] 242 | keys_additional_self_vf.append('self_rew_value') 243 | 244 | keys_other_agents += [ 245 | 'rew_share_so_s_noisy', 246 | 'rew_share_so_s_noise_level', 247 | 'other_rew_value_s_noisy', 248 | 'other_rew_value_s_noise_level' 249 | ] 250 | 251 | other_rew_value_keys = [ 252 | 'other_rew_value_s_noisy', 253 | 'other_rew_value_s_noise_level', 254 | ] 255 | 256 | keys_additional_other_agents_vf += [ 257 | 'rew_share_so_s', 258 | 'other_rew_value_s', 259 | 'rew_share_os_o_noisy', 260 | 'rew_share_os_o_noise_level', 261 | ] 262 | 263 | keys_self_matrices += [ 264 | 'other_rew_value_s', 265 | 'other_rew_value_s_noisy', 266 | 'other_rew_value_s_noise_level', 267 | 'rew_share_so_s', 268 | 'rew_share_so_s_noisy', 269 | 'rew_share_so_s_noise_level', 270 | 'rew_share_os_o', 271 | 'rew_share_os_o_noisy', 272 | 'rew_share_os_o_noise_level', 273 | ] 274 | 275 | return keys_self, keys_additional_self_vf, keys_other_agents, keys_additional_other_agents_vf, keys_self_matrices 276 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/rusp/wrappers_util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from gym.spaces import Tuple, Discrete 4 | from scipy.linalg import circulant 5 | from mae_envs.wrappers.util import update_obs_space 6 | from mujoco_worldgen.util.types import store_args 7 | 8 | 9 | class RandomizedHorizonWrapper(gym.Wrapper): 10 | ''' 11 | Randomize the horizon for a game. 12 | A fixed horizon can be set by setting lower_lim = upper_lim. 13 | A randomized but finite horizon can be set by setting upper_lim > lower_lim (randomized uniformly between these bounds) 14 | A discounted infinite horizon can be set by setting prob_per_step_to_stop, which is the probability that 15 | the episode will end on any given timestep. This is implemented by sampling the horizon from a geometric distribution. 16 | Args: 17 | lower_lim (int): Lower limit of the horizon 18 | upper_lim (int): Upper limit of the horizon 19 | prob_per_step_to_stop (float): probability that the episode will end on any given timestep. 20 | 21 | Either lower_lim and upper_lim must both be set or prob_per_step_to_stop must be set. 22 | 23 | Observations: 24 | horizon (n_agents, 1): Episode horizon. Intended for value function 25 | fraction_episode_done (n_agents, 1): Fraction of the episode complete. Intended for value function 26 | timestep (n_agents, 1): raw timestep.
Intended for policy 27 | ''' 28 | @store_args 29 | def __init__(self, env, lower_lim=None, upper_lim=None, prob_per_step_to_stop=None): 30 | super().__init__(env) 31 | assert (lower_lim is not None and upper_lim is not None) or prob_per_step_to_stop is not None 32 | if prob_per_step_to_stop is not None: 33 | assert prob_per_step_to_stop > 0 and prob_per_step_to_stop < 1 34 | self.observation_space = update_obs_space(self, { 35 | 'fraction_episode_done': [self.metadata['n_agents'], 1], 36 | 'horizon': [self.metadata['n_agents'], 1], 37 | 'timestep': [self.metadata['n_agents'], 1] 38 | }) 39 | self.observation_space = update_obs_space(self, {}) 40 | 41 | def reset(self, **kwargs): 42 | self._t = 0 43 | if self.prob_per_step_to_stop is not None: 44 | self._horizon = np.random.geometric(p=self.prob_per_step_to_stop) 45 | else: 46 | self._horizon = np.random.randint(self.lower_lim, self.upper_lim + 1) if self.lower_lim < self.upper_lim else self.lower_lim 47 | return self.observation(self.env.reset()) 48 | 49 | def step(self, action): 50 | self._t += 1 51 | obs, rew, done, info = self.env.step(action) 52 | if self._t >= self._horizon: 53 | done = True 54 | return self.observation(obs), rew, done, info 55 | 56 | def observation(self, obs): 57 | obs['timestep'] = np.ones((self.metadata['n_agents'], 1), dtype=int) * self._t 58 | obs['fraction_episode_done'] = np.ones((self.metadata['n_agents'], 1)) * self._t / self._horizon 59 | obs['horizon'] = np.ones((self.metadata['n_agents'], 1)) * self._horizon 60 | return obs 61 | 62 | 63 | class RandomIdentityVector(gym.Wrapper): 64 | ''' 65 | Give agents a vector_dim-dimensional random identity sampled uniformly between 0 and 1. 66 | 67 | Args: 68 | vector_dim (int): Dimension of the identity vector 69 | 70 | Observations: 71 | agent_identity (n_agents, vector_dim): identity for each agent 72 | ''' 73 | @store_args 74 | def __init__(self, env, vector_dim=16): 75 | super().__init__(env) 76 | self.observation_space = update_obs_space(self, {'agent_identity': [self.metadata['n_agents'], self.vector_dim]}) 77 | 78 | def reset(self): 79 | self.agent_identities = np.random.uniform(0, 1, (self.metadata['n_agents'], self.vector_dim)) 80 | return self.observation(self.env.reset()) 81 | 82 | def observation(self, obs): 83 | obs['agent_identity'] = self.agent_identities 84 | return obs 85 | 86 | def step(self, action): 87 | obs, rew, done, info = self.env.step(action) 88 | return self.observation(obs), rew, done, info 89 | 90 | 91 | class OtherActorAttentionAction(gym.Wrapper): 92 | ''' 93 | Utility class to make actions that attend over other agents possible. Agents will likely receive an entity 94 | based observation of others. The order of these entities is defined by a circulant matrix (see 95 | mae_envs.wrappers.multi_agent:SplitObservations). If a policy constructs an attention action head 96 | based on these observations we need to properly process its choice as the ordering will be different 97 | for every agent. 98 | 99 | This class defines a Discrete action head with number of options n_agents - 1 (attention over all other agents) 100 | and it defines a function _get_target_actor that given the choice of a particular agent maps this back 101 | to the true ordering of agents.
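For example (an illustrative sketch of the mapping, derived from scipy.linalg.circulant as used below): with 4 agents, self.other_actors would be {0: [3, 2, 1], 1: [0, 3, 2], 2: [1, 0, 3], 3: [2, 1, 0]}, so agent 1 picking attention index 1 is mapped back to agent 3.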
102 | Args: 103 | action_name (string): name of the action to create 104 | ''' 105 | @store_args 106 | def __init__(self, env, action_name): 107 | super().__init__(env) 108 | self.action_name = action_name 109 | self.n_agents = self.metadata['n_agents'] 110 | self.action_space.spaces[action_name] = Tuple([Discrete(n=self.metadata['n_agents'] - 1) 111 | for _ in range(self.n_agents)]) 112 | 113 | # This matches the circulant ordering used for "Others" Observations (see mae_envs.wrappers.multi_agent:SplitObservations) 114 | self.other_actors = np.arange(self.n_agents)[circulant(np.arange(self.n_agents))[:, 1:]] 115 | self.other_actors = dict(zip(np.arange(self.n_agents), self.other_actors)) 116 | 117 | def _get_target_actor(self, actor, action): 118 | ''' 119 | Return the true index of the targeted agent. Indices given by the action will be in a rotated space defined 120 | based on how entities are presented to the policy, so we must map back to the underlying ordering. 121 | 122 | If the index is -1, this means no other agent was chosen. 123 | ''' 124 | if action[self.action_name][actor] == -1: 125 | return np.array([]) 126 | else: 127 | return np.array([self.other_actors[actor][action[self.action_name][actor]]]) 128 | 129 | 130 | class ActionOptionsWrapper(gym.Wrapper): 131 | ''' 132 | Allows one to define a hierarchical action space by defining a meta action that chooses which 133 | sub action head to execute, e.g. if you want agents to only be able to attack OR eat. 134 | 135 | Args: 136 | action_keys (list): list of action head names that will be options 137 | defaults (dict): mapping from action_key to the value that should be passed if that action is NOT chosen. 138 | Downstream wrappers for this action_key will handle cases when the default is passed. 139 | do_nothing_option (bool): If True, adds the option to pick none of the available actions and pass the default 140 | value for all of them. 141 | 142 | Observations: 143 | previous_choice (n_agents, number of options): One-hot observation of each agent's previous action choice.
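Example (taken from env_prisoners_buddy.make_env above): ActionOptionsWrapper(env, ['action_choose_agent'], {'action_choose_agent': -1}) adds an 'action_choose_option' head whose options are ['action_choose_agent', 'do_nothing']; whenever an agent picks 'do_nothing', its 'action_choose_agent' entry is overwritten with the default -1 before being passed to downstream wrappers.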
144 | ''' 145 | @store_args 146 | def __init__(self, env, action_keys, defaults, do_nothing_option=True): 147 | super().__init__(env) 148 | if self.do_nothing_option: 149 | self.action_keys.append('do_nothing') 150 | self.n_agents = self.metadata['n_agents'] 151 | self.action_space.spaces['action_choose_option'] = Tuple([Discrete(n=len(self.action_keys)) 152 | for _ in range(self.metadata['n_agents'])]) 153 | self.observation_space = update_obs_space(self, {'previous_choice': [self.metadata['n_agents'], len(self.action_keys)]}) 154 | 155 | def reset(self): 156 | self.previous_choice = np.zeros((self.metadata['n_agents'], len(self.action_keys))) 157 | return self.observation(self.env.reset()) 158 | 159 | def step(self, action): 160 | for i in range(self.n_agents): 161 | for ac_ind, ac_name in enumerate(self.action_keys): 162 | if ac_ind != action['action_choose_option'][i] and ac_name != 'do_nothing': 163 | action[ac_name][i] = self.defaults[ac_name] 164 | 165 | self.previous_choice = np.eye(len(self.action_keys))[action['action_choose_option']] 166 | obs, rew, done, info = self.env.step(action) 167 | return self.observation(obs), rew, done, info 168 | 169 | def observation(self, obs): 170 | obs['previous_choice'] = self.previous_choice 171 | return obs 172 | -------------------------------------------------------------------------------- /randomized_uncertain_social_preferences/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | setup( 5 | name='rusp', 6 | version='0.0.0', 7 | packages=find_packages(), 8 | package_data={ 9 | '': ['*.pyx', '*.pxd', '*.pxi', '*.h'], 10 | }) 11 | -------------------------------------------------------------------------------- /requirements_ma_policy.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.13.1 2 | cloudpickle==0.5.2 3 | baselines==0.1.5 4 | opencv-python>=3.4.3.18 # needed for baselines not to crash 5 | pytest==5.0.1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from os.path import dirname, realpath 3 | from setuptools import find_packages, setup 4 | 5 | 6 | setup( 7 | name='mae_envs', 8 | version='0.0.0', 9 | packages=find_packages(), 10 | package_data={ 11 | '': ['*.pyx', '*.pxd', '*.pxi', '*.h'], 12 | }) 13 | --------------------------------------------------------------------------------
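For completeness, a minimal random-rollout sketch for the prisoner's-buddy environment defined above, mirroring the patterns used in the test files (make_env, the 'action_choose_agent' / 'action_choose_option' heads, reset on done). The file name, loop length, and use of uniformly random actions are illustrative assumptions, not part of the repository.

# random_rollout_example.py (illustrative sketch, not part of the original package)
import numpy as np
from rusp.env_prisoners_buddy import make_env

n_agents = 5
env = make_env(n_agents=n_agents)
obs = env.reset()
for _ in range(200):
    action = {
        # Attention index into each agent's ordering of the other agents (see OtherActorAttentionAction)
        'action_choose_agent': np.random.randint(0, n_agents - 1, size=n_agents),
        # 0 = execute action_choose_agent, 1 = do nothing (see ActionOptionsWrapper)
        'action_choose_option': np.random.randint(0, 2, size=n_agents),
    }
    obs, rew, done, info = env.step(action)  # rew is a length-n_agents vector
    if done:
        obs = env.reset()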