├── multiagent-particle-envs ├── bin │ ├── __init__.py │ └── interactive.py ├── .gitignore ├── multiagent │ ├── scenarios │ │ ├── __init__.py │ │ ├── simple.py │ │ ├── simple_speaker_listener.py │ │ ├── simple_reference.py │ │ ├── simple_spread.py │ │ ├── simple_spread_partially_observed.py │ │ ├── simple_push.py │ │ ├── simple_adversary.py │ │ ├── simple_tag.py │ │ ├── simple_crypto.py │ │ └── simple_world_comm.py │ ├── scenario.py │ ├── __init__.py │ ├── policy.py │ ├── multi_discrete.py │ ├── rendering.py │ ├── core.py │ └── environment.py ├── setup.py ├── make_env.py └── README.md ├── experiments ├── result_test │ ├── checkpoint │ ├── debug.meta │ ├── debug │ │ ├── checkpoint │ │ ├── team_0.index │ │ ├── team_0.meta │ │ ├── team_0.data-00000-of-00001 │ │ ├── events.out.tfevents.1597310617.cilc42-HP-Z4-G4-Workstation │ │ └── events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation │ ├── debug.index │ └── debug.data-00000-of-00001 ├── __pycache__ │ ├── ibmac.cpython-36.pyc │ └── ibmac_inter.cpython-36.pyc ├── graph │ ├── events.out.tfevents.1597310616.cilc42-HP-Z4-G4-Workstation │ └── events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation ├── ibmac_inter.py └── ibmac.py ├── maddpg ├── __pycache__ │ └── __init__.cpython-36.pyc ├── common │ ├── __pycache__ │ │ ├── tf_util.cpython-36.pyc │ │ └── distributions.cpython-36.pyc │ ├── tf_util.py │ └── distributions.py ├── trainer │ ├── __pycache__ │ │ ├── maddpg.cpython-36.pyc │ │ ├── replay_buffer.cpython-36.pyc │ │ └── replay_buffer_with_messages.cpython-36.pyc │ ├── replay_buffer.py │ ├── replay_buffer_with_messages.py │ └── maddpg.py └── __init__.py ├── .idea ├── .gitignore ├── misc.xml ├── inspectionProfiles │ ├── profiles_settings.xml │ └── Project_Default.xml ├── vcs.xml ├── modules.xml └── icml_macom.iml ├── requirements.txt └── README.md /multiagent-particle-envs/bin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /multiagent-particle-envs/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.egg-info/ 3 | *.pyc -------------------------------------------------------------------------------- /experiments/result_test/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "debug" 2 | all_model_checkpoint_paths: "debug" 3 | -------------------------------------------------------------------------------- /experiments/result_test/debug.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug.meta -------------------------------------------------------------------------------- /experiments/result_test/debug/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "team_0" 2 | all_model_checkpoint_paths: "team_0" 3 | -------------------------------------------------------------------------------- /experiments/result_test/debug.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug.index -------------------------------------------------------------------------------- /experiments/result_test/debug/team_0.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/team_0.index -------------------------------------------------------------------------------- /experiments/result_test/debug/team_0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/team_0.meta -------------------------------------------------------------------------------- /maddpg/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /experiments/__pycache__/ibmac.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/__pycache__/ibmac.cpython-36.pyc -------------------------------------------------------------------------------- /experiments/result_test/debug.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug.data-00000-of-00001 -------------------------------------------------------------------------------- /maddpg/common/__pycache__/tf_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/common/__pycache__/tf_util.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/maddpg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/trainer/__pycache__/maddpg.cpython-36.pyc -------------------------------------------------------------------------------- /experiments/__pycache__/ibmac_inter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/__pycache__/ibmac_inter.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/common/__pycache__/distributions.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/common/__pycache__/distributions.cpython-36.pyc -------------------------------------------------------------------------------- /experiments/result_test/debug/team_0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/team_0.data-00000-of-00001 -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc -------------------------------------------------------------------------------- /maddpg/trainer/__pycache__/replay_buffer_with_messages.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/trainer/__pycache__/replay_buffer_with_messages.cpython-36.pyc -------------------------------------------------------------------------------- /experiments/graph/events.out.tfevents.1597310616.cilc42-HP-Z4-G4-Workstation: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/graph/events.out.tfevents.1597310616.cilc42-HP-Z4-G4-Workstation -------------------------------------------------------------------------------- /experiments/graph/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/graph/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /experiments/result_test/debug/events.out.tfevents.1597310617.cilc42-HP-Z4-G4-Workstation: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/events.out.tfevents.1597310617.cilc42-HP-Z4-G4-Workstation -------------------------------------------------------------------------------- /experiments/result_test/debug/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | import os.path as osp 3 | 4 | 5 | def load(name): 6 | pathname = osp.join(osp.dirname(__file__), name) 7 | return imp.load_source('', pathname) 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which 
the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /maddpg/__init__.py: -------------------------------------------------------------------------------- 1 | class AgentTrainer(object): 2 | def __init__(self, name, model, obs_shape, act_space, args): 3 | raise NotImplemented() 4 | 5 | def action(self, obs): 6 | raise NotImplemented() 7 | 8 | def process_experience(self, obs, act, rew, new_obs, done, terminal): 9 | raise NotImplemented() 10 | 11 | def preupdate(self): 12 | raise NotImplemented() 13 | 14 | def update(self, agents): 15 | raise NotImplemented() -------------------------------------------------------------------------------- /multiagent-particle-envs/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='multiagent', 4 | version='0.0.1', 5 | description='Multi-Agent Goal-Driven Communication Environment', 6 | url='https://github.com/openai/multiagent-public', 7 | author='Igor Mordatch', 8 | author_email='mordatch@openai.com', 9 | packages=find_packages(), 10 | include_package_data=True, 11 | zip_safe=False, 12 | install_requires=['gym', 'numpy-stl'] 13 | ) 14 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # Multiagent envs 4 | # ---------------------------------------- 5 | 6 | register( 7 | id='MultiagentSimple-v0', 8 | entry_point='multiagent.envs:SimpleEnv', 9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in 10 | # rllab run script 11 | max_episode_steps=100, 12 | ) 13 | 14 | register( 15 | id='MultiagentSimpleSpeakerListener-v0', 16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv', 17 | max_episode_steps=100, 18 | ) 19 | -------------------------------------------------------------------------------- /.idea/icml_macom.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.0 2 | astor==0.7.1 3 | -e git+https://github.com/openai/baselines.git@57e05eb420f9a20fa8cd7ee7580b1a8874b4a323#egg=baselines 4 | cffi==1.12.2 5 | chardet==3.0.4 6 | Click==7.0 7 | cloudpickle==0.7.0 8 | cycler==0.10.0 9 | Cython==0.29.6 10 | dill==0.2.9 11 | enum34==1.1.6 12 | future==0.17.1 13 | futures==3.1.1 14 | gast==0.2.2 15 | glfw==1.7.1 16 | grpcio==1.18.0 17 | gym==0.9.4 18 | h5py==2.9.0 19 | idna==2.8 20 | imageio==2.5.0 21 | joblib==0.13.1 22 | Keras-Applications==1.0.7 23 | Keras-Preprocessing==1.0.9 24 | kiwisolver==1.0.1 25 | lockfile==0.12.2 26 | lxml==4.3.2 27 | Markdown==3.0.1 28 | matplotlib==3.0.2 29 | -e multiagent-particle-envs 30 | numpy==1.16.2 31 | 
numpy-stl==2.9.0 32 | opencv-python==4.0.0.21 33 | pandas==0.24.1 34 | Pillow==5.4.1 35 | progressbar2==3.39.2 36 | protobuf==3.6.1 37 | pycparser==2.19 38 | pygame==1.9.4 39 | pyglet==1.3.2 40 | PyOpenGL==3.1.0 41 | pyparsing==2.3.1 42 | python-dateutil==2.8.0 43 | python-utils==2.3.0 44 | pytz==2018.9 45 | requests==2.21.0 46 | scipy==1.2.0 47 | seaborn==0.9.0 48 | six==1.12.0 49 | tensorboard==1.12.2 50 | tensorboardX==1.6 51 | tensorflow==1.12.0 52 | termcolor==1.1.0 53 | torch==1.0.1 54 | torchvision==0.2.1 55 | tqdm==4.30.0 56 | urllib3==1.24.1 57 | Werkzeug==0.14.1 58 | -------------------------------------------------------------------------------- /multiagent-particle-envs/bin/interactive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os,sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '..')) 4 | import argparse 5 | import time 6 | 7 | from multiagent.environment import MultiAgentEnv 8 | from multiagent.policy import InteractivePolicy 9 | import multiagent.scenarios as scenarios 10 | 11 | if __name__ == '__main__': 12 | # parse arguments 13 | parser = argparse.ArgumentParser(description=None) 14 | parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.') 15 | args = parser.parse_args() 16 | 17 | # load scenario from script 18 | scenario = scenarios.load(args.scenario).Scenario() 19 | # create world 20 | world = scenario.make_world() 21 | # create multiagent environment 22 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, 23 | scenario.observation, info_callback=None, 24 | shared_viewer=False, discrete_action=True) 25 | # render call to create viewer window (necessary only for interactive policies) 26 | env.render() 27 | # create interactive policies for each agent 28 | policies = [InteractivePolicy(env,i) for i in range(env.n)] 29 | # execution loop 30 | obs_n = env.reset() 31 | while True: 32 | start = time.time() 33 | # query for action from each agent's policy 34 | act_n = [] 35 | for i, policy in enumerate(policies): 36 | act_n.append(policy.action(obs_n[i])) 37 | # step environment 38 | obs_n, reward_n, done_n, _ = env.step(act_n) 39 | # render all agent views 40 | env.render() 41 | end = time.time() 42 | elapsed = end - start 43 | time.sleep(max(1 / 30 - elapsed, 0)) 44 | # display rewards 45 | #for agent in env.world.agents: 46 | # print(agent.name + " reward: %0.3f" % env._get_reward(agent)) 47 | -------------------------------------------------------------------------------- /multiagent-particle-envs/make_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating a multiagent environment with one of the scenarios listed 3 | in ./scenarios/. 4 | Can be called by using, for example: 5 | env = make_env('simple_speaker_listener') 6 | After producing the env object, can be used similarly to an OpenAI gym 7 | environment. 8 | 9 | A policy using this environment must output actions in the form of a list 10 | for all agents. Each element of the list should be a numpy array, 11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede 12 | communication actions in this array. See environment.py for more details. 13 | """ 14 | 15 | def make_env(scenario_name, benchmark=False): 16 | ''' 17 | Creates a MultiAgentEnv object as env. This can be used similar to a gym 18 | environment by calling env.reset() and env.step(). 
19 | Use env.render() to view the environment on the screen. 20 | 21 | Input: 22 | scenario_name : name of the scenario from ./scenarios/ to be Returns 23 | (without the .py extension) 24 | benchmark : whether you want to produce benchmarking data 25 | (usually only done during evaluation) 26 | 27 | Some useful env properties (see environment.py): 28 | .observation_space : Returns the observation space for each agent 29 | .action_space : Returns the action space for each agent 30 | .n : Returns the number of Agents 31 | ''' 32 | from multiagent.environment import MultiAgentEnv 33 | import multiagent.scenarios as scenarios 34 | 35 | # load scenario from script 36 | scenario = scenarios.load(scenario_name + ".py").Scenario() 37 | # create world 38 | world = scenario.make_world() 39 | # create multiagent environment 40 | if benchmark: 41 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data) 42 | else: 43 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 44 | return env 45 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.RIGHT: self.move[0] = True 45 | if k==key.LEFT: self.move[1] = True 46 | if k==key.DOWN: self.move[2] = True 47 | if k==key.UP: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.RIGHT: self.move[0] = False 50 | if k==key.LEFT: self.move[1] = False 51 | if k==key.DOWN: self.move[2] = False 52 | if k==key.UP: self.move[3] = False 53 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 
| # add agents 9 | world.agents = [Agent() for i in range(1)] 10 | for i, agent in enumerate(world.agents): 11 | agent.name = 'agent %d' % i 12 | agent.collide = False 13 | agent.silent = True 14 | # add landmarks 15 | world.landmarks = [Landmark() for i in range(1)] 16 | for i, landmark in enumerate(world.landmarks): 17 | landmark.name = 'landmark %d' % i 18 | landmark.collide = False 19 | landmark.movable = False 20 | # make initial conditions 21 | self.reset_world(world) 22 | return world 23 | 24 | def reset_world(self, world): 25 | # random properties for agents 26 | for i, agent in enumerate(world.agents): 27 | agent.color = np.array([0.25,0.25,0.25]) 28 | # random properties for landmarks 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.color = np.array([0.75,0.75,0.75]) 31 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 32 | # set random initial states 33 | for agent in world.agents: 34 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 35 | agent.state.p_vel = np.zeros(world.dim_p) 36 | agent.state.c = np.zeros(world.dim_c) 37 | for i, landmark in enumerate(world.landmarks): 38 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 39 | landmark.state.p_vel = np.zeros(world.dim_p) 40 | 41 | def reward(self, agent, world): 42 | dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)) 43 | return -dist2 #np.exp(-dist2) 44 | 45 | def observation(self, agent, world): 46 | # get positions of all entities in this agent's reference frame 47 | entity_pos = [] 48 | for entity in world.landmarks: 49 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 50 | return np.concatenate([agent.state.p_vel] + entity_pos) 51 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. 
Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | random_array = prng.np_random.rand(self.num_discrete_space) 34 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 35 | def contains(self, x): 36 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 37 | 38 | @property 39 | def shape(self): 40 | return self.num_discrete_space 41 | def __repr__(self): 42 | return "MultiDiscrete" + str(self.num_discrete_space) 43 | def __eq__(self, other): 44 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /maddpg/trainer/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, action, reward, obs_tp1, done): 26 | data = (obs_t, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | actions.append(np.array(action, copy=False)) 41 | rewards.append(reward) 42 | obses_tp1.append(np.array(obs_tp1, copy=False)) 43 | dones.append(done) 44 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 45 | 46 | def make_index(self, batch_size): 47 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 48 | 49 | def make_latest_index(self, batch_size): 50 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 51 | np.random.shuffle(idx) 52 | return idx 53 | 54 | def sample_index(self, idxes): 55 | return self._encode_sample(idxes) 56 | 57 | def sample(self, batch_size): 58 | """Sample a batch of experiences. 59 | 60 | Parameters 61 | ---------- 62 | batch_size: int 63 | How many transitions to sample. 
64 | 65 | Returns 66 | ------- 67 | obs_batch: np.array 68 | batch of observations 69 | act_batch: np.array 70 | batch of actions executed given obs_batch 71 | rew_batch: np.array 72 | rewards received as results of executing act_batch 73 | next_obs_batch: np.array 74 | next set of observations seen after executing act_batch 75 | done_mask: np.array 76 | done_mask[i] = 1 if executing act_batch[i] resulted in 77 | the end of an episode and 0 otherwise. 78 | """ 79 | if batch_size > 0: 80 | idxes = self.make_index(batch_size) 81 | else: 82 | idxes = range(0, len(self._storage)) 83 | return self._encode_sample(idxes) 84 | 85 | def collect(self): 86 | return self.sample(-1) 87 | -------------------------------------------------------------------------------- /maddpg/trainer/replay_buffer_with_messages.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | """Create Prioritized Replay buffer. 7 | 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 13 | """ 14 | self._storage = [] 15 | self._maxsize = int(size) 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def clear(self): 22 | self._storage = [] 23 | self._next_idx = 0 24 | 25 | def add(self, obs_t, message, action, reward, obs_tp1, done): 26 | data = (obs_t, message, action, reward, obs_tp1, done) 27 | 28 | if self._next_idx >= len(self._storage): 29 | self._storage.append(data) 30 | else: 31 | self._storage[self._next_idx] = data 32 | self._next_idx = (self._next_idx + 1) % self._maxsize 33 | 34 | def _encode_sample(self, idxes): 35 | obses_t, messages, actions, rewards, obses_tp1, dones = [], [], [], [], [], [] 36 | for i in idxes: 37 | data = self._storage[i] 38 | obs_t, message, action, reward, obs_tp1, done = data 39 | obses_t.append(np.array(obs_t, copy=False)) 40 | messages.append(np.array(message, copy=False)) 41 | actions.append(np.array(action, copy=False)) 42 | rewards.append(reward) 43 | obses_tp1.append(np.array(obs_tp1, copy=False)) 44 | dones.append(done) 45 | return np.array(obses_t), np.array(messages), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 46 | 47 | def make_index(self, batch_size): 48 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 49 | 50 | def make_latest_index(self, batch_size): 51 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)] 52 | np.random.shuffle(idx) 53 | return idx 54 | 55 | def sample_index(self, idxes): 56 | return self._encode_sample(idxes) 57 | 58 | def sample(self, batch_size): 59 | """Sample a batch of experiences. 60 | 61 | Parameters 62 | ---------- 63 | batch_size: int 64 | How many transitions to sample. 65 | 66 | Returns 67 | ------- 68 | obs_batch: np.array 69 | batch of observations 70 | act_batch: np.array 71 | batch of actions executed given obs_batch 72 | rew_batch: np.array 73 | rewards received as results of executing act_batch 74 | next_obs_batch: np.array 75 | next set of observations seen after executing act_batch 76 | done_mask: np.array 77 | done_mask[i] = 1 if executing act_batch[i] resulted in 78 | the end of an episode and 0 otherwise. 
79 | """ 80 | if batch_size > 0: 81 | idxes = self.make_index(batch_size) 82 | else: 83 | idxes = range(0, len(self._storage)) 84 | return self._encode_sample(idxes) 85 | 86 | def collect(self): 87 | return self.sample(-1) 88 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_speaker_listener.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 3 10 | num_landmarks = 3 11 | # add agents 12 | world.agents = [Agent() for i in range(2)] 13 | for i, agent in enumerate(world.agents): 14 | agent.name = 'agent %d' % i 15 | agent.collide = False 16 | agent.size = 0.075 17 | # speaker 18 | world.agents[0].movable = False 19 | # listener 20 | world.agents[1].silent = True 21 | # add landmarks 22 | world.landmarks = [Landmark() for i in range(num_landmarks)] 23 | for i, landmark in enumerate(world.landmarks): 24 | landmark.name = 'landmark %d' % i 25 | landmark.collide = False 26 | landmark.movable = False 27 | landmark.size = 0.04 28 | # make initial conditions 29 | self.reset_world(world) 30 | return world 31 | 32 | def reset_world(self, world): 33 | # assign goals to agents 34 | for agent in world.agents: 35 | agent.goal_a = None 36 | agent.goal_b = None 37 | # want listener to go to the goal landmark 38 | world.agents[0].goal_a = world.agents[1] 39 | world.agents[0].goal_b = np.random.choice(world.landmarks) 40 | # random properties for agents 41 | for i, agent in enumerate(world.agents): 42 | agent.color = np.array([0.25,0.25,0.25]) 43 | # random properties for landmarks 44 | world.landmarks[0].color = np.array([0.65,0.15,0.15]) 45 | world.landmarks[1].color = np.array([0.15,0.65,0.15]) 46 | world.landmarks[2].color = np.array([0.15,0.15,0.65]) 47 | # special colors for goals 48 | world.agents[0].goal_a.color = world.agents[0].goal_b.color + np.array([0.45, 0.45, 0.45]) 49 | # set random initial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | for i, landmark in enumerate(world.landmarks): 55 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 56 | landmark.state.p_vel = np.zeros(world.dim_p) 57 | 58 | def benchmark_data(self, agent, world): 59 | # returns data for benchmarking purposes 60 | return reward(agent, reward) 61 | 62 | def reward(self, agent, world): 63 | # squared distance from listener to landmark 64 | a = world.agents[0] 65 | dist2 = np.sum(np.square(a.goal_a.state.p_pos - a.goal_b.state.p_pos)) 66 | return -dist2 67 | 68 | def observation(self, agent, world): 69 | # goal color 70 | goal_color = np.zeros(world.dim_color) 71 | if agent.goal_b is not None: 72 | goal_color = agent.goal_b.color 73 | 74 | # get positions of all entities in this agent's reference frame 75 | entity_pos = [] 76 | for entity in world.landmarks: 77 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 78 | 79 | # communication of all other agents 80 | comm = [] 81 | for other in world.agents: 82 | if other is agent or (other.state.c is None): continue 83 | comm.append(other.state.c) 84 | 85 | # speaker 86 | if not agent.movable: 87 | return np.concatenate([goal_color]) 
88 | # listener 89 | if agent.silent: 90 | return np.concatenate([agent.state.p_vel] + entity_pos + comm) 91 | 92 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_reference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | class Scenario(BaseScenario): 6 | def make_world(self): 7 | world = World() 8 | # set any world properties first 9 | world.dim_c = 10 10 | # add agents 11 | world.agents = [Agent() for i in range(2)] 12 | for i, agent in enumerate(world.agents): 13 | agent.name = 'agent %d' % i 14 | agent.collide = False 15 | # agent.u_noise = 1e-1 16 | # agent.c_noise = 1e-1 17 | # add landmarks 18 | world.landmarks = [Landmark() for i in range(3)] 19 | for i, landmark in enumerate(world.landmarks): 20 | landmark.name = 'landmark %d' % i 21 | landmark.collide = False 22 | landmark.movable = False 23 | # make initial conditions 24 | self.reset_world(world) 25 | return world 26 | 27 | def reset_world(self, world): 28 | # assign goals to agents 29 | for agent in world.agents: 30 | agent.goal_a = None 31 | agent.goal_b = None 32 | # want other agent to go to the goal landmark 33 | world.agents[0].goal_a = world.agents[1] 34 | world.agents[0].goal_b = np.random.choice(world.landmarks) 35 | world.agents[1].goal_a = world.agents[0] 36 | world.agents[1].goal_b = np.random.choice(world.landmarks) 37 | # random properties for agents 38 | for i, agent in enumerate(world.agents): 39 | agent.color = np.array([0.25,0.25,0.25]) 40 | # random properties for landmarks 41 | world.landmarks[0].color = np.array([0.75,0.25,0.25]) 42 | world.landmarks[1].color = np.array([0.25,0.75,0.25]) 43 | world.landmarks[2].color = np.array([0.25,0.25,0.75]) 44 | # special colors for goals 45 | world.agents[0].goal_a.color = world.agents[0].goal_b.color 46 | world.agents[1].goal_a.color = world.agents[1].goal_b.color 47 | # set random initial states 48 | for agent in world.agents: 49 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 50 | agent.state.p_vel = np.zeros(world.dim_p) 51 | agent.state.c = np.zeros(world.dim_c) 52 | for i, landmark in enumerate(world.landmarks): 53 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p) 54 | landmark.state.p_vel = np.zeros(world.dim_p) 55 | 56 | def reward(self, agent, world): 57 | if agent.goal_a is None or agent.goal_b is None: 58 | return 0.0 59 | dist2 = np.sum(np.square(agent.goal_a.state.p_pos - agent.goal_b.state.p_pos)) 60 | return -dist2 #np.exp(-dist2) 61 | 62 | def observation(self, agent, world): 63 | # goal positions 64 | # goal_pos = [np.zeros(world.dim_p), np.zeros(world.dim_p)] 65 | # if agent.goal_a is not None: 66 | # goal_pos[0] = agent.goal_a.state.p_pos - agent.state.p_pos 67 | # if agent.goal_b is not None: 68 | # goal_pos[1] = agent.goal_b.state.p_pos - agent.state.p_pos 69 | # goal color 70 | goal_color = [np.zeros(world.dim_color), np.zeros(world.dim_color)] 71 | # if agent.goal_a is not None: 72 | # goal_color[0] = agent.goal_a.color 73 | if agent.goal_b is not None: 74 | goal_color[1] = agent.goal_b.color 75 | 76 | # get positions of all entities in this agent's reference frame 77 | entity_pos = [] 78 | for entity in world.landmarks: #world.entities: 79 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 80 | # entity colors 81 | entity_color = [] 82 | for entity in 
world.landmarks: #world.entities: 83 | entity_color.append(entity.color) 84 | # communication of all other agents 85 | comm = [] 86 | for other in world.agents: 87 | if other is agent: continue 88 | comm.append(other.state.c) 89 | return np.concatenate([agent.state.p_vel] + entity_pos + [goal_color[1]] + comm) 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Efficient Multi-agent Communication: An Information Bottleneck Approach 2 | 3 | This is the code implementing NIPS paper #3584. 4 | 5 | ## Installation 6 | 7 | 8 | ``` 9 | conda create -n imac python=3.6 10 | conda activate imac 11 | pip install tensorflow==1.12.0 12 | conda install mkl_fft=1.0.10 13 | pip install -r requirements.txt 14 | ``` 15 | 16 | - Known dependencies: Python (3.6.8), OpenAI gym (0.9.4), tensorflow (1.12.0), numpy (1.16.2) 17 | 18 | ## How to run 19 | 20 | To run the code, `cd` into the `experiments` directory and run `train.py`: 21 | 22 | ``` 23 | python train.py --scenario simple_spread --exp-name debug --save-dir ./result_test/debug --batch-size 1024 --ibmac_com --trainer ibmac 24 | ``` 25 | 26 | You can use TensorBoard to visualize the results. 27 | 28 | ## Command-line options 29 | 30 | ### Environment options 31 | 32 | - `--scenario`: defines which environment in the MPE is to be used (default: `"simple_spread"`) 33 | 34 | - `--max-episode-len`: maximum length of each episode for the environment (default: `25`) 35 | 36 | - `--num-episodes`: total number of training episodes (default: `60000`) 37 | 38 | - `--num-adversaries`: number of adversaries in the environment (default: `0`) 39 | 40 | - `--good-policy`: algorithm used for the 'good' (non-adversary) policies in the environment 41 | (default: `"maddpg"`; options: {`"maddpg"`, `"ddpg"`}) 42 | 43 | - `--adv-policy`: algorithm used for the adversary policies in the environment 44 | (default: `"maddpg"`; options: {`"maddpg"`, `"ddpg"`}) 45 | 46 | ### Core training parameters 47 | 48 | - `--trainer`: which training algorithm to use (default: `"ibmac"`; options: {`"ibmac"`, `"ibmac_inter"`}) 49 | 50 | `ibmac`: for training the scheduler 51 | 52 | `ibmac_inter`: for training the policy and message outputs 53 | 54 | - `--lr`: learning rate (default: `1e-2`) 55 | 56 | - `--gamma`: discount factor (default: `0.95`) 57 | 58 | - `--batch-size`: batch size (default: `1024`) 59 | 60 | - `--num-units`: number of units in the MLP (default: `64`) 61 | 62 | - `--beta`: coefficient of the KL loss (default: `0.05`) 63 | 64 | - `--ibmac_com`: boolean flag that enables communication (default: `False`) 65 | 66 | - `--random-seed`: random seed (default: `42`) 67 | 68 | ### Checkpointing 69 | 70 | - `--exp-name`: name of the experiment, used as the file name to save all results (default: `None`) 71 | 72 | - `--save-dir`: directory where intermediate training results and model will be saved (default: `"/tmp/policy/"`) 73 | 74 | - `--save-rate`: model is saved every time this number of episodes has been completed (default: `1000`) 75 | 76 | - `--load-dir`: directory where training state and model are loaded from (default: `""`) 77 | 78 | ### Evaluation 79 | 80 | - `--restore`: restores previous training state stored in `load-dir` (or in `save-dir` if no `load-dir` 81 | has been provided), and continues training (default: `False`) 82 | 83 | - `--display`: displays to the screen the trained policy stored in `load-dir` (or in `save-dir` if no `load-dir` 84 | has been provided), but does not continue 
training (default: `False`) 85 | 86 | - `--benchmark`: runs benchmarking evaluations on a saved policy and saves results to the `benchmark-dir` folder (default: `False`) 87 | 88 | - `--benchmark-iters`: number of iterations to run benchmarking for (default: `100000`) 89 | 90 | - `--benchmark-dir`: directory where benchmarking data is saved (default: `"./benchmark_files/"`) 91 | 92 | - `--plots-dir`: directory where training curves are saved (default: `"./learning_curves/"`) 93 | 94 | ## Acknowledgement 95 | 96 | Our code is based on the implementation accompanying: 97 | ```
 98 | @article{lowe2017multi,
 99 |   title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
100 |   author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
101 |   journal={Neural Information Processing Systems (NIPS)},
102 |   year={2017}
103 | }
104 | ```
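As a concrete example of the Evaluation options above, a previously trained policy can typically be replayed on screen with a command along the following lines. This is only a sketch: it assumes the checkpoint written by the training command in "How to run", and that `train.py` parses these flags exactly as documented above.

```
python train.py --scenario simple_spread --exp-name debug --load-dir ./result_test/debug --trainer ibmac --ibmac_com --display
```

Replacing `--display` with `--restore` would instead continue training from the saved state rather than just visualizing it.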
105 | 106 | We slightly modify the environment's **act_space** setting, so there are some differences in the final reward output if you directly install the original version of the environment. 107 | 108 | We also add a new scenario, `simple_spread_partially_observed`, in which `num_agents` can be increased to use more agents. 109 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_spread.py: --------------------------------------------------------------------------------
entity_pos.append(entity.state.p_pos - agent.state.p_pos) 88 | # entity colors 89 | entity_color = [] 90 | for entity in world.landmarks: # world.entities: 91 | entity_color.append(entity.color) 92 | # communication of all other agents 93 | comm = [] 94 | other_pos = [] 95 | for other in world.agents: 96 | if other is agent: continue 97 | comm.append(other.state.c) 98 | other_pos.append(other.state.p_pos - agent.state.p_pos) 99 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm) 100 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_spread_partially_observed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | import heapq 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_agents = 5 12 | num_landmarks = 5 13 | # add agents 14 | world.agents = [Agent() for i in range(num_agents)] 15 | for i, agent in enumerate(world.agents): 16 | agent.name = 'agent %d' % i 17 | agent.collide = True 18 | agent.silent = True 19 | agent.size = 0.05 20 | # add landmarks 21 | world.landmarks = [Landmark() for i in range(num_landmarks)] 22 | for i, landmark in enumerate(world.landmarks): 23 | landmark.name = 'landmark %d' % i 24 | landmark.collide = False 25 | landmark.movable = False 26 | # make initial conditions 27 | self.reset_world(world) 28 | return world 29 | 30 | def reset_world(self, world): 31 | # random properties for agents 32 | for i, agent in enumerate(world.agents): 33 | agent.color = np.array([0.35, 0.35, 0.85]) 34 | # random properties for landmarks 35 | for i, landmark in enumerate(world.landmarks): 36 | landmark.color = np.array([0.25, 0.25, 0.25]) 37 | # set random initial states 38 | for agent in world.agents: 39 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 40 | agent.state.p_vel = np.zeros(world.dim_p) 41 | agent.state.c = np.zeros(world.dim_c) 42 | for i, landmark in enumerate(world.landmarks): 43 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 44 | landmark.state.p_vel = np.zeros(world.dim_p) 45 | 46 | def benchmark_data(self, agent, world): 47 | rew = 0 48 | collisions = 0 49 | occupied_landmarks = 0 50 | min_dists = 0 51 | for l in world.landmarks: 52 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 53 | min_dists += min(dists) 54 | rew -= min(dists) 55 | if min(dists) < 0.1: 56 | occupied_landmarks += 1 57 | if agent.collide: 58 | for a in world.agents: 59 | if self.is_collision(a, agent): 60 | rew -= 1 61 | collisions += 1 62 | return (rew, collisions, min_dists, occupied_landmarks) 63 | 64 | 65 | def is_collision(self, agent1, agent2): 66 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 67 | dist = np.sqrt(np.sum(np.square(delta_pos))) 68 | dist_min = agent1.size + agent2.size 69 | return True if dist < dist_min else False 70 | 71 | def reward(self, agent, world): 72 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions 73 | rew = 0 74 | for l in world.landmarks: 75 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents] 76 | rew -= min(dists) 77 | if agent.collide: 78 | for a in world.agents: 79 | if self.is_collision(a, agent): 80 | 
rew -= 1 81 | return rew 82 | 83 | def observation(self, agent, world): 84 | # get positions of all entities in this agent's reference frame 85 | entity_pos = [] 86 | for entity in world.landmarks: # world.entities: 87 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 88 | entity_pos = heapq.nsmallest(3,entity_pos, key=lambda s: np.sum(np.square(s))) 89 | # entity colors 90 | entity_color = [] 91 | for entity in world.landmarks: # world.entities: 92 | entity_color.append(entity.color) 93 | # communication of all other agents 94 | comm = [] 95 | other_pos = [] 96 | for other in world.agents: 97 | if other is agent: continue 98 | comm.append(other.state.c) 99 | other_pos.append(other.state.p_pos - agent.state.p_pos) 100 | other_pos = heapq.nsmallest(3,other_pos, key=lambda s: np.sum(np.square(s))) 101 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos) 102 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_push.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | import random 5 | 6 | # 7 | # # the non-ensemble version of 8 | # 9 | # 10 | 11 | class Scenario(BaseScenario): 12 | def make_world(self): 13 | world = World() 14 | # set any world properties first 15 | world.dim_c = 2 16 | num_agents = 2 17 | num_adversaries = 1 18 | num_landmarks = 2 19 | # add agents 20 | world.agents = [Agent() for i in range(num_agents)] 21 | for i, agent in enumerate(world.agents): 22 | agent.name = 'agent %d' % i 23 | agent.collide = True 24 | agent.silent = True 25 | if i < num_adversaries: 26 | agent.adversary = True 27 | else: 28 | agent.adversary = False 29 | # agent.u_noise = 1e-1 30 | # agent.c_noise = 1e-1 31 | # add landmarks 32 | world.landmarks = [Landmark() for i in range(num_landmarks)] 33 | for i, landmark in enumerate(world.landmarks): 34 | landmark.name = 'landmark %d' % i 35 | landmark.collide = False 36 | landmark.movable = False 37 | # make initial conditions 38 | self.reset_world(world) 39 | return world 40 | 41 | def reset_world(self, world): 42 | # random properties for landmarks 43 | for i, landmark in enumerate(world.landmarks): 44 | landmark.color = np.array([0.1, 0.1, 0.1]) 45 | landmark.color[i + 1] += 0.8 46 | landmark.index = i 47 | # set goal landmark 48 | goal = np.random.choice(world.landmarks) 49 | for i, agent in enumerate(world.agents): 50 | agent.goal_a = goal 51 | agent.color = np.array([0.25, 0.25, 0.25]) 52 | if agent.adversary: 53 | agent.color = np.array([0.75, 0.25, 0.25]) 54 | else: 55 | j = goal.index 56 | agent.color[j + 1] += 0.5 57 | # set random initial states 58 | for agent in world.agents: 59 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 60 | agent.state.p_vel = np.zeros(world.dim_p) 61 | agent.state.c = np.zeros(world.dim_c) 62 | for i, landmark in enumerate(world.landmarks): 63 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 64 | landmark.state.p_vel = np.zeros(world.dim_p) 65 | 66 | def reward(self, agent, world): 67 | # Agents are rewarded based on minimum agent distance to each landmark 68 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 69 | 70 | def agent_reward(self, agent, world): 71 | # the distance to the goal 72 | return -np.sqrt(np.sum(np.square(agent.state.p_pos - 
agent.goal_a.state.p_pos))) 73 | 74 | def adversary_reward(self, agent, world): 75 | # keep the nearest good agents away from the goal 76 | agent_dist = [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in world.agents if not a.adversary] 77 | pos_rew = min(agent_dist) 78 | #nearest_agent = world.good_agents[np.argmin(agent_dist)] 79 | #neg_rew = np.sqrt(np.sum(np.square(nearest_agent.state.p_pos - agent.state.p_pos))) 80 | neg_rew = np.sqrt(np.sum(np.square(agent.goal_a.state.p_pos - agent.state.p_pos))) 81 | #neg_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in world.good_agents]) 82 | return pos_rew - neg_rew 83 | 84 | def observation(self, agent, world): 85 | # get positions of all entities in this agent's reference frame 86 | entity_pos = [] 87 | for entity in world.landmarks: # world.entities: 88 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 89 | # entity colors 90 | entity_color = [] 91 | for entity in world.landmarks: # world.entities: 92 | entity_color.append(entity.color) 93 | # communication of all other agents 94 | comm = [] 95 | other_pos = [] 96 | for other in world.agents: 97 | if other is agent: continue 98 | comm.append(other.state.c) 99 | other_pos.append(other.state.p_pos - agent.state.p_pos) 100 | if not agent.adversary: 101 | return np.concatenate([agent.state.p_vel] + [agent.goal_a.state.p_pos - agent.state.p_pos] + [agent.color] + entity_pos + entity_color + other_pos) 102 | else: 103 | #other_pos = list(reversed(other_pos)) if random.uniform(0,1) > 0.5 else other_pos # randomize position of other agents in adversary network 104 | return np.concatenate([agent.state.p_vel] + entity_pos + other_pos) 105 | -------------------------------------------------------------------------------- /multiagent-particle-envs/README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent Particle Environment 2 | 3 | A simple multi-agent particle world with a continuous observation and discrete action space, along with some basic simulated physics. 4 | Used in the paper [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf). 5 | 6 | ## Getting started: 7 | 8 | - To install, `cd` into the root directory and type `pip install -e .` 9 | 10 | - To interactively view moving to landmark scenario (see others in ./scenarios/): 11 | `bin/interactive.py --scenario simple.py` 12 | 13 | - Known dependencies: OpenAI gym, numpy 14 | 15 | - To use the environments, look at the code for importing them in `make_env.py`. 16 | 17 | ## Code structure 18 | 19 | - `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object. 20 | 21 | - `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.) 22 | 23 | - `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code. 24 | 25 | - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen. 26 | 27 | - `./multiagent/policy.py`: contains code for interactive policy based on keyboard input. 28 | 29 | - `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios. 30 | 31 | - `./multiagent/scenarios/`: folder where various scenarios/ environments are stored. 
scenario code consists of several functions: 32 | 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.), assigns their capabilities (whether they can communicate, or move, or both). 33 | called once at the beginning of each training session 34 | 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world 35 | called before every episode (including after make_world() before the first episode) 36 | 3) `reward()`: defines the reward function for a given agent 37 | 4) `observation()`: defines the observation space of a given agent 38 | 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics) 39 | 40 | ### Creating new environments 41 | 42 | You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`). 43 | 44 | ## List of environments 45 | 46 | 47 | | Env name in code (name in paper) | Communication? | Competitive? | Notes | 48 | | --- | --- | --- | --- | 49 | | `simple.py` | N | N | Single agent sees landmark position, rewarded based on how close it gets to landmark. Not a multiagent environment -- used for debugging policies. | 50 | | `simple_adversary.py` (Physical deception) | N | Y | 1 adversary (red), N good agents (green), N landmarks (usually N=2). All agents observe position of landmarks and other agents. One landmark is the ‘target landmark’ (colored green). Good agents rewarded based on how close one of them is to the target landmark, but negatively rewarded if the adversary is close to target landmark. Adversary is rewarded based on how close it is to the target, but it doesn’t know which landmark is the target landmark. So good agents have to learn to ‘split up’ and cover all landmarks to deceive the adversary. | 51 | | `simple_crypto.py` (Covert communication) | Y | Y | Two good agents (alice and bob), one adversary (eve). Alice must sent a private message to bob over a public channel. Alice and bob are rewarded based on how well bob reconstructs the message, but negatively rewarded if eve can reconstruct the message. Alice and bob have a private key (randomly generated at beginning of each episode), which they must learn to use to encrypt the message. | 52 | | `simple_push.py` (Keep-away) | N |Y | 1 agent, 1 adversary, 1 landmark. Agent is rewarded based on distance to landmark. Adversary is rewarded if it is close to the landmark, and if the agent is far from the landmark. So the adversary learns to push agent away from the landmark. | 53 | | `simple_reference.py` | Y | N | 2 agents, 3 landmarks of different colors. Each agent wants to get to their target landmark, which is known only by other agent. Reward is collective. So agents have to learn to communicate the goal of the other agent, and navigate to their landmark. This is the same as the simple_speaker_listener scenario where both agents are simultaneous speakers and listeners. | 54 | | `simple_speaker_listener.py` (Cooperative communication) | Y | N | Same as simple_reference, except one agent is the ‘speaker’ (gray) that does not move (observes goal of other agent), and other agent is the listener (cannot speak, but must navigate to correct landmark).| 55 | | `simple_spread.py` (Cooperative navigation) | N | N | N agents, N landmarks. Agents are rewarded based on how far any agent is from each landmark. Agents are penalized if they collide with other agents. 
So, agents have to learn to cover all the landmarks while avoiding collisions. | 56 | | `simple_tag.py` (Predator-prey) | N | Y | Predator-prey environment. Good agents (green) are faster and want to avoid being hit by adversaries (red). Adversaries are slower and want to hit good agents. Obstacles (large black circles) block the way. | 57 | | `simple_world_comm.py` | Y | Y | Environment seen in the video accompanying the paper. Same as simple_tag, except (1) there is food (small blue balls) that the good agents are rewarded for being near, (2) there are ‘forests’ that hide the agents inside them from outside view, and (3) there is a ‘leader adversary’ that can see the agents at all times and can communicate with the other adversaries to help coordinate the chase. | 58 | 59 | ## Paper citation 60 | 61 | If you used this environment for your experiments or found it helpful, consider citing the following papers: 62 | 63 | Environments in this repo: 64 |
65 | @article{lowe2017multi,
66 |   title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
67 |   author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
68 |   journal={Neural Information Processing Systems (NIPS)},
69 |   year={2017}
70 | }
71 | 
72 | 73 | Original particle world environment: 74 |
75 | @article{mordatch2017emergence,
76 |   title={Emergence of Grounded Compositional Language in Multi-Agent Populations},
77 |   author={Mordatch, Igor and Abbeel, Pieter},
78 |   journal={arXiv preprint arXiv:1703.04908},
79 |   year={2017}
80 | }
81 | 
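### Minimal scenario sketch

As a companion to the "Creating new environments" section above, the snippet below sketches the four required scenario functions for a single-agent, single-landmark world in the spirit of the shipped `simple.py`. The colors and the exact reward shaping are illustrative choices, not a copy of any scenario in this repo:

```python
import numpy as np
from multiagent.core import World, Agent, Landmark
from multiagent.scenario import BaseScenario


class Scenario(BaseScenario):
    def make_world(self):
        # one non-colliding, non-communicating agent and one fixed landmark
        world = World()
        world.dim_c = 2
        world.agents = [Agent()]
        world.agents[0].name = 'agent 0'
        world.agents[0].collide = False
        world.agents[0].silent = True
        world.landmarks = [Landmark()]
        world.landmarks[0].name = 'landmark 0'
        world.landmarks[0].collide = False
        world.landmarks[0].movable = False
        # make initial conditions
        self.reset_world(world)
        return world

    def reset_world(self, world):
        # recolor everything and scatter agent/landmark uniformly in the unit box
        for agent in world.agents:
            agent.color = np.array([0.35, 0.35, 0.85])
            agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)
        for landmark in world.landmarks:
            landmark.color = np.array([0.25, 0.25, 0.25])
            landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
            landmark.state.p_vel = np.zeros(world.dim_p)

    def reward(self, agent, world):
        # negative squared distance to the landmark
        return -np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos))

    def observation(self, agent, world):
        # own velocity plus the landmark position expressed in the agent's frame
        return np.concatenate([agent.state.p_vel] +
                              [world.landmarks[0].state.p_pos - agent.state.p_pos])
```

A file like this dropped into `./multiagent/scenarios/` can be loaded the same way as the existing scenarios (for example via `make_env.py` or `bin/interactive.py --scenario <filename>.py`).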
82 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_adversary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | import random 5 | 6 | 7 | class Scenario(BaseScenario): 8 | 9 | def make_world(self): 10 | world = World() 11 | # set any world properties first 12 | world.dim_c = 2 13 | num_agents = 3 14 | world.num_agents = num_agents 15 | num_adversaries = 1 16 | num_landmarks = num_agents - 1 17 | # add agents 18 | world.agents = [Agent() for i in range(num_agents)] 19 | for i, agent in enumerate(world.agents): 20 | agent.name = 'agent %d' % i 21 | agent.collide = False 22 | agent.silent = True 23 | agent.adversary = True if i < num_adversaries else False 24 | agent.size = 0.15 25 | # add landmarks 26 | world.landmarks = [Landmark() for i in range(num_landmarks)] 27 | for i, landmark in enumerate(world.landmarks): 28 | landmark.name = 'landmark %d' % i 29 | landmark.collide = False 30 | landmark.movable = False 31 | landmark.size = 0.08 32 | # make initial conditions 33 | self.reset_world(world) 34 | return world 35 | 36 | def reset_world(self, world): 37 | # random properties for agents 38 | world.agents[0].color = np.array([0.85, 0.35, 0.35]) 39 | for i in range(1, world.num_agents): 40 | world.agents[i].color = np.array([0.35, 0.35, 0.85]) 41 | # random properties for landmarks 42 | for i, landmark in enumerate(world.landmarks): 43 | landmark.color = np.array([0.15, 0.15, 0.15]) 44 | # set goal landmark 45 | goal = np.random.choice(world.landmarks) 46 | goal.color = np.array([0.15, 0.65, 0.15]) 47 | for agent in world.agents: 48 | agent.goal_a = goal 49 | # set random initial states 50 | for agent in world.agents: 51 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 52 | agent.state.p_vel = np.zeros(world.dim_p) 53 | agent.state.c = np.zeros(world.dim_c) 54 | for i, landmark in enumerate(world.landmarks): 55 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 56 | landmark.state.p_vel = np.zeros(world.dim_p) 57 | 58 | def benchmark_data(self, agent, world): 59 | # returns data for benchmarking purposes 60 | if agent.adversary: 61 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 62 | else: 63 | dists = [] 64 | for l in world.landmarks: 65 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos))) 66 | dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) 67 | return tuple(dists) 68 | 69 | # return all agents that are not adversaries 70 | def good_agents(self, world): 71 | return [agent for agent in world.agents if not agent.adversary] 72 | 73 | # return all adversarial agents 74 | def adversaries(self, world): 75 | return [agent for agent in world.agents if agent.adversary] 76 | 77 | def reward(self, agent, world): 78 | # Agents are rewarded based on minimum agent distance to each landmark 79 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 80 | 81 | def agent_reward(self, agent, world): 82 | # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it 83 | shaped_reward = True 84 | shaped_adv_reward = True 85 | 86 | # Calculate negative reward for adversary 87 | adversary_agents = self.adversaries(world) 88 | if shaped_adv_reward: # distance-based adversary reward 
89 | adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents]) 90 | else: # proximity-based adversary reward (binary) 91 | adv_rew = 0 92 | for a in adversary_agents: 93 | if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size: 94 | adv_rew -= 5 95 | 96 | # Calculate positive reward for agents 97 | good_agents = self.good_agents(world) 98 | if shaped_reward: # distance-based agent reward 99 | pos_rew = -min( 100 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 101 | else: # proximity-based agent reward (binary) 102 | pos_rew = 0 103 | if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \ 104 | < 2 * agent.goal_a.size: 105 | pos_rew += 5 106 | pos_rew -= min( 107 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) 108 | return pos_rew + adv_rew 109 | 110 | def adversary_reward(self, agent, world): 111 | # Rewarded based on proximity to the goal landmark 112 | shaped_reward = True 113 | if shaped_reward: # distance-based reward 114 | return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)) 115 | else: # proximity-based reward (binary) 116 | adv_rew = 0 117 | if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size: 118 | adv_rew += 5 119 | return adv_rew 120 | 121 | 122 | def observation(self, agent, world): 123 | # get positions of all entities in this agent's reference frame 124 | entity_pos = [] 125 | for entity in world.landmarks: 126 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 127 | # entity colors 128 | entity_color = [] 129 | for entity in world.landmarks: 130 | entity_color.append(entity.color) 131 | # communication of all other agents 132 | other_pos = [] 133 | for other in world.agents: 134 | if other is agent: continue 135 | other_pos.append(other.state.p_pos - agent.state.p_pos) 136 | 137 | if not agent.adversary: 138 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos) 139 | else: 140 | return np.concatenate(entity_pos + other_pos) 141 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_tag.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | world = World() 9 | # set any world properties first 10 | world.dim_c = 2 11 | num_good_agents = 2 12 | num_adversaries = 4 13 | num_agents = num_adversaries + num_good_agents 14 | num_landmarks = 2 15 | # add agents 16 | world.agents = [Agent() for i in range(num_agents)] 17 | for i, agent in enumerate(world.agents): 18 | agent.name = 'agent %d' % i 19 | agent.collide = True 20 | agent.silent = True 21 | agent.adversary = True if i < num_adversaries else False 22 | agent.size = 0.075 if agent.adversary else 0.05 23 | agent.accel = 3.0 if agent.adversary else 4.0 24 | #agent.accel = 20.0 if agent.adversary else 25.0 25 | agent.max_speed = 1.0 if agent.adversary else 1.3 26 | # add landmarks 27 | world.landmarks = [Landmark() for i in range(num_landmarks)] 28 | for i, landmark in enumerate(world.landmarks): 29 | landmark.name = 'landmark %d' % i 30 | landmark.collide = True 31 | landmark.movable = False 32 | 
landmark.size = 0.2 33 | landmark.boundary = False 34 | # make initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | 39 | def reset_world(self, world): 40 | # random properties for agents 41 | for i, agent in enumerate(world.agents): 42 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35]) 43 | # random properties for landmarks 44 | for i, landmark in enumerate(world.landmarks): 45 | landmark.color = np.array([0.25, 0.25, 0.25]) 46 | # set random initial states 47 | for agent in world.agents: 48 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 49 | agent.state.p_vel = np.zeros(world.dim_p) 50 | agent.state.c = np.zeros(world.dim_c) 51 | for i, landmark in enumerate(world.landmarks): 52 | if not landmark.boundary: 53 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 54 | landmark.state.p_vel = np.zeros(world.dim_p) 55 | 56 | 57 | def benchmark_data(self, agent, world): 58 | # # returns data for benchmarking purposes 59 | # if agent.adversary: 60 | # collisions = 0 61 | # for a in self.good_agents(world): 62 | # if self.is_collision(a, agent): 63 | # collisions += 1 64 | # return collisions 65 | # else: 66 | # return 0 67 | collisions = 0 68 | rew = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 69 | if agent.adversary: 70 | for a in self.good_agents(world): 71 | if self.is_collision(a, agent): 72 | collisions += 1 73 | return (rew, collisions) 74 | 75 | def is_collision(self, agent1, agent2): 76 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 77 | dist = np.sqrt(np.sum(np.square(delta_pos))) 78 | dist_min = agent1.size + agent2.size 79 | return True if dist < dist_min else False 80 | 81 | # return all agents that are not adversaries 82 | def good_agents(self, world): 83 | return [agent for agent in world.agents if not agent.adversary] 84 | 85 | # return all adversarial agents 86 | def adversaries(self, world): 87 | return [agent for agent in world.agents if agent.adversary] 88 | 89 | 90 | def reward(self, agent, world): 91 | # Agents are rewarded based on minimum agent distance to each landmark 92 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 93 | return main_reward 94 | 95 | def agent_reward(self, agent, world): 96 | # Agents are negatively rewarded if caught by adversaries 97 | rew = 0 98 | shape = True 99 | adversaries = self.adversaries(world) 100 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary) 101 | for adv in adversaries: 102 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 103 | if agent.collide: 104 | for a in adversaries: 105 | if self.is_collision(a, agent): 106 | rew -= 10 107 | 108 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries 109 | def bound(x): 110 | if x < 0.9: 111 | return 0 112 | if x < 1.0: 113 | return (x - 0.9) * 10 114 | return min(np.exp(2 * x - 2), 10) 115 | for p in range(world.dim_p): 116 | x = abs(agent.state.p_pos[p]) 117 | rew -= bound(x) 118 | 119 | return rew 120 | 121 | def adversary_reward(self, agent, world): 122 | # Adversaries are rewarded for collisions with agents 123 | rew = 0 124 | shape = True 125 | agents = self.good_agents(world) 126 | adversaries = self.adversaries(world) 127 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents) 128 | for adv in adversaries: 129 
| rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 130 | if agent.collide: 131 | for ag in agents: 132 | for adv in adversaries: 133 | if self.is_collision(ag, adv): 134 | rew += 10 135 | return rew 136 | 137 | def observation(self, agent, world): 138 | # get positions of all entities in this agent's reference frame 139 | entity_pos = [] 140 | for entity in world.landmarks: 141 | if not entity.boundary: 142 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 143 | # communication of all other agents 144 | comm = [] 145 | other_pos = [] 146 | other_vel = [] 147 | for other in world.agents: 148 | if other is agent: continue 149 | comm.append(other.state.c) 150 | other_pos.append(other.state.p_pos - agent.state.p_pos) 151 | if not other.adversary: 152 | other_vel.append(other.state.p_vel) 153 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 154 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_crypto.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scenario: 3 | 1 speaker, 2 listeners (one of which is an adversary). Good agents rewarded for proximity to goal, and distance from 4 | adversary to goal. Adversary is rewarded for its distance to the goal. 5 | """ 6 | 7 | 8 | import numpy as np 9 | from multiagent.core import World, Agent, Landmark 10 | from multiagent.scenario import BaseScenario 11 | import random 12 | 13 | 14 | class CryptoAgent(Agent): 15 | def __init__(self): 16 | super(CryptoAgent, self).__init__() 17 | self.key = None 18 | 19 | class Scenario(BaseScenario): 20 | 21 | def make_world(self): 22 | world = World() 23 | # set any world properties first 24 | num_agents = 3 25 | num_adversaries = 1 26 | num_landmarks = 2 27 | world.dim_c = 4 28 | # add agents 29 | world.agents = [CryptoAgent() for i in range(num_agents)] 30 | for i, agent in enumerate(world.agents): 31 | agent.name = 'agent %d' % i 32 | agent.collide = False 33 | agent.adversary = True if i < num_adversaries else False 34 | agent.speaker = True if i == 2 else False 35 | agent.movable = False 36 | # add landmarks 37 | world.landmarks = [Landmark() for i in range(num_landmarks)] 38 | for i, landmark in enumerate(world.landmarks): 39 | landmark.name = 'landmark %d' % i 40 | landmark.collide = False 41 | landmark.movable = False 42 | # make initial conditions 43 | self.reset_world(world) 44 | return world 45 | 46 | 47 | def reset_world(self, world): 48 | # random properties for agents 49 | for i, agent in enumerate(world.agents): 50 | agent.color = np.array([0.25, 0.25, 0.25]) 51 | if agent.adversary: 52 | agent.color = np.array([0.75, 0.25, 0.25]) 53 | agent.key = None 54 | # random properties for landmarks 55 | color_list = [np.zeros(world.dim_c) for i in world.landmarks] 56 | for i, color in enumerate(color_list): 57 | color[i] += 1 58 | for color, landmark in zip(color_list, world.landmarks): 59 | landmark.color = color 60 | # set goal landmark 61 | goal = np.random.choice(world.landmarks) 62 | world.agents[1].color = goal.color 63 | world.agents[2].key = np.random.choice(world.landmarks).color 64 | 65 | for agent in world.agents: 66 | agent.goal_a = goal 67 | 68 | # set random initial states 69 | for agent in world.agents: 70 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 71 | agent.state.p_vel = np.zeros(world.dim_p) 72 | agent.state.c = np.zeros(world.dim_c) 
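        # At this point the episode's secret is fixed: `goal` is a randomly chosen landmark whose
        # one-hot colour is the message the speaker (agent 2, "alice") must convey, and
        # `world.agents[2].key` is a randomly drawn landmark colour seen by alice and the good
        # listener (agent 1, "bob") but not by the adversary (agent 0, "eve"), who only observes
        # alice's utterance. The rewards further down are therefore reconstruction-based rather
        # than distance-based.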
73 | for i, landmark in enumerate(world.landmarks): 74 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 75 | landmark.state.p_vel = np.zeros(world.dim_p) 76 | 77 | 78 | def benchmark_data(self, agent, world): 79 | # returns data for benchmarking purposes 80 | return (agent.state.c, agent.goal_a.color) 81 | 82 | # return all agents that are not adversaries 83 | def good_listeners(self, world): 84 | return [agent for agent in world.agents if not agent.adversary and not agent.speaker] 85 | 86 | # return all agents that are not adversaries 87 | def good_agents(self, world): 88 | return [agent for agent in world.agents if not agent.adversary] 89 | 90 | # return all adversarial agents 91 | def adversaries(self, world): 92 | return [agent for agent in world.agents if agent.adversary] 93 | 94 | def reward(self, agent, world): 95 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 96 | 97 | def agent_reward(self, agent, world): 98 | # Agents rewarded if Bob can reconstruct message, but adversary (Eve) cannot 99 | good_listeners = self.good_listeners(world) 100 | adversaries = self.adversaries(world) 101 | good_rew = 0 102 | adv_rew = 0 103 | for a in good_listeners: 104 | if (a.state.c == np.zeros(world.dim_c)).all(): 105 | continue 106 | else: 107 | good_rew -= np.sum(np.square(a.state.c - agent.goal_a.color)) 108 | for a in adversaries: 109 | if (a.state.c == np.zeros(world.dim_c)).all(): 110 | continue 111 | else: 112 | adv_l1 = np.sum(np.square(a.state.c - agent.goal_a.color)) 113 | adv_rew += adv_l1 114 | return adv_rew + good_rew 115 | 116 | def adversary_reward(self, agent, world): 117 | # Adversary (Eve) is rewarded if it can reconstruct original goal 118 | rew = 0 119 | if not (agent.state.c == np.zeros(world.dim_c)).all(): 120 | rew -= np.sum(np.square(agent.state.c - agent.goal_a.color)) 121 | return rew 122 | 123 | 124 | def observation(self, agent, world): 125 | # goal color 126 | goal_color = np.zeros(world.dim_color) 127 | if agent.goal_a is not None: 128 | goal_color = agent.goal_a.color 129 | 130 | #print('goal color in obs is {}'.format(goal_color)) 131 | 132 | # get positions of all entities in this agent's reference frame 133 | entity_pos = [] 134 | for entity in world.landmarks: 135 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 136 | # communication of all other agents 137 | comm = [] 138 | for other in world.agents: 139 | if other is agent or (other.state.c is None) or not other.speaker: continue 140 | comm.append(other.state.c) 141 | 142 | confer = np.array([0]) 143 | 144 | if world.agents[2].key is None: 145 | confer = np.array([1]) 146 | key = np.zeros(world.dim_c) 147 | goal_color = np.zeros(world.dim_c) 148 | else: 149 | key = world.agents[2].key 150 | 151 | prnt = False 152 | # speaker 153 | if agent.speaker: 154 | if prnt: 155 | print('speaker') 156 | print(agent.state.c) 157 | print(np.concatenate([goal_color] + [key] + [confer] + [np.random.randn(1)])) 158 | return np.concatenate([goal_color] + [key]) 159 | # listener 160 | if not agent.speaker and not agent.adversary: 161 | if prnt: 162 | print('listener') 163 | print(agent.state.c) 164 | print(np.concatenate([key] + comm + [confer])) 165 | return np.concatenate([key] + comm) 166 | if not agent.speaker and agent.adversary: 167 | if prnt: 168 | print('adversary') 169 | print(agent.state.c) 170 | print(np.concatenate(comm + [confer])) 171 | return np.concatenate(comm) 172 | 
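# Note on the trainer that follows (maddpg/trainer/maddpg.py): each agent's critic is regressed
# toward a one-step TD target built from all agents' *target* networks,
#     y = r + gamma * (1 - done) * Q_target(o', a'_1, ..., a'_N),
# which is the `target_q` computed inside MADDPGAgentTrainer.update(). A minimal NumPy sketch of
# that computation; the array names mirror update(), but the numbers here are made up:
import numpy as np

rew = np.array([1.0, 0.0, -1.0])             # batch of rewards r_t
done = np.array([0.0, 0.0, 1.0])             # termination flags for s_{t+1}
target_q_next = np.array([2.0, 1.5, 3.0])    # Q'(o_{t+1}, a'_1, ..., a'_N) from the target critic
gamma = 0.95

target_q = rew + gamma * (1.0 - done) * target_q_next
# -> array([ 2.9  ,  1.425, -1.   ])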
-------------------------------------------------------------------------------- /maddpg/trainer/maddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.replay_buffer import ReplayBuffer 9 | 10 | 11 | def discount_with_dones(rewards, dones, gamma): 12 | discounted = [] 13 | r = 0 14 | for reward, done in zip(rewards[::-1], dones[::-1]): 15 | r = reward + gamma*r 16 | r = r*(1.-done) 17 | discounted.append(r) 18 | return discounted[::-1] 19 | 20 | def make_update_exp(vals, target_vals): 21 | polyak = 1.0 - 1e-2 22 | expression = [] 23 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 24 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var)) 25 | expression = tf.group(*expression) 26 | return U.function([], [], updates=[expression]) 27 | 28 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None): 29 | with tf.variable_scope(scope, reuse=reuse): 30 | # create distribtuions 31 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 32 | 33 | # set up placeholders 34 | obs_ph_n = make_obs_ph_n 35 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 36 | 37 | p_input = obs_ph_n[p_index] 38 | 39 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units) 40 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func")) 41 | 42 | # wrap parameters in distribution 43 | act_pd = act_pdtype_n[p_index].pdfromflat(p) 44 | 45 | act_sample = act_pd.sample() 46 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam())) 47 | 48 | act_input_n = act_ph_n + [] 49 | act_input_n[p_index] = act_pd.sample() 50 | q_input = tf.concat(obs_ph_n + act_input_n, 1) 51 | if local_q_func: 52 | q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1) 53 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0] 54 | pg_loss = -tf.reduce_mean(q) 55 | 56 | loss = pg_loss + p_reg * 1e-3 57 | 58 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping) 59 | 60 | # Create callable functions 61 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 62 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) 63 | p_values = U.function([obs_ph_n[p_index]], p) 64 | 65 | # target network 66 | target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units) 67 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func")) 68 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars) 69 | 70 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample() 71 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) 72 | 73 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 74 | 75 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64): 76 | with tf.variable_scope(scope, reuse=reuse): 77 | # create distribtuions 78 | act_pdtype_n = 
[make_pdtype(act_space) for act_space in act_space_n] 79 | 80 | # set up placeholders 81 | obs_ph_n = make_obs_ph_n 82 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))] 83 | target_ph = tf.placeholder(tf.float32, [None], name="target") 84 | 85 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 86 | if local_q_func: 87 | q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1) 88 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0] 89 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) 90 | 91 | q_loss = tf.reduce_mean(tf.square(q - target_ph)) 92 | 93 | # viscosity solution to Bellman differential equation in place of an initial condition 94 | q_reg = tf.reduce_mean(tf.square(q)) 95 | loss = q_loss #+ 1e-3 * q_reg 96 | 97 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping) 98 | 99 | # Create callable functions 100 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) 101 | q_values = U.function(obs_ph_n + act_ph_n, q) 102 | 103 | # target network 104 | target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0] 105 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) 106 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars) 107 | 108 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q) 109 | 110 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 111 | 112 | class MADDPGAgentTrainer(AgentTrainer): 113 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): 114 | self.name = name 115 | self.n = len(obs_shape_n) 116 | self.agent_index = agent_index 117 | self.args = args 118 | obs_ph_n = [] 119 | for i in range(self.n): 120 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) 121 | 122 | # Create all the functions necessary to train the model 123 | self.q_train, self.q_update, self.q_debug = q_train( 124 | scope=self.name, 125 | make_obs_ph_n=obs_ph_n, 126 | act_space_n=act_space_n, 127 | q_index=agent_index, 128 | q_func=model, 129 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 130 | grad_norm_clipping=0.5, 131 | local_q_func=local_q_func, 132 | num_units=args.num_units 133 | ) 134 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 135 | scope=self.name, 136 | make_obs_ph_n=obs_ph_n, 137 | act_space_n=act_space_n, 138 | p_index=agent_index, 139 | p_func=model, 140 | q_func=model, 141 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 142 | grad_norm_clipping=0.5, 143 | local_q_func=local_q_func, 144 | num_units=args.num_units 145 | ) 146 | # Create experience buffer 147 | self.replay_buffer = ReplayBuffer(1e6) 148 | self.max_replay_buffer_len = 50 * args.max_episode_len 149 | # self.max_replay_buffer_len = args.batch_size * args.max_episode_len 150 | self.replay_sample_index = None 151 | 152 | def action(self, obs): 153 | return self.act(obs[None])[0] 154 | 155 | def experience(self, obs, act, rew, new_obs, done, terminal): 156 | # Store transition in the replay buffer. 
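        # `done` is stored as a float so that update() can form (1 - done) in the TD target;
        # the `terminal` flag (time-limit end of an episode) is not written to the buffer.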
157 | self.replay_buffer.add(obs, act, rew, new_obs, float(done)) 158 | 159 | def preupdate(self): 160 | self.replay_sample_index = None 161 | 162 | def update(self, agents, t): 163 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough 164 | return 165 | if not t % 100 == 0: # only update every 100 steps 166 | return 167 | 168 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) 169 | # collect replay sample from all agents 170 | obs_n = [] 171 | obs_next_n = [] 172 | act_n = [] 173 | index = self.replay_sample_index 174 | for i in range(self.n): 175 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 176 | obs_n.append(obs) 177 | obs_next_n.append(obs_next) 178 | act_n.append(act) 179 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 180 | 181 | # train q network 182 | num_sample = 1 183 | target_q = 0.0 184 | for i in range(num_sample): 185 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] 186 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) 187 | target_q += rew + self.args.gamma * (1.0 - done) * target_q_next 188 | target_q /= num_sample 189 | q_loss = self.q_train(*(obs_n + act_n + [target_q])) 190 | 191 | # train p network 192 | p_loss = self.p_train(*(obs_n + act_n)) 193 | 194 | self.p_update() 195 | self.q_update() 196 | 197 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)] 198 | -------------------------------------------------------------------------------- /experiments/ibmac_inter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.replay_buffer_with_messages import ReplayBuffer 9 | 10 | import itertools 11 | 12 | 13 | def discount_with_dones(rewards, dones, gamma): 14 | discounted = [] 15 | r = 0 16 | for reward, done in zip(rewards[::-1], dones[::-1]): 17 | r = reward + gamma * r 18 | r = r * (1. 
- done) 19 | discounted.append(r) 20 | return discounted[::-1] 21 | 22 | 23 | def make_update_exp(vals, target_vals): 24 | polyak = 1.0 - 1e-2 25 | expression = [] 26 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 27 | expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var)) 28 | expression = tf.group(*expression) 29 | return U.function([], [], updates=[expression]) 30 | 31 | 32 | def p_train(make_obs_ph_n, make_meesages_ph_n, act_space_n, p_func, q_func, optimizer, grad_norm_clipping=None, 33 | local_q_func=False, num_units=64, scope="trainer", reuse=None, beta=0.01): 34 | with tf.variable_scope(scope, reuse=reuse): 35 | num_agents = len(make_obs_ph_n) 36 | 37 | # create distribtuions 38 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 39 | 40 | # set up placeholders 41 | obs_ph_n = make_obs_ph_n 42 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(num_agents)] 43 | 44 | messages_ph_n = make_meesages_ph_n 45 | 46 | # multi_head = pre_message(messages_ph_n) 47 | 48 | items = [p_func([obs_ph_n[i], tf.concat(messages_ph_n, 1)], int(act_pdtype_n[i].param_shape()[0]), 49 | scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents)] 50 | p_n, message_n, mu_message_n, logvar_message_n = list(zip(*items)) 51 | 52 | logvar_message_n = [tf.clip_by_value(log, -10, 10) for log in 53 | logvar_message_n] # constrain kl_loss not to be too large 54 | 55 | p_func_vars = [U.scope_vars(U.absolute_scope_name("p_func_{}".format(i))) for i in range(num_agents)] 56 | 57 | # wrap parameters in distribution 58 | act_pd_n = [act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)] 59 | 60 | act_sample_n = [act_pd.sample() for act_pd in act_pd_n] 61 | p_reg_n = [tf.reduce_mean(tf.square(act_pd.flatparam())) for act_pd in act_pd_n] 62 | 63 | act_input_n_n = [act_ph_n + [] for _ in range(num_agents)] 64 | for i in range(num_agents): 65 | act_input_n_n[i][i] = act_pd_n[i].sample() 66 | q_input_n = [tf.concat(obs_ph_n + messages_ph_n + act_input_n, 1) for act_input_n in act_input_n_n] 67 | 68 | q_n = [q_func(q_input_n[i], 1, scope="q_func_{}".format(i), reuse=True, num_units=num_units)[:, 0] for i in 69 | range(num_agents)] 70 | pg_loss_n = [-tf.reduce_mean(q) for q in q_n] 71 | 72 | kl_loss_message_n = [0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5 for mu, log in 73 | zip(mu_message_n, logvar_message_n)] 74 | kl_loss_message = tf.reduce_mean(kl_loss_message_n) 75 | 76 | pg_loss = tf.reduce_sum(pg_loss_n) 77 | p_reg = tf.reduce_sum(p_reg_n) 78 | loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message 79 | 80 | var_list = [] 81 | var_list.extend(p_func_vars) 82 | var_list = list(itertools.chain(*var_list)) 83 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) 84 | 85 | # Create callable functions 86 | train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) 87 | act = U.function(inputs=obs_ph_n + messages_ph_n, outputs=[act_sample_n, message_n]) 88 | p_values = U.function(inputs=obs_ph_n + messages_ph_n, outputs=p_n) 89 | 90 | # target network 91 | target_items = [p_func([obs_ph_n[i], tf.concat(messages_ph_n, 1)], int(act_pdtype_n[i].param_shape()[0]), 92 | scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents)] 93 | 94 | target_p_n, target_message_n, target_mu_message_n, target_logvar_message_n = list(zip(*target_items)) 95 | 
target_logvar_message_n = [tf.clip_by_value(log, -10, 10) for log in 96 | target_logvar_message_n] # constrain kl_loss not to be too large 97 | 98 | target_p_func_vars = [U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i))) for i in 99 | range(num_agents)] 100 | 101 | target_var_list = [] 102 | target_var_list.extend(target_p_func_vars) 103 | target_var_list = list(itertools.chain(*target_var_list)) 104 | update_target_p = make_update_exp(var_list, target_var_list) 105 | 106 | target_act_sample_n = [act_pdtype_n[i].pdfromflat(target_p_n[i]).sample() for i in range(num_agents)] 107 | target_act = U.function(inputs=obs_ph_n + messages_ph_n, outputs=[target_act_sample_n, target_message_n]) 108 | 109 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act} 110 | 111 | 112 | def q_train(make_obs_ph_n, make_meesages_ph_n, act_space_n, q_func, optimizer, grad_norm_clipping=None, 113 | local_q_func=False, scope="trainer", reuse=None, num_units=64): 114 | with tf.variable_scope(scope, reuse=reuse): 115 | num_agents = len(make_obs_ph_n) 116 | 117 | # create distribtuions 118 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 119 | 120 | # set up placeholders 121 | obs_ph_n = make_obs_ph_n 122 | messages_ph_n = make_meesages_ph_n 123 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action_{}".format(i)) for i in 124 | range(len(act_space_n))] 125 | target_ph_n = [tf.placeholder(tf.float32, [None], name="target_{}".format(i)) for i in range(num_agents)] 126 | 127 | q_input = tf.concat(obs_ph_n + messages_ph_n + act_ph_n, 1) 128 | q_n = [q_func(q_input, 1, scope="q_func_{}".format(i), num_units=num_units)[:, 0] for i in range(num_agents)] 129 | q_func_vars = [U.scope_vars(U.absolute_scope_name("q_func_{}".format(i))) for i in range(num_agents)] 130 | 131 | q_loss_n = [tf.reduce_mean(tf.square(q - target_ph)) for q, target_ph in zip(q_n, target_ph_n)] 132 | 133 | # viscosity solution to Bellman differential equation in place of an initial condition 134 | # q_reg = tf.reduce_mean(tf.square(q)) 135 | q_loss = tf.reduce_sum(q_loss_n) 136 | loss = q_loss # + 1e-3 * q_reg 137 | 138 | var_list = list(itertools.chain(*q_func_vars)) 139 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) 140 | 141 | # Create callable functions 142 | train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n + target_ph_n, outputs=loss, 143 | updates=[optimize_expr]) 144 | q_values = U.function(obs_ph_n + messages_ph_n + act_ph_n, q_n) 145 | 146 | # target network 147 | target_q_n = [q_func(q_input, 1, scope="target_q_func_{}".format(i), num_units=num_units)[:, 0] for i in 148 | range(num_agents)] 149 | target_q_func_vars = [U.scope_vars(U.absolute_scope_name("target_q_func_{}".format(i))) for i in 150 | range(num_agents)] 151 | 152 | traget_var_list = list(itertools.chain(*target_q_func_vars)) 153 | update_target_q = make_update_exp(var_list, traget_var_list) 154 | 155 | target_q_values = U.function(obs_ph_n + messages_ph_n + act_ph_n, target_q_n) 156 | 157 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 158 | 159 | 160 | class IBMACInterAgentTrainer(AgentTrainer): 161 | def __init__(self, name, actor_model, critic_mlp_model, obs_shape_n, act_space_n, args, local_q_func=False): 162 | self.name = name 163 | self.n = len(obs_shape_n) 164 | self.args = args 165 | obs_ph_n = [] 166 | messages_ph_n = [] 167 | for i in range(self.n): 168 | 
obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get()) 169 | messages_ph_n.append(U.BatchInput((args.dim_message,), name="message_" + str(i)).get()) 170 | 171 | # Create all the functions necessary to train the model 172 | self.q_train, self.q_update, self.q_debug = q_train( 173 | scope=self.name, 174 | make_obs_ph_n=obs_ph_n, 175 | make_meesages_ph_n=messages_ph_n, 176 | act_space_n=act_space_n, 177 | q_func=critic_mlp_model, 178 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 179 | grad_norm_clipping=0.5, 180 | local_q_func=local_q_func, 181 | num_units=args.num_units, 182 | ) 183 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 184 | scope=self.name, 185 | make_obs_ph_n=obs_ph_n, 186 | make_meesages_ph_n=messages_ph_n, 187 | act_space_n=act_space_n, 188 | p_func=actor_model, 189 | q_func=critic_mlp_model, 190 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 191 | grad_norm_clipping=0.5, 192 | local_q_func=local_q_func, 193 | num_units=args.num_units, 194 | beta=args.beta, 195 | ) 196 | # Create experience buffer 197 | self.replay_buffer = ReplayBuffer(1e6) 198 | # self.max_replay_buffer_len = 50 * args.max_episode_len 199 | self.max_replay_buffer_len = args.batch_size * args.max_episode_len 200 | self.replay_sample_index = None 201 | 202 | def action(self, obs_n, message_n): 203 | obs = [obs[None] for obs in obs_n] 204 | message = [message[None] for message in message_n] 205 | return self.act(*(list(obs) + list(message))) 206 | 207 | def experience(self, obs, message, act, rew, new_obs, done, terminal): 208 | # Store transition in the replay buffer. 209 | self.replay_buffer.add(obs, message, act, rew, new_obs, [float(d) for d in done]) 210 | 211 | def preupdate(self): 212 | self.replay_sample_index = None 213 | 214 | def update(self, agents, t): 215 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough 216 | return 217 | if not t % 100 == 0: # only update every 100 steps 218 | return 219 | 220 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) 221 | # collect replay sample from all agents 222 | obs_n = [] 223 | obs_next_n = [] 224 | act_n = [] 225 | index = self.replay_sample_index 226 | samples = self.replay_buffer.sample_index(index) 227 | obs_n, message_n, act_n, rew_n, obs_next_n, done_n = [np.swapaxes(item, 0, 1) for item in samples] 228 | # for i in range(self.n): 229 | # obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 230 | # obs_n.append(obs) 231 | # obs_next_n.append(obs_next) 232 | # act_n.append(act) 233 | # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 234 | 235 | # train q network 236 | num_sample = 1 237 | target_q = 0.0 238 | for i in range(num_sample): 239 | target_act_next_n, target_next_message_n = self.p_debug['target_act'](*(list(obs_next_n) + list(message_n))) 240 | target_q_next_n = self.q_debug['target_q_values']( 241 | *(list(obs_next_n) + list(target_next_message_n) + list(target_act_next_n))) 242 | target_q_n = [rew + self.args.gamma * (1.0 - done) * target_q_next for rew, done, target_q_next in 243 | zip(rew_n, done_n, target_q_next_n)] 244 | target_q_n = [target_q / num_sample for target_q in target_q_n] 245 | q_loss = self.q_train(*(list(obs_n) + list(message_n) + list(act_n) + target_q_n)) 246 | 247 | # train p network 248 | p_loss = self.p_train(*(list(obs_n) + list(message_n) + list(act_n))) 249 | 250 | self.p_update() 251 | self.q_update() 252 | 253 | return [q_loss, 
p_loss, np.mean(target_q), np.mean(rew_n), np.mean(target_q_next_n), np.std(target_q)] 254 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym.utils import reraise 15 | from gym import error 16 | 17 | try: 18 | import pyglet 19 | except ImportError as e: 20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 21 | 22 | try: 23 | from pyglet.gl import * 24 | except ImportError as e: 25 | reraise(prefix="Error occured while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 26 | 27 | import math 28 | import numpy as np 29 | 30 | RAD2DEG = 57.29577951308232 31 | 32 | def get_display(spec): 33 | """Convert a display specification (such as :0) into an actual Display 34 | object. 35 | 36 | Pyglet only supports multiple Displays on Linux. 37 | """ 38 | if spec is None: 39 | return None 40 | elif isinstance(spec, six.string_types): 41 | return pyglet.canvas.Display(spec) 42 | else: 43 | raise error.Error('Invalid display specification: {}. 
(Must be a string like :0 or None.)'.format(spec)) 44 | 45 | class Viewer(object): 46 | def __init__(self, width, height, display=None): 47 | display = get_display(display) 48 | 49 | self.width = width 50 | self.height = height 51 | 52 | self.window = pyglet.window.Window(width=width, height=height, display=display) 53 | self.window.on_close = self.window_closed_by_user 54 | self.geoms = [] 55 | self.onetime_geoms = [] 56 | self.transform = Transform() 57 | 58 | glEnable(GL_BLEND) 59 | # glEnable(GL_MULTISAMPLE) 60 | glEnable(GL_LINE_SMOOTH) 61 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 62 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 63 | glLineWidth(2.0) 64 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 65 | 66 | def close(self): 67 | self.window.close() 68 | 69 | def window_closed_by_user(self): 70 | self.close() 71 | 72 | def set_bounds(self, left, right, bottom, top): 73 | assert right > left and top > bottom 74 | scalex = self.width/(right-left) 75 | scaley = self.height/(top-bottom) 76 | self.transform = Transform( 77 | translation=(-left*scalex, -bottom*scaley), 78 | scale=(scalex, scaley)) 79 | 80 | def add_geom(self, geom): 81 | self.geoms.append(geom) 82 | 83 | def add_onetime(self, geom): 84 | self.onetime_geoms.append(geom) 85 | 86 | def render(self, return_rgb_array=False): 87 | glClearColor(1,1,1,1) 88 | self.window.clear() 89 | self.window.switch_to() 90 | self.window.dispatch_events() 91 | self.transform.enable() 92 | for geom in self.geoms: 93 | geom.render() 94 | for geom in self.onetime_geoms: 95 | geom.render() 96 | self.transform.disable() 97 | arr = None 98 | if return_rgb_array: 99 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 100 | image_data = buffer.get_image_data() 101 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 102 | # In https://github.com/openai/gym-http-api/issues/2, we 103 | # discovered that someone using Xmonad on Arch was having 104 | # a window of size 598 x 398, though a 600 x 400 window 105 | # was requested. (Guess Xmonad was preserving a pixel for 106 | # the boundary.) So we use the buffer height/width rather 107 | # than the requested one. 
108 | arr = arr.reshape(buffer.height, buffer.width, 4) 109 | arr = arr[::-1,:,0:3] 110 | self.window.flip() 111 | self.onetime_geoms = [] 112 | return arr 113 | 114 | # Convenience 115 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 116 | geom = make_circle(radius=radius, res=res, filled=filled) 117 | _add_attrs(geom, attrs) 118 | self.add_onetime(geom) 119 | return geom 120 | 121 | def draw_polygon(self, v, filled=True, **attrs): 122 | geom = make_polygon(v=v, filled=filled) 123 | _add_attrs(geom, attrs) 124 | self.add_onetime(geom) 125 | return geom 126 | 127 | def draw_polyline(self, v, **attrs): 128 | geom = make_polyline(v=v) 129 | _add_attrs(geom, attrs) 130 | self.add_onetime(geom) 131 | return geom 132 | 133 | def draw_line(self, start, end, **attrs): 134 | geom = Line(start, end) 135 | _add_attrs(geom, attrs) 136 | self.add_onetime(geom) 137 | return geom 138 | 139 | def get_array(self): 140 | self.window.flip() 141 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 142 | self.window.flip() 143 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 144 | arr = arr.reshape(self.height, self.width, 4) 145 | return arr[::-1,:,0:3] 146 | 147 | def _add_attrs(geom, attrs): 148 | if "color" in attrs: 149 | geom.set_color(*attrs["color"]) 150 | if "linewidth" in attrs: 151 | geom.set_linewidth(attrs["linewidth"]) 152 | 153 | class Geom(object): 154 | def __init__(self): 155 | self._color=Color((0, 0, 0, 1.0)) 156 | self.attrs = [self._color] 157 | def render(self): 158 | for attr in reversed(self.attrs): 159 | attr.enable() 160 | self.render1() 161 | for attr in self.attrs: 162 | attr.disable() 163 | def render1(self): 164 | raise NotImplementedError 165 | def add_attr(self, attr): 166 | self.attrs.append(attr) 167 | def set_color(self, r, g, b, alpha=1): 168 | self._color.vec4 = (r, g, b, alpha) 169 | 170 | class Attr(object): 171 | def enable(self): 172 | raise NotImplementedError 173 | def disable(self): 174 | pass 175 | 176 | class Transform(Attr): 177 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 178 | self.set_translation(*translation) 179 | self.set_rotation(rotation) 180 | self.set_scale(*scale) 181 | def enable(self): 182 | glPushMatrix() 183 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 184 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 185 | glScalef(self.scale[0], self.scale[1], 1) 186 | def disable(self): 187 | glPopMatrix() 188 | def set_translation(self, newx, newy): 189 | self.translation = (float(newx), float(newy)) 190 | def set_rotation(self, new): 191 | self.rotation = float(new) 192 | def set_scale(self, newx, newy): 193 | self.scale = (float(newx), float(newy)) 194 | 195 | class Color(Attr): 196 | def __init__(self, vec4): 197 | self.vec4 = vec4 198 | def enable(self): 199 | glColor4f(*self.vec4) 200 | 201 | class LineStyle(Attr): 202 | def __init__(self, style): 203 | self.style = style 204 | def enable(self): 205 | glEnable(GL_LINE_STIPPLE) 206 | glLineStipple(1, self.style) 207 | def disable(self): 208 | glDisable(GL_LINE_STIPPLE) 209 | 210 | class LineWidth(Attr): 211 | def __init__(self, stroke): 212 | self.stroke = stroke 213 | def enable(self): 214 | glLineWidth(self.stroke) 215 | 216 | class Point(Geom): 217 | def __init__(self): 218 | Geom.__init__(self) 219 | def render1(self): 220 | glBegin(GL_POINTS) # draw point 221 | glVertex3f(0.0, 0.0, 0.0) 222 | glEnd() 223 | 224 | class FilledPolygon(Geom): 225 | def __init__(self, v): 
226 | Geom.__init__(self) 227 | self.v = v 228 | def render1(self): 229 | if len(self.v) == 4 : glBegin(GL_QUADS) 230 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 231 | else: glBegin(GL_TRIANGLES) 232 | for p in self.v: 233 | glVertex3f(p[0], p[1],0) # draw each vertex 234 | glEnd() 235 | 236 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 237 | glColor4f(*color) 238 | glBegin(GL_LINE_LOOP) 239 | for p in self.v: 240 | glVertex3f(p[0], p[1],0) # draw each vertex 241 | glEnd() 242 | 243 | def make_circle(radius=10, res=30, filled=True): 244 | points = [] 245 | for i in range(res): 246 | ang = 2*math.pi*i / res 247 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 248 | if filled: 249 | return FilledPolygon(points) 250 | else: 251 | return PolyLine(points, True) 252 | 253 | def make_polygon(v, filled=True): 254 | if filled: return FilledPolygon(v) 255 | else: return PolyLine(v, True) 256 | 257 | def make_polyline(v): 258 | return PolyLine(v, False) 259 | 260 | def make_capsule(length, width): 261 | l, r, t, b = 0, length, width/2, -width/2 262 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 263 | circ0 = make_circle(width/2) 264 | circ1 = make_circle(width/2) 265 | circ1.add_attr(Transform(translation=(length, 0))) 266 | geom = Compound([box, circ0, circ1]) 267 | return geom 268 | 269 | class Compound(Geom): 270 | def __init__(self, gs): 271 | Geom.__init__(self) 272 | self.gs = gs 273 | for g in self.gs: 274 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 275 | def render1(self): 276 | for g in self.gs: 277 | g.render() 278 | 279 | class PolyLine(Geom): 280 | def __init__(self, v, close): 281 | Geom.__init__(self) 282 | self.v = v 283 | self.close = close 284 | self.linewidth = LineWidth(1) 285 | self.add_attr(self.linewidth) 286 | def render1(self): 287 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 288 | for p in self.v: 289 | glVertex3f(p[0], p[1],0) # draw each vertex 290 | glEnd() 291 | def set_linewidth(self, x): 292 | self.linewidth.stroke = x 293 | 294 | class Line(Geom): 295 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 296 | Geom.__init__(self) 297 | self.start = start 298 | self.end = end 299 | self.linewidth = LineWidth(1) 300 | self.add_attr(self.linewidth) 301 | 302 | def render1(self): 303 | glBegin(GL_LINES) 304 | glVertex2f(*self.start) 305 | glVertex2f(*self.end) 306 | glEnd() 307 | 308 | class Image(Geom): 309 | def __init__(self, fname, width, height): 310 | Geom.__init__(self) 311 | self.width = width 312 | self.height = height 313 | img = pyglet.image.load(fname) 314 | self.img = img 315 | self.flip = False 316 | def render1(self): 317 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 318 | 319 | # ================================================================ 320 | 321 | class SimpleImageViewer(object): 322 | def __init__(self, display=None): 323 | self.window = None 324 | self.isopen = False 325 | self.display = display 326 | def imshow(self, arr): 327 | if self.window is None: 328 | height, width, channels = arr.shape 329 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 330 | self.width = width 331 | self.height = height 332 | self.isopen = True 333 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 334 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 335 | 
self.window.clear() 336 | self.window.switch_to() 337 | self.window.dispatch_events() 338 | image.blit(0,0) 339 | self.window.flip() 340 | def close(self): 341 | if self.isopen: 342 | self.window.close() 343 | self.isopen = False 344 | def __del__(self): 345 | self.close() -------------------------------------------------------------------------------- /maddpg/common/tf_util.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import os 4 | import tensorflow as tf 5 | 6 | def sum(x, axis=None, keepdims=False): 7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims) 8 | def mean(x, axis=None, keepdims=False): 9 | return tf.reduce_mean(x, axis=None if axis is None else [axis], keep_dims = keepdims) 10 | def var(x, axis=None, keepdims=False): 11 | meanx = mean(x, axis=axis, keepdims=keepdims) 12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 13 | def std(x, axis=None, keepdims=False): 14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 15 | def max(x, axis=None, keepdims=False): 16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims) 17 | def min(x, axis=None, keepdims=False): 18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims) 19 | def concatenate(arrs, axis=0): 20 | return tf.concat(axis=axis, values=arrs) 21 | def argmax(x, axis=None): 22 | return tf.argmax(x, axis=axis) 23 | def softmax(x, axis=None): 24 | return tf.nn.softmax(x, axis=axis) 25 | 26 | # ================================================================ 27 | # Misc 28 | # ================================================================ 29 | 30 | 31 | def is_placeholder(x): 32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0 33 | 34 | # ================================================================ 35 | # Inputs 36 | # ================================================================ 37 | 38 | 39 | class TfInput(object): 40 | def __init__(self, name="(unnamed)"): 41 | """Generalized Tensorflow placeholder. The main differences are: 42 | - possibly uses multiple placeholders internally and returns multiple values 43 | - can apply light postprocessing to the value feed to placeholder. 44 | """ 45 | self.name = name 46 | 47 | def get(self): 48 | """Return the tf variable(s) representing the possibly postprocessed value 49 | of placeholder(s). 
50 | """ 51 | raise NotImplemented() 52 | 53 | def make_feed_dict(data): 54 | """Given data input it to the placeholder(s).""" 55 | raise NotImplemented() 56 | 57 | 58 | class PlacholderTfInput(TfInput): 59 | def __init__(self, placeholder): 60 | """Wrapper for regular tensorflow placeholder.""" 61 | super().__init__(placeholder.name) 62 | self._placeholder = placeholder 63 | 64 | def get(self): 65 | return self._placeholder 66 | 67 | def make_feed_dict(self, data): 68 | return {self._placeholder: data} 69 | 70 | 71 | class BatchInput(PlacholderTfInput): 72 | def __init__(self, shape, dtype=tf.float32, name=None): 73 | """Creates a placeholder for a batch of tensors of a given shape and dtype 74 | 75 | Parameters 76 | ---------- 77 | shape: [int] 78 | shape of a single elemenet of the batch 79 | dtype: tf.dtype 80 | number representation used for tensor contents 81 | name: str 82 | name of the underlying placeholder 83 | """ 84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name)) 85 | 86 | 87 | class Uint8Input(PlacholderTfInput): 88 | def __init__(self, shape, name=None): 89 | """Takes input in uint8 format which is cast to float32 and divided by 255 90 | before passing it to the model. 91 | 92 | On GPU this ensures lower data transfer times. 93 | 94 | Parameters 95 | ---------- 96 | shape: [int] 97 | shape of the tensor. 98 | name: str 99 | name of the underlying placeholder 100 | """ 101 | 102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 103 | self._shape = shape 104 | self._output = tf.cast(super().get(), tf.float32) / 255.0 105 | 106 | def get(self): 107 | return self._output 108 | 109 | 110 | def ensure_tf_input(thing): 111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput""" 112 | if isinstance(thing, TfInput): 113 | return thing 114 | elif is_placeholder(thing): 115 | return PlacholderTfInput(thing) 116 | else: 117 | raise ValueError("Must be a placeholder or TfInput") 118 | 119 | # ================================================================ 120 | # Mathematical utils 121 | # ================================================================ 122 | 123 | 124 | def huber_loss(x, delta=1.0): 125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss""" 126 | return tf.where( 127 | tf.abs(x) < delta, 128 | tf.square(x) * 0.5, 129 | delta * (tf.abs(x) - 0.5 * delta) 130 | ) 131 | 132 | # ================================================================ 133 | # Optimizer utils 134 | # ================================================================ 135 | 136 | 137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 138 | """Minimized `objective` using `optimizer` w.r.t. 
variables in 139 | `var_list` while ensure the norm of the gradients for each 140 | variable is clipped to `clip_val` 141 | """ 142 | if clip_val is None: 143 | return optimizer.minimize(objective, var_list=var_list) 144 | else: 145 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 146 | for i, (grad, var) in enumerate(gradients): 147 | if grad is not None: 148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 149 | return optimizer.apply_gradients(gradients) 150 | 151 | 152 | # ================================================================ 153 | # Global session 154 | # ================================================================ 155 | 156 | def get_session(): 157 | """Returns recently made Tensorflow session""" 158 | return tf.get_default_session() 159 | 160 | 161 | def make_session(num_cpu): 162 | """Returns a session that will use CPU's only""" 163 | tf_config = tf.ConfigProto( 164 | inter_op_parallelism_threads=num_cpu, 165 | intra_op_parallelism_threads=num_cpu) 166 | return tf.Session(config=tf_config) 167 | 168 | 169 | def single_threaded_session(): 170 | """Returns a session which will only use a single CPU""" 171 | return make_session(1) 172 | 173 | 174 | ALREADY_INITIALIZED = set() 175 | 176 | 177 | def initialize(): 178 | """Initialize all the uninitialized variables in the global scope.""" 179 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED 180 | get_session().run(tf.variables_initializer(new_variables)) 181 | ALREADY_INITIALIZED.update(new_variables) 182 | 183 | 184 | # ================================================================ 185 | # Scopes 186 | # ================================================================ 187 | 188 | 189 | def scope_vars(scope, trainable_only=False): 190 | """ 191 | Get variables inside a scope 192 | The scope can be specified as a string 193 | 194 | Parameters 195 | ---------- 196 | scope: str or VariableScope 197 | scope in which the variables reside. 198 | trainable_only: bool 199 | whether or not to return only the variables that were marked as trainable. 200 | 201 | Returns 202 | ------- 203 | vars: [tf.Variable] 204 | list of variables in `scope`. 205 | """ 206 | return tf.get_collection( 207 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, 208 | scope=scope if isinstance(scope, str) else scope.name 209 | ) 210 | 211 | 212 | def scope_name(): 213 | """Returns the name of current scope as a string, e.g. 
deepq/q_func""" 214 | return tf.get_variable_scope().name 215 | 216 | 217 | def absolute_scope_name(relative_scope_name): 218 | """Appends parent scope name to `relative_scope_name`""" 219 | return scope_name() + "/" + relative_scope_name 220 | 221 | # ================================================================ 222 | # Saving variables 223 | # ================================================================ 224 | 225 | 226 | def load_state(fname, saver=None): 227 | """Load all the variables to the current session from the location """ 228 | if saver is None: 229 | saver = tf.train.Saver() 230 | saver.restore(get_session(), fname) 231 | return saver 232 | 233 | 234 | def save_state(fname, saver=None): 235 | """Save all the variables in the current session to the location """ 236 | os.makedirs(os.path.dirname(fname), exist_ok=True) 237 | if saver is None: 238 | saver = tf.train.Saver() 239 | saver.save(get_session(), fname) 240 | return saver 241 | 242 | # ================================================================ 243 | # Theano-like Function 244 | # ================================================================ 245 | 246 | 247 | def function(inputs, outputs, updates=None, givens=None): 248 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions 249 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes 250 | values to be feed to the inputs placeholders and produces the values of the experessions 251 | in outputs. 252 | 253 | Input values can be passed in the same order as inputs or can be provided as kwargs based 254 | on placeholder name (passed to constructor or accessible via placeholder.op.name). 255 | 256 | Example: 257 | x = tf.placeholder(tf.int32, (), name="x") 258 | y = tf.placeholder(tf.int32, (), name="y") 259 | z = 3 * x + 2 * y 260 | lin = function([x, y], z, givens={y: 0}) 261 | 262 | with single_threaded_session(): 263 | initialize() 264 | 265 | assert lin(2) == 6 266 | assert lin(x=3) == 9 267 | assert lin(2, 2) == 10 268 | assert lin(x=2, y=3) == 12 269 | 270 | Parameters 271 | ---------- 272 | inputs: [tf.placeholder or TfInput] 273 | list of input arguments 274 | outputs: [tf.Variable] or tf.Variable 275 | list of outputs or a single output to be returned from function. Returned 276 | value will also have the same shape. 
277 | """ 278 | if isinstance(outputs, list): 279 | return _Function(inputs, outputs, updates, givens=givens) 280 | elif isinstance(outputs, (dict, collections.OrderedDict)): 281 | f = _Function(inputs, outputs.values(), updates, givens=givens) 282 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) 283 | else: 284 | f = _Function(inputs, [outputs], updates, givens=givens) 285 | return lambda *args, **kwargs: f(*args, **kwargs)[0] 286 | 287 | 288 | class _Function(object): 289 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 290 | for inpt in inputs: 291 | if not issubclass(type(inpt), TfInput): 292 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders of rl_algs.common.TfInput" 293 | self.inputs = inputs 294 | updates = updates or [] 295 | self.update_group = tf.group(*updates) 296 | self.outputs_update = list(outputs) + [self.update_group] 297 | self.givens = {} if givens is None else givens 298 | self.check_nan = check_nan 299 | 300 | def _feed_input(self, feed_dict, inpt, value): 301 | if issubclass(type(inpt), TfInput): 302 | feed_dict.update(inpt.make_feed_dict(value)) 303 | elif is_placeholder(inpt): 304 | feed_dict[inpt] = value 305 | 306 | def __call__(self, *args, **kwargs): 307 | assert len(args) <= len(self.inputs), "Too many arguments provided" 308 | feed_dict = {} 309 | # Update the args 310 | for inpt, value in zip(self.inputs, args): 311 | self._feed_input(feed_dict, inpt, value) 312 | # Update the kwargs 313 | kwargs_passed_inpt_names = set() 314 | for inpt in self.inputs[len(args):]: 315 | inpt_name = inpt.name.split(':')[0] 316 | inpt_name = inpt_name.split('/')[-1] 317 | assert inpt_name not in kwargs_passed_inpt_names, \ 318 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name) 319 | if inpt_name in kwargs: 320 | kwargs_passed_inpt_names.add(inpt_name) 321 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name)) 322 | else: 323 | assert inpt in self.givens, "Missing argument " + inpt_name 324 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys())) 325 | # Update feed dict with givens. 
326 | for inpt in self.givens: 327 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) 328 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 329 | if self.check_nan: 330 | if any(np.isnan(r).any() for r in results): 331 | raise RuntimeError("Nan detected") 332 | return results 333 | -------------------------------------------------------------------------------- /maddpg/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import maddpg.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from multiagent.multi_discrete import MultiDiscrete 6 | from tensorflow.python.ops import nn 7 | 8 | class Pd(object): 9 | """ 10 | A particular probability distribution 11 | """ 12 | def flatparam(self): 13 | raise NotImplementedError 14 | def mode(self): 15 | raise NotImplementedError 16 | def logp(self, x): 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | 25 | class PdType(object): 26 | """ 27 | Parametrized family of probability distributions 28 | """ 29 | def pdclass(self): 30 | raise NotImplementedError 31 | def pdfromflat(self, flat): 32 | return self.pdclass()(flat) 33 | def param_shape(self): 34 | raise NotImplementedError 35 | def sample_shape(self): 36 | raise NotImplementedError 37 | def sample_dtype(self): 38 | raise NotImplementedError 39 | 40 | def param_placeholder(self, prepend_shape, name=None): 41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 42 | def sample_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 44 | 45 | class CategoricalPdType(PdType): 46 | def __init__(self, ncat): 47 | self.ncat = ncat 48 | def pdclass(self): 49 | return CategoricalPd 50 | def param_shape(self): 51 | return [self.ncat] 52 | def sample_shape(self): 53 | return [] 54 | def sample_dtype(self): 55 | return tf.int32 56 | 57 | class SoftCategoricalPdType(PdType): 58 | def __init__(self, ncat): 59 | self.ncat = ncat 60 | def pdclass(self): 61 | return SoftCategoricalPd 62 | def param_shape(self): 63 | return [self.ncat] 64 | def sample_shape(self): 65 | return [self.ncat] 66 | def sample_dtype(self): 67 | return tf.float32 68 | 69 | class MultiCategoricalPdType(PdType): 70 | def __init__(self, low, high): 71 | self.low = low 72 | self.high = high 73 | self.ncats = high - low + 1 74 | def pdclass(self): 75 | return MultiCategoricalPd 76 | def pdfromflat(self, flat): 77 | return MultiCategoricalPd(self.low, self.high, flat) 78 | def param_shape(self): 79 | return [sum(self.ncats)] 80 | def sample_shape(self): 81 | return [len(self.ncats)] 82 | def sample_dtype(self): 83 | return tf.int32 84 | 85 | class SoftMultiCategoricalPdType(PdType): 86 | def __init__(self, low, high): 87 | self.low = low 88 | self.high = high 89 | self.ncats = high - low + 1 90 | def pdclass(self): 91 | return SoftMultiCategoricalPd 92 | def pdfromflat(self, flat): 93 | return SoftMultiCategoricalPd(self.low, self.high, flat) 94 | def param_shape(self): 95 | return [sum(self.ncats)] 96 | def sample_shape(self): 97 | return [sum(self.ncats)] 98 | def sample_dtype(self): 99 | return tf.float32 100 | 101 | class DiagGaussianPdType(PdType): 102 | def __init__(self, size): 103 | self.size = 
size 104 | def pdclass(self): 105 | return DiagGaussianPd 106 | def param_shape(self): 107 | return [2*self.size] 108 | def sample_shape(self): 109 | return [self.size] 110 | def sample_dtype(self): 111 | return tf.float32 112 | 113 | class BernoulliPdType(PdType): 114 | def __init__(self, size): 115 | self.size = size 116 | def pdclass(self): 117 | return BernoulliPd 118 | def param_shape(self): 119 | return [self.size] 120 | def sample_shape(self): 121 | return [self.size] 122 | def sample_dtype(self): 123 | return tf.int32 124 | 125 | # WRONG SECOND DERIVATIVES 126 | # class CategoricalPd(Pd): 127 | # def __init__(self, logits): 128 | # self.logits = logits 129 | # self.ps = tf.nn.softmax(logits) 130 | # @classmethod 131 | # def fromflat(cls, flat): 132 | # return cls(flat) 133 | # def flatparam(self): 134 | # return self.logits 135 | # def mode(self): 136 | # return U.argmax(self.logits, axis=1) 137 | # def logp(self, x): 138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 139 | # def kl(self, other): 140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 141 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 142 | # def entropy(self): 143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 144 | # def sample(self): 145 | # u = tf.random_uniform(tf.shape(self.logits)) 146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 147 | 148 | class CategoricalPd(Pd): 149 | def __init__(self, logits): 150 | self.logits = logits 151 | def flatparam(self): 152 | return self.logits 153 | def mode(self): 154 | return U.argmax(self.logits, axis=1) 155 | def logp(self, x): 156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 157 | def kl(self, other): 158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 160 | ea0 = tf.exp(a0) 161 | ea1 = tf.exp(a1) 162 | z0 = U.sum(ea0, axis=1, keepdims=True) 163 | z1 = U.sum(ea1, axis=1, keepdims=True) 164 | p0 = ea0 / z0 165 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 166 | def entropy(self): 167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 168 | ea0 = tf.exp(a0) 169 | z0 = U.sum(ea0, axis=1, keepdims=True) 170 | p0 = ea0 / z0 171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1) 172 | def sample(self): 173 | u = tf.random_uniform(tf.shape(self.logits)) 174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1) 175 | @classmethod 176 | def fromflat(cls, flat): 177 | return cls(flat) 178 | 179 | class SoftCategoricalPd(Pd): 180 | def __init__(self, logits): 181 | self.logits = logits 182 | def flatparam(self): 183 | return self.logits 184 | def mode(self): 185 | return U.softmax(self.logits, axis=-1) 186 | def logp(self, x): 187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 188 | def kl(self, other): 189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True) 191 | ea0 = tf.exp(a0) 192 | ea1 = tf.exp(a1) 193 | z0 = U.sum(ea0, axis=1, keepdims=True) 194 | z1 = U.sum(ea1, axis=1, keepdims=True) 195 | p0 = ea0 / z0 196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1) 197 | def entropy(self): 198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True) 199 | ea0 = tf.exp(a0) 200 | z0 = U.sum(ea0, axis=1, keepdims=True) 201 | p0 = ea0 / z0 202 | return U.sum(p0 * (tf.log(z0) - a0), 
axis=1) 203 | def sample(self): 204 | u = tf.random_uniform(tf.shape(self.logits)) 205 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1) 206 | @classmethod 207 | def fromflat(cls, flat): 208 | return cls(flat) 209 | 210 | class MultiCategoricalPd(Pd): 211 | def __init__(self, low, high, flat): 212 | self.flat = flat 213 | self.low = tf.constant(low, dtype=tf.int32) 214 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 215 | def flatparam(self): 216 | return self.flat 217 | def mode(self): 218 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 219 | def logp(self, x): 220 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 221 | def kl(self, other): 222 | return tf.add_n([ 223 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 224 | ]) 225 | def entropy(self): 226 | return tf.add_n([p.entropy() for p in self.categoricals]) 227 | def sample(self): 228 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 229 | @classmethod 230 | def fromflat(cls, flat): 231 | return cls(flat) 232 | 233 | class SoftMultiCategoricalPd(Pd): # doesn't work yet 234 | def __init__(self, low, high, flat): 235 | self.flat = flat 236 | self.low = tf.constant(low, dtype=tf.float32) 237 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 238 | def flatparam(self): 239 | return self.flat 240 | def mode(self): 241 | x = [] 242 | for i in range(len(self.categoricals)): 243 | x.append(self.low[i] + self.categoricals[i].mode()) 244 | return tf.concat(x, axis=-1) 245 | def logp(self, x): 246 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 247 | def kl(self, other): 248 | return tf.add_n([ 249 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 250 | ]) 251 | def entropy(self): 252 | return tf.add_n([p.entropy() for p in self.categoricals]) 253 | def sample(self): 254 | x = [] 255 | for i in range(len(self.categoricals)): 256 | x.append(self.low[i] + self.categoricals[i].sample()) 257 | return tf.concat(x, axis=-1) 258 | @classmethod 259 | def fromflat(cls, flat): 260 | return cls(flat) 261 | 262 | class DiagGaussianPd(Pd): 263 | def __init__(self, flat): 264 | self.flat = flat 265 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat) 266 | self.mean = mean 267 | self.logstd = logstd 268 | self.std = tf.exp(logstd) 269 | def flatparam(self): 270 | return self.flat 271 | def mode(self): 272 | return self.mean 273 | def logp(self, x): 274 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \ 275 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \ 276 | - U.sum(self.logstd, axis=1) 277 | def kl(self, other): 278 | assert isinstance(other, DiagGaussianPd) 279 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1) 280 | def entropy(self): 281 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1) 282 | def sample(self): 283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 284 | @classmethod 285 | def fromflat(cls, flat): 286 | return cls(flat) 287 | 288 | class BernoulliPd(Pd): 289 | def __init__(self, logits): 290 | self.logits = logits 291 | self.ps = tf.sigmoid(logits) 292 | 
def flatparam(self): 293 | return self.logits 294 | def mode(self): 295 | return tf.round(self.ps) 296 | def logp(self, x): 297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1) 298 | def kl(self, other): 299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 300 | def entropy(self): 301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1) 302 | def sample(self): 303 | p = tf.sigmoid(self.logits) 304 | u = tf.random_uniform(tf.shape(p)) 305 | return tf.to_float(math_ops.less(u, p)) 306 | @classmethod 307 | def fromflat(cls, flat): 308 | return cls(flat) 309 | 310 | def make_pdtype(ac_space): 311 | from gym import spaces 312 | if isinstance(ac_space, spaces.Box): 313 | assert len(ac_space.shape) == 1 314 | return DiagGaussianPdType(ac_space.shape[0]) 315 | elif isinstance(ac_space, spaces.Discrete): 316 | # return CategoricalPdType(ac_space.n) 317 | return SoftCategoricalPdType(ac_space.n) 318 | elif isinstance(ac_space, MultiDiscrete): 319 | #return MultiCategoricalPdType(ac_space.low, ac_space.high) 320 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high) 321 | elif isinstance(ac_space, spaces.MultiBinary): 322 | return BernoulliPdType(ac_space.n) 323 | else: 324 | raise NotImplementedError 325 | 326 | def shape_el(v, i): 327 | maybe = v.get_shape()[i] 328 | if maybe is not None: 329 | return maybe 330 | else: 331 | return tf.shape(v)[i] 332 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import seaborn as sns 3 | 4 | # physical/external base state of all entites 5 | class EntityState(object): 6 | def __init__(self): 7 | # physical position 8 | self.p_pos = None 9 | # physical velocity 10 | self.p_vel = None 11 | 12 | # state of agents (including communication and internal/mental state) 13 | class AgentState(EntityState): 14 | def __init__(self): 15 | super(AgentState, self).__init__() 16 | # communication utterance 17 | self.c = None 18 | 19 | # action of the agent 20 | class Action(object): 21 | def __init__(self): 22 | # physical action 23 | self.u = None 24 | # communication action 25 | self.c = None 26 | 27 | class Wall(object): 28 | def __init__(self, orient='H', axis_pos=0.0, endpoints=(-1, 1), width=0.1, 29 | hard=True): 30 | # orientation: 'H'orizontal or 'V'ertical 31 | self.orient = orient 32 | # position along axis which wall lays on (y-axis for H, x-axis for V) 33 | self.axis_pos = axis_pos 34 | # endpoints of wall (x-coords for H, y-coords for V) 35 | self.endpoints = np.array(endpoints) 36 | # width of wall 37 | self.width = width 38 | # whether wall is impassable to all agents 39 | self.hard = hard 40 | # color of wall 41 | self.color = np.array([0.0, 0.0, 0.0]) 42 | 43 | 44 | # properties and state of physical world entity 45 | class Entity(object): 46 | def __init__(self): 47 | # index among all entities (important to set for distance caching) 48 | self.i = 0 49 | # name 50 | self.name = '' 51 | # properties: 52 | self.size = 0.050 53 | # entity can move / be pushed 54 | self.movable = False 55 | # entity collides with others 56 | self.collide = True 57 | # entity can pass through non-hard walls 58 | self.ghost = False 59 | # 
material density (affects mass) 60 | self.density = 25.0 61 | # color 62 | self.color = None 63 | # max speed and accel 64 | self.max_speed = None 65 | self.accel = None 66 | # state 67 | self.state = EntityState() 68 | # mass 69 | self.initial_mass = 1.0 70 | 71 | @property 72 | def mass(self): 73 | return self.initial_mass 74 | 75 | # properties of landmark entities 76 | class Landmark(Entity): 77 | def __init__(self): 78 | super(Landmark, self).__init__() 79 | 80 | # properties of agent entities 81 | class Agent(Entity): 82 | def __init__(self): 83 | super(Agent, self).__init__() 84 | # agents are movable by default 85 | self.movable = True 86 | # cannot send communication signals 87 | self.silent = False 88 | # cannot observe the world 89 | self.blind = False 90 | # physical motor noise amount 91 | self.u_noise = None 92 | # communication noise amount 93 | self.c_noise = None 94 | # control range 95 | self.u_range = 1.0 96 | # state 97 | self.state = AgentState() 98 | # action 99 | self.action = Action() 100 | # script behavior to execute 101 | self.action_callback = None 102 | 103 | # multi-agent world 104 | class World(object): 105 | def __init__(self): 106 | # list of agents and entities (can change at execution-time!) 107 | self.agents = [] 108 | self.landmarks = [] 109 | self.walls = [] 110 | # communication channel dimensionality 111 | self.dim_c = 0 112 | # position dimensionality 113 | self.dim_p = 2 114 | # color dimensionality 115 | self.dim_color = 3 116 | # simulation timestep 117 | self.dt = 0.1 118 | # physical damping 119 | self.damping = 0.25 120 | # contact response parameters 121 | self.contact_force = 1e+2 122 | self.contact_margin = 1e-3 123 | # cache distances between all agents (not calculated by default) 124 | self.cache_dists = False 125 | self.cached_dist_vect = None 126 | self.cached_dist_mag = None 127 | 128 | # return all entities in the world 129 | @property 130 | def entities(self): 131 | return self.agents + self.landmarks 132 | 133 | # return all agents controllable by external policies 134 | @property 135 | def policy_agents(self): 136 | return [agent for agent in self.agents if agent.action_callback is None] 137 | 138 | # return all agents controlled by world scripts 139 | @property 140 | def scripted_agents(self): 141 | return [agent for agent in self.agents if agent.action_callback is not None] 142 | 143 | def calculate_distances(self): 144 | if self.cached_dist_vect is None: 145 | # initialize distance data structure 146 | self.cached_dist_vect = np.zeros((len(self.entities), 147 | len(self.entities), 148 | self.dim_p)) 149 | # calculate minimum distance for a collision between all entities 150 | self.min_dists = np.zeros((len(self.entities), len(self.entities))) 151 | for ia, entity_a in enumerate(self.entities): 152 | for ib in range(ia + 1, len(self.entities)): 153 | entity_b = self.entities[ib] 154 | min_dist = entity_a.size + entity_b.size 155 | self.min_dists[ia, ib] = min_dist 156 | self.min_dists[ib, ia] = min_dist 157 | 158 | for ia, entity_a in enumerate(self.entities): 159 | for ib in range(ia + 1, len(self.entities)): 160 | entity_b = self.entities[ib] 161 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 162 | self.cached_dist_vect[ia, ib, :] = delta_pos 163 | self.cached_dist_vect[ib, ia, :] = -delta_pos 164 | 165 | self.cached_dist_mag = np.linalg.norm(self.cached_dist_vect, axis=2) 166 | self.cached_collisions = (self.cached_dist_mag <= self.min_dists) 167 | 168 | def assign_agent_colors(self): 169 | n_dummies = 0 170 | if 
hasattr(self.agents[0], 'dummy'): 171 | n_dummies = len([a for a in self.agents if a.dummy]) 172 | n_adversaries = 0 173 | if hasattr(self.agents[0], 'adversary'): 174 | n_adversaries = len([a for a in self.agents if a.adversary]) 175 | n_good_agents = len(self.agents) - n_adversaries - n_dummies 176 | dummy_colors = [(0, 0, 0)] * n_dummies 177 | adv_colors = sns.color_palette("OrRd_d", n_adversaries) 178 | good_colors = sns.color_palette("GnBu_d", n_good_agents) 179 | colors = dummy_colors + adv_colors + good_colors 180 | for color, agent in zip(colors, self.agents): 181 | agent.color = color 182 | 183 | # update state of the world 184 | def step(self): 185 | # set actions for scripted agents 186 | for agent in self.scripted_agents: 187 | agent.action = agent.action_callback(agent, self) 188 | # gather forces applied to entities 189 | p_force = [None] * len(self.entities) 190 | # apply agent physical controls 191 | p_force = self.apply_action_force(p_force) 192 | # apply environment forces 193 | p_force = self.apply_environment_force(p_force) 194 | # integrate physical state 195 | self.integrate_state(p_force) 196 | # update agent state 197 | for agent in self.agents: 198 | self.update_agent_state(agent) 199 | # calculate and store distances between all entities 200 | if self.cache_dists: 201 | self.calculate_distances() 202 | 203 | 204 | # gather agent action forces 205 | def apply_action_force(self, p_force): 206 | # set applied forces 207 | for i,agent in enumerate(self.agents): 208 | if agent.movable: 209 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0 210 | p_force[i] = (agent.mass * agent.accel if agent.accel is not None else agent.mass) * agent.action.u + noise 211 | return p_force 212 | 213 | # gather physical forces acting on entities 214 | def apply_environment_force(self, p_force): 215 | # simple (but inefficient) collision response 216 | for a,entity_a in enumerate(self.entities): 217 | for b,entity_b in enumerate(self.entities): 218 | if(b <= a): continue 219 | [f_a, f_b] = self.get_entity_collision_force(a, b) 220 | if(f_a is not None): 221 | if(p_force[a] is None): p_force[a] = 0.0 222 | p_force[a] = f_a + p_force[a] 223 | if(f_b is not None): 224 | if(p_force[b] is None): p_force[b] = 0.0 225 | p_force[b] = f_b + p_force[b] 226 | if entity_a.movable: 227 | for wall in self.walls: 228 | wf = self.get_wall_collision_force(entity_a, wall) 229 | if wf is not None: 230 | if p_force[a] is None: 231 | p_force[a] = 0.0 232 | p_force[a] = p_force[a] + wf 233 | return p_force 234 | 235 | # integrate physical state 236 | def integrate_state(self, p_force): 237 | for i,entity in enumerate(self.entities): 238 | if not entity.movable: continue 239 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping) 240 | if (p_force[i] is not None): 241 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt 242 | if entity.max_speed is not None: 243 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1])) 244 | if speed > entity.max_speed: 245 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 246 | np.square(entity.state.p_vel[1])) * entity.max_speed 247 | entity.state.p_pos += entity.state.p_vel * self.dt 248 | 249 | def update_agent_state(self, agent): 250 | # set communication state (directly for now) 251 | if agent.silent: 252 | agent.state.c = np.zeros(self.dim_c) 253 | else: 254 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 255 
| agent.state.c = agent.action.c + noise 256 | 257 | # get collision forces for any contact between two entities 258 | def get_entity_collision_force(self, ia, ib): 259 | entity_a = self.entities[ia] 260 | entity_b = self.entities[ib] 261 | if (not entity_a.collide) or (not entity_b.collide): 262 | return [None, None] # not a collider 263 | if (not entity_a.movable) and (not entity_b.movable): 264 | return [None, None] # neither entity moves 265 | if (entity_a is entity_b): 266 | return [None, None] # don't collide against itself 267 | if self.cache_dists: 268 | delta_pos = self.cached_dist_vect[ia, ib] 269 | dist = self.cached_dist_mag[ia, ib] 270 | dist_min = self.min_dists[ia, ib] 271 | else: 272 | # compute actual distance between entities 273 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 274 | dist = np.sqrt(np.sum(np.square(delta_pos))) 275 | # minimum allowable distance 276 | dist_min = entity_a.size + entity_b.size 277 | # softmax penetration 278 | k = self.contact_margin 279 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 280 | force = self.contact_force * delta_pos / dist * penetration 281 | if entity_a.movable and entity_b.movable: 282 | # consider mass in collisions 283 | force_ratio = entity_b.mass / entity_a.mass 284 | force_a = force_ratio * force 285 | force_b = -(1 / force_ratio) * force 286 | else: 287 | force_a = +force if entity_a.movable else None 288 | force_b = -force if entity_b.movable else None 289 | return [force_a, force_b] 290 | 291 | # get collision forces for contact between an entity and a wall 292 | def get_wall_collision_force(self, entity, wall): 293 | if entity.ghost and not wall.hard: 294 | return None # ghost passes through soft walls 295 | if wall.orient == 'H': 296 | prll_dim = 0 297 | perp_dim = 1 298 | else: 299 | prll_dim = 1 300 | perp_dim = 0 301 | ent_pos = entity.state.p_pos 302 | if (ent_pos[prll_dim] < wall.endpoints[0] - entity.size or 303 | ent_pos[prll_dim] > wall.endpoints[1] + entity.size): 304 | return None # entity is beyond endpoints of wall 305 | elif (ent_pos[prll_dim] < wall.endpoints[0] or 306 | ent_pos[prll_dim] > wall.endpoints[1]): 307 | # part of entity is beyond wall 308 | if ent_pos[prll_dim] < wall.endpoints[0]: 309 | dist_past_end = ent_pos[prll_dim] - wall.endpoints[0] 310 | else: 311 | dist_past_end = ent_pos[prll_dim] - wall.endpoints[1] 312 | theta = np.arcsin(dist_past_end / entity.size) 313 | dist_min = np.cos(theta) * entity.size + 0.5 * wall.width 314 | else: # entire entity lies within bounds of wall 315 | theta = 0 316 | dist_past_end = 0 317 | dist_min = entity.size + 0.5 * wall.width 318 | 319 | # only need to calculate distance in relevant dim 320 | delta_pos = ent_pos[perp_dim] - wall.axis_pos 321 | dist = np.abs(delta_pos) 322 | # softmax penetration 323 | k = self.contact_margin 324 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 325 | force_mag = self.contact_force * delta_pos / dist * penetration 326 | force = np.zeros(2) 327 | force[perp_dim] = np.cos(theta) * force_mag 328 | force[prll_dim] = np.sin(theta) * np.abs(force_mag) 329 | return force 330 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/scenarios/simple_world_comm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiagent.core import World, Agent, Landmark 3 | from multiagent.scenario import BaseScenario 4 | 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self): 8 | 
world = World() 9 | # set any world properties first 10 | world.dim_c = 4 11 | #world.damping = 1 12 | num_good_agents = 2 13 | num_adversaries = 4 14 | num_agents = num_adversaries + num_good_agents 15 | num_landmarks = 1 16 | num_food = 2 17 | num_forests = 2 18 | # add agents 19 | world.agents = [Agent() for i in range(num_agents)] 20 | for i, agent in enumerate(world.agents): 21 | agent.name = 'agent %d' % i 22 | agent.collide = True 23 | agent.leader = True if i == 0 else False 24 | agent.silent = True if i > 0 else False 25 | agent.adversary = True if i < num_adversaries else False 26 | agent.size = 0.075 if agent.adversary else 0.045 27 | agent.accel = 3.0 if agent.adversary else 4.0 28 | #agent.accel = 20.0 if agent.adversary else 25.0 29 | agent.max_speed = 1.0 if agent.adversary else 1.3 30 | # add landmarks 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmark %d' % i 34 | landmark.collide = True 35 | landmark.movable = False 36 | landmark.size = 0.2 37 | landmark.boundary = False 38 | world.food = [Landmark() for i in range(num_food)] 39 | for i, landmark in enumerate(world.food): 40 | landmark.name = 'food %d' % i 41 | landmark.collide = False 42 | landmark.movable = False 43 | landmark.size = 0.03 44 | landmark.boundary = False 45 | world.forests = [Landmark() for i in range(num_forests)] 46 | for i, landmark in enumerate(world.forests): 47 | landmark.name = 'forest %d' % i 48 | landmark.collide = False 49 | landmark.movable = False 50 | landmark.size = 0.3 51 | landmark.boundary = False 52 | world.landmarks += world.food 53 | world.landmarks += world.forests 54 | #world.landmarks += self.set_boundaries(world) # world boundaries now penalized with negative reward 55 | # make initial conditions 56 | self.reset_world(world) 57 | return world 58 | 59 | def set_boundaries(self, world): 60 | boundary_list = [] 61 | landmark_size = 1 62 | edge = 1 + landmark_size 63 | num_landmarks = int(edge * 2 / landmark_size) 64 | for x_pos in [-edge, edge]: 65 | for i in range(num_landmarks): 66 | l = Landmark() 67 | l.state.p_pos = np.array([x_pos, -1 + i * landmark_size]) 68 | boundary_list.append(l) 69 | 70 | for y_pos in [-edge, edge]: 71 | for i in range(num_landmarks): 72 | l = Landmark() 73 | l.state.p_pos = np.array([-1 + i * landmark_size, y_pos]) 74 | boundary_list.append(l) 75 | 76 | for i, l in enumerate(boundary_list): 77 | l.name = 'boundary %d' % i 78 | l.collide == True 79 | l.movable = False 80 | l.boundary = True 81 | l.color = np.array([0.75, 0.75, 0.75]) 82 | l.size = landmark_size 83 | l.state.p_vel = np.zeros(world.dim_p) 84 | 85 | return boundary_list 86 | 87 | 88 | def reset_world(self, world): 89 | # random properties for agents 90 | for i, agent in enumerate(world.agents): 91 | agent.color = np.array([0.45, 0.95, 0.45]) if not agent.adversary else np.array([0.95, 0.45, 0.45]) 92 | agent.color -= np.array([0.3, 0.3, 0.3]) if agent.leader else np.array([0, 0, 0]) 93 | # random properties for landmarks 94 | for i, landmark in enumerate(world.landmarks): 95 | landmark.color = np.array([0.25, 0.25, 0.25]) 96 | for i, landmark in enumerate(world.food): 97 | landmark.color = np.array([0.15, 0.15, 0.65]) 98 | for i, landmark in enumerate(world.forests): 99 | landmark.color = np.array([0.6, 0.9, 0.6]) 100 | # set random initial states 101 | for agent in world.agents: 102 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 103 | agent.state.p_vel = np.zeros(world.dim_p) 104 | 
agent.state.c = np.zeros(world.dim_c) 105 | for i, landmark in enumerate(world.landmarks): 106 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 107 | landmark.state.p_vel = np.zeros(world.dim_p) 108 | for i, landmark in enumerate(world.food): 109 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 110 | landmark.state.p_vel = np.zeros(world.dim_p) 111 | for i, landmark in enumerate(world.forests): 112 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p) 113 | landmark.state.p_vel = np.zeros(world.dim_p) 114 | 115 | def benchmark_data(self, agent, world): 116 | if agent.adversary: 117 | collisions = 0 118 | for a in self.good_agents(world): 119 | if self.is_collision(a, agent): 120 | collisions += 1 121 | return collisions 122 | else: 123 | return 0 124 | 125 | 126 | def is_collision(self, agent1, agent2): 127 | delta_pos = agent1.state.p_pos - agent2.state.p_pos 128 | dist = np.sqrt(np.sum(np.square(delta_pos))) 129 | dist_min = agent1.size + agent2.size 130 | return True if dist < dist_min else False 131 | 132 | 133 | # return all agents that are not adversaries 134 | def good_agents(self, world): 135 | return [agent for agent in world.agents if not agent.adversary] 136 | 137 | # return all adversarial agents 138 | def adversaries(self, world): 139 | return [agent for agent in world.agents if agent.adversary] 140 | 141 | 142 | def reward(self, agent, world): 143 | # Agents are rewarded based on minimum agent distance to each landmark 144 | #boundary_reward = -10 if self.outside_boundary(agent) else 0 145 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world) 146 | return main_reward 147 | 148 | def outside_boundary(self, agent): 149 | if agent.state.p_pos[0] > 1 or agent.state.p_pos[0] < -1 or agent.state.p_pos[1] > 1 or agent.state.p_pos[1] < -1: 150 | return True 151 | else: 152 | return False 153 | 154 | 155 | def agent_reward(self, agent, world): 156 | # Agents are rewarded based on minimum agent distance to each landmark 157 | rew = 0 158 | shape = False 159 | adversaries = self.adversaries(world) 160 | if shape: 161 | for adv in adversaries: 162 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos))) 163 | if agent.collide: 164 | for a in adversaries: 165 | if self.is_collision(a, agent): 166 | rew -= 5 167 | def bound(x): 168 | if x < 0.9: 169 | return 0 170 | if x < 1.0: 171 | return (x - 0.9) * 10 172 | return min(np.exp(2 * x - 2), 10) # 1 + (x - 1) * (x - 1) 173 | 174 | for p in range(world.dim_p): 175 | x = abs(agent.state.p_pos[p]) 176 | rew -= 2 * bound(x) 177 | 178 | for food in world.food: 179 | if self.is_collision(agent, food): 180 | rew += 2 181 | rew += 0.05 * min([np.sqrt(np.sum(np.square(food.state.p_pos - agent.state.p_pos))) for food in world.food]) 182 | 183 | return rew 184 | 185 | def adversary_reward(self, agent, world): 186 | # Agents are rewarded based on minimum agent distance to each landmark 187 | rew = 0 188 | shape = True 189 | agents = self.good_agents(world) 190 | adversaries = self.adversaries(world) 191 | if shape: 192 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents]) 193 | #for adv in adversaries: 194 | # rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents]) 195 | if agent.collide: 196 | for ag in agents: 197 | for adv in adversaries: 198 | if self.is_collision(ag, adv): 199 | rew += 5 200 | return rew 201 | 202 | 203 | def 
observation2(self, agent, world): 204 | # get positions of all entities in this agent's reference frame 205 | entity_pos = [] 206 | for entity in world.landmarks: # world.entities: 207 | if not entity.boundary: 208 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 209 | 210 | food_pos = [] 211 | for entity in world.food: # world.entities: 212 | if not entity.boundary: 213 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 214 | # communication of all other agents 215 | comm = [] 216 | other_pos = [] 217 | other_vel = [] 218 | for other in world.agents: 219 | if other is agent: continue 220 | comm.append(other.state.c) 221 | other_pos.append(other.state.p_pos - agent.state.p_pos) 222 | if not other.adversary: 223 | other_vel.append(other.state.p_vel) 224 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel) 225 | 226 | def observation(self, agent, world): 227 | # get positions of all entities in this agent's reference frame 228 | entity_pos = [] 229 | for entity in world.landmarks: # world.entities: 230 | if not entity.boundary: 231 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 232 | 233 | in_forest = [np.array([-1]), np.array([-1])] 234 | inf1 = False 235 | inf2 = False 236 | if self.is_collision(agent, world.forests[0]): 237 | in_forest[0] = np.array([1]) 238 | inf1= True 239 | if self.is_collision(agent, world.forests[1]): 240 | in_forest[1] = np.array([1]) 241 | inf2 = True 242 | 243 | food_pos = [] 244 | for entity in world.food: # world.entities: 245 | if not entity.boundary: 246 | food_pos.append(entity.state.p_pos - agent.state.p_pos) 247 | # communication of all other agents 248 | comm = [] 249 | other_pos = [] 250 | other_vel = [] 251 | for other in world.agents: 252 | if other is agent: continue 253 | comm.append(other.state.c) 254 | oth_f1 = self.is_collision(other, world.forests[0]) 255 | oth_f2 = self.is_collision(other, world.forests[1]) 256 | #if (inf1 and not oth_f2) or (inf2 and not oth_f1) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #with forest vis 257 | if (inf1 and oth_f1) or (inf2 and oth_f2) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #without forest vis 258 | #if (in_forest == np.array([-1]) and not self.is_collision(other, world.forests[0])) or (in_forest == np.array([1]) and not self.is_collision(other, world.forests[0])) or agent.leader: 259 | other_pos.append(other.state.p_pos - agent.state.p_pos) 260 | if not other.adversary: 261 | other_vel.append(other.state.p_vel) 262 | else: 263 | other_pos.append([0, 0]) 264 | if not other.adversary: 265 | other_vel.append([0, 0]) 266 | 267 | # to tell the pred when the prey are in the forest 268 | prey_forest = [] 269 | ga = self.good_agents(world) 270 | for a in ga: 271 | if any([self.is_collision(a, f) for f in world.forests]): 272 | prey_forest.append(np.array([1])) 273 | else: 274 | prey_forest.append(np.array([-1])) 275 | # to tell leader when pred are in forest 276 | prey_forest_lead = [] 277 | for f in world.forests: 278 | if any([self.is_collision(a, f) for a in ga]): 279 | prey_forest_lead.append(np.array([1])) 280 | else: 281 | prey_forest_lead.append(np.array([-1])) 282 | 283 | 284 | #print(agent.adversary) 285 | #print(agent.leader) 286 | #print(in_forest) 287 | #print(other_pos) 288 | comm = [world.agents[0].state.c] 289 | #comm = [np.array([0, 0, 0, 0])] 290 | """ 291 | # old setting 292 | if agent.adversary: 293 | #print(np.concatenate([agent.state.p_vel] + 
[agent.state.p_pos] + entity_pos + other_pos + other_vel + [in_forest] + comm).shape) 294 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 295 | else: 296 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + [in_forest] + other_vel).shape) 297 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel) 298 | 299 | # new setting 300 | """ 301 | if agent.adversary and not agent.leader: 302 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + [in_forest] + comm).shape) 303 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 304 | if agent.leader: 305 | return np.concatenate( 306 | [agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm) 307 | else: 308 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + [in_forest] + other_vel).shape) 309 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel) 310 | #""" 311 | 312 | -------------------------------------------------------------------------------- /multiagent-particle-envs/multiagent/environment.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | from gym.envs.registration import EnvSpec 4 | import numpy as np 5 | 6 | # environment for all agents in the multiagent world 7 | # currently code assumes that no agents will be created/destroyed at runtime! 8 | class MultiAgentEnv(gym.Env): 9 | metadata = { 10 | 'render.modes' : ['human', 'rgb_array'] 11 | } 12 | 13 | def __init__(self, world, reset_callback=None, reward_callback=None, 14 | observation_callback=None, info_callback=None, 15 | done_callback=None, post_step_callback=None, 16 | shared_viewer=True, discrete_action=False): 17 | 18 | self.world = world 19 | self.agents = self.world.policy_agents 20 | # set required vectorized gym env property 21 | self.n = len(world.policy_agents) 22 | # scenario callbacks 23 | self.reset_callback = reset_callback 24 | self.reward_callback = reward_callback 25 | self.observation_callback = observation_callback 26 | self.info_callback = info_callback 27 | self.done_callback = done_callback 28 | self.post_step_callback = post_step_callback 29 | # environment parameters 30 | self.discrete_action_space = discrete_action 31 | # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector 32 | self.discrete_action_input = False 33 | # if true, even the action is continuous, action will be performed discretely 34 | self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False 35 | # if true, every agent has the same reward 36 | self.shared_reward = False 37 | self.time = 0 38 | 39 | # configure spaces 40 | self.action_space = [] 41 | self.observation_space = [] 42 | for agent in self.agents: 43 | total_action_space = [] 44 | # physical action space 45 | if self.discrete_action_space: 46 | u_action_space = spaces.Discrete(world.dim_p * 2 + 1) 47 | else: 48 | u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,)) 49 | if agent.movable: 50 | total_action_space.append(u_action_space) 51 | # communication action space 52 | c_action_space = spaces.Discrete(world.dim_c) 
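# --- editorial note (worked example, not part of the original source) --------
# How the pieces below combine, assuming discrete_action=True, world.dim_p == 2
# and world.dim_c == 4:
#     physical action:      Discrete(world.dim_p * 2 + 1) = Discrete(5)
#                           (no-op, -x, +x, -y, +y)
#     communication action: Discrete(world.dim_c)          = Discrete(4)
# A movable, non-silent agent therefore gets the combined space
#     spaces.MultiDiscrete([[0, 4], [0, 3]])   # old-gym range-style constructor
# while a silent agent keeps just Discrete(5), and a continuous setup falls
# back to Box / Tuple spaces instead.
# ------------------------------------------------------------------------------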
53 | if not agent.silent: 54 | total_action_space.append(c_action_space) 55 | # total action space 56 | if len(total_action_space) > 1: 57 | # all action spaces are discrete, so simplify to MultiDiscrete action space 58 | if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]): 59 | act_space = spaces.MultiDiscrete([[0,act_space.n-1] for act_space in total_action_space]) 60 | else: 61 | act_space = spaces.Tuple(total_action_space) 62 | self.action_space.append(act_space) 63 | else: 64 | self.action_space.append(total_action_space[0]) 65 | # observation space 66 | obs_dim = len(observation_callback(agent, self.world)) 67 | self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,))) 68 | agent.action.c = np.zeros(self.world.dim_c) 69 | 70 | # rendering 71 | self.shared_viewer = shared_viewer 72 | if self.shared_viewer: 73 | self.viewers = [None] 74 | else: 75 | self.viewers = [None] * self.n 76 | self._reset_render() 77 | 78 | def _seed(self, seed=None): 79 | if seed is None: 80 | np.random.seed(1) 81 | else: 82 | np.random.seed(seed) 83 | 84 | def _step(self, action_n): 85 | obs_n = [] 86 | reward_n = [] 87 | done_n = [] 88 | info_n = {'n': []} 89 | self.agents = self.world.policy_agents 90 | # set action for each agent 91 | for i, agent in enumerate(self.agents): 92 | self._set_action(action_n[i], agent, self.action_space[i]) 93 | # advance world state 94 | self.world.step() 95 | # record observation for each agent 96 | for agent in self.agents: 97 | obs_n.append(self._get_obs(agent)) 98 | reward_n.append(self._get_reward(agent)) 99 | done_n.append(self._get_done(agent)) 100 | 101 | info_n['n'].append(self._get_info(agent)) 102 | 103 | # all agents get total reward in cooperative case 104 | reward = np.sum(reward_n) 105 | if self.shared_reward: 106 | reward_n = [reward] * self.n 107 | if self.post_step_callback is not None: 108 | self.post_step_callback(self.world) 109 | return obs_n, reward_n, done_n, info_n 110 | 111 | def _reset(self): 112 | # reset world 113 | self.reset_callback(self.world) 114 | # reset renderer 115 | self._reset_render() 116 | # record observations for each agent 117 | obs_n = [] 118 | self.agents = self.world.policy_agents 119 | for agent in self.agents: 120 | obs_n.append(self._get_obs(agent)) 121 | return obs_n 122 | 123 | # get info used for benchmarking 124 | def _get_info(self, agent): 125 | if self.info_callback is None: 126 | return {} 127 | return self.info_callback(agent, self.world) 128 | 129 | # get observation for a particular agent 130 | def _get_obs(self, agent): 131 | if self.observation_callback is None: 132 | return np.zeros(0) 133 | return self.observation_callback(agent, self.world) 134 | 135 | # get dones for a particular agent 136 | # unused right now -- agents are allowed to go beyond the viewing screen 137 | def _get_done(self, agent): 138 | if self.done_callback is None: 139 | return False 140 | return self.done_callback(agent, self.world) 141 | 142 | # get reward for a particular agent 143 | def _get_reward(self, agent): 144 | if self.reward_callback is None: 145 | return 0.0 146 | return self.reward_callback(agent, self.world) 147 | 148 | # set env action for a particular agent 149 | def _set_action(self, action, agent, action_space, time=None): 150 | agent.action.u = np.zeros(self.world.dim_p) 151 | agent.action.c = np.zeros(self.world.dim_c) 152 | # process action 153 | if isinstance(action_space, spaces.MultiDiscrete): 154 | act = [] 155 | size = action_space.high - 
action_space.low + 1 156 | index = 0 157 | for s in size: 158 | act.append(action[index:(index+s)]) 159 | index += s 160 | action = act 161 | else: 162 | action = [action] 163 | 164 | if agent.movable: 165 | # physical action 166 | if self.discrete_action_input: 167 | agent.action.u = np.zeros(self.world.dim_p) 168 | # process discrete action 169 | if action[0] == 1: agent.action.u[0] = -1.0 170 | if action[0] == 2: agent.action.u[0] = +1.0 171 | if action[0] == 3: agent.action.u[1] = -1.0 172 | if action[0] == 4: agent.action.u[1] = +1.0 173 | else: 174 | if self.force_discrete_action: 175 | d = np.argmax(action[0]) 176 | action[0][:] = 0.0 177 | action[0][d] = 1.0 178 | if self.discrete_action_space: 179 | agent.action.u[0] += action[0][1] - action[0][2] 180 | agent.action.u[1] += action[0][3] - action[0][4] 181 | else: 182 | agent.action.u = action[0] 183 | sensitivity = 5.0 184 | if agent.accel is not None: 185 | sensitivity = agent.accel 186 | agent.action.u *= sensitivity 187 | action = action[1:] 188 | if not agent.silent: 189 | # communication action 190 | if self.discrete_action_input: 191 | agent.action.c = np.zeros(self.world.dim_c) 192 | agent.action.c[action[0]] = 1.0 193 | else: 194 | agent.action.c = action[0] 195 | action = action[1:] 196 | # make sure we used all elements of action 197 | assert len(action) == 0 198 | 199 | # reset rendering assets 200 | def _reset_render(self): 201 | self.render_geoms = None 202 | self.render_geoms_xform = None 203 | 204 | # render environment 205 | def _render(self, mode='human', close=True): 206 | if close: 207 | # close any existic renderers 208 | for i,viewer in enumerate(self.viewers): 209 | if viewer is not None: 210 | viewer.close() 211 | self.viewers[i] = None 212 | return [] 213 | 214 | if mode == 'human': 215 | alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 216 | message = '' 217 | for agent in self.world.agents: 218 | comm = [] 219 | for other in self.world.agents: 220 | if other is agent: continue 221 | if np.all(other.state.c == 0): 222 | word = '_' 223 | else: 224 | word = alphabet[np.argmax(other.state.c)] 225 | message += (other.name + ' to ' + agent.name + ': ' + word + ' ') 226 | # print(message) 227 | 228 | for i in range(len(self.viewers)): 229 | # create viewers (if necessary) 230 | if self.viewers[i] is None: 231 | # import rendering only if we need it (and don't import for headless machines) 232 | #from gym.envs.classic_control import rendering 233 | from multiagent import rendering 234 | self.viewers[i] = rendering.Viewer(700,700) 235 | 236 | # create rendering geometry 237 | if self.render_geoms is None: 238 | # import rendering only if we need it (and don't import for headless machines) 239 | #from gym.envs.classic_control import rendering 240 | from multiagent import rendering 241 | self.render_geoms = [] 242 | self.render_geoms_xform = [] 243 | self.comm_geoms = [] 244 | for entity in self.world.entities: 245 | geom = rendering.make_circle(entity.size) 246 | xform = rendering.Transform() 247 | entity_comm_geoms = [] 248 | if 'agent' in entity.name: 249 | geom.set_color(*entity.color, alpha=0.5) 250 | if not entity.silent: 251 | dim_c = self.world.dim_c 252 | # make circles to represent communication 253 | for ci in range(dim_c): 254 | comm = rendering.make_circle(entity.size / dim_c) 255 | comm.set_color(1, 1, 1) 256 | comm.add_attr(xform) 257 | offset = rendering.Transform() 258 | comm_size = (entity.size / dim_c) 259 | offset.set_translation(ci * comm_size * 2 - 260 | entity.size + comm_size, 0) 261 | 
comm.add_attr(offset) 262 | entity_comm_geoms.append(comm) 263 | else: 264 | geom.set_color(*entity.color) 265 | geom.add_attr(xform) 266 | self.render_geoms.append(geom) 267 | self.render_geoms_xform.append(xform) 268 | self.comm_geoms.append(entity_comm_geoms) 269 | for wall in self.world.walls: 270 | corners = ((wall.axis_pos - 0.5 * wall.width, wall.endpoints[0]), 271 | (wall.axis_pos - 0.5 * wall.width, wall.endpoints[1]), 272 | (wall.axis_pos + 0.5 * wall.width, wall.endpoints[1]), 273 | (wall.axis_pos + 0.5 * wall.width, wall.endpoints[0])) 274 | if wall.orient == 'H': 275 | corners = tuple(c[::-1] for c in corners) 276 | geom = rendering.make_polygon(corners) 277 | if wall.hard: 278 | geom.set_color(*wall.color) 279 | else: 280 | geom.set_color(*wall.color, alpha=0.5) 281 | self.render_geoms.append(geom) 282 | 283 | # add geoms to viewer 284 | for viewer in self.viewers: 285 | viewer.geoms = [] 286 | for geom in self.render_geoms: 287 | viewer.add_geom(geom) 288 | for entity_comm_geoms in self.comm_geoms: 289 | for geom in entity_comm_geoms: 290 | viewer.add_geom(geom) 291 | 292 | results = [] 293 | for i in range(len(self.viewers)): 294 | from multiagent import rendering 295 | # update bounds to center around agent 296 | cam_range = 1 297 | if self.shared_viewer: 298 | pos = np.zeros(self.world.dim_p) 299 | else: 300 | pos = self.agents[i].state.p_pos 301 | self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range) 302 | # update geometry positions 303 | for e, entity in enumerate(self.world.entities): 304 | self.render_geoms_xform[e].set_translation(*entity.state.p_pos) 305 | if 'agent' in entity.name: 306 | self.render_geoms[e].set_color(*entity.color, alpha=0.5) 307 | if not entity.silent: 308 | for ci in range(self.world.dim_c): 309 | color = 1 - entity.state.c[ci] 310 | self.comm_geoms[e][ci].set_color(color, color, color) 311 | else: 312 | self.render_geoms[e].set_color(*entity.color) 313 | # render to display or array 314 | results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array')) 315 | 316 | return results 317 | 318 | # create receptor field locations in local coordinate frame 319 | def _make_receptor_locations(self, agent): 320 | receptor_type = 'polar' 321 | range_min = 0.05 * 2.0 322 | range_max = 1.00 323 | dx = [] 324 | # circular receptive field 325 | if receptor_type == 'polar': 326 | for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False): 327 | for distance in np.linspace(range_min, range_max, 3): 328 | dx.append(distance * np.array([np.cos(angle), np.sin(angle)])) 329 | # add origin 330 | dx.append(np.array([0.0, 0.0])) 331 | # grid receptive field 332 | if receptor_type == 'grid': 333 | for x in np.linspace(-range_max, +range_max, 5): 334 | for y in np.linspace(-range_max, +range_max, 5): 335 | dx.append(np.array([x,y])) 336 | return dx 337 | 338 | 339 | # vectorized wrapper for a batch of multi-agent environments 340 | # assumes all environments have the same observation and action space 341 | class BatchMultiAgentEnv(gym.Env): 342 | metadata = { 343 | 'runtime.vectorized': True, 344 | 'render.modes' : ['human', 'rgb_array'] 345 | } 346 | 347 | def __init__(self, env_batch): 348 | self.env_batch = env_batch 349 | 350 | @property 351 | def n(self): 352 | return np.sum([env.n for env in self.env_batch]) 353 | 354 | @property 355 | def action_space(self): 356 | return self.env_batch[0].action_space 357 | 358 | @property 359 | def observation_space(self): 360 | return self.env_batch[0].observation_space 
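# --- editorial sketch (not part of the original source; names assumed) -------
# Typical wiring of a scenario into a single MultiAgentEnv, under the old-gym
# API used here (reset()/step() wrap _reset()/_step()); exact arguments may
# differ from the repository's own launcher:
#
#     import multiagent.scenarios as scenarios
#     scenario = scenarios.load("simple_spread.py").Scenario()
#     world = scenario.make_world()
#     env = MultiAgentEnv(world,
#                         reset_callback=scenario.reset_world,
#                         reward_callback=scenario.reward,
#                         observation_callback=scenario.observation)
#     obs_n = env.reset()
#     act_n = [space.sample() for space in env.action_space]
#     obs_n, rew_n, done_n, info_n = env.step(act_n)
# ------------------------------------------------------------------------------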
361 | 362 | def _step(self, action_n, time): 363 | obs_n = [] 364 | reward_n = [] 365 | done_n = [] 366 | info_n = {'n': []} 367 | i = 0 368 | for env in self.env_batch: 369 | obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time) 370 | i += env.n 371 | obs_n += obs 372 | # reward = [r / len(self.env_batch) for r in reward] 373 | reward_n += reward 374 | done_n += done 375 | return obs_n, reward_n, done_n, info_n 376 | 377 | def _reset(self): 378 | obs_n = [] 379 | for env in self.env_batch: 380 | obs_n += env.reset() 381 | return obs_n 382 | 383 | # render environment 384 | def _render(self, mode='human', close=True): 385 | results_n = [] 386 | for env in self.env_batch: 387 | results_n += env.render(mode, close) 388 | return results_n 389 | -------------------------------------------------------------------------------- /experiments/ibmac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import tensorflow as tf 4 | import maddpg.common.tf_util as U 5 | 6 | from maddpg.common.distributions import make_pdtype 7 | from maddpg import AgentTrainer 8 | from maddpg.trainer.replay_buffer import ReplayBuffer 9 | 10 | import itertools 11 | 12 | 13 | def discount_with_dones(rewards, dones, gamma): 14 | discounted = [] 15 | r = 0 16 | for reward, done in zip(rewards[::-1], dones[::-1]): 17 | r = reward + gamma * r 18 | r = r * (1. - done) 19 | discounted.append(r) 20 | return discounted[::-1] 21 | 22 | def clip_message(message, clip_threshold, is_norm_training, is_inference): 23 | 24 | gamma = tf.Variable(clip_threshold * tf.ones(message.shape[-1]), name='clip_gamma') 25 | beta = tf.Variable(tf.zeros(message.shape[-1]), name='clip_beta') 26 | 27 | pop_mean = tf.Variable(tf.zeros(message.shape[-1]), trainable=False, name='pop_mean') 28 | pop_variance = tf.Variable(tf.ones(message.shape[-1]), trainable=False, name='pop_variance') 29 | 30 | epsilon = 1e-8 31 | 32 | def batch_norm_training(): 33 | batch_mean, batch_variance = tf.nn.moments(message, [0]) 34 | 35 | decay = 0.999 36 | train_mean = tf.assign(pop_mean, pop_mean*decay + batch_mean*(1 - decay), name='train_mean') 37 | train_variance = tf.assign(pop_variance, pop_variance*decay + batch_variance*(1 - decay), name='train_variance') 38 | 39 | with tf.control_dependencies([train_mean, train_variance]): 40 | return tf.nn.batch_normalization(message, batch_mean, batch_variance, batch_mean, tf.math.sqrt(batch_variance), epsilon, name='train_clip_message') 41 | 42 | def batch_norm_inference(): 43 | return tf.nn.batch_normalization(message, pop_mean, pop_variance, beta, gamma, epsilon, name='inference_clip_message') 44 | 45 | def batch_direct_act(): 46 | return message 47 | 48 | batch_normalized_output = tf.case({is_norm_training: batch_norm_training, is_inference: batch_norm_inference}, 49 | default=batch_direct_act, exclusive=True) 50 | 51 | return batch_normalized_output 52 | 53 | 54 | 55 | def make_update_exp(vals, target_vals): 56 | polyak = 1.0 - 1e-2 57 | expression = [] 58 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)): 59 | expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var)) 60 | expression = tf.group(*expression) 61 | return U.function([], [], updates=[expression]) 62 | 63 | 64 | def p_train(make_obs_ph_n, act_space_n, before_com_func, channel, after_com_func, q_func, optimizer, 65 | grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, 
beta=0.01, 66 | ibmac_com=True): 67 | with tf.variable_scope(scope, reuse=reuse): 68 | clip_threshold = 1 # 1, 5, 10 69 | is_norm_training = tf.placeholder(tf.bool) 70 | is_inference = tf.placeholder(tf.bool) 71 | 72 | 73 | ibmac_nocom = not ibmac_com 74 | num_agents = len(make_obs_ph_n) 75 | 76 | # create distributions 77 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 78 | 79 | # set up placeholders 80 | obs_ph_n = make_obs_ph_n 81 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(num_agents)] 82 | 83 | hiddens_n = [before_com_func(obs_ph_n[i], num_units, scope="before_com_{}".format(i), num_units=num_units) for i 84 | in range(num_agents)] 85 | before_com_vars_n = [U.scope_vars(U.absolute_scope_name("before_com_{}".format(i))) for i in range(num_agents)] 86 | 87 | hiddens_n_for_message = tf.concat( 88 | [before_com_func(obs_ph_n[i], num_units, scope="before_com_{}".format(i), reuse=True, num_units=num_units) 89 | for i in range(num_agents)], axis=1) 90 | hiddens_n_for_message = tf.stop_gradient(hiddens_n_for_message) 91 | channel_output = channel(hiddens_n_for_message, num_units * num_agents, scope="channel", 92 | num_units=num_units * num_agents) 93 | message_n, mu_message_n, logvar_message_n = [tf.split(item, num_or_size_splits=num_agents, axis=1) for item in 94 | channel_output] 95 | logvar_message_n = [tf.clip_by_value(log, -10, 10) for log in logvar_message_n] # constrain kl_loss not to be too large 96 | 97 | 98 | message_n = [clip_message(message, clip_threshold, is_norm_training, is_inference) for message in message_n] 99 | 100 | channel_vars_n = [U.scope_vars(U.absolute_scope_name("channel"))] 101 | 102 | if ibmac_nocom: 103 | print('no_com') 104 | p_n = [after_com_func(hiddens_n[i], int(act_pdtype_n[i].param_shape()[0]), scope="p_func_{}".format(i), 105 | num_units=num_units) for i in range(num_agents)] 106 | else: 107 | check_n = [hiddens_n[i] + message_n[i] for i in range(num_agents)] 108 | p_n = [after_com_func(hiddens_n[i] + message_n[i], int(act_pdtype_n[i].param_shape()[0]), 109 | scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents)] 110 | p_func_vars = [U.scope_vars(U.absolute_scope_name("p_func_{}".format(i))) for i in range(num_agents)] 111 | 112 | # wrap parameters in distribution 113 | act_pd_n = [act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)] 114 | 115 | act_sample_n = [act_pd.sample() for act_pd in act_pd_n] 116 | p_reg_n = [tf.reduce_mean(tf.square(act_pd.flatparam())) for act_pd in act_pd_n] 117 | 118 | act_input_n_n = [act_ph_n + [] for _ in range(num_agents)] 119 | for i in range(num_agents): 120 | act_input_n_n[i][i] = act_pd_n[i].sample() 121 | q_input_n = [tf.concat(obs_ph_n + act_input_n, 1) for act_input_n in act_input_n_n] 122 | 123 | q_n = [q_func(q_input_n[i], 1, scope="q_func_{}".format(i), reuse=True, num_units=num_units)[:, 0] for i in 124 | range(num_agents)] 125 | pg_loss_n = [-tf.reduce_mean(q) for q in q_n] 126 | 127 | # # 0.25 128 | # kl_loss_message_n = [2 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(0.5) - 0.5 for mu, log in 129 | # zip(mu_message_n, logvar_message_n)] 130 | 131 | # #1 132 | # kl_loss_message_n = [0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5 for mu, log in 133 | # zip(mu_message_n, logvar_message_n)] 134 | # #5 135 | # kl_loss_message_n = [1.0/50 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(5) - 0.5 for mu, log in 136 | # zip(mu_message_n, logvar_message_n)] 137 | #10 138 | 
kl_loss_message_n = [1.0/200 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(10) - 0.5 for mu, log in 139 | zip(mu_message_n, logvar_message_n)] 140 | 141 | entropy = [tf.exp(log) + 1.4189 for log in logvar_message_n] 142 | 143 | pg_loss = tf.reduce_sum(pg_loss_n) 144 | p_reg = tf.reduce_sum(p_reg_n) 145 | kl_loss_message = tf.reduce_mean(kl_loss_message_n) 146 | 147 | if ibmac_nocom: 148 | loss = pg_loss + p_reg * 1e-3 149 | else: 150 | loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message 151 | 152 | kl_loss = U.function(inputs=obs_ph_n + act_ph_n+[is_norm_training, is_inference], outputs=kl_loss_message) 153 | 154 | var_list = [] 155 | var_list.extend(before_com_vars_n) 156 | if not ibmac_nocom: 157 | var_list.extend(channel_vars_n) 158 | var_list.extend(p_func_vars) 159 | var_list = list(itertools.chain(*var_list)) 160 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) 161 | 162 | # Create callable functions 163 | train = U.function(inputs=obs_ph_n + act_ph_n+[is_norm_training, is_inference], outputs=loss, updates=[optimize_expr]) 164 | act = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=act_sample_n) 165 | p_values = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=p_n) 166 | if not ibmac_nocom: 167 | check_values = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=check_n) 168 | channel_com = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=channel_output) 169 | check_mu = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=mu_message_n) 170 | check_log = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=logvar_message_n) 171 | else: 172 | check_values = lambda x: 0 173 | channel_com = lambda x: 0 174 | check_mu = lambda x: 0 175 | check_log = lambda x: 0 176 | 177 | # target network 178 | target_hiddens_n = [ 179 | before_com_func(obs_ph_n[i], num_units, scope="target_before_com_{}".format(i), num_units=num_units) for i 180 | in range(num_agents)] 181 | target_before_com_vars = [U.scope_vars(U.absolute_scope_name("target_before_com_{}".format(i))) for i in 182 | range(num_agents)] 183 | 184 | target_hiddens_n_for_message = tf.concat([before_com_func(obs_ph_n[i], num_units, 185 | scope="target_before_com_{}".format(i), reuse=True, 186 | num_units=num_units) for i in range(num_agents)], 187 | axis=1) 188 | target_hiddens_n_for_message = tf.stop_gradient(target_hiddens_n_for_message) 189 | target_channel_output = channel(target_hiddens_n_for_message, num_units * num_agents, scope="target_channel", 190 | num_units=num_units * num_agents) 191 | target_message_n, target_mu_message_n, target_logvar_message_n = [ 192 | tf.split(item, num_or_size_splits=num_agents, axis=1) for item in target_channel_output] 193 | target_channel_vars = [U.scope_vars(U.absolute_scope_name("target_channel"))] 194 | if ibmac_nocom: 195 | target_p_n = [after_com_func(target_hiddens_n[i], int(act_pdtype_n[i].param_shape()[0]), 196 | scope="target_p_func_{}".format(i), num_units=num_units) for i in 197 | range(num_agents)] 198 | else: 199 | target_p_n = [ 200 | after_com_func(target_hiddens_n[i] + target_message_n[i], int(act_pdtype_n[i].param_shape()[0]), 201 | scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents)] 202 | # target_p_n = [after_com_func(tf.concat([target_hiddens_n[i],target_message_n[i]], axis=1), int(act_pdtype_n[i].param_shape()[0]), scope="target_p_func_{}".format(i), num_units=num_units) for i in 
range(num_agents)] 203 | target_p_func_vars = [U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i))) for i in 204 | range(num_agents)] 205 | 206 | target_var_list = [] 207 | target_var_list.extend(target_before_com_vars) 208 | if not ibmac_nocom: 209 | target_var_list.extend(target_channel_vars) 210 | target_var_list.extend(target_p_func_vars) 211 | target_var_list = list(itertools.chain(*target_var_list)) 212 | update_target_p = make_update_exp(var_list, target_var_list) 213 | 214 | target_act_sample_n = [act_pdtype_n[i].pdfromflat(target_p_n[i]).sample() for i in range(num_agents)] 215 | target_act = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=target_act_sample_n) 216 | 217 | 218 | check_message_n = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=message_n) 219 | check_hiddens_n = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=hiddens_n) 220 | check_entropy = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=entropy) 221 | 222 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act, 'kl_loss': kl_loss, 223 | 'check_values': check_values, 'channel_com': channel_com, 224 | 'check_mu': check_mu, 'check_log': check_log, 225 | 'check_message_n':check_message_n, 'check_hiddens_n': check_hiddens_n, 226 | 'check_entropy': check_entropy} 227 | 228 | 229 | def q_train(make_obs_ph_n, act_space_n, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", 230 | reuse=None, num_units=64): 231 | with tf.variable_scope(scope, reuse=reuse): 232 | num_agents = len(make_obs_ph_n) 233 | 234 | # create distributions 235 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n] 236 | 237 | # set up placeholders 238 | obs_ph_n = make_obs_ph_n 239 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action_{}".format(i)) for i in 240 | range(len(act_space_n))] 241 | target_ph_n = [tf.placeholder(tf.float32, [None], name="target_{}".format(i)) for i in range(num_agents)] 242 | is_norm_training = tf.placeholder(tf.bool) 243 | is_inference = tf.placeholder(tf.bool) 244 | 245 | q_input = tf.concat(obs_ph_n + act_ph_n, 1) 246 | q_n = [q_func(q_input, 1, scope="q_func_{}".format(i), num_units=num_units)[:, 0] for i in range(num_agents)] 247 | q_func_vars = [U.scope_vars(U.absolute_scope_name("q_func_{}".format(i))) for i in range(num_agents)] 248 | 249 | q_loss_n = [tf.reduce_mean(tf.square(q - target_ph)) for q, target_ph in zip(q_n, target_ph_n)] 250 | 251 | # viscosity solution to Bellman differential equation in place of an initial condition 252 | # q_reg = tf.reduce_mean(tf.square(q)) 253 | q_loss = tf.reduce_sum(q_loss_n) 254 | loss = q_loss # + 1e-3 * q_reg 255 | 256 | var_list = list(itertools.chain(*q_func_vars)) 257 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping) 258 | 259 | # Create callable functions 260 | train = U.function(inputs=obs_ph_n + act_ph_n + target_ph_n+[is_norm_training, is_inference], outputs=loss, updates=[optimize_expr]) 261 | q_values = U.function(obs_ph_n + act_ph_n+[is_norm_training, is_inference], q_n) 262 | 263 | # target network 264 | target_q_n = [q_func(q_input, 1, scope="target_q_func_{}".format(i), num_units=num_units)[:, 0] for i in 265 | range(num_agents)] 266 | target_q_func_vars = [U.scope_vars(U.absolute_scope_name("target_q_func_{}".format(i))) for i in 267 | range(num_agents)] 268 | 269 | target_var_list = list(itertools.chain(*target_q_func_vars)) 270 | 
update_target_q = make_update_exp(var_list, target_var_list) 271 | 272 | target_q_values = U.function(obs_ph_n + act_ph_n+[is_norm_training, is_inference], target_q_n) 273 | 274 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values} 275 | 276 | 277 | class IBMACAgentTrainer(AgentTrainer): 278 | def __init__(self, name, before_com_model, channel, after_com_model, critic_mlp_model, obs_shape_n, act_space_n, 279 | args, local_q_func=False): 280 | self.name = name 281 | self.n = len(obs_shape_n) 282 | self.args = args 283 | obs_ph_n = [] 284 | for i in range(self.n): 285 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get()) 286 | 287 | # Create all the functions necessary to train the model 288 | self.q_train, self.q_update, self.q_debug = q_train( 289 | scope=self.name, 290 | make_obs_ph_n=obs_ph_n, 291 | act_space_n=act_space_n, 292 | q_func=critic_mlp_model, 293 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 294 | grad_norm_clipping=0.5, 295 | local_q_func=local_q_func, 296 | num_units=args.num_units, 297 | ) 298 | self.act, self.p_train, self.p_update, self.p_debug = p_train( 299 | scope=self.name, 300 | make_obs_ph_n=obs_ph_n, 301 | act_space_n=act_space_n, 302 | before_com_func=before_com_model, 303 | channel=channel, 304 | after_com_func=after_com_model, 305 | q_func=critic_mlp_model, 306 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 307 | grad_norm_clipping=0.5, 308 | local_q_func=local_q_func, 309 | num_units=args.num_units, 310 | beta=args.beta, 311 | ibmac_com=args.ibmac_com, 312 | ) 313 | # Create experience buffer 314 | self.replay_buffer = ReplayBuffer(1e6) 315 | # self.max_replay_buffer_len = 50 * args.max_episode_len 316 | self.max_replay_buffer_len = args.batch_size * args.max_episode_len 317 | self.replay_sample_index = None 318 | 319 | self.message_1_for_record = [] 320 | 321 | def action(self, obs_n, is_norm_training=False, is_inference=False): 322 | obs = [obs[None] for obs in obs_n] 323 | message_n = self.p_debug['check_message_n'](*(list(obs)+[is_norm_training, is_inference])) 324 | self.message_1_for_record.append(message_n[0]) 325 | if len(self.message_1_for_record)%2500 == 0: 326 | # print(np.var(self.message_1_for_record, axis=0)) 327 | # print(0.5 * np.log(2 * np.pi * np.mean(np.var(self.message_1_for_record, axis=0))) + 0.5) 328 | self.message_1_for_record = [] 329 | return self.act(*(list(obs)+[is_norm_training, is_inference])) 330 | 331 | def experience(self, obs, act, rew, new_obs, done, terminal): 332 | # Store transition in the replay buffer. 
333 | self.replay_buffer.add(obs, act, rew, new_obs, [float(d) for d in done]) 334 | 335 | def preupdate(self): 336 | self.replay_sample_index = None 337 | 338 | def update(self, agents, t): 339 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough 340 | return 341 | if not t % 100 == 0: # only update every 100 steps 342 | return 343 | is_norm_training = True 344 | is_inference = False 345 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) 346 | # collect replay sample from all agents 347 | obs_n = [] 348 | obs_next_n = [] 349 | act_n = [] 350 | index = self.replay_sample_index 351 | samples = self.replay_buffer.sample_index(index) 352 | obs_n, act_n, rew_n, obs_next_n, done_n = [np.swapaxes(item, 0, 1) for item in samples] 353 | # for i in range(self.n): 354 | # obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) 355 | # obs_n.append(obs) 356 | # obs_next_n.append(obs_next) 357 | # act_n.append(act) 358 | # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) 359 | 360 | # train q network 361 | num_sample = 1 362 | target_q = 0.0 363 | # print(len(obs_next_n)) 364 | for i in range(num_sample): 365 | target_act_next_n = self.p_debug['target_act'](*(list(obs_next_n)+[is_norm_training, is_inference])) 366 | target_q_next_n = self.q_debug['target_q_values'](*(list(obs_next_n) + list(target_act_next_n)+[is_norm_training, is_inference])) 367 | target_q_n = [rew + self.args.gamma * (1.0 - done) * target_q_next for rew, done, target_q_next in 368 | zip(rew_n, done_n, target_q_next_n)] 369 | target_q_n = [target_q / num_sample for target_q in target_q_n] 370 | q_loss = self.q_train(*(list(obs_n) + list(act_n) + target_q_n + [is_norm_training, is_inference])) 371 | 372 | # train p network 373 | p_loss = self.p_train(*(list(obs_n) + list(act_n)+[is_norm_training, is_inference])) 374 | 375 | self.p_update() 376 | self.q_update() 377 | 378 | # p_values = self.p_debug['p_values'](*(list(obs_n))) 379 | kl_loss = self.p_debug['kl_loss'](*(list(obs_n) + list(act_n)+[is_norm_training, is_inference])) 380 | # print('kl_loss', self.p_debug['kl_loss'](*(list(obs_n) + list(act_n)))) 381 | # if t % 5000 == 0: 382 | # print('p_values', p_values[0][0]) 383 | # print('check_value', self.p_debug['p_values'](*(list(obs_n)))[0][0]) 384 | # print('check_mu', self.p_debug['check_mu'](*(list(obs_n)))[0][0]) 385 | # print('check_log', self.p_debug['check_log'](*(list(obs_n)))[0][0]) 386 | 387 | # print('kl_loss', kl_loss) 388 | # message_n = self.p_debug['check_message_n'](*(list(obs_n)+[is_norm_training, is_inference])) 389 | # hiddens_n = self.p_debug['check_hiddens_n'](*list(obs_n)) 390 | # print("message_n", message_n[0][0]) 391 | # for message in message_n: 392 | # print("mean, var", np.mean(message, axis=0), np.var(message,axis=0)) 393 | # print("hiddens_n", hiddens_n[0][0]) 394 | # entropy = self.p_debug['check_entropy'](*list(obs_n)) 395 | # print("entropy",np.mean(entropy, (1,2))) 396 | 397 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew_n), np.mean(target_q_next_n), np.std(target_q), kl_loss] 398 | --------------------------------------------------------------------------------
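Note on the message regulariser in `p_train` above: the active term at line 138, `1.0/200 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(10) - 0.5`, matches the closed-form KL divergence KL(N(mu, sigma^2) || N(0, 10^2)) if the `log` tensors are read as log standard deviations, and the commented-out variants at lines 127-136 swap in prior scales of 0.5, 1 and 5 in the same pattern; `beta` (default 0.01, supplied as `args.beta` by `IBMACAgentTrainer`) weights this term in `loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message`. The snippet below is an editorial sanity-check sketch of that identity in NumPy, not part of the repository.

import numpy as np

def kl_to_channel_prior(mu, log_sigma, prior_sigma):
    # Closed-form KL( N(mu, sigma^2) || N(0, prior_sigma^2) ).
    sigma_sq = np.exp(2.0 * log_sigma)
    return (mu ** 2 + sigma_sq) / (2.0 * prior_sigma ** 2) - log_sigma + np.log(prior_sigma) - 0.5

def kl_as_in_p_train(mu, log_sigma):
    # The active expression from p_train (prior scale 10), rewritten with NumPy scalars.
    return 1.0 / 200 * (mu ** 2 + np.exp(log_sigma) ** 2) - log_sigma + np.log(10) - 0.5

# The two expressions agree for arbitrary message means and log-stds.
for mu, log_sigma in [(0.0, 0.0), (1.5, -0.3), (-2.0, 0.7)]:
    assert np.isclose(kl_to_channel_prior(mu, log_sigma, prior_sigma=10.0),
                      kl_as_in_p_train(mu, log_sigma))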