├── multiagent-particle-envs
│   ├── bin
│   │   ├── __init__.py
│   │   └── interactive.py
│   ├── .gitignore
│   ├── multiagent
│   │   ├── scenarios
│   │   │   ├── __init__.py
│   │   │   ├── simple.py
│   │   │   ├── simple_speaker_listener.py
│   │   │   ├── simple_reference.py
│   │   │   ├── simple_spread.py
│   │   │   ├── simple_spread_partially_observed.py
│   │   │   ├── simple_push.py
│   │   │   ├── simple_adversary.py
│   │   │   ├── simple_tag.py
│   │   │   ├── simple_crypto.py
│   │   │   └── simple_world_comm.py
│   │   ├── scenario.py
│   │   ├── __init__.py
│   │   ├── policy.py
│   │   ├── multi_discrete.py
│   │   ├── rendering.py
│   │   ├── core.py
│   │   └── environment.py
│   ├── setup.py
│   ├── make_env.py
│   └── README.md
├── experiments
│   ├── result_test
│   │   ├── checkpoint
│   │   ├── debug.meta
│   │   ├── debug
│   │   │   ├── checkpoint
│   │   │   ├── team_0.index
│   │   │   ├── team_0.meta
│   │   │   ├── team_0.data-00000-of-00001
│   │   │   ├── events.out.tfevents.1597310617.cilc42-HP-Z4-G4-Workstation
│   │   │   └── events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation
│   │   ├── debug.index
│   │   └── debug.data-00000-of-00001
│   ├── __pycache__
│   │   ├── ibmac.cpython-36.pyc
│   │   └── ibmac_inter.cpython-36.pyc
│   ├── graph
│   │   ├── events.out.tfevents.1597310616.cilc42-HP-Z4-G4-Workstation
│   │   └── events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation
│   ├── ibmac_inter.py
│   └── ibmac.py
├── maddpg
│   ├── __pycache__
│   │   └── __init__.cpython-36.pyc
│   ├── common
│   │   ├── __pycache__
│   │   │   ├── tf_util.cpython-36.pyc
│   │   │   └── distributions.cpython-36.pyc
│   │   ├── tf_util.py
│   │   └── distributions.py
│   ├── trainer
│   │   ├── __pycache__
│   │   │   ├── maddpg.cpython-36.pyc
│   │   │   ├── replay_buffer.cpython-36.pyc
│   │   │   └── replay_buffer_with_messages.cpython-36.pyc
│   │   ├── replay_buffer.py
│   │   ├── replay_buffer_with_messages.py
│   │   └── maddpg.py
│   └── __init__.py
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── vcs.xml
│   ├── modules.xml
│   └── icml_macom.iml
├── requirements.txt
└── README.md
/multiagent-particle-envs/bin/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.egg-info/
3 | *.pyc
--------------------------------------------------------------------------------
/experiments/result_test/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "debug"
2 | all_model_checkpoint_paths: "debug"
3 |
--------------------------------------------------------------------------------
/experiments/result_test/debug.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug.meta
--------------------------------------------------------------------------------
/experiments/result_test/debug/checkpoint:
--------------------------------------------------------------------------------
1 | model_checkpoint_path: "team_0"
2 | all_model_checkpoint_paths: "team_0"
3 |
--------------------------------------------------------------------------------
/experiments/result_test/debug.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug.index
--------------------------------------------------------------------------------
/experiments/result_test/debug/team_0.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/team_0.index
--------------------------------------------------------------------------------
/experiments/result_test/debug/team_0.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/team_0.meta
--------------------------------------------------------------------------------
/maddpg/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/experiments/__pycache__/ibmac.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/__pycache__/ibmac.cpython-36.pyc
--------------------------------------------------------------------------------
/experiments/result_test/debug.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug.data-00000-of-00001
--------------------------------------------------------------------------------
/maddpg/common/__pycache__/tf_util.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/common/__pycache__/tf_util.cpython-36.pyc
--------------------------------------------------------------------------------
/maddpg/trainer/__pycache__/maddpg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/trainer/__pycache__/maddpg.cpython-36.pyc
--------------------------------------------------------------------------------
/experiments/__pycache__/ibmac_inter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/__pycache__/ibmac_inter.cpython-36.pyc
--------------------------------------------------------------------------------
/maddpg/common/__pycache__/distributions.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/common/__pycache__/distributions.cpython-36.pyc
--------------------------------------------------------------------------------
/experiments/result_test/debug/team_0.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/team_0.data-00000-of-00001
--------------------------------------------------------------------------------
/maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/trainer/__pycache__/replay_buffer.cpython-36.pyc
--------------------------------------------------------------------------------
/maddpg/trainer/__pycache__/replay_buffer_with_messages.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/maddpg/trainer/__pycache__/replay_buffer_with_messages.cpython-36.pyc
--------------------------------------------------------------------------------
/experiments/graph/events.out.tfevents.1597310616.cilc42-HP-Z4-G4-Workstation:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/graph/events.out.tfevents.1597310616.cilc42-HP-Z4-G4-Workstation
--------------------------------------------------------------------------------
/experiments/graph/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/graph/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/experiments/result_test/debug/events.out.tfevents.1597310617.cilc42-HP-Z4-G4-Workstation:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/events.out.tfevents.1597310617.cilc42-HP-Z4-G4-Workstation
--------------------------------------------------------------------------------
/experiments/result_test/debug/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EC2EZ4RD/IMAC/HEAD/experiments/result_test/debug/events.out.tfevents.1597310683.cilc42-HP-Z4-G4-Workstation
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/__init__.py:
--------------------------------------------------------------------------------
1 | import imp
2 | import os.path as osp
3 |
4 |
5 | def load(name):
6 | pathname = osp.join(osp.dirname(__file__), name)
7 | return imp.load_source('', pathname)
8 |
--------------------------------------------------------------------------------
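Note that the loader above relies on the `imp` module, which still works on the Python 3.6 this repository pins but is deprecated on newer versions. Purely as a hedged sketch, an equivalent loader built on `importlib` could look like the following; `load_scenario` is a hypothetical name, not part of the package:

```python
import importlib.util
import os.path as osp


def load_scenario(name):
    """Hypothetical importlib-based equivalent of scenarios.load()."""
    # absolute path of the scenario script, e.g. ".../scenarios/simple_spread.py"
    pathname = osp.join(osp.dirname(__file__), name)
    # build a module spec from the file location and execute it
    spec = importlib.util.spec_from_file_location(name.replace('.py', ''), pathname)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
```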
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenario.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # defines scenario upon which the world is built
4 | class BaseScenario(object):
5 | # create elements of the world
6 | def make_world(self):
7 | raise NotImplementedError()
8 | # create initial conditions of the world
9 | def reset_world(self, world):
10 | raise NotImplementedError()
11 |
--------------------------------------------------------------------------------
/maddpg/__init__.py:
--------------------------------------------------------------------------------
1 | class AgentTrainer(object):
2 |     def __init__(self, name, model, obs_shape, act_space, args):
3 |         raise NotImplementedError()
4 | 
5 |     def action(self, obs):
6 |         raise NotImplementedError()
7 | 
8 |     def process_experience(self, obs, act, rew, new_obs, done, terminal):
9 |         raise NotImplementedError()
10 | 
11 |     def preupdate(self):
12 |         raise NotImplementedError()
13 | 
14 |     def update(self, agents):
15 |         raise NotImplementedError()
--------------------------------------------------------------------------------
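`AgentTrainer` only defines the interface that the concrete trainers under `maddpg/trainer/` implement. As an illustration only, a minimal hypothetical subclass that satisfies this interface while acting uniformly at random could look like the sketch below (it assumes a discrete gym action space exposing an `n` attribute):

```python
import numpy as np
from maddpg import AgentTrainer


class RandomAgentTrainer(AgentTrainer):
    """Hypothetical trainer: no learning, uniform action distribution."""

    def __init__(self, name, model, obs_shape, act_space, args):
        self.name = name
        self.act_dim = act_space.n  # assumes gym.spaces.Discrete

    def action(self, obs):
        # return a uniform "soft" action vector over the discrete actions
        return np.ones(self.act_dim) / self.act_dim

    def process_experience(self, obs, act, rew, new_obs, done, terminal):
        pass  # nothing to store

    def preupdate(self):
        pass

    def update(self, agents):
        return None  # no training step
```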
/multiagent-particle-envs/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(name='multiagent',
4 | version='0.0.1',
5 | description='Multi-Agent Goal-Driven Communication Environment',
6 | url='https://github.com/openai/multiagent-public',
7 | author='Igor Mordatch',
8 | author_email='mordatch@openai.com',
9 | packages=find_packages(),
10 | include_package_data=True,
11 | zip_safe=False,
12 | install_requires=['gym', 'numpy-stl']
13 | )
14 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | # Multiagent envs
4 | # ----------------------------------------
5 |
6 | register(
7 | id='MultiagentSimple-v0',
8 | entry_point='multiagent.envs:SimpleEnv',
9 | # FIXME(cathywu) currently has to be exactly max_path_length parameters in
10 | # rllab run script
11 | max_episode_steps=100,
12 | )
13 |
14 | register(
15 | id='MultiagentSimpleSpeakerListener-v0',
16 | entry_point='multiagent.envs:SimpleSpeakerListenerEnv',
17 | max_episode_steps=100,
18 | )
19 |
--------------------------------------------------------------------------------
/.idea/icml_macom.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.7.0
2 | astor==0.7.1
3 | -e git+https://github.com/openai/baselines.git@57e05eb420f9a20fa8cd7ee7580b1a8874b4a323#egg=baselines
4 | cffi==1.12.2
5 | chardet==3.0.4
6 | Click==7.0
7 | cloudpickle==0.7.0
8 | cycler==0.10.0
9 | Cython==0.29.6
10 | dill==0.2.9
11 | enum34==1.1.6
12 | future==0.17.1
13 | futures==3.1.1
14 | gast==0.2.2
15 | glfw==1.7.1
16 | grpcio==1.18.0
17 | gym==0.9.4
18 | h5py==2.9.0
19 | idna==2.8
20 | imageio==2.5.0
21 | joblib==0.13.1
22 | Keras-Applications==1.0.7
23 | Keras-Preprocessing==1.0.9
24 | kiwisolver==1.0.1
25 | lockfile==0.12.2
26 | lxml==4.3.2
27 | Markdown==3.0.1
28 | matplotlib==3.0.2
29 | -e multiagent-particle-envs
30 | numpy==1.16.2
31 | numpy-stl==2.9.0
32 | opencv-python==4.0.0.21
33 | pandas==0.24.1
34 | Pillow==5.4.1
35 | progressbar2==3.39.2
36 | protobuf==3.6.1
37 | pycparser==2.19
38 | pygame==1.9.4
39 | pyglet==1.3.2
40 | PyOpenGL==3.1.0
41 | pyparsing==2.3.1
42 | python-dateutil==2.8.0
43 | python-utils==2.3.0
44 | pytz==2018.9
45 | requests==2.21.0
46 | scipy==1.2.0
47 | seaborn==0.9.0
48 | six==1.12.0
49 | tensorboard==1.12.2
50 | tensorboardX==1.6
51 | tensorflow==1.12.0
52 | termcolor==1.1.0
53 | torch==1.0.1
54 | torchvision==0.2.1
55 | tqdm==4.30.0
56 | urllib3==1.24.1
57 | Werkzeug==0.14.1
58 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/bin/interactive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os,sys
3 | sys.path.insert(1, os.path.join(sys.path[0], '..'))
4 | import argparse
5 | import time
6 |
7 | from multiagent.environment import MultiAgentEnv
8 | from multiagent.policy import InteractivePolicy
9 | import multiagent.scenarios as scenarios
10 |
11 | if __name__ == '__main__':
12 | # parse arguments
13 | parser = argparse.ArgumentParser(description=None)
14 | parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.')
15 | args = parser.parse_args()
16 |
17 | # load scenario from script
18 | scenario = scenarios.load(args.scenario).Scenario()
19 | # create world
20 | world = scenario.make_world()
21 | # create multiagent environment
22 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
23 | scenario.observation, info_callback=None,
24 | shared_viewer=False, discrete_action=True)
25 | # render call to create viewer window (necessary only for interactive policies)
26 | env.render()
27 | # create interactive policies for each agent
28 | policies = [InteractivePolicy(env,i) for i in range(env.n)]
29 | # execution loop
30 | obs_n = env.reset()
31 | while True:
32 | start = time.time()
33 | # query for action from each agent's policy
34 | act_n = []
35 | for i, policy in enumerate(policies):
36 | act_n.append(policy.action(obs_n[i]))
37 | # step environment
38 | obs_n, reward_n, done_n, _ = env.step(act_n)
39 | # render all agent views
40 | env.render()
41 | end = time.time()
42 | elapsed = end - start
43 | time.sleep(max(1 / 30 - elapsed, 0))
44 | # display rewards
45 | #for agent in env.world.agents:
46 | # print(agent.name + " reward: %0.3f" % env._get_reward(agent))
47 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/make_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Code for creating a multiagent environment with one of the scenarios listed
3 | in ./scenarios/.
4 | Can be called by using, for example:
5 | env = make_env('simple_speaker_listener')
6 | After producing the env object, can be used similarly to an OpenAI gym
7 | environment.
8 |
9 | A policy using this environment must output actions in the form of a list
10 | for all agents. Each element of the list should be a numpy array,
11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede
12 | communication actions in this array. See environment.py for more details.
13 | """
14 |
15 | def make_env(scenario_name, benchmark=False):
16 | '''
17 | Creates a MultiAgentEnv object as env. This can be used similar to a gym
18 | environment by calling env.reset() and env.step().
19 | Use env.render() to view the environment on the screen.
20 |
21 | Input:
22 | scenario_name : name of the scenario from ./scenarios/ to be Returns
23 | (without the .py extension)
24 | benchmark : whether you want to produce benchmarking data
25 | (usually only done during evaluation)
26 |
27 | Some useful env properties (see environment.py):
28 | .observation_space : Returns the observation space for each agent
29 | .action_space : Returns the action space for each agent
30 | .n : Returns the number of Agents
31 | '''
32 | from multiagent.environment import MultiAgentEnv
33 | import multiagent.scenarios as scenarios
34 |
35 | # load scenario from script
36 | scenario = scenarios.load(scenario_name + ".py").Scenario()
37 | # create world
38 | world = scenario.make_world()
39 | # create multiagent environment
40 | if benchmark:
41 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
42 | else:
43 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
44 | return env
45 |
--------------------------------------------------------------------------------
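A minimal hedged usage sketch of the workflow the docstring above describes (run from the `multiagent-particle-envs` root, or after `pip install -e .`); it only exercises the documented properties and `reset()`:

```python
from make_env import make_env

# cooperative navigation scenario from ./multiagent/scenarios/simple_spread.py
env = make_env('simple_spread')

print(env.n)                  # number of agents
print(env.action_space)       # one action space per agent
print(env.observation_space)  # one observation space per agent

obs_n = env.reset()           # list with one observation per agent
```

Per the docstring, `env.step()` then expects a list with one action array per agent, with physical actions preceding communication actions.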
/multiagent-particle-envs/multiagent/policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from pyglet.window import key
3 |
4 | # individual agent policy
5 | class Policy(object):
6 | def __init__(self):
7 | pass
8 | def action(self, obs):
9 | raise NotImplementedError()
10 |
11 | # interactive policy based on keyboard input
12 | # hard-coded to deal only with movement, not communication
13 | class InteractivePolicy(Policy):
14 | def __init__(self, env, agent_index):
15 | super(InteractivePolicy, self).__init__()
16 | self.env = env
17 | # hard-coded keyboard events
18 | self.move = [False for i in range(4)]
19 | self.comm = [False for i in range(env.world.dim_c)]
20 | # register keyboard events with this environment's window
21 | env.viewers[agent_index].window.on_key_press = self.key_press
22 | env.viewers[agent_index].window.on_key_release = self.key_release
23 |
24 | def action(self, obs):
25 | # ignore observation and just act based on keyboard events
26 | if self.env.discrete_action_input:
27 | u = 0
28 | if self.move[0]: u = 1
29 | if self.move[1]: u = 2
30 | if self.move[2]: u = 4
31 | if self.move[3]: u = 3
32 | else:
33 | u = np.zeros(5) # 5-d because of no-move action
34 | if self.move[0]: u[1] += 1.0
35 | if self.move[1]: u[2] += 1.0
36 | if self.move[3]: u[3] += 1.0
37 | if self.move[2]: u[4] += 1.0
38 | if True not in self.move:
39 | u[0] += 1.0
40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)])
41 |
42 | # keyboard event callbacks
43 | def key_press(self, k, mod):
44 | if k==key.RIGHT: self.move[0] = True
45 | if k==key.LEFT: self.move[1] = True
46 | if k==key.DOWN: self.move[2] = True
47 | if k==key.UP: self.move[3] = True
48 | def key_release(self, k, mod):
49 | if k==key.RIGHT: self.move[0] = False
50 | if k==key.LEFT: self.move[1] = False
51 | if k==key.DOWN: self.move[2] = False
52 | if k==key.UP: self.move[3] = False
53 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 |
5 | class Scenario(BaseScenario):
6 | def make_world(self):
7 | world = World()
8 | # add agents
9 | world.agents = [Agent() for i in range(1)]
10 | for i, agent in enumerate(world.agents):
11 | agent.name = 'agent %d' % i
12 | agent.collide = False
13 | agent.silent = True
14 | # add landmarks
15 | world.landmarks = [Landmark() for i in range(1)]
16 | for i, landmark in enumerate(world.landmarks):
17 | landmark.name = 'landmark %d' % i
18 | landmark.collide = False
19 | landmark.movable = False
20 | # make initial conditions
21 | self.reset_world(world)
22 | return world
23 |
24 | def reset_world(self, world):
25 | # random properties for agents
26 | for i, agent in enumerate(world.agents):
27 | agent.color = np.array([0.25,0.25,0.25])
28 | # random properties for landmarks
29 | for i, landmark in enumerate(world.landmarks):
30 | landmark.color = np.array([0.75,0.75,0.75])
31 | world.landmarks[0].color = np.array([0.75,0.25,0.25])
32 | # set random initial states
33 | for agent in world.agents:
34 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p)
35 | agent.state.p_vel = np.zeros(world.dim_p)
36 | agent.state.c = np.zeros(world.dim_c)
37 | for i, landmark in enumerate(world.landmarks):
38 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p)
39 | landmark.state.p_vel = np.zeros(world.dim_p)
40 |
41 | def reward(self, agent, world):
42 | dist2 = np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos))
43 | return -dist2 #np.exp(-dist2)
44 |
45 | def observation(self, agent, world):
46 | # get positions of all entities in this agent's reference frame
47 | entity_pos = []
48 | for entity in world.landmarks:
49 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
50 | return np.concatenate([agent.state.p_vel] + entity_pos)
51 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/multi_discrete.py:
--------------------------------------------------------------------------------
1 | # An old version of OpenAI Gym's multi_discrete.py. (Was getting affected by Gym updates)
2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py)
3 |
4 | import numpy as np
5 |
6 | import gym
7 | from gym.spaces import prng
8 |
9 | class MultiDiscrete(gym.Space):
10 | """
11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters
12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space
13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
15 | where the discrete action space can take any integers from `min` to `max` (both inclusive)
16 | Note: A value of 0 always need to represent the NOOP action.
17 | e.g. Nintendo Game Controller
18 | - Can be conceptualized as 3 discrete action spaces:
19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
22 | - Can be initialized as
23 | MultiDiscrete([ [0,4], [0,1], [0,1] ])
24 | """
25 | def __init__(self, array_of_param_array):
26 | self.low = np.array([x[0] for x in array_of_param_array])
27 | self.high = np.array([x[1] for x in array_of_param_array])
28 | self.num_discrete_space = self.low.shape[0]
29 |
30 | def sample(self):
31 | """ Returns a array with one sample from each discrete action space """
32 | # For each row: round(random .* (max - min) + min, 0)
33 | random_array = prng.np_random.rand(self.num_discrete_space)
34 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
35 | def contains(self, x):
36 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all()
37 |
38 | @property
39 | def shape(self):
40 | return self.num_discrete_space
41 | def __repr__(self):
42 | return "MultiDiscrete" + str(self.num_discrete_space)
43 | def __eq__(self, other):
44 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
--------------------------------------------------------------------------------
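A brief sketch of the controller example from the docstring, assuming the old gym release pinned in `requirements.txt` (0.9.4), which still exposes `gym.spaces.prng`:

```python
from multiagent.multi_discrete import MultiDiscrete

# arrow keys (0-4), button A (0-1), button B (0-1), as in the docstring above
space = MultiDiscrete([[0, 4], [0, 1], [0, 1]])

print(space.shape)                # 3 discrete sub-spaces
print(space.sample())             # e.g. [2, 0, 1]
print(space.contains([4, 1, 0]))  # True
```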
/maddpg/trainer/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 |
4 | class ReplayBuffer(object):
5 |     def __init__(self, size):
6 |         """Create Replay buffer.
7 |
8 | Parameters
9 | ----------
10 | size: int
11 | Max number of transitions to store in the buffer. When the buffer
12 | overflows the old memories are dropped.
13 | """
14 | self._storage = []
15 | self._maxsize = int(size)
16 | self._next_idx = 0
17 |
18 | def __len__(self):
19 | return len(self._storage)
20 |
21 | def clear(self):
22 | self._storage = []
23 | self._next_idx = 0
24 |
25 | def add(self, obs_t, action, reward, obs_tp1, done):
26 | data = (obs_t, action, reward, obs_tp1, done)
27 |
28 | if self._next_idx >= len(self._storage):
29 | self._storage.append(data)
30 | else:
31 | self._storage[self._next_idx] = data
32 | self._next_idx = (self._next_idx + 1) % self._maxsize
33 |
34 | def _encode_sample(self, idxes):
35 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
36 | for i in idxes:
37 | data = self._storage[i]
38 | obs_t, action, reward, obs_tp1, done = data
39 | obses_t.append(np.array(obs_t, copy=False))
40 | actions.append(np.array(action, copy=False))
41 | rewards.append(reward)
42 | obses_tp1.append(np.array(obs_tp1, copy=False))
43 | dones.append(done)
44 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)
45 |
46 | def make_index(self, batch_size):
47 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
48 |
49 | def make_latest_index(self, batch_size):
50 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)]
51 | np.random.shuffle(idx)
52 | return idx
53 |
54 | def sample_index(self, idxes):
55 | return self._encode_sample(idxes)
56 |
57 | def sample(self, batch_size):
58 | """Sample a batch of experiences.
59 |
60 | Parameters
61 | ----------
62 | batch_size: int
63 | How many transitions to sample.
64 |
65 | Returns
66 | -------
67 | obs_batch: np.array
68 | batch of observations
69 | act_batch: np.array
70 | batch of actions executed given obs_batch
71 | rew_batch: np.array
72 | rewards received as results of executing act_batch
73 | next_obs_batch: np.array
74 | next set of observations seen after executing act_batch
75 | done_mask: np.array
76 | done_mask[i] = 1 if executing act_batch[i] resulted in
77 | the end of an episode and 0 otherwise.
78 | """
79 | if batch_size > 0:
80 | idxes = self.make_index(batch_size)
81 | else:
82 | idxes = range(0, len(self._storage))
83 | return self._encode_sample(idxes)
84 |
85 | def collect(self):
86 | return self.sample(-1)
87 |
--------------------------------------------------------------------------------
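A minimal hedged usage sketch of the buffer API above, filling it with dummy transitions and drawing a batch:

```python
import numpy as np
from maddpg.trainer.replay_buffer import ReplayBuffer

buffer = ReplayBuffer(size=1000)
for _ in range(10):
    obs = np.random.randn(4)       # dummy 4-dim observation
    act = np.random.randn(2)       # dummy 2-dim action
    new_obs = np.random.randn(4)
    buffer.add(obs, act, 0.0, new_obs, False)

# random batch of 4 transitions
obs_b, act_b, rew_b, next_obs_b, done_b = buffer.sample(4)

# everything stored so far (the sample(-1) path used by collect())
all_obs, all_act, all_rew, all_next, all_done = buffer.collect()
```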
/maddpg/trainer/replay_buffer_with_messages.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 |
4 | class ReplayBuffer(object):
5 |     def __init__(self, size):
6 |         """Create Replay buffer.
7 |
8 | Parameters
9 | ----------
10 | size: int
11 | Max number of transitions to store in the buffer. When the buffer
12 | overflows the old memories are dropped.
13 | """
14 | self._storage = []
15 | self._maxsize = int(size)
16 | self._next_idx = 0
17 |
18 | def __len__(self):
19 | return len(self._storage)
20 |
21 | def clear(self):
22 | self._storage = []
23 | self._next_idx = 0
24 |
25 | def add(self, obs_t, message, action, reward, obs_tp1, done):
26 | data = (obs_t, message, action, reward, obs_tp1, done)
27 |
28 | if self._next_idx >= len(self._storage):
29 | self._storage.append(data)
30 | else:
31 | self._storage[self._next_idx] = data
32 | self._next_idx = (self._next_idx + 1) % self._maxsize
33 |
34 | def _encode_sample(self, idxes):
35 | obses_t, messages, actions, rewards, obses_tp1, dones = [], [], [], [], [], []
36 | for i in idxes:
37 | data = self._storage[i]
38 | obs_t, message, action, reward, obs_tp1, done = data
39 | obses_t.append(np.array(obs_t, copy=False))
40 | messages.append(np.array(message, copy=False))
41 | actions.append(np.array(action, copy=False))
42 | rewards.append(reward)
43 | obses_tp1.append(np.array(obs_tp1, copy=False))
44 | dones.append(done)
45 | return np.array(obses_t), np.array(messages), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)
46 |
47 | def make_index(self, batch_size):
48 | return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
49 |
50 | def make_latest_index(self, batch_size):
51 | idx = [(self._next_idx - 1 - i) % self._maxsize for i in range(batch_size)]
52 | np.random.shuffle(idx)
53 | return idx
54 |
55 | def sample_index(self, idxes):
56 | return self._encode_sample(idxes)
57 |
58 | def sample(self, batch_size):
59 | """Sample a batch of experiences.
60 |
61 | Parameters
62 | ----------
63 | batch_size: int
64 | How many transitions to sample.
65 |
66 | Returns
67 | -------
68 | obs_batch: np.array
69 | batch of observations
70 | act_batch: np.array
71 | batch of actions executed given obs_batch
72 | rew_batch: np.array
73 | rewards received as results of executing act_batch
74 | next_obs_batch: np.array
75 | next set of observations seen after executing act_batch
76 | done_mask: np.array
77 | done_mask[i] = 1 if executing act_batch[i] resulted in
78 | the end of an episode and 0 otherwise.
79 | """
80 | if batch_size > 0:
81 | idxes = self.make_index(batch_size)
82 | else:
83 | idxes = range(0, len(self._storage))
84 | return self._encode_sample(idxes)
85 |
86 | def collect(self):
87 | return self.sample(-1)
88 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple_speaker_listener.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 |
5 | class Scenario(BaseScenario):
6 | def make_world(self):
7 | world = World()
8 | # set any world properties first
9 | world.dim_c = 3
10 | num_landmarks = 3
11 | # add agents
12 | world.agents = [Agent() for i in range(2)]
13 | for i, agent in enumerate(world.agents):
14 | agent.name = 'agent %d' % i
15 | agent.collide = False
16 | agent.size = 0.075
17 | # speaker
18 | world.agents[0].movable = False
19 | # listener
20 | world.agents[1].silent = True
21 | # add landmarks
22 | world.landmarks = [Landmark() for i in range(num_landmarks)]
23 | for i, landmark in enumerate(world.landmarks):
24 | landmark.name = 'landmark %d' % i
25 | landmark.collide = False
26 | landmark.movable = False
27 | landmark.size = 0.04
28 | # make initial conditions
29 | self.reset_world(world)
30 | return world
31 |
32 | def reset_world(self, world):
33 | # assign goals to agents
34 | for agent in world.agents:
35 | agent.goal_a = None
36 | agent.goal_b = None
37 | # want listener to go to the goal landmark
38 | world.agents[0].goal_a = world.agents[1]
39 | world.agents[0].goal_b = np.random.choice(world.landmarks)
40 | # random properties for agents
41 | for i, agent in enumerate(world.agents):
42 | agent.color = np.array([0.25,0.25,0.25])
43 | # random properties for landmarks
44 | world.landmarks[0].color = np.array([0.65,0.15,0.15])
45 | world.landmarks[1].color = np.array([0.15,0.65,0.15])
46 | world.landmarks[2].color = np.array([0.15,0.15,0.65])
47 | # special colors for goals
48 | world.agents[0].goal_a.color = world.agents[0].goal_b.color + np.array([0.45, 0.45, 0.45])
49 | # set random initial states
50 | for agent in world.agents:
51 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p)
52 | agent.state.p_vel = np.zeros(world.dim_p)
53 | agent.state.c = np.zeros(world.dim_c)
54 | for i, landmark in enumerate(world.landmarks):
55 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p)
56 | landmark.state.p_vel = np.zeros(world.dim_p)
57 |
58 | def benchmark_data(self, agent, world):
59 | # returns data for benchmarking purposes
60 |         return self.reward(agent, world)
61 |
62 | def reward(self, agent, world):
63 | # squared distance from listener to landmark
64 | a = world.agents[0]
65 | dist2 = np.sum(np.square(a.goal_a.state.p_pos - a.goal_b.state.p_pos))
66 | return -dist2
67 |
68 | def observation(self, agent, world):
69 | # goal color
70 | goal_color = np.zeros(world.dim_color)
71 | if agent.goal_b is not None:
72 | goal_color = agent.goal_b.color
73 |
74 | # get positions of all entities in this agent's reference frame
75 | entity_pos = []
76 | for entity in world.landmarks:
77 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
78 |
79 | # communication of all other agents
80 | comm = []
81 | for other in world.agents:
82 | if other is agent or (other.state.c is None): continue
83 | comm.append(other.state.c)
84 |
85 | # speaker
86 | if not agent.movable:
87 | return np.concatenate([goal_color])
88 | # listener
89 | if agent.silent:
90 | return np.concatenate([agent.state.p_vel] + entity_pos + comm)
91 |
92 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple_reference.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 |
5 | class Scenario(BaseScenario):
6 | def make_world(self):
7 | world = World()
8 | # set any world properties first
9 | world.dim_c = 10
10 | # add agents
11 | world.agents = [Agent() for i in range(2)]
12 | for i, agent in enumerate(world.agents):
13 | agent.name = 'agent %d' % i
14 | agent.collide = False
15 | # agent.u_noise = 1e-1
16 | # agent.c_noise = 1e-1
17 | # add landmarks
18 | world.landmarks = [Landmark() for i in range(3)]
19 | for i, landmark in enumerate(world.landmarks):
20 | landmark.name = 'landmark %d' % i
21 | landmark.collide = False
22 | landmark.movable = False
23 | # make initial conditions
24 | self.reset_world(world)
25 | return world
26 |
27 | def reset_world(self, world):
28 | # assign goals to agents
29 | for agent in world.agents:
30 | agent.goal_a = None
31 | agent.goal_b = None
32 | # want other agent to go to the goal landmark
33 | world.agents[0].goal_a = world.agents[1]
34 | world.agents[0].goal_b = np.random.choice(world.landmarks)
35 | world.agents[1].goal_a = world.agents[0]
36 | world.agents[1].goal_b = np.random.choice(world.landmarks)
37 | # random properties for agents
38 | for i, agent in enumerate(world.agents):
39 | agent.color = np.array([0.25,0.25,0.25])
40 | # random properties for landmarks
41 | world.landmarks[0].color = np.array([0.75,0.25,0.25])
42 | world.landmarks[1].color = np.array([0.25,0.75,0.25])
43 | world.landmarks[2].color = np.array([0.25,0.25,0.75])
44 | # special colors for goals
45 | world.agents[0].goal_a.color = world.agents[0].goal_b.color
46 | world.agents[1].goal_a.color = world.agents[1].goal_b.color
47 | # set random initial states
48 | for agent in world.agents:
49 | agent.state.p_pos = np.random.uniform(-1,+1, world.dim_p)
50 | agent.state.p_vel = np.zeros(world.dim_p)
51 | agent.state.c = np.zeros(world.dim_c)
52 | for i, landmark in enumerate(world.landmarks):
53 | landmark.state.p_pos = np.random.uniform(-1,+1, world.dim_p)
54 | landmark.state.p_vel = np.zeros(world.dim_p)
55 |
56 | def reward(self, agent, world):
57 | if agent.goal_a is None or agent.goal_b is None:
58 | return 0.0
59 | dist2 = np.sum(np.square(agent.goal_a.state.p_pos - agent.goal_b.state.p_pos))
60 | return -dist2 #np.exp(-dist2)
61 |
62 | def observation(self, agent, world):
63 | # goal positions
64 | # goal_pos = [np.zeros(world.dim_p), np.zeros(world.dim_p)]
65 | # if agent.goal_a is not None:
66 | # goal_pos[0] = agent.goal_a.state.p_pos - agent.state.p_pos
67 | # if agent.goal_b is not None:
68 | # goal_pos[1] = agent.goal_b.state.p_pos - agent.state.p_pos
69 | # goal color
70 | goal_color = [np.zeros(world.dim_color), np.zeros(world.dim_color)]
71 | # if agent.goal_a is not None:
72 | # goal_color[0] = agent.goal_a.color
73 | if agent.goal_b is not None:
74 | goal_color[1] = agent.goal_b.color
75 |
76 | # get positions of all entities in this agent's reference frame
77 | entity_pos = []
78 | for entity in world.landmarks: #world.entities:
79 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
80 | # entity colors
81 | entity_color = []
82 | for entity in world.landmarks: #world.entities:
83 | entity_color.append(entity.color)
84 | # communication of all other agents
85 | comm = []
86 | for other in world.agents:
87 | if other is agent: continue
88 | comm.append(other.state.c)
89 | return np.concatenate([agent.state.p_vel] + entity_pos + [goal_color[1]] + comm)
90 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Efficient Multi-agent Communication: An Information Bottleneck Approach
2 |
3 | This is the implementation code for NIPS paper #3584.
4 |
5 | ## Installation
6 |
7 |
8 | ```
9 | conda create -n imac python=3.6
10 | conda activate imac
11 | pip install tensorflow==1.12.0
12 | conda install mkl_fft=1.0.10
13 | pip install -r requirements.txt
14 | ```
15 |
16 | - Known dependencies: Python (3.6.8), OpenAI gym (0.9.4), tensorflow (1.12.0), numpy (1.16.2)
17 |
18 | ## How to run
19 |
20 | To run the code, `cd` into the `experiments` directory and run `train.py`:
21 |
22 | ```
23 | python train.py --scenario simple_spread --exp-name debug --save-dir ./result_test/debug --batch-size 1024 --ibmac_com --trainer ibmac
24 | ```
25 |
26 | You can use tensorboard to visualize the results.
27 |
28 | ## Command-line options
29 |
30 | ### Environment options
31 |
32 | - `--scenario`: defines which environment in the MPE is to be used (default: `"simple_spread"`)
33 |
34 | - `--max-episode-len` maximum length of each episode for the environment (default: `25`)
35 |
36 | - `--num-episodes` total number of training episodes (default: `60000`)
37 |
38 | - `--num-adversaries`: number of adversaries in the environment (default: `0`)
39 |
40 | - `--good-policy`: algorithm used for the 'good' (non adversary) policies in the environment
41 | (default: `"maddpg"`; options: {`"maddpg"`, `"ddpg"`})
42 |
43 | - `--adv-policy`: algorithm used for the adversary policies in the environment
44 | (default: `"maddpg"`; options: {`"maddpg"`, `"ddpg"`})
45 |
46 | ### Core training parameters
47 |
48 | - `--trainer`: which training algorithm to use (default: `"ibmac"`; options: {`"ibmac"`, `"ibmac_inter"`})
49 | 
50 |   `ibmac`: trains the scheduler
51 | 
52 |   `ibmac_inter`: trains the policy and the message outputs
53 |
54 | - `--lr`: learning rate (default: `1e-2`)
55 |
56 | - `--gamma`: discount factor (default: `0.95`)
57 |
58 | - `--batch-size`: batch size (default: `1024`)
59 |
60 | - `--num-units`: number of units in the MLP (default: `64`)
61 |
62 | - `--beta`: coefficient of KL loss (default: `0.05`)
63 |
64 | - `--ibmac_com`: boolean flag that enables communication (default: `False`)
65 |
66 | - `--random-seed`: random seed (default: `42`)
67 |
68 | ### Checkpointing
69 |
70 | - `--exp-name`: name of the experiment, used as the file name to save all results (default: `None`)
71 |
72 | - `--save-dir`: directory where intermediate training results and model will be saved (default: `"/tmp/policy/"`)
73 |
74 | - `--save-rate`: model is saved every time this number of episodes has been completed (default: `1000`)
75 |
76 | - `--load-dir`: directory where training state and model are loaded from (default: `""`)
77 |
78 | ### Evaluation
79 |
80 | - `--restore`: restores previous training state stored in `load-dir` (or in `save-dir` if no `load-dir`
81 | has been provided), and continues training (default: `False`)
82 |
83 | - `--display`: displays to the screen the trained policy stored in `load-dir` (or in `save-dir` if no `load-dir`
84 | has been provided), but does not continue training (default: `False`)
85 |
86 | - `--benchmark`: runs benchmarking evaluations on saved policy, saves results to `benchmark-dir` folder (default: `False`)
87 |
88 | - `--benchmark-iters`: number of iterations to run benchmarking for (default: `100000`)
89 |
90 | - `--benchmark-dir`: directory where benchmarking data is saved (default: `"./benchmark_files/"`)
91 |
92 | - `--plots-dir`: directory where training curves are saved (default: `"./learning_curves/"`)
93 |
94 | ### Acknowledgement
95 |
96 | Our code is based on the version in:
97 |
98 | @article{lowe2017multi,
99 | title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
100 | author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
101 | journal={Neural Information Processing Systems (NIPS)},
102 | year={2017}
103 | }
104 |
105 |
106 | We slightly modified the environment's **act_space** setting, so the final reward output will differ somewhat if you directly install the original version of the environment.
107 |
108 | We also add a new scenario: `simple_spread_partially_observed`. Its `num_agents` can be modified to use more agents.
109 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple_spread.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 |
5 |
6 | class Scenario(BaseScenario):
7 | def make_world(self):
8 | world = World()
9 | # set any world properties first
10 | world.dim_c = 2
11 | num_agents = 3
12 | num_landmarks = 3
13 | # add agents
14 | world.agents = [Agent() for i in range(num_agents)]
15 | for i, agent in enumerate(world.agents):
16 | agent.name = 'agent %d' % i
17 | agent.collide = True
18 | agent.silent = True
19 | agent.size = 0.05
20 | # add landmarks
21 | world.landmarks = [Landmark() for i in range(num_landmarks)]
22 | for i, landmark in enumerate(world.landmarks):
23 | landmark.name = 'landmark %d' % i
24 | landmark.collide = False
25 | landmark.movable = False
26 | # make initial conditions
27 | self.reset_world(world)
28 | return world
29 |
30 | def reset_world(self, world):
31 | # random properties for agents
32 | for i, agent in enumerate(world.agents):
33 | agent.color = np.array([0.35, 0.35, 0.85])
34 | # random properties for landmarks
35 | for i, landmark in enumerate(world.landmarks):
36 | landmark.color = np.array([0.25, 0.25, 0.25])
37 | # set random initial states
38 | for agent in world.agents:
39 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
40 | agent.state.p_vel = np.zeros(world.dim_p)
41 | agent.state.c = np.zeros(world.dim_c)
42 | for i, landmark in enumerate(world.landmarks):
43 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
44 | landmark.state.p_vel = np.zeros(world.dim_p)
45 |
46 | def benchmark_data(self, agent, world):
47 | rew = 0
48 | collisions = 0
49 | occupied_landmarks = 0
50 | min_dists = 0
51 | for l in world.landmarks:
52 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents]
53 | min_dists += min(dists)
54 | rew -= min(dists)
55 | if min(dists) < 0.1:
56 | occupied_landmarks += 1
57 | if agent.collide:
58 | for a in world.agents:
59 | if self.is_collision(a, agent):
60 | rew -= 1
61 | collisions += 1
62 | return (rew, collisions, min_dists, occupied_landmarks)
63 |
64 |
65 | def is_collision(self, agent1, agent2):
66 | delta_pos = agent1.state.p_pos - agent2.state.p_pos
67 | dist = np.sqrt(np.sum(np.square(delta_pos)))
68 | dist_min = agent1.size + agent2.size
69 |         return dist < dist_min
70 |
71 | def reward(self, agent, world):
72 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions
73 | rew = 0
74 | for l in world.landmarks:
75 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents]
76 | rew -= min(dists)
77 | if agent.collide:
78 | for a in world.agents:
79 | if self.is_collision(a, agent):
80 | rew -= 1
81 | return rew
82 |
83 | def observation(self, agent, world):
84 | # get positions of all entities in this agent's reference frame
85 | entity_pos = []
86 | for entity in world.landmarks: # world.entities:
87 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
88 | # entity colors
89 | entity_color = []
90 | for entity in world.landmarks: # world.entities:
91 | entity_color.append(entity.color)
92 | # communication of all other agents
93 | comm = []
94 | other_pos = []
95 | for other in world.agents:
96 | if other is agent: continue
97 | comm.append(other.state.c)
98 | other_pos.append(other.state.p_pos - agent.state.p_pos)
99 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm)
100 |
--------------------------------------------------------------------------------
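For reference, the per-agent observation in `simple_spread` concatenates the agent's own velocity and position, the relative positions of all landmarks and all other agents, and the other agents' communication states. A small sketch of the resulting length under the defaults above:

```python
# expected per-agent observation length for simple_spread with the defaults
# above (3 agents, 3 landmarks, dim_p = 2, dim_c = 2)
dim_p, dim_c = 2, 2
num_agents, num_landmarks = 3, 3
obs_dim = (dim_p                        # own velocity
           + dim_p                      # own position
           + num_landmarks * dim_p      # relative landmark positions
           + (num_agents - 1) * dim_p   # relative positions of other agents
           + (num_agents - 1) * dim_c)  # communication of other agents
print(obs_dim)  # 18
```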
/multiagent-particle-envs/multiagent/scenarios/simple_spread_partially_observed.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 | import heapq
5 |
6 | class Scenario(BaseScenario):
7 | def make_world(self):
8 | world = World()
9 | # set any world properties first
10 | world.dim_c = 2
11 | num_agents = 5
12 | num_landmarks = 5
13 | # add agents
14 | world.agents = [Agent() for i in range(num_agents)]
15 | for i, agent in enumerate(world.agents):
16 | agent.name = 'agent %d' % i
17 | agent.collide = True
18 | agent.silent = True
19 | agent.size = 0.05
20 | # add landmarks
21 | world.landmarks = [Landmark() for i in range(num_landmarks)]
22 | for i, landmark in enumerate(world.landmarks):
23 | landmark.name = 'landmark %d' % i
24 | landmark.collide = False
25 | landmark.movable = False
26 | # make initial conditions
27 | self.reset_world(world)
28 | return world
29 |
30 | def reset_world(self, world):
31 | # random properties for agents
32 | for i, agent in enumerate(world.agents):
33 | agent.color = np.array([0.35, 0.35, 0.85])
34 | # random properties for landmarks
35 | for i, landmark in enumerate(world.landmarks):
36 | landmark.color = np.array([0.25, 0.25, 0.25])
37 | # set random initial states
38 | for agent in world.agents:
39 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
40 | agent.state.p_vel = np.zeros(world.dim_p)
41 | agent.state.c = np.zeros(world.dim_c)
42 | for i, landmark in enumerate(world.landmarks):
43 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
44 | landmark.state.p_vel = np.zeros(world.dim_p)
45 |
46 | def benchmark_data(self, agent, world):
47 | rew = 0
48 | collisions = 0
49 | occupied_landmarks = 0
50 | min_dists = 0
51 | for l in world.landmarks:
52 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents]
53 | min_dists += min(dists)
54 | rew -= min(dists)
55 | if min(dists) < 0.1:
56 | occupied_landmarks += 1
57 | if agent.collide:
58 | for a in world.agents:
59 | if self.is_collision(a, agent):
60 | rew -= 1
61 | collisions += 1
62 | return (rew, collisions, min_dists, occupied_landmarks)
63 |
64 |
65 | def is_collision(self, agent1, agent2):
66 | delta_pos = agent1.state.p_pos - agent2.state.p_pos
67 | dist = np.sqrt(np.sum(np.square(delta_pos)))
68 | dist_min = agent1.size + agent2.size
69 |         return dist < dist_min
70 |
71 | def reward(self, agent, world):
72 | # Agents are rewarded based on minimum agent distance to each landmark, penalized for collisions
73 | rew = 0
74 | for l in world.landmarks:
75 | dists = [np.sqrt(np.sum(np.square(a.state.p_pos - l.state.p_pos))) for a in world.agents]
76 | rew -= min(dists)
77 | if agent.collide:
78 | for a in world.agents:
79 | if self.is_collision(a, agent):
80 | rew -= 1
81 | return rew
82 |
83 | def observation(self, agent, world):
84 | # get positions of all entities in this agent's reference frame
85 | entity_pos = []
86 | for entity in world.landmarks: # world.entities:
87 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
88 | entity_pos = heapq.nsmallest(3,entity_pos, key=lambda s: np.sum(np.square(s)))
89 | # entity colors
90 | entity_color = []
91 | for entity in world.landmarks: # world.entities:
92 | entity_color.append(entity.color)
93 | # communication of all other agents
94 | comm = []
95 | other_pos = []
96 | for other in world.agents:
97 | if other is agent: continue
98 | comm.append(other.state.c)
99 | other_pos.append(other.state.p_pos - agent.state.p_pos)
100 | other_pos = heapq.nsmallest(3,other_pos, key=lambda s: np.sum(np.square(s)))
101 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos)
102 |
--------------------------------------------------------------------------------
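The only difference from `simple_spread` (besides using 5 agents and 5 landmarks) is the partial observation: `heapq.nsmallest` keeps just the 3 nearest landmarks and the 3 nearest other agents by squared distance, and communication is dropped from the returned observation. A standalone illustration of that selection with hypothetical positions:

```python
import heapq
import numpy as np

# hypothetical relative landmark positions in the agent's frame
entity_pos = [np.array([0.9, 0.9]), np.array([0.1, 0.0]),
              np.array([-0.2, 0.3]), np.array([0.5, -0.5]),
              np.array([0.05, -0.1])]

# keep only the 3 closest, exactly as observation() does above
nearest = heapq.nsmallest(3, entity_pos, key=lambda s: np.sum(np.square(s)))
print(nearest)  # the three vectors with the smallest squared norm
```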
/multiagent-particle-envs/multiagent/scenarios/simple_push.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 | import random
5 |
6 | #
7 | # # the non-ensemble version of
8 | #
9 | #
10 |
11 | class Scenario(BaseScenario):
12 | def make_world(self):
13 | world = World()
14 | # set any world properties first
15 | world.dim_c = 2
16 | num_agents = 2
17 | num_adversaries = 1
18 | num_landmarks = 2
19 | # add agents
20 | world.agents = [Agent() for i in range(num_agents)]
21 | for i, agent in enumerate(world.agents):
22 | agent.name = 'agent %d' % i
23 | agent.collide = True
24 | agent.silent = True
25 | if i < num_adversaries:
26 | agent.adversary = True
27 | else:
28 | agent.adversary = False
29 | # agent.u_noise = 1e-1
30 | # agent.c_noise = 1e-1
31 | # add landmarks
32 | world.landmarks = [Landmark() for i in range(num_landmarks)]
33 | for i, landmark in enumerate(world.landmarks):
34 | landmark.name = 'landmark %d' % i
35 | landmark.collide = False
36 | landmark.movable = False
37 | # make initial conditions
38 | self.reset_world(world)
39 | return world
40 |
41 | def reset_world(self, world):
42 | # random properties for landmarks
43 | for i, landmark in enumerate(world.landmarks):
44 | landmark.color = np.array([0.1, 0.1, 0.1])
45 | landmark.color[i + 1] += 0.8
46 | landmark.index = i
47 | # set goal landmark
48 | goal = np.random.choice(world.landmarks)
49 | for i, agent in enumerate(world.agents):
50 | agent.goal_a = goal
51 | agent.color = np.array([0.25, 0.25, 0.25])
52 | if agent.adversary:
53 | agent.color = np.array([0.75, 0.25, 0.25])
54 | else:
55 | j = goal.index
56 | agent.color[j + 1] += 0.5
57 | # set random initial states
58 | for agent in world.agents:
59 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
60 | agent.state.p_vel = np.zeros(world.dim_p)
61 | agent.state.c = np.zeros(world.dim_c)
62 | for i, landmark in enumerate(world.landmarks):
63 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
64 | landmark.state.p_vel = np.zeros(world.dim_p)
65 |
66 | def reward(self, agent, world):
67 | # Agents are rewarded based on minimum agent distance to each landmark
68 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
69 |
70 | def agent_reward(self, agent, world):
71 | # the distance to the goal
72 | return -np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)))
73 |
74 | def adversary_reward(self, agent, world):
75 | # keep the nearest good agents away from the goal
76 | agent_dist = [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in world.agents if not a.adversary]
77 | pos_rew = min(agent_dist)
78 | #nearest_agent = world.good_agents[np.argmin(agent_dist)]
79 | #neg_rew = np.sqrt(np.sum(np.square(nearest_agent.state.p_pos - agent.state.p_pos)))
80 | neg_rew = np.sqrt(np.sum(np.square(agent.goal_a.state.p_pos - agent.state.p_pos)))
81 | #neg_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in world.good_agents])
82 | return pos_rew - neg_rew
83 |
84 | def observation(self, agent, world):
85 | # get positions of all entities in this agent's reference frame
86 | entity_pos = []
87 | for entity in world.landmarks: # world.entities:
88 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
89 | # entity colors
90 | entity_color = []
91 | for entity in world.landmarks: # world.entities:
92 | entity_color.append(entity.color)
93 | # communication of all other agents
94 | comm = []
95 | other_pos = []
96 | for other in world.agents:
97 | if other is agent: continue
98 | comm.append(other.state.c)
99 | other_pos.append(other.state.p_pos - agent.state.p_pos)
100 | if not agent.adversary:
101 | return np.concatenate([agent.state.p_vel] + [agent.goal_a.state.p_pos - agent.state.p_pos] + [agent.color] + entity_pos + entity_color + other_pos)
102 | else:
103 | #other_pos = list(reversed(other_pos)) if random.uniform(0,1) > 0.5 else other_pos # randomize position of other agents in adversary network
104 | return np.concatenate([agent.state.p_vel] + entity_pos + other_pos)
105 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/README.md:
--------------------------------------------------------------------------------
1 | # Multi-Agent Particle Environment
2 |
3 | A simple multi-agent particle world with a continuous observation and discrete action space, along with some basic simulated physics.
4 | Used in the paper [Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments](https://arxiv.org/pdf/1706.02275.pdf).
5 |
6 | ## Getting started:
7 |
8 | - To install, `cd` into the root directory and type `pip install -e .`
9 |
10 | - To interactively view the moving-to-landmark scenario (see others in ./scenarios/):
11 | `bin/interactive.py --scenario simple.py`
12 |
13 | - Known dependencies: OpenAI gym, numpy
14 |
15 | - To use the environments, look at the code for importing them in `make_env.py`.
16 |
17 | ## Code structure
18 |
19 | - `make_env.py`: contains code for importing a multiagent environment as an OpenAI Gym-like object.
20 |
21 | - `./multiagent/environment.py`: contains code for environment simulation (interaction physics, `_step()` function, etc.)
22 |
23 | - `./multiagent/core.py`: contains classes for various objects (Entities, Landmarks, Agents, etc.) that are used throughout the code.
24 |
25 | - `./multiagent/rendering.py`: used for displaying agent behaviors on the screen.
26 |
27 | - `./multiagent/policy.py`: contains code for interactive policy based on keyboard input.
28 |
29 | - `./multiagent/scenario.py`: contains base scenario object that is extended for all scenarios.
30 |
31 | - `./multiagent/scenarios/`: folder where the various scenarios/environments are stored. Scenario code consists of several functions:
32 | 1) `make_world()`: creates all of the entities that inhabit the world (landmarks, agents, etc.) and assigns their capabilities (whether they can communicate, or move, or both).
33 | Called once at the beginning of each training session.
34 | 2) `reset_world()`: resets the world by assigning properties (position, color, etc.) to all entities in the world.
35 | Called before every episode (including immediately after `make_world()`, before the first episode).
36 | 3) `reward()`: defines the reward function for a given agent
37 | 4) `observation()`: defines the observation space of a given agent
38 | 5) (optional) `benchmark_data()`: provides diagnostic data for policies trained on the environment (e.g. evaluation metrics)
39 |
40 | ### Creating new environments
41 |
42 | You can create new scenarios by implementing the first 4 functions above (`make_world()`, `reset_world()`, `reward()`, and `observation()`); a minimal skeleton is sketched below.
43 |
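As a rough illustration (not one of the bundled scenarios), a new scenario file dropped into `multiagent/scenarios/` could look like the following sketch, which uses the same `World`/`Agent`/`Landmark` attributes as the scenarios in this repo:

```python
import numpy as np
from multiagent.core import World, Agent, Landmark
from multiagent.scenario import BaseScenario


class Scenario(BaseScenario):
    def make_world(self):
        world = World()
        world.agents = [Agent() for _ in range(2)]
        world.landmarks = [Landmark() for _ in range(1)]
        for i, agent in enumerate(world.agents):
            agent.name = 'agent %d' % i
            agent.collide = False
            agent.silent = True
        for i, landmark in enumerate(world.landmarks):
            landmark.name = 'landmark %d' % i
            landmark.collide = False
            landmark.movable = False
        self.reset_world(world)
        return world

    def reset_world(self, world):
        for agent in world.agents:
            agent.color = np.array([0.35, 0.35, 0.85])
            agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
            agent.state.p_vel = np.zeros(world.dim_p)
            agent.state.c = np.zeros(world.dim_c)
        for landmark in world.landmarks:
            landmark.color = np.array([0.25, 0.25, 0.25])
            landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
            landmark.state.p_vel = np.zeros(world.dim_p)

    def reward(self, agent, world):
        # e.g. negative distance to the first landmark
        return -np.sqrt(np.sum(np.square(agent.state.p_pos - world.landmarks[0].state.p_pos)))

    def observation(self, agent, world):
        # own velocity plus relative landmark positions
        entity_pos = [lm.state.p_pos - agent.state.p_pos for lm in world.landmarks]
        return np.concatenate([agent.state.p_vel] + entity_pos)
```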
44 | ## List of environments
45 |
46 |
47 | | Env name in code (name in paper) | Communication? | Competitive? | Notes |
48 | | --- | --- | --- | --- |
49 | | `simple.py` | N | N | Single agent sees landmark position, rewarded based on how close it gets to landmark. Not a multiagent environment -- used for debugging policies. |
50 | | `simple_adversary.py` (Physical deception) | N | Y | 1 adversary (red), N good agents (green), N landmarks (usually N=2). All agents observe position of landmarks and other agents. One landmark is the ‘target landmark’ (colored green). Good agents are rewarded based on how close any one of them is to the target landmark, but are negatively rewarded if the adversary is close to the target landmark. The adversary is rewarded based on how close it is to the target, but it doesn’t know which landmark is the target landmark. So the good agents have to learn to ‘split up’ and cover all landmarks to deceive the adversary. |
51 | | `simple_crypto.py` (Covert communication) | Y | Y | Two good agents (alice and bob), one adversary (eve). Alice must send a private message to bob over a public channel. Alice and bob are rewarded based on how well bob reconstructs the message, but negatively rewarded if eve can reconstruct the message. Alice and bob have a private key (randomly generated at the beginning of each episode), which they must learn to use to encrypt the message. |
52 | | `simple_push.py` (Keep-away) | N | Y | 1 agent, 1 adversary, 1 landmark. Agent is rewarded based on distance to landmark. Adversary is rewarded if it is close to the landmark, and if the agent is far from the landmark. So the adversary learns to push the agent away from the landmark. |
53 | | `simple_reference.py` | Y | N | 2 agents, 3 landmarks of different colors. Each agent wants to get to its target landmark, which is known only by the other agent. Reward is collective. So agents have to learn to communicate the goal of the other agent, and navigate to their landmark. This is the same as the simple_speaker_listener scenario where both agents are simultaneous speakers and listeners. |
54 | | `simple_speaker_listener.py` (Cooperative communication) | Y | N | Same as simple_reference, except one agent is the ‘speaker’ (gray) that does not move (observes goal of other agent), and other agent is the listener (cannot speak, but must navigate to correct landmark).|
55 | | `simple_spread.py` (Cooperative navigation) | N | N | N agents, N landmarks. Agents are rewarded based on how close the nearest agent is to each landmark. Agents are penalized if they collide with other agents. So, agents have to learn to cover all the landmarks while avoiding collisions. |
56 | | `simple_tag.py` (Predator-prey) | N | Y | Predator-prey environment. Good agents (green) are faster and want to avoid being hit by adversaries (red). Adversaries are slower and want to hit good agents. Obstacles (large black circles) block the way. |
57 | | `simple_world_comm.py` | Y | Y | Environment seen in the video accompanying the paper. Same as simple_tag, except (1) there is food (small blue balls) that the good agents are rewarded for being near; (2) there are ‘forests’ that hide the agents inside from being seen from outside; (3) there is a ‘leader adversary’ that can see the agents at all times and can communicate with the other adversaries to help coordinate the chase. |
58 |
59 | ## Paper citation
60 |
61 | If you used this environment for your experiments or found it helpful, consider citing the following papers:
62 |
63 | Environments in this repo:
64 |
65 | @article{lowe2017multi,
66 | title={Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments},
67 | author={Lowe, Ryan and Wu, Yi and Tamar, Aviv and Harb, Jean and Abbeel, Pieter and Mordatch, Igor},
68 | journal={Neural Information Processing Systems (NIPS)},
69 | year={2017}
70 | }
71 |
72 |
73 | Original particle world environment:
74 |
75 | @article{mordatch2017emergence,
76 | title={Emergence of Grounded Compositional Language in Multi-Agent Populations},
77 | author={Mordatch, Igor and Abbeel, Pieter},
78 | journal={arXiv preprint arXiv:1703.04908},
79 | year={2017}
80 | }
81 |
82 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple_adversary.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 | import random
5 |
6 |
7 | class Scenario(BaseScenario):
8 |
9 | def make_world(self):
10 | world = World()
11 | # set any world properties first
12 | world.dim_c = 2
13 | num_agents = 3
14 | world.num_agents = num_agents
15 | num_adversaries = 1
16 | num_landmarks = num_agents - 1
17 | # add agents
18 | world.agents = [Agent() for i in range(num_agents)]
19 | for i, agent in enumerate(world.agents):
20 | agent.name = 'agent %d' % i
21 | agent.collide = False
22 | agent.silent = True
23 | agent.adversary = True if i < num_adversaries else False
24 | agent.size = 0.15
25 | # add landmarks
26 | world.landmarks = [Landmark() for i in range(num_landmarks)]
27 | for i, landmark in enumerate(world.landmarks):
28 | landmark.name = 'landmark %d' % i
29 | landmark.collide = False
30 | landmark.movable = False
31 | landmark.size = 0.08
32 | # make initial conditions
33 | self.reset_world(world)
34 | return world
35 |
36 | def reset_world(self, world):
37 | # random properties for agents
38 | world.agents[0].color = np.array([0.85, 0.35, 0.35])
39 | for i in range(1, world.num_agents):
40 | world.agents[i].color = np.array([0.35, 0.35, 0.85])
41 | # random properties for landmarks
42 | for i, landmark in enumerate(world.landmarks):
43 | landmark.color = np.array([0.15, 0.15, 0.15])
44 | # set goal landmark
45 | goal = np.random.choice(world.landmarks)
46 | goal.color = np.array([0.15, 0.65, 0.15])
47 | for agent in world.agents:
48 | agent.goal_a = goal
49 | # set random initial states
50 | for agent in world.agents:
51 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
52 | agent.state.p_vel = np.zeros(world.dim_p)
53 | agent.state.c = np.zeros(world.dim_c)
54 | for i, landmark in enumerate(world.landmarks):
55 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
56 | landmark.state.p_vel = np.zeros(world.dim_p)
57 |
58 | def benchmark_data(self, agent, world):
59 | # returns data for benchmarking purposes
60 | if agent.adversary:
61 | return np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))
62 | else:
63 | dists = []
64 | for l in world.landmarks:
65 | dists.append(np.sum(np.square(agent.state.p_pos - l.state.p_pos)))
66 | dists.append(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos)))
67 | return tuple(dists)
68 |
69 | # return all agents that are not adversaries
70 | def good_agents(self, world):
71 | return [agent for agent in world.agents if not agent.adversary]
72 |
73 | # return all adversarial agents
74 | def adversaries(self, world):
75 | return [agent for agent in world.agents if agent.adversary]
76 |
77 | def reward(self, agent, world):
78 | # Agents are rewarded based on minimum agent distance to each landmark
79 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
80 |
81 | def agent_reward(self, agent, world):
82 | # Rewarded based on how close any good agent is to the goal landmark, and how far the adversary is from it
83 | shaped_reward = True
84 | shaped_adv_reward = True
85 |
86 | # Calculate negative reward for adversary
87 | adversary_agents = self.adversaries(world)
88 | if shaped_adv_reward: # distance-based adversary reward
89 | adv_rew = sum([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in adversary_agents])
90 | else: # proximity-based adversary reward (binary)
91 | adv_rew = 0
92 | for a in adversary_agents:
93 | if np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) < 2 * a.goal_a.size:
94 | adv_rew -= 5
95 |
96 | # Calculate positive reward for agents
97 | good_agents = self.good_agents(world)
98 | if shaped_reward: # distance-based agent reward
99 | pos_rew = -min(
100 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents])
101 | else: # proximity-based agent reward (binary)
102 | pos_rew = 0
103 | if min([np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents]) \
104 | < 2 * agent.goal_a.size:
105 | pos_rew += 5
106 | pos_rew -= min(
107 | [np.sqrt(np.sum(np.square(a.state.p_pos - a.goal_a.state.p_pos))) for a in good_agents])
108 | return pos_rew + adv_rew
109 |
110 | def adversary_reward(self, agent, world):
111 | # Rewarded based on proximity to the goal landmark
112 | shaped_reward = True
113 | if shaped_reward: # distance-based reward
114 | return -np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))
115 | else: # proximity-based reward (binary)
116 | adv_rew = 0
117 | if np.sqrt(np.sum(np.square(agent.state.p_pos - agent.goal_a.state.p_pos))) < 2 * agent.goal_a.size:
118 | adv_rew += 5
119 | return adv_rew
120 |
121 |
122 | def observation(self, agent, world):
123 | # get positions of all entities in this agent's reference frame
124 | entity_pos = []
125 | for entity in world.landmarks:
126 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
127 | # entity colors
128 | entity_color = []
129 | for entity in world.landmarks:
130 | entity_color.append(entity.color)
131 | # communication of all other agents
132 | other_pos = []
133 | for other in world.agents:
134 | if other is agent: continue
135 | other_pos.append(other.state.p_pos - agent.state.p_pos)
136 |
137 | if not agent.adversary:
138 | return np.concatenate([agent.goal_a.state.p_pos - agent.state.p_pos] + entity_pos + other_pos)
139 | else:
140 | return np.concatenate(entity_pos + other_pos)
141 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple_tag.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 |
5 |
6 | class Scenario(BaseScenario):
7 | def make_world(self):
8 | world = World()
9 | # set any world properties first
10 | world.dim_c = 2
11 | num_good_agents = 2
12 | num_adversaries = 4
13 | num_agents = num_adversaries + num_good_agents
14 | num_landmarks = 2
15 | # add agents
16 | world.agents = [Agent() for i in range(num_agents)]
17 | for i, agent in enumerate(world.agents):
18 | agent.name = 'agent %d' % i
19 | agent.collide = True
20 | agent.silent = True
21 | agent.adversary = True if i < num_adversaries else False
22 | agent.size = 0.075 if agent.adversary else 0.05
23 | agent.accel = 3.0 if agent.adversary else 4.0
24 | #agent.accel = 20.0 if agent.adversary else 25.0
25 | agent.max_speed = 1.0 if agent.adversary else 1.3
26 | # add landmarks
27 | world.landmarks = [Landmark() for i in range(num_landmarks)]
28 | for i, landmark in enumerate(world.landmarks):
29 | landmark.name = 'landmark %d' % i
30 | landmark.collide = True
31 | landmark.movable = False
32 | landmark.size = 0.2
33 | landmark.boundary = False
34 | # make initial conditions
35 | self.reset_world(world)
36 | return world
37 |
38 |
39 | def reset_world(self, world):
40 | # random properties for agents
41 | for i, agent in enumerate(world.agents):
42 | agent.color = np.array([0.35, 0.85, 0.35]) if not agent.adversary else np.array([0.85, 0.35, 0.35])
43 | # random properties for landmarks
44 | for i, landmark in enumerate(world.landmarks):
45 | landmark.color = np.array([0.25, 0.25, 0.25])
46 | # set random initial states
47 | for agent in world.agents:
48 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
49 | agent.state.p_vel = np.zeros(world.dim_p)
50 | agent.state.c = np.zeros(world.dim_c)
51 | for i, landmark in enumerate(world.landmarks):
52 | if not landmark.boundary:
53 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p)
54 | landmark.state.p_vel = np.zeros(world.dim_p)
55 |
56 |
57 | def benchmark_data(self, agent, world):
58 | # # returns data for benchmarking purposes
59 | # if agent.adversary:
60 | # collisions = 0
61 | # for a in self.good_agents(world):
62 | # if self.is_collision(a, agent):
63 | # collisions += 1
64 | # return collisions
65 | # else:
66 | # return 0
67 | collisions = 0
68 | rew = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
69 | if agent.adversary:
70 | for a in self.good_agents(world):
71 | if self.is_collision(a, agent):
72 | collisions += 1
73 | return (rew, collisions)
74 |
75 | def is_collision(self, agent1, agent2):
76 | delta_pos = agent1.state.p_pos - agent2.state.p_pos
77 | dist = np.sqrt(np.sum(np.square(delta_pos)))
78 | dist_min = agent1.size + agent2.size
79 | return True if dist < dist_min else False
80 |
81 | # return all agents that are not adversaries
82 | def good_agents(self, world):
83 | return [agent for agent in world.agents if not agent.adversary]
84 |
85 | # return all adversarial agents
86 | def adversaries(self, world):
87 | return [agent for agent in world.agents if agent.adversary]
88 |
89 |
90 | def reward(self, agent, world):
91 | # Agents are rewarded based on minimum agent distance to each landmark
92 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
93 | return main_reward
94 |
95 | def agent_reward(self, agent, world):
96 | # Agents are negatively rewarded if caught by adversaries
97 | rew = 0
98 | shape = True
99 | adversaries = self.adversaries(world)
100 | if shape: # reward can optionally be shaped (increased reward for increased distance from adversary)
101 | for adv in adversaries:
102 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos)))
103 | if agent.collide:
104 | for a in adversaries:
105 | if self.is_collision(a, agent):
106 | rew -= 10
107 |
108 | # agents are penalized for exiting the screen, so that they can be caught by the adversaries
109 | def bound(x):
110 | if x < 0.9:
111 | return 0
112 | if x < 1.0:
113 | return (x - 0.9) * 10
114 | return min(np.exp(2 * x - 2), 10)
115 | for p in range(world.dim_p):
116 | x = abs(agent.state.p_pos[p])
117 | rew -= bound(x)
118 |
119 | return rew
120 |
121 | def adversary_reward(self, agent, world):
122 | # Adversaries are rewarded for collisions with agents
123 | rew = 0
124 | shape = True
125 | agents = self.good_agents(world)
126 | adversaries = self.adversaries(world)
127 | if shape: # reward can optionally be shaped (decreased reward for increased distance from agents)
128 | for adv in adversaries:
129 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents])
130 | if agent.collide:
131 | for ag in agents:
132 | for adv in adversaries:
133 | if self.is_collision(ag, adv):
134 | rew += 10
135 | return rew
136 |
137 | def observation(self, agent, world):
138 | # get positions of all entities in this agent's reference frame
139 | entity_pos = []
140 | for entity in world.landmarks:
141 | if not entity.boundary:
142 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
143 | # communication of all other agents
144 | comm = []
145 | other_pos = []
146 | other_vel = []
147 | for other in world.agents:
148 | if other is agent: continue
149 | comm.append(other.state.c)
150 | other_pos.append(other.state.p_pos - agent.state.p_pos)
151 | if not other.adversary:
152 | other_vel.append(other.state.p_vel)
153 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel)
154 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/scenarios/simple_crypto.py:
--------------------------------------------------------------------------------
1 | """
2 | Scenario:
3 | 1 speaker (alice), 2 listeners (bob and the adversary eve). Alice must communicate a goal color to bob over a public
4 | channel using a shared private key. Alice and bob are rewarded when bob can reconstruct the goal while eve cannot; eve is rewarded for reconstructing the goal herself.
5 | """
6 |
7 |
8 | import numpy as np
9 | from multiagent.core import World, Agent, Landmark
10 | from multiagent.scenario import BaseScenario
11 | import random
12 |
13 |
14 | class CryptoAgent(Agent):
15 | def __init__(self):
16 | super(CryptoAgent, self).__init__()
17 | self.key = None
18 |
19 | class Scenario(BaseScenario):
20 |
21 | def make_world(self):
22 | world = World()
23 | # set any world properties first
24 | num_agents = 3
25 | num_adversaries = 1
26 | num_landmarks = 2
27 | world.dim_c = 4
28 | # add agents
29 | world.agents = [CryptoAgent() for i in range(num_agents)]
30 | for i, agent in enumerate(world.agents):
31 | agent.name = 'agent %d' % i
32 | agent.collide = False
33 | agent.adversary = True if i < num_adversaries else False
34 | agent.speaker = True if i == 2 else False
35 | agent.movable = False
36 | # add landmarks
37 | world.landmarks = [Landmark() for i in range(num_landmarks)]
38 | for i, landmark in enumerate(world.landmarks):
39 | landmark.name = 'landmark %d' % i
40 | landmark.collide = False
41 | landmark.movable = False
42 | # make initial conditions
43 | self.reset_world(world)
44 | return world
45 |
46 |
47 | def reset_world(self, world):
48 | # random properties for agents
49 | for i, agent in enumerate(world.agents):
50 | agent.color = np.array([0.25, 0.25, 0.25])
51 | if agent.adversary:
52 | agent.color = np.array([0.75, 0.25, 0.25])
53 | agent.key = None
54 | # random properties for landmarks
55 | color_list = [np.zeros(world.dim_c) for i in world.landmarks]
56 | for i, color in enumerate(color_list):
57 | color[i] += 1
58 | for color, landmark in zip(color_list, world.landmarks):
59 | landmark.color = color
60 | # set goal landmark
61 | goal = np.random.choice(world.landmarks)
62 | world.agents[1].color = goal.color
63 | world.agents[2].key = np.random.choice(world.landmarks).color
64 |
65 | for agent in world.agents:
66 | agent.goal_a = goal
67 |
68 | # set random initial states
69 | for agent in world.agents:
70 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
71 | agent.state.p_vel = np.zeros(world.dim_p)
72 | agent.state.c = np.zeros(world.dim_c)
73 | for i, landmark in enumerate(world.landmarks):
74 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
75 | landmark.state.p_vel = np.zeros(world.dim_p)
76 |
77 |
78 | def benchmark_data(self, agent, world):
79 | # returns data for benchmarking purposes
80 | return (agent.state.c, agent.goal_a.color)
81 |
82 | # return all agents that are not adversaries
83 | def good_listeners(self, world):
84 | return [agent for agent in world.agents if not agent.adversary and not agent.speaker]
85 |
86 | # return all agents that are not adversaries
87 | def good_agents(self, world):
88 | return [agent for agent in world.agents if not agent.adversary]
89 |
90 | # return all adversarial agents
91 | def adversaries(self, world):
92 | return [agent for agent in world.agents if agent.adversary]
93 |
94 | def reward(self, agent, world):
95 | return self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
96 |
97 | def agent_reward(self, agent, world):
98 | # Agents rewarded if Bob can reconstruct message, but adversary (Eve) cannot
99 | good_listeners = self.good_listeners(world)
100 | adversaries = self.adversaries(world)
101 | good_rew = 0
102 | adv_rew = 0
103 | for a in good_listeners:
104 | if (a.state.c == np.zeros(world.dim_c)).all():
105 | continue
106 | else:
107 | good_rew -= np.sum(np.square(a.state.c - agent.goal_a.color))
108 | for a in adversaries:
109 | if (a.state.c == np.zeros(world.dim_c)).all():
110 | continue
111 | else:
112 | adv_l1 = np.sum(np.square(a.state.c - agent.goal_a.color))
113 | adv_rew += adv_l1
114 | return adv_rew + good_rew
115 |
116 | def adversary_reward(self, agent, world):
117 | # Adversary (Eve) is rewarded if it can reconstruct original goal
118 | rew = 0
119 | if not (agent.state.c == np.zeros(world.dim_c)).all():
120 | rew -= np.sum(np.square(agent.state.c - agent.goal_a.color))
121 | return rew
122 |
123 |
124 | def observation(self, agent, world):
125 | # goal color
126 | goal_color = np.zeros(world.dim_color)
127 | if agent.goal_a is not None:
128 | goal_color = agent.goal_a.color
129 |
130 | #print('goal color in obs is {}'.format(goal_color))
131 |
132 | # get positions of all entities in this agent's reference frame
133 | entity_pos = []
134 | for entity in world.landmarks:
135 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
136 | # communication of all other agents
137 | comm = []
138 | for other in world.agents:
139 | if other is agent or (other.state.c is None) or not other.speaker: continue
140 | comm.append(other.state.c)
141 |
142 | confer = np.array([0])
143 |
144 | if world.agents[2].key is None:
145 | confer = np.array([1])
146 | key = np.zeros(world.dim_c)
147 | goal_color = np.zeros(world.dim_c)
148 | else:
149 | key = world.agents[2].key
150 |
151 | prnt = False
152 | # speaker
153 | if agent.speaker:
154 | if prnt:
155 | print('speaker')
156 | print(agent.state.c)
157 | print(np.concatenate([goal_color] + [key] + [confer] + [np.random.randn(1)]))
158 | return np.concatenate([goal_color] + [key])
159 | # listener
160 | if not agent.speaker and not agent.adversary:
161 | if prnt:
162 | print('listener')
163 | print(agent.state.c)
164 | print(np.concatenate([key] + comm + [confer]))
165 | return np.concatenate([key] + comm)
166 | if not agent.speaker and agent.adversary:
167 | if prnt:
168 | print('adversary')
169 | print(agent.state.c)
170 | print(np.concatenate(comm + [confer]))
171 | return np.concatenate(comm)
172 |
--------------------------------------------------------------------------------
/maddpg/trainer/maddpg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import tensorflow as tf
4 | import maddpg.common.tf_util as U
5 |
6 | from maddpg.common.distributions import make_pdtype
7 | from maddpg import AgentTrainer
8 | from maddpg.trainer.replay_buffer import ReplayBuffer
9 |
10 |
11 | def discount_with_dones(rewards, dones, gamma):
12 | discounted = []
13 | r = 0
14 | for reward, done in zip(rewards[::-1], dones[::-1]):
15 | r = reward + gamma*r
16 | r = r*(1.-done)
17 | discounted.append(r)
18 | return discounted[::-1]
19 |
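# Soft ("Polyak") target-network update: each call to the returned function moves every
# target variable as target <- polyak * target + (1 - polyak) * online, with polyak = 0.99,
# so the target networks track the online networks slowly and keep the TD targets stable.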
20 | def make_update_exp(vals, target_vals):
21 | polyak = 1.0 - 1e-2
22 | expression = []
23 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
24 | expression.append(var_target.assign(polyak * var_target + (1.0-polyak) * var))
25 | expression = tf.group(*expression)
26 | return U.function([], [], updates=[expression])
27 |
28 | def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
29 | with tf.variable_scope(scope, reuse=reuse):
30 |         # create distributions
31 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
32 |
33 | # set up placeholders
34 | obs_ph_n = make_obs_ph_n
35 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
36 |
37 | p_input = obs_ph_n[p_index]
38 |
39 | p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="p_func", num_units=num_units)
40 | p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
41 |
42 | # wrap parameters in distribution
43 | act_pd = act_pdtype_n[p_index].pdfromflat(p)
44 |
45 | act_sample = act_pd.sample()
46 | p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
47 |
48 | act_input_n = act_ph_n + []
49 | act_input_n[p_index] = act_pd.sample()
50 | q_input = tf.concat(obs_ph_n + act_input_n, 1)
51 | if local_q_func:
52 | q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
53 | q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:,0]
54 | pg_loss = -tf.reduce_mean(q)
55 |
56 | loss = pg_loss + p_reg * 1e-3
57 |
58 | optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)
59 |
60 | # Create callable functions
61 | train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
62 | act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
63 | p_values = U.function([obs_ph_n[p_index]], p)
64 |
65 | # target network
66 | target_p = p_func(p_input, int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_func", num_units=num_units)
67 | target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
68 | update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
69 |
70 | target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
71 | target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
72 |
73 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
74 |
75 | def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
76 | with tf.variable_scope(scope, reuse=reuse):
77 |         # create distributions
78 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
79 |
80 | # set up placeholders
81 | obs_ph_n = make_obs_ph_n
82 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action"+str(i)) for i in range(len(act_space_n))]
83 | target_ph = tf.placeholder(tf.float32, [None], name="target")
84 |
85 | q_input = tf.concat(obs_ph_n + act_ph_n, 1)
86 | if local_q_func:
87 | q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
88 | q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:,0]
89 | q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
90 |
91 | q_loss = tf.reduce_mean(tf.square(q - target_ph))
92 |
93 | # viscosity solution to Bellman differential equation in place of an initial condition
94 | q_reg = tf.reduce_mean(tf.square(q))
95 | loss = q_loss #+ 1e-3 * q_reg
96 |
97 | optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)
98 |
99 | # Create callable functions
100 | train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
101 | q_values = U.function(obs_ph_n + act_ph_n, q)
102 |
103 | # target network
104 | target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:,0]
105 | target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
106 | update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
107 |
108 | target_q_values = U.function(obs_ph_n + act_ph_n, target_q)
109 |
110 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
111 |
112 | class MADDPGAgentTrainer(AgentTrainer):
113 | def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
114 | self.name = name
115 | self.n = len(obs_shape_n)
116 | self.agent_index = agent_index
117 | self.args = args
118 | obs_ph_n = []
119 | for i in range(self.n):
120 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())
121 |
122 | # Create all the functions necessary to train the model
123 | self.q_train, self.q_update, self.q_debug = q_train(
124 | scope=self.name,
125 | make_obs_ph_n=obs_ph_n,
126 | act_space_n=act_space_n,
127 | q_index=agent_index,
128 | q_func=model,
129 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
130 | grad_norm_clipping=0.5,
131 | local_q_func=local_q_func,
132 | num_units=args.num_units
133 | )
134 | self.act, self.p_train, self.p_update, self.p_debug = p_train(
135 | scope=self.name,
136 | make_obs_ph_n=obs_ph_n,
137 | act_space_n=act_space_n,
138 | p_index=agent_index,
139 | p_func=model,
140 | q_func=model,
141 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
142 | grad_norm_clipping=0.5,
143 | local_q_func=local_q_func,
144 | num_units=args.num_units
145 | )
146 | # Create experience buffer
147 | self.replay_buffer = ReplayBuffer(1e6)
148 | self.max_replay_buffer_len = 50 * args.max_episode_len
149 | # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
150 | self.replay_sample_index = None
151 |
152 | def action(self, obs):
153 | return self.act(obs[None])[0]
154 |
155 | def experience(self, obs, act, rew, new_obs, done, terminal):
156 | # Store transition in the replay buffer.
157 | self.replay_buffer.add(obs, act, rew, new_obs, float(done))
158 |
159 | def preupdate(self):
160 | self.replay_sample_index = None
161 |
162 | def update(self, agents, t):
163 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough
164 | return
165 | if not t % 100 == 0: # only update every 100 steps
166 | return
167 |
168 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
169 | # collect replay sample from all agents
170 | obs_n = []
171 | obs_next_n = []
172 | act_n = []
173 | index = self.replay_sample_index
174 | for i in range(self.n):
175 | obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
176 | obs_n.append(obs)
177 | obs_next_n.append(obs_next)
178 | act_n.append(act)
179 | obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
180 |
181 | # train q network
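        # Centralized TD target (as in MADDPG): y = r + gamma * (1 - done) * Q'_i(o'_1,...,o'_N, a'_1,...,a'_N),
        # where each next action a'_j is sampled from agent j's target policy (p_debug['target_act']).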
182 | num_sample = 1
183 | target_q = 0.0
184 | for i in range(num_sample):
185 | target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
186 | target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
187 | target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
188 | target_q /= num_sample
189 | q_loss = self.q_train(*(obs_n + act_n + [target_q]))
190 |
191 | # train p network
192 | p_loss = self.p_train(*(obs_n + act_n))
193 |
194 | self.p_update()
195 | self.q_update()
196 |
197 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
198 |
--------------------------------------------------------------------------------
/experiments/ibmac_inter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import tensorflow as tf
4 | import maddpg.common.tf_util as U
5 |
6 | from maddpg.common.distributions import make_pdtype
7 | from maddpg import AgentTrainer
8 | from maddpg.trainer.replay_buffer_with_messages import ReplayBuffer
9 |
10 | import itertools
11 |
12 |
13 | def discount_with_dones(rewards, dones, gamma):
14 | discounted = []
15 | r = 0
16 | for reward, done in zip(rewards[::-1], dones[::-1]):
17 | r = reward + gamma * r
18 | r = r * (1. - done)
19 | discounted.append(r)
20 | return discounted[::-1]
21 |
22 |
23 | def make_update_exp(vals, target_vals):
24 | polyak = 1.0 - 1e-2
25 | expression = []
26 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
27 | expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
28 | expression = tf.group(*expression)
29 | return U.function([], [], updates=[expression])
30 |
31 |
32 | def p_train(make_obs_ph_n, make_messages_ph_n, act_space_n, p_func, q_func, optimizer, grad_norm_clipping=None,
33 | local_q_func=False, num_units=64, scope="trainer", reuse=None, beta=0.01):
34 | with tf.variable_scope(scope, reuse=reuse):
35 | num_agents = len(make_obs_ph_n)
36 |
37 |         # create distributions
38 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
39 |
40 | # set up placeholders
41 | obs_ph_n = make_obs_ph_n
42 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(num_agents)]
43 |
44 |         messages_ph_n = make_messages_ph_n
45 |
46 | # multi_head = pre_message(messages_ph_n)
47 |
48 | items = [p_func([obs_ph_n[i], tf.concat(messages_ph_n, 1)], int(act_pdtype_n[i].param_shape()[0]),
49 | scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents)]
50 | p_n, message_n, mu_message_n, logvar_message_n = list(zip(*items))
51 |
52 | logvar_message_n = [tf.clip_by_value(log, -10, 10) for log in
53 | logvar_message_n] # constrain kl_loss not to be too large
54 |
55 | p_func_vars = [U.scope_vars(U.absolute_scope_name("p_func_{}".format(i))) for i in range(num_agents)]
56 |
57 | # wrap parameters in distribution
58 | act_pd_n = [act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)]
59 |
60 | act_sample_n = [act_pd.sample() for act_pd in act_pd_n]
61 | p_reg_n = [tf.reduce_mean(tf.square(act_pd.flatparam())) for act_pd in act_pd_n]
62 |
63 | act_input_n_n = [act_ph_n + [] for _ in range(num_agents)]
64 | for i in range(num_agents):
65 | act_input_n_n[i][i] = act_pd_n[i].sample()
66 | q_input_n = [tf.concat(obs_ph_n + messages_ph_n + act_input_n, 1) for act_input_n in act_input_n_n]
67 |
68 | q_n = [q_func(q_input_n[i], 1, scope="q_func_{}".format(i), reuse=True, num_units=num_units)[:, 0] for i in
69 | range(num_agents)]
70 | pg_loss_n = [-tf.reduce_mean(q) for q in q_n]
71 |
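        # Information-bottleneck-style regularizer on the messages: per-dimension KL of each
        # message distribution from a standard normal prior. Taking `log` as the log of the
        # message std (which is what the formula below implements, despite the `logvar` name),
        # this is KL(N(mu, sigma^2) || N(0, 1)) = 0.5 * (mu^2 + sigma^2) - log(sigma) - 0.5.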
72 | kl_loss_message_n = [0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5 for mu, log in
73 | zip(mu_message_n, logvar_message_n)]
74 | kl_loss_message = tf.reduce_mean(kl_loss_message_n)
75 |
76 | pg_loss = tf.reduce_sum(pg_loss_n)
77 | p_reg = tf.reduce_sum(p_reg_n)
78 | loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message
79 |
80 | var_list = []
81 | var_list.extend(p_func_vars)
82 | var_list = list(itertools.chain(*var_list))
83 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping)
84 |
85 | # Create callable functions
86 | train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
87 | act = U.function(inputs=obs_ph_n + messages_ph_n, outputs=[act_sample_n, message_n])
88 | p_values = U.function(inputs=obs_ph_n + messages_ph_n, outputs=p_n)
89 |
90 | # target network
91 | target_items = [p_func([obs_ph_n[i], tf.concat(messages_ph_n, 1)], int(act_pdtype_n[i].param_shape()[0]),
92 | scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents)]
93 |
94 | target_p_n, target_message_n, target_mu_message_n, target_logvar_message_n = list(zip(*target_items))
95 | target_logvar_message_n = [tf.clip_by_value(log, -10, 10) for log in
96 | target_logvar_message_n] # constrain kl_loss not to be too large
97 |
98 | target_p_func_vars = [U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i))) for i in
99 | range(num_agents)]
100 |
101 | target_var_list = []
102 | target_var_list.extend(target_p_func_vars)
103 | target_var_list = list(itertools.chain(*target_var_list))
104 | update_target_p = make_update_exp(var_list, target_var_list)
105 |
106 | target_act_sample_n = [act_pdtype_n[i].pdfromflat(target_p_n[i]).sample() for i in range(num_agents)]
107 | target_act = U.function(inputs=obs_ph_n + messages_ph_n, outputs=[target_act_sample_n, target_message_n])
108 |
109 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
110 |
111 |
112 | def q_train(make_obs_ph_n, make_messages_ph_n, act_space_n, q_func, optimizer, grad_norm_clipping=None,
113 | local_q_func=False, scope="trainer", reuse=None, num_units=64):
114 | with tf.variable_scope(scope, reuse=reuse):
115 | num_agents = len(make_obs_ph_n)
116 |
117 |         # create distributions
118 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
119 |
120 | # set up placeholders
121 | obs_ph_n = make_obs_ph_n
122 |         messages_ph_n = make_messages_ph_n
123 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action_{}".format(i)) for i in
124 | range(len(act_space_n))]
125 | target_ph_n = [tf.placeholder(tf.float32, [None], name="target_{}".format(i)) for i in range(num_agents)]
126 |
127 | q_input = tf.concat(obs_ph_n + messages_ph_n + act_ph_n, 1)
128 | q_n = [q_func(q_input, 1, scope="q_func_{}".format(i), num_units=num_units)[:, 0] for i in range(num_agents)]
129 | q_func_vars = [U.scope_vars(U.absolute_scope_name("q_func_{}".format(i))) for i in range(num_agents)]
130 |
131 | q_loss_n = [tf.reduce_mean(tf.square(q - target_ph)) for q, target_ph in zip(q_n, target_ph_n)]
132 |
133 | # viscosity solution to Bellman differential equation in place of an initial condition
134 | # q_reg = tf.reduce_mean(tf.square(q))
135 | q_loss = tf.reduce_sum(q_loss_n)
136 | loss = q_loss # + 1e-3 * q_reg
137 |
138 | var_list = list(itertools.chain(*q_func_vars))
139 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping)
140 |
141 | # Create callable functions
142 | train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n + target_ph_n, outputs=loss,
143 | updates=[optimize_expr])
144 | q_values = U.function(obs_ph_n + messages_ph_n + act_ph_n, q_n)
145 |
146 | # target network
147 | target_q_n = [q_func(q_input, 1, scope="target_q_func_{}".format(i), num_units=num_units)[:, 0] for i in
148 | range(num_agents)]
149 | target_q_func_vars = [U.scope_vars(U.absolute_scope_name("target_q_func_{}".format(i))) for i in
150 | range(num_agents)]
151 |
152 |         target_var_list = list(itertools.chain(*target_q_func_vars))
153 |         update_target_q = make_update_exp(var_list, target_var_list)
154 |
155 | target_q_values = U.function(obs_ph_n + messages_ph_n + act_ph_n, target_q_n)
156 |
157 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
158 |
159 |
160 | class IBMACInterAgentTrainer(AgentTrainer):
161 | def __init__(self, name, actor_model, critic_mlp_model, obs_shape_n, act_space_n, args, local_q_func=False):
162 | self.name = name
163 | self.n = len(obs_shape_n)
164 | self.args = args
165 | obs_ph_n = []
166 | messages_ph_n = []
167 | for i in range(self.n):
168 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get())
169 | messages_ph_n.append(U.BatchInput((args.dim_message,), name="message_" + str(i)).get())
170 |
171 | # Create all the functions necessary to train the model
172 | self.q_train, self.q_update, self.q_debug = q_train(
173 | scope=self.name,
174 | make_obs_ph_n=obs_ph_n,
175 |             make_messages_ph_n=messages_ph_n,
176 | act_space_n=act_space_n,
177 | q_func=critic_mlp_model,
178 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
179 | grad_norm_clipping=0.5,
180 | local_q_func=local_q_func,
181 | num_units=args.num_units,
182 | )
183 | self.act, self.p_train, self.p_update, self.p_debug = p_train(
184 | scope=self.name,
185 | make_obs_ph_n=obs_ph_n,
186 |             make_messages_ph_n=messages_ph_n,
187 | act_space_n=act_space_n,
188 | p_func=actor_model,
189 | q_func=critic_mlp_model,
190 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
191 | grad_norm_clipping=0.5,
192 | local_q_func=local_q_func,
193 | num_units=args.num_units,
194 | beta=args.beta,
195 | )
196 | # Create experience buffer
197 | self.replay_buffer = ReplayBuffer(1e6)
198 | # self.max_replay_buffer_len = 50 * args.max_episode_len
199 | self.max_replay_buffer_len = args.batch_size * args.max_episode_len
200 | self.replay_sample_index = None
201 |
202 | def action(self, obs_n, message_n):
203 | obs = [obs[None] for obs in obs_n]
204 | message = [message[None] for message in message_n]
205 | return self.act(*(list(obs) + list(message)))
206 |
207 | def experience(self, obs, message, act, rew, new_obs, done, terminal):
208 | # Store transition in the replay buffer.
209 | self.replay_buffer.add(obs, message, act, rew, new_obs, [float(d) for d in done])
210 |
211 | def preupdate(self):
212 | self.replay_sample_index = None
213 |
214 | def update(self, agents, t):
215 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough
216 | return
217 | if not t % 100 == 0: # only update every 100 steps
218 | return
219 |
220 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
221 | # collect replay sample from all agents
222 | obs_n = []
223 | obs_next_n = []
224 | act_n = []
225 | index = self.replay_sample_index
226 | samples = self.replay_buffer.sample_index(index)
227 | obs_n, message_n, act_n, rew_n, obs_next_n, done_n = [np.swapaxes(item, 0, 1) for item in samples]
228 | # for i in range(self.n):
229 | # obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
230 | # obs_n.append(obs)
231 | # obs_next_n.append(obs_next)
232 | # act_n.append(act)
233 | # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
234 |
235 | # train q network
236 | num_sample = 1
237 | target_q = 0.0
238 | for i in range(num_sample):
239 | target_act_next_n, target_next_message_n = self.p_debug['target_act'](*(list(obs_next_n) + list(message_n)))
240 | target_q_next_n = self.q_debug['target_q_values'](
241 | *(list(obs_next_n) + list(target_next_message_n) + list(target_act_next_n)))
242 | target_q_n = [rew + self.args.gamma * (1.0 - done) * target_q_next for rew, done, target_q_next in
243 | zip(rew_n, done_n, target_q_next_n)]
244 | target_q_n = [target_q / num_sample for target_q in target_q_n]
245 | q_loss = self.q_train(*(list(obs_n) + list(message_n) + list(act_n) + target_q_n))
246 |
247 | # train p network
248 | p_loss = self.p_train(*(list(obs_n) + list(message_n) + list(act_n)))
249 |
250 | self.p_update()
251 | self.q_update()
252 |
253 |         return [q_loss, p_loss, np.mean(target_q_n), np.mean(rew_n), np.mean(target_q_next_n), np.std(target_q_n)]
254 |
--------------------------------------------------------------------------------
/multiagent-particle-envs/multiagent/rendering.py:
--------------------------------------------------------------------------------
1 | """
2 | 2D rendering framework
3 | """
4 | from __future__ import division
5 | import os
6 | import six
7 | import sys
8 |
9 | if "Apple" in sys.version:
10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
13 |
14 | from gym.utils import reraise
15 | from gym import error
16 |
17 | try:
18 | import pyglet
19 | except ImportError as e:
20 | reraise(suffix="HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.")
21 |
22 | try:
23 | from pyglet.gl import *
24 | except ImportError as e:
25 |     reraise(prefix="Error occurred while running `from pyglet.gl import *`",suffix="HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '")
26 |
27 | import math
28 | import numpy as np
29 |
30 | RAD2DEG = 57.29577951308232
31 |
32 | def get_display(spec):
33 | """Convert a display specification (such as :0) into an actual Display
34 | object.
35 |
36 | Pyglet only supports multiple Displays on Linux.
37 | """
38 | if spec is None:
39 | return None
40 | elif isinstance(spec, six.string_types):
41 | return pyglet.canvas.Display(spec)
42 | else:
43 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))
44 |
45 | class Viewer(object):
46 | def __init__(self, width, height, display=None):
47 | display = get_display(display)
48 |
49 | self.width = width
50 | self.height = height
51 |
52 | self.window = pyglet.window.Window(width=width, height=height, display=display)
53 | self.window.on_close = self.window_closed_by_user
54 | self.geoms = []
55 | self.onetime_geoms = []
56 | self.transform = Transform()
57 |
58 | glEnable(GL_BLEND)
59 | # glEnable(GL_MULTISAMPLE)
60 | glEnable(GL_LINE_SMOOTH)
61 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
62 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
63 | glLineWidth(2.0)
64 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
65 |
66 | def close(self):
67 | self.window.close()
68 |
69 | def window_closed_by_user(self):
70 | self.close()
71 |
72 | def set_bounds(self, left, right, bottom, top):
73 | assert right > left and top > bottom
74 | scalex = self.width/(right-left)
75 | scaley = self.height/(top-bottom)
76 | self.transform = Transform(
77 | translation=(-left*scalex, -bottom*scaley),
78 | scale=(scalex, scaley))
79 |
80 | def add_geom(self, geom):
81 | self.geoms.append(geom)
82 |
83 | def add_onetime(self, geom):
84 | self.onetime_geoms.append(geom)
85 |
86 | def render(self, return_rgb_array=False):
87 | glClearColor(1,1,1,1)
88 | self.window.clear()
89 | self.window.switch_to()
90 | self.window.dispatch_events()
91 | self.transform.enable()
92 | for geom in self.geoms:
93 | geom.render()
94 | for geom in self.onetime_geoms:
95 | geom.render()
96 | self.transform.disable()
97 | arr = None
98 | if return_rgb_array:
99 | buffer = pyglet.image.get_buffer_manager().get_color_buffer()
100 | image_data = buffer.get_image_data()
101 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
102 | # In https://github.com/openai/gym-http-api/issues/2, we
103 | # discovered that someone using Xmonad on Arch was having
104 | # a window of size 598 x 398, though a 600 x 400 window
105 | # was requested. (Guess Xmonad was preserving a pixel for
106 | # the boundary.) So we use the buffer height/width rather
107 | # than the requested one.
108 | arr = arr.reshape(buffer.height, buffer.width, 4)
109 | arr = arr[::-1,:,0:3]
110 | self.window.flip()
111 | self.onetime_geoms = []
112 | return arr
113 |
114 | # Convenience
115 | def draw_circle(self, radius=10, res=30, filled=True, **attrs):
116 | geom = make_circle(radius=radius, res=res, filled=filled)
117 | _add_attrs(geom, attrs)
118 | self.add_onetime(geom)
119 | return geom
120 |
121 | def draw_polygon(self, v, filled=True, **attrs):
122 | geom = make_polygon(v=v, filled=filled)
123 | _add_attrs(geom, attrs)
124 | self.add_onetime(geom)
125 | return geom
126 |
127 | def draw_polyline(self, v, **attrs):
128 | geom = make_polyline(v=v)
129 | _add_attrs(geom, attrs)
130 | self.add_onetime(geom)
131 | return geom
132 |
133 | def draw_line(self, start, end, **attrs):
134 | geom = Line(start, end)
135 | _add_attrs(geom, attrs)
136 | self.add_onetime(geom)
137 | return geom
138 |
139 | def get_array(self):
140 | self.window.flip()
141 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
142 | self.window.flip()
143 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
144 | arr = arr.reshape(self.height, self.width, 4)
145 | return arr[::-1,:,0:3]
146 |
147 | def _add_attrs(geom, attrs):
148 | if "color" in attrs:
149 | geom.set_color(*attrs["color"])
150 | if "linewidth" in attrs:
151 | geom.set_linewidth(attrs["linewidth"])
152 |
153 | class Geom(object):
154 | def __init__(self):
155 | self._color=Color((0, 0, 0, 1.0))
156 | self.attrs = [self._color]
157 | def render(self):
158 | for attr in reversed(self.attrs):
159 | attr.enable()
160 | self.render1()
161 | for attr in self.attrs:
162 | attr.disable()
163 | def render1(self):
164 | raise NotImplementedError
165 | def add_attr(self, attr):
166 | self.attrs.append(attr)
167 | def set_color(self, r, g, b, alpha=1):
168 | self._color.vec4 = (r, g, b, alpha)
169 |
170 | class Attr(object):
171 | def enable(self):
172 | raise NotImplementedError
173 | def disable(self):
174 | pass
175 |
176 | class Transform(Attr):
177 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)):
178 | self.set_translation(*translation)
179 | self.set_rotation(rotation)
180 | self.set_scale(*scale)
181 | def enable(self):
182 | glPushMatrix()
183 |         glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc point
184 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
185 | glScalef(self.scale[0], self.scale[1], 1)
186 | def disable(self):
187 | glPopMatrix()
188 | def set_translation(self, newx, newy):
189 | self.translation = (float(newx), float(newy))
190 | def set_rotation(self, new):
191 | self.rotation = float(new)
192 | def set_scale(self, newx, newy):
193 | self.scale = (float(newx), float(newy))
194 |
195 | class Color(Attr):
196 | def __init__(self, vec4):
197 | self.vec4 = vec4
198 | def enable(self):
199 | glColor4f(*self.vec4)
200 |
201 | class LineStyle(Attr):
202 | def __init__(self, style):
203 | self.style = style
204 | def enable(self):
205 | glEnable(GL_LINE_STIPPLE)
206 | glLineStipple(1, self.style)
207 | def disable(self):
208 | glDisable(GL_LINE_STIPPLE)
209 |
210 | class LineWidth(Attr):
211 | def __init__(self, stroke):
212 | self.stroke = stroke
213 | def enable(self):
214 | glLineWidth(self.stroke)
215 |
216 | class Point(Geom):
217 | def __init__(self):
218 | Geom.__init__(self)
219 | def render1(self):
220 | glBegin(GL_POINTS) # draw point
221 | glVertex3f(0.0, 0.0, 0.0)
222 | glEnd()
223 |
224 | class FilledPolygon(Geom):
225 | def __init__(self, v):
226 | Geom.__init__(self)
227 | self.v = v
228 | def render1(self):
229 | if len(self.v) == 4 : glBegin(GL_QUADS)
230 | elif len(self.v) > 4 : glBegin(GL_POLYGON)
231 | else: glBegin(GL_TRIANGLES)
232 | for p in self.v:
233 | glVertex3f(p[0], p[1],0) # draw each vertex
234 | glEnd()
235 |
236 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5)
237 | glColor4f(*color)
238 | glBegin(GL_LINE_LOOP)
239 | for p in self.v:
240 | glVertex3f(p[0], p[1],0) # draw each vertex
241 | glEnd()
242 |
243 | def make_circle(radius=10, res=30, filled=True):
244 | points = []
245 | for i in range(res):
246 | ang = 2*math.pi*i / res
247 | points.append((math.cos(ang)*radius, math.sin(ang)*radius))
248 | if filled:
249 | return FilledPolygon(points)
250 | else:
251 | return PolyLine(points, True)
252 |
253 | def make_polygon(v, filled=True):
254 | if filled: return FilledPolygon(v)
255 | else: return PolyLine(v, True)
256 |
257 | def make_polyline(v):
258 | return PolyLine(v, False)
259 |
260 | def make_capsule(length, width):
261 | l, r, t, b = 0, length, width/2, -width/2
262 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)])
263 | circ0 = make_circle(width/2)
264 | circ1 = make_circle(width/2)
265 | circ1.add_attr(Transform(translation=(length, 0)))
266 | geom = Compound([box, circ0, circ1])
267 | return geom
268 |
269 | class Compound(Geom):
270 | def __init__(self, gs):
271 | Geom.__init__(self)
272 | self.gs = gs
273 | for g in self.gs:
274 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
275 | def render1(self):
276 | for g in self.gs:
277 | g.render()
278 |
279 | class PolyLine(Geom):
280 | def __init__(self, v, close):
281 | Geom.__init__(self)
282 | self.v = v
283 | self.close = close
284 | self.linewidth = LineWidth(1)
285 | self.add_attr(self.linewidth)
286 | def render1(self):
287 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
288 | for p in self.v:
289 | glVertex3f(p[0], p[1],0) # draw each vertex
290 | glEnd()
291 | def set_linewidth(self, x):
292 | self.linewidth.stroke = x
293 |
294 | class Line(Geom):
295 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
296 | Geom.__init__(self)
297 | self.start = start
298 | self.end = end
299 | self.linewidth = LineWidth(1)
300 | self.add_attr(self.linewidth)
301 |
302 | def render1(self):
303 | glBegin(GL_LINES)
304 | glVertex2f(*self.start)
305 | glVertex2f(*self.end)
306 | glEnd()
307 |
308 | class Image(Geom):
309 | def __init__(self, fname, width, height):
310 | Geom.__init__(self)
311 | self.width = width
312 | self.height = height
313 | img = pyglet.image.load(fname)
314 | self.img = img
315 | self.flip = False
316 | def render1(self):
317 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height)
318 |
319 | # ================================================================
320 |
321 | class SimpleImageViewer(object):
322 | def __init__(self, display=None):
323 | self.window = None
324 | self.isopen = False
325 | self.display = display
326 | def imshow(self, arr):
327 | if self.window is None:
328 | height, width, channels = arr.shape
329 | self.window = pyglet.window.Window(width=width, height=height, display=self.display)
330 | self.width = width
331 | self.height = height
332 | self.isopen = True
333 |         assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong shape"
334 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
335 | self.window.clear()
336 | self.window.switch_to()
337 | self.window.dispatch_events()
338 | image.blit(0,0)
339 | self.window.flip()
340 | def close(self):
341 | if self.isopen:
342 | self.window.close()
343 | self.isopen = False
344 | def __del__(self):
345 | self.close()
--------------------------------------------------------------------------------
/maddpg/common/tf_util.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import numpy as np
3 | import os
4 | import tensorflow as tf
5 |
6 | def sum(x, axis=None, keepdims=False):
7 | return tf.reduce_sum(x, axis=None if axis is None else [axis], keep_dims = keepdims)
8 | def mean(x, axis=None, keepdims=False):
9 | return tf.reduce_mean(x, axis=None if axis is None else [axis], keep_dims = keepdims)
10 | def var(x, axis=None, keepdims=False):
11 | meanx = mean(x, axis=axis, keepdims=keepdims)
12 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims)
13 | def std(x, axis=None, keepdims=False):
14 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims))
15 | def max(x, axis=None, keepdims=False):
16 | return tf.reduce_max(x, axis=None if axis is None else [axis], keep_dims = keepdims)
17 | def min(x, axis=None, keepdims=False):
18 | return tf.reduce_min(x, axis=None if axis is None else [axis], keep_dims = keepdims)
19 | def concatenate(arrs, axis=0):
20 | return tf.concat(axis=axis, values=arrs)
21 | def argmax(x, axis=None):
22 | return tf.argmax(x, axis=axis)
23 | def softmax(x, axis=None):
24 | return tf.nn.softmax(x, axis=axis)
25 |
26 | # ================================================================
27 | # Misc
28 | # ================================================================
29 |
30 |
31 | def is_placeholder(x):
32 | return type(x) is tf.Tensor and len(x.op.inputs) == 0
33 |
34 | # ================================================================
35 | # Inputs
36 | # ================================================================
37 |
38 |
39 | class TfInput(object):
40 | def __init__(self, name="(unnamed)"):
41 | """Generalized Tensorflow placeholder. The main differences are:
42 | - possibly uses multiple placeholders internally and returns multiple values
43 |             - can apply light postprocessing to the value fed to the placeholder.
44 | """
45 | self.name = name
46 |
47 | def get(self):
48 | """Return the tf variable(s) representing the possibly postprocessed value
49 | of placeholder(s).
50 | """
51 |         raise NotImplementedError()
52 | 
53 |     def make_feed_dict(self, data):
54 |         """Given data, input it to the placeholder(s)."""
55 |         raise NotImplementedError()
56 |
57 |
58 | class PlacholderTfInput(TfInput):
59 | def __init__(self, placeholder):
60 | """Wrapper for regular tensorflow placeholder."""
61 | super().__init__(placeholder.name)
62 | self._placeholder = placeholder
63 |
64 | def get(self):
65 | return self._placeholder
66 |
67 | def make_feed_dict(self, data):
68 | return {self._placeholder: data}
69 |
70 |
71 | class BatchInput(PlacholderTfInput):
72 | def __init__(self, shape, dtype=tf.float32, name=None):
73 | """Creates a placeholder for a batch of tensors of a given shape and dtype
74 |
75 | Parameters
76 | ----------
77 | shape: [int]
78 |             shape of a single element of the batch
79 | dtype: tf.dtype
80 | number representation used for tensor contents
81 | name: str
82 | name of the underlying placeholder
83 | """
84 | super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
85 |
86 |
87 | class Uint8Input(PlacholderTfInput):
88 | def __init__(self, shape, name=None):
89 | """Takes input in uint8 format which is cast to float32 and divided by 255
90 | before passing it to the model.
91 |
92 | On GPU this ensures lower data transfer times.
93 |
94 | Parameters
95 | ----------
96 | shape: [int]
97 | shape of the tensor.
98 | name: str
99 | name of the underlying placeholder
100 | """
101 |
102 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
103 | self._shape = shape
104 | self._output = tf.cast(super().get(), tf.float32) / 255.0
105 |
106 | def get(self):
107 | return self._output
108 |
109 |
110 | def ensure_tf_input(thing):
111 | """Takes either tf.placeholder of TfInput and outputs equivalent TfInput"""
112 | if isinstance(thing, TfInput):
113 | return thing
114 | elif is_placeholder(thing):
115 | return PlacholderTfInput(thing)
116 | else:
117 | raise ValueError("Must be a placeholder or TfInput")
118 |
119 | # ================================================================
120 | # Mathematical utils
121 | # ================================================================
122 |
123 |
124 | def huber_loss(x, delta=1.0):
125 | """Reference: https://en.wikipedia.org/wiki/Huber_loss"""
126 | return tf.where(
127 | tf.abs(x) < delta,
128 | tf.square(x) * 0.5,
129 | delta * (tf.abs(x) - 0.5 * delta)
130 | )
131 |
132 | # ================================================================
133 | # Optimizer utils
134 | # ================================================================
135 |
136 |
137 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
138 | """Minimized `objective` using `optimizer` w.r.t. variables in
139 | `var_list` while ensure the norm of the gradients for each
140 | variable is clipped to `clip_val`
141 | """
142 | if clip_val is None:
143 | return optimizer.minimize(objective, var_list=var_list)
144 | else:
145 | gradients = optimizer.compute_gradients(objective, var_list=var_list)
146 | for i, (grad, var) in enumerate(gradients):
147 | if grad is not None:
148 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
149 | return optimizer.apply_gradients(gradients)
150 |
151 |
152 | # ================================================================
153 | # Global session
154 | # ================================================================
155 |
156 | def get_session():
157 | """Returns recently made Tensorflow session"""
158 | return tf.get_default_session()
159 |
160 |
161 | def make_session(num_cpu):
162 | """Returns a session that will use CPU's only"""
163 | tf_config = tf.ConfigProto(
164 | inter_op_parallelism_threads=num_cpu,
165 | intra_op_parallelism_threads=num_cpu)
166 | return tf.Session(config=tf_config)
167 |
168 |
169 | def single_threaded_session():
170 | """Returns a session which will only use a single CPU"""
171 | return make_session(1)
172 |
173 |
174 | ALREADY_INITIALIZED = set()
175 |
176 |
177 | def initialize():
178 | """Initialize all the uninitialized variables in the global scope."""
179 | new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED
180 | get_session().run(tf.variables_initializer(new_variables))
181 | ALREADY_INITIALIZED.update(new_variables)
182 |
183 |
184 | # ================================================================
185 | # Scopes
186 | # ================================================================
187 |
188 |
189 | def scope_vars(scope, trainable_only=False):
190 | """
191 | Get variables inside a scope
192 | The scope can be specified as a string
193 |
194 | Parameters
195 | ----------
196 | scope: str or VariableScope
197 | scope in which the variables reside.
198 | trainable_only: bool
199 | whether or not to return only the variables that were marked as trainable.
200 |
201 | Returns
202 | -------
203 | vars: [tf.Variable]
204 | list of variables in `scope`.
205 | """
206 | return tf.get_collection(
207 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES,
208 | scope=scope if isinstance(scope, str) else scope.name
209 | )
210 |
211 |
212 | def scope_name():
213 | """Returns the name of current scope as a string, e.g. deepq/q_func"""
214 | return tf.get_variable_scope().name
215 |
216 |
217 | def absolute_scope_name(relative_scope_name):
218 | """Appends parent scope name to `relative_scope_name`"""
219 | return scope_name() + "/" + relative_scope_name
220 |
221 | # ================================================================
222 | # Saving variables
223 | # ================================================================
224 |
225 |
226 | def load_state(fname, saver=None):
227 | """Load all the variables to the current session from the location """
228 | if saver is None:
229 | saver = tf.train.Saver()
230 | saver.restore(get_session(), fname)
231 | return saver
232 |
233 |
234 | def save_state(fname, saver=None):
235 | """Save all the variables in the current session to the location """
236 | os.makedirs(os.path.dirname(fname), exist_ok=True)
237 | if saver is None:
238 | saver = tf.train.Saver()
239 | saver.save(get_session(), fname)
240 | return saver
241 |
242 | # ================================================================
243 | # Theano-like Function
244 | # ================================================================
245 |
246 |
247 | def function(inputs, outputs, updates=None, givens=None):
248 | """Just like Theano function. Take a bunch of tensorflow placeholders and expersions
249 | computed based on those placeholders and produces f(inputs) -> outputs. Function f takes
250 | values to be fed to the input placeholders and produces the values of the expressions
251 | in outputs.
252 |
253 | Input values can be passed in the same order as inputs or can be provided as kwargs based
254 | on placeholder name (passed to constructor or accessible via placeholder.op.name).
255 |
256 | Example:
257 | x = tf.placeholder(tf.int32, (), name="x")
258 | y = tf.placeholder(tf.int32, (), name="y")
259 | z = 3 * x + 2 * y
260 | lin = function([x, y], z, givens={y: 0})
261 |
262 | with single_threaded_session():
263 | initialize()
264 |
265 | assert lin(2) == 6
266 | assert lin(x=3) == 9
267 | assert lin(2, 2) == 10
268 | assert lin(x=2, y=3) == 12
269 |
270 | Parameters
271 | ----------
272 | inputs: [tf.placeholder or TfInput]
273 | list of input arguments
274 | outputs: [tf.Variable] or tf.Variable
275 | list of outputs or a single output to be returned from function. Returned
276 | value will also have the same shape.
277 | """
278 | if isinstance(outputs, list):
279 | return _Function(inputs, outputs, updates, givens=givens)
280 | elif isinstance(outputs, (dict, collections.OrderedDict)):
281 | f = _Function(inputs, outputs.values(), updates, givens=givens)
282 | return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs)))
283 | else:
284 | f = _Function(inputs, [outputs], updates, givens=givens)
285 | return lambda *args, **kwargs: f(*args, **kwargs)[0]
286 |
287 |
288 | class _Function(object):
289 | def __init__(self, inputs, outputs, updates, givens, check_nan=False):
290 | for inpt in inputs:
291 | if not issubclass(type(inpt), TfInput):
292 | assert len(inpt.op.inputs) == 0, "inputs should all be placeholders or instances of maddpg.common.tf_util.TfInput"
293 | self.inputs = inputs
294 | updates = updates or []
295 | self.update_group = tf.group(*updates)
296 | self.outputs_update = list(outputs) + [self.update_group]
297 | self.givens = {} if givens is None else givens
298 | self.check_nan = check_nan
299 |
300 | def _feed_input(self, feed_dict, inpt, value):
301 | if issubclass(type(inpt), TfInput):
302 | feed_dict.update(inpt.make_feed_dict(value))
303 | elif is_placeholder(inpt):
304 | feed_dict[inpt] = value
305 |
306 | def __call__(self, *args, **kwargs):
307 | assert len(args) <= len(self.inputs), "Too many arguments provided"
308 | feed_dict = {}
309 | # Update the args
310 | for inpt, value in zip(self.inputs, args):
311 | self._feed_input(feed_dict, inpt, value)
312 | # Update the kwargs
313 | kwargs_passed_inpt_names = set()
314 | for inpt in self.inputs[len(args):]:
315 | inpt_name = inpt.name.split(':')[0]
316 | inpt_name = inpt_name.split('/')[-1]
317 | assert inpt_name not in kwargs_passed_inpt_names, \
318 | "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
319 | if inpt_name in kwargs:
320 | kwargs_passed_inpt_names.add(inpt_name)
321 | self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
322 | else:
323 | assert inpt in self.givens, "Missing argument " + inpt_name
324 | assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
325 | # Update feed dict with givens.
326 | for inpt in self.givens:
327 | feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
328 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
329 | if self.check_nan:
330 | if any(np.isnan(r).any() for r in results):
331 | raise RuntimeError("Nan detected")
332 | return results
333 |
--------------------------------------------------------------------------------
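A minimal usage sketch of the helpers above, assuming TensorFlow 1.x and that this file is importable as maddpg.common.tf_util (the toy graph built here is illustrative only, not part of the repository):

import numpy as np
import tensorflow as tf
import maddpg.common.tf_util as U

# a toy graph: a linear layer and a squared-output loss
x = tf.placeholder(tf.float32, [None, 4], name="x")
w = tf.get_variable("w", [4, 1])
loss = U.mean(tf.square(tf.matmul(x, w)))

# clipped-gradient training op, wrapped into a Theano-style callable
optimizer = tf.train.AdamOptimizer(1e-3)
train_op = U.minimize_and_clip(optimizer, loss, var_list=[w], clip_val=0.5)
train = U.function(inputs=[x], outputs=loss, updates=[train_op])

with U.single_threaded_session():
    U.initialize()
    for _ in range(3):
        print(train(np.random.randn(32, 4).astype(np.float32)))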
/maddpg/common/distributions.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import maddpg.common.tf_util as U
4 | from tensorflow.python.ops import math_ops
5 | from multiagent.multi_discrete import MultiDiscrete
6 | from tensorflow.python.ops import nn
7 |
8 | class Pd(object):
9 | """
10 | A particular probability distribution
11 | """
12 | def flatparam(self):
13 | raise NotImplementedError
14 | def mode(self):
15 | raise NotImplementedError
16 | def logp(self, x):
17 | raise NotImplementedError
18 | def kl(self, other):
19 | raise NotImplementedError
20 | def entropy(self):
21 | raise NotImplementedError
22 | def sample(self):
23 | raise NotImplementedError
24 |
25 | class PdType(object):
26 | """
27 | Parametrized family of probability distributions
28 | """
29 | def pdclass(self):
30 | raise NotImplementedError
31 | def pdfromflat(self, flat):
32 | return self.pdclass()(flat)
33 | def param_shape(self):
34 | raise NotImplementedError
35 | def sample_shape(self):
36 | raise NotImplementedError
37 | def sample_dtype(self):
38 | raise NotImplementedError
39 |
40 | def param_placeholder(self, prepend_shape, name=None):
41 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name)
42 | def sample_placeholder(self, prepend_shape, name=None):
43 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name)
44 |
45 | class CategoricalPdType(PdType):
46 | def __init__(self, ncat):
47 | self.ncat = ncat
48 | def pdclass(self):
49 | return CategoricalPd
50 | def param_shape(self):
51 | return [self.ncat]
52 | def sample_shape(self):
53 | return []
54 | def sample_dtype(self):
55 | return tf.int32
56 |
57 | class SoftCategoricalPdType(PdType):
58 | def __init__(self, ncat):
59 | self.ncat = ncat
60 | def pdclass(self):
61 | return SoftCategoricalPd
62 | def param_shape(self):
63 | return [self.ncat]
64 | def sample_shape(self):
65 | return [self.ncat]
66 | def sample_dtype(self):
67 | return tf.float32
68 |
69 | class MultiCategoricalPdType(PdType):
70 | def __init__(self, low, high):
71 | self.low = low
72 | self.high = high
73 | self.ncats = high - low + 1
74 | def pdclass(self):
75 | return MultiCategoricalPd
76 | def pdfromflat(self, flat):
77 | return MultiCategoricalPd(self.low, self.high, flat)
78 | def param_shape(self):
79 | return [sum(self.ncats)]
80 | def sample_shape(self):
81 | return [len(self.ncats)]
82 | def sample_dtype(self):
83 | return tf.int32
84 |
85 | class SoftMultiCategoricalPdType(PdType):
86 | def __init__(self, low, high):
87 | self.low = low
88 | self.high = high
89 | self.ncats = high - low + 1
90 | def pdclass(self):
91 | return SoftMultiCategoricalPd
92 | def pdfromflat(self, flat):
93 | return SoftMultiCategoricalPd(self.low, self.high, flat)
94 | def param_shape(self):
95 | return [sum(self.ncats)]
96 | def sample_shape(self):
97 | return [sum(self.ncats)]
98 | def sample_dtype(self):
99 | return tf.float32
100 |
101 | class DiagGaussianPdType(PdType):
102 | def __init__(self, size):
103 | self.size = size
104 | def pdclass(self):
105 | return DiagGaussianPd
106 | def param_shape(self):
107 | return [2*self.size]
108 | def sample_shape(self):
109 | return [self.size]
110 | def sample_dtype(self):
111 | return tf.float32
112 |
113 | class BernoulliPdType(PdType):
114 | def __init__(self, size):
115 | self.size = size
116 | def pdclass(self):
117 | return BernoulliPd
118 | def param_shape(self):
119 | return [self.size]
120 | def sample_shape(self):
121 | return [self.size]
122 | def sample_dtype(self):
123 | return tf.int32
124 |
125 | # WRONG SECOND DERIVATIVES
126 | # class CategoricalPd(Pd):
127 | # def __init__(self, logits):
128 | # self.logits = logits
129 | # self.ps = tf.nn.softmax(logits)
130 | # @classmethod
131 | # def fromflat(cls, flat):
132 | # return cls(flat)
133 | # def flatparam(self):
134 | # return self.logits
135 | # def mode(self):
136 | # return U.argmax(self.logits, axis=1)
137 | # def logp(self, x):
138 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
139 | # def kl(self, other):
140 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
141 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
142 | # def entropy(self):
143 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
144 | # def sample(self):
145 | # u = tf.random_uniform(tf.shape(self.logits))
146 | # return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
147 |
148 | class CategoricalPd(Pd):
149 | def __init__(self, logits):
150 | self.logits = logits
151 | def flatparam(self):
152 | return self.logits
153 | def mode(self):
154 | return U.argmax(self.logits, axis=1)
155 | def logp(self, x):
156 | return -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
157 | def kl(self, other):
158 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
159 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True)
160 | ea0 = tf.exp(a0)
161 | ea1 = tf.exp(a1)
162 | z0 = U.sum(ea0, axis=1, keepdims=True)
163 | z1 = U.sum(ea1, axis=1, keepdims=True)
164 | p0 = ea0 / z0
165 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
166 | def entropy(self):
167 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
168 | ea0 = tf.exp(a0)
169 | z0 = U.sum(ea0, axis=1, keepdims=True)
170 | p0 = ea0 / z0
171 | return U.sum(p0 * (tf.log(z0) - a0), axis=1)
172 | def sample(self):
173 | u = tf.random_uniform(tf.shape(self.logits))
174 | return U.argmax(self.logits - tf.log(-tf.log(u)), axis=1)
175 | @classmethod
176 | def fromflat(cls, flat):
177 | return cls(flat)
178 |
179 | class SoftCategoricalPd(Pd):
180 | def __init__(self, logits):
181 | self.logits = logits
182 | def flatparam(self):
183 | return self.logits
184 | def mode(self):
185 | return U.softmax(self.logits, axis=-1)
186 | def logp(self, x):
187 | return -tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
188 | def kl(self, other):
189 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
190 | a1 = other.logits - U.max(other.logits, axis=1, keepdims=True)
191 | ea0 = tf.exp(a0)
192 | ea1 = tf.exp(a1)
193 | z0 = U.sum(ea0, axis=1, keepdims=True)
194 | z1 = U.sum(ea1, axis=1, keepdims=True)
195 | p0 = ea0 / z0
196 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
197 | def entropy(self):
198 | a0 = self.logits - U.max(self.logits, axis=1, keepdims=True)
199 | ea0 = tf.exp(a0)
200 | z0 = U.sum(ea0, axis=1, keepdims=True)
201 | p0 = ea0 / z0
202 | return U.sum(p0 * (tf.log(z0) - a0), axis=1)
203 | def sample(self):
204 | u = tf.random_uniform(tf.shape(self.logits))
205 | return U.softmax(self.logits - tf.log(-tf.log(u)), axis=-1)
206 | @classmethod
207 | def fromflat(cls, flat):
208 | return cls(flat)
209 |
210 | class MultiCategoricalPd(Pd):
211 | def __init__(self, low, high, flat):
212 | self.flat = flat
213 | self.low = tf.constant(low, dtype=tf.int32)
214 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
215 | def flatparam(self):
216 | return self.flat
217 | def mode(self):
218 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32)
219 | def logp(self, x):
220 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))])
221 | def kl(self, other):
222 | return tf.add_n([
223 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
224 | ])
225 | def entropy(self):
226 | return tf.add_n([p.entropy() for p in self.categoricals])
227 | def sample(self):
228 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32)
229 | @classmethod
230 | def fromflat(cls, flat):
231 | return cls(flat)
232 |
233 | class SoftMultiCategoricalPd(Pd): # doesn't work yet
234 | def __init__(self, low, high, flat):
235 | self.flat = flat
236 | self.low = tf.constant(low, dtype=tf.float32)
237 | self.categoricals = list(map(SoftCategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1)))
238 | def flatparam(self):
239 | return self.flat
240 | def mode(self):
241 | x = []
242 | for i in range(len(self.categoricals)):
243 | x.append(self.low[i] + self.categoricals[i].mode())
244 | return tf.concat(x, axis=-1)
245 | def logp(self, x):
246 | return tf.add_n([p.logp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))])
247 | def kl(self, other):
248 | return tf.add_n([
249 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals)
250 | ])
251 | def entropy(self):
252 | return tf.add_n([p.entropy() for p in self.categoricals])
253 | def sample(self):
254 | x = []
255 | for i in range(len(self.categoricals)):
256 | x.append(self.low[i] + self.categoricals[i].sample())
257 | return tf.concat(x, axis=-1)
258 | @classmethod
259 | def fromflat(cls, flat):
260 | return cls(flat)
261 |
262 | class DiagGaussianPd(Pd):
263 | def __init__(self, flat):
264 | self.flat = flat
265 | mean, logstd = tf.split(axis=1, num_or_size_splits=2, value=flat)
266 | self.mean = mean
267 | self.logstd = logstd
268 | self.std = tf.exp(logstd)
269 | def flatparam(self):
270 | return self.flat
271 | def mode(self):
272 | return self.mean
273 | def logp(self, x):
274 | return - 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=1) \
275 | - 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) \
276 | - U.sum(self.logstd, axis=1)
277 | def kl(self, other):
278 | assert isinstance(other, DiagGaussianPd)
279 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=1)
280 | def entropy(self):
281 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), 1)
282 | def sample(self):
283 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
284 | @classmethod
285 | def fromflat(cls, flat):
286 | return cls(flat)
287 |
288 | class BernoulliPd(Pd):
289 | def __init__(self, logits):
290 | self.logits = logits
291 | self.ps = tf.sigmoid(logits)
292 | def flatparam(self):
293 | return self.logits
294 | def mode(self):
295 | return tf.round(self.ps)
296 | def logp(self, x):
297 | return - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=1)
298 | def kl(self, other):
299 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
300 | def entropy(self):
301 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=1)
302 | def sample(self):
303 | p = tf.sigmoid(self.logits)
304 | u = tf.random_uniform(tf.shape(p))
305 | return tf.to_float(math_ops.less(u, p))
306 | @classmethod
307 | def fromflat(cls, flat):
308 | return cls(flat)
309 |
310 | def make_pdtype(ac_space):
311 | from gym import spaces
312 | if isinstance(ac_space, spaces.Box):
313 | assert len(ac_space.shape) == 1
314 | return DiagGaussianPdType(ac_space.shape[0])
315 | elif isinstance(ac_space, spaces.Discrete):
316 | # return CategoricalPdType(ac_space.n)
317 | return SoftCategoricalPdType(ac_space.n)
318 | elif isinstance(ac_space, MultiDiscrete):
319 | #return MultiCategoricalPdType(ac_space.low, ac_space.high)
320 | return SoftMultiCategoricalPdType(ac_space.low, ac_space.high)
321 | elif isinstance(ac_space, spaces.MultiBinary):
322 | return BernoulliPdType(ac_space.n)
323 | else:
324 | raise NotImplementedError
325 |
326 | def shape_el(v, i):
327 | maybe = v.get_shape()[i]
328 | if maybe is not None:
329 | return maybe
330 | else:
331 | return tf.shape(v)[i]
332 |
--------------------------------------------------------------------------------
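A minimal sketch of how make_pdtype and the Pd classes above fit together, assuming TensorFlow 1.x and gym installed; for a Discrete action space make_pdtype returns SoftCategoricalPdType, whose sample() is a Gumbel-softmax-style relaxed sample:

import numpy as np
import tensorflow as tf
from gym import spaces
from maddpg.common.distributions import make_pdtype

act_space = spaces.Discrete(5)               # -> SoftCategoricalPdType(5)
pdtype = make_pdtype(act_space)

logits = tf.placeholder(tf.float32, [None] + pdtype.param_shape(), name="logits")
pd = pdtype.pdfromflat(logits)               # SoftCategoricalPd over the logits
action = pd.sample()                         # relaxed (soft) sample, shape [batch, 5]
entropy = pd.entropy()

with tf.Session() as sess:
    a, h = sess.run([action, entropy],
                    feed_dict={logits: np.zeros((2, 5), dtype=np.float32)})
    print(a.shape, h)                        # (2, 5) and per-row entropies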
/multiagent-particle-envs/multiagent/core.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import seaborn as sns
3 |
4 | # physical/external base state of all entities
5 | class EntityState(object):
6 | def __init__(self):
7 | # physical position
8 | self.p_pos = None
9 | # physical velocity
10 | self.p_vel = None
11 |
12 | # state of agents (including communication and internal/mental state)
13 | class AgentState(EntityState):
14 | def __init__(self):
15 | super(AgentState, self).__init__()
16 | # communication utterance
17 | self.c = None
18 |
19 | # action of the agent
20 | class Action(object):
21 | def __init__(self):
22 | # physical action
23 | self.u = None
24 | # communication action
25 | self.c = None
26 |
27 | class Wall(object):
28 | def __init__(self, orient='H', axis_pos=0.0, endpoints=(-1, 1), width=0.1,
29 | hard=True):
30 | # orientation: 'H'orizontal or 'V'ertical
31 | self.orient = orient
32 | # position along the axis the wall lies on (y-axis for H, x-axis for V)
33 | self.axis_pos = axis_pos
34 | # endpoints of wall (x-coords for H, y-coords for V)
35 | self.endpoints = np.array(endpoints)
36 | # width of wall
37 | self.width = width
38 | # whether wall is impassable to all agents
39 | self.hard = hard
40 | # color of wall
41 | self.color = np.array([0.0, 0.0, 0.0])
42 |
43 |
44 | # properties and state of physical world entity
45 | class Entity(object):
46 | def __init__(self):
47 | # index among all entities (important to set for distance caching)
48 | self.i = 0
49 | # name
50 | self.name = ''
51 | # properties:
52 | self.size = 0.050
53 | # entity can move / be pushed
54 | self.movable = False
55 | # entity collides with others
56 | self.collide = True
57 | # entity can pass through non-hard walls
58 | self.ghost = False
59 | # material density (affects mass)
60 | self.density = 25.0
61 | # color
62 | self.color = None
63 | # max speed and accel
64 | self.max_speed = None
65 | self.accel = None
66 | # state
67 | self.state = EntityState()
68 | # mass
69 | self.initial_mass = 1.0
70 |
71 | @property
72 | def mass(self):
73 | return self.initial_mass
74 |
75 | # properties of landmark entities
76 | class Landmark(Entity):
77 | def __init__(self):
78 | super(Landmark, self).__init__()
79 |
80 | # properties of agent entities
81 | class Agent(Entity):
82 | def __init__(self):
83 | super(Agent, self).__init__()
84 | # agents are movable by default
85 | self.movable = True
86 | # cannot send communication signals
87 | self.silent = False
88 | # cannot observe the world
89 | self.blind = False
90 | # physical motor noise amount
91 | self.u_noise = None
92 | # communication noise amount
93 | self.c_noise = None
94 | # control range
95 | self.u_range = 1.0
96 | # state
97 | self.state = AgentState()
98 | # action
99 | self.action = Action()
100 | # script behavior to execute
101 | self.action_callback = None
102 |
103 | # multi-agent world
104 | class World(object):
105 | def __init__(self):
106 | # list of agents and entities (can change at execution-time!)
107 | self.agents = []
108 | self.landmarks = []
109 | self.walls = []
110 | # communication channel dimensionality
111 | self.dim_c = 0
112 | # position dimensionality
113 | self.dim_p = 2
114 | # color dimensionality
115 | self.dim_color = 3
116 | # simulation timestep
117 | self.dt = 0.1
118 | # physical damping
119 | self.damping = 0.25
120 | # contact response parameters
121 | self.contact_force = 1e+2
122 | self.contact_margin = 1e-3
123 | # cache distances between all agents (not calculated by default)
124 | self.cache_dists = False
125 | self.cached_dist_vect = None
126 | self.cached_dist_mag = None
127 |
128 | # return all entities in the world
129 | @property
130 | def entities(self):
131 | return self.agents + self.landmarks
132 |
133 | # return all agents controllable by external policies
134 | @property
135 | def policy_agents(self):
136 | return [agent for agent in self.agents if agent.action_callback is None]
137 |
138 | # return all agents controlled by world scripts
139 | @property
140 | def scripted_agents(self):
141 | return [agent for agent in self.agents if agent.action_callback is not None]
142 |
143 | def calculate_distances(self):
144 | if self.cached_dist_vect is None:
145 | # initialize distance data structure
146 | self.cached_dist_vect = np.zeros((len(self.entities),
147 | len(self.entities),
148 | self.dim_p))
149 | # calculate minimum distance for a collision between all entities
150 | self.min_dists = np.zeros((len(self.entities), len(self.entities)))
151 | for ia, entity_a in enumerate(self.entities):
152 | for ib in range(ia + 1, len(self.entities)):
153 | entity_b = self.entities[ib]
154 | min_dist = entity_a.size + entity_b.size
155 | self.min_dists[ia, ib] = min_dist
156 | self.min_dists[ib, ia] = min_dist
157 |
158 | for ia, entity_a in enumerate(self.entities):
159 | for ib in range(ia + 1, len(self.entities)):
160 | entity_b = self.entities[ib]
161 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos
162 | self.cached_dist_vect[ia, ib, :] = delta_pos
163 | self.cached_dist_vect[ib, ia, :] = -delta_pos
164 |
165 | self.cached_dist_mag = np.linalg.norm(self.cached_dist_vect, axis=2)
166 | self.cached_collisions = (self.cached_dist_mag <= self.min_dists)
167 |
168 | def assign_agent_colors(self):
169 | n_dummies = 0
170 | if hasattr(self.agents[0], 'dummy'):
171 | n_dummies = len([a for a in self.agents if a.dummy])
172 | n_adversaries = 0
173 | if hasattr(self.agents[0], 'adversary'):
174 | n_adversaries = len([a for a in self.agents if a.adversary])
175 | n_good_agents = len(self.agents) - n_adversaries - n_dummies
176 | dummy_colors = [(0, 0, 0)] * n_dummies
177 | adv_colors = sns.color_palette("OrRd_d", n_adversaries)
178 | good_colors = sns.color_palette("GnBu_d", n_good_agents)
179 | colors = dummy_colors + adv_colors + good_colors
180 | for color, agent in zip(colors, self.agents):
181 | agent.color = color
182 |
183 | # update state of the world
184 | def step(self):
185 | # set actions for scripted agents
186 | for agent in self.scripted_agents:
187 | agent.action = agent.action_callback(agent, self)
188 | # gather forces applied to entities
189 | p_force = [None] * len(self.entities)
190 | # apply agent physical controls
191 | p_force = self.apply_action_force(p_force)
192 | # apply environment forces
193 | p_force = self.apply_environment_force(p_force)
194 | # integrate physical state
195 | self.integrate_state(p_force)
196 | # update agent state
197 | for agent in self.agents:
198 | self.update_agent_state(agent)
199 | # calculate and store distances between all entities
200 | if self.cache_dists:
201 | self.calculate_distances()
202 |
203 |
204 | # gather agent action forces
205 | def apply_action_force(self, p_force):
206 | # set applied forces
207 | for i,agent in enumerate(self.agents):
208 | if agent.movable:
209 | noise = np.random.randn(*agent.action.u.shape) * agent.u_noise if agent.u_noise else 0.0
210 | p_force[i] = (agent.mass * agent.accel if agent.accel is not None else agent.mass) * agent.action.u + noise
211 | return p_force
212 |
213 | # gather physical forces acting on entities
214 | def apply_environment_force(self, p_force):
215 | # simple (but inefficient) collision response
216 | for a,entity_a in enumerate(self.entities):
217 | for b,entity_b in enumerate(self.entities):
218 | if(b <= a): continue
219 | [f_a, f_b] = self.get_entity_collision_force(a, b)
220 | if(f_a is not None):
221 | if(p_force[a] is None): p_force[a] = 0.0
222 | p_force[a] = f_a + p_force[a]
223 | if(f_b is not None):
224 | if(p_force[b] is None): p_force[b] = 0.0
225 | p_force[b] = f_b + p_force[b]
226 | if entity_a.movable:
227 | for wall in self.walls:
228 | wf = self.get_wall_collision_force(entity_a, wall)
229 | if wf is not None:
230 | if p_force[a] is None:
231 | p_force[a] = 0.0
232 | p_force[a] = p_force[a] + wf
233 | return p_force
234 |
235 | # integrate physical state
236 | def integrate_state(self, p_force):
237 | for i,entity in enumerate(self.entities):
238 | if not entity.movable: continue
239 | entity.state.p_vel = entity.state.p_vel * (1 - self.damping)
240 | if (p_force[i] is not None):
241 | entity.state.p_vel += (p_force[i] / entity.mass) * self.dt
242 | if entity.max_speed is not None:
243 | speed = np.sqrt(np.square(entity.state.p_vel[0]) + np.square(entity.state.p_vel[1]))
244 | if speed > entity.max_speed:
245 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) +
246 | np.square(entity.state.p_vel[1])) * entity.max_speed
247 | entity.state.p_pos += entity.state.p_vel * self.dt
248 |
249 | def update_agent_state(self, agent):
250 | # set communication state (directly for now)
251 | if agent.silent:
252 | agent.state.c = np.zeros(self.dim_c)
253 | else:
254 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0
255 | agent.state.c = agent.action.c + noise
256 |
257 | # get collision forces for any contact between two entities
258 | def get_entity_collision_force(self, ia, ib):
259 | entity_a = self.entities[ia]
260 | entity_b = self.entities[ib]
261 | if (not entity_a.collide) or (not entity_b.collide):
262 | return [None, None] # not a collider
263 | if (not entity_a.movable) and (not entity_b.movable):
264 | return [None, None] # neither entity moves
265 | if (entity_a is entity_b):
266 | return [None, None] # don't collide against itself
267 | if self.cache_dists:
268 | delta_pos = self.cached_dist_vect[ia, ib]
269 | dist = self.cached_dist_mag[ia, ib]
270 | dist_min = self.min_dists[ia, ib]
271 | else:
272 | # compute actual distance between entities
273 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos
274 | dist = np.sqrt(np.sum(np.square(delta_pos)))
275 | # minimum allowable distance
276 | dist_min = entity_a.size + entity_b.size
277 | # softmax penetration
278 | k = self.contact_margin
279 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k
280 | force = self.contact_force * delta_pos / dist * penetration
281 | if entity_a.movable and entity_b.movable:
282 | # consider mass in collisions
283 | force_ratio = entity_b.mass / entity_a.mass
284 | force_a = force_ratio * force
285 | force_b = -(1 / force_ratio) * force
286 | else:
287 | force_a = +force if entity_a.movable else None
288 | force_b = -force if entity_b.movable else None
289 | return [force_a, force_b]
290 |
291 | # get collision forces for contact between an entity and a wall
292 | def get_wall_collision_force(self, entity, wall):
293 | if entity.ghost and not wall.hard:
294 | return None # ghost passes through soft walls
295 | if wall.orient == 'H':
296 | prll_dim = 0
297 | perp_dim = 1
298 | else:
299 | prll_dim = 1
300 | perp_dim = 0
301 | ent_pos = entity.state.p_pos
302 | if (ent_pos[prll_dim] < wall.endpoints[0] - entity.size or
303 | ent_pos[prll_dim] > wall.endpoints[1] + entity.size):
304 | return None # entity is beyond endpoints of wall
305 | elif (ent_pos[prll_dim] < wall.endpoints[0] or
306 | ent_pos[prll_dim] > wall.endpoints[1]):
307 | # part of entity is beyond wall
308 | if ent_pos[prll_dim] < wall.endpoints[0]:
309 | dist_past_end = ent_pos[prll_dim] - wall.endpoints[0]
310 | else:
311 | dist_past_end = ent_pos[prll_dim] - wall.endpoints[1]
312 | theta = np.arcsin(dist_past_end / entity.size)
313 | dist_min = np.cos(theta) * entity.size + 0.5 * wall.width
314 | else: # entire entity lies within bounds of wall
315 | theta = 0
316 | dist_past_end = 0
317 | dist_min = entity.size + 0.5 * wall.width
318 |
319 | # only need to calculate distance in relevant dim
320 | delta_pos = ent_pos[perp_dim] - wall.axis_pos
321 | dist = np.abs(delta_pos)
322 | # softmax penetration
323 | k = self.contact_margin
324 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k
325 | force_mag = self.contact_force * delta_pos / dist * penetration
326 | force = np.zeros(2)
327 | force[perp_dim] = np.cos(theta) * force_mag
328 | force[prll_dim] = np.sin(theta) * np.abs(force_mag)
329 | return force
330 |
--------------------------------------------------------------------------------
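A minimal sketch of driving World.step() directly with a hand-built world (in the repository the scenario classes do this construction; the two-agent set-up below is illustrative and only fills in the attributes the physics loop reads):

import numpy as np
from multiagent.core import World, Agent

world = World()
world.dim_c = 0
for i in range(2):
    agent = Agent()
    agent.name = 'agent %d' % i
    agent.silent = True                      # skip the communication channel
    agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
    agent.state.p_vel = np.zeros(world.dim_p)
    agent.action.u = np.zeros(world.dim_p)
    world.agents.append(agent)

world.agents[0].action.u = np.array([1.0, 0.0])   # constant force to the right
for _ in range(5):
    world.step()                             # forces -> velocities -> positions
    print(world.agents[0].state.p_pos)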
/multiagent-particle-envs/multiagent/scenarios/simple_world_comm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiagent.core import World, Agent, Landmark
3 | from multiagent.scenario import BaseScenario
4 |
5 |
6 | class Scenario(BaseScenario):
7 | def make_world(self):
8 | world = World()
9 | # set any world properties first
10 | world.dim_c = 4
11 | #world.damping = 1
12 | num_good_agents = 2
13 | num_adversaries = 4
14 | num_agents = num_adversaries + num_good_agents
15 | num_landmarks = 1
16 | num_food = 2
17 | num_forests = 2
18 | # add agents
19 | world.agents = [Agent() for i in range(num_agents)]
20 | for i, agent in enumerate(world.agents):
21 | agent.name = 'agent %d' % i
22 | agent.collide = True
23 | agent.leader = True if i == 0 else False
24 | agent.silent = True if i > 0 else False
25 | agent.adversary = True if i < num_adversaries else False
26 | agent.size = 0.075 if agent.adversary else 0.045
27 | agent.accel = 3.0 if agent.adversary else 4.0
28 | #agent.accel = 20.0 if agent.adversary else 25.0
29 | agent.max_speed = 1.0 if agent.adversary else 1.3
30 | # add landmarks
31 | world.landmarks = [Landmark() for i in range(num_landmarks)]
32 | for i, landmark in enumerate(world.landmarks):
33 | landmark.name = 'landmark %d' % i
34 | landmark.collide = True
35 | landmark.movable = False
36 | landmark.size = 0.2
37 | landmark.boundary = False
38 | world.food = [Landmark() for i in range(num_food)]
39 | for i, landmark in enumerate(world.food):
40 | landmark.name = 'food %d' % i
41 | landmark.collide = False
42 | landmark.movable = False
43 | landmark.size = 0.03
44 | landmark.boundary = False
45 | world.forests = [Landmark() for i in range(num_forests)]
46 | for i, landmark in enumerate(world.forests):
47 | landmark.name = 'forest %d' % i
48 | landmark.collide = False
49 | landmark.movable = False
50 | landmark.size = 0.3
51 | landmark.boundary = False
52 | world.landmarks += world.food
53 | world.landmarks += world.forests
54 | #world.landmarks += self.set_boundaries(world) # world boundaries now penalized with negative reward
55 | # make initial conditions
56 | self.reset_world(world)
57 | return world
58 |
59 | def set_boundaries(self, world):
60 | boundary_list = []
61 | landmark_size = 1
62 | edge = 1 + landmark_size
63 | num_landmarks = int(edge * 2 / landmark_size)
64 | for x_pos in [-edge, edge]:
65 | for i in range(num_landmarks):
66 | l = Landmark()
67 | l.state.p_pos = np.array([x_pos, -1 + i * landmark_size])
68 | boundary_list.append(l)
69 |
70 | for y_pos in [-edge, edge]:
71 | for i in range(num_landmarks):
72 | l = Landmark()
73 | l.state.p_pos = np.array([-1 + i * landmark_size, y_pos])
74 | boundary_list.append(l)
75 |
76 | for i, l in enumerate(boundary_list):
77 | l.name = 'boundary %d' % i
78 | l.collide = True
79 | l.movable = False
80 | l.boundary = True
81 | l.color = np.array([0.75, 0.75, 0.75])
82 | l.size = landmark_size
83 | l.state.p_vel = np.zeros(world.dim_p)
84 |
85 | return boundary_list
86 |
87 |
88 | def reset_world(self, world):
89 | # random properties for agents
90 | for i, agent in enumerate(world.agents):
91 | agent.color = np.array([0.45, 0.95, 0.45]) if not agent.adversary else np.array([0.95, 0.45, 0.45])
92 | agent.color -= np.array([0.3, 0.3, 0.3]) if agent.leader else np.array([0, 0, 0])
93 | # random properties for landmarks
94 | for i, landmark in enumerate(world.landmarks):
95 | landmark.color = np.array([0.25, 0.25, 0.25])
96 | for i, landmark in enumerate(world.food):
97 | landmark.color = np.array([0.15, 0.15, 0.65])
98 | for i, landmark in enumerate(world.forests):
99 | landmark.color = np.array([0.6, 0.9, 0.6])
100 | # set random initial states
101 | for agent in world.agents:
102 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p)
103 | agent.state.p_vel = np.zeros(world.dim_p)
104 | agent.state.c = np.zeros(world.dim_c)
105 | for i, landmark in enumerate(world.landmarks):
106 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p)
107 | landmark.state.p_vel = np.zeros(world.dim_p)
108 | for i, landmark in enumerate(world.food):
109 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p)
110 | landmark.state.p_vel = np.zeros(world.dim_p)
111 | for i, landmark in enumerate(world.forests):
112 | landmark.state.p_pos = np.random.uniform(-0.9, +0.9, world.dim_p)
113 | landmark.state.p_vel = np.zeros(world.dim_p)
114 |
115 | def benchmark_data(self, agent, world):
116 | if agent.adversary:
117 | collisions = 0
118 | for a in self.good_agents(world):
119 | if self.is_collision(a, agent):
120 | collisions += 1
121 | return collisions
122 | else:
123 | return 0
124 |
125 |
126 | def is_collision(self, agent1, agent2):
127 | delta_pos = agent1.state.p_pos - agent2.state.p_pos
128 | dist = np.sqrt(np.sum(np.square(delta_pos)))
129 | dist_min = agent1.size + agent2.size
130 | return True if dist < dist_min else False
131 |
132 |
133 | # return all agents that are not adversaries
134 | def good_agents(self, world):
135 | return [agent for agent in world.agents if not agent.adversary]
136 |
137 | # return all adversarial agents
138 | def adversaries(self, world):
139 | return [agent for agent in world.agents if agent.adversary]
140 |
141 |
142 | def reward(self, agent, world):
143 | # reward is role-dependent: adversaries and good agents use different reward functions
144 | #boundary_reward = -10 if self.outside_boundary(agent) else 0
145 | main_reward = self.adversary_reward(agent, world) if agent.adversary else self.agent_reward(agent, world)
146 | return main_reward
147 |
148 | def outside_boundary(self, agent):
149 | if agent.state.p_pos[0] > 1 or agent.state.p_pos[0] < -1 or agent.state.p_pos[1] > 1 or agent.state.p_pos[1] < -1:
150 | return True
151 | else:
152 | return False
153 |
154 |
155 | def agent_reward(self, agent, world):
156 | # Good agents are rewarded for reaching food, and penalized for being caught and for leaving the arena
157 | rew = 0
158 | shape = False
159 | adversaries = self.adversaries(world)
160 | if shape:
161 | for adv in adversaries:
162 | rew += 0.1 * np.sqrt(np.sum(np.square(agent.state.p_pos - adv.state.p_pos)))
163 | if agent.collide:
164 | for a in adversaries:
165 | if self.is_collision(a, agent):
166 | rew -= 5
167 | def bound(x):
168 | if x < 0.9:
169 | return 0
170 | if x < 1.0:
171 | return (x - 0.9) * 10
172 | return min(np.exp(2 * x - 2), 10) # 1 + (x - 1) * (x - 1)
173 |
174 | for p in range(world.dim_p):
175 | x = abs(agent.state.p_pos[p])
176 | rew -= 2 * bound(x)
177 |
178 | for food in world.food:
179 | if self.is_collision(agent, food):
180 | rew += 2
181 | rew += 0.05 * min([np.sqrt(np.sum(np.square(food.state.p_pos - agent.state.p_pos))) for food in world.food])
182 |
183 | return rew
184 |
185 | def adversary_reward(self, agent, world):
186 | # Adversaries are rewarded for catching good agents, shaped by distance to the nearest one
187 | rew = 0
188 | shape = True
189 | agents = self.good_agents(world)
190 | adversaries = self.adversaries(world)
191 | if shape:
192 | rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - agent.state.p_pos))) for a in agents])
193 | #for adv in adversaries:
194 | # rew -= 0.1 * min([np.sqrt(np.sum(np.square(a.state.p_pos - adv.state.p_pos))) for a in agents])
195 | if agent.collide:
196 | for ag in agents:
197 | for adv in adversaries:
198 | if self.is_collision(ag, adv):
199 | rew += 5
200 | return rew
201 |
202 |
203 | def observation2(self, agent, world):
204 | # get positions of all entities in this agent's reference frame
205 | entity_pos = []
206 | for entity in world.landmarks: # world.entities:
207 | if not entity.boundary:
208 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
209 |
210 | food_pos = []
211 | for entity in world.food: # world.entities:
212 | if not entity.boundary:
213 | food_pos.append(entity.state.p_pos - agent.state.p_pos)
214 | # communication of all other agents
215 | comm = []
216 | other_pos = []
217 | other_vel = []
218 | for other in world.agents:
219 | if other is agent: continue
220 | comm.append(other.state.c)
221 | other_pos.append(other.state.p_pos - agent.state.p_pos)
222 | if not other.adversary:
223 | other_vel.append(other.state.p_vel)
224 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel)
225 |
226 | def observation(self, agent, world):
227 | # get positions of all entities in this agent's reference frame
228 | entity_pos = []
229 | for entity in world.landmarks: # world.entities:
230 | if not entity.boundary:
231 | entity_pos.append(entity.state.p_pos - agent.state.p_pos)
232 |
233 | in_forest = [np.array([-1]), np.array([-1])]
234 | inf1 = False
235 | inf2 = False
236 | if self.is_collision(agent, world.forests[0]):
237 | in_forest[0] = np.array([1])
238 | inf1 = True
239 | if self.is_collision(agent, world.forests[1]):
240 | in_forest[1] = np.array([1])
241 | inf2 = True
242 |
243 | food_pos = []
244 | for entity in world.food: # world.entities:
245 | if not entity.boundary:
246 | food_pos.append(entity.state.p_pos - agent.state.p_pos)
247 | # communication of all other agents
248 | comm = []
249 | other_pos = []
250 | other_vel = []
251 | for other in world.agents:
252 | if other is agent: continue
253 | comm.append(other.state.c)
254 | oth_f1 = self.is_collision(other, world.forests[0])
255 | oth_f2 = self.is_collision(other, world.forests[1])
256 | #if (inf1 and not oth_f2) or (inf2 and not oth_f1) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #with forest vis
257 | if (inf1 and oth_f1) or (inf2 and oth_f2) or (not inf1 and not oth_f1 and not inf2 and not oth_f2) or agent.leader: #without forest vis
258 | #if (in_forest == np.array([-1]) and not self.is_collision(other, world.forests[0])) or (in_forest == np.array([1]) and not self.is_collision(other, world.forests[0])) or agent.leader:
259 | other_pos.append(other.state.p_pos - agent.state.p_pos)
260 | if not other.adversary:
261 | other_vel.append(other.state.p_vel)
262 | else:
263 | other_pos.append([0, 0])
264 | if not other.adversary:
265 | other_vel.append([0, 0])
266 |
267 | # to tell the pred when the prey are in the forest
268 | prey_forest = []
269 | ga = self.good_agents(world)
270 | for a in ga:
271 | if any([self.is_collision(a, f) for f in world.forests]):
272 | prey_forest.append(np.array([1]))
273 | else:
274 | prey_forest.append(np.array([-1]))
275 | # to tell leader when pred are in forest
276 | prey_forest_lead = []
277 | for f in world.forests:
278 | if any([self.is_collision(a, f) for a in ga]):
279 | prey_forest_lead.append(np.array([1]))
280 | else:
281 | prey_forest_lead.append(np.array([-1]))
282 |
283 |
284 | #print(agent.adversary)
285 | #print(agent.leader)
286 | #print(in_forest)
287 | #print(other_pos)
288 | comm = [world.agents[0].state.c]
289 | #comm = [np.array([0, 0, 0, 0])]
290 | """
291 | # old setting
292 | if agent.adversary:
293 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + [in_forest] + comm).shape)
294 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm)
295 | else:
296 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + [in_forest] + other_vel).shape)
297 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel)
298 |
299 | # new setting
300 | """
301 | if agent.adversary and not agent.leader:
302 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + [in_forest] + comm).shape)
303 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm)
304 | if agent.leader:
305 | return np.concatenate(
306 | [agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + other_vel + in_forest + comm)
307 | else:
308 | #print(np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + [in_forest] + other_vel).shape)
309 | return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + in_forest + other_vel)
310 | #"""
311 |
312 |
--------------------------------------------------------------------------------
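A minimal sketch of instantiating this scenario on its own and querying its callbacks (assuming the package layout above; make_world() already calls reset_world, so observations and rewards can be read immediately):

from multiagent.scenarios.simple_world_comm import Scenario

scenario = Scenario()
world = scenario.make_world()                # 4 adversaries + 2 good agents

obs = [scenario.observation(a, world) for a in world.agents]
rew = [scenario.reward(a, world) for a in world.agents]
print([o.shape for o in obs])                # observation sizes differ per role
print(rew)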
/multiagent-particle-envs/multiagent/environment.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from gym import spaces
3 | from gym.envs.registration import EnvSpec
4 | import numpy as np
5 |
6 | # environment for all agents in the multiagent world
7 | # currently code assumes that no agents will be created/destroyed at runtime!
8 | class MultiAgentEnv(gym.Env):
9 | metadata = {
10 | 'render.modes' : ['human', 'rgb_array']
11 | }
12 |
13 | def __init__(self, world, reset_callback=None, reward_callback=None,
14 | observation_callback=None, info_callback=None,
15 | done_callback=None, post_step_callback=None,
16 | shared_viewer=True, discrete_action=False):
17 |
18 | self.world = world
19 | self.agents = self.world.policy_agents
20 | # set required vectorized gym env property
21 | self.n = len(world.policy_agents)
22 | # scenario callbacks
23 | self.reset_callback = reset_callback
24 | self.reward_callback = reward_callback
25 | self.observation_callback = observation_callback
26 | self.info_callback = info_callback
27 | self.done_callback = done_callback
28 | self.post_step_callback = post_step_callback
29 | # environment parameters
30 | self.discrete_action_space = discrete_action
31 | # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
32 | self.discrete_action_input = False
33 | # if true, even if the action is continuous, it will be discretized before being applied
34 | self.force_discrete_action = world.discrete_action if hasattr(world, 'discrete_action') else False
35 | # if true, every agent has the same reward
36 | self.shared_reward = False
37 | self.time = 0
38 |
39 | # configure spaces
40 | self.action_space = []
41 | self.observation_space = []
42 | for agent in self.agents:
43 | total_action_space = []
44 | # physical action space
45 | if self.discrete_action_space:
46 | u_action_space = spaces.Discrete(world.dim_p * 2 + 1)
47 | else:
48 | u_action_space = spaces.Box(low=-agent.u_range, high=+agent.u_range, shape=(world.dim_p,))
49 | if agent.movable:
50 | total_action_space.append(u_action_space)
51 | # communication action space
52 | c_action_space = spaces.Discrete(world.dim_c)
53 | if not agent.silent:
54 | total_action_space.append(c_action_space)
55 | # total action space
56 | if len(total_action_space) > 1:
57 | # all action spaces are discrete, so simplify to MultiDiscrete action space
58 | if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
59 | act_space = spaces.MultiDiscrete([[0,act_space.n-1] for act_space in total_action_space])
60 | else:
61 | act_space = spaces.Tuple(total_action_space)
62 | self.action_space.append(act_space)
63 | else:
64 | self.action_space.append(total_action_space[0])
65 | # observation space
66 | obs_dim = len(observation_callback(agent, self.world))
67 | self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,)))
68 | agent.action.c = np.zeros(self.world.dim_c)
69 |
70 | # rendering
71 | self.shared_viewer = shared_viewer
72 | if self.shared_viewer:
73 | self.viewers = [None]
74 | else:
75 | self.viewers = [None] * self.n
76 | self._reset_render()
77 |
78 | def _seed(self, seed=None):
79 | if seed is None:
80 | np.random.seed(1)
81 | else:
82 | np.random.seed(seed)
83 |
84 | def _step(self, action_n):
85 | obs_n = []
86 | reward_n = []
87 | done_n = []
88 | info_n = {'n': []}
89 | self.agents = self.world.policy_agents
90 | # set action for each agent
91 | for i, agent in enumerate(self.agents):
92 | self._set_action(action_n[i], agent, self.action_space[i])
93 | # advance world state
94 | self.world.step()
95 | # record observation for each agent
96 | for agent in self.agents:
97 | obs_n.append(self._get_obs(agent))
98 | reward_n.append(self._get_reward(agent))
99 | done_n.append(self._get_done(agent))
100 |
101 | info_n['n'].append(self._get_info(agent))
102 |
103 | # all agents get total reward in cooperative case
104 | reward = np.sum(reward_n)
105 | if self.shared_reward:
106 | reward_n = [reward] * self.n
107 | if self.post_step_callback is not None:
108 | self.post_step_callback(self.world)
109 | return obs_n, reward_n, done_n, info_n
110 |
111 | def _reset(self):
112 | # reset world
113 | self.reset_callback(self.world)
114 | # reset renderer
115 | self._reset_render()
116 | # record observations for each agent
117 | obs_n = []
118 | self.agents = self.world.policy_agents
119 | for agent in self.agents:
120 | obs_n.append(self._get_obs(agent))
121 | return obs_n
122 |
123 | # get info used for benchmarking
124 | def _get_info(self, agent):
125 | if self.info_callback is None:
126 | return {}
127 | return self.info_callback(agent, self.world)
128 |
129 | # get observation for a particular agent
130 | def _get_obs(self, agent):
131 | if self.observation_callback is None:
132 | return np.zeros(0)
133 | return self.observation_callback(agent, self.world)
134 |
135 | # get dones for a particular agent
136 | # unused right now -- agents are allowed to go beyond the viewing screen
137 | def _get_done(self, agent):
138 | if self.done_callback is None:
139 | return False
140 | return self.done_callback(agent, self.world)
141 |
142 | # get reward for a particular agent
143 | def _get_reward(self, agent):
144 | if self.reward_callback is None:
145 | return 0.0
146 | return self.reward_callback(agent, self.world)
147 |
148 | # set env action for a particular agent
149 | def _set_action(self, action, agent, action_space, time=None):
150 | agent.action.u = np.zeros(self.world.dim_p)
151 | agent.action.c = np.zeros(self.world.dim_c)
152 | # process action
153 | if isinstance(action_space, spaces.MultiDiscrete):
154 | act = []
155 | size = action_space.high - action_space.low + 1
156 | index = 0
157 | for s in size:
158 | act.append(action[index:(index+s)])
159 | index += s
160 | action = act
161 | else:
162 | action = [action]
163 |
164 | if agent.movable:
165 | # physical action
166 | if self.discrete_action_input:
167 | agent.action.u = np.zeros(self.world.dim_p)
168 | # process discrete action
169 | if action[0] == 1: agent.action.u[0] = -1.0
170 | if action[0] == 2: agent.action.u[0] = +1.0
171 | if action[0] == 3: agent.action.u[1] = -1.0
172 | if action[0] == 4: agent.action.u[1] = +1.0
173 | else:
174 | if self.force_discrete_action:
175 | d = np.argmax(action[0])
176 | action[0][:] = 0.0
177 | action[0][d] = 1.0
178 | if self.discrete_action_space:
179 | agent.action.u[0] += action[0][1] - action[0][2]
180 | agent.action.u[1] += action[0][3] - action[0][4]
181 | else:
182 | agent.action.u = action[0]
183 | sensitivity = 5.0
184 | if agent.accel is not None:
185 | sensitivity = agent.accel
186 | agent.action.u *= sensitivity
187 | action = action[1:]
188 | if not agent.silent:
189 | # communication action
190 | if self.discrete_action_input:
191 | agent.action.c = np.zeros(self.world.dim_c)
192 | agent.action.c[action[0]] = 1.0
193 | else:
194 | agent.action.c = action[0]
195 | action = action[1:]
196 | # make sure we used all elements of action
197 | assert len(action) == 0
198 |
199 | # reset rendering assets
200 | def _reset_render(self):
201 | self.render_geoms = None
202 | self.render_geoms_xform = None
203 |
204 | # render environment
205 | def _render(self, mode='human', close=True):
206 | if close:
207 | # close any existing renderers
208 | for i,viewer in enumerate(self.viewers):
209 | if viewer is not None:
210 | viewer.close()
211 | self.viewers[i] = None
212 | return []
213 |
214 | if mode == 'human':
215 | alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
216 | message = ''
217 | for agent in self.world.agents:
218 | comm = []
219 | for other in self.world.agents:
220 | if other is agent: continue
221 | if np.all(other.state.c == 0):
222 | word = '_'
223 | else:
224 | word = alphabet[np.argmax(other.state.c)]
225 | message += (other.name + ' to ' + agent.name + ': ' + word + ' ')
226 | # print(message)
227 |
228 | for i in range(len(self.viewers)):
229 | # create viewers (if necessary)
230 | if self.viewers[i] is None:
231 | # import rendering only if we need it (and don't import for headless machines)
232 | #from gym.envs.classic_control import rendering
233 | from multiagent import rendering
234 | self.viewers[i] = rendering.Viewer(700,700)
235 |
236 | # create rendering geometry
237 | if self.render_geoms is None:
238 | # import rendering only if we need it (and don't import for headless machines)
239 | #from gym.envs.classic_control import rendering
240 | from multiagent import rendering
241 | self.render_geoms = []
242 | self.render_geoms_xform = []
243 | self.comm_geoms = []
244 | for entity in self.world.entities:
245 | geom = rendering.make_circle(entity.size)
246 | xform = rendering.Transform()
247 | entity_comm_geoms = []
248 | if 'agent' in entity.name:
249 | geom.set_color(*entity.color, alpha=0.5)
250 | if not entity.silent:
251 | dim_c = self.world.dim_c
252 | # make circles to represent communication
253 | for ci in range(dim_c):
254 | comm = rendering.make_circle(entity.size / dim_c)
255 | comm.set_color(1, 1, 1)
256 | comm.add_attr(xform)
257 | offset = rendering.Transform()
258 | comm_size = (entity.size / dim_c)
259 | offset.set_translation(ci * comm_size * 2 -
260 | entity.size + comm_size, 0)
261 | comm.add_attr(offset)
262 | entity_comm_geoms.append(comm)
263 | else:
264 | geom.set_color(*entity.color)
265 | geom.add_attr(xform)
266 | self.render_geoms.append(geom)
267 | self.render_geoms_xform.append(xform)
268 | self.comm_geoms.append(entity_comm_geoms)
269 | for wall in self.world.walls:
270 | corners = ((wall.axis_pos - 0.5 * wall.width, wall.endpoints[0]),
271 | (wall.axis_pos - 0.5 * wall.width, wall.endpoints[1]),
272 | (wall.axis_pos + 0.5 * wall.width, wall.endpoints[1]),
273 | (wall.axis_pos + 0.5 * wall.width, wall.endpoints[0]))
274 | if wall.orient == 'H':
275 | corners = tuple(c[::-1] for c in corners)
276 | geom = rendering.make_polygon(corners)
277 | if wall.hard:
278 | geom.set_color(*wall.color)
279 | else:
280 | geom.set_color(*wall.color, alpha=0.5)
281 | self.render_geoms.append(geom)
282 |
283 | # add geoms to viewer
284 | for viewer in self.viewers:
285 | viewer.geoms = []
286 | for geom in self.render_geoms:
287 | viewer.add_geom(geom)
288 | for entity_comm_geoms in self.comm_geoms:
289 | for geom in entity_comm_geoms:
290 | viewer.add_geom(geom)
291 |
292 | results = []
293 | for i in range(len(self.viewers)):
294 | from multiagent import rendering
295 | # update bounds to center around agent
296 | cam_range = 1
297 | if self.shared_viewer:
298 | pos = np.zeros(self.world.dim_p)
299 | else:
300 | pos = self.agents[i].state.p_pos
301 | self.viewers[i].set_bounds(pos[0]-cam_range,pos[0]+cam_range,pos[1]-cam_range,pos[1]+cam_range)
302 | # update geometry positions
303 | for e, entity in enumerate(self.world.entities):
304 | self.render_geoms_xform[e].set_translation(*entity.state.p_pos)
305 | if 'agent' in entity.name:
306 | self.render_geoms[e].set_color(*entity.color, alpha=0.5)
307 | if not entity.silent:
308 | for ci in range(self.world.dim_c):
309 | color = 1 - entity.state.c[ci]
310 | self.comm_geoms[e][ci].set_color(color, color, color)
311 | else:
312 | self.render_geoms[e].set_color(*entity.color)
313 | # render to display or array
314 | results.append(self.viewers[i].render(return_rgb_array = mode=='rgb_array'))
315 |
316 | return results
317 |
318 | # create receptor field locations in local coordinate frame
319 | def _make_receptor_locations(self, agent):
320 | receptor_type = 'polar'
321 | range_min = 0.05 * 2.0
322 | range_max = 1.00
323 | dx = []
324 | # circular receptive field
325 | if receptor_type == 'polar':
326 | for angle in np.linspace(-np.pi, +np.pi, 8, endpoint=False):
327 | for distance in np.linspace(range_min, range_max, 3):
328 | dx.append(distance * np.array([np.cos(angle), np.sin(angle)]))
329 | # add origin
330 | dx.append(np.array([0.0, 0.0]))
331 | # grid receptive field
332 | if receptor_type == 'grid':
333 | for x in np.linspace(-range_max, +range_max, 5):
334 | for y in np.linspace(-range_max, +range_max, 5):
335 | dx.append(np.array([x,y]))
336 | return dx
337 |
338 |
339 | # vectorized wrapper for a batch of multi-agent environments
340 | # assumes all environments have the same observation and action space
341 | class BatchMultiAgentEnv(gym.Env):
342 | metadata = {
343 | 'runtime.vectorized': True,
344 | 'render.modes' : ['human', 'rgb_array']
345 | }
346 |
347 | def __init__(self, env_batch):
348 | self.env_batch = env_batch
349 |
350 | @property
351 | def n(self):
352 | return np.sum([env.n for env in self.env_batch])
353 |
354 | @property
355 | def action_space(self):
356 | return self.env_batch[0].action_space
357 |
358 | @property
359 | def observation_space(self):
360 | return self.env_batch[0].observation_space
361 |
362 | def _step(self, action_n, time):
363 | obs_n = []
364 | reward_n = []
365 | done_n = []
366 | info_n = {'n': []}
367 | i = 0
368 | for env in self.env_batch:
369 | obs, reward, done, _ = env.step(action_n[i:(i+env.n)], time)
370 | i += env.n
371 | obs_n += obs
372 | # reward = [r / len(self.env_batch) for r in reward]
373 | reward_n += reward
374 | done_n += done
375 | return obs_n, reward_n, done_n, info_n
376 |
377 | def _reset(self):
378 | obs_n = []
379 | for env in self.env_batch:
380 | obs_n += env.reset()
381 | return obs_n
382 |
383 | # render environment
384 | def _render(self, mode='human', close=True):
385 | results_n = []
386 | for env in self.env_batch:
387 | results_n += env.render(mode, close)
388 | return results_n
389 |
--------------------------------------------------------------------------------
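A minimal standalone sketch (not part of the repository) of the polar receptor layout that _make_receptor_locations builds above: 8 angles times 3 radii plus the origin, all expressed in the agent's local frame. The variable names below are illustrative only.

import numpy as np

range_min, range_max = 0.05 * 2.0, 1.00
offsets = [dist * np.array([np.cos(angle), np.sin(angle)])
           for angle in np.linspace(-np.pi, np.pi, 8, endpoint=False)
           for dist in np.linspace(range_min, range_max, 3)]
offsets.append(np.array([0.0, 0.0]))  # origin receptor
assert len(offsets) == 25             # 8 angles * 3 radii + origin
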
/experiments/ibmac.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import tensorflow as tf
4 | import maddpg.common.tf_util as U
5 |
6 | from maddpg.common.distributions import make_pdtype
7 | from maddpg import AgentTrainer
8 | from maddpg.trainer.replay_buffer import ReplayBuffer
9 |
10 | import itertools
11 |
12 |
13 | def discount_with_dones(rewards, dones, gamma):
14 | discounted = []
15 | r = 0
16 | for reward, done in zip(rewards[::-1], dones[::-1]):
17 | r = reward + gamma * r
18 | r = r * (1. - done)
19 | discounted.append(r)
20 | return discounted[::-1]
21 |
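# Annotation (not part of the repository): a quick check of discount_with_dones
# above. Because (1. - done) is applied after the reward has been added, the
# step where done == 1 has its entire return zeroed, not just its bootstrapped
# tail:  discount_with_dones([1, 1, 1], [0, 0, 1], 0.9) == [1.9, 1.0, 0.0]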
22 | def clip_message(message, clip_threshold, is_norm_training, is_inference):
23 |
24 | gamma = tf.Variable(clip_threshold * tf.ones(message.shape[-1]), name='clip_gamma')
25 | beta = tf.Variable(tf.zeros(message.shape[-1]), name='clip_beta')
26 |
27 | pop_mean = tf.Variable(tf.zeros(message.shape[-1]), trainable=False, name='pop_mean')
28 | pop_variance = tf.Variable(tf.ones(message.shape[-1]), trainable=False, name='pop_variance')
29 |
30 | epsilon = 1e-8
31 |
32 | def batch_norm_training():
33 | batch_mean, batch_variance = tf.nn.moments(message, [0])
34 |
35 | decay = 0.999
36 | train_mean = tf.assign(pop_mean, pop_mean*decay + batch_mean*(1 - decay), name='train_mean')
37 | train_variance = tf.assign(pop_variance, pop_variance*decay + batch_variance*(1 - decay), name='train_variance')
38 |
39 | with tf.control_dependencies([train_mean, train_variance]):
40 | return tf.nn.batch_normalization(message, batch_mean, batch_variance, batch_mean, tf.math.sqrt(batch_variance), epsilon, name='train_clip_message')
41 |
42 | def batch_norm_inference():
43 | return tf.nn.batch_normalization(message, pop_mean, pop_variance, beta, gamma, epsilon, name='inference_clip_message')
44 |
45 | def batch_direct_act():
46 | return message
47 |
48 | batch_normalized_output = tf.case({is_norm_training: batch_norm_training, is_inference: batch_norm_inference},
49 | default=batch_direct_act, exclusive=True)
50 |
51 | return batch_normalized_output
52 |
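# Annotation (not part of the repository): clip_message selects one of three
# branches via tf.case. With is_norm_training set, batch moments are taken over
# axis 0, the running pop_mean / pop_variance are updated with decay 0.999, and
# the message is re-normalized with offset=batch_mean, scale=sqrt(batch_variance),
# which approximately reconstructs the input while refreshing the moving
# statistics. With is_inference set, the message is normalized with the
# population statistics and the learnable beta / gamma are applied (gamma is
# initialized to clip_threshold). Otherwise the message passes through unchanged.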
53 |
54 |
55 | def make_update_exp(vals, target_vals):
56 | polyak = 1.0 - 1e-2
57 | expression = []
58 | for var, var_target in zip(sorted(vals, key=lambda v: v.name), sorted(target_vals, key=lambda v: v.name)):
59 | expression.append(var_target.assign(polyak * var_target + (1.0 - polyak) * var))
60 | expression = tf.group(*expression)
61 | return U.function([], [], updates=[expression])
62 |
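# Annotation (not part of the repository): make_update_exp builds the usual
# Polyak-averaged target update, theta_target <- 0.99 * theta_target + 0.01 * theta,
# pairing source and target variables by sorted name.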
63 |
64 | def p_train(make_obs_ph_n, act_space_n, before_com_func, channel, after_com_func, q_func, optimizer,
65 | grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None, beta=0.01,
66 | ibmac_com=True):
67 | with tf.variable_scope(scope, reuse=reuse):
68 | clip_threshold = 1 # 1, 5, 10
69 | is_norm_training = tf.placeholder(tf.bool)
70 | is_inference = tf.placeholder(tf.bool)
71 |
72 |
73 | ibmac_nocom = not ibmac_com
74 | num_agents = len(make_obs_ph_n)
75 |
76 |         # create distributions
77 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
78 |
79 | # set up placeholders
80 | obs_ph_n = make_obs_ph_n
81 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(num_agents)]
82 |
83 | hiddens_n = [before_com_func(obs_ph_n[i], num_units, scope="before_com_{}".format(i), num_units=num_units) for i
84 | in range(num_agents)]
85 | before_com_vars_n = [U.scope_vars(U.absolute_scope_name("before_com_{}".format(i))) for i in range(num_agents)]
86 |
87 | hiddens_n_for_message = tf.concat(
88 | [before_com_func(obs_ph_n[i], num_units, scope="before_com_{}".format(i), reuse=True, num_units=num_units)
89 | for i in range(num_agents)], axis=1)
90 | hiddens_n_for_message = tf.stop_gradient(hiddens_n_for_message)
91 | channel_output = channel(hiddens_n_for_message, num_units * num_agents, scope="channel",
92 | num_units=num_units * num_agents)
93 | message_n, mu_message_n, logvar_message_n = [tf.split(item, num_or_size_splits=num_agents, axis=1) for item in
94 | channel_output]
95 | logvar_message_n = [tf.clip_by_value(log, -10, 10) for log in logvar_message_n] # constrain kl_loss not to be too large
96 |
97 |
98 | message_n = [clip_message(message, clip_threshold, is_norm_training, is_inference) for message in message_n]
99 |
100 | channel_vars_n = [U.scope_vars(U.absolute_scope_name("channel"))]
101 |
102 | if ibmac_nocom:
103 | print('no_com')
104 | p_n = [after_com_func(hiddens_n[i], int(act_pdtype_n[i].param_shape()[0]), scope="p_func_{}".format(i),
105 | num_units=num_units) for i in range(num_agents)]
106 | else:
107 | check_n = [hiddens_n[i] + message_n[i] for i in range(num_agents)]
108 | p_n = [after_com_func(hiddens_n[i] + message_n[i], int(act_pdtype_n[i].param_shape()[0]),
109 | scope="p_func_{}".format(i), num_units=num_units) for i in range(num_agents)]
110 | p_func_vars = [U.scope_vars(U.absolute_scope_name("p_func_{}".format(i))) for i in range(num_agents)]
111 |
112 | # wrap parameters in distribution
113 | act_pd_n = [act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)]
114 |
115 | act_sample_n = [act_pd.sample() for act_pd in act_pd_n]
116 | p_reg_n = [tf.reduce_mean(tf.square(act_pd.flatparam())) for act_pd in act_pd_n]
117 |
118 | act_input_n_n = [act_ph_n + [] for _ in range(num_agents)]
119 | for i in range(num_agents):
120 | act_input_n_n[i][i] = act_pd_n[i].sample()
121 | q_input_n = [tf.concat(obs_ph_n + act_input_n, 1) for act_input_n in act_input_n_n]
122 |
123 | q_n = [q_func(q_input_n[i], 1, scope="q_func_{}".format(i), reuse=True, num_units=num_units)[:, 0] for i in
124 | range(num_agents)]
125 | pg_loss_n = [-tf.reduce_mean(q) for q in q_n]
126 |
127 | # # 0.25
128 | # kl_loss_message_n = [2 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(0.5) - 0.5 for mu, log in
129 | # zip(mu_message_n, logvar_message_n)]
130 |
131 | # #1
132 | # kl_loss_message_n = [0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5 for mu, log in
133 | # zip(mu_message_n, logvar_message_n)]
134 | # #5
135 | # kl_loss_message_n = [1.0/50 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(5) - 0.5 for mu, log in
136 | # zip(mu_message_n, logvar_message_n)]
137 | #10
138 | kl_loss_message_n = [1.0/200 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log + np.log(10) - 0.5 for mu, log in
139 | zip(mu_message_n, logvar_message_n)]
140 |
141 |         entropy = [log + 1.4189 for log in logvar_message_n]  # per-dim Gaussian entropy: ln(sigma) + 0.5*ln(2*pi*e) ~= ln(sigma) + 1.4189
142 |
143 | pg_loss = tf.reduce_sum(pg_loss_n)
144 | p_reg = tf.reduce_sum(p_reg_n)
145 | kl_loss_message = tf.reduce_mean(kl_loss_message_n)
146 |
147 | if ibmac_nocom:
148 | loss = pg_loss + p_reg * 1e-3
149 | else:
150 | loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message
151 |
152 | kl_loss = U.function(inputs=obs_ph_n + act_ph_n+[is_norm_training, is_inference], outputs=kl_loss_message)
153 |
154 | var_list = []
155 | var_list.extend(before_com_vars_n)
156 | if not ibmac_nocom:
157 | var_list.extend(channel_vars_n)
158 | var_list.extend(p_func_vars)
159 | var_list = list(itertools.chain(*var_list))
160 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping)
161 |
162 | # Create callable functions
163 | train = U.function(inputs=obs_ph_n + act_ph_n+[is_norm_training, is_inference], outputs=loss, updates=[optimize_expr])
164 | act = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=act_sample_n)
165 | p_values = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=p_n)
166 | if not ibmac_nocom:
167 | check_values = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=check_n)
168 | channel_com = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=channel_output)
169 | check_mu = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=mu_message_n)
170 | check_log = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=logvar_message_n)
171 | else:
172 | check_values = lambda x: 0
173 | channel_com = lambda x: 0
174 | check_mu = lambda x: 0
175 | check_log = lambda x: 0
176 |
177 | # target network
178 | target_hiddens_n = [
179 | before_com_func(obs_ph_n[i], num_units, scope="target_before_com_{}".format(i), num_units=num_units) for i
180 | in range(num_agents)]
181 | target_before_com_vars = [U.scope_vars(U.absolute_scope_name("target_before_com_{}".format(i))) for i in
182 | range(num_agents)]
183 |
184 | target_hiddens_n_for_message = tf.concat([before_com_func(obs_ph_n[i], num_units,
185 | scope="target_before_com_{}".format(i), reuse=True,
186 | num_units=num_units) for i in range(num_agents)],
187 | axis=1)
188 | target_hiddens_n_for_message = tf.stop_gradient(target_hiddens_n_for_message)
189 | target_channel_output = channel(target_hiddens_n_for_message, num_units * num_agents, scope="target_channel",
190 | num_units=num_units * num_agents)
191 | target_message_n, target_mu_message_n, target_logvar_message_n = [
192 | tf.split(item, num_or_size_splits=num_agents, axis=1) for item in target_channel_output]
193 | target_channel_vars = [U.scope_vars(U.absolute_scope_name("target_channel"))]
194 | if ibmac_nocom:
195 | target_p_n = [after_com_func(target_hiddens_n[i], int(act_pdtype_n[i].param_shape()[0]),
196 | scope="target_p_func_{}".format(i), num_units=num_units) for i in
197 | range(num_agents)]
198 | else:
199 | target_p_n = [
200 | after_com_func(target_hiddens_n[i] + target_message_n[i], int(act_pdtype_n[i].param_shape()[0]),
201 | scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents)]
202 | # target_p_n = [after_com_func(tf.concat([target_hiddens_n[i],target_message_n[i]], axis=1), int(act_pdtype_n[i].param_shape()[0]), scope="target_p_func_{}".format(i), num_units=num_units) for i in range(num_agents)]
203 | target_p_func_vars = [U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i))) for i in
204 | range(num_agents)]
205 |
206 | target_var_list = []
207 | target_var_list.extend(target_before_com_vars)
208 | if not ibmac_nocom:
209 | target_var_list.extend(target_channel_vars)
210 | target_var_list.extend(target_p_func_vars)
211 | target_var_list = list(itertools.chain(*target_var_list))
212 | update_target_p = make_update_exp(var_list, target_var_list)
213 |
214 | target_act_sample_n = [act_pdtype_n[i].pdfromflat(target_p_n[i]).sample() for i in range(num_agents)]
215 | target_act = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=target_act_sample_n)
216 |
217 |
218 | check_message_n = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=message_n)
219 | check_hiddens_n = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=hiddens_n)
220 | check_entropy = U.function(inputs=obs_ph_n+[is_norm_training, is_inference], outputs=entropy)
221 |
222 | return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act, 'kl_loss': kl_loss,
223 | 'check_values': check_values, 'channel_com': channel_com,
224 | 'check_mu': check_mu, 'check_log': check_log,
225 | 'check_message_n':check_message_n, 'check_hiddens_n': check_hiddens_n,
226 | 'check_entropy': check_entropy}
227 |
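# Annotation (not part of the repository): dataflow inside p_train above. Each
# observation is encoded by before_com_func into a per-agent hidden state; the
# concatenated hiddens (gradients stopped) feed the shared channel, whose output
# is split into per-agent (message, mu, logvar) triples; messages go through
# clip_message and are added to the hiddens before after_com_func produces the
# action-distribution parameters. The actor loss is the policy-gradient term
# plus 1e-3 times an L2 penalty on the flattened distribution parameters and,
# when communication is enabled, beta times the KL regularizer on the messages.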
228 |
229 | def q_train(make_obs_ph_n, act_space_n, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer",
230 | reuse=None, num_units=64):
231 | with tf.variable_scope(scope, reuse=reuse):
232 | num_agents = len(make_obs_ph_n)
233 |
234 |         # create distributions
235 | act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
236 |
237 | # set up placeholders
238 | obs_ph_n = make_obs_ph_n
239 | act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action_{}".format(i)) for i in
240 | range(len(act_space_n))]
241 | target_ph_n = [tf.placeholder(tf.float32, [None], name="target_{}".format(i)) for i in range(num_agents)]
242 | is_norm_training = tf.placeholder(tf.bool)
243 | is_inference = tf.placeholder(tf.bool)
244 |
245 | q_input = tf.concat(obs_ph_n + act_ph_n, 1)
246 | q_n = [q_func(q_input, 1, scope="q_func_{}".format(i), num_units=num_units)[:, 0] for i in range(num_agents)]
247 | q_func_vars = [U.scope_vars(U.absolute_scope_name("q_func_{}".format(i))) for i in range(num_agents)]
248 |
249 | q_loss_n = [tf.reduce_mean(tf.square(q - target_ph)) for q, target_ph in zip(q_n, target_ph_n)]
250 |
251 | # viscosity solution to Bellman differential equation in place of an initial condition
252 | # q_reg = tf.reduce_mean(tf.square(q))
253 | q_loss = tf.reduce_sum(q_loss_n)
254 | loss = q_loss # + 1e-3 * q_reg
255 |
256 | var_list = list(itertools.chain(*q_func_vars))
257 | optimize_expr = U.minimize_and_clip(optimizer, loss, var_list, grad_norm_clipping)
258 |
259 | # Create callable functions
260 | train = U.function(inputs=obs_ph_n + act_ph_n + target_ph_n+[is_norm_training, is_inference], outputs=loss, updates=[optimize_expr])
261 | q_values = U.function(obs_ph_n + act_ph_n+[is_norm_training, is_inference], q_n)
262 |
263 | # target network
264 | target_q_n = [q_func(q_input, 1, scope="target_q_func_{}".format(i), num_units=num_units)[:, 0] for i in
265 | range(num_agents)]
266 | target_q_func_vars = [U.scope_vars(U.absolute_scope_name("target_q_func_{}".format(i))) for i in
267 | range(num_agents)]
268 |
269 |         target_var_list = list(itertools.chain(*target_q_func_vars))
270 |         update_target_q = make_update_exp(var_list, target_var_list)
271 |
272 | target_q_values = U.function(obs_ph_n + act_ph_n+[is_norm_training, is_inference], target_q_n)
273 |
274 | return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
275 |
276 |
277 | class IBMACAgentTrainer(AgentTrainer):
278 | def __init__(self, name, before_com_model, channel, after_com_model, critic_mlp_model, obs_shape_n, act_space_n,
279 | args, local_q_func=False):
280 | self.name = name
281 | self.n = len(obs_shape_n)
282 | self.args = args
283 | obs_ph_n = []
284 | for i in range(self.n):
285 | obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get())
286 |
287 | # Create all the functions necessary to train the model
288 | self.q_train, self.q_update, self.q_debug = q_train(
289 | scope=self.name,
290 | make_obs_ph_n=obs_ph_n,
291 | act_space_n=act_space_n,
292 | q_func=critic_mlp_model,
293 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
294 | grad_norm_clipping=0.5,
295 | local_q_func=local_q_func,
296 | num_units=args.num_units,
297 | )
298 | self.act, self.p_train, self.p_update, self.p_debug = p_train(
299 | scope=self.name,
300 | make_obs_ph_n=obs_ph_n,
301 | act_space_n=act_space_n,
302 | before_com_func=before_com_model,
303 | channel=channel,
304 | after_com_func=after_com_model,
305 | q_func=critic_mlp_model,
306 | optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
307 | grad_norm_clipping=0.5,
308 | local_q_func=local_q_func,
309 | num_units=args.num_units,
310 | beta=args.beta,
311 | ibmac_com=args.ibmac_com,
312 | )
313 | # Create experience buffer
314 | self.replay_buffer = ReplayBuffer(1e6)
315 | # self.max_replay_buffer_len = 50 * args.max_episode_len
316 | self.max_replay_buffer_len = args.batch_size * args.max_episode_len
317 | self.replay_sample_index = None
318 |
319 | self.message_1_for_record = []
320 |
321 | def action(self, obs_n, is_norm_training=False, is_inference=False):
322 | obs = [obs[None] for obs in obs_n]
323 | message_n = self.p_debug['check_message_n'](*(list(obs)+[is_norm_training, is_inference]))
324 | self.message_1_for_record.append(message_n[0])
325 | if len(self.message_1_for_record)%2500 == 0:
326 | # print(np.var(self.message_1_for_record, axis=0))
327 | # print(0.5 * np.log(2 * np.pi * np.mean(np.var(self.message_1_for_record, axis=0))) + 0.5)
328 | self.message_1_for_record = []
329 | return self.act(*(list(obs)+[is_norm_training, is_inference]))
330 |
331 | def experience(self, obs, act, rew, new_obs, done, terminal):
332 | # Store transition in the replay buffer.
333 | self.replay_buffer.add(obs, act, rew, new_obs, [float(d) for d in done])
334 |
335 | def preupdate(self):
336 | self.replay_sample_index = None
337 |
338 | def update(self, agents, t):
339 | if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough
340 | return
341 | if not t % 100 == 0: # only update every 100 steps
342 | return
343 | is_norm_training = True
344 | is_inference = False
345 | self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
346 | # collect replay sample from all agents
347 | obs_n = []
348 | obs_next_n = []
349 | act_n = []
350 | index = self.replay_sample_index
351 | samples = self.replay_buffer.sample_index(index)
352 | obs_n, act_n, rew_n, obs_next_n, done_n = [np.swapaxes(item, 0, 1) for item in samples]
353 | # for i in range(self.n):
354 | # obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
355 | # obs_n.append(obs)
356 | # obs_next_n.append(obs_next)
357 | # act_n.append(act)
358 | # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
359 |
360 | # train q network
361 | num_sample = 1
362 | target_q = 0.0
363 | # print(len(obs_next_n))
364 | for i in range(num_sample):
365 | target_act_next_n = self.p_debug['target_act'](*(list(obs_next_n)+[is_norm_training, is_inference]))
366 | target_q_next_n = self.q_debug['target_q_values'](*(list(obs_next_n) + list(target_act_next_n)+[is_norm_training, is_inference]))
367 | target_q_n = [rew + self.args.gamma * (1.0 - done) * target_q_next for rew, done, target_q_next in
368 | zip(rew_n, done_n, target_q_next_n)]
369 | target_q_n = [target_q / num_sample for target_q in target_q_n]
370 | q_loss = self.q_train(*(list(obs_n) + list(act_n) + target_q_n + [is_norm_training, is_inference]))
371 |
372 | # train p network
373 | p_loss = self.p_train(*(list(obs_n) + list(act_n)+[is_norm_training, is_inference]))
374 |
375 | self.p_update()
376 | self.q_update()
377 |
378 | # p_values = self.p_debug['p_values'](*(list(obs_n)))
379 | kl_loss = self.p_debug['kl_loss'](*(list(obs_n) + list(act_n)+[is_norm_training, is_inference]))
380 | # print('kl_loss', self.p_debug['kl_loss'](*(list(obs_n) + list(act_n))))
381 | # if t % 5000 == 0:
382 | # print('p_values', p_values[0][0])
383 | # print('check_value', self.p_debug['p_values'](*(list(obs_n)))[0][0])
384 | # print('check_mu', self.p_debug['check_mu'](*(list(obs_n)))[0][0])
385 | # print('check_log', self.p_debug['check_log'](*(list(obs_n)))[0][0])
386 |
387 | # print('kl_loss', kl_loss)
388 | # message_n = self.p_debug['check_message_n'](*(list(obs_n)+[is_norm_training, is_inference]))
389 | # hiddens_n = self.p_debug['check_hiddens_n'](*list(obs_n))
390 | # print("message_n", message_n[0][0])
391 | # for message in message_n:
392 | # print("mean, var", np.mean(message, axis=0), np.var(message,axis=0))
393 | # print("hiddens_n", hiddens_n[0][0])
394 | # entropy = self.p_debug['check_entropy'](*list(obs_n))
395 | # print("entropy",np.mean(entropy, (1,2)))
396 |
397 | return [q_loss, p_loss, np.mean(target_q), np.mean(rew_n), np.mean(target_q_next_n), np.std(target_q), kl_loss]
398 |
--------------------------------------------------------------------------------
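
The KL regularizer in p_train (the uncommented variant, prior standard deviation 10; the commented-out variants correspond to priors of 0.5, 1 and 5) matches the closed-form KL divergence between N(mu, sigma^2) and N(0, 10^2) elementwise, provided the tensors named logvar_message_n are read as log standard deviations. A small NumPy check of that identity, as a sketch under that assumption and not part of the repository:

import numpy as np

rng = np.random.default_rng(0)
mu = rng.normal(size=5)
log_sigma = rng.normal(size=5)  # plays the role of logvar_message_n
sigma = np.exp(log_sigma)

# expression used in p_train:  1/200 * (mu^2 + exp(log)^2) - log + log(10) - 0.5
code_kl = (mu ** 2 + sigma ** 2) / 200.0 - log_sigma + np.log(10) - 0.5
# closed form: KL( N(mu, sigma^2) || N(0, 10^2) ), per dimension
analytic_kl = np.log(10.0 / sigma) + (sigma ** 2 + mu ** 2) / (2 * 10.0 ** 2) - 0.5
assert np.allclose(code_kl, analytic_kl)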