├── train ├── maddpg-v3 │ ├── =2.0.0 │ ├── env │ │ ├── __init__.py │ │ ├── multiagent_particle_env.py │ │ └── wrapper.py │ └── main.py ├── maddpg-v2 │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── noise.py │ │ ├── networks.py │ │ ├── make_env.py │ │ ├── agents.py │ │ ├── misc.py │ │ ├── buffer.py │ │ └── env_wrappers.py │ ├── algorithms │ │ └── __init__.py │ ├── assets │ │ ├── predator_prey │ │ │ ├── 1.gif │ │ │ ├── 2.gif │ │ │ └── 3.gif │ │ ├── physical_deception │ │ │ ├── 1.gif │ │ │ ├── 2.gif │ │ │ └── 3.gif │ │ └── cooperative_communication │ │ │ ├── 1.gif │ │ │ ├── 2.gif │ │ │ └── 3.gif │ ├── evaluate.py │ └── main.py ├── maddpg-v1 │ ├── .gitignore │ ├── agent.py │ ├── main.py │ ├── maddpg │ │ ├── actor_critic.py │ │ └── maddpg.py │ ├── common │ │ ├── utils.py │ │ ├── replay_buffer.py │ │ └── arguments.py │ └── runner.py ├── ddpg │ ├── models │ │ └── dqn.pth │ └── test.py ├── mappo │ ├── train_formation.sh │ ├── inbox │ │ ├── train_formation.sh │ │ ├── render_formation.sh │ │ ├── render_formation.py │ │ └── train_formation.py │ └── train_formation.py ├── maddpg-v4 │ ├── parameters.yaml │ ├── train.py │ └── utils.py └── maddpg-v5 │ ├── render.py │ ├── train.py │ └── config.py ├── formation_gym ├── inbox │ ├── scenario.py │ ├── core.py │ └── rendering.py ├── scenario.py ├── policy.py ├── multi_discrete.py ├── envs │ ├── basic_formation_env.py │ ├── formation_hd_partial_range_env.py │ ├── formation_hd_partial_env.py │ ├── formation_hd_env.py │ └── formation_hd_obs_env.py ├── __init__.py └── rendering.py ├── setup.py ├── test.py ├── .gitignore └── README.md /train/maddpg-v3/=2.0.0: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v2/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v1/.gitignore: -------------------------------------------------------------------------------- 1 | model_1/ 2 | model_2/ 3 | __pychache__/ -------------------------------------------------------------------------------- /train/ddpg/models/dqn.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/ddpg/models/dqn.pth -------------------------------------------------------------------------------- /train/maddpg-v2/assets/predator_prey/1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/predator_prey/1.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/predator_prey/2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/predator_prey/2.gif -------------------------------------------------------------------------------- 
/train/maddpg-v2/assets/predator_prey/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/predator_prey/3.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/physical_deception/1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/physical_deception/1.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/physical_deception/2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/physical_deception/2.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/physical_deception/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/physical_deception/3.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/cooperative_communication/1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/cooperative_communication/1.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/cooperative_communication/2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/cooperative_communication/2.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/cooperative_communication/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/cooperative_communication/3.gif -------------------------------------------------------------------------------- /train/maddpg-v3/env/__init__.py: -------------------------------------------------------------------------------- 1 | from .multiagent_particle_env import RLlibMultiAgentParticleEnv as MultiAgentParticleEnv 2 | from .wrapper import FormationEnv 3 | 4 | __all__ = [ 5 | "MultiAgentParticleEnv", 6 | "FormationEnv" 7 | ] 8 | -------------------------------------------------------------------------------- /formation_gym/inbox/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /formation_gym/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 
9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | def info(self, agent, world): 12 | return {} 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from pathlib import Path 3 | 4 | setup( 5 | name='formation_gym', 6 | author="Chaoyi Pan", 7 | author_email="pcy19@mails.tsinghua.edu.cn", 8 | version='0.0.1', 9 | description="An OpenAI Gym Env for Formation", 10 | long_description=Path("README.md").read_text(), 11 | classifiers=[ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: MIT License", 14 | "Operating System :: OS Independent", 15 | ], 16 | python_requires='>=3.6' 17 | ) 18 | -------------------------------------------------------------------------------- /train/mappo/train_formation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | env="MPE" 3 | scenario="formation_hd_env" # simple_speaker_listener # simple_reference 4 | num_agents=3 5 | algo="rmappo" 6 | exp="check" 7 | seed_max=1 8 | 9 | echo "env is ${env}, scenario is ${scenario}, algo is ${algo}, exp is ${exp}, max seed is ${seed_max}" 10 | for seed in `seq ${seed_max}`; 11 | do 12 | echo "seed is ${seed}:" 13 | python train_formation.py --use_valuenorm --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 128 --num_mini_batch 1 --episode_length 25 --num_env_steps 20000000 --ppo_epoch 10 --use_ReLU --gain 0.01 --lr 7e-4 --critic_lr 7e-4 --wandb_name "jc-bao" --user_name "jc-bao" 14 | done -------------------------------------------------------------------------------- /train/maddpg-v2/utils/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # from https://github.com/songrotek/DDPG/blob/master/ou_noise.py 5 | class OUNoise: 6 | def __init__(self, action_dimension, scale=0.1, mu=0, theta=0.15, sigma=0.2): 7 | self.action_dimension = action_dimension 8 | self.scale = scale 9 | self.mu = mu 10 | self.theta = theta 11 | self.sigma = sigma 12 | self.state = np.ones(self.action_dimension) * self.mu 13 | self.reset() 14 | 15 | def reset(self): 16 | self.state = np.ones(self.action_dimension) * self.mu 17 | 18 | def noise(self): 19 | x = self.state 20 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x)) 21 | self.state = x + dx 22 | return self.state * self.scale 23 | -------------------------------------------------------------------------------- /train/mappo/inbox/train_formation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | env="MPE" 3 | scenario="formation_hd_env" # simple_speaker_listener # simple_reference 4 | num_agents=3 5 | algo="rmappo" 6 | exp="check" 7 | seed_max=1 8 | 9 | echo "env is ${env}, scenario is ${scenario}, algo is ${algo}, exp is ${exp}, max seed is ${seed_max}" 10 | for seed in `seq ${seed_max}`; 11 | do 12 | echo "seed is ${seed}:" 13 | python train_formation.py --use_valuenorm --use_popart --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --num_landmarks ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 128 --num_mini_batch 1 --episode_length 25 --num_env_steps 20000000 --ppo_epoch 10
--use_ReLU --gain 0.01 --lr 7e-4 --critic_lr 7e-4 --user_name "chaoyi" 14 | done -------------------------------------------------------------------------------- /train/mappo/inbox/render_formation.sh: -------------------------------------------------------------------------------- 1 | env="MPE" 2 | scenario="formation_hd_env" 3 | num_agents=3 4 | algo="rmappo" 5 | exp="render" 6 | seed_max=1 7 | 8 | echo "env is ${env}" 9 | for seed in `seq ${seed_max}` 10 | do 11 | # CUDA_VISIBLE_DEVICES=1 python render_formation.py --save_gifs --share_policy --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 1 --use_render --episode_length 25 --render_episodes 5 --model_dir "./results/MPE/formation_hd_env/rmappo/check/run15/models" 12 | python render_formation.py --save_gifs --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 1 --use_render --episode_length 50 --render_episodes 5 --model_dir "/Users/reedpan/Desktop/Research/gym_formation/train/mappo/results/MPE/formation_hd_env/rmappo/hd_3/run1/models" --gif_dir './results/gif' 13 | done -------------------------------------------------------------------------------- /train/maddpg-v1/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import os 4 | from maddpg.maddpg import MADDPG 5 | 6 | 7 | class Agent: 8 | def __init__(self, agent_id, args): 9 | self.args = args 10 | self.agent_id = agent_id 11 | self.policy = MADDPG(args, agent_id) 12 | 13 | def select_action(self, o, noise_rate, epsilon): 14 | if np.random.uniform() < epsilon: 15 | u = np.random.uniform(-self.args.high_action, self.args.high_action, self.args.action_shape[self.agent_id]) 16 | else: 17 | inputs = torch.tensor(o, dtype=torch.float32).unsqueeze(0) 18 | pi = self.policy.actor_network(inputs).squeeze(0) 19 | u = pi.cpu().numpy() 20 | noise = noise_rate * self.args.high_action * np.random.randn(*u.shape) # gaussian noise 21 | u += noise 22 | u = np.clip(u, -self.args.high_action, self.args.high_action) 23 | return u.copy() 24 | 25 | def learn(self, transitions, other_agents): 26 | self.policy.train(transitions, other_agents) 27 | 28 | -------------------------------------------------------------------------------- /train/maddpg-v1/main.py: -------------------------------------------------------------------------------- 1 | from runner import Runner 2 | from common.arguments import get_args 3 | from common.utils import make_env 4 | import numpy as np 5 | import random 6 | import torch 7 | import formation_gym 8 | 9 | ''' 10 | action = [0.1, 0.2, 0.4, 0.1, 0.2] 11 | ''' 12 | 13 | if __name__ == '__main__': 14 | # get the params 15 | args = get_args() 16 | env = formation_gym.make_env(args.scenario_name, benchmark = False, num_agents = args.num_agents) 17 | args.n_agents = args.num_agents 18 | args.n_players = 0 19 | args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)] # 每一维代表该agent的obs维度 20 | action_shape = [] 21 | for content in env.action_space: 22 | action_shape.append(content.shape[0]) 23 | args.action_shape = action_shape[:args.n_agents] # 每一维代表该agent的act维度 24 | args.high_action = 1 25 | args.low_action = -1 26 | runner = Runner(args, env) 27 | if args.evaluate: 28 | returns = runner.evaluate(True) 29 | print('Average 
returns is', returns) 30 | else: 31 | runner.run() -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | import formation_gym 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser(description=None) 8 | parser.add_argument('-s', '--scenario', default='formation_hd_env', help='Path of the scenario Python script.') 9 | parser.add_argument('-n', '--num-agents', type=int, default=3, help='Number of agents') 10 | parser.add_argument('-r', '--random', action='store_true', help='If use random policy.') 11 | parser.add_argument('--num-layer', type=int, default = 1, help = 'use hierachy policy to control') 12 | args = parser.parse_args() 13 | 14 | env = formation_gym.make_env(args.scenario, benchmark=False, num_agents = args.num_agents**args.num_layer) 15 | obs_n = env.reset() 16 | total_num_agents = args.num_agents**args.num_layer 17 | while True: 18 | # random policy 19 | if args.random: 20 | act_n = [space.sample() for space in env.action_space] 21 | # demo policy 22 | else: 23 | act_n = formation_gym.get_action_BFS(formation_gym.ezpolicy, obs_n, args.num_agents) 24 | # step environment 25 | obs_n, reward_n, done_n, _ = env.step(act_n) 26 | if np.all(done_n): 27 | obs_n = env.reset() 28 | # render all agent views 29 | env.render() -------------------------------------------------------------------------------- /train/maddpg-v1/maddpg/actor_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # define the actor network 7 | class Actor(nn.Module): 8 | def __init__(self, args, agent_id): 9 | super(Actor, self).__init__() 10 | self.max_action = args.high_action 11 | self.fc1 = nn.Linear(args.obs_shape[agent_id], 64) 12 | self.fc2 = nn.Linear(64, 64) 13 | self.fc3 = nn.Linear(64, 64) 14 | self.action_out = nn.Linear(64, args.action_shape[agent_id]) 15 | 16 | def forward(self, x): 17 | x = F.relu(self.fc1(x)) 18 | x = F.relu(self.fc2(x)) 19 | x = F.relu(self.fc3(x)) 20 | actions = self.max_action * torch.tanh(self.action_out(x)) 21 | return actions 22 | 23 | 24 | class Critic(nn.Module): 25 | def __init__(self, args): 26 | super(Critic, self).__init__() 27 | self.max_action = args.high_action 28 | self.fc1 = nn.Linear(sum(args.obs_shape) + sum(args.action_shape), 64) 29 | self.fc2 = nn.Linear(64, 64) 30 | self.fc3 = nn.Linear(64, 64) 31 | self.q_out = nn.Linear(64, 1) 32 | 33 | def forward(self, state, action): 34 | state = torch.cat(state, dim=1) 35 | for i in range(len(action)): 36 | action[i] /= self.max_action 37 | action = torch.cat(action, dim=1) 38 | x = torch.cat([state, action], dim=1) 39 | x = F.relu(self.fc1(x)) 40 | x = F.relu(self.fc2(x)) 41 | x = F.relu(self.fc3(x)) 42 | q_value = self.q_out(x) 43 | return q_value 44 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/networks.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class MLPNetwork(nn.Module): 5 | """ 6 | MLP network (can be used as value or policy) 7 | """ 8 | def __init__(self, input_dim, out_dim, hidden_dim=64, nonlin=F.relu, 9 | constrain_out=False, norm_in=True, discrete_action=True): 10 | """ 11 | Inputs: 12 | input_dim (int): Number of dimensions in 
input 13 | out_dim (int): Number of dimensions in output 14 | hidden_dim (int): Number of hidden dimensions 15 | nonlin (PyTorch function): Nonlinearity to apply to hidden layers 16 | """ 17 | super(MLPNetwork, self).__init__() 18 | 19 | if norm_in: # normalize inputs 20 | self.in_fn = nn.BatchNorm1d(input_dim) 21 | self.in_fn.weight.data.fill_(1) 22 | self.in_fn.bias.data.fill_(0) 23 | else: 24 | self.in_fn = lambda x: x 25 | self.fc1 = nn.Linear(input_dim, hidden_dim) 26 | self.fc2 = nn.Linear(hidden_dim, hidden_dim) 27 | self.fc3 = nn.Linear(hidden_dim, out_dim) 28 | self.nonlin = nonlin 29 | if constrain_out and not discrete_action: 30 | # initialize small to prevent saturation 31 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 32 | self.out_fn = F.tanh 33 | else: # logits for discrete action (will softmax later) 34 | self.out_fn = lambda x: x 35 | 36 | def forward(self, X): 37 | """ 38 | Inputs: 39 | X (PyTorch Matrix): Batch of observations 40 | Outputs: 41 | out (PyTorch Matrix): Output of network (actions, values, etc) 42 | """ 43 | h1 = self.nonlin(self.fc1(self.in_fn(X))) 44 | h2 = self.nonlin(self.fc2(h1)) 45 | out = self.out_fn(self.fc3(h2)) 46 | return out -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | env.bak/ 86 | venv.bak/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # mkdocs documentation 96 | /site 97 | 98 | # mypy 99 | .mypy_cache/ 100 | 101 | # DS Store 102 | .DS_Store 103 | 104 | # tensorboard 105 | /runs 106 | 107 | # debug folder 108 | /debug 109 | 110 | # training folder 111 | results/ 112 | ray_results/ 113 | logs/ 114 | log/ 115 | formation_gym.egg-info/ -------------------------------------------------------------------------------- /train/maddpg-v1/common/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import inspect 3 | import functools 4 | 5 | 6 | def store_args(method): 7 | """Stores provided method args as instance attributes. 8 | """ 9 | argspec = inspect.getfullargspec(method) 10 | defaults = {} 11 | if argspec.defaults is not None: 12 | defaults = dict( 13 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 14 | if argspec.kwonlydefaults is not None: 15 | defaults.update(argspec.kwonlydefaults) 16 | arg_names = argspec.args[1:] 17 | 18 | @functools.wraps(method) 19 | def wrapper(*positional_args, **keyword_args): 20 | self = positional_args[0] 21 | # Get default arg values 22 | args = defaults.copy() 23 | # Add provided arg values 24 | for name, value in zip(arg_names, positional_args[1:]): 25 | args[name] = value 26 | args.update(keyword_args) 27 | self.__dict__.update(args) 28 | return method(*positional_args, **keyword_args) 29 | 30 | return wrapper 31 | 32 | 33 | def make_env(args): 34 | from multiagent.environment import MultiAgentEnv 35 | import multiagent.scenarios as scenarios 36 | 37 | # load scenario from script 38 | scenario = scenarios.load(args.scenario_name + ".py").Scenario() 39 | 40 | # create world 41 | world = scenario.make_world() 42 | # create multiagent environment 43 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 44 | # env = MultiAgentEnv(world) 45 | args.n_players = env.n # 包含敌人的所有玩家个数 46 | args.n_agents = env.n - args.num_adversaries # 需要操控的玩家个数,虽然敌人也可以控制,但是双方都学习的话需要不同的算法 47 | args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)] # 每一维代表该agent的obs维度 48 | action_shape = [] 49 | for content in env.action_space: 50 | action_shape.append(content.n) 51 | args.action_shape = action_shape[:args.n_agents] # 每一维代表该agent的act维度 52 | args.high_action = 1 53 | args.low_action = -1 54 | return env, args 55 | -------------------------------------------------------------------------------- /formation_gym/policy.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.LEFT: self.move[0] = True 45 | if k==key.RIGHT: self.move[1] = True 46 | if k==key.UP: self.move[2] = True 47 | if k==key.DOWN: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.LEFT: self.move[0] = False 50 | if k==key.RIGHT: self.move[1] = False 51 | if k==key.UP: self.move[2] = False 52 | if k==key.DOWN: self.move[3] = False 53 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/make_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating a multiagent environment with one of the scenarios listed 3 | in ./scenarios/. 4 | Can be called by using, for example: 5 | env = make_env('simple_speaker_listener') 6 | After producing the env object, can be used similarly to an OpenAI gym 7 | environment. 8 | 9 | A policy using this environment must output actions in the form of a list 10 | for all agents. Each element of the list should be a numpy array, 11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede 12 | communication actions in this array. See environment.py for more details. 13 | """ 14 | 15 | def make_env(scenario_name, benchmark=False, discrete_action=False): 16 | ''' 17 | Creates a MultiAgentEnv object as env. This can be used similar to a gym 18 | environment by calling env.reset() and env.step(). 19 | Use env.render() to view the environment on the screen. 
20 | 21 | Input: 22 | scenario_name : name of the scenario from ./scenarios/ to be Returns 23 | (without the .py extension) 24 | benchmark : whether you want to produce benchmarking data 25 | (usually only done during evaluation) 26 | 27 | Some useful env properties (see environment.py): 28 | .observation_space : Returns the observation space for each agent 29 | .action_space : Returns the action space for each agent 30 | .n : Returns the number of Agents 31 | ''' 32 | from multiagent.environment import MultiAgentEnv 33 | import multiagent.scenarios as scenarios 34 | 35 | # load scenario from script 36 | scenario = scenarios.load(scenario_name + ".py").Scenario() 37 | # create world 38 | world = scenario.make_world() 39 | # create multiagent environment 40 | if benchmark: 41 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, 42 | scenario.observation, scenario.benchmark_data, 43 | discrete_action=discrete_action) 44 | else: 45 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, 46 | scenario.observation, 47 | discrete_action=discrete_action) 48 | return env 49 | -------------------------------------------------------------------------------- /train/maddpg-v1/common/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | 4 | 5 | class Buffer: 6 | def __init__(self, args): 7 | self.size = args.buffer_size 8 | self.args = args 9 | # memory management 10 | self.current_size = 0 11 | # create the buffer to store info 12 | self.buffer = dict() 13 | for i in range(self.args.n_agents): 14 | self.buffer['o_%d' % i] = np.empty([self.size, self.args.obs_shape[i]]) 15 | self.buffer['u_%d' % i] = np.empty([self.size, self.args.action_shape[i]]) 16 | self.buffer['r_%d' % i] = np.empty([self.size]) 17 | self.buffer['o_next_%d' % i] = np.empty([self.size, self.args.obs_shape[i]]) 18 | # thread lock 19 | self.lock = threading.Lock() 20 | 21 | # store the episode 22 | def store_episode(self, o, u, r, o_next): 23 | idxs = self._get_storage_idx(inc=1) # 以transition的形式存,每次只存一条经验 24 | for i in range(self.args.n_agents): 25 | with self.lock: 26 | self.buffer['o_%d' % i][idxs] = o[i] 27 | self.buffer['u_%d' % i][idxs] = u[i] 28 | if isinstance(r[i], list): self.buffer['r_%d' % i][idxs] = r[i][0] 29 | else: self.buffer['r_%d' % i][idxs] = r[i] 30 | self.buffer['o_next_%d' % i][idxs] = o_next[i] 31 | 32 | # sample the data from the replay buffer 33 | def sample(self, batch_size): 34 | temp_buffer = {} 35 | idx = np.random.randint(0, self.current_size, batch_size) 36 | for key in self.buffer.keys(): 37 | temp_buffer[key] = self.buffer[key][idx] 38 | return temp_buffer 39 | 40 | def _get_storage_idx(self, inc=None): 41 | inc = inc or 1 42 | if self.current_size+inc <= self.size: 43 | idx = np.arange(self.current_size, self.current_size+inc) 44 | elif self.current_size < self.size: 45 | overflow = inc - (self.size - self.current_size) 46 | idx_a = np.arange(self.current_size, self.size) 47 | idx_b = np.random.randint(0, self.current_size, overflow) 48 | idx = np.concatenate([idx_a, idx_b]) 49 | else: 50 | idx = np.random.randint(0, self.size, inc) 51 | self.current_size = min(self.size, self.current_size+inc) 52 | if inc == 1: 53 | idx = idx[0] 54 | return idx 55 | -------------------------------------------------------------------------------- /formation_gym/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's 
multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | # from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | np_random = np.random.RandomState() 34 | random_array = np_random.rand(self.num_discrete_space) 35 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 36 | def contains(self, x): 37 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 38 | 39 | @property 40 | def shape(self): 41 | return self.num_discrete_space 42 | def __repr__(self): 43 | return "MultiDiscrete" + str(self.num_discrete_space) 44 | def __eq__(self, other): 45 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /train/maddpg-v1/common/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | """ 4 | Here are the param for the training 5 | 6 | """ 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 11 | # Environment 12 | parser.add_argument("--scenario-name", type=str, default="formation_hd_env", help="name of the scenario script") 13 | parser.add_argument("--max-episode-len", type=int, default=30, help="maximum episode length") 14 | parser.add_argument("--time-steps", type=int, default=1000000, help="number of time steps") # 2000000 15 | # 一个地图最多env.n个agents,用户可以定义min(env.n,num-adversaries)个敌人,剩下的是好的agent 16 | parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries") 17 | parser.add_argument("--num-agents", type=int, default=3, help="number of agents") 18 | # Core training parameters 19 | parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor") 20 | parser.add_argument("--lr-critic", type=float, 
default=1e-4, help="learning rate of critic") 21 | parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy") 22 | parser.add_argument("--noise_rate", type=float, default=0.25, help="noise rate for sampling from a standard normal distribution ") 23 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 24 | parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network") 25 | parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions can be stored in buffer") 26 | parser.add_argument("--batch-size", type=int, default=256, help="number of episodes to optimize at the same time") 27 | # Checkpointing 28 | parser.add_argument("--save-dir", type=str, default="./model_3", help="directory in which training state and model should be saved") 29 | parser.add_argument("--save-rate", type=int, default=10000, help="save model once every time this many episodes are completed") 30 | parser.add_argument("--model-idx", type=int, default=1, help="The index of saved model to load") 31 | 32 | # Evaluate 33 | parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating") 34 | parser.add_argument("--evaluate-episode-len", type=int, default=30, help="length of episodes for evaluating") 35 | parser.add_argument("--evaluate", type=bool, default=False, help="whether to evaluate the model") 36 | parser.add_argument("--evaluate-rate", type=int, default=10000, help="how often to evaluate model") 37 | args = parser.parse_args() 38 | 39 | return args 40 | -------------------------------------------------------------------------------- /train/maddpg-v4/parameters.yaml: -------------------------------------------------------------------------------- 1 | # environment 2 | env_name: MPE 3 | scenario_name: formation_hd_env 4 | num_agents: 4 5 | env_steps: 1e7 6 | episode_length: 5000 # how many steps to evaluate and save 7 | train_interval: 500000 # how many steps to train 8 | 9 | # experinment 10 | experiment_index: 1 11 | seed: 1 12 | 13 | # policy 14 | share_policy: True 15 | 16 | # algorithm 17 | algorithm_name: maddpg 18 | gamma: 0.95 19 | use_same_share_obs: True # whether all agents share the same centralized observation[TBD] 20 | use_avail_acts: False # whether to store what actions are available. [TBD] 21 | use_reward_normalization: True # Whether to normalize rewards in replay buffer [TBD] 22 | use_popart: False # if use popart to handle multi-tasks 23 | popart_update_interval_step: 2 # after how many train steps popart should be updated 24 | use_value_active_masks: False # [TBD] [Q] 25 | use_huber_loss: False # Whether to use Huber loss for critic update to improve robustness [TBD] 26 | huber_delta: 10.0 27 | actor_update_interval: 1 # number of critic updates to perform between every update to the actor. 
[TBD] 28 | tau: 0.005 # Polyak update rate 29 | lr: 5e-4 # learning rate 30 | opti_eps: 1e-5 # RMSprop optimizer epsilon [Q] 31 | weight_decay: 0 # [Q] 32 | target_noise: False 33 | use_orthogonal: True 34 | use_feature_normalization: True # Whether to apply layernorm to the inputs 35 | use_ReLU: True 36 | use_conv1d: False # Whether to use conv1d 37 | stacked_frames: 1 # Dimension of hidden layers for actor/critic networks 38 | layer_N: 1 # Number of layers for actor/critic networks 39 | hidden_size: 64 # Dimension of hidden layers for actor/critic networks 40 | gain: 0.01 # gain for action last layer [Q] 41 | hidden_size: 64 #"Dimension of hidden layers for actor/critic networks") 42 | 43 | # exploration parameters 44 | epsilon_start: 1.0 45 | epsilon_finish: 0.05 46 | epsilon_anneal_time: 50000 47 | act_noise_std: 0.1 48 | num_random_episodes: 500 # [TBD] 49 | 50 | # replay buffer 51 | buffer_size: 32 # Number of buffer transitions to train on at once 52 | use_per: True # Whether to use prioritized experience replay 53 | per_alpha: 0.6 # Alpha term for prioritized experience replay, like learning rate 54 | per_beta_start: 0.4 # Starting beta term for prioritized experience replay 55 | per_eps: 1e-6 # Eps term for prioritized experience replay 56 | 57 | # policy 58 | 59 | # parallel 60 | n_training_threads: 8 # TBD 61 | n_rollout_threads: 1 #TBD 62 | 63 | # GPU 64 | device: 'gpu' 65 | cuda: True 66 | cuda_deterministic: False # TBD 67 | 68 | # save 69 | save_path: results 70 | restore: False 71 | save_interval: 100000 72 | 73 | # log 74 | log_interval: 1000 75 | 76 | # evaluate 77 | use_eval: True 78 | eval_interval: 10000 79 | num_eval_episodes: 5 -------------------------------------------------------------------------------- /train/ddpg/test.py: -------------------------------------------------------------------------------- 1 | import gym, torch, numpy as np, tianshou as ts 2 | from torch import nn 3 | from torch.utils.tensorboard import SummaryWriter 4 | from tianshou.utils import TensorboardLogger 5 | 6 | # make env 7 | train_envs = ts.env.DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(4)]) 8 | test_envs = ts.env.DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(1)]) 9 | 10 | # build the network 11 | class Net(nn.Module): 12 | def __init__(self, state_shape, action_shape): 13 | super().__init__() 14 | self.model = nn.Sequential( 15 | nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True), 16 | nn.Linear(128, 128), nn.ReLU(inplace=True), 17 | nn.Linear(128, 128), nn.ReLU(inplace=True), 18 | nn.Linear(128, np.prod(action_shape)), 19 | ) # inplace: calculate without copy prod: flatten the shape 20 | 21 | def forward(self, obs, state = None, info = {}): 22 | if not isinstance(obs, torch.Tensor): 23 | obs = torch.tensor(obs, dtype=torch.float) 24 | batch = obs.shape[0] 25 | logits = self.model(obs.view(batch, -1)) 26 | return logits, state 27 | 28 | state_shape = train_envs.observation_space[0].shape or train_envs.observation_space[0].n 29 | action_shape = train_envs.action_space[0].shape or train_envs.action_space[0].n 30 | net = Net(state_shape, action_shape) 31 | optim = torch.optim.Adam(net.parameters(), lr=1e-3) 32 | 33 | # set up policy 34 | policy = ts.policy.DQNPolicy(net, optim, discount_factor=0.9, estimation_step=3, target_update_freq=320) 35 | 36 | # setup collector 37 | train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(20000, 10), exploration_noise=True) 38 | test_collector = ts.data.Collector(policy, 
test_envs, exploration_noise=True) 39 | 40 | # logger 41 | writer = SummaryWriter('log/dqn') 42 | logger = TensorboardLogger(writer) 43 | 44 | # trainer 45 | # step/epoch: collect 46 | # update/step: train after these steps 47 | # step/collect: update times according to collect 48 | result = ts.trainer.offpolicy_trainer( 49 | policy, train_collector, test_collector, 50 | max_epoch=10, step_per_epoch=10000, step_per_collect=10, 51 | update_per_step=0.1, episode_per_test=100, batch_size=64, 52 | train_fn=lambda epoch, env_step: policy.set_eps(0.1), 53 | test_fn=lambda epoch, env_step: policy.set_eps(0.05), 54 | stop_fn=lambda mean_rewards: mean_rewards >= train_envs.spec[0].reward_threshold, 55 | logger = logger) 56 | print(f'Finished training! Use {result["duration"]}') 57 | 58 | # save policy 59 | torch.save(policy.state_dict(), 'models/dqn.pth') 60 | # policy.load_state_dict(torch.load('dqn.pth')) 61 | # evaluate 62 | test_collector.collect(n_episode = 1, render = 1/30) 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formation environment based on MPE 2 | 3 | A multi-agent formation control environment implemented with MPE. 4 | 5 | ## Installation 6 | 7 | ``` 8 | git clone https://github.com/jc-bao/gym-formation.git 9 | cd gym-formation 10 | pip install -e . 11 | ``` 12 | 13 | ## Test 14 | ``` 15 | python test.py -s formation_hd_env --num-layer 1 16 | ``` 17 | Note: use the `-r` flag to run a random policy. 18 | 19 | ## TODO 20 | 21 | - [ ] Observation: reduce the observation size in the hierarchy policy. (now uses full observation) 22 | 23 | - [ ] Leader & Communication: choose the group leader in each layer smartly and communicate smartly. (now uses the first agent as leader) 24 | 25 | - [ ] Target shape: achieve asymmetric shapes. (now only symmetric shapes in higher-level control) 26 | 27 | - [ ] Group: divide the group smartly to reduce formation time. (now the groups are divided in advance: more layers, less distributed) 28 | 29 | - [ ] Location: the estimation of the position of groups. (now a group is located by inferring its center from the leader's observation) 30 | 31 | ## Extend to more agents with a hierarchy policy 32 | 33 | ```python 34 | num_agents_per_layer = 3 # number of agents of your original policy network (or you can use ezpolicy provided by the package) 35 | num_layer = 2 # number of control layers, extends agent number to n^{layers} 36 | env = formation_gym.make_env('formation_hd_env', benchmark=False, num_agents = num_agents_per_layer**num_layer) 37 | obs_n = env.reset() 38 | while True: 39 | # use BFS to extend your policy to larger scale 40 | act_n = formation_gym.get_action_BFS(YOUR_POLICY_HERE, obs_n, num_agents_per_layer) 41 | # step environment 42 | obs_n, reward_n, done_n, _ = env.step(act_n) 43 | ... 44 | ``` 45 | 46 | Note: 47 | 48 | * not recommended to use more than 5 layers, which would run 3^5 networks in parallel. 49 | * make sure your policy network can correctly turn a single agent's observation into an action. 50 | * `get_action_BFS` is based on [Breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search). 51 | * **get any target shape**: use the function provided by the env, `ideal_shape = env.generate_shape(num_layers = 3, layer_shapes = YOUR_TARGET_LAYER_SHAPE).reshape(-1,2)`, and replace the corresponding part of the observation with it (see the sketch after this list).
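A minimal runnable sketch tying the pieces above together, following `test.py` and the snippet in this section. It uses the built-in `formation_gym.ezpolicy` as a stand-in for your trained policy; how `ideal_shape` is substituted into the observation depends on the scenario's observation layout, so that step is only indicated as a comment (`YOUR_TARGET_LAYER_SHAPE` is a placeholder, as above).

```python
import numpy as np
import formation_gym

num_agents_per_layer = 3  # group size the base policy handles
num_layer = 2             # total agents = num_agents_per_layer ** num_layer

env = formation_gym.make_env('formation_hd_env', benchmark=False,
                             num_agents=num_agents_per_layer**num_layer)
obs_n = env.reset()

# Optional: build a custom target topology.
# ideal_shape = env.generate_shape(num_layers=num_layer,
#                                  layer_shapes=YOUR_TARGET_LAYER_SHAPE).reshape(-1, 2)
# ...then replace the corresponding part of each observation with ideal_shape.

while True:
    # BFS over the hierarchy: run the per-group policy for every group
    act_n = formation_gym.get_action_BFS(formation_gym.ezpolicy, obs_n, num_agents_per_layer)
    obs_n, reward_n, done_n, _ = env.step(act_n)
    if np.all(done_n):
        obs_n = env.reset()
    env.render()
```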
52 | 53 | ## Train 54 | 55 | Please Refer to `train/README.md` 56 | If you want to use another algorithm, here is the template: 57 | 58 | ``` 59 | import formation_gym 60 | 61 | env = formation_gym.make_env(your_scenario_name, if_use_benchmark, number_of_agents, episode_length) 62 | ``` 63 | 64 | ## Scenarios 65 | 66 | | basic_formation_env | formation_hd_env | 67 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 68 | | The reimplemtation for OpenAI MPE spread enviroment. The target it reach the landmark. | Try to mimic the topology of landmarks only with relative observation. | 69 | | | ![Nov-24-2021 14-10-59](https://tva1.sinaimg.cn/large/008i3skNly1gwq7m2aj1pg30ii0i0e82.gif) | 70 | | ![plt](https://tva1.sinaimg.cn/large/008i3skNly1gukfvhkxraj60hs0dcaal02.jpg) | ![plt](https://tva1.sinaimg.cn/large/008i3skNly1gukfuj9pr7j60hs0dc3yz02.jpg) | 71 | 72 | ## Further information 73 | 74 | ``` 75 | action space = [if_moveable, action_1, ... action_n, comm_1, ... comm_n] 76 | ``` 77 | 78 | ### MVE Support 79 | 80 | * Action: `` 81 | -------------------------------------------------------------------------------- /train/maddpg-v2/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import time 4 | import imageio 5 | import numpy as np 6 | from pathlib import Path 7 | from torch.autograd import Variable 8 | from utils.make_env import make_env 9 | from algorithms.maddpg import MADDPG 10 | 11 | import formation_gym 12 | 13 | 14 | def run(config): 15 | model_path = (Path('./models') / config.env_id / config.model_name / 16 | ('run%i' % config.run_num)) 17 | if config.incremental is not None: 18 | model_path = model_path / 'incremental' / ('model_ep%i.pt' % 19 | config.incremental) 20 | else: 21 | model_path = model_path / 'model.pt' 22 | 23 | if config.save_gifs: 24 | gif_path = model_path.parent / 'gifs' 25 | gif_path.mkdir(exist_ok=True) 26 | 27 | maddpg = MADDPG.init_from_save(model_path) 28 | env = formation_gym.make_env(config.env_id, False, config.agent_num) 29 | maddpg.prep_rollouts(device='cpu') # cpu 30 | ifi = 1 / config.fps # inter-frame interval 31 | 32 | for ep_i in range(config.n_episodes): 33 | print("Episode %i of %i" % (ep_i + 1, config.n_episodes)) 34 | obs = env.reset() 35 | if config.save_gifs: 36 | frames = [] 37 | frames.append(env.render('rgb_array')[0]) 38 | env.render('human') 39 | for t_i in range(config.episode_length): 40 | calc_start = time.time() 41 | # rearrange observations to be per agent, and convert to torch Variable 42 | torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1), 43 | requires_grad=False) 44 | for i in range(maddpg.nagents)] 45 | # get actions as torch Variables 46 | torch_actions = maddpg.step(torch_obs, explore=False) 47 | # convert actions to numpy arrays 48 | actions = [ac.data.numpy().flatten() for ac in torch_actions] 49 | obs, rewards, dones, infos = env.step(actions) 50 | if config.save_gifs: 51 | frames.append(env.render('rgb_array')[0]) 52 | calc_end = time.time() 53 | elapsed = calc_end - calc_start 54 | if elapsed < ifi: 55 | time.sleep(ifi - elapsed) 56 | env.render('human') 57 | if config.save_gifs: 58 | gif_num = 0 59 | while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists(): 60 | gif_num += 1 61 | imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))), 62 | frames, duration=ifi) 63 | 64 | env.close() 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = 
argparse.ArgumentParser() 69 | parser.add_argument("--env_id", default='formation_hd_env', type = str, help="Name of environment") 70 | parser.add_argument("--model_name",default='model', type = str, help="Name of model") 71 | parser.add_argument("--run_num", default=1, type=int) 72 | parser.add_argument("--save_gifs", action="store_true", help="Saves gif of each episode into model directory") 73 | parser.add_argument("--incremental", default=None, type=int, help="Load incremental policy from given episode " + "rather than final policy") 74 | parser.add_argument("--n_episodes", default=10, type=int) 75 | parser.add_argument("--episode_length", default=30, type=int) 76 | parser.add_argument("--fps", default=30, type=int) 77 | parser.add_argument("--agent-num", type=int, default = 9) 78 | 79 | config = parser.parse_args() 80 | 81 | run(config) -------------------------------------------------------------------------------- /formation_gym/envs/basic_formation_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from multiagent.scenario import BaseScenario 4 | from multiagent.core import World, Agent, Landmark 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, num_agents = 3, num_landmarks = 3): 8 | # world properties 9 | world = World() 10 | world.dim_c = 2 # communication channel 11 | world.collaborative = True 12 | # agent properties 13 | world.agents = [Agent() for i in range(num_agents)] 14 | for i, agent in enumerate(world.agents): 15 | agent.name = 'agent %d' % i 16 | agent.collide = True 17 | agent.silent = True 18 | agent.size = 0.1 19 | # landmark properties 20 | world.landmarks = [Landmark() for i in range(num_landmarks)] 21 | for i, landmark in enumerate(world.landmarks): 22 | landmark.name = 'landmarks %d' % i 23 | landmark.collide = False 24 | landmark.movable = False 25 | # initial conditions 26 | self.reset_world(world) 27 | return world 28 | 29 | def observation(self, agent, world): 30 | # landmark pos 31 | entity_pos = [] 32 | for entity in world.landmarks: 33 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 34 | # agent pos & communication 35 | other_pos = [] 36 | comm = [] 37 | for other in world.agents: 38 | if other is agent: continue 39 | comm.append(other.state.c) 40 | other_pos.append(other.state.p_pos - agent.state.p_pos) 41 | return np.concatenate([agent.state.p_vel]+[agent.state.p_pos]+entity_pos + other_pos + comm) 42 | 43 | def reward(self, agent, world): 44 | rew = 0 45 | for l in world.landmarks: 46 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 47 | rew -= min(dists) 48 | if agent.collide: 49 | for a in world.agents: 50 | if self.is_collision(a, agent): 51 | rew -= 1 52 | return rew 53 | 54 | def reset_world(self, world): 55 | # agent 56 | for agent in world.agents: 57 | agent.color = np.array([0.35, 0.35, 0.85]) 58 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 59 | agent.state.p_vel = np.zeros(world.dim_p) 60 | agent.state.c = np.zeros(world.dim_c) 61 | # landmark 62 | for landmark in world.landmarks: 63 | landmark.color = np.array([0.25, 0.25, 0.25]) 64 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 65 | landmark.state.p_vel = np.zeros(world.dim_p) 66 | 67 | def benchmark_data(self, agent, world): 68 | # get data to debug 69 | rew = self.reward(agent, world) 70 | collisions = 0 71 | if agent.collide: 72 | for a in world.agents: 73 | if self.is_collision(a, agent): 74 | collisions += 1 75 | min_dists = 0 76 
| occupied_landmarks = 0 77 | for l in world.landmarks: 78 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 79 | min_dists += min(dists) 80 | if min(dists) < 0.1: 81 | occupied_landmarks += 1 82 | return { 83 | 'reward': rew, 84 | 'collisions': collisions, 85 | 'min_dists': min_dists, 86 | 'occupied_landmarks': occupied_landmarks 87 | } 88 | 89 | def is_collision(self, agent1, agent2): 90 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 91 | return dist < (agent1.size + agent2.size) 92 | 93 | -------------------------------------------------------------------------------- /train/maddpg-v4/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import numpy as np 4 | from pathlib import Path 5 | from gym.spaces import Box, Discrete, Tuple 6 | 7 | from utils import get_config, get_cent_act_dim, get_dim_from_space, make_train_env 8 | from runner import Runner 9 | 10 | if __name__ == "__main__": 11 | config = get_config() 12 | # torch setup 13 | if config['cuda'] and torch.cuda.is_available(): 14 | print("choose to use gpu...") 15 | device = torch.device("cuda:0") 16 | torch.set_num_threads(config['n_training_threads']) 17 | if config['cuda_deterministic']: 18 | torch.backends.cudnn.benchmark = False 19 | torch.backends.cudnn.deterministic = True 20 | else: 21 | print("choose to use cpu...") 22 | device = torch.device("cpu") 23 | torch.set_num_threads(config['n_training_threads']) 24 | torch.manual_seed(config['seed']) 25 | torch.cuda.manual_seed_all(config['seed']) 26 | np.random.seed(config['seed']) 27 | # dir setup 28 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + '/' + config['save_path']+ '/'+config['scenario_name']+'_' + config['algorithm_name']+'_'+str(config['experiment_index'])) 29 | if not run_dir.exists(): 30 | os.makedirs(str(run_dir)) 31 | if not os.path.exists(run_dir/'logs'): 32 | os.makedirs(run_dir/'logs') 33 | if not os.path.exists(run_dir/'models'): 34 | os.makedirs(run_dir/'models') 35 | else: 36 | config['restore'] = True 37 | config['fullpath'] = str(run_dir) 38 | config['model_path'] = str(run_dir) + '/models' 39 | config['log_path'] = str(run_dir) + '/logs' 40 | # env setup 41 | env = make_train_env(config) 42 | eval_env = make_train_env(config) 43 | # algorithm setup 44 | if config['share_policy']: 45 | config['policy_info'] = { 46 | 'policy_0': {"cent_obs_dim": get_dim_from_space(env.share_observation_space[0]), 47 | "cent_act_dim": get_cent_act_dim(env.action_space), 48 | "obs_space": env.observation_space[0], 49 | "share_obs_space": env.share_observation_space[0], 50 | "act_space": env.action_space[0]} 51 | } 52 | def policy_mapping_fn(id): return 'policy_0' 53 | else: 54 | config['policy_info'] = { 55 | 'policy_' + str(agent_id): { 56 | "cent_obs_dim": get_dim_from_space(env.share_observation_space[agent_id]), 57 | "cent_act_dim": get_cent_act_dim(env.action_space), 58 | "obs_space": env.observation_space[agent_id], 59 | 'obs_dim': get_dim_from_space(env.observation_space[agent_id]), 60 | "share_obs_space": env.share_observation_space[agent_id], 61 | "act_space": env.action_space[agent_id], 62 | "act_dim": get_dim_from_space(env.action_space[agent_id]), 63 | "output_dim": sum(get_dim_from_space(env.action_space[agent_id])) if isinstance((get_dim_from_space(env.action_space[agent_id]), np.ndarray)) else get_dim_from_space(env.action_space[agent_id]), 64 | } 65 | for agent_id in range(config['num_agents']) 66 | } 67 | def 
policy_mapping_fn(agent_id): return 'policy_' + str(agent_id) 68 | # Q: why do we need this one 69 | config['policy_mapping_fn']=policy_mapping_fn 70 | # more parameters 71 | config['env'] = env 72 | config['eval_env'] = eval_env 73 | config['discrete'] = isinstance(config['policy_info']["act_space"], Discrete) or "MultiDiscrete" in (config['policy_info']["act_space"].__class__.__name__) 74 | config['multidiscrete'] = ("MultiDiscrete" in config['policy_info']["act_space"].__class__.__name__) 75 | config['tpdv'] = dict(dtype=torch.float32, device=config['device']) 76 | # train 77 | total_steps = 0 78 | runner = Runner(config) 79 | while total_steps < config['env_steps']: 80 | total_steps = runner.run() 81 | # close 82 | env.close() 83 | eval_env.close() -------------------------------------------------------------------------------- /train/maddpg-v1/runner.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from agent import Agent 3 | from common.replay_buffer import Buffer 4 | import torch 5 | import os 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import time 9 | 10 | 11 | class Runner: 12 | def __init__(self, args, env): 13 | self.args = args 14 | self.noise = args.noise_rate 15 | self.epsilon = args.epsilon 16 | self.episode_limit = args.max_episode_len 17 | self.env = env 18 | self.agents = self._init_agents() 19 | self.buffer = Buffer(args) 20 | self.save_path = 'results/' + self.args.save_dir + '/' + self.args.scenario_name 21 | if not os.path.exists(self.save_path): 22 | os.makedirs(self.save_path) 23 | 24 | def _init_agents(self): 25 | agents = [] 26 | for i in range(self.args.n_agents): 27 | agent = Agent(i, self.args) 28 | agents.append(agent) 29 | return agents 30 | 31 | def run(self): 32 | returns = [] 33 | for time_step in tqdm(range(self.args.time_steps)): 34 | # reset the environment 35 | if time_step % self.episode_limit == 0: 36 | s = self.env.reset() 37 | u = [] 38 | actions = [] 39 | with torch.no_grad(): 40 | for agent_id, agent in enumerate(self.agents): 41 | action = agent.select_action(s[agent_id], self.noise, self.epsilon) 42 | u.append(action) 43 | actions.append(action) 44 | for i in range(self.args.n_agents, self.args.n_players): 45 | actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0]) 46 | s_next, r, done, info = self.env.step(actions) 47 | self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents]) 48 | s = s_next 49 | if self.buffer.current_size >= self.args.batch_size: 50 | transitions = self.buffer.sample(self.args.batch_size) 51 | for agent in self.agents: 52 | other_agents = self.agents.copy() 53 | other_agents.remove(agent) 54 | agent.learn(transitions, other_agents) 55 | if time_step > 0 and time_step % self.args.evaluate_rate == 0: 56 | returns.append(self.evaluate()) 57 | plt.figure() 58 | plt.plot(range(len(returns)), returns) 59 | plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit)) 60 | plt.ylabel('average returns') 61 | plt.savefig(self.save_path + '/plt.png', format='png') 62 | self.noise = max(0.05, self.noise - 0.0000005) 63 | self.epsilon = max(0.05, self.noise - 0.0000005) 64 | np.save(self.save_path + '/returns.pkl', returns) 65 | 66 | def evaluate(self, rnd = False): 67 | returns = [] 68 | for episode in range(self.args.evaluate_episodes): 69 | # reset the environment 70 | rewards = 0 71 | s = self.env.reset() 72 | for time_step in 
range(self.args.evaluate_episode_len): 73 | if rnd: 74 | self.env.render() 75 | # time.sleep(1) 76 | actions = [] 77 | with torch.no_grad(): 78 | for agent_id, agent in enumerate(self.agents): 79 | action = agent.select_action(s[agent_id], 0, 0) 80 | actions.append(action) 81 | for i in range(self.args.n_agents, self.args.n_players): 82 | actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0]) 83 | s_next, r, done, info = self.env.step(actions) 84 | if isinstance(r[0], list): rewards += r[0][0] 85 | else: rewards += r[0] 86 | s = s_next 87 | returns.append(rewards) 88 | print('Returns is', rewards, 'Final Reward:', r[0]) 89 | return sum(returns) / self.args.evaluate_episodes 90 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/agents.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch.autograd import Variable 3 | from torch.optim import Adam 4 | from .networks import MLPNetwork 5 | from .misc import hard_update, gumbel_softmax, onehot_from_logits 6 | from .noise import OUNoise 7 | 8 | class DDPGAgent(object): 9 | """ 10 | General class for DDPG agents (policy, critic, target policy, target 11 | critic, exploration noise) 12 | """ 13 | def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim=64, 14 | lr=0.01, discrete_action=True): 15 | """ 16 | Inputs: 17 | num_in_pol (int): number of dimensions for policy input 18 | num_out_pol (int): number of dimensions for policy output 19 | num_in_critic (int): number of dimensions for critic input 20 | """ 21 | self.policy = MLPNetwork(num_in_pol, num_out_pol, 22 | hidden_dim=hidden_dim, 23 | constrain_out=True, 24 | discrete_action=discrete_action) 25 | self.critic = MLPNetwork(num_in_critic, 1, 26 | hidden_dim=hidden_dim, 27 | constrain_out=False) 28 | self.target_policy = MLPNetwork(num_in_pol, num_out_pol, 29 | hidden_dim=hidden_dim, 30 | constrain_out=True, 31 | discrete_action=discrete_action) 32 | self.target_critic = MLPNetwork(num_in_critic, 1, 33 | hidden_dim=hidden_dim, 34 | constrain_out=False) 35 | hard_update(self.target_policy, self.policy) 36 | hard_update(self.target_critic, self.critic) 37 | self.policy_optimizer = Adam(self.policy.parameters(), lr=lr) 38 | self.critic_optimizer = Adam(self.critic.parameters(), lr=lr) 39 | if not discrete_action: 40 | self.exploration = OUNoise(num_out_pol) 41 | else: 42 | self.exploration = 0.3 # epsilon for eps-greedy 43 | self.discrete_action = discrete_action 44 | 45 | def reset_noise(self): 46 | if not self.discrete_action: 47 | self.exploration.reset() 48 | 49 | def scale_noise(self, scale): 50 | if self.discrete_action: 51 | self.exploration = scale 52 | else: 53 | self.exploration.scale = scale 54 | 55 | def step(self, obs, explore=False): 56 | """ 57 | Take a step forward in environment for a minibatch of observations 58 | Inputs: 59 | obs (PyTorch Variable): Observations for this agent 60 | explore (boolean): Whether or not to add exploration noise 61 | Outputs: 62 | action (PyTorch Variable): Actions for this agent 63 | """ 64 | action = self.policy(obs) 65 | if self.discrete_action: 66 | if explore: 67 | action = gumbel_softmax(action, hard=True) 68 | else: 69 | action = onehot_from_logits(action) 70 | else: # continuous action 71 | if explore: 72 | action += Variable(Tensor(self.exploration.noise()), 73 | requires_grad=False) 74 | action = action.clamp(-1, 1) 75 | return action 76 | 77 | def get_params(self): 78 | return 
{'policy': self.policy.state_dict(), 79 | 'critic': self.critic.state_dict(), 80 | 'target_policy': self.target_policy.state_dict(), 81 | 'target_critic': self.target_critic.state_dict(), 82 | 'policy_optimizer': self.policy_optimizer.state_dict(), 83 | 'critic_optimizer': self.critic_optimizer.state_dict()} 84 | 85 | def load_params(self, params): 86 | self.policy.load_state_dict(params['policy']) 87 | self.critic.load_state_dict(params['critic']) 88 | self.target_policy.load_state_dict(params['target_policy']) 89 | self.target_critic.load_state_dict(params['target_critic']) 90 | self.policy_optimizer.load_state_dict(params['policy_optimizer']) 91 | self.critic_optimizer.load_state_dict(params['critic_optimizer']) 92 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn.functional as F 4 | import torch.distributed as dist 5 | from torch.autograd import Variable 6 | import numpy as np 7 | 8 | # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L11 9 | def soft_update(target, source, tau): 10 | """ 11 | Perform DDPG soft update (move target params toward source based on weight 12 | factor tau) 13 | Inputs: 14 | target (torch.nn.Module): Net to copy parameters to 15 | source (torch.nn.Module): Net whose parameters to copy 16 | tau (float, 0 < x < 1): Weight factor for update 17 | """ 18 | for target_param, param in zip(target.parameters(), source.parameters()): 19 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 20 | 21 | # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L15 22 | def hard_update(target, source): 23 | """ 24 | Copy network parameters from source to target 25 | Inputs: 26 | target (torch.nn.Module): Net to copy parameters to 27 | source (torch.nn.Module): Net whose parameters to copy 28 | """ 29 | for target_param, param in zip(target.parameters(), source.parameters()): 30 | target_param.data.copy_(param.data) 31 | 32 | # https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py 33 | def average_gradients(model): 34 | """ Gradient averaging. """ 35 | size = float(dist.get_world_size()) 36 | for param in model.parameters(): 37 | dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0) 38 | param.grad.data /= size 39 | 40 | # https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py 41 | def init_processes(rank, size, fn, backend='gloo'): 42 | """ Initialize the distributed environment. 
""" 43 | os.environ['MASTER_ADDR'] = '127.0.0.1' 44 | os.environ['MASTER_PORT'] = '29500' 45 | dist.init_process_group(backend, rank=rank, world_size=size) 46 | fn(rank, size) 47 | 48 | def onehot_from_logits(logits, eps=0.0): 49 | """ 50 | Given batch of logits, return one-hot sample using epsilon greedy strategy 51 | (based on given epsilon) 52 | """ 53 | # get best (according to current policy) actions in one-hot form 54 | argmax_acs = (logits == logits.max(1, keepdim=True)[0]).float() 55 | if eps == 0.0: 56 | return argmax_acs 57 | # get random actions in one-hot form 58 | rand_acs = Variable(torch.eye(logits.shape[1])[[np.random.choice( 59 | range(logits.shape[1]), size=logits.shape[0])]], requires_grad=False) 60 | # chooses between best and random actions using epsilon greedy 61 | return torch.stack([argmax_acs[i] if r > eps else rand_acs[i] for i, r in 62 | enumerate(torch.rand(logits.shape[0]))]) 63 | 64 | # modified for PyTorch from https://github.com/ericjang/gumbel-softmax/blob/master/Categorical%20VAE.ipynb 65 | def sample_gumbel(shape, eps=1e-20, tens_type=torch.FloatTensor): 66 | """Sample from Gumbel(0, 1)""" 67 | U = Variable(tens_type(*shape).uniform_(), requires_grad=False) 68 | return -torch.log(-torch.log(U + eps) + eps) 69 | 70 | # modified for PyTorch from https://github.com/ericjang/gumbel-softmax/blob/master/Categorical%20VAE.ipynb 71 | def gumbel_softmax_sample(logits, temperature): 72 | """ Draw a sample from the Gumbel-Softmax distribution""" 73 | y = logits + sample_gumbel(logits.shape, tens_type=type(logits.data)) 74 | return F.softmax(y / temperature, dim=1) 75 | 76 | # modified for PyTorch from https://github.com/ericjang/gumbel-softmax/blob/master/Categorical%20VAE.ipynb 77 | def gumbel_softmax(logits, temperature=1.0, hard=False): 78 | """Sample from the Gumbel-Softmax distribution and optionally discretize. 79 | Args: 80 | logits: [batch_size, n_class] unnormalized log-probs 81 | temperature: non-negative scalar 82 | hard: if True, take argmax, but differentiate w.r.t. soft sample y 83 | Returns: 84 | [batch_size, n_class] sample from the Gumbel-Softmax distribution. 85 | If hard=True, then the returned sample will be one-hot, otherwise it will 86 | be a probabilitiy distribution that sums to 1 across classes 87 | """ 88 | y = gumbel_softmax_sample(logits, temperature) 89 | if hard: 90 | y_hard = onehot_from_logits(y) 91 | y = (y_hard - y).detach() + y 92 | return y 93 | -------------------------------------------------------------------------------- /train/maddpg-v3/env/multiagent_particle_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from gym.spaces import Discrete, Box, MultiDiscrete 6 | from ray import rllib 7 | from make_env import make_env 8 | 9 | import numpy as np 10 | import time 11 | 12 | 13 | class RLlibMultiAgentParticleEnv(rllib.MultiAgentEnv): 14 | """Wraps OpenAI Multi-Agent Particle env to be compatible with RLLib multi-agent.""" 15 | 16 | def __init__(self, **mpe_args): 17 | """Create a new Multi-Agent Particle env compatible with RLlib. 18 | 19 | Arguments: 20 | mpe_args (dict): Arguments to pass to the underlying 21 | make_env.make_env instance. 
22 | 23 | Examples: 24 | >>> from rllib_env import RLlibMultiAgentParticleEnv 25 | >>> env = RLlibMultiAgentParticleEnv(scenario_name="simple_reference") 26 | >>> print(env.reset()) 27 | """ 28 | 29 | self._env = make_env(**mpe_args) 30 | self.num_agents = self._env.n 31 | self.agent_ids = list(range(self.num_agents)) 32 | 33 | self.observation_space_dict = self._make_dict(self._env.observation_space) 34 | self.action_space_dict = self._make_dict(self._env.action_space) 35 | 36 | def reset(self): 37 | """Resets the env and returns observations from ready agents. 38 | 39 | Returns: 40 | obs_dict: New observations for each ready agent. 41 | """ 42 | 43 | obs_dict = self._make_dict(self._env.reset()) 44 | return obs_dict 45 | 46 | def step(self, action_dict): 47 | """Returns observations from ready agents. 48 | 49 | The returns are dicts mapping from agent_id strings to values. The 50 | number of agents in the env can vary over time. 51 | 52 | Returns: 53 | obs_dict: 54 | New observations for each ready agent. 55 | rew_dict: 56 | Reward values for each ready agent. 57 | done_dict: 58 | Done values for each ready agent. 59 | The special key "__all__" (required) is used to indicate env termination. 60 | info_dict: 61 | Optional info values for each agent id. 62 | """ 63 | 64 | actions = list(action_dict.values()) 65 | obs_list, rew_list, done_list, info_list = self._env.step(actions) 66 | 67 | obs_dict = self._make_dict(obs_list) 68 | rew_dict = self._make_dict(rew_list) 69 | done_dict = self._make_dict(done_list) 70 | done_dict["__all__"] = all(done_list) 71 | # FIXME: Currently, this is the best option to transfer agent-wise termination signal without touching RLlib code hugely. 72 | # FIXME: Hopefully, this will be solved in the future. 73 | info_dict = self._make_dict([{"done": done} for done in done_list]) 74 | 75 | return obs_dict, rew_dict, done_dict, info_dict 76 | 77 | def render(self, mode='human'): 78 | time.sleep(0.05) 79 | self._env.render(mode=mode) 80 | 81 | def _make_dict(self, values): 82 | return dict(zip(self.agent_ids, values)) 83 | 84 | 85 | if __name__ == '__main__': 86 | for scenario_name in ["simple", 87 | "simple_adversary", 88 | "simple_crypto", 89 | "simple_push", 90 | "simple_reference", 91 | "simple_speaker_listener", 92 | "simple_spread", 93 | "simple_tag", 94 | "simple_world_comm"]: 95 | print("scenario_name: ", scenario_name) 96 | env = RLlibMultiAgentParticleEnv(scenario_name=scenario_name) 97 | print("obs: ", env.reset()) 98 | print(env.observation_space_dict) 99 | print(env.action_space_dict) 100 | 101 | action_dict = {} 102 | for i, ac_space in env.action_space_dict.items(): 103 | sample = ac_space.sample() 104 | if isinstance(ac_space, Discrete): 105 | action_dict[i] = np.zeros(ac_space.n) 106 | action_dict[i][sample] = 1.0 107 | elif isinstance(ac_space, Box): 108 | action_dict[i] = sample 109 | elif isinstance(ac_space, MultiDiscrete): 110 | print("sample: ", sample) 111 | print("ac_space: ", ac_space.nvec) 112 | action_dict[i] = np.zeros(sum(ac_space.nvec)) 113 | start_ls = np.cumsum([0] + list(ac_space.nvec))[:-1] 114 | for l in list(start_ls + sample): 115 | action_dict[i][l] = 1.0 116 | else: 117 | raise NotImplementedError 118 | 119 | print("action_dict: ", action_dict) 120 | 121 | for i in env.step(action_dict): 122 | print(i) 123 | -------------------------------------------------------------------------------- /train/maddpg-v3/env/wrapper.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from gym.spaces import Discrete, Box, MultiDiscrete 6 | from ray import rllib 7 | from make_env import make_env 8 | 9 | import formation_gym 10 | 11 | import numpy as np 12 | import time 13 | 14 | 15 | class FormationEnv(rllib.MultiAgentEnv): 16 | """Wraps OpenAI Multi-Agent Particle env to be compatible with RLLib multi-agent.""" 17 | 18 | def __init__(self, **mpe_args): 19 | """Create a new Multi-Agent Particle env compatible with RLlib. 20 | 21 | Arguments: 22 | mpe_args (dict): Arguments to pass to the underlying 23 | make_env.make_env instance. 24 | 25 | Examples: 26 | >>> from rllib_env import RLlibMultiAgentParticleEnv 27 | >>> env = RLlibMultiAgentParticleEnv(scenario_name="simple_reference") 28 | >>> print(env.reset()) 29 | """ 30 | print(mpe_args) 31 | self._env = formation_gym.make_env(**mpe_args) 32 | self.num_agents = self._env.num_agents 33 | self.agent_ids = list(range(self.num_agents)) 34 | 35 | self.observation_space_dict = self._make_dict(self._env.observation_space) 36 | self.action_space_dict = self._make_dict(self._env.action_space) 37 | 38 | def reset(self): 39 | """Resets the env and returns observations from ready agents. 40 | 41 | Returns: 42 | obs_dict: New observations for each ready agent. 43 | """ 44 | 45 | obs_dict = self._make_dict(self._env.reset()) 46 | return obs_dict 47 | 48 | def step(self, action_dict): 49 | """Returns observations from ready agents. 50 | 51 | The returns are dicts mapping from agent_id strings to values. The 52 | number of agents in the env can vary over time. 53 | 54 | Returns: 55 | obs_dict: 56 | New observations for each ready agent. 57 | rew_dict: 58 | Reward values for each ready agent. 59 | done_dict: 60 | Done values for each ready agent. 61 | The special key "__all__" (required) is used to indicate env termination. 62 | info_dict: 63 | Optional info values for each agent id. 64 | """ 65 | 66 | actions = list(action_dict.values()) 67 | obs_list, rew_list, done_list, info_list = self._env.step(actions) 68 | 69 | obs_dict = self._make_dict(obs_list) 70 | rew_dict = self._make_dict(rew_list) 71 | done_dict = self._make_dict(done_list) 72 | done_dict["__all__"] = all(done_list) 73 | # FIXME: Currently, this is the best option to transfer agent-wise termination signal without touching RLlib code hugely. 74 | # FIXME: Hopefully, this will be solved in the future. 
75 | info_dict = self._make_dict([{"done": done} for done in done_list]) 76 | 77 | return obs_dict, rew_dict, done_dict, info_dict 78 | 79 | def render(self, mode='human'): 80 | time.sleep(0.05) 81 | self._env.render(mode=mode) 82 | 83 | def _make_dict(self, values): 84 | return dict(zip(self.agent_ids, values)) 85 | 86 | 87 | if __name__ == '__main__': 88 | for scenario_name in ["simple", 89 | "simple_adversary", 90 | "simple_crypto", 91 | "simple_push", 92 | "simple_reference", 93 | "simple_speaker_listener", 94 | "simple_spread", 95 | "simple_tag", 96 | "simple_world_comm"]: 97 | print("scenario_name: ", scenario_name) 98 | env = RLlibMultiAgentParticleEnv(scenario_name=scenario_name) 99 | print("obs: ", env.reset()) 100 | print(env.observation_space_dict) 101 | print(env.action_space_dict) 102 | 103 | action_dict = {} 104 | for i, ac_space in env.action_space_dict.items(): 105 | sample = ac_space.sample() 106 | if isinstance(ac_space, Discrete): 107 | action_dict[i] = np.zeros(ac_space.n) 108 | action_dict[i][sample] = 1.0 109 | elif isinstance(ac_space, Box): 110 | action_dict[i] = sample 111 | elif isinstance(ac_space, MultiDiscrete): 112 | print("sample: ", sample) 113 | print("ac_space: ", ac_space.nvec) 114 | action_dict[i] = np.zeros(sum(ac_space.nvec)) 115 | start_ls = np.cumsum([0] + list(ac_space.nvec))[:-1] 116 | for l in list(start_ls + sample): 117 | action_dict[i][l] = 1.0 118 | else: 119 | raise NotImplementedError 120 | 121 | print("action_dict: ", action_dict) 122 | 123 | for i in env.step(action_dict): 124 | print(i) 125 | -------------------------------------------------------------------------------- /train/mappo/inbox/render_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import setproctitle 5 | import numpy as np 6 | from pathlib import Path 7 | 8 | import torch 9 | 10 | from config import get_config 11 | 12 | from onpolicy.envs.env_wrappers import SubprocVecEnv, DummyVecEnv 13 | 14 | import formation_gym 15 | 16 | def make_render_env(all_args): 17 | def get_env_fn(rank): 18 | def init_env(): 19 | if all_args.env_name == "MPE": 20 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 21 | else: 22 | print("Can not support the " + 23 | all_args.env_name + "environment.") 24 | raise NotImplementedError 25 | env.seed(all_args.seed + rank * 1000) 26 | return env 27 | return init_env 28 | if all_args.n_rollout_threads == 1: 29 | return DummyVecEnv([get_env_fn(0)]) 30 | else: 31 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 32 | 33 | def parse_args(args, parser): 34 | parser.add_argument('--scenario_name', type=str, 35 | default='simple_spread', help="Which scenario to run on") 36 | parser.add_argument('--num_agents', type=int, 37 | default=3, help="number of players") 38 | 39 | all_args = parser.parse_known_args(args)[0] 40 | 41 | return all_args 42 | 43 | 44 | def main(args): 45 | parser = get_config() 46 | all_args = parse_args(args, parser) 47 | 48 | if all_args.algorithm_name == "rmappo" or all_args.algorithm_name == "rmappg": 49 | assert ( 50 | all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") 51 | elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mappg": 52 | assert (all_args.use_recurrent_policy and all_args.use_naive_recurrent_policy) == False, ( 53 | "check recurrent policy!") 54 | else: 55 | raise NotImplementedError 56 | 
57 | assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, ( 58 | "The simple_speaker_listener scenario can not use shared policy. Please check the config.py.") 59 | 60 | assert all_args.use_render, ("u need to set use_render be True") 61 | assert not (all_args.model_dir == None or all_args.model_dir == ""), ("set model_dir first") 62 | assert all_args.n_rollout_threads==1, ("only support to use 1 env to render.") 63 | 64 | # cuda 65 | if all_args.cuda and torch.cuda.is_available(): 66 | print("choose to use gpu...") 67 | device = torch.device("cuda:0") 68 | torch.set_num_threads(all_args.n_training_threads) 69 | if all_args.cuda_deterministic: 70 | torch.backends.cudnn.benchmark = False 71 | torch.backends.cudnn.deterministic = True 72 | else: 73 | print("choose to use cpu...") 74 | device = torch.device("cpu") 75 | torch.set_num_threads(all_args.n_training_threads) 76 | 77 | # run dir 78 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 79 | print(run_dir) 80 | if not run_dir.exists(): 81 | os.makedirs(str(run_dir)) 82 | 83 | if not run_dir.exists(): 84 | curr_run = 'run1' 85 | else: 86 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] 87 | if len(exst_run_nums) == 0: 88 | curr_run = 'run1' 89 | else: 90 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 91 | run_dir = run_dir / curr_run 92 | if not run_dir.exists(): 93 | os.makedirs(str(run_dir)) 94 | 95 | setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \ 96 | str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name)) 97 | 98 | # seed 99 | torch.manual_seed(all_args.seed) 100 | torch.cuda.manual_seed_all(all_args.seed) 101 | np.random.seed(all_args.seed) 102 | 103 | # env init 104 | envs = make_render_env(all_args) 105 | eval_envs = None 106 | num_agents = all_args.num_agents 107 | 108 | config = { 109 | "all_args": all_args, 110 | "envs": envs, 111 | "eval_envs": eval_envs, 112 | "num_agents": num_agents, 113 | "device": device, 114 | "run_dir": run_dir 115 | } 116 | 117 | # run experiments 118 | if all_args.share_policy: 119 | from onpolicy.runner.shared.mpe_runner import MPERunner as Runner 120 | else: 121 | from onpolicy.runner.separated.mpe_runner import MPERunner as Runner 122 | 123 | runner = Runner(config) 124 | runner.render() 125 | 126 | # post process 127 | envs.close() 128 | 129 | if __name__ == "__main__": 130 | main(sys.argv[1:]) 131 | -------------------------------------------------------------------------------- /formation_gym/envs/formation_hd_partial_range_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from formation_gym.core import World, Agent, Landmark 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | 11 | partial observation environment 12 | ''' 13 | 14 | class Scenario(BaseScenario): 15 | def make_world(self, num_agents = 4, num_landmarks = 4, obs_range = 0.7, world_length = 25): 16 | self.obs_range = obs_range 17 | self.num_agents = num_agents 18 | # world properties 19 | world = World() 20 | world.world_length = world_length 21 | world.dim_c = 2 # 
communication channel 22 | world.collaborative = True 23 | # agent properties 24 | world.agents = [Agent() for i in range(num_agents)] 25 | for i, agent in enumerate(world.agents): 26 | agent.name = 'agent %d' % i 27 | agent.collide = True 28 | agent.silent = True 29 | agent.size = 0.04 30 | # landmark properties 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmarks %d' % i 34 | landmark.collide = False 35 | landmark.movable = False 36 | landmark.size = 0.02 37 | # initial conditions 38 | self.reset_world(world) 39 | return world 40 | 41 | def observation(self, agent, world): 42 | # landmark pos 43 | entity_pos = [] 44 | for entity in world.landmarks: 45 | entity_pos.append(entity.state.p_pos) 46 | # agent pos & communication 47 | other_pos = [] 48 | comm = [] 49 | # set range for watching 50 | for other in world.agents: 51 | if other is agent: continue 52 | comm.append(other.state.c) 53 | other_pos.append(np.clip(other.state.p_pos - agent.state.p_pos, [-self.obs_range, -self.obs_range], [self.obs_range, self.obs_range])) 54 | return np.concatenate([agent.state.p_vel]+entity_pos + other_pos + comm) 55 | 56 | def reward(self, agent, world): 57 | rew = 0 58 | u = [a.state.p_pos for a in world.agents] 59 | v = [l.state.p_pos for l in world.landmarks] 60 | delta = np.mean(u, 0) - np.mean(v, 0) 61 | u = u - np.mean(u, 0) 62 | v = v - np.mean(v, 0) 63 | rew = -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 64 | # change landmark pos and color 65 | # for i in range(len(world.landmarks)): 66 | # world.landmarks[i].state.p_pos += delta 67 | # dist = min([np.linalg.norm(a.state.p_pos - world.landmarks[i].state.p_pos) for a in world.agents]) 68 | # if dist <= 0.2: world.landmarks[i].color = np.array([0, 0.6, 0]) 69 | # self.set_bound(world) 70 | if agent.collide: 71 | for a in world.agents: 72 | if agent!=a and self.is_collision(a, agent): 73 | rew -= 1 74 | return rew 75 | 76 | def reset_world(self, world): 77 | # agent 78 | for agent in world.agents: 79 | agent.color = np.array([0.35, 0.35, 0.85]) 80 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 81 | agent.state.p_vel = np.zeros(world.dim_p) 82 | agent.state.c = np.zeros(world.dim_c) 83 | # landmark 84 | for landmark in world.landmarks: 85 | landmark.color = np.array([0.25, 0.25, 0.25]) 86 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 87 | landmark.state.p_vel = np.zeros(world.dim_p) 88 | 89 | def benchmark_data(self, agent, world): 90 | # get data to debug 91 | rew = self.reward(agent, world) 92 | collisions = 0 93 | if agent.collide: 94 | for a in world.agents: 95 | if self.is_collision(a, agent): 96 | collisions += 1 97 | min_dists = 0 98 | occupied_landmarks = 0 99 | for l in world.landmarks: 100 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 101 | min_dists += min(dists) 102 | if min(dists) < 0.1: 103 | occupied_landmarks += 1 104 | return { 105 | 'reward': rew, 106 | 'collisions': collisions, 107 | 'min_dists': min_dists, 108 | 'occupied_landmarks': occupied_landmarks 109 | } 110 | 111 | def is_collision(self, agent1, agent2): 112 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 113 | return dist < (agent1.size + agent2.size) 114 | 115 | def set_bound(self, world): 116 | for agent in world.agents: 117 | agent.state.p_pos = np.clip(agent.state.p_pos, [-2, -2], [2, 2]) 118 | 119 | 
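# The scenarios above score a formation by the symmetric Hausdorff distance between the agents'
# centroid-normalized positions and the target shape. Below is a minimal, self-contained sketch of
# that reward term (assuming only NumPy/SciPy and hypothetical `agent_pos` / `target_shape` inputs,
# and leaving out the per-agent collision penalty applied afterwards):
import numpy as np
from scipy.spatial.distance import directed_hausdorff

def formation_reward(agent_pos, target_shape):
    # both inputs: (N, 2) arrays of 2-D positions; subtract centroids so the score is translation-invariant
    u = agent_pos - np.mean(agent_pos, axis=0)
    v = target_shape - np.mean(target_shape, axis=0)
    # symmetric Hausdorff distance = max of the two directed distances; negate so closer shapes score higher
    return -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])

# example: a translated copy of the same triangle yields a reward of ~0.0
triangle = np.array([[0.0, 0.0], [1.0, 0.0], [0.5, 1.0]])
print(formation_reward(triangle + 0.3, triangle))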
-------------------------------------------------------------------------------- /train/maddpg-v2/utils/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import Tensor 3 | from torch.autograd import Variable 4 | 5 | class ReplayBuffer(object): 6 | """ 7 | Replay Buffer for multi-agent RL with parallel rollouts 8 | """ 9 | def __init__(self, max_steps, num_agents, obs_dims, ac_dims): 10 | """ 11 | Inputs: 12 | max_steps (int): Maximum number of timepoints to store in buffer 13 | num_agents (int): Number of agents in environment 14 | obs_dims (list of ints): number of obervation dimensions for each 15 | agent 16 | ac_dims (list of ints): number of action dimensions for each agent 17 | """ 18 | self.max_steps = max_steps 19 | self.num_agents = num_agents 20 | self.obs_buffs = [] 21 | self.ac_buffs = [] 22 | self.rew_buffs = [] 23 | self.next_obs_buffs = [] 24 | self.done_buffs = [] 25 | for odim, adim in zip(obs_dims, ac_dims): 26 | self.obs_buffs.append(np.zeros((max_steps, odim))) 27 | self.ac_buffs.append(np.zeros((max_steps, adim))) 28 | self.rew_buffs.append(np.zeros(max_steps)) 29 | self.next_obs_buffs.append(np.zeros((max_steps, odim))) 30 | self.done_buffs.append(np.zeros(max_steps)) 31 | 32 | 33 | self.filled_i = 0 # index of first empty location in buffer (last index when full) 34 | self.curr_i = 0 # current index to write to (ovewrite oldest data) 35 | 36 | def __len__(self): 37 | return self.filled_i 38 | 39 | def push(self, observations, actions, rewards, next_observations, dones): 40 | nentries = observations.shape[0] # handle multiple parallel environments 41 | if self.curr_i + nentries > self.max_steps: 42 | rollover = self.max_steps - self.curr_i # num of indices to roll over 43 | for agent_i in range(self.num_agents): 44 | self.obs_buffs[agent_i] = np.roll(self.obs_buffs[agent_i], 45 | rollover, axis=0) 46 | self.ac_buffs[agent_i] = np.roll(self.ac_buffs[agent_i], 47 | rollover, axis=0) 48 | self.rew_buffs[agent_i] = np.roll(self.rew_buffs[agent_i], 49 | rollover) 50 | self.next_obs_buffs[agent_i] = np.roll( 51 | self.next_obs_buffs[agent_i], rollover, axis=0) 52 | self.done_buffs[agent_i] = np.roll(self.done_buffs[agent_i], 53 | rollover) 54 | self.curr_i = 0 55 | self.filled_i = self.max_steps 56 | for agent_i in range(self.num_agents): 57 | self.obs_buffs[agent_i][self.curr_i:self.curr_i + nentries] = np.vstack( 58 | observations[:, agent_i]) 59 | # actions are already batched by agent, so they are indexed differently 60 | self.ac_buffs[agent_i][self.curr_i:self.curr_i + nentries] = actions[agent_i] 61 | self.rew_buffs[agent_i][self.curr_i:self.curr_i + nentries] = rewards[:, agent_i][0] 62 | self.next_obs_buffs[agent_i][self.curr_i:self.curr_i + nentries] = np.vstack( 63 | next_observations[:, agent_i]) 64 | self.done_buffs[agent_i][self.curr_i:self.curr_i + nentries] = dones[:, agent_i] 65 | self.curr_i += nentries 66 | if self.filled_i < self.max_steps: 67 | self.filled_i += nentries 68 | if self.curr_i == self.max_steps: 69 | self.curr_i = 0 70 | 71 | def sample(self, N, to_gpu=False, norm_rews=True): 72 | inds = np.random.choice(np.arange(self.filled_i), size=N, 73 | replace=False) 74 | if to_gpu: 75 | cast = lambda x: Variable(Tensor(x), requires_grad=False).cuda() 76 | else: 77 | cast = lambda x: Variable(Tensor(x), requires_grad=False) 78 | if norm_rews: 79 | ret_rews = [cast((self.rew_buffs[i][inds] - 80 | self.rew_buffs[i][:self.filled_i].mean()) / 81 | 
self.rew_buffs[i][:self.filled_i].std()) 82 | for i in range(self.num_agents)] 83 | else: 84 | ret_rews = [cast(self.rew_buffs[i][inds]) for i in range(self.num_agents)] 85 | return ([cast(self.obs_buffs[i][inds]) for i in range(self.num_agents)], 86 | [cast(self.ac_buffs[i][inds]) for i in range(self.num_agents)], 87 | ret_rews, 88 | [cast(self.next_obs_buffs[i][inds]) for i in range(self.num_agents)], 89 | [cast(self.done_buffs[i][inds]) for i in range(self.num_agents)]) 90 | 91 | def get_average_rewards(self, N): 92 | if self.filled_i == self.max_steps: 93 | inds = np.arange(self.curr_i - N, self.curr_i) # allow for negative indexing 94 | else: 95 | inds = np.arange(max(0, self.curr_i - N), self.curr_i) 96 | return [self.rew_buffs[i][inds].mean() for i in range(self.num_agents)] 97 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/env_wrappers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified from OpenAI Baselines code to work with multi-agent envs 3 | """ 4 | import numpy as np 5 | from multiprocessing import Process, Pipe 6 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 7 | 8 | 9 | def worker(remote, parent_remote, env_fn_wrapper): 10 | parent_remote.close() 11 | env = env_fn_wrapper.x() 12 | while True: 13 | cmd, data = remote.recv() 14 | if cmd == 'step': 15 | ob, reward, done, info = env.step(data) 16 | if all(done): 17 | ob = env.reset() 18 | remote.send((ob, reward, done, info)) 19 | elif cmd == 'reset': 20 | ob = env.reset() 21 | remote.send(ob) 22 | elif cmd == 'reset_task': 23 | ob = env.reset_task() 24 | remote.send(ob) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces': 29 | remote.send((env.observation_space, env.action_space)) 30 | elif cmd == 'get_agent_types': 31 | if all([hasattr(a, 'adversary') for a in env.agents]): 32 | remote.send(['adversary' if a.adversary else 'agent' for a in 33 | env.agents]) 34 | else: 35 | remote.send(['agent' for _ in env.agents]) 36 | else: 37 | raise NotImplementedError 38 | 39 | 40 | class SubprocVecEnv(VecEnv): 41 | def __init__(self, env_fns, spaces=None): 42 | """ 43 | envs: list of gym environments to run in subprocesses 44 | """ 45 | self.waiting = False 46 | self.closed = False 47 | nenvs = len(env_fns) 48 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 49 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 50 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 51 | for p in self.ps: 52 | p.daemon = True # if the main process crashes, we should not cause things to hang 53 | p.start() 54 | for remote in self.work_remotes: 55 | remote.close() 56 | 57 | self.remotes[0].send(('get_spaces', None)) 58 | observation_space, action_space = self.remotes[0].recv() 59 | self.remotes[0].send(('get_agent_types', None)) 60 | self.agent_types = self.remotes[0].recv() 61 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 62 | 63 | def step_async(self, actions): 64 | for remote, action in zip(self.remotes, actions): 65 | remote.send(('step', action)) 66 | self.waiting = True 67 | 68 | def step_wait(self): 69 | results = [remote.recv() for remote in self.remotes] 70 | self.waiting = False 71 | obs, rews, dones, infos = zip(*results) 72 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 73 | 74 | def reset(self): 75 | for remote in self.remotes: 76 | 
remote.send(('reset', None)) 77 | return np.stack([remote.recv() for remote in self.remotes]) 78 | 79 | def reset_task(self): 80 | for remote in self.remotes: 81 | remote.send(('reset_task', None)) 82 | return np.stack([remote.recv() for remote in self.remotes]) 83 | 84 | def close(self): 85 | if self.closed: 86 | return 87 | if self.waiting: 88 | for remote in self.remotes: 89 | remote.recv() 90 | for remote in self.remotes: 91 | remote.send(('close', None)) 92 | for p in self.ps: 93 | p.join() 94 | self.closed = True 95 | 96 | 97 | class DummyVecEnv(VecEnv): 98 | def __init__(self, env_fns): 99 | self.envs = [fn() for fn in env_fns] 100 | env = self.envs[0] 101 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 102 | if all([hasattr(a, 'adversary') for a in env.agents]): 103 | self.agent_types = ['adversary' if a.adversary else 'agent' for a in 104 | env.agents] 105 | else: 106 | self.agent_types = ['agent' for _ in env.agents] 107 | self.ts = np.zeros(len(self.envs), dtype='int') 108 | self.actions = None 109 | 110 | def step_async(self, actions): 111 | self.actions = actions 112 | 113 | def step_wait(self): 114 | results = [env.step(a) for (a,env) in zip(self.actions, self.envs)] 115 | obs, rews, dones, infos = map(np.array, zip(*results)) 116 | self.ts += 1 117 | for (i, done) in enumerate(dones): 118 | if all(done): 119 | obs[i] = self.envs[i].reset() 120 | self.ts[i] = 0 121 | self.actions = None 122 | return np.array(obs), np.array(rews), np.array(dones), infos 123 | 124 | def reset(self): 125 | results = [env.reset() for env in self.envs] 126 | return np.array(results) 127 | 128 | def close(self): 129 | return -------------------------------------------------------------------------------- /formation_gym/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from .environment import MultiAgentEnv 3 | import os.path as osp 4 | import numpy as np 5 | 6 | def make_env(scenario_name='basic_formation_env', benchmark=False, num_agents = 3): 7 | # load scenario from script 8 | pathname = osp.join(osp.dirname(__file__), 'envs/'+scenario_name+'.py') 9 | scenario = imp.load_source('', pathname).Scenario() 10 | # create world 11 | world = scenario.make_world(num_agents) # use same number of agent and landmarks 12 | # create multiagent environment 13 | if benchmark: 14 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data, shared_viewer = True) 15 | else: 16 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, shared_viewer = True) 17 | return env 18 | 19 | def ezpolicy(obs): 20 | num_agents = len(obs)/6 21 | assert num_agents.is_integer(), num_agents 22 | num_agents = int(num_agents) 23 | # get info from observation 24 | p_vel = obs[:2] 25 | other_pos = obs[2:2*num_agents] 26 | ideal_shape = obs[4*num_agents-2:6*num_agents-2] 27 | ideal_shape = np.reshape(ideal_shape, (-1, 2)) 28 | ideal_shape = ideal_shape - np.mean(ideal_shape, axis = 0) 29 | ideal_vel = obs[-2:] 30 | # calculate relative formation 31 | current_shape = np.append(other_pos, [0,0]) 32 | current_shape = np.reshape(current_shape, (-1,2)) 33 | current_shape -= np.mean(current_shape, axis = 0) 34 | # get action 35 | sort_mark_idx = np.argsort([np.linalg.norm(current_shape[-1] - mark) for mark in ideal_shape]) # distance to different landmarks 36 | for idx in sort_mark_idx: 37 | closest_agent_idx = np.argmin([np.linalg.norm(agent - ideal_shape[idx]) 
for agent in current_shape]) 38 | if closest_agent_idx == (num_agents - 1) or idx == sort_mark_idx[-1]: # this agent is the closet agent 39 | act = np.clip(0.5*(ideal_shape[idx] - current_shape[-1]), -1, 1) 40 | break 41 | # add ideal velocity control to action 42 | done = np.linalg.norm(ideal_shape - current_shape) < 0.01 43 | if done: 44 | act += ideal_vel 45 | else: 46 | act += ideal_vel * 0.3 47 | return act 48 | 49 | def get_action_BFS(policy, obs, num_agents_per_layer): 50 | ''' 51 | :param policy: agent policy function 52 | :param obs: total observation 53 | :param num_agents_per_layer: number of agents per group 54 | ''' 55 | num_layer = np.log(len(obs))/ np.log(num_agents_per_layer) 56 | assert num_layer.is_integer(), 'Observation shape error!' 57 | queue = [obs] 58 | act = [] 59 | while queue: 60 | current_layer_obs = queue.pop(0) 61 | current_layer_num_agents = len(current_layer_obs) 62 | next_layer_num_agents = int(len(current_layer_obs)/num_agents_per_layer) 63 | for i in range(num_agents_per_layer): 64 | leader_obs = current_layer_obs[i*next_layer_num_agents] 65 | # get current layer leader observation 66 | p_vel = leader_obs[:2] 67 | # get observation of others by inference center 68 | current_shape = np.insert(leader_obs[2:2*current_layer_num_agents], 2*i*next_layer_num_agents, [0,0]).reshape((-1, 2)) 69 | layer_current_shape = np.array([np.mean(current_shape[next_layer_num_agents*k:next_layer_num_agents*(k+1)], axis = 0) for k in range(num_agents_per_layer)]) 70 | layer_current_shape -= layer_current_shape[i] 71 | layer_current_shape = np.delete(layer_current_shape, i, 0).flatten() 72 | # get ideal formation 73 | ideal_shape = np.reshape(leader_obs[4*current_layer_num_agents-2:6*current_layer_num_agents-2], (-1, 2)) 74 | layer_target_shape = np.array([np.mean(ideal_shape[next_layer_num_agents*(k):next_layer_num_agents*(k+1)], axis = 0) for k in range(num_agents_per_layer)]).flatten() 75 | # get ideal velocity 76 | layer_target_vel = leader_obs[-2:] 77 | obs_input = np.concatenate((p_vel, layer_current_shape, [0]*2*(num_agents_per_layer-1), layer_target_shape, layer_target_vel)) 78 | current_layer = np.log(current_layer_num_agents)/ np.log(num_agents_per_layer) 79 | next_layer_target_vel = policy(obs_input) * (current_layer) 80 | # next layer observation 81 | if next_layer_num_agents == 1: 82 | # END case: reach the last layer and append the action 83 | act.append(next_layer_target_vel) 84 | else: 85 | next_layer_obs = [] 86 | for j in range(i*next_layer_num_agents, (i+1)*next_layer_num_agents): 87 | # remove redundent observation 88 | obs_n = current_layer_obs[j] 89 | p_vel = obs_n[:2] 90 | others_pos = obs_n[2:2*current_layer_num_agents] 91 | others_pos = others_pos[2*i*next_layer_num_agents:2*(i+1)*next_layer_num_agents-2] 92 | comm = [0]*2*(next_layer_num_agents-1) 93 | shape = obs_n[4*current_layer_num_agents-2:6*current_layer_num_agents-2] 94 | shape = shape[2*i*next_layer_num_agents:2*(i+1)*next_layer_num_agents] 95 | tar_vel = next_layer_target_vel 96 | obs_n = np.concatenate((p_vel, others_pos, comm, shape, tar_vel)) 97 | next_layer_obs.append(obs_n) 98 | queue.append(next_layer_obs) 99 | return act -------------------------------------------------------------------------------- /formation_gym/envs/formation_hd_partial_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from 
formation_gym.core import World, Agent, Landmark 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | 11 | partial observation environment 12 | ''' 13 | 14 | class Scenario(BaseScenario): 15 | def make_world(self, num_agents = 5, num_landmarks = 5, num_obs = 3, world_length = 25): 16 | self.num_obs = num_obs 17 | self.num_agents = num_agents 18 | # world properties 19 | world = World() 20 | world.world_length = world_length 21 | world.dim_c = 2 # communication channel 22 | world.collaborative = True 23 | # agent properties 24 | world.agents = [Agent() for i in range(num_agents)] 25 | for i, agent in enumerate(world.agents): 26 | agent.name = 'agent %d' % i 27 | agent.collide = True 28 | agent.silent = True 29 | agent.size = 0.04 30 | # landmark properties 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmarks %d' % i 34 | landmark.collide = False 35 | landmark.movable = False 36 | landmark.size = 0.02 37 | # initial conditions 38 | self.reset_world(world) 39 | return world 40 | 41 | def observation(self, agent, world): 42 | # landmark pos 43 | entity_pos = [] 44 | for entity in world.landmarks: 45 | entity_pos.append(entity.state.p_pos) 46 | # agent pos & communication 47 | other_pos = [] 48 | comm = [] 49 | # way3: watch for 2 guys 50 | # get agent ID 51 | agent_id = int(agent.name.split()[-1]) 52 | idx = [i % self.num_agents for i in range(agent_id+1, agent_id+1 + self.num_obs)] 53 | for i in idx: 54 | other_pos.append(world.agents[i].state.p_pos - agent.state.p_pos) 55 | for other in world.agents: 56 | if other is agent: continue 57 | comm.append(other.state.c) 58 | # make the furthest point to zero 59 | # way1: make the far observation to zero 60 | # others_dist = np.linalg.norm(other_pos, axis = 1) 61 | # idx = np.argpartition(others_dist, self.num_obs) 62 | # for i in idx[self.num_obs:]: 63 | # other_pos[i] = np.zeros(world.dim_p) 64 | # way2: remove the far obs 65 | # other_pos = other_pos[idx[:self.num_obs]] 66 | return np.concatenate([agent.state.p_vel]+entity_pos + other_pos + comm) 67 | 68 | def reward(self, agent, world): 69 | rew = 0 70 | u = [a.state.p_pos for a in world.agents] 71 | v = [l.state.p_pos for l in world.landmarks] 72 | delta = np.mean(u, 0) - np.mean(v, 0) 73 | u = u - np.mean(u, 0) 74 | v = v - np.mean(v, 0) 75 | rew = -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 76 | # change landmark pos and color 77 | # for i in range(len(world.landmarks)): 78 | # world.landmarks[i].state.p_pos += delta 79 | # dist = min([np.linalg.norm(a.state.p_pos - world.landmarks[i].state.p_pos) for a in world.agents]) 80 | # if dist <= 0.2: world.landmarks[i].color = np.array([0, 0.6, 0]) 81 | # self.set_bound(world) 82 | if agent.collide: 83 | for a in world.agents: 84 | if agent!=a and self.is_collision(a, agent): 85 | rew -= 1 86 | return rew 87 | 88 | def reset_world(self, world): 89 | # agent 90 | for agent in world.agents: 91 | agent.color = np.array([0.35, 0.35, 0.85]) 92 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 93 | agent.state.p_vel = np.zeros(world.dim_p) 94 | agent.state.c = np.zeros(world.dim_c) 95 | # landmark 96 | for landmark in world.landmarks: 97 | landmark.color = np.array([0.25, 0.25, 0.25]) 98 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 99 | landmark.state.p_vel = np.zeros(world.dim_p) 100 | 101 | def benchmark_data(self, agent, 
world): 102 | # get data to debug 103 | rew = self.reward(agent, world) 104 | collisions = 0 105 | if agent.collide: 106 | for a in world.agents: 107 | if self.is_collision(a, agent): 108 | collisions += 1 109 | min_dists = 0 110 | occupied_landmarks = 0 111 | for l in world.landmarks: 112 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 113 | min_dists += min(dists) 114 | if min(dists) < 0.1: 115 | occupied_landmarks += 1 116 | return { 117 | 'reward': rew, 118 | 'collisions': collisions, 119 | 'min_dists': min_dists, 120 | 'occupied_landmarks': occupied_landmarks 121 | } 122 | 123 | def is_collision(self, agent1, agent2): 124 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 125 | return dist < (agent1.size + agent2.size) 126 | 127 | def set_bound(self, world): 128 | for agent in world.agents: 129 | agent.state.p_pos = np.clip(agent.state.p_pos, [-2, -2], [2, 2]) 130 | 131 | -------------------------------------------------------------------------------- /train/mappo/inbox/train_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import numpy as np 5 | from pathlib import Path 6 | import torch 7 | from config import get_config 8 | from onpolicy.envs.env_wrappers import SubprocVecEnv, DummyVecEnv 9 | 10 | import formation_gym 11 | 12 | """Train script for MPEs.""" 13 | 14 | def make_train_env(all_args): 15 | def get_env_fn(rank): 16 | def init_env(): 17 | if all_args.env_name == "MPE": 18 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 19 | else: 20 | print("Can not support the " + 21 | all_args.env_name + "environment.") 22 | raise NotImplementedError 23 | env.seed(all_args.seed + rank * 1000) 24 | return env 25 | return init_env 26 | if all_args.n_rollout_threads == 1: 27 | return DummyVecEnv([get_env_fn(0)]) 28 | else: 29 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 30 | 31 | 32 | def make_eval_env(all_args): 33 | def get_env_fn(rank): 34 | def init_env(): 35 | if all_args.env_name == "MPE": 36 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 37 | else: 38 | print("Can not support the " + 39 | all_args.env_name + "environment.") 40 | raise NotImplementedError 41 | env.seed(all_args.seed * 50000 + rank * 10000) 42 | return env 43 | return init_env 44 | if all_args.n_eval_rollout_threads == 1: 45 | return DummyVecEnv([get_env_fn(0)]) 46 | else: 47 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) 48 | 49 | 50 | def parse_args(args, parser): 51 | parser.add_argument('--scenario_name', type=str, 52 | default='formation_hd_env', help="Which scenario to run on") 53 | parser.add_argument('--num_agents', type=int, 54 | default=3, help="number of players") 55 | 56 | all_args = parser.parse_known_args(args)[0] 57 | 58 | return all_args 59 | 60 | 61 | def main(args): 62 | parser = get_config() 63 | all_args = parse_args(args, parser) 64 | 65 | if all_args.algorithm_name == "rmappo": 66 | assert (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") 67 | elif all_args.algorithm_name == "mappo": 68 | assert (all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False), ("check recurrent policy!") 69 | else: 70 | raise NotImplementedError 71 | 72 | assert (all_args.share_policy == True and all_args.scenario_name == 
'simple_speaker_listener') == False, ( 73 | "The simple_speaker_listener scenario can not use shared policy. Please check the config.py.") 74 | 75 | # cuda 76 | if all_args.cuda and torch.cuda.is_available(): 77 | print("choose to use gpu...") 78 | device = torch.device("cuda:0") 79 | torch.set_num_threads(all_args.n_training_threads) 80 | if all_args.cuda_deterministic: 81 | torch.backends.cudnn.benchmark = False 82 | torch.backends.cudnn.deterministic = True 83 | else: 84 | print("choose to use cpu...") 85 | device = torch.device("cpu") 86 | torch.set_num_threads(all_args.n_training_threads) 87 | 88 | # run dir 89 | # run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 90 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 91 | if not run_dir.exists(): 92 | os.makedirs(str(run_dir)) 93 | 94 | if not run_dir.exists(): 95 | curr_run = 'run1' 96 | else: 97 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] 98 | if len(exst_run_nums) == 0: 99 | curr_run = 'run1' 100 | else: 101 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 102 | run_dir = run_dir / curr_run 103 | if not run_dir.exists(): 104 | os.makedirs(str(run_dir)) 105 | 106 | # seed 107 | torch.manual_seed(all_args.seed) 108 | torch.cuda.manual_seed_all(all_args.seed) 109 | np.random.seed(all_args.seed) 110 | 111 | # env init 112 | envs = make_train_env(all_args) 113 | eval_envs = make_eval_env(all_args) if all_args.use_eval else None 114 | num_agents = all_args.num_agents 115 | 116 | config = { 117 | "all_args": all_args, 118 | "envs": envs, 119 | "eval_envs": eval_envs, 120 | "num_agents": num_agents, 121 | "device": device, 122 | "run_dir": run_dir 123 | } 124 | 125 | # run experiments 126 | if all_args.share_policy: 127 | from onpolicy.runner.shared.mpe_runner import MPERunner as Runner 128 | else: 129 | from onpolicy.runner.separated.mpe_runner import MPERunner as Runner 130 | 131 | runner = Runner(config) 132 | runner.run() 133 | 134 | # post process 135 | envs.close() 136 | if all_args.use_eval and eval_envs is not envs: 137 | eval_envs.close() 138 | 139 | runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json')) 140 | runner.writter.close() 141 | 142 | 143 | if __name__ == "__main__": 144 | main(sys.argv[1:]) 145 | -------------------------------------------------------------------------------- /train/maddpg-v1/maddpg/maddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from maddpg.actor_critic import Actor, Critic 4 | 5 | 6 | class MADDPG: 7 | def __init__(self, args, agent_id): # different agents may have different obs/act dimensions, so each agent needs its own networks; agent_id tells them apart 8 | self.args = args 9 | self.agent_id = agent_id 10 | self.train_step = 0 11 | 12 | # create the network 13 | self.actor_network = Actor(args, agent_id) 14 | self.critic_network = Critic(args) 15 | 16 | # build up the target network 17 | self.actor_target_network = Actor(args, agent_id) 18 | self.critic_target_network = Critic(args) 19 | 20 | # load the weights into the target networks 21 | self.actor_target_network.load_state_dict(self.actor_network.state_dict()) 22 | self.critic_target_network.load_state_dict(self.critic_network.state_dict()) 23 | 24 | # create the optimizer 25 | 
self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) 26 | self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic) 27 | 28 | # create the directory for storing the model 29 | if not os.path.exists('results/' + self.args.save_dir): 30 | os.mkdir('results/' + self.args.save_dir) 31 | # path to save the model 32 | self.model_path = 'results/' + self.args.save_dir + '/' + self.args.scenario_name 33 | if not os.path.exists(self.model_path): 34 | os.mkdir(self.model_path) 35 | self.model_path = self.model_path + '/' + 'agent_%d' % agent_id 36 | if not os.path.exists(self.model_path): 37 | os.mkdir(self.model_path) 38 | 39 | # load model 40 | actor_fullpath = self.model_path + '/99_actor_params.pkl' 41 | critic_fullpath = self.model_path + '/99_critic_params.pkl' 42 | if os.path.exists(actor_fullpath): 43 | self.actor_network.load_state_dict(torch.load(actor_fullpath)) 44 | self.critic_network.load_state_dict(torch.load(critic_fullpath)) 45 | print('Agent {} successfully loaded actor_network: {}'.format(self.agent_id, actor_fullpath)) 46 | print('Agent {} successfully loaded critic_network: {}'.format(self.agent_id, critic_fullpath)) 47 | 48 | # soft update 49 | def _soft_update_target_network(self): 50 | for target_param, param in zip(self.actor_target_network.parameters(), self.actor_network.parameters()): 51 | target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data) 52 | 53 | for target_param, param in zip(self.critic_target_network.parameters(), self.critic_network.parameters()): 54 | target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data) 55 | 56 | # update the network 57 | def train(self, transitions, other_agents): 58 | for key in transitions.keys(): 59 | if torch.is_tensor(transitions[key]): continue 60 | transitions[key] = torch.tensor(transitions[key], dtype=torch.float32) 61 | r = transitions['r_%d' % self.agent_id] # only this agent's own reward is needed for training 62 | o, u, o_next = [], [], [] # containers for each agent's observations, actions and next observations 63 | for agent_id in range(self.args.n_agents): 64 | o.append(transitions['o_%d' % agent_id]) 65 | u.append(transitions['u_%d' % agent_id]) 66 | o_next.append(transitions['o_next_%d' % agent_id]) 67 | 68 | # calculate the target Q value function 69 | u_next = [] 70 | with torch.no_grad(): 71 | # get the actions corresponding to the next state 72 | index = 0 73 | for agent_id in range(self.args.n_agents): 74 | if agent_id == self.agent_id: 75 | u_next.append(self.actor_target_network(o_next[agent_id])) 76 | else: 77 | # other_agents has one fewer entry than the total number of agents (the current agent is excluded), so it cannot be indexed by agent_id directly 78 | u_next.append(other_agents[index].policy.actor_target_network(o_next[agent_id])) 79 | index += 1 80 | q_next = self.critic_target_network(o_next, u_next).detach() 81 | 82 | target_q = (r.unsqueeze(1) + self.args.gamma * q_next).detach() 83 | 84 | # the q loss 85 | q_value = self.critic_network(o, u) 86 | critic_loss = (target_q - q_value).pow(2).mean() 87 | 88 | # the actor loss 89 | # re-select the current agent's action in the joint action while keeping the other agents' actions fixed 90 | u[self.agent_id] = self.actor_network(o[self.agent_id]) 91 | actor_loss = - self.critic_network(o, u).mean() 92 | # if self.agent_id == 0: 93 | # print('critic_loss is {}, actor_loss is {}'.format(critic_loss, actor_loss)) 94 | # update the network 95 | self.actor_optim.zero_grad() 96 | actor_loss.backward() 97 | self.actor_optim.step() 98 | self.critic_optim.zero_grad() 99 | critic_loss.backward() 100 | self.critic_optim.step() 101 | 102 | 
self._soft_update_target_network() 103 | if self.train_step > 0 and self.train_step % self.args.save_rate == 0: 104 | self.save_model(self.train_step) 105 | self.train_step += 1 106 | 107 | def save_model(self, train_step): 108 | num = str(train_step // self.args.save_rate) 109 | model_path = os.path.join('results/'+self.args.save_dir, self.args.scenario_name) 110 | if not os.path.exists(model_path): 111 | os.makedirs(model_path) 112 | model_path = os.path.join(model_path, 'agent_%d' % self.agent_id) 113 | if not os.path.exists(model_path): 114 | os.makedirs(model_path) 115 | torch.save(self.actor_network.state_dict(), model_path + '/' + 'actor_params.pkl') 116 | torch.save(self.critic_network.state_dict(), model_path + '/' + 'critic_params.pkl') 117 | 118 | 119 | -------------------------------------------------------------------------------- /train/maddpg-v5/render.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | from pathlib import Path 5 | import socket 6 | import setproctitle 7 | import torch 8 | from config import get_config 9 | from offpolicy.utils.util import get_cent_act_dim, get_dim_from_space 10 | import formation_gym 11 | from offpolicy.envs.env_wrappers import DummyVecEnv, SubprocVecEnv 12 | 13 | 14 | def make_train_env(all_args): 15 | def get_env_fn(rank): 16 | def init_env(): 17 | print(all_args.env_name) 18 | if all_args.env_name == "formation": 19 | env = formation_gym.make_env(all_args.scenario_name, benchmark = False, num_agents = all_args.num_agents) 20 | else: 21 | print("Can not support the " + 22 | all_args.env_name + "environment.") 23 | raise NotImplementedError 24 | env.seed(all_args.seed + rank * 1000) 25 | return env 26 | return init_env 27 | if all_args.n_rollout_threads == 1: 28 | return DummyVecEnv([get_env_fn(0)]) 29 | else: 30 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 31 | 32 | 33 | def make_eval_env(all_args): 34 | def get_env_fn(rank): 35 | def init_env(): 36 | if all_args.env_name == "formation": 37 | env = formation_gym.make_env(all_args.scenario_name, benchmark = False, num_agents = all_args.num_agents) 38 | else: 39 | print("Can not support the " + 40 | all_args.env_name + "environment.") 41 | raise NotImplementedError 42 | env.seed(all_args.seed * 50000 + rank * 10000) 43 | return env 44 | return init_env 45 | if all_args.n_eval_rollout_threads == 1: 46 | return DummyVecEnv([get_env_fn(0)]) 47 | else: 48 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) 49 | 50 | 51 | def parse_args(args, parser): 52 | parser.add_argument('--scenario_name', type=str, 53 | default='formation_hd_env', help="Which scenario to run on") 54 | parser.add_argument("--num_landmarks", type=int, default=3) 55 | parser.add_argument('--num_agents', type=int, 56 | default=3, help="number of agents") 57 | parser.add_argument('--use_same_share_obs', action='store_false', 58 | default=True, help="Whether to use available actions") 59 | 60 | all_args = parser.parse_known_args(args)[0] 61 | 62 | return all_args 63 | 64 | 65 | def main(args): 66 | parser = get_config() 67 | all_args = parse_args(args, parser) 68 | 69 | # cuda and # threads 70 | if all_args.cuda and torch.cuda.is_available(): 71 | print("choose to use gpu...") 72 | device = torch.device("cuda:0") 73 | torch.set_num_threads(all_args.n_training_threads) 74 | if all_args.cuda_deterministic: 75 | torch.backends.cudnn.benchmark = False 76 | 
torch.backends.cudnn.deterministic = True 77 | else: 78 | print("choose to use cpu...") 79 | device = torch.device("cpu") 80 | torch.set_num_threads(all_args.n_training_threads) 81 | 82 | # setup file to output tensorboard, hyperparameters, and saved models 83 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + "/results") / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 84 | 85 | # create env 86 | env = make_train_env(all_args) 87 | # env = formation_gym.make_env(all_args.scenario_name, False, all_args.num_agents) 88 | num_agents = all_args.num_agents 89 | 90 | # create policies and mapping fn 91 | if all_args.share_policy: 92 | policy_info = { 93 | 'policy_0': {"cent_obs_dim": get_dim_from_space(env.share_observation_space[0]), 94 | "cent_act_dim": get_cent_act_dim(env.action_space), 95 | "obs_space": env.observation_space[0], 96 | "share_obs_space": env.share_observation_space[0], 97 | "act_space": env.action_space[0]} 98 | } 99 | 100 | def policy_mapping_fn(id): return 'policy_0' 101 | else: 102 | policy_info = { 103 | 'policy_' + str(agent_id): {"cent_obs_dim": get_dim_from_space(env.share_observation_space[agent_id]), 104 | "cent_act_dim": get_cent_act_dim(env.action_space), 105 | "obs_space": env.observation_space[agent_id], 106 | "share_obs_space": env.share_observation_space[agent_id], 107 | "act_space": env.action_space[agent_id]} 108 | for agent_id in range(num_agents) 109 | } 110 | 111 | def policy_mapping_fn(agent_id): return 'policy_' + str(agent_id) 112 | 113 | # choose algo 114 | if all_args.algorithm_name in ["rmatd3", "rmaddpg", "rmasac", "qmix", "vdn"]: 115 | from offpolicy.runner.rnn.mpe_runner import MPERunner as Runner 116 | assert all_args.n_rollout_threads == 1, ( 117 | "only support 1 env in recurrent version.") 118 | eval_env = env 119 | elif all_args.algorithm_name in ["matd3", "maddpg", "masac", "mqmix", "mvdn"]: 120 | from offpolicy.runner.mlp.mpe_runner import MPERunner as Runner 121 | eval_env = make_eval_env(all_args) 122 | else: 123 | raise NotImplementedError 124 | 125 | config = {"args": all_args, 126 | "policy_info": policy_info, 127 | "policy_mapping_fn": policy_mapping_fn, 128 | "env": env, 129 | "eval_env": eval_env, 130 | "num_agents": num_agents, 131 | "device": device, 132 | "use_same_share_obs": all_args.use_same_share_obs, 133 | "run_dir": run_dir 134 | } 135 | 136 | runner = Runner(config=config) 137 | runner.eval(render = True) 138 | 139 | env.close() 140 | if all_args.use_eval and (eval_env is not env): 141 | eval_env.close() 142 | 143 | if __name__ == "__main__": 144 | main(sys.argv[1:]) 145 | -------------------------------------------------------------------------------- /formation_gym/envs/formation_hd_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from formation_gym.core import World, Agent, Landmark 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | ''' 11 | 12 | class Scenario(BaseScenario): 13 | def make_world(self, num_agents = 3, episode_length = 100): 14 | # world properties 15 | world = World() 16 | world.world_length = episode_length 17 | world.dim_c = 2 # communication channel 18 | world.collaborative = True 19 | self.num_agents = num_agents 20 | # agent properties 21 | world.agents = [Agent() for i in range(num_agents)] 22 | 
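Back in render.py above, the share_policy branch builds one policy_info entry per policy plus a mapping function from agent id to policy key. A stripped-down sketch of that pattern; the empty dicts stand in for the real observation/action space entries.

num_agents = 3
share_policy = False   # toggle to compare both behaviours

if share_policy:
    policy_info = {'policy_0': {}}                                  # one policy shared by all agents
    def policy_mapping_fn(agent_id): return 'policy_0'
else:
    policy_info = {'policy_%d' % i: {} for i in range(num_agents)}  # one policy per agent
    def policy_mapping_fn(agent_id): return 'policy_%d' % agent_id

print(sorted(policy_info), [policy_mapping_fn(i) for i in range(num_agents)])
# ['policy_0', 'policy_1', 'policy_2'] ['policy_0', 'policy_1', 'policy_2'] with share_policy = False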
for i, agent in enumerate(world.agents): 23 | agent.name = 'agent %d' % i 24 | agent.collide = True 25 | agent.silent = True 26 | agent.size = 0.03 27 | # landmark properties 28 | world.landmarks = [Landmark() for i in range(num_agents)] 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.name = 'landmarks %d' % i 31 | landmark.collide = False 32 | landmark.movable = False 33 | landmark.size = 0.01 34 | # initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | def observation(self, agent, world): 39 | # change landmark pos for visualization (Note: not necessary for training) 40 | u = [a.state.p_pos for a in world.agents] 41 | v = [l.state.p_pos for l in world.landmarks] 42 | delta = np.mean(u,0) - np.mean(v,0) 43 | for l in world.landmarks: 44 | l.state.p_pos += delta 45 | # for i in range(3): 46 | # u = [world.agents[i].state.p_pos for i in range(i*3, (i+1)*3)] 47 | # v = [world.landmarks[i].state.p_pos for i in range(i*3, (i+1)*3)] 48 | # delta = np.mean(u, 0) - np.mean(v, 0) 49 | # for j in range(3): 50 | # world.landmarks[i*3+j].state.p_pos += delta # synchronize the center of landmarks and agents 51 | # agent pos & communication 52 | other_pos = np.array([]) 53 | comm = np.array([]) 54 | for other in world.agents: 55 | if other is agent: continue 56 | comm = np.append(comm, other.state.c) 57 | other_pos = np.append(other_pos, other.state.p_pos - agent.state.p_pos) 58 | foo = [world.agents[i].state.p_pos for i in range(0, 3)] 59 | return np.concatenate((agent.state.p_vel, other_pos, comm, self.ideal_shape.flatten(), self.ideal_vel)) 60 | 61 | def reward(self, agent, world): 62 | # part1: formation reward: define by hausdorff distance 63 | rew = 0 64 | agent_shape = [a.state.p_pos for a in world.agents] 65 | agent_shape = agent_shape - np.mean(agent_shape, 0) 66 | rew = -max(directed_hausdorff(agent_shape, self.ideal_shape)[0], directed_hausdorff(self.ideal_shape, agent_shape)[0]) 67 | # part2: velocity reward: define by overall velocity difference 68 | mean_vel = np.mean([a.state.p_vel for a in world.agents], axis = 0) 69 | rew -= np.linalg.norm(self.ideal_vel - mean_vel) 70 | # part3: collision 71 | if agent.collide: 72 | for a in world.agents: 73 | if agent!=a and self.is_collision(a, agent): 74 | rew -= 1 75 | return rew 76 | 77 | def reset_world(self, world): 78 | # agent 79 | for i, agent in enumerate(world.agents): 80 | agent.color = np.array([0.35, 0.35, 0.85]) 81 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 82 | agent.state.p_vel = np.zeros(world.dim_p) 83 | agent.state.c = np.zeros(world.dim_c) 84 | # landmark: use can use `generate_shape` to generate target shape 85 | # self.ideal_shape = self.generate_shape(3).reshape(-1,2) 86 | self.ideal_shape = [] 87 | for i, landmark in enumerate(world.landmarks): 88 | landmark.color = np.array([0.25, 0.25, 0.25]) 89 | pos = np.random.uniform(-1, +1, world.dim_p) 90 | self.ideal_shape.append(pos) 91 | landmark.state.p_pos = self.ideal_shape[i] 92 | landmark.state.p_vel = np.zeros(world.dim_p) 93 | self.ideal_shape = self.ideal_shape - np.mean(self.ideal_shape, 0) 94 | # ideal velocity 95 | self.ideal_vel = np.random.uniform(-1, +1, world.dim_p) 96 | 97 | def benchmark_data(self, agent, world): 98 | # get data to debug 99 | rew = self.reward(agent, world) 100 | collisions = 0 101 | if agent.collide: 102 | for a in world.agents: 103 | if self.is_collision(a, agent): 104 | collisions += 1 105 | min_dists = 0 106 | occupied_landmarks = 0 107 | for l in world.landmarks: 108 | dists = 
[np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 109 | min_dists += min(dists) 110 | if min(dists) < 0.1: 111 | occupied_landmarks += 1 112 | return { 113 | 'reward': rew, 114 | 'collisions': collisions, 115 | 'min_dists': min_dists, 116 | 'occupied_landmarks': occupied_landmarks 117 | } 118 | 119 | def is_collision(self, agent1, agent2): 120 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 121 | return dist < (agent1.size + agent2.size)/2 122 | 123 | def generate_shape(self, layer, layer_shapes = None): 124 | # this is default shape 125 | layer_shapes = layer_shapes or np.array([ 126 | [[0, -1], [0.5, 0], [0, 1]], 127 | [[0, 1.6], [-1, 0], [1, 0]], 128 | [[1.5, 0], [0, 0], [-1.5, 0]], 129 | [[0, 0.6], [1, 0], [-1, 0]], 130 | ]) 131 | num_layers = layer_shapes.shape[0] 132 | assert layer < num_layers, 'Layer shape is not enough!' 133 | num_agents_per_layer = layer_shapes.shape[1] 134 | if layer == 0: 135 | return layer_shapes[0] 136 | else: 137 | old_shape = self.generate_shape(layer-1) 138 | shape = np.array([(layer_shapes[layer][i] + old_shape * 0.45) for i in range(num_agents_per_layer)]) 139 | return shape 140 | 141 | if __name__ == '__main__': 142 | s = Scenario() -------------------------------------------------------------------------------- /train/mappo/train_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import wandb 5 | import socket 6 | import setproctitle 7 | import numpy as np 8 | from pathlib import Path 9 | import torch 10 | from onpolicy.config import get_config 11 | import formation_gym 12 | from onpolicy.envs.env_wrappers import SubprocVecEnv, DummyVecEnv 13 | 14 | """Train script for formation control.""" 15 | 16 | def make_train_env(all_args): 17 | def get_env_fn(rank): 18 | def init_env(): 19 | if all_args.env_name == "MPE": 20 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 21 | else: 22 | print("Can not support the " + 23 | all_args.env_name + "environment.") 24 | raise NotImplementedError 25 | env.seed(all_args.seed + rank * 1000) 26 | return env 27 | return init_env 28 | if all_args.n_rollout_threads == 1: 29 | return DummyVecEnv([get_env_fn(0)]) 30 | else: 31 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 32 | 33 | 34 | def make_eval_env(all_args): 35 | def get_env_fn(rank): 36 | def init_env(): 37 | if all_args.env_name == "MPE": 38 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 39 | else: 40 | print("Can not support the " + 41 | all_args.env_name + "environment.") 42 | raise NotImplementedError 43 | env.seed(all_args.seed * 50000 + rank * 10000) 44 | return env 45 | return init_env 46 | if all_args.n_eval_rollout_threads == 1: 47 | return DummyVecEnv([get_env_fn(0)]) 48 | else: 49 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) 50 | 51 | 52 | def parse_args(args, parser): 53 | parser.add_argument('--scenario_name', type=str, 54 | default='simple_spread', help="Which scenario to run on") 55 | parser.add_argument("--num_landmarks", type=int, default=3) 56 | parser.add_argument('--num_agents', type=int, 57 | default=2, help="number of players") 58 | 59 | all_args = parser.parse_known_args(args)[0] 60 | 61 | return all_args 62 | 63 | 64 | def main(args): 65 | parser = get_config() 66 | all_args = parse_args(args, parser) 67 | 68 | if all_args.algorithm_name == "rmappo": 69 | 
assert (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") 70 | elif all_args.algorithm_name == "mappo": 71 | assert (all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False), ("check recurrent policy!") 72 | else: 73 | raise NotImplementedError 74 | 75 | assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, ( 76 | "The simple_speaker_listener scenario can not use shared policy. Please check the config.py.") 77 | 78 | # cuda 79 | if all_args.cuda and torch.cuda.is_available(): 80 | print("choose to use gpu...") 81 | device = torch.device("cuda:0") 82 | torch.set_num_threads(all_args.n_training_threads) 83 | if all_args.cuda_deterministic: 84 | torch.backends.cudnn.benchmark = False 85 | torch.backends.cudnn.deterministic = True 86 | else: 87 | print("choose to use cpu...") 88 | device = torch.device("cpu") 89 | torch.set_num_threads(all_args.n_training_threads) 90 | 91 | # run dir 92 | run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[ 93 | 0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 94 | if not run_dir.exists(): 95 | os.makedirs(str(run_dir)) 96 | 97 | # wandb 98 | if all_args.use_wandb: 99 | run = wandb.init(config=all_args, 100 | project=all_args.env_name, 101 | entity=all_args.user_name, 102 | notes=socket.gethostname(), 103 | name=str(all_args.algorithm_name) + "_" + 104 | str(all_args.experiment_name) + 105 | "_seed" + str(all_args.seed), 106 | group=all_args.scenario_name, 107 | dir=str(run_dir), 108 | job_type="training", 109 | reinit=True) 110 | else: 111 | if not run_dir.exists(): 112 | curr_run = 'run1' 113 | else: 114 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] 115 | if len(exst_run_nums) == 0: 116 | curr_run = 'run1' 117 | else: 118 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 119 | run_dir = run_dir / curr_run 120 | if not run_dir.exists(): 121 | os.makedirs(str(run_dir)) 122 | 123 | setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \ 124 | str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name)) 125 | 126 | # seed 127 | torch.manual_seed(all_args.seed) 128 | torch.cuda.manual_seed_all(all_args.seed) 129 | np.random.seed(all_args.seed) 130 | 131 | # env init 132 | envs = make_train_env(all_args) 133 | eval_envs = make_eval_env(all_args) if all_args.use_eval else None 134 | num_agents = all_args.num_agents 135 | 136 | config = { 137 | "all_args": all_args, 138 | "envs": envs, 139 | "eval_envs": eval_envs, 140 | "num_agents": num_agents, 141 | "device": device, 142 | "run_dir": run_dir 143 | } 144 | 145 | # run experiments 146 | if all_args.share_policy: 147 | from onpolicy.runner.shared.mpe_runner import MPERunner as Runner 148 | else: 149 | from onpolicy.runner.separated.mpe_runner import MPERunner as Runner 150 | 151 | runner = Runner(config) 152 | runner.run() 153 | 154 | # post process 155 | envs.close() 156 | if all_args.use_eval and eval_envs is not envs: 157 | eval_envs.close() 158 | 159 | if all_args.use_wandb: 160 | run.finish() 161 | else: 162 | runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json')) 163 | runner.writter.close() 164 | 165 | 166 | if __name__ == "__main__": 167 | main(sys.argv[1:]) 168 | -------------------------------------------------------------------------------- 
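Both formation_hd_env.py above and the formation_hd_obs_env.py scenario that follows score a formation by the symmetric Hausdorff distance between the zero-centred agent positions and the target shape. A worked toy example with made-up coordinates:

import numpy as np
from scipy.spatial.distance import directed_hausdorff

agents = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
target = np.array([[0.1, 0.1], [1.0, 0.1], [0.0, 1.2]])

# centre both point sets, as reset_world()/reward() do in the scenarios
agents = agents - agents.mean(axis=0)
target = target - target.mean(axis=0)

# directed_hausdorff is asymmetric, so the reward takes the max of both directions
d = max(directed_hausdorff(agents, target)[0], directed_hausdorff(target, agents)[0])
reward = -d
print(round(d, 3))   # about 0.075 for these points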
/formation_gym/envs/formation_hd_obs_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from formation_gym.core import World, Agent, Landmark, Wall 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | add obstables into consideration 11 | ''' 12 | 13 | class Scenario(BaseScenario): 14 | def make_world(self, num_agents = 4, num_landmarks = 4, num_obstacles = 3, world_length = 50): 15 | self.num_agents = num_agents 16 | self.num_landmarks = num_landmarks 17 | self.num_obstacles = num_obstacles 18 | # world properties 19 | world = World() 20 | world.world_length = world_length 21 | world.dim_c = 2 # communication channel 22 | world.collaborative = True 23 | # agent properties 24 | world.agents = [Agent() for i in range(num_agents)] 25 | for i, agent in enumerate(world.agents): 26 | agent.name = 'agent %d' % i 27 | agent.collide = True 28 | agent.silent = True 29 | agent.size = 0.1 30 | # landmark and obstacles properties 31 | world.landmarks = [Landmark() for i in range(num_landmarks + num_obstacles)] 32 | for i, landmark in enumerate(world.landmarks): 33 | # setup landmarks 34 | if i < num_landmarks: 35 | landmark.name = 'landmarks %d' % i 36 | landmark.collide = False 37 | landmark.movable = False 38 | landmark.size = 0.02 39 | # setup obstacles 40 | else: 41 | landmark.name = 'obstacles %d' % (i - num_landmarks) 42 | landmark.collide = True 43 | landmark.movable = True 44 | landmark.size = 0.15 45 | # setup walls 46 | # world.walls = [] 47 | # world.walls.append(Wall(orient='H',axis_pos=2.6,endpoints=(-2.2, 2.2),width=0.2,hard=True)) 48 | # world.walls.append(Wall(orient='H',axis_pos=-2.6,endpoints=(-2.2, 2.2),width=0.2,hard=True)) 49 | # world.walls.append(Wall(orient='V',axis_pos=2.2,endpoints=(-10, 10),width=0.2,hard=True)) 50 | # world.walls.append(Wall(orient='V',axis_pos=-2.2,endpoints=(-10, 10),width=0.2,hard=True)) 51 | # initial conditions 52 | self.reset_world(world) 53 | return world 54 | 55 | def observation(self, agent, world): 56 | # landmark pos 57 | entity_pos = [] 58 | for entity in world.landmarks[:self.num_landmarks]: 59 | entity_pos.append(entity.state.p_pos) 60 | for entity in world.landmarks[self.num_landmarks:]: 61 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 62 | # agent pos & communication 63 | other_pos = [] 64 | comm = [] 65 | for other in world.agents: 66 | if other is agent: continue 67 | comm.append(other.state.c) 68 | other_pos.append(other.state.p_pos - agent.state.p_pos) 69 | return np.concatenate([agent.state.p_vel]+entity_pos + other_pos + comm) 70 | 71 | def reward(self, agent, world): 72 | rew = 0 73 | u = [a.state.p_pos for a in world.agents] 74 | v = [l.state.p_pos for l in world.landmarks[:self.num_landmarks]] 75 | delta = np.mean(u, 0) - np.mean(v, 0) 76 | u = u - np.mean(u, 0) 77 | v = v - np.mean(v, 0) 78 | rew = -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 79 | # set boundary 80 | # self.set_bound(world) 81 | # change landmark pos and color 82 | for i, landmark in enumerate(world.landmarks): 83 | if i < self.num_landmarks: 84 | delta = [0, 0] 85 | landmark.state.p_pos += delta 86 | else: 87 | if landmark.state.p_pos[1] > -2.2: 88 | landmark.state.p_vel = np.array([0, -1]) 89 | else: landmark.state.p_vel = np.array([0, 0]) 90 | # dist = 
min([np.linalg.norm(a.state.p_pos - world.landmarks[i].state.p_pos) for a in world.agents]) 91 | # if dist <= 0.2: world.landmarks[i].color = np.array([0, 0.6, 0]) 92 | if agent.collide: 93 | for a in world.agents: 94 | if agent!=a and self.is_collision(a, agent): 95 | rew -= 2 96 | for l in world.landmarks[self.num_landmarks:]: 97 | if self.is_collision(l, agent): 98 | rew -= 2 99 | return rew 100 | 101 | def reset_world(self, world): 102 | # agent 103 | for agent in world.agents: 104 | agent.color = np.array([0.65, 0.65, 0.85]) 105 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 106 | agent.state.p_vel = np.zeros(world.dim_p) 107 | agent.state.c = np.zeros(world.dim_c) 108 | # landmark 109 | for i, landmark in enumerate(world.landmarks): 110 | step = np.linspace(-1.8, 1.8, self.num_obstacles+1) 111 | # setup landmarks 112 | if i entity.max_speed: 167 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 168 | np.square(entity.state.p_vel[1])) * entity.max_speed 169 | entity.state.p_pos += entity.state.p_vel * self.dt 170 | 171 | def update_agent_state(self, agent): 172 | # set communication state (directly for now) 173 | if agent.silent: 174 | agent.state.c = np.zeros(self.dim_c) 175 | else: 176 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 177 | agent.state.c = agent.action.c + noise 178 | 179 | # get collision forces for any contact between two entities 180 | def get_collision_force(self, entity_a, entity_b): 181 | if (not entity_a.collide) or (not entity_b.collide): 182 | return [None, None] # not a collider 183 | if (entity_a is entity_b): 184 | return [None, None] # don't collide against itself 185 | # compute actual distance between entities 186 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 187 | dist = np.sqrt(np.sum(np.square(delta_pos))) 188 | # minimum allowable distance 189 | dist_min = entity_a.size + entity_b.size 190 | # softmax penetration 191 | k = self.contact_margin 192 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 193 | force = self.contact_force * delta_pos / dist * penetration 194 | force_a = +force if entity_a.movable else None 195 | force_b = -force if entity_b.movable else None 196 | return [force_a, force_b] -------------------------------------------------------------------------------- /train/maddpg-v3/main.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from ray.tune import run_experiments 3 | from ray.tune.registry import register_trainable, register_env 4 | from env import MultiAgentParticleEnv, FormationEnv 5 | import ray.rllib.contrib.maddpg.maddpg as maddpg 6 | import argparse 7 | 8 | import os 9 | 10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 11 | 12 | 13 | class CustomStdOut(object): 14 | def _log_result(self, result): 15 | if result["training_iteration"] % 50 == 0: 16 | try: 17 | print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 18 | result["timesteps_total"], 19 | result["episodes_total"], 20 | result["episode_reward_mean"], 21 | result["policy_reward_mean"], 22 | round(result["time_total_s"] - self.cur_time, 3) 23 | )) 24 | except: 25 | pass 26 | 27 | self.cur_time = result["time_total_s"] 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser("MADDPG with OpenAI MPE") 32 | 33 | # Environment 34 | parser.add_argument("--scenario", type=str, default="formation_hd_env", 35 | help="name of the scenario script") 36 | 
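The physics fragment above computes collision forces with a softplus-shaped penetration term, penetration = logaddexp(0, -(dist - dist_min)/k) * k, so the repulsive force ramps up smoothly as two entities begin to overlap instead of switching on abruptly. A quick numeric check; the margin and radii below are illustrative values, not the environment defaults.

import numpy as np

k = 0.25        # contact margin (illustrative)
dist_min = 0.2  # sum of the two entity radii (illustrative)

for dist in (0.5, 0.25, 0.2, 0.1):
    penetration = np.logaddexp(0, -(dist - dist_min) / k) * k
    print(dist, round(penetration, 3))
# far apart -> nearly zero; just touching -> k*ln(2), about 0.173; overlapping -> grows roughly linearly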
parser.add_argument("--max-episode-len", type=int, default=25, 37 | help="maximum episode length") 38 | parser.add_argument("--num-episodes", type=int, default=60000, 39 | help="number of episodes") 40 | parser.add_argument("--num-adversaries", type=int, default=0,help="number of adversaries") 41 | parser.add_argument("--num-agents", type=int, default=3,help="number of agents") 42 | parser.add_argument("--good-policy", type=str, default="maddpg", 43 | help="policy for good agents") 44 | parser.add_argument("--adv-policy", type=str, default="maddpg", 45 | help="policy of adversaries") 46 | 47 | # Core training parameters 48 | parser.add_argument("--lr", type=float, default=1e-2, 49 | help="learning rate for Adam optimizer") 50 | parser.add_argument("--gamma", type=float, default=0.95, 51 | help="discount factor") 52 | # NOTE: 1 iteration = sample_batch_size * num_workers timesteps * num_envs_per_worker 53 | parser.add_argument("--sample-batch-size", type=int, default=25, 54 | help="number of data points sampled /update /worker") 55 | parser.add_argument("--train-batch-size", type=int, default=1024, 56 | help="number of data points /update") 57 | parser.add_argument("--n-step", type=int, default=1, 58 | help="length of multistep value backup") 59 | parser.add_argument("--num-units", type=int, default=64, 60 | help="number of units in the mlp") 61 | 62 | # Checkpoint 63 | parser.add_argument("--checkpoint-freq", type=int, default=7500, 64 | help="save model once every time this many iterations are completed") 65 | parser.add_argument("--local-dir", type=str, default="./ray_results", 66 | help="path to save checkpoints") 67 | parser.add_argument("--restore", type=str, default=None, 68 | help="directory in which training state and model are loaded") 69 | 70 | # Parallelism 71 | parser.add_argument("--num-workers", type=int, default=1) 72 | parser.add_argument("--num-envs-per-worker", type=int, default=4) 73 | parser.add_argument("--num-gpus", type=int, default=0) 74 | 75 | return parser.parse_args() 76 | 77 | 78 | def main(args): 79 | # ray.init(redis_max_memory=int(1e10), object_store_memory=int(3e9)) 80 | ray.init() 81 | MADDPGAgent = maddpg.MADDPGTrainer.with_updates( 82 | mixins=[CustomStdOut] 83 | ) 84 | register_trainable("MADDPG", MADDPGAgent) 85 | 86 | if 'formation' not in args.scenario: 87 | def env_creater(mpe_args): 88 | return MultiAgentParticleEnv(**mpe_args) 89 | 90 | register_env("mpe", env_creater) 91 | 92 | env = env_creater({ 93 | "scenario_name": args.scenario 94 | }) 95 | else: 96 | def env_creater(mpe_args): 97 | return FormationEnv(**mpe_args) 98 | 99 | register_env("mpe", env_creater) 100 | 101 | env = env_creater({ 102 | "scenario_name": args.scenario, 103 | 'benchmark': False, 104 | 'num_agents': args.num_agents 105 | }) 106 | 107 | def gen_policy(i): 108 | use_local_critic = [ 109 | args.adv_policy == "ddpg" if i < args.num_adversaries else 110 | args.good_policy == "ddpg" for i in range(env.num_agents) 111 | ] 112 | return ( 113 | None, 114 | env.observation_space_dict[i], 115 | env.action_space_dict[i], 116 | { 117 | "agent_id": i, 118 | "use_local_critic": use_local_critic[i], 119 | "obs_space_dict": env.observation_space_dict, 120 | "act_space_dict": env.action_space_dict, 121 | } 122 | ) 123 | 124 | policies = {"policy_%d" %i: gen_policy(i) for i in range(len(env.observation_space_dict))} 125 | policy_ids = list(policies.keys()) 126 | 127 | run_experiments({ 128 | "MADDPG_RLLib": { 129 | "run": "MADDPG", 130 | "env": "mpe", 131 | "stop": { 132 | "episodes_total": 
args.num_episodes, 133 | }, 134 | "checkpoint_freq": args.checkpoint_freq, 135 | "local_dir": args.local_dir, 136 | "restore": args.restore, 137 | "config": { 138 | # === Log === 139 | "log_level": "ERROR", 140 | 141 | # === Environment === 142 | "env_config": { 143 | "scenario_name": args.scenario, 144 | }, 145 | "num_envs_per_worker": args.num_envs_per_worker, 146 | "horizon": args.max_episode_len, 147 | 148 | # === Policy Config === 149 | # --- Model --- 150 | "good_policy": args.good_policy, 151 | "adv_policy": args.adv_policy, 152 | "actor_hiddens": [args.num_units] * 2, 153 | "actor_hidden_activation": "relu", 154 | "critic_hiddens": [args.num_units] * 2, 155 | "critic_hidden_activation": "relu", 156 | "n_step": args.n_step, 157 | "gamma": args.gamma, 158 | 159 | # --- Exploration --- 160 | "tau": 0.01, 161 | 162 | # --- Replay buffer --- 163 | "buffer_size": int(1e6), 164 | 165 | # --- Optimization --- 166 | "actor_lr": args.lr, 167 | "critic_lr": args.lr, 168 | "learning_starts": args.train_batch_size * args.max_episode_len, 169 | # "sample_batch_size": args.sample_batch_size, 170 | "train_batch_size": args.train_batch_size, 171 | "batch_mode": "truncate_episodes", 172 | 173 | # --- Parallelism --- 174 | "num_workers": args.num_workers, 175 | "num_gpus": args.num_gpus, 176 | "num_gpus_per_worker": 0, 177 | 178 | # === Multi-agent setting === 179 | "multiagent": { 180 | "policies": policies, 181 | "policy_mapping_fn": ray.tune.function( 182 | lambda i: policy_ids[i] 183 | ) 184 | }, 185 | }, 186 | }, 187 | }, verbose=0) 188 | 189 | 190 | if __name__ == '__main__': 191 | args = parse_args() 192 | main(args) 193 | -------------------------------------------------------------------------------- /train/maddpg-v2/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import time 4 | import os 5 | import numpy as np 6 | from gym.spaces import Box, Discrete 7 | from pathlib import Path 8 | from torch.autograd import Variable 9 | from tensorboardX import SummaryWriter 10 | from utils.make_env import make_env 11 | from utils.buffer import ReplayBuffer 12 | from utils.env_wrappers import SubprocVecEnv, DummyVecEnv 13 | from algorithms.maddpg import MADDPG 14 | 15 | import formation_gym 16 | 17 | USE_CUDA = False # torch.cuda.is_available() 18 | 19 | def make_parallel_env(env_id, n_rollout_threads, seed, agent_num): 20 | def get_env_fn(rank): 21 | def init_env(): 22 | env = formation_gym.make_env(env_id ,benchmark = False, num_agents = agent_num) 23 | env.seed(seed + rank * 1000) 24 | np.random.seed(seed + rank * 1000) 25 | return env 26 | return init_env 27 | if n_rollout_threads == 1: 28 | return DummyVecEnv([get_env_fn(0)]) 29 | else: 30 | return SubprocVecEnv([get_env_fn(i) for i in range(n_rollout_threads)]) 31 | 32 | def run(config): 33 | model_dir = Path('./models') / config.env_id / config.model_name 34 | if not model_dir.exists(): 35 | curr_run = 'run1' 36 | else: 37 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in 38 | model_dir.iterdir() if 39 | str(folder.name).startswith('run')] 40 | if len(exst_run_nums) == 0: 41 | curr_run = 'run1' 42 | else: 43 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 44 | run_dir = model_dir / curr_run 45 | log_dir = run_dir / 'logs' 46 | os.makedirs(log_dir) 47 | logger = SummaryWriter(str(log_dir)) 48 | 49 | torch.manual_seed(config.seed) 50 | np.random.seed(config.seed) 51 | if not USE_CUDA: 52 | torch.set_num_threads(config.n_training_threads) 53 | env 
= make_parallel_env(config.env_id, config.n_rollout_threads, config.seed, 54 | config.agent_num) 55 | maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg, 56 | adversary_alg=config.adversary_alg, 57 | tau=config.tau, 58 | lr=config.lr, 59 | hidden_dim=config.hidden_dim) 60 | replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents, 61 | [obsp.shape[0] for obsp in env.observation_space], 62 | [acsp.shape[0] if isinstance(acsp, Box) else acsp.n 63 | for acsp in env.action_space]) 64 | t = 0 65 | for ep_i in range(0, config.n_episodes, config.n_rollout_threads): 66 | print("Episodes %i-%i of %i" % (ep_i + 1, 67 | ep_i + 1 + config.n_rollout_threads, 68 | config.n_episodes)) 69 | obs = env.reset() 70 | # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor 71 | maddpg.prep_rollouts(device='cpu') # gpu 72 | 73 | explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps 74 | maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining) 75 | maddpg.reset_noise() 76 | 77 | for et_i in range(config.episode_length): 78 | # rearrange observations to be per agent, and convert to torch Variable 79 | torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), 80 | requires_grad=False) 81 | for i in range(maddpg.nagents)] 82 | # get actions as torch Variables 83 | torch_agent_actions = maddpg.step(torch_obs, explore=True) 84 | # convert actions to numpy arrays 85 | agent_actions = [ac.data.numpy() for ac in torch_agent_actions] 86 | # rearrange actions to be per environment 87 | actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] 88 | next_obs, rewards, dones, infos = env.step(actions) 89 | replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) 90 | obs = next_obs 91 | t += config.n_rollout_threads 92 | if (len(replay_buffer) >= config.batch_size and 93 | (t % config.steps_per_update) < config.n_rollout_threads): 94 | if USE_CUDA: 95 | maddpg.prep_training(device='gpu') 96 | else: 97 | maddpg.prep_training(device='cpu') 98 | for u_i in range(config.n_rollout_threads): 99 | for a_i in range(maddpg.nagents): 100 | sample = replay_buffer.sample(config.batch_size, 101 | to_gpu=USE_CUDA) 102 | maddpg.update(sample, a_i, logger=logger) 103 | maddpg.update_all_targets() 104 | maddpg.prep_rollouts(device='cpu') # cpu 105 | ep_rews = replay_buffer.get_average_rewards( 106 | config.episode_length * config.n_rollout_threads) 107 | for a_i, a_ep_rew in enumerate(ep_rews): 108 | logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i) 109 | 110 | if ep_i % config.save_interval < config.n_rollout_threads: 111 | os.makedirs(run_dir / 'incremental', exist_ok=True) 112 | maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) 113 | maddpg.save(run_dir / 'model.pt') 114 | 115 | maddpg.save(run_dir / 'model.pt') 116 | env.close() 117 | logger.export_scalars_to_json(str(log_dir / 'summary.json')) 118 | logger.close() 119 | 120 | 121 | if __name__ == '__main__': 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument("--env_id", default='simple_spread', type = str, help="Name of environment", ) 124 | parser.add_argument("--model_name", default='model', type = str, help="Name of directory to store " + 125 | "model/training contents") 126 | parser.add_argument("--seed", default=1, type=int, help="Random seed") 127 | parser.add_argument("--n_rollout_threads", default=1, type=int) # 1 128 | 
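The training loop above anneals the exploration noise linearly from init_noise_scale down to final_noise_scale over n_exploration_eps episodes via explr_pct_remaining. A worked check of that schedule using the defaults from this script's argument list (0.3 to 0.0 over 25000 episodes):

def noise_scale(ep_i, n_exploration_eps=25000, init=0.3, final=0.0):
    # linear decay, clamped at the final value once the exploration episodes are used up
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final + (init - final) * pct_remaining

print(noise_scale(0), noise_scale(12500), noise_scale(25000), noise_scale(40000))
# 0.3, 0.15, 0.0, 0.0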
parser.add_argument("--n_training_threads", default=6, type=int) # 6 129 | parser.add_argument("--buffer_length", default=int(5e5), type=int) 130 | parser.add_argument("--n_episodes", default=25000, type=int) 131 | parser.add_argument("--episode_length", default=30, type=int) 132 | parser.add_argument("--steps_per_update", default=120, type=int) 133 | parser.add_argument("--batch_size", 134 | default=256, type=int, 135 | help="Batch size for model training") 136 | parser.add_argument("--n_exploration_eps", default=25000, type=int) 137 | parser.add_argument("--init_noise_scale", default=0.3, type=float) 138 | parser.add_argument("--final_noise_scale", default=0.0, type=float) 139 | parser.add_argument("--save_interval", default=10000, type=int) 140 | parser.add_argument("--hidden_dim", default=64, type=int) 141 | parser.add_argument("--lr", default=0.01, type=float) 142 | parser.add_argument("--tau", default=0.01, type=float) 143 | parser.add_argument("--agent_alg", 144 | default="MADDPG", type=str, 145 | choices=['MADDPG', 'DDPG']) 146 | parser.add_argument("--adversary_alg", 147 | default="MADDPG", type=str, 148 | choices=['MADDPG', 'DDPG']) 149 | parser.add_argument("--discrete_action", action='store_true') 150 | parser.add_argument("--agent-num", type=int, default = 9) 151 | 152 | config = parser.parse_args() 153 | 154 | run(config) 155 | -------------------------------------------------------------------------------- /train/maddpg-v5/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_config(): 5 | parser = argparse.ArgumentParser( 6 | description="OFF-POLICY", formatter_class=argparse.RawDescriptionHelpFormatter) 7 | 8 | # prepare parameters 9 | parser.add_argument("--algorithm_name", type=str, default="rmaddpg", choices=[ 10 | "rmatd3", "rmaddpg", "rmasac", "qmix", "vdn", "matd3", "maddpg", "masac", "mqmix", "mvdn"]) 11 | parser.add_argument("--experiment_name", type=str, default="debug") 12 | parser.add_argument("--seed", type=int, default=1, 13 | help="Random seed for numpy/torch") 14 | parser.add_argument("--cuda", action='store_false', default=True) 15 | parser.add_argument("--cuda_deterministic", 16 | action='store_false', default=True) 17 | parser.add_argument('--n_training_threads', type=int, 18 | default=1, help="Number of torch threads for training") 19 | parser.add_argument('--n_rollout_threads', type=int, default=1, 20 | help="Number of parallel envs for training rollout") 21 | parser.add_argument('--n_eval_rollout_threads', type=int, default=1, 22 | help="Number of parallel envs for evaluating rollout") 23 | parser.add_argument('--num_env_steps', type=int, 24 | default=2000000, help="Number of env steps to train for") 25 | parser.add_argument('--use_wandb', action='store_true', default=False, 26 | help="Whether to use weights&biases, if not, use tensorboardX instead") 27 | parser.add_argument('--user_name', type=str, default="zoeyuchao") 28 | 29 | # env parameters 30 | parser.add_argument('--env_name', type=str, default="formation") 31 | parser.add_argument("--use_obs_instead_of_state", action='store_true', 32 | default=False, help="Whether to use global state or concatenated obs") 33 | 34 | # replay buffer parameters 35 | parser.add_argument('--episode_length', type=int, 36 | default=25, help="Max length for any episode") 37 | parser.add_argument('--buffer_size', type=int, default=5000, 38 | help="Max # of transitions that replay buffer can contain") 39 | 
parser.add_argument('--use_reward_normalization', 40 | default=True, help="Whether to normalize rewards in replay buffer") 41 | parser.add_argument('--use_popart', default=False, 42 | help="Whether to use popart to normalize the target loss") 43 | parser.add_argument('--popart_update_interval_step', type=int, default=2, 44 | help="After how many train steps popart should be updated") 45 | 46 | # prioritized experience replay 47 | parser.add_argument('--use_per', action='store_true', default=False, 48 | help="Whether to use prioritized experience replay") 49 | parser.add_argument('--per_nu', type=float, default=0.9, 50 | help="Weight of max TD error in formation of PER weights") 51 | parser.add_argument('--per_alpha', type=float, default=0.6, 52 | help="Alpha term for prioritized experience replay") 53 | parser.add_argument('--per_eps', type=float, default=1e-6, 54 | help="Eps term for prioritized experience replay") 55 | parser.add_argument('--per_beta_start', type=float, default=0.4, 56 | help="Starting beta term for prioritized experience replay") 57 | 58 | # network parameters 59 | parser.add_argument("--use_centralized_Q", action='store_false', 60 | default=True, help="Whether to use centralized Q function") 61 | parser.add_argument('--share_policy', action='store_false', 62 | default=True, help="Whether agents share the same policy") 63 | parser.add_argument('--hidden_size', type=int, default=64, 64 | help="Dimension of hidden layers for actor/critic networks") 65 | parser.add_argument('--layer_N', type=int, default=1, 66 | help="Number of layers for actor/critic networks") 67 | parser.add_argument('--use_ReLU', action='store_false', 68 | default=True, help="Whether to use ReLU") 69 | parser.add_argument('--use_feature_normalization', action='store_false', 70 | default=True, help="Whether to apply layernorm to the inputs") 71 | parser.add_argument('--use_orthogonal', action='store_false', default=True, 72 | help="Whether to use Orthogonal initialization for weights and 0 initialization for biases") 73 | parser.add_argument("--gain", type=float, default=0.01, 74 | help="The gain # of last action layer") 75 | parser.add_argument("--use_conv1d", action='store_true', 76 | default=False, help="Whether to use conv1d") 77 | parser.add_argument("--stacked_frames", type=int, default=1, 78 | help="Dimension of hidden layers for actor/critic networks") 79 | 80 | # recurrent parameters 81 | parser.add_argument('--prev_act_inp', action='store_true', default=False, 82 | help="Whether the actor input takes in previous actions as part of its input") 83 | parser.add_argument("--use_rnn_layer", action='store_false', 84 | default=True, help='Whether to use a recurrent policy') 85 | parser.add_argument("--use_naive_recurrent_policy", action='store_false', 86 | default=True, help='Whether to use a naive recurrent policy') 87 | # TODO now only 1 is support 88 | parser.add_argument("--recurrent_N", type=int, default=1) 89 | parser.add_argument('--data_chunk_length', type=int, default=80, 90 | help="Time length of chunks used to train via BPTT") 91 | parser.add_argument('--burn_in_time', type=int, default=0, 92 | help="Length of burn in time for RNN training, see R2D2 paper") 93 | 94 | # attn parameters 95 | parser.add_argument("--attn", action='store_true', default=False) 96 | parser.add_argument("--attn_N", type=int, default=1) 97 | parser.add_argument("--attn_size", type=int, default=64) 98 | parser.add_argument("--attn_heads", type=int, default=4) 99 | parser.add_argument("--dropout", type=float, 
default=0.0) 100 | parser.add_argument("--use_average_pool", 101 | action='store_false', default=True) 102 | parser.add_argument("--use_cat_self", action='store_false', default=True) 103 | 104 | # optimizer parameters 105 | parser.add_argument('--lr', type=float, default=7e-4, 106 | help="Learning rate for Adam") 107 | parser.add_argument("--opti_eps", type=float, default=1e-5, 108 | help='RMSprop optimizer epsilon (default: 1e-5)') 109 | parser.add_argument("--weight_decay", type=float, default=0) 110 | 111 | # algo common parameters 112 | parser.add_argument('--batch_size', type=int, default=32, 113 | help="Number of buffer transitions to train on at once") 114 | parser.add_argument('--gamma', type=float, default=0.99, 115 | help="Discount factor for env") 116 | parser.add_argument("--use_max_grad_norm", 117 | action='store_false', default=True) 118 | parser.add_argument("--max_grad_norm", type=float, default=10.0, 119 | help='max norm of gradients (default: 0.5)') 120 | parser.add_argument('--use_huber_loss', action='store_true', 121 | default=False, help="Whether to use Huber loss for critic update") 122 | parser.add_argument("--huber_delta", type=float, default=10.0) 123 | 124 | # soft update parameters 125 | parser.add_argument('--use_soft_update', action='store_false', 126 | default=True, help="Whether to use soft update") 127 | parser.add_argument('--tau', type=float, default=0.005, 128 | help="Polyak update rate") 129 | # hard update parameters 130 | parser.add_argument('--hard_update_interval_episode', type=int, default=200, 131 | help="After how many episodes the lagging target should be updated") 132 | parser.add_argument('--hard_update_interval', type=int, default=200, 133 | help="After how many timesteps the lagging target should be updated") 134 | # rmatd3 parameters 135 | parser.add_argument("--target_action_noise_std", default=0.2, help="Target action smoothing noise for matd3") 136 | # rmasac parameters 137 | parser.add_argument('--alpha', type=float, default=1.0, 138 | help="Initial temperature") 139 | parser.add_argument('--target_entropy_coef', type=float, 140 | default=0.5, help="Initial temperature") 141 | parser.add_argument('--automatic_entropy_tune', action='store_false', 142 | default=True, help="Whether use a centralized critic") 143 | # qmix parameters 144 | parser.add_argument('--use_double_q', action='store_false', 145 | default=True, help="Whether to use double q learning") 146 | parser.add_argument('--hypernet_layers', type=int, default=2, 147 | help="Number of layers for hypernetworks. 
Must be either 1 or 2") 148 | parser.add_argument('--mixer_hidden_dim', type=int, default=32, 149 | help="Dimension of hidden layer of mixing network") 150 | parser.add_argument('--hypernet_hidden_dim', type=int, default=64, 151 | help="Dimension of hidden layer of hypernetwork (only applicable if hypernet_layers == 2") 152 | 153 | # exploration parameters 154 | parser.add_argument('--num_random_episodes', type=int, default=5, 155 | help="Number of episodes to add to buffer with purely random actions") 156 | parser.add_argument('--epsilon_start', type=float, default=1.0, 157 | help="Starting value for epsilon, for eps-greedy exploration") 158 | parser.add_argument('--epsilon_finish', type=float, default=0.05, 159 | help="Ending value for epsilon, for eps-greedy exploration") 160 | parser.add_argument('--epsilon_anneal_time', type=int, default=50000, 161 | help="Number of episodes until epsilon reaches epsilon_finish") 162 | parser.add_argument('--act_noise_std', type=float, 163 | default=0.1, help="Action noise") 164 | 165 | # train parameters 166 | parser.add_argument('--actor_train_interval_step', type=int, default=1, 167 | help="After how many critic updates actor should be updated") 168 | parser.add_argument('--train_interval_episode', type=int, default=1, 169 | help="Number of env steps between updates to actor/critic") 170 | parser.add_argument('--train_interval', type=int, default=100, 171 | help="Number of episodes between updates to actor/critic") 172 | parser.add_argument("--use_value_active_masks", 173 | action='store_true', default=False) 174 | 175 | # eval parameters 176 | parser.add_argument('--use_eval', action='store_false', 177 | default=True, help="Whether to conduct the evaluation") 178 | parser.add_argument('--eval_interval', type=int, default=10000, 179 | help="After how many episodes the policy should be evaled") 180 | parser.add_argument('--num_eval_episodes', type=int, default=32, 181 | help="How many episodes to collect for each eval") 182 | 183 | # save parameters 184 | parser.add_argument('--save_interval', type=int, default=100000, 185 | help="After how many episodes of training the policy model should be saved") 186 | 187 | # log parameters 188 | parser.add_argument('--log_interval', type=int, default=1000, 189 | help="After how many episodes of training the policy model should be saved") 190 | 191 | # pretained parameters 192 | parser.add_argument("--model_dir", type=str, default=None) 193 | 194 | return parser 195 | -------------------------------------------------------------------------------- /formation_gym/inbox/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym import error 15 | 16 | try: 17 | import pyglet 18 | except ImportError as e: 19 | raise ImportError("HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 20 | 21 | try: 22 | from pyglet.gl import * 23 | except ImportError as e: 24 | raise ImportError("Error occured while running `from pyglet.gl import * HINT: make sure you have OpenGL install. 
On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 25 | 26 | import math 27 | import numpy as np 28 | 29 | RAD2DEG = 57.29577951308232 30 | 31 | def get_display(spec): 32 | """Convert a display specification (such as :0) into an actual Display 33 | object. 34 | 35 | Pyglet only supports multiple Displays on Linux. 36 | """ 37 | if spec is None: 38 | return None 39 | elif isinstance(spec, six.string_types): 40 | return pyglet.canvas.Display(spec) 41 | else: 42 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 43 | 44 | class Viewer(object): 45 | def __init__(self, width, height, display=None): 46 | display = get_display(display) 47 | 48 | self.width = width 49 | self.height = height 50 | 51 | self.window = pyglet.window.Window(width=width, height=height, display=display) 52 | self.window.on_close = self.window_closed_by_user 53 | self.geoms = [] 54 | self.onetime_geoms = [] 55 | self.transform = Transform() 56 | 57 | glEnable(GL_BLEND) 58 | # glEnable(GL_MULTISAMPLE) 59 | glEnable(GL_LINE_SMOOTH) 60 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 61 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 62 | glLineWidth(2.0) 63 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 64 | 65 | def close(self): 66 | self.window.close() 67 | 68 | def window_closed_by_user(self): 69 | self.close() 70 | 71 | def set_bounds(self, left, right, bottom, top): 72 | assert right > left and top > bottom 73 | scalex = self.width/(right-left) 74 | scaley = self.height/(top-bottom) 75 | self.transform = Transform( 76 | translation=(-left*scalex, -bottom*scaley), 77 | scale=(scalex, scaley)) 78 | 79 | def add_geom(self, geom): 80 | self.geoms.append(geom) 81 | 82 | def add_onetime(self, geom): 83 | self.onetime_geoms.append(geom) 84 | 85 | def render(self, return_rgb_array=False): 86 | glClearColor(1,1,1,1) 87 | self.window.clear() 88 | self.window.switch_to() 89 | self.window.dispatch_events() 90 | self.transform.enable() 91 | for geom in self.geoms: 92 | geom.render() 93 | for geom in self.onetime_geoms: 94 | geom.render() 95 | self.transform.disable() 96 | arr = None 97 | if return_rgb_array: 98 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 99 | image_data = buffer.get_image_data() 100 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 101 | # In https://github.com/openai/gym-http-api/issues/2, we 102 | # discovered that someone using Xmonad on Arch was having 103 | # a window of size 598 x 398, though a 600 x 400 window 104 | # was requested. (Guess Xmonad was preserving a pixel for 105 | # the boundary.) So we use the buffer height/width rather 106 | # than the requested one. 
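The next two lines turn the raw RGBA bytes read from the colour buffer into a conventional top-down RGB image: reshape to rows x cols x 4, flip vertically (OpenGL's origin is bottom-left), and drop the alpha channel. A numpy-only illustration with a tiny fake 2x2 buffer:

import numpy as np

h, w = 2, 2
flat = np.arange(h * w * 4, dtype=np.uint8)   # stand-in for the buffer bytes (RGBA)
img = flat.reshape(h, w, 4)                   # rows x cols x RGBA
img = img[::-1, :, 0:3]                       # flip vertically, keep only RGB
print(img.shape)                              # (2, 2, 3)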
107 | arr = arr.reshape(buffer.height, buffer.width, 4) 108 | arr = arr[::-1,:,0:3] 109 | self.window.flip() 110 | self.onetime_geoms = [] 111 | return arr 112 | 113 | # Convenience 114 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 115 | geom = make_circle(radius=radius, res=res, filled=filled) 116 | _add_attrs(geom, attrs) 117 | self.add_onetime(geom) 118 | return geom 119 | 120 | def draw_polygon(self, v, filled=True, **attrs): 121 | geom = make_polygon(v=v, filled=filled) 122 | _add_attrs(geom, attrs) 123 | self.add_onetime(geom) 124 | return geom 125 | 126 | def draw_polyline(self, v, **attrs): 127 | geom = make_polyline(v=v) 128 | _add_attrs(geom, attrs) 129 | self.add_onetime(geom) 130 | return geom 131 | 132 | def draw_line(self, start, end, **attrs): 133 | geom = Line(start, end) 134 | _add_attrs(geom, attrs) 135 | self.add_onetime(geom) 136 | return geom 137 | 138 | def get_array(self): 139 | self.window.flip() 140 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 141 | self.window.flip() 142 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 143 | arr = arr.reshape(self.height, self.width, 4) 144 | return arr[::-1,:,0:3] 145 | 146 | def _add_attrs(geom, attrs): 147 | if "color" in attrs: 148 | geom.set_color(*attrs["color"]) 149 | if "linewidth" in attrs: 150 | geom.set_linewidth(attrs["linewidth"]) 151 | 152 | class Geom(object): 153 | def __init__(self): 154 | self._color=Color((0, 0, 0, 1.0)) 155 | self.attrs = [self._color] 156 | def render(self): 157 | for attr in reversed(self.attrs): 158 | attr.enable() 159 | self.render1() 160 | for attr in self.attrs: 161 | attr.disable() 162 | def render1(self): 163 | raise NotImplementedError 164 | def add_attr(self, attr): 165 | self.attrs.append(attr) 166 | def set_color(self, r, g, b, alpha=1): 167 | self._color.vec4 = (r, g, b, alpha) 168 | 169 | class Attr(object): 170 | def enable(self): 171 | raise NotImplementedError 172 | def disable(self): 173 | pass 174 | 175 | class Transform(Attr): 176 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 177 | self.set_translation(*translation) 178 | self.set_rotation(rotation) 179 | self.set_scale(*scale) 180 | def enable(self): 181 | glPushMatrix() 182 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 183 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 184 | glScalef(self.scale[0], self.scale[1], 1) 185 | def disable(self): 186 | glPopMatrix() 187 | def set_translation(self, newx, newy): 188 | self.translation = (float(newx), float(newy)) 189 | def set_rotation(self, new): 190 | self.rotation = float(new) 191 | def set_scale(self, newx, newy): 192 | self.scale = (float(newx), float(newy)) 193 | 194 | class Color(Attr): 195 | def __init__(self, vec4): 196 | self.vec4 = vec4 197 | def enable(self): 198 | glColor4f(*self.vec4) 199 | 200 | class LineStyle(Attr): 201 | def __init__(self, style): 202 | self.style = style 203 | def enable(self): 204 | glEnable(GL_LINE_STIPPLE) 205 | glLineStipple(1, self.style) 206 | def disable(self): 207 | glDisable(GL_LINE_STIPPLE) 208 | 209 | class LineWidth(Attr): 210 | def __init__(self, stroke): 211 | self.stroke = stroke 212 | def enable(self): 213 | glLineWidth(self.stroke) 214 | 215 | class Point(Geom): 216 | def __init__(self): 217 | Geom.__init__(self) 218 | def render1(self): 219 | glBegin(GL_POINTS) # draw point 220 | glVertex3f(0.0, 0.0, 0.0) 221 | glEnd() 222 | 223 | class FilledPolygon(Geom): 224 | def __init__(self, v): 
225 | Geom.__init__(self) 226 | self.v = v 227 | def render1(self): 228 | if len(self.v) == 4 : glBegin(GL_QUADS) 229 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 230 | else: glBegin(GL_TRIANGLES) 231 | for p in self.v: 232 | glVertex3f(p[0], p[1],0) # draw each vertex 233 | glEnd() 234 | 235 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 236 | glColor4f(*color) 237 | glBegin(GL_LINE_LOOP) 238 | for p in self.v: 239 | glVertex3f(p[0], p[1],0) # draw each vertex 240 | glEnd() 241 | 242 | def make_circle(radius=10, res=30, filled=True): 243 | points = [] 244 | for i in range(res): 245 | ang = 2*math.pi*i / res 246 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 247 | if filled: 248 | return FilledPolygon(points) 249 | else: 250 | return PolyLine(points, True) 251 | 252 | def make_polygon(v, filled=True): 253 | if filled: return FilledPolygon(v) 254 | else: return PolyLine(v, True) 255 | 256 | def make_polyline(v): 257 | return PolyLine(v, False) 258 | 259 | def make_capsule(length, width): 260 | l, r, t, b = 0, length, width/2, -width/2 261 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 262 | circ0 = make_circle(width/2) 263 | circ1 = make_circle(width/2) 264 | circ1.add_attr(Transform(translation=(length, 0))) 265 | geom = Compound([box, circ0, circ1]) 266 | return geom 267 | 268 | class Compound(Geom): 269 | def __init__(self, gs): 270 | Geom.__init__(self) 271 | self.gs = gs 272 | for g in self.gs: 273 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 274 | def render1(self): 275 | for g in self.gs: 276 | g.render() 277 | 278 | class PolyLine(Geom): 279 | def __init__(self, v, close): 280 | Geom.__init__(self) 281 | self.v = v 282 | self.close = close 283 | self.linewidth = LineWidth(1) 284 | self.add_attr(self.linewidth) 285 | def render1(self): 286 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 287 | for p in self.v: 288 | glVertex3f(p[0], p[1],0) # draw each vertex 289 | glEnd() 290 | def set_linewidth(self, x): 291 | self.linewidth.stroke = x 292 | 293 | class Line(Geom): 294 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 295 | Geom.__init__(self) 296 | self.start = start 297 | self.end = end 298 | self.linewidth = LineWidth(1) 299 | self.add_attr(self.linewidth) 300 | 301 | def render1(self): 302 | glBegin(GL_LINES) 303 | glVertex2f(*self.start) 304 | glVertex2f(*self.end) 305 | glEnd() 306 | 307 | class Image(Geom): 308 | def __init__(self, fname, width, height): 309 | Geom.__init__(self) 310 | self.width = width 311 | self.height = height 312 | img = pyglet.image.load(fname) 313 | self.img = img 314 | self.flip = False 315 | def render1(self): 316 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 317 | 318 | # ================================================================ 319 | 320 | class SimpleImageViewer(object): 321 | def __init__(self, display=None): 322 | self.window = None 323 | self.isopen = False 324 | self.display = display 325 | def imshow(self, arr): 326 | if self.window is None: 327 | height, width, channels = arr.shape 328 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 329 | self.width = width 330 | self.height = height 331 | self.isopen = True 332 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 333 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 334 | 
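make_circle above approximates a circle by res points spaced evenly around the perimeter, which FilledPolygon then draws as a single GL_POLYGON. The vertex generation on its own, with a tiny res so the coordinates are easy to read (the helper name is illustrative):

import math

def circle_points(radius=10, res=30):
    # evenly spaced vertices on a circle of the given radius
    return [(math.cos(2 * math.pi * i / res) * radius,
             math.sin(2 * math.pi * i / res) * radius) for i in range(res)]

pts = circle_points(radius=1.0, res=4)
print([(round(x, 3), round(y, 3)) for x, y in pts])
# [(1.0, 0.0), (0.0, 1.0), (-1.0, 0.0), (-0.0, -1.0)]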
self.window.clear() 335 | self.window.switch_to() 336 | self.window.dispatch_events() 337 | image.blit(0,0) 338 | self.window.flip() 339 | def close(self): 340 | if self.isopen: 341 | self.window.close() 342 | self.isopen = False 343 | def __del__(self): 344 | self.close() -------------------------------------------------------------------------------- /formation_gym/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym import error 15 | 16 | try: 17 | import pyglet 18 | except ImportError as e: 19 | raise ImportError("HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 20 | 21 | try: 22 | from pyglet.gl import * 23 | except ImportError as e: 24 | raise ImportError("Error occurred while running `from pyglet.gl import *`. HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 25 | 26 | import math 27 | import numpy as np 28 | 29 | RAD2DEG = 57.29577951308232 30 | 31 | def get_display(spec): 32 | """Convert a display specification (such as :0) into an actual Display 33 | object. 34 | 35 | Pyglet only supports multiple Displays on Linux. 36 | """ 37 | if spec is None: 38 | return None 39 | elif isinstance(spec, six.string_types): 40 | return pyglet.canvas.Display(spec) 41 | else: 42 | raise error.Error('Invalid display specification: {}.
(Must be a string like :0 or None.)'.format(spec)) 43 | 44 | class Viewer(object): 45 | def __init__(self, width, height, display=None): 46 | display = get_display(display) 47 | 48 | self.width = width 49 | self.height = height 50 | 51 | self.window = pyglet.window.Window(width=width, height=height, display=display) 52 | self.window.on_close = self.window_closed_by_user 53 | self.geoms = [] 54 | self.onetime_geoms = [] 55 | self.transform = Transform() 56 | 57 | glEnable(GL_BLEND) 58 | # glEnable(GL_MULTISAMPLE) 59 | glEnable(GL_LINE_SMOOTH) 60 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 61 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 62 | glLineWidth(2.0) 63 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 64 | 65 | def close(self): 66 | self.window.close() 67 | 68 | def window_closed_by_user(self): 69 | self.close() 70 | 71 | def set_bounds(self, left, right, bottom, top): 72 | assert right > left and top > bottom 73 | scalex = self.width/(right-left) 74 | scaley = self.height/(top-bottom) 75 | self.transform = Transform( 76 | translation=(-left*scalex, -bottom*scaley), 77 | scale=(scalex, scaley)) 78 | 79 | def add_geom(self, geom): 80 | self.geoms.append(geom) 81 | 82 | def add_onetime(self, geom): 83 | self.onetime_geoms.append(geom) 84 | 85 | def render(self, return_rgb_array=False): 86 | glClearColor(1,1,1,1) 87 | self.window.clear() 88 | self.window.switch_to() 89 | self.window.dispatch_events() 90 | self.transform.enable() 91 | for geom in self.geoms: 92 | geom.render() 93 | for geom in self.onetime_geoms: 94 | geom.render() 95 | self.transform.disable() 96 | arr = None 97 | if return_rgb_array: 98 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 99 | image_data = buffer.get_image_data() 100 | arr = np.fromstring(image_data.get_data(), dtype=np.uint8, sep='') 101 | # In https://github.com/openai/gym-http-api/issues/2, we 102 | # discovered that someone using Xmonad on Arch was having 103 | # a window of size 598 x 398, though a 600 x 400 window 104 | # was requested. (Guess Xmonad was preserving a pixel for 105 | # the boundary.) So we use the buffer height/width rather 106 | # than the requested one. 
107 | arr = arr.reshape(buffer.height, buffer.width, 4) 108 | arr = arr[::-1,:,0:3] 109 | self.window.flip() 110 | self.onetime_geoms = [] 111 | return arr 112 | 113 | # Convenience 114 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 115 | geom = make_circle(radius=radius, res=res, filled=filled) 116 | _add_attrs(geom, attrs) 117 | self.add_onetime(geom) 118 | return geom 119 | 120 | def draw_polygon(self, v, filled=True, **attrs): 121 | geom = make_polygon(v=v, filled=filled) 122 | _add_attrs(geom, attrs) 123 | self.add_onetime(geom) 124 | return geom 125 | 126 | def draw_polyline(self, v, **attrs): 127 | geom = make_polyline(v=v) 128 | _add_attrs(geom, attrs) 129 | self.add_onetime(geom) 130 | return geom 131 | 132 | def draw_line(self, start, end, **attrs): 133 | geom = Line(start, end) 134 | _add_attrs(geom, attrs) 135 | self.add_onetime(geom) 136 | return geom 137 | 138 | def get_array(self): 139 | self.window.flip() 140 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 141 | self.window.flip() 142 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 143 | arr = arr.reshape(self.height, self.width, 4) 144 | return arr[::-1,:,0:3] 145 | 146 | def _add_attrs(geom, attrs): 147 | if "color" in attrs: 148 | geom.set_color(*attrs["color"]) 149 | if "linewidth" in attrs: 150 | geom.set_linewidth(attrs["linewidth"]) 151 | 152 | class Geom(object): 153 | def __init__(self): 154 | self._color=Color((0, 0, 0, 1.0)) 155 | self.attrs = [self._color] 156 | def render(self): 157 | for attr in reversed(self.attrs): 158 | attr.enable() 159 | self.render1() 160 | for attr in self.attrs: 161 | attr.disable() 162 | def render1(self): 163 | raise NotImplementedError 164 | def add_attr(self, attr): 165 | self.attrs.append(attr) 166 | def set_color(self, r, g, b, alpha=1): 167 | self._color.vec4 = (r, g, b, alpha) 168 | 169 | class Attr(object): 170 | def enable(self): 171 | raise NotImplementedError 172 | def disable(self): 173 | pass 174 | 175 | class Transform(Attr): 176 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 177 | self.set_translation(*translation) 178 | self.set_rotation(rotation) 179 | self.set_scale(*scale) 180 | def enable(self): 181 | glPushMatrix() 182 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 183 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 184 | glScalef(self.scale[0], self.scale[1], 1) 185 | def disable(self): 186 | glPopMatrix() 187 | def set_translation(self, newx, newy): 188 | self.translation = (float(newx), float(newy)) 189 | def set_rotation(self, new): 190 | self.rotation = float(new) 191 | def set_scale(self, newx, newy): 192 | self.scale = (float(newx), float(newy)) 193 | 194 | class Color(Attr): 195 | def __init__(self, vec4): 196 | self.vec4 = vec4 197 | def enable(self): 198 | glColor4f(*self.vec4) 199 | 200 | class LineStyle(Attr): 201 | def __init__(self, style): 202 | self.style = style 203 | def enable(self): 204 | glEnable(GL_LINE_STIPPLE) 205 | glLineStipple(1, self.style) 206 | def disable(self): 207 | glDisable(GL_LINE_STIPPLE) 208 | 209 | class LineWidth(Attr): 210 | def __init__(self, stroke): 211 | self.stroke = stroke 212 | def enable(self): 213 | glLineWidth(self.stroke) 214 | 215 | class Point(Geom): 216 | def __init__(self): 217 | Geom.__init__(self) 218 | def render1(self): 219 | glBegin(GL_POINTS) # draw point 220 | glVertex3f(0.0, 0.0, 0.0) 221 | glEnd() 222 | 223 | class FilledPolygon(Geom): 224 | def __init__(self, v): 
225 | Geom.__init__(self) 226 | self.v = v 227 | def render1(self): 228 | if len(self.v) == 4 : glBegin(GL_QUADS) 229 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 230 | else: glBegin(GL_TRIANGLES) 231 | for p in self.v: 232 | glVertex3f(p[0], p[1],0) # draw each vertex 233 | glEnd() 234 | 235 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 236 | glColor4f(*color) 237 | glBegin(GL_LINE_LOOP) 238 | for p in self.v: 239 | glVertex3f(p[0], p[1],0) # draw each vertex 240 | glEnd() 241 | 242 | def make_circle(radius=10, res=30, filled=True): 243 | points = [] 244 | for i in range(res): 245 | ang = 2*math.pi*i / res 246 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 247 | if filled: 248 | return FilledPolygon(points) 249 | else: 250 | return PolyLine(points, True) 251 | 252 | def make_polygon(v, filled=True): 253 | if filled: return FilledPolygon(v) 254 | else: return PolyLine(v, True) 255 | 256 | def make_polyline(v): 257 | return PolyLine(v, False) 258 | 259 | def make_capsule(length, width): 260 | l, r, t, b = 0, length, width/2, -width/2 261 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 262 | circ0 = make_circle(width/2) 263 | circ1 = make_circle(width/2) 264 | circ1.add_attr(Transform(translation=(length, 0))) 265 | geom = Compound([box, circ0, circ1]) 266 | return geom 267 | 268 | class Compound(Geom): 269 | def __init__(self, gs): 270 | Geom.__init__(self) 271 | self.gs = gs 272 | for g in self.gs: 273 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 274 | def render1(self): 275 | for g in self.gs: 276 | g.render() 277 | 278 | class PolyLine(Geom): 279 | def __init__(self, v, close): 280 | Geom.__init__(self) 281 | self.v = v 282 | self.close = close 283 | self.linewidth = LineWidth(1) 284 | self.add_attr(self.linewidth) 285 | def render1(self): 286 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 287 | for p in self.v: 288 | glVertex3f(p[0], p[1],0) # draw each vertex 289 | glEnd() 290 | def set_linewidth(self, x): 291 | self.linewidth.stroke = x 292 | 293 | class Line(Geom): 294 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 295 | Geom.__init__(self) 296 | self.start = start 297 | self.end = end 298 | self.linewidth = LineWidth(1) 299 | self.add_attr(self.linewidth) 300 | 301 | def render1(self): 302 | glBegin(GL_LINES) 303 | glVertex2f(*self.start) 304 | glVertex2f(*self.end) 305 | glEnd() 306 | 307 | class Image(Geom): 308 | def __init__(self, fname, width, height): 309 | Geom.__init__(self) 310 | self.width = width 311 | self.height = height 312 | img = pyglet.image.load(fname) 313 | self.img = img 314 | self.flip = False 315 | def render1(self): 316 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 317 | 318 | # ================================================================ 319 | 320 | class SimpleImageViewer(object): 321 | def __init__(self, display=None): 322 | self.window = None 323 | self.isopen = False 324 | self.display = display 325 | def imshow(self, arr): 326 | if self.window is None: 327 | height, width, channels = arr.shape 328 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 329 | self.width = width 330 | self.height = height 331 | self.isopen = True 332 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 333 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 334 | 
self.window.clear() 335 | self.window.switch_to() 336 | self.window.dispatch_events() 337 | image.blit(0,0) 338 | self.window.flip() 339 | def close(self): 340 | if self.isopen: 341 | self.window.close() 342 | self.isopen = False 343 | def __del__(self): 344 | self.close() 345 | -------------------------------------------------------------------------------- /train/maddpg-v4/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | from multiprocessing import Process, Pipe 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | from gym.spaces import Box, Discrete, Tuple 9 | from wrapper import DummyVecEnv, SubprocVecEnv 10 | import formation_gym 11 | 12 | def make_train_env(config): 13 | def get_env_fn(rank): 14 | def init_env(): 15 | if config['env_name'] == "MPE": 16 | env = formation_gym.make_env(config['scenario_name'], benchmark = False, num_agents = config['num_agents']) 17 | else: 18 | print("Can not support the " + 19 | config['env_name'] + "environment.") 20 | raise NotImplementedError 21 | env.seed(config['seed'] + rank * 1000) 22 | return env 23 | return init_env 24 | if config['n_rollout_threads'] == 1: 25 | return DummyVecEnv([get_env_fn(0)]) 26 | else: 27 | return SubprocVecEnv([get_env_fn(i) for i in range(config['n_rollout_threads'])]) 28 | 29 | def to_torch(input): 30 | return torch.from_numpy(input) if type(input) == np.ndarray else input 31 | 32 | def get_config(): 33 | with open(os.path.dirname(__file__)+"/parameters.yaml", "r") as stream: 34 | try: 35 | config = yaml.safe_load(stream) 36 | except yaml.YAMLError as exc: 37 | print(exc) 38 | return config 39 | 40 | def get_dim_from_space(space): 41 | if isinstance(space, Box): 42 | dim = space.shape[0] 43 | elif isinstance(space, Discrete): 44 | dim = space.n 45 | elif isinstance(space, Tuple): 46 | dim = sum([get_dim_from_space(sp) for sp in space]) 47 | elif "MultiDiscrete" in space.__class__.__name__: 48 | return (space.high - space.low) + 1 49 | elif isinstance(space, list): 50 | dim = space[0] 51 | else: 52 | raise Exception("Unrecognized space: ", type(space)) 53 | return dim 54 | 55 | def get_cent_act_dim(action_space): 56 | cent_act_dim = 0 57 | for space in action_space: 58 | dim = get_dim_from_space(space) 59 | if isinstance(dim, np.ndarray): 60 | cent_act_dim += int(sum(dim)) 61 | else: 62 | cent_act_dim += dim 63 | return cent_act_dim 64 | 65 | def get_state_dim(observation_dict, action_dict): 66 | combined_obs_dim = sum([get_dim_from_space(space) 67 | for space in observation_dict.values()]) 68 | combined_act_dim = 0 69 | for space in action_dict.values(): 70 | dim = get_dim_from_space(space) 71 | if isinstance(dim, np.ndarray): 72 | combined_act_dim += int(sum(dim)) 73 | else: 74 | combined_act_dim += dim 75 | return combined_obs_dim, combined_act_dim, combined_obs_dim+combined_act_dim 76 | 77 | class DecayThenFlatSchedule(): 78 | def __init__(self, 79 | start, 80 | finish, 81 | time_length, 82 | decay="exp"): 83 | 84 | self.start = start 85 | self.finish = finish 86 | self.time_length = time_length 87 | self.delta = (self.start - self.finish) / self.time_length 88 | self.decay = decay 89 | 90 | if self.decay in ["exp"]: 91 | self.exp_scaling = (-1) * self.time_length / \ 92 | np.log(self.finish) if self.finish > 0 else 1 93 | 94 | def eval(self, T): 95 | if self.decay in ["linear"]: 96 | return max(self.finish, self.start - self.delta * T) 97 | elif self.decay in ["exp"]: 98 | return min(self.start, max(self.finish, 
np.exp(- T / self.exp_scaling))) 99 | pass 100 | 101 | class ACTLayer(nn.Module): 102 | def __init__(self, config, act_dim): 103 | super(ACTLayer, self).__init__() 104 | 105 | self.multi_discrete = False 106 | init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][config['use_orthogonal']] 107 | def init_(m): 108 | return init(m, init_method, lambda x: nn.init.constant_(x, 0), config['gain']) 109 | 110 | if isinstance(act_dim, np.ndarray): 111 | # MultiDiscrete setting: have n Linear layers for each action 112 | self.multi_discrete = True 113 | self.action_outs = nn.ModuleList([init_(nn.Linear(config['hidden_size'], a_dim)) for a_dim in act_dim]) 114 | else: 115 | self.action_out = init_(nn.Linear(config['hidden_size'], act_dim)) 116 | 117 | def forward(self, x, no_sequence=False): 118 | 119 | if self.multi_discrete: 120 | act_outs = [] 121 | for a_out in self.action_outs: 122 | act_out = a_out(x) 123 | if no_sequence: 124 | # remove the dummy first time dimension if the input didn't have a time dimension 125 | act_out = act_out[0, :, :] 126 | act_outs.append(act_out) 127 | else: 128 | act_outs = self.action_out(x) 129 | if no_sequence: 130 | # remove the dummy first time dimension if the input didn't have a time dimension 131 | act_outs = act_outs[0, :, :] 132 | 133 | return act_outs 134 | class PopArt(nn.Module): 135 | """ Normalize a vector of observations - across the first norm_axes dimensions""" 136 | 137 | def __init__(self, input_shape, norm_axes=1, beta=0.99999, per_element_update=False, epsilon=1e-5, device=torch.device("cpu")): 138 | super(PopArt, self).__init__() 139 | 140 | self.input_shape = input_shape 141 | self.norm_axes = norm_axes 142 | self.epsilon = epsilon 143 | self.beta = beta 144 | self.per_element_update = per_element_update 145 | self.device = device 146 | self.tpdv = dict(dtype=torch.float32, device=device) 147 | 148 | self.running_mean = nn.Parameter(torch.zeros(input_shape, dtype=torch.float), requires_grad=False).to(self.device) 149 | self.running_mean_sq = nn.Parameter(torch.zeros(input_shape, dtype=torch.float), requires_grad=False).to(self.device) 150 | self.debiasing_term = nn.Parameter(torch.tensor(0.0, dtype=torch.float), requires_grad=False).to(self.device) 151 | 152 | def reset_parameters(self): 153 | self.running_mean.zero_() 154 | self.running_mean_sq.zero_() 155 | self.debiasing_term.zero_() 156 | 157 | def running_mean_var(self): 158 | debiased_mean = self.running_mean / self.debiasing_term.clamp(min=self.epsilon) 159 | debiased_mean_sq = self.running_mean_sq / self.debiasing_term.clamp(min=self.epsilon) 160 | debiased_var = (debiased_mean_sq - debiased_mean ** 2).clamp(min=1e-2) 161 | return debiased_mean, debiased_var 162 | 163 | def forward(self, input_vector, train=True): 164 | # Make sure input is float32 165 | input_vector = input_vector.to(**self.tpdv) 166 | 167 | if train: 168 | # Detach input before adding it to running means to avoid backpropping through it on 169 | # subsequent batches. 
170 | 171 | detached_input = input_vector.detach() 172 | batch_mean = detached_input.mean(dim=tuple(range(self.norm_axes))) 173 | batch_sq_mean = (detached_input ** 2).mean(dim=tuple(range(self.norm_axes))) 174 | if self.per_element_update: 175 | batch_size = np.prod(detached_input.size()[:self.norm_axes]) 176 | weight = self.beta ** batch_size 177 | else: 178 | weight = self.beta 179 | 180 | self.running_mean.mul_(weight).add_(batch_mean * (1.0 - weight)) 181 | self.running_mean_sq.mul_(weight).add_(batch_sq_mean * (1.0 - weight)) 182 | self.debiasing_term.mul_(weight).add_(1.0 * (1.0 - weight)) 183 | 184 | mean, var = self.running_mean_var() 185 | out = (input_vector - mean[(None,) * self.norm_axes]) / torch.sqrt(var)[(None,) * self.norm_axes] 186 | return out 187 | 188 | def denormalize(self, input_vector): 189 | """ Transform normalized data back into original distribution """ 190 | input_vector = input_vector.to(**self.tpdv) 191 | 192 | mean, var = self.running_mean_var() 193 | out = input_vector * torch.sqrt(var)[(None,) * self.norm_axes] + mean[(None,) * self.norm_axes] 194 | return out 195 | 196 | class MLPBase(nn.Module): 197 | def __init__(self, config, inputs_dim): 198 | super(MLPBase, self).__init__() 199 | self.config = config 200 | if self.config['use_feature_normalization']: 201 | self.feature_norm = nn.LayerNorm(inputs_dim) 202 | 203 | if self.config['use_conv1d']: 204 | self.conv = CONVLayer(self.config['stacked_frames'], self.config['hidden_size'], self.config['use_orthogonal'], self.config['use_ReLU']) 205 | random_x = torch.FloatTensor(1, self.config['stacked_frames'], self.config['inputs_dim']) 206 | random_out = self.conv(random_x) 207 | assert len(random_out.shape)==3 208 | inputs_dim = random_out.size(-1) * random_out.size(-2) 209 | 210 | self.mlp = MLPLayer(inputs_dim, self.config['hidden_size'], 211 | self.config['layer_N'], self.config['use_orthogonal'], self.config['use_ReLU']) 212 | 213 | def forward(self, x): 214 | if self.config['use_feature_normalization']: 215 | x = self.feature_norm(x) 216 | 217 | if self.config['use_conv1d']: 218 | batch_size = x.size(0) 219 | x = x.view(batch_size, self.config['stacked_frames'], -1) 220 | x = self.conv(x) 221 | x = x.view(batch_size, -1) 222 | 223 | x = self.mlp(x) 224 | 225 | return x 226 | 227 | class MLPLayer(nn.Module): 228 | def __init__(self, input_dim, hidden_size, layer_N, use_orthogonal, use_ReLU): 229 | super(MLPLayer, self).__init__() 230 | self._layer_N = layer_N 231 | 232 | active_func = [nn.Tanh(), nn.ReLU()][use_ReLU] 233 | init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal] 234 | gain = nn.init.calculate_gain(['tanh', 'relu'][use_ReLU]) 235 | 236 | def init_(m): 237 | return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain) 238 | 239 | self.fc1 = nn.Sequential( 240 | init_(nn.Linear(input_dim, hidden_size)), active_func, nn.LayerNorm(hidden_size)) 241 | self.fc_h = nn.Sequential(init_( 242 | nn.Linear(hidden_size, hidden_size)), active_func, nn.LayerNorm(hidden_size)) 243 | self.fc2 = get_clones(self.fc_h, self._layer_N) 244 | 245 | def forward(self, x): 246 | x = self.fc1(x) 247 | for i in range(self._layer_N): 248 | x = self.fc2[i](x) 249 | return x 250 | 251 | class CONVLayer(nn.Module): 252 | def __init__(self, input_dim, hidden_size, use_orthogonal, use_ReLU): 253 | super(CONVLayer, self).__init__() 254 | 255 | active_func = [nn.Tanh(), nn.ReLU()][use_ReLU] 256 | init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal] 257 | gain = 
nn.init.calculate_gain(['tanh', 'relu'][use_ReLU]) 258 | 259 | def init_(m): 260 | return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain) 261 | 262 | self.conv = nn.Sequential( 263 | init_(nn.Conv1d(in_channels=input_dim, out_channels=hidden_size//4, kernel_size=3, stride=2, padding=0)), active_func, #nn.BatchNorm1d(hidden_size//4), 264 | init_(nn.Conv1d(in_channels=hidden_size//4, out_channels=hidden_size//2, kernel_size=3, stride=1, padding=1)), active_func, #nn.BatchNorm1d(hidden_size//2), 265 | init_(nn.Conv1d(in_channels=hidden_size//2, out_channels=hidden_size, kernel_size=3, stride=1, padding=1)), active_func)#, nn.BatchNorm1d(hidden_size)) 266 | 267 | def forward(self, x): 268 | x = self.conv(x) 269 | return x 270 | 271 | def tile_images(img_nhwc): 272 | """ 273 | Tile N images into one big PxQ image 274 | (P,Q) are chosen to be as close as possible, and if N 275 | is square, then P=Q. 276 | input: img_nhwc, list or array of images, ndim=4 once turned into array 277 | n = batch index, h = height, w = width, c = channel 278 | returns: 279 | bigim_HWc, ndarray with ndim=3 280 | """ 281 | img_nhwc = np.asarray(img_nhwc) 282 | N, h, w, c = img_nhwc.shape 283 | H = int(np.ceil(np.sqrt(N))) 284 | W = int(np.ceil(float(N)/H)) 285 | img_nhwc = np.array( 286 | list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 287 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 288 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 289 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 290 | return img_Hh_Ww_c --------------------------------------------------------------------------------
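Note on /train/maddpg-v4/utils.py: ACTLayer, MLPLayer, and CONVLayer call an `init(...)` helper, and MLPLayer calls `get_clones(...)`, but neither function is defined or imported in the file as dumped above. The sketch below shows the conventional definitions of these two helpers used by comparable MAPPO/off-policy codebases; it is an assumption about what the missing helpers look like, not code taken from this repository.

import copy
import torch.nn as nn

def init(module, weight_init, bias_init, gain=1):
    # Hypothetical sketch (not from this repo): apply the chosen weight
    # initializer (nn.init.xavier_uniform_ or nn.init.orthogonal_) with the
    # given gain, zero-initialize the bias, and return the module so the
    # init_(...) wrappers in utils.py can be used inline.
    weight_init(module.weight.data, gain=gain)
    bias_init(module.bias.data)
    return module

def get_clones(module, N):
    # Hypothetical sketch (not from this repo): N independent deep copies of a
    # layer, as MLPLayer expects when it stacks layer_N hidden blocks.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

With helpers of this shape in scope, `init_(nn.Linear(config['hidden_size'], act_dim))` in ACTLayer resolves to a linear layer with orthogonal or Xavier weights and a zeroed bias, and `get_clones(self.fc_h, self._layer_N)` in MLPLayer yields the stack of hidden layers iterated over in its forward pass.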