├── train ├── maddpg-v3 │ ├── =2.0.0 │ ├── env │ │ ├── __init__.py │ │ ├── multiagent_particle_env.py │ │ └── wrapper.py │ └── main.py ├── maddpg-v2 │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── noise.py │ │ ├── networks.py │ │ ├── make_env.py │ │ ├── agents.py │ │ ├── misc.py │ │ ├── buffer.py │ │ └── env_wrappers.py │ ├── algorithms │ │ └── __init__.py │ ├── assets │ │ ├── predator_prey │ │ │ ├── 1.gif │ │ │ ├── 2.gif │ │ │ └── 3.gif │ │ ├── physical_deception │ │ │ ├── 1.gif │ │ │ ├── 2.gif │ │ │ └── 3.gif │ │ └── cooperative_communication │ │ │ ├── 1.gif │ │ │ ├── 2.gif │ │ │ └── 3.gif │ ├── evaluate.py │ └── main.py ├── maddpg-v1 │ ├── .gitignore │ ├── agent.py │ ├── main.py │ ├── maddpg │ │ ├── actor_critic.py │ │ └── maddpg.py │ ├── common │ │ ├── utils.py │ │ ├── replay_buffer.py │ │ └── arguments.py │ └── runner.py ├── ddpg │ ├── models │ │ └── dqn.pth │ └── test.py ├── mappo │ ├── train_formation.sh │ ├── inbox │ │ ├── train_formation.sh │ │ ├── render_formation.sh │ │ ├── render_formation.py │ │ └── train_formation.py │ └── train_formation.py ├── maddpg-v4 │ ├── parameters.yaml │ ├── train.py │ └── utils.py └── maddpg-v5 │ ├── render.py │ ├── train.py │ └── config.py ├── formation_gym ├── inbox │ ├── scenario.py │ ├── core.py │ └── rendering.py ├── scenario.py ├── policy.py ├── multi_discrete.py ├── envs │ ├── basic_formation_env.py │ ├── formation_hd_partial_range_env.py │ ├── formation_hd_partial_env.py │ ├── formation_hd_env.py │ └── formation_hd_obs_env.py ├── __init__.py └── rendering.py ├── setup.py ├── test.py ├── .gitignore └── README.md /train/maddpg-v3/=2.0.0: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v2/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /train/maddpg-v1/.gitignore: -------------------------------------------------------------------------------- 1 | model_1/ 2 | model_2/ 3 | __pychache__/ -------------------------------------------------------------------------------- /train/ddpg/models/dqn.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/ddpg/models/dqn.pth -------------------------------------------------------------------------------- /train/maddpg-v2/assets/predator_prey/1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/predator_prey/1.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/predator_prey/2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/predator_prey/2.gif -------------------------------------------------------------------------------- 
/train/maddpg-v2/assets/predator_prey/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/predator_prey/3.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/physical_deception/1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/physical_deception/1.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/physical_deception/2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/physical_deception/2.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/physical_deception/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/physical_deception/3.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/cooperative_communication/1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/cooperative_communication/1.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/cooperative_communication/2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/cooperative_communication/2.gif -------------------------------------------------------------------------------- /train/maddpg-v2/assets/cooperative_communication/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jc-bao/gym-formation/HEAD/train/maddpg-v2/assets/cooperative_communication/3.gif -------------------------------------------------------------------------------- /train/maddpg-v3/env/__init__.py: -------------------------------------------------------------------------------- 1 | from .multiagent_particle_env import RLlibMultiAgentParticleEnv as MultiAgentParticleEnv 2 | from .wrapper import FormationEnv 3 | 4 | __all__ = [ 5 | "MultiAgentParticleEnv", 6 | "FormationEnv" 7 | ] 8 | -------------------------------------------------------------------------------- /formation_gym/inbox/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | -------------------------------------------------------------------------------- /formation_gym/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # defines scenario upon which the world is built 4 | class BaseScenario(object): 5 | # create elements of the world 6 | def make_world(self): 7 | raise NotImplementedError() 8 | # create initial conditions of the world 
9 | def reset_world(self, world): 10 | raise NotImplementedError() 11 | def info(self, agent, world): 12 | return {} 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from pathlib import Path 3 | 4 | setup( 5 | name='formation_gym', 6 | author="Chaoyi Pan", 7 | author_email="pcy19@mails.tsinghua.edu.cn", 8 | version='0.0.1', 9 | description="An OpenAI Gym Env for Formation", 10 | long_description=Path("README.md").read_text(), 11 | classifiers=[ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: MIT License", 14 | "Operating System :: OS Independent", 15 | ], 16 | python_requires='>=3.6' 17 | ) 18 | -------------------------------------------------------------------------------- /train/mappo/train_formation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | env="MPE" 3 | scenario="formation_hd_env" # simple_speaker_listener # simple_reference 4 | num_agents=3 5 | algo="rmappo" 6 | exp="check" 7 | seed_max=1 8 | 9 | echo "env is ${env}, scenario is ${scenario}, algo is ${algo}, exp is ${exp}, max seed is ${seed_max}" 10 | for seed in `seq ${seed_max}`; 11 | do 12 | echo "seed is ${seed}:" 13 | python train_formation.py --use_valuenorm --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 128 --num_mini_batch 1 --episode_length 25 --num_env_steps 20000000 --ppo_epoch 10 --use_ReLU --gain 0.01 --lr 7e-4 --critic_lr 7e-4 --wandb_name "jc-bao" --user_name "jc-bao" 14 | done -------------------------------------------------------------------------------- /train/maddpg-v2/utils/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | # from https://github.com/songrotek/DDPG/blob/master/ou_noise.py 5 | class OUNoise: 6 | def __init__(self, action_dimension, scale=0.1, mu=0, theta=0.15, sigma=0.2): 7 | self.action_dimension = action_dimension 8 | self.scale = scale 9 | self.mu = mu 10 | self.theta = theta 11 | self.sigma = sigma 12 | self.state = np.ones(self.action_dimension) * self.mu 13 | self.reset() 14 | 15 | def reset(self): 16 | self.state = np.ones(self.action_dimension) * self.mu 17 | 18 | def noise(self): 19 | x = self.state 20 | dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x)) 21 | self.state = x + dx 22 | return self.state * self.scale 23 | -------------------------------------------------------------------------------- /train/mappo/inbox/train_formation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | env="MPE" 3 | scenario="formation_hd_env" # simple_speaker_listener # simple_reference 4 | num_agents=3 5 | algo="rmappo" 6 | exp="check" 7 | seed_max=1 8 | 9 | echo "env is ${env}, scenario is ${scenario}, algo is ${algo}, exp is ${exp}, max seed is ${seed_max}" 10 | for seed in `seq ${seed_max}`; 11 | do 12 | echo "seed is ${seed}:" 13 | python train_formation.py --use_valuenorm --use_popart --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --num_landmarks ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 128 --num_mini_batch 1 --episode_length 25 --num_env_steps 20000000 --ppo_epoch 10
--use_ReLU --gain 0.01 --lr 7e-4 --critic_lr 7e-4 --user_name "chaoyi" 14 | done -------------------------------------------------------------------------------- /train/mappo/inbox/render_formation.sh: -------------------------------------------------------------------------------- 1 | env="MPE" 2 | scenario="formation_hd_env" 3 | num_agents=3 4 | algo="rmappo" 5 | exp="render" 6 | seed_max=1 7 | 8 | echo "env is ${env}" 9 | for seed in `seq ${seed_max}` 10 | do 11 | # CUDA_VISIBLE_DEVICES=1 python render_formation.py --save_gifs --share_policy --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 1 --use_render --episode_length 25 --render_episodes 5 --model_dir "./results/MPE/formation_hd_env/rmappo/check/run15/models" 12 | python render_formation.py --save_gifs --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --seed ${seed} --n_training_threads 1 --n_rollout_threads 1 --use_render --episode_length 50 --render_episodes 5 --model_dir "/Users/reedpan/Desktop/Research/gym_formation/train/mappo/results/MPE/formation_hd_env/rmappo/hd_3/run1/models" --gif_dir './results/gif' 13 | done -------------------------------------------------------------------------------- /train/maddpg-v1/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import os 4 | from maddpg.maddpg import MADDPG 5 | 6 | 7 | class Agent: 8 | def __init__(self, agent_id, args): 9 | self.args = args 10 | self.agent_id = agent_id 11 | self.policy = MADDPG(args, agent_id) 12 | 13 | def select_action(self, o, noise_rate, epsilon): 14 | if np.random.uniform() < epsilon: 15 | u = np.random.uniform(-self.args.high_action, self.args.high_action, self.args.action_shape[self.agent_id]) 16 | else: 17 | inputs = torch.tensor(o, dtype=torch.float32).unsqueeze(0) 18 | pi = self.policy.actor_network(inputs).squeeze(0) 19 | u = pi.cpu().numpy() 20 | noise = noise_rate * self.args.high_action * np.random.randn(*u.shape) # gaussian noise 21 | u += noise 22 | u = np.clip(u, -self.args.high_action, self.args.high_action) 23 | return u.copy() 24 | 25 | def learn(self, transitions, other_agents): 26 | self.policy.train(transitions, other_agents) 27 | 28 | -------------------------------------------------------------------------------- /train/maddpg-v1/main.py: -------------------------------------------------------------------------------- 1 | from runner import Runner 2 | from common.arguments import get_args 3 | from common.utils import make_env 4 | import numpy as np 5 | import random 6 | import torch 7 | import formation_gym 8 | 9 | ''' 10 | action = [0.1, 0.2, 0.4, 0.1, 0.2] 11 | ''' 12 | 13 | if __name__ == '__main__': 14 | # get the params 15 | args = get_args() 16 | env = formation_gym.make_env(args.scenario_name, benchmark = False, num_agents = args.num_agents) 17 | args.n_agents = args.num_agents 18 | args.n_players = 0 19 | args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)] # 每一维代表该agent的obs维度 20 | action_shape = [] 21 | for content in env.action_space: 22 | action_shape.append(content.shape[0]) 23 | args.action_shape = action_shape[:args.n_agents] # 每一维代表该agent的act维度 24 | args.high_action = 1 25 | args.low_action = -1 26 | runner = Runner(args, env) 27 | if args.evaluate: 28 | returns = runner.evaluate(True) 29 | print('Average 
returns is', returns) 30 | else: 31 | runner.run() -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | 4 | import formation_gym 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser(description=None) 8 | parser.add_argument('-s', '--scenario', default='formation_hd_env', help='Path of the scenario Python script.') 9 | parser.add_argument('-n', '--num-agents', type=int, default=3, help='Number of agents') 10 | parser.add_argument('-r', '--random', action='store_true', help='If use random policy.') 11 | parser.add_argument('--num-layer', type=int, default = 1, help = 'use hierachy policy to control') 12 | args = parser.parse_args() 13 | 14 | env = formation_gym.make_env(args.scenario, benchmark=False, num_agents = args.num_agents**args.num_layer) 15 | obs_n = env.reset() 16 | total_num_agents = args.num_agents**args.num_layer 17 | while True: 18 | # random policy 19 | if args.random: 20 | act_n = [space.sample() for space in env.action_space] 21 | # demo policy 22 | else: 23 | act_n = formation_gym.get_action_BFS(formation_gym.ezpolicy, obs_n, args.num_agents) 24 | # step environment 25 | obs_n, reward_n, done_n, _ = env.step(act_n) 26 | if np.all(done_n): 27 | obs_n = env.reset() 28 | # render all agent views 29 | env.render() -------------------------------------------------------------------------------- /train/maddpg-v1/maddpg/actor_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # define the actor network 7 | class Actor(nn.Module): 8 | def __init__(self, args, agent_id): 9 | super(Actor, self).__init__() 10 | self.max_action = args.high_action 11 | self.fc1 = nn.Linear(args.obs_shape[agent_id], 64) 12 | self.fc2 = nn.Linear(64, 64) 13 | self.fc3 = nn.Linear(64, 64) 14 | self.action_out = nn.Linear(64, args.action_shape[agent_id]) 15 | 16 | def forward(self, x): 17 | x = F.relu(self.fc1(x)) 18 | x = F.relu(self.fc2(x)) 19 | x = F.relu(self.fc3(x)) 20 | actions = self.max_action * torch.tanh(self.action_out(x)) 21 | return actions 22 | 23 | 24 | class Critic(nn.Module): 25 | def __init__(self, args): 26 | super(Critic, self).__init__() 27 | self.max_action = args.high_action 28 | self.fc1 = nn.Linear(sum(args.obs_shape) + sum(args.action_shape), 64) 29 | self.fc2 = nn.Linear(64, 64) 30 | self.fc3 = nn.Linear(64, 64) 31 | self.q_out = nn.Linear(64, 1) 32 | 33 | def forward(self, state, action): 34 | state = torch.cat(state, dim=1) 35 | for i in range(len(action)): 36 | action[i] /= self.max_action 37 | action = torch.cat(action, dim=1) 38 | x = torch.cat([state, action], dim=1) 39 | x = F.relu(self.fc1(x)) 40 | x = F.relu(self.fc2(x)) 41 | x = F.relu(self.fc3(x)) 42 | q_value = self.q_out(x) 43 | return q_value 44 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/networks.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | class MLPNetwork(nn.Module): 5 | """ 6 | MLP network (can be used as value or policy) 7 | """ 8 | def __init__(self, input_dim, out_dim, hidden_dim=64, nonlin=F.relu, 9 | constrain_out=False, norm_in=True, discrete_action=True): 10 | """ 11 | Inputs: 12 | input_dim (int): Number of dimensions in 
input 13 | out_dim (int): Number of dimensions in output 14 | hidden_dim (int): Number of hidden dimensions 15 | nonlin (PyTorch function): Nonlinearity to apply to hidden layers 16 | """ 17 | super(MLPNetwork, self).__init__() 18 | 19 | if norm_in: # normalize inputs 20 | self.in_fn = nn.BatchNorm1d(input_dim) 21 | self.in_fn.weight.data.fill_(1) 22 | self.in_fn.bias.data.fill_(0) 23 | else: 24 | self.in_fn = lambda x: x 25 | self.fc1 = nn.Linear(input_dim, hidden_dim) 26 | self.fc2 = nn.Linear(hidden_dim, hidden_dim) 27 | self.fc3 = nn.Linear(hidden_dim, out_dim) 28 | self.nonlin = nonlin 29 | if constrain_out and not discrete_action: 30 | # initialize small to prevent saturation 31 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 32 | self.out_fn = F.tanh 33 | else: # logits for discrete action (will softmax later) 34 | self.out_fn = lambda x: x 35 | 36 | def forward(self, X): 37 | """ 38 | Inputs: 39 | X (PyTorch Matrix): Batch of observations 40 | Outputs: 41 | out (PyTorch Matrix): Output of network (actions, values, etc) 42 | """ 43 | h1 = self.nonlin(self.fc1(self.in_fn(X))) 44 | h2 = self.nonlin(self.fc2(h1)) 45 | out = self.out_fn(self.fc3(h2)) 46 | return out -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | env.bak/ 86 | venv.bak/ 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # mkdocs documentation 96 | /site 97 | 98 | # mypy 99 | .mypy_cache/ 100 | 101 | # DS Store 102 | .DS_Store 103 | 104 | # tensorboard 105 | /runs 106 | 107 | # debug folder 108 | /debug 109 | 110 | # training folder 111 | results/ 112 | ray_results/ 113 | logs/ 114 | log/ 115 | formation_gym.egg-info/ -------------------------------------------------------------------------------- /train/maddpg-v1/common/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import inspect 3 | import functools 4 | 5 | 6 | def store_args(method): 7 | """Stores provided method args as instance attributes. 8 | """ 9 | argspec = inspect.getfullargspec(method) 10 | defaults = {} 11 | if argspec.defaults is not None: 12 | defaults = dict( 13 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 14 | if argspec.kwonlydefaults is not None: 15 | defaults.update(argspec.kwonlydefaults) 16 | arg_names = argspec.args[1:] 17 | 18 | @functools.wraps(method) 19 | def wrapper(*positional_args, **keyword_args): 20 | self = positional_args[0] 21 | # Get default arg values 22 | args = defaults.copy() 23 | # Add provided arg values 24 | for name, value in zip(arg_names, positional_args[1:]): 25 | args[name] = value 26 | args.update(keyword_args) 27 | self.__dict__.update(args) 28 | return method(*positional_args, **keyword_args) 29 | 30 | return wrapper 31 | 32 | 33 | def make_env(args): 34 | from multiagent.environment import MultiAgentEnv 35 | import multiagent.scenarios as scenarios 36 | 37 | # load scenario from script 38 | scenario = scenarios.load(args.scenario_name + ".py").Scenario() 39 | 40 | # create world 41 | world = scenario.make_world() 42 | # create multiagent environment 43 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation) 44 | # env = MultiAgentEnv(world) 45 | args.n_players = env.n # 包含敌人的所有玩家个数 46 | args.n_agents = env.n - args.num_adversaries # 需要操控的玩家个数,虽然敌人也可以控制,但是双方都学习的话需要不同的算法 47 | args.obs_shape = [env.observation_space[i].shape[0] for i in range(args.n_agents)] # 每一维代表该agent的obs维度 48 | action_shape = [] 49 | for content in env.action_space: 50 | action_shape.append(content.n) 51 | args.action_shape = action_shape[:args.n_agents] # 每一维代表该agent的act维度 52 | args.high_action = 1 53 | args.low_action = -1 54 | return env, args 55 | -------------------------------------------------------------------------------- /formation_gym/policy.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pyglet.window import key 3 | 4 | # individual agent policy 5 | class Policy(object): 6 | def __init__(self): 7 | pass 8 | def action(self, obs): 9 | raise NotImplementedError() 10 | 11 | # interactive policy based on keyboard input 12 | # hard-coded to deal only with movement, not communication 13 | class InteractivePolicy(Policy): 14 | def __init__(self, env, agent_index): 15 | super(InteractivePolicy, self).__init__() 16 | self.env = env 17 | # hard-coded keyboard events 18 | self.move = [False for i in range(4)] 19 | self.comm = [False for i in range(env.world.dim_c)] 20 | # register keyboard events with this environment's window 21 | env.viewers[agent_index].window.on_key_press = self.key_press 22 | env.viewers[agent_index].window.on_key_release = self.key_release 23 | 24 | def action(self, obs): 25 | # ignore observation and just act based on keyboard events 26 | if self.env.discrete_action_input: 27 | u = 0 28 | if self.move[0]: u = 1 29 | if self.move[1]: u = 2 30 | if self.move[2]: u = 4 31 | if self.move[3]: u = 3 32 | else: 33 | u = np.zeros(5) # 5-d because of no-move action 34 | if self.move[0]: u[1] += 1.0 35 | if self.move[1]: u[2] += 1.0 36 | if self.move[3]: u[3] += 1.0 37 | if self.move[2]: u[4] += 1.0 38 | if True not in self.move: 39 | u[0] += 1.0 40 | return np.concatenate([u, np.zeros(self.env.world.dim_c)]) 41 | 42 | # keyboard event callbacks 43 | def key_press(self, k, mod): 44 | if k==key.LEFT: self.move[0] = True 45 | if k==key.RIGHT: self.move[1] = True 46 | if k==key.UP: self.move[2] = True 47 | if k==key.DOWN: self.move[3] = True 48 | def key_release(self, k, mod): 49 | if k==key.LEFT: self.move[0] = False 50 | if k==key.RIGHT: self.move[1] = False 51 | if k==key.UP: self.move[2] = False 52 | if k==key.DOWN: self.move[3] = False 53 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/make_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for creating a multiagent environment with one of the scenarios listed 3 | in ./scenarios/. 4 | Can be called by using, for example: 5 | env = make_env('simple_speaker_listener') 6 | After producing the env object, can be used similarly to an OpenAI gym 7 | environment. 8 | 9 | A policy using this environment must output actions in the form of a list 10 | for all agents. Each element of the list should be a numpy array, 11 | of size (env.world.dim_p + env.world.dim_c, 1). Physical actions precede 12 | communication actions in this array. See environment.py for more details. 13 | """ 14 | 15 | def make_env(scenario_name, benchmark=False, discrete_action=False): 16 | ''' 17 | Creates a MultiAgentEnv object as env. This can be used similar to a gym 18 | environment by calling env.reset() and env.step(). 19 | Use env.render() to view the environment on the screen. 
20 | 21 | Input: 22 | scenario_name : name of the scenario from ./scenarios/ to be Returns 23 | (without the .py extension) 24 | benchmark : whether you want to produce benchmarking data 25 | (usually only done during evaluation) 26 | 27 | Some useful env properties (see environment.py): 28 | .observation_space : Returns the observation space for each agent 29 | .action_space : Returns the action space for each agent 30 | .n : Returns the number of Agents 31 | ''' 32 | from multiagent.environment import MultiAgentEnv 33 | import multiagent.scenarios as scenarios 34 | 35 | # load scenario from script 36 | scenario = scenarios.load(scenario_name + ".py").Scenario() 37 | # create world 38 | world = scenario.make_world() 39 | # create multiagent environment 40 | if benchmark: 41 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, 42 | scenario.observation, scenario.benchmark_data, 43 | discrete_action=discrete_action) 44 | else: 45 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, 46 | scenario.observation, 47 | discrete_action=discrete_action) 48 | return env 49 | -------------------------------------------------------------------------------- /train/maddpg-v1/common/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import numpy as np 3 | 4 | 5 | class Buffer: 6 | def __init__(self, args): 7 | self.size = args.buffer_size 8 | self.args = args 9 | # memory management 10 | self.current_size = 0 11 | # create the buffer to store info 12 | self.buffer = dict() 13 | for i in range(self.args.n_agents): 14 | self.buffer['o_%d' % i] = np.empty([self.size, self.args.obs_shape[i]]) 15 | self.buffer['u_%d' % i] = np.empty([self.size, self.args.action_shape[i]]) 16 | self.buffer['r_%d' % i] = np.empty([self.size]) 17 | self.buffer['o_next_%d' % i] = np.empty([self.size, self.args.obs_shape[i]]) 18 | # thread lock 19 | self.lock = threading.Lock() 20 | 21 | # store the episode 22 | def store_episode(self, o, u, r, o_next): 23 | idxs = self._get_storage_idx(inc=1) # 以transition的形式存,每次只存一条经验 24 | for i in range(self.args.n_agents): 25 | with self.lock: 26 | self.buffer['o_%d' % i][idxs] = o[i] 27 | self.buffer['u_%d' % i][idxs] = u[i] 28 | if isinstance(r[i], list): self.buffer['r_%d' % i][idxs] = r[i][0] 29 | else: self.buffer['r_%d' % i][idxs] = r[i] 30 | self.buffer['o_next_%d' % i][idxs] = o_next[i] 31 | 32 | # sample the data from the replay buffer 33 | def sample(self, batch_size): 34 | temp_buffer = {} 35 | idx = np.random.randint(0, self.current_size, batch_size) 36 | for key in self.buffer.keys(): 37 | temp_buffer[key] = self.buffer[key][idx] 38 | return temp_buffer 39 | 40 | def _get_storage_idx(self, inc=None): 41 | inc = inc or 1 42 | if self.current_size+inc <= self.size: 43 | idx = np.arange(self.current_size, self.current_size+inc) 44 | elif self.current_size < self.size: 45 | overflow = inc - (self.size - self.current_size) 46 | idx_a = np.arange(self.current_size, self.size) 47 | idx_b = np.random.randint(0, self.current_size, overflow) 48 | idx = np.concatenate([idx_a, idx_b]) 49 | else: 50 | idx = np.random.randint(0, self.size, inc) 51 | self.current_size = min(self.size, self.current_size+inc) 52 | if inc == 1: 53 | idx = idx[0] 54 | return idx 55 | -------------------------------------------------------------------------------- /formation_gym/multi_discrete.py: -------------------------------------------------------------------------------- 1 | # An old version of OpenAI Gym's 
multi_discrete.py. (Was getting affected by Gym updates) 2 | # (https://github.com/openai/gym/blob/1fb81d4e3fb780ccf77fec731287ba07da35eb84/gym/spaces/multi_discrete.py) 3 | 4 | import numpy as np 5 | 6 | import gym 7 | # from gym.spaces import prng 8 | 9 | class MultiDiscrete(gym.Space): 10 | """ 11 | - The multi-discrete action space consists of a series of discrete action spaces with different parameters 12 | - It can be adapted to both a Discrete action space or a continuous (Box) action space 13 | - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space 14 | - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space 15 | where the discrete action space can take any integers from `min` to `max` (both inclusive) 16 | Note: A value of 0 always need to represent the NOOP action. 17 | e.g. Nintendo Game Controller 18 | - Can be conceptualized as 3 discrete action spaces: 19 | 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 20 | 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 21 | 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 22 | - Can be initialized as 23 | MultiDiscrete([ [0,4], [0,1], [0,1] ]) 24 | """ 25 | def __init__(self, array_of_param_array): 26 | self.low = np.array([x[0] for x in array_of_param_array]) 27 | self.high = np.array([x[1] for x in array_of_param_array]) 28 | self.num_discrete_space = self.low.shape[0] 29 | 30 | def sample(self): 31 | """ Returns a array with one sample from each discrete action space """ 32 | # For each row: round(random .* (max - min) + min, 0) 33 | np_random = np.random.RandomState() 34 | random_array = np_random.rand(self.num_discrete_space) 35 | return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] 36 | def contains(self, x): 37 | return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() 38 | 39 | @property 40 | def shape(self): 41 | return self.num_discrete_space 42 | def __repr__(self): 43 | return "MultiDiscrete" + str(self.num_discrete_space) 44 | def __eq__(self, other): 45 | return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) -------------------------------------------------------------------------------- /train/maddpg-v1/common/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | """ 4 | Here are the param for the training 5 | 6 | """ 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments") 11 | # Environment 12 | parser.add_argument("--scenario-name", type=str, default="formation_hd_env", help="name of the scenario script") 13 | parser.add_argument("--max-episode-len", type=int, default=30, help="maximum episode length") 14 | parser.add_argument("--time-steps", type=int, default=1000000, help="number of time steps") # 2000000 15 | # 一个地图最多env.n个agents,用户可以定义min(env.n,num-adversaries)个敌人,剩下的是好的agent 16 | parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries") 17 | parser.add_argument("--num-agents", type=int, default=3, help="number of agents") 18 | # Core training parameters 19 | parser.add_argument("--lr-actor", type=float, default=1e-4, help="learning rate of actor") 20 | parser.add_argument("--lr-critic", type=float, 
default=1e-4, help="learning rate of critic") 21 | parser.add_argument("--epsilon", type=float, default=0.1, help="epsilon greedy") 22 | parser.add_argument("--noise_rate", type=float, default=0.25, help="noise rate for sampling from a standard normal distribution ") 23 | parser.add_argument("--gamma", type=float, default=0.95, help="discount factor") 24 | parser.add_argument("--tau", type=float, default=0.01, help="parameter for updating the target network") 25 | parser.add_argument("--buffer-size", type=int, default=int(5e5), help="number of transitions can be stored in buffer") 26 | parser.add_argument("--batch-size", type=int, default=256, help="number of episodes to optimize at the same time") 27 | # Checkpointing 28 | parser.add_argument("--save-dir", type=str, default="./model_3", help="directory in which training state and model should be saved") 29 | parser.add_argument("--save-rate", type=int, default=10000, help="save model once every time this many episodes are completed") 30 | parser.add_argument("--model-idx", type=int, default=1, help="The index of saved model to load") 31 | 32 | # Evaluate 33 | parser.add_argument("--evaluate-episodes", type=int, default=10, help="number of episodes for evaluating") 34 | parser.add_argument("--evaluate-episode-len", type=int, default=30, help="length of episodes for evaluating") 35 | parser.add_argument("--evaluate", type=bool, default=False, help="whether to evaluate the model") 36 | parser.add_argument("--evaluate-rate", type=int, default=10000, help="how often to evaluate model") 37 | args = parser.parse_args() 38 | 39 | return args 40 | -------------------------------------------------------------------------------- /train/maddpg-v4/parameters.yaml: -------------------------------------------------------------------------------- 1 | # environment 2 | env_name: MPE 3 | scenario_name: formation_hd_env 4 | num_agents: 4 5 | env_steps: 1e7 6 | episode_length: 5000 # how many steps to evaluate and save 7 | train_interval: 500000 # how many steps to train 8 | 9 | # experinment 10 | experiment_index: 1 11 | seed: 1 12 | 13 | # policy 14 | share_policy: True 15 | 16 | # algorithm 17 | algorithm_name: maddpg 18 | gamma: 0.95 19 | use_same_share_obs: True # whether all agents share the same centralized observation[TBD] 20 | use_avail_acts: False # whether to store what actions are available. [TBD] 21 | use_reward_normalization: True # Whether to normalize rewards in replay buffer [TBD] 22 | use_popart: False # if use popart to handle multi-tasks 23 | popart_update_interval_step: 2 # after how many train steps popart should be updated 24 | use_value_active_masks: False # [TBD] [Q] 25 | use_huber_loss: False # Whether to use Huber loss for critic update to improve robustness [TBD] 26 | huber_delta: 10.0 27 | actor_update_interval: 1 # number of critic updates to perform between every update to the actor. 
[TBD] 28 | tau: 0.005 # Polyak update rate 29 | lr: 5e-4 # learning rate 30 | opti_eps: 1e-5 # RMSprop optimizer epsilon [Q] 31 | weight_decay: 0 # [Q] 32 | target_noise: False 33 | use_orthogonal: True 34 | use_feature_normalization: True # Whether to apply layernorm to the inputs 35 | use_ReLU: True 36 | use_conv1d: False # Whether to use conv1d 37 | stacked_frames: 1 # Dimension of hidden layers for actor/critic networks 38 | layer_N: 1 # Number of layers for actor/critic networks 39 | hidden_size: 64 # Dimension of hidden layers for actor/critic networks 40 | gain: 0.01 # gain for action last layer [Q] 41 | hidden_size: 64 #"Dimension of hidden layers for actor/critic networks") 42 | 43 | # exploration parameters 44 | epsilon_start: 1.0 45 | epsilon_finish: 0.05 46 | epsilon_anneal_time: 50000 47 | act_noise_std: 0.1 48 | num_random_episodes: 500 # [TBD] 49 | 50 | # replay buffer 51 | buffer_size: 32 # Number of buffer transitions to train on at once 52 | use_per: True # Whether to use prioritized experience replay 53 | per_alpha: 0.6 # Alpha term for prioritized experience replay, like learning rate 54 | per_beta_start: 0.4 # Starting beta term for prioritized experience replay 55 | per_eps: 1e-6 # Eps term for prioritized experience replay 56 | 57 | # policy 58 | 59 | # parallel 60 | n_training_threads: 8 # TBD 61 | n_rollout_threads: 1 #TBD 62 | 63 | # GPU 64 | device: 'gpu' 65 | cuda: True 66 | cuda_deterministic: False # TBD 67 | 68 | # save 69 | save_path: results 70 | restore: False 71 | save_interval: 100000 72 | 73 | # log 74 | log_interval: 1000 75 | 76 | # evaluate 77 | use_eval: True 78 | eval_interval: 10000 79 | num_eval_episodes: 5 -------------------------------------------------------------------------------- /train/ddpg/test.py: -------------------------------------------------------------------------------- 1 | import gym, torch, numpy as np, tianshou as ts 2 | from torch import nn 3 | from torch.utils.tensorboard import SummaryWriter 4 | from tianshou.utils import TensorboardLogger 5 | 6 | # make env 7 | train_envs = ts.env.DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(4)]) 8 | test_envs = ts.env.DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(1)]) 9 | 10 | # build the network 11 | class Net(nn.Module): 12 | def __init__(self, state_shape, action_shape): 13 | super().__init__() 14 | self.model = nn.Sequential( 15 | nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True), 16 | nn.Linear(128, 128), nn.ReLU(inplace=True), 17 | nn.Linear(128, 128), nn.ReLU(inplace=True), 18 | nn.Linear(128, np.prod(action_shape)), 19 | ) # inplace: calculate without copy prod: flatten the shape 20 | 21 | def forward(self, obs, state = None, info = {}): 22 | if not isinstance(obs, torch.Tensor): 23 | obs = torch.tensor(obs, dtype=torch.float) 24 | batch = obs.shape[0] 25 | logits = self.model(obs.view(batch, -1)) 26 | return logits, state 27 | 28 | state_shape = train_envs.observation_space[0].shape or train_envs.observation_space[0].n 29 | action_shape = train_envs.action_space[0].shape or train_envs.action_space[0].n 30 | net = Net(state_shape, action_shape) 31 | optim = torch.optim.Adam(net.parameters(), lr=1e-3) 32 | 33 | # set up policy 34 | policy = ts.policy.DQNPolicy(net, optim, discount_factor=0.9, estimation_step=3, target_update_freq=320) 35 | 36 | # setup collector 37 | train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(20000, 10), exploration_noise=True) 38 | test_collector = ts.data.Collector(policy, 
test_envs, exploration_noise=True) 39 | 40 | # logger 41 | writer = SummaryWriter('log/dqn') 42 | logger = TensorboardLogger(writer) 43 | 44 | # trainer 45 | # step/epoch: collect 46 | # update/step: train after these steps 47 | # step/collect: update times according to collect 48 | result = ts.trainer.offpolicy_trainer( 49 | policy, train_collector, test_collector, 50 | max_epoch=10, step_per_epoch=10000, step_per_collect=10, 51 | update_per_step=0.1, episode_per_test=100, batch_size=64, 52 | train_fn=lambda epoch, env_step: policy.set_eps(0.1), 53 | test_fn=lambda epoch, env_step: policy.set_eps(0.05), 54 | stop_fn=lambda mean_rewards: mean_rewards >= train_envs.spec[0].reward_threshold, 55 | logger = logger) 56 | print(f'Finished training! Use {result["duration"]}') 57 | 58 | # save policy 59 | torch.save(policy.state_dict(), 'models/dqn.pth') 60 | # policy.load_state_dict(torch.load('dqn.pth')) 61 | # evaluate 62 | test_collector.collect(n_episode = 1, render = 1/30) 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Formation environment based on MPE 2 | 3 | A multi-agent formation control environment implemented with MPE. 4 | 5 | ## Installation 6 | 7 | ``` 8 | git clone https://github.com/jc-bao/gym-formation.git 9 | cd gym-formation 10 | pip install -e . 11 | ``` 12 | 13 | ## Test 14 | ``` 15 | python test.py -s formation_hd_env --num-layer 1 16 | ``` 17 | Note: use the `-r` flag to run a random policy. 18 | 19 | ## TODO 20 | 21 | - [ ] Observation: reduce the observation size in the hierarchy policy. (now uses full observation) 22 | 23 | - [ ] Leader & Communication: choose the group leader in each layer smartly and communicate smartly. (now uses the first agent as leader) 24 | 25 | - [ ] Target shape: achieve asymmetric shapes. (now only symmetric shapes in higher-level control) 26 | 27 | - [ ] Group: divide the group smartly to reduce formation time. (now the groups are divided in advance: more layers, less distributed) 28 | 29 | - [ ] Location: the estimation of the position of groups. (now a group is located by inferring its center from the leader's observation) 30 | 31 | ## Extend to more agents with a hierarchy policy 32 | 33 | ```python 34 | num_agents_per_layer = 3 # number of agents of your original policy network (or you can use ezpolicy provided by the package) 35 | num_layer = 2 # number of control layers, extends agent number to n^{layers} 36 | env = formation_gym.make_env('formation_hd_env', benchmark=False, num_agents = num_agents_per_layer**num_layer) 37 | obs_n = env.reset() 38 | while True: 39 | # use BFS to extend your policy to larger scale 40 | act_n = formation_gym.get_action_BFS(YOUR_POLICY_HERE, obs_n, num_agents_per_layer) 41 | # step environment 42 | obs_n, reward_n, done_n, _ = env.step(act_n) 43 | ... 44 | ``` 45 | 46 | Note: 47 | 48 | * not recommended to use more than 5 layers, which would run 3^5 networks in parallel. 49 | * make sure your policy network can correctly turn a single agent's observation into an action. 50 | * `get_action_BFS` is based on [Breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search). 51 | * **get any target shape**: use the function provided by the env, `ideal_shape = env.generate_shape(num_layers = 3, layer_shapes = YOUR_TARGET_LAYER_SHAPE).reshape(-1,2)`, and replace the corresponding part of the observation with it (see the sketch after this list).
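A minimal runnable sketch tying the pieces above together, following `test.py` and the snippet in this section. It uses the built-in `formation_gym.ezpolicy` as a stand-in for your trained policy; how `ideal_shape` is substituted into the observation depends on the scenario's observation layout, so that step is only indicated as a comment (`YOUR_TARGET_LAYER_SHAPE` is a placeholder, as above).

```python
import numpy as np
import formation_gym

num_agents_per_layer = 3  # group size the base policy handles
num_layer = 2             # total agents = num_agents_per_layer ** num_layer

env = formation_gym.make_env('formation_hd_env', benchmark=False,
                             num_agents=num_agents_per_layer**num_layer)
obs_n = env.reset()

# Optional: build a custom target topology.
# ideal_shape = env.generate_shape(num_layers=num_layer,
#                                  layer_shapes=YOUR_TARGET_LAYER_SHAPE).reshape(-1, 2)
# ...then replace the corresponding part of each observation with ideal_shape.

while True:
    # BFS over the hierarchy: run the per-group policy for every group
    act_n = formation_gym.get_action_BFS(formation_gym.ezpolicy, obs_n, num_agents_per_layer)
    obs_n, reward_n, done_n, _ = env.step(act_n)
    if np.all(done_n):
        obs_n = env.reset()
    env.render()
```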
52 | 53 | ## Train 54 | 55 | Please Refer to `train/README.md` 56 | If you want to use another algorithm, here is the template: 57 | 58 | ``` 59 | import formation_gym 60 | 61 | env = formation_gym.make_env(your_scenario_name, if_use_benchmark, number_of_agents, episode_length) 62 | ``` 63 | 64 | ## Scenarios 65 | 66 | | basic_formation_env | formation_hd_env | 67 | | ------------------------------------------------------------ | ------------------------------------------------------------ | 68 | | The reimplemtation for OpenAI MPE spread enviroment. The target it reach the landmark. | Try to mimic the topology of landmarks only with relative observation. | 69 | | | ![Nov-24-2021 14-10-59](https://tva1.sinaimg.cn/large/008i3skNly1gwq7m2aj1pg30ii0i0e82.gif) | 70 | | ![plt](https://tva1.sinaimg.cn/large/008i3skNly1gukfvhkxraj60hs0dcaal02.jpg) | ![plt](https://tva1.sinaimg.cn/large/008i3skNly1gukfuj9pr7j60hs0dc3yz02.jpg) | 71 | 72 | ## Further information 73 | 74 | ``` 75 | action space = [if_moveable, action_1, ... action_n, comm_1, ... comm_n] 76 | ``` 77 | 78 | ### MVE Support 79 | 80 | * Action: `` 81 | -------------------------------------------------------------------------------- /train/maddpg-v2/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import time 4 | import imageio 5 | import numpy as np 6 | from pathlib import Path 7 | from torch.autograd import Variable 8 | from utils.make_env import make_env 9 | from algorithms.maddpg import MADDPG 10 | 11 | import formation_gym 12 | 13 | 14 | def run(config): 15 | model_path = (Path('./models') / config.env_id / config.model_name / 16 | ('run%i' % config.run_num)) 17 | if config.incremental is not None: 18 | model_path = model_path / 'incremental' / ('model_ep%i.pt' % 19 | config.incremental) 20 | else: 21 | model_path = model_path / 'model.pt' 22 | 23 | if config.save_gifs: 24 | gif_path = model_path.parent / 'gifs' 25 | gif_path.mkdir(exist_ok=True) 26 | 27 | maddpg = MADDPG.init_from_save(model_path) 28 | env = formation_gym.make_env(config.env_id, False, config.agent_num) 29 | maddpg.prep_rollouts(device='cpu') # cpu 30 | ifi = 1 / config.fps # inter-frame interval 31 | 32 | for ep_i in range(config.n_episodes): 33 | print("Episode %i of %i" % (ep_i + 1, config.n_episodes)) 34 | obs = env.reset() 35 | if config.save_gifs: 36 | frames = [] 37 | frames.append(env.render('rgb_array')[0]) 38 | env.render('human') 39 | for t_i in range(config.episode_length): 40 | calc_start = time.time() 41 | # rearrange observations to be per agent, and convert to torch Variable 42 | torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1), 43 | requires_grad=False) 44 | for i in range(maddpg.nagents)] 45 | # get actions as torch Variables 46 | torch_actions = maddpg.step(torch_obs, explore=False) 47 | # convert actions to numpy arrays 48 | actions = [ac.data.numpy().flatten() for ac in torch_actions] 49 | obs, rewards, dones, infos = env.step(actions) 50 | if config.save_gifs: 51 | frames.append(env.render('rgb_array')[0]) 52 | calc_end = time.time() 53 | elapsed = calc_end - calc_start 54 | if elapsed < ifi: 55 | time.sleep(ifi - elapsed) 56 | env.render('human') 57 | if config.save_gifs: 58 | gif_num = 0 59 | while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists(): 60 | gif_num += 1 61 | imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))), 62 | frames, duration=ifi) 63 | 64 | env.close() 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = 
argparse.ArgumentParser() 69 | parser.add_argument("--env_id", default='formation_hd_env', type = str, help="Name of environment") 70 | parser.add_argument("--model_name",default='model', type = str, help="Name of model") 71 | parser.add_argument("--run_num", default=1, type=int) 72 | parser.add_argument("--save_gifs", action="store_true", help="Saves gif of each episode into model directory") 73 | parser.add_argument("--incremental", default=None, type=int, help="Load incremental policy from given episode " + "rather than final policy") 74 | parser.add_argument("--n_episodes", default=10, type=int) 75 | parser.add_argument("--episode_length", default=30, type=int) 76 | parser.add_argument("--fps", default=30, type=int) 77 | parser.add_argument("--agent-num", type=int, default = 9) 78 | 79 | config = parser.parse_args() 80 | 81 | run(config) -------------------------------------------------------------------------------- /formation_gym/envs/basic_formation_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from multiagent.scenario import BaseScenario 4 | from multiagent.core import World, Agent, Landmark 5 | 6 | class Scenario(BaseScenario): 7 | def make_world(self, num_agents = 3, num_landmarks = 3): 8 | # world properties 9 | world = World() 10 | world.dim_c = 2 # communication channel 11 | world.collaborative = True 12 | # agent properties 13 | world.agents = [Agent() for i in range(num_agents)] 14 | for i, agent in enumerate(world.agents): 15 | agent.name = 'agent %d' % i 16 | agent.collide = True 17 | agent.silent = True 18 | agent.size = 0.1 19 | # landmark properties 20 | world.landmarks = [Landmark() for i in range(num_landmarks)] 21 | for i, landmark in enumerate(world.landmarks): 22 | landmark.name = 'landmarks %d' % i 23 | landmark.collide = False 24 | landmark.movable = False 25 | # initial conditions 26 | self.reset_world(world) 27 | return world 28 | 29 | def observation(self, agent, world): 30 | # landmark pos 31 | entity_pos = [] 32 | for entity in world.landmarks: 33 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 34 | # agent pos & communication 35 | other_pos = [] 36 | comm = [] 37 | for other in world.agents: 38 | if other is agent: continue 39 | comm.append(other.state.c) 40 | other_pos.append(other.state.p_pos - agent.state.p_pos) 41 | return np.concatenate([agent.state.p_vel]+[agent.state.p_pos]+entity_pos + other_pos + comm) 42 | 43 | def reward(self, agent, world): 44 | rew = 0 45 | for l in world.landmarks: 46 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 47 | rew -= min(dists) 48 | if agent.collide: 49 | for a in world.agents: 50 | if self.is_collision(a, agent): 51 | rew -= 1 52 | return rew 53 | 54 | def reset_world(self, world): 55 | # agent 56 | for agent in world.agents: 57 | agent.color = np.array([0.35, 0.35, 0.85]) 58 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 59 | agent.state.p_vel = np.zeros(world.dim_p) 60 | agent.state.c = np.zeros(world.dim_c) 61 | # landmark 62 | for landmark in world.landmarks: 63 | landmark.color = np.array([0.25, 0.25, 0.25]) 64 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 65 | landmark.state.p_vel = np.zeros(world.dim_p) 66 | 67 | def benchmark_data(self, agent, world): 68 | # get data to debug 69 | rew = self.reward(agent, world) 70 | collisions = 0 71 | if agent.collide: 72 | for a in world.agents: 73 | if self.is_collision(a, agent): 74 | collisions += 1 75 | min_dists = 0 76 
| occupied_landmarks = 0 77 | for l in world.landmarks: 78 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 79 | min_dists += min(dists) 80 | if min(dists) < 0.1: 81 | occupied_landmarks += 1 82 | return { 83 | 'reward': rew, 84 | 'collisions': collisions, 85 | 'min_dists': min_dists, 86 | 'occupied_landmarks': occupied_landmarks 87 | } 88 | 89 | def is_collision(self, agent1, agent2): 90 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 91 | return dist < (agent1.size + agent2.size) 92 | 93 | -------------------------------------------------------------------------------- /train/maddpg-v4/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import numpy as np 4 | from pathlib import Path 5 | from gym.spaces import Box, Discrete, Tuple 6 | 7 | from utils import get_config, get_cent_act_dim, get_dim_from_space, make_train_env 8 | from runner import Runner 9 | 10 | if __name__ == "__main__": 11 | config = get_config() 12 | # torch setup 13 | if config['cuda'] and torch.cuda.is_available(): 14 | print("choose to use gpu...") 15 | device = torch.device("cuda:0") 16 | torch.set_num_threads(config['n_training_threads']) 17 | if config['cuda_deterministic']: 18 | torch.backends.cudnn.benchmark = False 19 | torch.backends.cudnn.deterministic = True 20 | else: 21 | print("choose to use cpu...") 22 | device = torch.device("cpu") 23 | torch.set_num_threads(config['n_training_threads']) 24 | torch.manual_seed(config['seed']) 25 | torch.cuda.manual_seed_all(config['seed']) 26 | np.random.seed(config['seed']) 27 | # dir setup 28 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + '/' + config['save_path']+ '/'+config['scenario_name']+'_' + config['algorithm_name']+'_'+str(config['experiment_index'])) 29 | if not run_dir.exists(): 30 | os.makedirs(str(run_dir)) 31 | if not os.path.exists(run_dir/'logs'): 32 | os.makedirs(run_dir/'logs') 33 | if not os.path.exists(run_dir/'models'): 34 | os.makedirs(run_dir/'models') 35 | else: 36 | config['restore'] = True 37 | config['fullpath'] = str(run_dir) 38 | config['model_path'] = str(run_dir) + '/models' 39 | config['log_path'] = str(run_dir) + '/logs' 40 | # env setup 41 | env = make_train_env(config) 42 | eval_env = make_train_env(config) 43 | # algorithm setup 44 | if config['share_policy']: 45 | config['policy_info'] = { 46 | 'policy_0': {"cent_obs_dim": get_dim_from_space(env.share_observation_space[0]), 47 | "cent_act_dim": get_cent_act_dim(env.action_space), 48 | "obs_space": env.observation_space[0], 49 | "share_obs_space": env.share_observation_space[0], 50 | "act_space": env.action_space[0]} 51 | } 52 | def policy_mapping_fn(id): return 'policy_0' 53 | else: 54 | config['policy_info'] = { 55 | 'policy_' + str(agent_id): { 56 | "cent_obs_dim": get_dim_from_space(env.share_observation_space[agent_id]), 57 | "cent_act_dim": get_cent_act_dim(env.action_space), 58 | "obs_space": env.observation_space[agent_id], 59 | 'obs_dim': get_dim_from_space(env.observation_space[agent_id]), 60 | "share_obs_space": env.share_observation_space[agent_id], 61 | "act_space": env.action_space[agent_id], 62 | "act_dim": get_dim_from_space(env.action_space[agent_id]), 63 | "output_dim": sum(get_dim_from_space(env.action_space[agent_id])) if isinstance((get_dim_from_space(env.action_space[agent_id]), np.ndarray)) else get_dim_from_space(env.action_space[agent_id]), 64 | } 65 | for agent_id in range(config['num_agents']) 66 | } 67 | def 
policy_mapping_fn(agent_id): return 'policy_' + str(agent_id) 68 | # Q: why do we need this one 69 | config['policy_mapping_fn']=policy_mapping_fn 70 | # more parameters 71 | config['env'] = env 72 | config['eval_env'] = eval_env 73 | config['discrete'] = isinstance(config['policy_info']["act_space"], Discrete) or "MultiDiscrete" in (config['policy_info']["act_space"].__class__.__name__) 74 | config['multidiscrete'] = ("MultiDiscrete" in config['policy_info']["act_space"].__class__.__name__) 75 | config['tpdv'] = dict(dtype=torch.float32, device=config['device']) 76 | # train 77 | total_steps = 0 78 | runner = Runner(config) 79 | while total_steps < config['env_steps']: 80 | total_steps = runner.run() 81 | # close 82 | env.close() 83 | eval_env.close() -------------------------------------------------------------------------------- /train/maddpg-v1/runner.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | from agent import Agent 3 | from common.replay_buffer import Buffer 4 | import torch 5 | import os 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import time 9 | 10 | 11 | class Runner: 12 | def __init__(self, args, env): 13 | self.args = args 14 | self.noise = args.noise_rate 15 | self.epsilon = args.epsilon 16 | self.episode_limit = args.max_episode_len 17 | self.env = env 18 | self.agents = self._init_agents() 19 | self.buffer = Buffer(args) 20 | self.save_path = 'results/' + self.args.save_dir + '/' + self.args.scenario_name 21 | if not os.path.exists(self.save_path): 22 | os.makedirs(self.save_path) 23 | 24 | def _init_agents(self): 25 | agents = [] 26 | for i in range(self.args.n_agents): 27 | agent = Agent(i, self.args) 28 | agents.append(agent) 29 | return agents 30 | 31 | def run(self): 32 | returns = [] 33 | for time_step in tqdm(range(self.args.time_steps)): 34 | # reset the environment 35 | if time_step % self.episode_limit == 0: 36 | s = self.env.reset() 37 | u = [] 38 | actions = [] 39 | with torch.no_grad(): 40 | for agent_id, agent in enumerate(self.agents): 41 | action = agent.select_action(s[agent_id], self.noise, self.epsilon) 42 | u.append(action) 43 | actions.append(action) 44 | for i in range(self.args.n_agents, self.args.n_players): 45 | actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0]) 46 | s_next, r, done, info = self.env.step(actions) 47 | self.buffer.store_episode(s[:self.args.n_agents], u, r[:self.args.n_agents], s_next[:self.args.n_agents]) 48 | s = s_next 49 | if self.buffer.current_size >= self.args.batch_size: 50 | transitions = self.buffer.sample(self.args.batch_size) 51 | for agent in self.agents: 52 | other_agents = self.agents.copy() 53 | other_agents.remove(agent) 54 | agent.learn(transitions, other_agents) 55 | if time_step > 0 and time_step % self.args.evaluate_rate == 0: 56 | returns.append(self.evaluate()) 57 | plt.figure() 58 | plt.plot(range(len(returns)), returns) 59 | plt.xlabel('episode * ' + str(self.args.evaluate_rate / self.episode_limit)) 60 | plt.ylabel('average returns') 61 | plt.savefig(self.save_path + '/plt.png', format='png') 62 | self.noise = max(0.05, self.noise - 0.0000005) 63 | self.epsilon = max(0.05, self.noise - 0.0000005) 64 | np.save(self.save_path + '/returns.pkl', returns) 65 | 66 | def evaluate(self, rnd = False): 67 | returns = [] 68 | for episode in range(self.args.evaluate_episodes): 69 | # reset the environment 70 | rewards = 0 71 | s = self.env.reset() 72 | for time_step in 
range(self.args.evaluate_episode_len): 73 | if rnd: 74 | self.env.render() 75 | # time.sleep(1) 76 | actions = [] 77 | with torch.no_grad(): 78 | for agent_id, agent in enumerate(self.agents): 79 | action = agent.select_action(s[agent_id], 0, 0) 80 | actions.append(action) 81 | for i in range(self.args.n_agents, self.args.n_players): 82 | actions.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0]) 83 | s_next, r, done, info = self.env.step(actions) 84 | if isinstance(r[0], list): rewards += r[0][0] 85 | else: rewards += r[0] 86 | s = s_next 87 | returns.append(rewards) 88 | print('Returns is', rewards, 'Final Reward:', r[0]) 89 | return sum(returns) / self.args.evaluate_episodes 90 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/agents.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch.autograd import Variable 3 | from torch.optim import Adam 4 | from .networks import MLPNetwork 5 | from .misc import hard_update, gumbel_softmax, onehot_from_logits 6 | from .noise import OUNoise 7 | 8 | class DDPGAgent(object): 9 | """ 10 | General class for DDPG agents (policy, critic, target policy, target 11 | critic, exploration noise) 12 | """ 13 | def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim=64, 14 | lr=0.01, discrete_action=True): 15 | """ 16 | Inputs: 17 | num_in_pol (int): number of dimensions for policy input 18 | num_out_pol (int): number of dimensions for policy output 19 | num_in_critic (int): number of dimensions for critic input 20 | """ 21 | self.policy = MLPNetwork(num_in_pol, num_out_pol, 22 | hidden_dim=hidden_dim, 23 | constrain_out=True, 24 | discrete_action=discrete_action) 25 | self.critic = MLPNetwork(num_in_critic, 1, 26 | hidden_dim=hidden_dim, 27 | constrain_out=False) 28 | self.target_policy = MLPNetwork(num_in_pol, num_out_pol, 29 | hidden_dim=hidden_dim, 30 | constrain_out=True, 31 | discrete_action=discrete_action) 32 | self.target_critic = MLPNetwork(num_in_critic, 1, 33 | hidden_dim=hidden_dim, 34 | constrain_out=False) 35 | hard_update(self.target_policy, self.policy) 36 | hard_update(self.target_critic, self.critic) 37 | self.policy_optimizer = Adam(self.policy.parameters(), lr=lr) 38 | self.critic_optimizer = Adam(self.critic.parameters(), lr=lr) 39 | if not discrete_action: 40 | self.exploration = OUNoise(num_out_pol) 41 | else: 42 | self.exploration = 0.3 # epsilon for eps-greedy 43 | self.discrete_action = discrete_action 44 | 45 | def reset_noise(self): 46 | if not self.discrete_action: 47 | self.exploration.reset() 48 | 49 | def scale_noise(self, scale): 50 | if self.discrete_action: 51 | self.exploration = scale 52 | else: 53 | self.exploration.scale = scale 54 | 55 | def step(self, obs, explore=False): 56 | """ 57 | Take a step forward in environment for a minibatch of observations 58 | Inputs: 59 | obs (PyTorch Variable): Observations for this agent 60 | explore (boolean): Whether or not to add exploration noise 61 | Outputs: 62 | action (PyTorch Variable): Actions for this agent 63 | """ 64 | action = self.policy(obs) 65 | if self.discrete_action: 66 | if explore: 67 | action = gumbel_softmax(action, hard=True) 68 | else: 69 | action = onehot_from_logits(action) 70 | else: # continuous action 71 | if explore: 72 | action += Variable(Tensor(self.exploration.noise()), 73 | requires_grad=False) 74 | action = action.clamp(-1, 1) 75 | return action 76 | 77 | def get_params(self): 78 | return 
{'policy': self.policy.state_dict(), 79 | 'critic': self.critic.state_dict(), 80 | 'target_policy': self.target_policy.state_dict(), 81 | 'target_critic': self.target_critic.state_dict(), 82 | 'policy_optimizer': self.policy_optimizer.state_dict(), 83 | 'critic_optimizer': self.critic_optimizer.state_dict()} 84 | 85 | def load_params(self, params): 86 | self.policy.load_state_dict(params['policy']) 87 | self.critic.load_state_dict(params['critic']) 88 | self.target_policy.load_state_dict(params['target_policy']) 89 | self.target_critic.load_state_dict(params['target_critic']) 90 | self.policy_optimizer.load_state_dict(params['policy_optimizer']) 91 | self.critic_optimizer.load_state_dict(params['critic_optimizer']) 92 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn.functional as F 4 | import torch.distributed as dist 5 | from torch.autograd import Variable 6 | import numpy as np 7 | 8 | # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L11 9 | def soft_update(target, source, tau): 10 | """ 11 | Perform DDPG soft update (move target params toward source based on weight 12 | factor tau) 13 | Inputs: 14 | target (torch.nn.Module): Net to copy parameters to 15 | source (torch.nn.Module): Net whose parameters to copy 16 | tau (float, 0 < x < 1): Weight factor for update 17 | """ 18 | for target_param, param in zip(target.parameters(), source.parameters()): 19 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 20 | 21 | # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L15 22 | def hard_update(target, source): 23 | """ 24 | Copy network parameters from source to target 25 | Inputs: 26 | target (torch.nn.Module): Net to copy parameters to 27 | source (torch.nn.Module): Net whose parameters to copy 28 | """ 29 | for target_param, param in zip(target.parameters(), source.parameters()): 30 | target_param.data.copy_(param.data) 31 | 32 | # https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py 33 | def average_gradients(model): 34 | """ Gradient averaging. """ 35 | size = float(dist.get_world_size()) 36 | for param in model.parameters(): 37 | dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0) 38 | param.grad.data /= size 39 | 40 | # https://github.com/seba-1511/dist_tuto.pth/blob/gh-pages/train_dist.py 41 | def init_processes(rank, size, fn, backend='gloo'): 42 | """ Initialize the distributed environment. 
""" 43 | os.environ['MASTER_ADDR'] = '127.0.0.1' 44 | os.environ['MASTER_PORT'] = '29500' 45 | dist.init_process_group(backend, rank=rank, world_size=size) 46 | fn(rank, size) 47 | 48 | def onehot_from_logits(logits, eps=0.0): 49 | """ 50 | Given batch of logits, return one-hot sample using epsilon greedy strategy 51 | (based on given epsilon) 52 | """ 53 | # get best (according to current policy) actions in one-hot form 54 | argmax_acs = (logits == logits.max(1, keepdim=True)[0]).float() 55 | if eps == 0.0: 56 | return argmax_acs 57 | # get random actions in one-hot form 58 | rand_acs = Variable(torch.eye(logits.shape[1])[[np.random.choice( 59 | range(logits.shape[1]), size=logits.shape[0])]], requires_grad=False) 60 | # chooses between best and random actions using epsilon greedy 61 | return torch.stack([argmax_acs[i] if r > eps else rand_acs[i] for i, r in 62 | enumerate(torch.rand(logits.shape[0]))]) 63 | 64 | # modified for PyTorch from https://github.com/ericjang/gumbel-softmax/blob/master/Categorical%20VAE.ipynb 65 | def sample_gumbel(shape, eps=1e-20, tens_type=torch.FloatTensor): 66 | """Sample from Gumbel(0, 1)""" 67 | U = Variable(tens_type(*shape).uniform_(), requires_grad=False) 68 | return -torch.log(-torch.log(U + eps) + eps) 69 | 70 | # modified for PyTorch from https://github.com/ericjang/gumbel-softmax/blob/master/Categorical%20VAE.ipynb 71 | def gumbel_softmax_sample(logits, temperature): 72 | """ Draw a sample from the Gumbel-Softmax distribution""" 73 | y = logits + sample_gumbel(logits.shape, tens_type=type(logits.data)) 74 | return F.softmax(y / temperature, dim=1) 75 | 76 | # modified for PyTorch from https://github.com/ericjang/gumbel-softmax/blob/master/Categorical%20VAE.ipynb 77 | def gumbel_softmax(logits, temperature=1.0, hard=False): 78 | """Sample from the Gumbel-Softmax distribution and optionally discretize. 79 | Args: 80 | logits: [batch_size, n_class] unnormalized log-probs 81 | temperature: non-negative scalar 82 | hard: if True, take argmax, but differentiate w.r.t. soft sample y 83 | Returns: 84 | [batch_size, n_class] sample from the Gumbel-Softmax distribution. 85 | If hard=True, then the returned sample will be one-hot, otherwise it will 86 | be a probabilitiy distribution that sums to 1 across classes 87 | """ 88 | y = gumbel_softmax_sample(logits, temperature) 89 | if hard: 90 | y_hard = onehot_from_logits(y) 91 | y = (y_hard - y).detach() + y 92 | return y 93 | -------------------------------------------------------------------------------- /train/maddpg-v3/env/multiagent_particle_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from gym.spaces import Discrete, Box, MultiDiscrete 6 | from ray import rllib 7 | from make_env import make_env 8 | 9 | import numpy as np 10 | import time 11 | 12 | 13 | class RLlibMultiAgentParticleEnv(rllib.MultiAgentEnv): 14 | """Wraps OpenAI Multi-Agent Particle env to be compatible with RLLib multi-agent.""" 15 | 16 | def __init__(self, **mpe_args): 17 | """Create a new Multi-Agent Particle env compatible with RLlib. 18 | 19 | Arguments: 20 | mpe_args (dict): Arguments to pass to the underlying 21 | make_env.make_env instance. 
22 | 23 | Examples: 24 | >>> from rllib_env import RLlibMultiAgentParticleEnv 25 | >>> env = RLlibMultiAgentParticleEnv(scenario_name="simple_reference") 26 | >>> print(env.reset()) 27 | """ 28 | 29 | self._env = make_env(**mpe_args) 30 | self.num_agents = self._env.n 31 | self.agent_ids = list(range(self.num_agents)) 32 | 33 | self.observation_space_dict = self._make_dict(self._env.observation_space) 34 | self.action_space_dict = self._make_dict(self._env.action_space) 35 | 36 | def reset(self): 37 | """Resets the env and returns observations from ready agents. 38 | 39 | Returns: 40 | obs_dict: New observations for each ready agent. 41 | """ 42 | 43 | obs_dict = self._make_dict(self._env.reset()) 44 | return obs_dict 45 | 46 | def step(self, action_dict): 47 | """Returns observations from ready agents. 48 | 49 | The returns are dicts mapping from agent_id strings to values. The 50 | number of agents in the env can vary over time. 51 | 52 | Returns: 53 | obs_dict: 54 | New observations for each ready agent. 55 | rew_dict: 56 | Reward values for each ready agent. 57 | done_dict: 58 | Done values for each ready agent. 59 | The special key "__all__" (required) is used to indicate env termination. 60 | info_dict: 61 | Optional info values for each agent id. 62 | """ 63 | 64 | actions = list(action_dict.values()) 65 | obs_list, rew_list, done_list, info_list = self._env.step(actions) 66 | 67 | obs_dict = self._make_dict(obs_list) 68 | rew_dict = self._make_dict(rew_list) 69 | done_dict = self._make_dict(done_list) 70 | done_dict["__all__"] = all(done_list) 71 | # FIXME: Currently, this is the best option to transfer agent-wise termination signal without touching RLlib code hugely. 72 | # FIXME: Hopefully, this will be solved in the future. 73 | info_dict = self._make_dict([{"done": done} for done in done_list]) 74 | 75 | return obs_dict, rew_dict, done_dict, info_dict 76 | 77 | def render(self, mode='human'): 78 | time.sleep(0.05) 79 | self._env.render(mode=mode) 80 | 81 | def _make_dict(self, values): 82 | return dict(zip(self.agent_ids, values)) 83 | 84 | 85 | if __name__ == '__main__': 86 | for scenario_name in ["simple", 87 | "simple_adversary", 88 | "simple_crypto", 89 | "simple_push", 90 | "simple_reference", 91 | "simple_speaker_listener", 92 | "simple_spread", 93 | "simple_tag", 94 | "simple_world_comm"]: 95 | print("scenario_name: ", scenario_name) 96 | env = RLlibMultiAgentParticleEnv(scenario_name=scenario_name) 97 | print("obs: ", env.reset()) 98 | print(env.observation_space_dict) 99 | print(env.action_space_dict) 100 | 101 | action_dict = {} 102 | for i, ac_space in env.action_space_dict.items(): 103 | sample = ac_space.sample() 104 | if isinstance(ac_space, Discrete): 105 | action_dict[i] = np.zeros(ac_space.n) 106 | action_dict[i][sample] = 1.0 107 | elif isinstance(ac_space, Box): 108 | action_dict[i] = sample 109 | elif isinstance(ac_space, MultiDiscrete): 110 | print("sample: ", sample) 111 | print("ac_space: ", ac_space.nvec) 112 | action_dict[i] = np.zeros(sum(ac_space.nvec)) 113 | start_ls = np.cumsum([0] + list(ac_space.nvec))[:-1] 114 | for l in list(start_ls + sample): 115 | action_dict[i][l] = 1.0 116 | else: 117 | raise NotImplementedError 118 | 119 | print("action_dict: ", action_dict) 120 | 121 | for i in env.step(action_dict): 122 | print(i) 123 | -------------------------------------------------------------------------------- /train/maddpg-v3/env/wrapper.py: -------------------------------------------------------------------------------- 1 | from __future__ import 
absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from gym.spaces import Discrete, Box, MultiDiscrete 6 | from ray import rllib 7 | from make_env import make_env 8 | 9 | import formation_gym 10 | 11 | import numpy as np 12 | import time 13 | 14 | 15 | class FormationEnv(rllib.MultiAgentEnv): 16 | """Wraps OpenAI Multi-Agent Particle env to be compatible with RLLib multi-agent.""" 17 | 18 | def __init__(self, **mpe_args): 19 | """Create a new Multi-Agent Particle env compatible with RLlib. 20 | 21 | Arguments: 22 | mpe_args (dict): Arguments to pass to the underlying 23 | make_env.make_env instance. 24 | 25 | Examples: 26 | >>> from rllib_env import RLlibMultiAgentParticleEnv 27 | >>> env = RLlibMultiAgentParticleEnv(scenario_name="simple_reference") 28 | >>> print(env.reset()) 29 | """ 30 | print(mpe_args) 31 | self._env = formation_gym.make_env(**mpe_args) 32 | self.num_agents = self._env.num_agents 33 | self.agent_ids = list(range(self.num_agents)) 34 | 35 | self.observation_space_dict = self._make_dict(self._env.observation_space) 36 | self.action_space_dict = self._make_dict(self._env.action_space) 37 | 38 | def reset(self): 39 | """Resets the env and returns observations from ready agents. 40 | 41 | Returns: 42 | obs_dict: New observations for each ready agent. 43 | """ 44 | 45 | obs_dict = self._make_dict(self._env.reset()) 46 | return obs_dict 47 | 48 | def step(self, action_dict): 49 | """Returns observations from ready agents. 50 | 51 | The returns are dicts mapping from agent_id strings to values. The 52 | number of agents in the env can vary over time. 53 | 54 | Returns: 55 | obs_dict: 56 | New observations for each ready agent. 57 | rew_dict: 58 | Reward values for each ready agent. 59 | done_dict: 60 | Done values for each ready agent. 61 | The special key "__all__" (required) is used to indicate env termination. 62 | info_dict: 63 | Optional info values for each agent id. 64 | """ 65 | 66 | actions = list(action_dict.values()) 67 | obs_list, rew_list, done_list, info_list = self._env.step(actions) 68 | 69 | obs_dict = self._make_dict(obs_list) 70 | rew_dict = self._make_dict(rew_list) 71 | done_dict = self._make_dict(done_list) 72 | done_dict["__all__"] = all(done_list) 73 | # FIXME: Currently, this is the best option to transfer agent-wise termination signal without touching RLlib code hugely. 74 | # FIXME: Hopefully, this will be solved in the future. 
75 | info_dict = self._make_dict([{"done": done} for done in done_list]) 76 | 77 | return obs_dict, rew_dict, done_dict, info_dict 78 | 79 | def render(self, mode='human'): 80 | time.sleep(0.05) 81 | self._env.render(mode=mode) 82 | 83 | def _make_dict(self, values): 84 | return dict(zip(self.agent_ids, values)) 85 | 86 | 87 | if __name__ == '__main__': 88 | for scenario_name in ["simple", 89 | "simple_adversary", 90 | "simple_crypto", 91 | "simple_push", 92 | "simple_reference", 93 | "simple_speaker_listener", 94 | "simple_spread", 95 | "simple_tag", 96 | "simple_world_comm"]: 97 | print("scenario_name: ", scenario_name) 98 | env = RLlibMultiAgentParticleEnv(scenario_name=scenario_name) 99 | print("obs: ", env.reset()) 100 | print(env.observation_space_dict) 101 | print(env.action_space_dict) 102 | 103 | action_dict = {} 104 | for i, ac_space in env.action_space_dict.items(): 105 | sample = ac_space.sample() 106 | if isinstance(ac_space, Discrete): 107 | action_dict[i] = np.zeros(ac_space.n) 108 | action_dict[i][sample] = 1.0 109 | elif isinstance(ac_space, Box): 110 | action_dict[i] = sample 111 | elif isinstance(ac_space, MultiDiscrete): 112 | print("sample: ", sample) 113 | print("ac_space: ", ac_space.nvec) 114 | action_dict[i] = np.zeros(sum(ac_space.nvec)) 115 | start_ls = np.cumsum([0] + list(ac_space.nvec))[:-1] 116 | for l in list(start_ls + sample): 117 | action_dict[i][l] = 1.0 118 | else: 119 | raise NotImplementedError 120 | 121 | print("action_dict: ", action_dict) 122 | 123 | for i in env.step(action_dict): 124 | print(i) 125 | -------------------------------------------------------------------------------- /train/mappo/inbox/render_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import setproctitle 5 | import numpy as np 6 | from pathlib import Path 7 | 8 | import torch 9 | 10 | from config import get_config 11 | 12 | from onpolicy.envs.env_wrappers import SubprocVecEnv, DummyVecEnv 13 | 14 | import formation_gym 15 | 16 | def make_render_env(all_args): 17 | def get_env_fn(rank): 18 | def init_env(): 19 | if all_args.env_name == "MPE": 20 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 21 | else: 22 | print("Can not support the " + 23 | all_args.env_name + "environment.") 24 | raise NotImplementedError 25 | env.seed(all_args.seed + rank * 1000) 26 | return env 27 | return init_env 28 | if all_args.n_rollout_threads == 1: 29 | return DummyVecEnv([get_env_fn(0)]) 30 | else: 31 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 32 | 33 | def parse_args(args, parser): 34 | parser.add_argument('--scenario_name', type=str, 35 | default='simple_spread', help="Which scenario to run on") 36 | parser.add_argument('--num_agents', type=int, 37 | default=3, help="number of players") 38 | 39 | all_args = parser.parse_known_args(args)[0] 40 | 41 | return all_args 42 | 43 | 44 | def main(args): 45 | parser = get_config() 46 | all_args = parse_args(args, parser) 47 | 48 | if all_args.algorithm_name == "rmappo" or all_args.algorithm_name == "rmappg": 49 | assert ( 50 | all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") 51 | elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mappg": 52 | assert (all_args.use_recurrent_policy and all_args.use_naive_recurrent_policy) == False, ( 53 | "check recurrent policy!") 54 | else: 55 | raise NotImplementedError 56 | 
57 | assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, ( 58 | "The simple_speaker_listener scenario can not use shared policy. Please check the config.py.") 59 | 60 | assert all_args.use_render, ("u need to set use_render be True") 61 | assert not (all_args.model_dir == None or all_args.model_dir == ""), ("set model_dir first") 62 | assert all_args.n_rollout_threads==1, ("only support to use 1 env to render.") 63 | 64 | # cuda 65 | if all_args.cuda and torch.cuda.is_available(): 66 | print("choose to use gpu...") 67 | device = torch.device("cuda:0") 68 | torch.set_num_threads(all_args.n_training_threads) 69 | if all_args.cuda_deterministic: 70 | torch.backends.cudnn.benchmark = False 71 | torch.backends.cudnn.deterministic = True 72 | else: 73 | print("choose to use cpu...") 74 | device = torch.device("cpu") 75 | torch.set_num_threads(all_args.n_training_threads) 76 | 77 | # run dir 78 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 79 | print(run_dir) 80 | if not run_dir.exists(): 81 | os.makedirs(str(run_dir)) 82 | 83 | if not run_dir.exists(): 84 | curr_run = 'run1' 85 | else: 86 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] 87 | if len(exst_run_nums) == 0: 88 | curr_run = 'run1' 89 | else: 90 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 91 | run_dir = run_dir / curr_run 92 | if not run_dir.exists(): 93 | os.makedirs(str(run_dir)) 94 | 95 | setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \ 96 | str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name)) 97 | 98 | # seed 99 | torch.manual_seed(all_args.seed) 100 | torch.cuda.manual_seed_all(all_args.seed) 101 | np.random.seed(all_args.seed) 102 | 103 | # env init 104 | envs = make_render_env(all_args) 105 | eval_envs = None 106 | num_agents = all_args.num_agents 107 | 108 | config = { 109 | "all_args": all_args, 110 | "envs": envs, 111 | "eval_envs": eval_envs, 112 | "num_agents": num_agents, 113 | "device": device, 114 | "run_dir": run_dir 115 | } 116 | 117 | # run experiments 118 | if all_args.share_policy: 119 | from onpolicy.runner.shared.mpe_runner import MPERunner as Runner 120 | else: 121 | from onpolicy.runner.separated.mpe_runner import MPERunner as Runner 122 | 123 | runner = Runner(config) 124 | runner.render() 125 | 126 | # post process 127 | envs.close() 128 | 129 | if __name__ == "__main__": 130 | main(sys.argv[1:]) 131 | -------------------------------------------------------------------------------- /formation_gym/envs/formation_hd_partial_range_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from formation_gym.core import World, Agent, Landmark 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | 11 | partial observation environment 12 | ''' 13 | 14 | class Scenario(BaseScenario): 15 | def make_world(self, num_agents = 4, num_landmarks = 4, obs_range = 0.7, world_length = 25): 16 | self.obs_range = obs_range 17 | self.num_agents = num_agents 18 | # world properties 19 | world = World() 20 | world.world_length = world_length 21 | world.dim_c = 2 # 
communication channel 22 | world.collaborative = True 23 | # agent properties 24 | world.agents = [Agent() for i in range(num_agents)] 25 | for i, agent in enumerate(world.agents): 26 | agent.name = 'agent %d' % i 27 | agent.collide = True 28 | agent.silent = True 29 | agent.size = 0.04 30 | # landmark properties 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmarks %d' % i 34 | landmark.collide = False 35 | landmark.movable = False 36 | landmark.size = 0.02 37 | # initial conditions 38 | self.reset_world(world) 39 | return world 40 | 41 | def observation(self, agent, world): 42 | # landmark pos 43 | entity_pos = [] 44 | for entity in world.landmarks: 45 | entity_pos.append(entity.state.p_pos) 46 | # agent pos & communication 47 | other_pos = [] 48 | comm = [] 49 | # set range for watching 50 | for other in world.agents: 51 | if other is agent: continue 52 | comm.append(other.state.c) 53 | other_pos.append(np.clip(other.state.p_pos - agent.state.p_pos, [-self.obs_range, -self.obs_range], [self.obs_range, self.obs_range])) 54 | return np.concatenate([agent.state.p_vel]+entity_pos + other_pos + comm) 55 | 56 | def reward(self, agent, world): 57 | rew = 0 58 | u = [a.state.p_pos for a in world.agents] 59 | v = [l.state.p_pos for l in world.landmarks] 60 | delta = np.mean(u, 0) - np.mean(v, 0) 61 | u = u - np.mean(u, 0) 62 | v = v - np.mean(v, 0) 63 | rew = -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 64 | # change landmark pos and color 65 | # for i in range(len(world.landmarks)): 66 | # world.landmarks[i].state.p_pos += delta 67 | # dist = min([np.linalg.norm(a.state.p_pos - world.landmarks[i].state.p_pos) for a in world.agents]) 68 | # if dist <= 0.2: world.landmarks[i].color = np.array([0, 0.6, 0]) 69 | # self.set_bound(world) 70 | if agent.collide: 71 | for a in world.agents: 72 | if agent!=a and self.is_collision(a, agent): 73 | rew -= 1 74 | return rew 75 | 76 | def reset_world(self, world): 77 | # agent 78 | for agent in world.agents: 79 | agent.color = np.array([0.35, 0.35, 0.85]) 80 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 81 | agent.state.p_vel = np.zeros(world.dim_p) 82 | agent.state.c = np.zeros(world.dim_c) 83 | # landmark 84 | for landmark in world.landmarks: 85 | landmark.color = np.array([0.25, 0.25, 0.25]) 86 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 87 | landmark.state.p_vel = np.zeros(world.dim_p) 88 | 89 | def benchmark_data(self, agent, world): 90 | # get data to debug 91 | rew = self.reward(agent, world) 92 | collisions = 0 93 | if agent.collide: 94 | for a in world.agents: 95 | if self.is_collision(a, agent): 96 | collisions += 1 97 | min_dists = 0 98 | occupied_landmarks = 0 99 | for l in world.landmarks: 100 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 101 | min_dists += min(dists) 102 | if min(dists) < 0.1: 103 | occupied_landmarks += 1 104 | return { 105 | 'reward': rew, 106 | 'collisions': collisions, 107 | 'min_dists': min_dists, 108 | 'occupied_landmarks': occupied_landmarks 109 | } 110 | 111 | def is_collision(self, agent1, agent2): 112 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 113 | return dist < (agent1.size + agent2.size) 114 | 115 | def set_bound(self, world): 116 | for agent in world.agents: 117 | agent.state.p_pos = np.clip(agent.state.p_pos, [-2, -2], [2, 2]) 118 | 119 | 
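# The scenarios above score a formation by the symmetric Hausdorff distance between the agents'
# centroid-normalized positions and the target shape. Below is a minimal, self-contained sketch of
# that reward term (assuming only NumPy/SciPy and hypothetical `agent_pos` / `target_shape` inputs,
# and leaving out the per-agent collision penalty applied afterwards):
import numpy as np
from scipy.spatial.distance import directed_hausdorff

def formation_reward(agent_pos, target_shape):
    # both inputs: (N, 2) arrays of 2-D positions; subtract centroids so the score is translation-invariant
    u = agent_pos - np.mean(agent_pos, axis=0)
    v = target_shape - np.mean(target_shape, axis=0)
    # symmetric Hausdorff distance = max of the two directed distances; negate so closer shapes score higher
    return -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0])

# example: a translated copy of the same triangle yields a reward of ~0.0
triangle = np.array([[0.0, 0.0], [1.0, 0.0], [0.5, 1.0]])
print(formation_reward(triangle + 0.3, triangle))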
-------------------------------------------------------------------------------- /train/maddpg-v2/utils/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import Tensor 3 | from torch.autograd import Variable 4 | 5 | class ReplayBuffer(object): 6 | """ 7 | Replay Buffer for multi-agent RL with parallel rollouts 8 | """ 9 | def __init__(self, max_steps, num_agents, obs_dims, ac_dims): 10 | """ 11 | Inputs: 12 | max_steps (int): Maximum number of timepoints to store in buffer 13 | num_agents (int): Number of agents in environment 14 | obs_dims (list of ints): number of obervation dimensions for each 15 | agent 16 | ac_dims (list of ints): number of action dimensions for each agent 17 | """ 18 | self.max_steps = max_steps 19 | self.num_agents = num_agents 20 | self.obs_buffs = [] 21 | self.ac_buffs = [] 22 | self.rew_buffs = [] 23 | self.next_obs_buffs = [] 24 | self.done_buffs = [] 25 | for odim, adim in zip(obs_dims, ac_dims): 26 | self.obs_buffs.append(np.zeros((max_steps, odim))) 27 | self.ac_buffs.append(np.zeros((max_steps, adim))) 28 | self.rew_buffs.append(np.zeros(max_steps)) 29 | self.next_obs_buffs.append(np.zeros((max_steps, odim))) 30 | self.done_buffs.append(np.zeros(max_steps)) 31 | 32 | 33 | self.filled_i = 0 # index of first empty location in buffer (last index when full) 34 | self.curr_i = 0 # current index to write to (ovewrite oldest data) 35 | 36 | def __len__(self): 37 | return self.filled_i 38 | 39 | def push(self, observations, actions, rewards, next_observations, dones): 40 | nentries = observations.shape[0] # handle multiple parallel environments 41 | if self.curr_i + nentries > self.max_steps: 42 | rollover = self.max_steps - self.curr_i # num of indices to roll over 43 | for agent_i in range(self.num_agents): 44 | self.obs_buffs[agent_i] = np.roll(self.obs_buffs[agent_i], 45 | rollover, axis=0) 46 | self.ac_buffs[agent_i] = np.roll(self.ac_buffs[agent_i], 47 | rollover, axis=0) 48 | self.rew_buffs[agent_i] = np.roll(self.rew_buffs[agent_i], 49 | rollover) 50 | self.next_obs_buffs[agent_i] = np.roll( 51 | self.next_obs_buffs[agent_i], rollover, axis=0) 52 | self.done_buffs[agent_i] = np.roll(self.done_buffs[agent_i], 53 | rollover) 54 | self.curr_i = 0 55 | self.filled_i = self.max_steps 56 | for agent_i in range(self.num_agents): 57 | self.obs_buffs[agent_i][self.curr_i:self.curr_i + nentries] = np.vstack( 58 | observations[:, agent_i]) 59 | # actions are already batched by agent, so they are indexed differently 60 | self.ac_buffs[agent_i][self.curr_i:self.curr_i + nentries] = actions[agent_i] 61 | self.rew_buffs[agent_i][self.curr_i:self.curr_i + nentries] = rewards[:, agent_i][0] 62 | self.next_obs_buffs[agent_i][self.curr_i:self.curr_i + nentries] = np.vstack( 63 | next_observations[:, agent_i]) 64 | self.done_buffs[agent_i][self.curr_i:self.curr_i + nentries] = dones[:, agent_i] 65 | self.curr_i += nentries 66 | if self.filled_i < self.max_steps: 67 | self.filled_i += nentries 68 | if self.curr_i == self.max_steps: 69 | self.curr_i = 0 70 | 71 | def sample(self, N, to_gpu=False, norm_rews=True): 72 | inds = np.random.choice(np.arange(self.filled_i), size=N, 73 | replace=False) 74 | if to_gpu: 75 | cast = lambda x: Variable(Tensor(x), requires_grad=False).cuda() 76 | else: 77 | cast = lambda x: Variable(Tensor(x), requires_grad=False) 78 | if norm_rews: 79 | ret_rews = [cast((self.rew_buffs[i][inds] - 80 | self.rew_buffs[i][:self.filled_i].mean()) / 81 | 
self.rew_buffs[i][:self.filled_i].std()) 82 | for i in range(self.num_agents)] 83 | else: 84 | ret_rews = [cast(self.rew_buffs[i][inds]) for i in range(self.num_agents)] 85 | return ([cast(self.obs_buffs[i][inds]) for i in range(self.num_agents)], 86 | [cast(self.ac_buffs[i][inds]) for i in range(self.num_agents)], 87 | ret_rews, 88 | [cast(self.next_obs_buffs[i][inds]) for i in range(self.num_agents)], 89 | [cast(self.done_buffs[i][inds]) for i in range(self.num_agents)]) 90 | 91 | def get_average_rewards(self, N): 92 | if self.filled_i == self.max_steps: 93 | inds = np.arange(self.curr_i - N, self.curr_i) # allow for negative indexing 94 | else: 95 | inds = np.arange(max(0, self.curr_i - N), self.curr_i) 96 | return [self.rew_buffs[i][inds].mean() for i in range(self.num_agents)] 97 | -------------------------------------------------------------------------------- /train/maddpg-v2/utils/env_wrappers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified from OpenAI Baselines code to work with multi-agent envs 3 | """ 4 | import numpy as np 5 | from multiprocessing import Process, Pipe 6 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 7 | 8 | 9 | def worker(remote, parent_remote, env_fn_wrapper): 10 | parent_remote.close() 11 | env = env_fn_wrapper.x() 12 | while True: 13 | cmd, data = remote.recv() 14 | if cmd == 'step': 15 | ob, reward, done, info = env.step(data) 16 | if all(done): 17 | ob = env.reset() 18 | remote.send((ob, reward, done, info)) 19 | elif cmd == 'reset': 20 | ob = env.reset() 21 | remote.send(ob) 22 | elif cmd == 'reset_task': 23 | ob = env.reset_task() 24 | remote.send(ob) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces': 29 | remote.send((env.observation_space, env.action_space)) 30 | elif cmd == 'get_agent_types': 31 | if all([hasattr(a, 'adversary') for a in env.agents]): 32 | remote.send(['adversary' if a.adversary else 'agent' for a in 33 | env.agents]) 34 | else: 35 | remote.send(['agent' for _ in env.agents]) 36 | else: 37 | raise NotImplementedError 38 | 39 | 40 | class SubprocVecEnv(VecEnv): 41 | def __init__(self, env_fns, spaces=None): 42 | """ 43 | envs: list of gym environments to run in subprocesses 44 | """ 45 | self.waiting = False 46 | self.closed = False 47 | nenvs = len(env_fns) 48 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 49 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 50 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 51 | for p in self.ps: 52 | p.daemon = True # if the main process crashes, we should not cause things to hang 53 | p.start() 54 | for remote in self.work_remotes: 55 | remote.close() 56 | 57 | self.remotes[0].send(('get_spaces', None)) 58 | observation_space, action_space = self.remotes[0].recv() 59 | self.remotes[0].send(('get_agent_types', None)) 60 | self.agent_types = self.remotes[0].recv() 61 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 62 | 63 | def step_async(self, actions): 64 | for remote, action in zip(self.remotes, actions): 65 | remote.send(('step', action)) 66 | self.waiting = True 67 | 68 | def step_wait(self): 69 | results = [remote.recv() for remote in self.remotes] 70 | self.waiting = False 71 | obs, rews, dones, infos = zip(*results) 72 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 73 | 74 | def reset(self): 75 | for remote in self.remotes: 76 | 
remote.send(('reset', None)) 77 | return np.stack([remote.recv() for remote in self.remotes]) 78 | 79 | def reset_task(self): 80 | for remote in self.remotes: 81 | remote.send(('reset_task', None)) 82 | return np.stack([remote.recv() for remote in self.remotes]) 83 | 84 | def close(self): 85 | if self.closed: 86 | return 87 | if self.waiting: 88 | for remote in self.remotes: 89 | remote.recv() 90 | for remote in self.remotes: 91 | remote.send(('close', None)) 92 | for p in self.ps: 93 | p.join() 94 | self.closed = True 95 | 96 | 97 | class DummyVecEnv(VecEnv): 98 | def __init__(self, env_fns): 99 | self.envs = [fn() for fn in env_fns] 100 | env = self.envs[0] 101 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 102 | if all([hasattr(a, 'adversary') for a in env.agents]): 103 | self.agent_types = ['adversary' if a.adversary else 'agent' for a in 104 | env.agents] 105 | else: 106 | self.agent_types = ['agent' for _ in env.agents] 107 | self.ts = np.zeros(len(self.envs), dtype='int') 108 | self.actions = None 109 | 110 | def step_async(self, actions): 111 | self.actions = actions 112 | 113 | def step_wait(self): 114 | results = [env.step(a) for (a,env) in zip(self.actions, self.envs)] 115 | obs, rews, dones, infos = map(np.array, zip(*results)) 116 | self.ts += 1 117 | for (i, done) in enumerate(dones): 118 | if all(done): 119 | obs[i] = self.envs[i].reset() 120 | self.ts[i] = 0 121 | self.actions = None 122 | return np.array(obs), np.array(rews), np.array(dones), infos 123 | 124 | def reset(self): 125 | results = [env.reset() for env in self.envs] 126 | return np.array(results) 127 | 128 | def close(self): 129 | return -------------------------------------------------------------------------------- /formation_gym/__init__.py: -------------------------------------------------------------------------------- 1 | import imp 2 | from .environment import MultiAgentEnv 3 | import os.path as osp 4 | import numpy as np 5 | 6 | def make_env(scenario_name='basic_formation_env', benchmark=False, num_agents = 3): 7 | # load scenario from script 8 | pathname = osp.join(osp.dirname(__file__), 'envs/'+scenario_name+'.py') 9 | scenario = imp.load_source('', pathname).Scenario() 10 | # create world 11 | world = scenario.make_world(num_agents) # use same number of agent and landmarks 12 | # create multiagent environment 13 | if benchmark: 14 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data, shared_viewer = True) 15 | else: 16 | env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, shared_viewer = True) 17 | return env 18 | 19 | def ezpolicy(obs): 20 | num_agents = len(obs)/6 21 | assert num_agents.is_integer(), num_agents 22 | num_agents = int(num_agents) 23 | # get info from observation 24 | p_vel = obs[:2] 25 | other_pos = obs[2:2*num_agents] 26 | ideal_shape = obs[4*num_agents-2:6*num_agents-2] 27 | ideal_shape = np.reshape(ideal_shape, (-1, 2)) 28 | ideal_shape = ideal_shape - np.mean(ideal_shape, axis = 0) 29 | ideal_vel = obs[-2:] 30 | # calculate relative formation 31 | current_shape = np.append(other_pos, [0,0]) 32 | current_shape = np.reshape(current_shape, (-1,2)) 33 | current_shape -= np.mean(current_shape, axis = 0) 34 | # get action 35 | sort_mark_idx = np.argsort([np.linalg.norm(current_shape[-1] - mark) for mark in ideal_shape]) # distance to different landmarks 36 | for idx in sort_mark_idx: 37 | closest_agent_idx = np.argmin([np.linalg.norm(agent - ideal_shape[idx]) 
for agent in current_shape]) 38 | if closest_agent_idx == (num_agents - 1) or idx == sort_mark_idx[-1]: # this agent is the closet agent 39 | act = np.clip(0.5*(ideal_shape[idx] - current_shape[-1]), -1, 1) 40 | break 41 | # add ideal velocity control to action 42 | done = np.linalg.norm(ideal_shape - current_shape) < 0.01 43 | if done: 44 | act += ideal_vel 45 | else: 46 | act += ideal_vel * 0.3 47 | return act 48 | 49 | def get_action_BFS(policy, obs, num_agents_per_layer): 50 | ''' 51 | :param policy: agent policy function 52 | :param obs: total observation 53 | :param num_agents_per_layer: number of agents per group 54 | ''' 55 | num_layer = np.log(len(obs))/ np.log(num_agents_per_layer) 56 | assert num_layer.is_integer(), 'Observation shape error!' 57 | queue = [obs] 58 | act = [] 59 | while queue: 60 | current_layer_obs = queue.pop(0) 61 | current_layer_num_agents = len(current_layer_obs) 62 | next_layer_num_agents = int(len(current_layer_obs)/num_agents_per_layer) 63 | for i in range(num_agents_per_layer): 64 | leader_obs = current_layer_obs[i*next_layer_num_agents] 65 | # get current layer leader observation 66 | p_vel = leader_obs[:2] 67 | # get observation of others by inference center 68 | current_shape = np.insert(leader_obs[2:2*current_layer_num_agents], 2*i*next_layer_num_agents, [0,0]).reshape((-1, 2)) 69 | layer_current_shape = np.array([np.mean(current_shape[next_layer_num_agents*k:next_layer_num_agents*(k+1)], axis = 0) for k in range(num_agents_per_layer)]) 70 | layer_current_shape -= layer_current_shape[i] 71 | layer_current_shape = np.delete(layer_current_shape, i, 0).flatten() 72 | # get ideal formation 73 | ideal_shape = np.reshape(leader_obs[4*current_layer_num_agents-2:6*current_layer_num_agents-2], (-1, 2)) 74 | layer_target_shape = np.array([np.mean(ideal_shape[next_layer_num_agents*(k):next_layer_num_agents*(k+1)], axis = 0) for k in range(num_agents_per_layer)]).flatten() 75 | # get ideal velocity 76 | layer_target_vel = leader_obs[-2:] 77 | obs_input = np.concatenate((p_vel, layer_current_shape, [0]*2*(num_agents_per_layer-1), layer_target_shape, layer_target_vel)) 78 | current_layer = np.log(current_layer_num_agents)/ np.log(num_agents_per_layer) 79 | next_layer_target_vel = policy(obs_input) * (current_layer) 80 | # next layer observation 81 | if next_layer_num_agents == 1: 82 | # END case: reach the last layer and append the action 83 | act.append(next_layer_target_vel) 84 | else: 85 | next_layer_obs = [] 86 | for j in range(i*next_layer_num_agents, (i+1)*next_layer_num_agents): 87 | # remove redundent observation 88 | obs_n = current_layer_obs[j] 89 | p_vel = obs_n[:2] 90 | others_pos = obs_n[2:2*current_layer_num_agents] 91 | others_pos = others_pos[2*i*next_layer_num_agents:2*(i+1)*next_layer_num_agents-2] 92 | comm = [0]*2*(next_layer_num_agents-1) 93 | shape = obs_n[4*current_layer_num_agents-2:6*current_layer_num_agents-2] 94 | shape = shape[2*i*next_layer_num_agents:2*(i+1)*next_layer_num_agents] 95 | tar_vel = next_layer_target_vel 96 | obs_n = np.concatenate((p_vel, others_pos, comm, shape, tar_vel)) 97 | next_layer_obs.append(obs_n) 98 | queue.append(next_layer_obs) 99 | return act -------------------------------------------------------------------------------- /formation_gym/envs/formation_hd_partial_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from 
formation_gym.core import World, Agent, Landmark 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | 11 | partial observation environment 12 | ''' 13 | 14 | class Scenario(BaseScenario): 15 | def make_world(self, num_agents = 5, num_landmarks = 5, num_obs = 3, world_length = 25): 16 | self.num_obs = num_obs 17 | self.num_agents = num_agents 18 | # world properties 19 | world = World() 20 | world.world_length = world_length 21 | world.dim_c = 2 # communication channel 22 | world.collaborative = True 23 | # agent properties 24 | world.agents = [Agent() for i in range(num_agents)] 25 | for i, agent in enumerate(world.agents): 26 | agent.name = 'agent %d' % i 27 | agent.collide = True 28 | agent.silent = True 29 | agent.size = 0.04 30 | # landmark properties 31 | world.landmarks = [Landmark() for i in range(num_landmarks)] 32 | for i, landmark in enumerate(world.landmarks): 33 | landmark.name = 'landmarks %d' % i 34 | landmark.collide = False 35 | landmark.movable = False 36 | landmark.size = 0.02 37 | # initial conditions 38 | self.reset_world(world) 39 | return world 40 | 41 | def observation(self, agent, world): 42 | # landmark pos 43 | entity_pos = [] 44 | for entity in world.landmarks: 45 | entity_pos.append(entity.state.p_pos) 46 | # agent pos & communication 47 | other_pos = [] 48 | comm = [] 49 | # way3: watch for 2 guys 50 | # get agent ID 51 | agent_id = int(agent.name.split()[-1]) 52 | idx = [i % self.num_agents for i in range(agent_id+1, agent_id+1 + self.num_obs)] 53 | for i in idx: 54 | other_pos.append(world.agents[i].state.p_pos - agent.state.p_pos) 55 | for other in world.agents: 56 | if other is agent: continue 57 | comm.append(other.state.c) 58 | # make the furthest point to zero 59 | # way1: make the far observation to zero 60 | # others_dist = np.linalg.norm(other_pos, axis = 1) 61 | # idx = np.argpartition(others_dist, self.num_obs) 62 | # for i in idx[self.num_obs:]: 63 | # other_pos[i] = np.zeros(world.dim_p) 64 | # way2: remove the far obs 65 | # other_pos = other_pos[idx[:self.num_obs]] 66 | return np.concatenate([agent.state.p_vel]+entity_pos + other_pos + comm) 67 | 68 | def reward(self, agent, world): 69 | rew = 0 70 | u = [a.state.p_pos for a in world.agents] 71 | v = [l.state.p_pos for l in world.landmarks] 72 | delta = np.mean(u, 0) - np.mean(v, 0) 73 | u = u - np.mean(u, 0) 74 | v = v - np.mean(v, 0) 75 | rew = -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 76 | # change landmark pos and color 77 | # for i in range(len(world.landmarks)): 78 | # world.landmarks[i].state.p_pos += delta 79 | # dist = min([np.linalg.norm(a.state.p_pos - world.landmarks[i].state.p_pos) for a in world.agents]) 80 | # if dist <= 0.2: world.landmarks[i].color = np.array([0, 0.6, 0]) 81 | # self.set_bound(world) 82 | if agent.collide: 83 | for a in world.agents: 84 | if agent!=a and self.is_collision(a, agent): 85 | rew -= 1 86 | return rew 87 | 88 | def reset_world(self, world): 89 | # agent 90 | for agent in world.agents: 91 | agent.color = np.array([0.35, 0.35, 0.85]) 92 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 93 | agent.state.p_vel = np.zeros(world.dim_p) 94 | agent.state.c = np.zeros(world.dim_c) 95 | # landmark 96 | for landmark in world.landmarks: 97 | landmark.color = np.array([0.25, 0.25, 0.25]) 98 | landmark.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 99 | landmark.state.p_vel = np.zeros(world.dim_p) 100 | 101 | def benchmark_data(self, agent, 
world): 102 | # get data to debug 103 | rew = self.reward(agent, world) 104 | collisions = 0 105 | if agent.collide: 106 | for a in world.agents: 107 | if self.is_collision(a, agent): 108 | collisions += 1 109 | min_dists = 0 110 | occupied_landmarks = 0 111 | for l in world.landmarks: 112 | dists = [np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 113 | min_dists += min(dists) 114 | if min(dists) < 0.1: 115 | occupied_landmarks += 1 116 | return { 117 | 'reward': rew, 118 | 'collisions': collisions, 119 | 'min_dists': min_dists, 120 | 'occupied_landmarks': occupied_landmarks 121 | } 122 | 123 | def is_collision(self, agent1, agent2): 124 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 125 | return dist < (agent1.size + agent2.size) 126 | 127 | def set_bound(self, world): 128 | for agent in world.agents: 129 | agent.state.p_pos = np.clip(agent.state.p_pos, [-2, -2], [2, 2]) 130 | 131 | -------------------------------------------------------------------------------- /train/mappo/inbox/train_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import numpy as np 5 | from pathlib import Path 6 | import torch 7 | from config import get_config 8 | from onpolicy.envs.env_wrappers import SubprocVecEnv, DummyVecEnv 9 | 10 | import formation_gym 11 | 12 | """Train script for MPEs.""" 13 | 14 | def make_train_env(all_args): 15 | def get_env_fn(rank): 16 | def init_env(): 17 | if all_args.env_name == "MPE": 18 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 19 | else: 20 | print("Can not support the " + 21 | all_args.env_name + "environment.") 22 | raise NotImplementedError 23 | env.seed(all_args.seed + rank * 1000) 24 | return env 25 | return init_env 26 | if all_args.n_rollout_threads == 1: 27 | return DummyVecEnv([get_env_fn(0)]) 28 | else: 29 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 30 | 31 | 32 | def make_eval_env(all_args): 33 | def get_env_fn(rank): 34 | def init_env(): 35 | if all_args.env_name == "MPE": 36 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 37 | else: 38 | print("Can not support the " + 39 | all_args.env_name + "environment.") 40 | raise NotImplementedError 41 | env.seed(all_args.seed * 50000 + rank * 10000) 42 | return env 43 | return init_env 44 | if all_args.n_eval_rollout_threads == 1: 45 | return DummyVecEnv([get_env_fn(0)]) 46 | else: 47 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) 48 | 49 | 50 | def parse_args(args, parser): 51 | parser.add_argument('--scenario_name', type=str, 52 | default='formation_hd_env', help="Which scenario to run on") 53 | parser.add_argument('--num_agents', type=int, 54 | default=3, help="number of players") 55 | 56 | all_args = parser.parse_known_args(args)[0] 57 | 58 | return all_args 59 | 60 | 61 | def main(args): 62 | parser = get_config() 63 | all_args = parse_args(args, parser) 64 | 65 | if all_args.algorithm_name == "rmappo": 66 | assert (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") 67 | elif all_args.algorithm_name == "mappo": 68 | assert (all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False), ("check recurrent policy!") 69 | else: 70 | raise NotImplementedError 71 | 72 | assert (all_args.share_policy == True and all_args.scenario_name == 
'simple_speaker_listener') == False, ( 73 | "The simple_speaker_listener scenario can not use shared policy. Please check the config.py.") 74 | 75 | # cuda 76 | if all_args.cuda and torch.cuda.is_available(): 77 | print("choose to use gpu...") 78 | device = torch.device("cuda:0") 79 | torch.set_num_threads(all_args.n_training_threads) 80 | if all_args.cuda_deterministic: 81 | torch.backends.cudnn.benchmark = False 82 | torch.backends.cudnn.deterministic = True 83 | else: 84 | print("choose to use cpu...") 85 | device = torch.device("cpu") 86 | torch.set_num_threads(all_args.n_training_threads) 87 | 88 | # run dir 89 | # run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 90 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 91 | if not run_dir.exists(): 92 | os.makedirs(str(run_dir)) 93 | 94 | if not run_dir.exists(): 95 | curr_run = 'run1' 96 | else: 97 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] 98 | if len(exst_run_nums) == 0: 99 | curr_run = 'run1' 100 | else: 101 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 102 | run_dir = run_dir / curr_run 103 | if not run_dir.exists(): 104 | os.makedirs(str(run_dir)) 105 | 106 | # seed 107 | torch.manual_seed(all_args.seed) 108 | torch.cuda.manual_seed_all(all_args.seed) 109 | np.random.seed(all_args.seed) 110 | 111 | # env init 112 | envs = make_train_env(all_args) 113 | eval_envs = make_eval_env(all_args) if all_args.use_eval else None 114 | num_agents = all_args.num_agents 115 | 116 | config = { 117 | "all_args": all_args, 118 | "envs": envs, 119 | "eval_envs": eval_envs, 120 | "num_agents": num_agents, 121 | "device": device, 122 | "run_dir": run_dir 123 | } 124 | 125 | # run experiments 126 | if all_args.share_policy: 127 | from onpolicy.runner.shared.mpe_runner import MPERunner as Runner 128 | else: 129 | from onpolicy.runner.separated.mpe_runner import MPERunner as Runner 130 | 131 | runner = Runner(config) 132 | runner.run() 133 | 134 | # post process 135 | envs.close() 136 | if all_args.use_eval and eval_envs is not envs: 137 | eval_envs.close() 138 | 139 | runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json')) 140 | runner.writter.close() 141 | 142 | 143 | if __name__ == "__main__": 144 | main(sys.argv[1:]) 145 | -------------------------------------------------------------------------------- /train/maddpg-v1/maddpg/maddpg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from maddpg.actor_critic import Actor, Critic 4 | 5 | 6 | class MADDPG: 7 | def __init__(self, args, agent_id): # different agents may have different obs/act dimensions, so each agent needs its own networks; agent_id tells them apart 8 | self.args = args 9 | self.agent_id = agent_id 10 | self.train_step = 0 11 | 12 | # create the network 13 | self.actor_network = Actor(args, agent_id) 14 | self.critic_network = Critic(args) 15 | 16 | # build up the target network 17 | self.actor_target_network = Actor(args, agent_id) 18 | self.critic_target_network = Critic(args) 19 | 20 | # load the weights into the target networks 21 | self.actor_target_network.load_state_dict(self.actor_network.state_dict()) 22 | self.critic_target_network.load_state_dict(self.critic_network.state_dict()) 23 | 24 | # create the optimizer 25 | 
self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) 26 | self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic) 27 | 28 | # create the directory for storing the model 29 | if not os.path.exists('results/' + self.args.save_dir): 30 | os.mkdir('results/' + self.args.save_dir) 31 | # path to save the model 32 | self.model_path = 'results/' + self.args.save_dir + '/' + self.args.scenario_name 33 | if not os.path.exists(self.model_path): 34 | os.mkdir(self.model_path) 35 | self.model_path = self.model_path + '/' + 'agent_%d' % agent_id 36 | if not os.path.exists(self.model_path): 37 | os.mkdir(self.model_path) 38 | 39 | # load model 40 | actor_fullpath = self.model_path + '/99_actor_params.pkl' 41 | critic_fullpath = self.model_path + '/99_critic_params.pkl' 42 | if os.path.exists(actor_fullpath): 43 | self.actor_network.load_state_dict(torch.load(actor_fullpath)) 44 | self.critic_network.load_state_dict(torch.load(critic_fullpath)) 45 | print('Agent {} successfully loaded actor_network: {}'.format(self.agent_id, actor_fullpath)) 46 | print('Agent {} successfully loaded critic_network: {}'.format(self.agent_id, critic_fullpath)) 47 | 48 | # soft update 49 | def _soft_update_target_network(self): 50 | for target_param, param in zip(self.actor_target_network.parameters(), self.actor_network.parameters()): 51 | target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data) 52 | 53 | for target_param, param in zip(self.critic_target_network.parameters(), self.critic_network.parameters()): 54 | target_param.data.copy_((1 - self.args.tau) * target_param.data + self.args.tau * param.data) 55 | 56 | # update the network 57 | def train(self, transitions, other_agents): 58 | for key in transitions.keys(): 59 | if torch.is_tensor(transitions[key]): continue 60 | transitions[key] = torch.tensor(transitions[key], dtype=torch.float32) 61 | r = transitions['r_%d' % self.agent_id] # only this agent's own reward is needed for training 62 | o, u, o_next = [], [], [] # containers for each agent's observations, actions and next observations 63 | for agent_id in range(self.args.n_agents): 64 | o.append(transitions['o_%d' % agent_id]) 65 | u.append(transitions['u_%d' % agent_id]) 66 | o_next.append(transitions['o_next_%d' % agent_id]) 67 | 68 | # calculate the target Q value function 69 | u_next = [] 70 | with torch.no_grad(): 71 | # get the actions corresponding to the next state 72 | index = 0 73 | for agent_id in range(self.args.n_agents): 74 | if agent_id == self.agent_id: 75 | u_next.append(self.actor_target_network(o_next[agent_id])) 76 | else: 77 | # other_agents has one fewer entry than the total number of agents (the current agent is excluded), so it cannot be indexed by agent_id directly 78 | u_next.append(other_agents[index].policy.actor_target_network(o_next[agent_id])) 79 | index += 1 80 | q_next = self.critic_target_network(o_next, u_next).detach() 81 | 82 | target_q = (r.unsqueeze(1) + self.args.gamma * q_next).detach() 83 | 84 | # the q loss 85 | q_value = self.critic_network(o, u) 86 | critic_loss = (target_q - q_value).pow(2).mean() 87 | 88 | # the actor loss 89 | # re-select the current agent's action in the joint action while keeping the other agents' actions fixed 90 | u[self.agent_id] = self.actor_network(o[self.agent_id]) 91 | actor_loss = - self.critic_network(o, u).mean() 92 | # if self.agent_id == 0: 93 | # print('critic_loss is {}, actor_loss is {}'.format(critic_loss, actor_loss)) 94 | # update the network 95 | self.actor_optim.zero_grad() 96 | actor_loss.backward() 97 | self.actor_optim.step() 98 | self.critic_optim.zero_grad() 99 | critic_loss.backward() 100 | self.critic_optim.step() 101 | 102 | 
self._soft_update_target_network() 103 | if self.train_step > 0 and self.train_step % self.args.save_rate == 0: 104 | self.save_model(self.train_step) 105 | self.train_step += 1 106 | 107 | def save_model(self, train_step): 108 | num = str(train_step // self.args.save_rate) 109 | model_path = os.path.join('results/'+self.args.save_dir, self.args.scenario_name) 110 | if not os.path.exists(model_path): 111 | os.makedirs(model_path) 112 | model_path = os.path.join(model_path, 'agent_%d' % self.agent_id) 113 | if not os.path.exists(model_path): 114 | os.makedirs(model_path) 115 | torch.save(self.actor_network.state_dict(), model_path + '/' + 'actor_params.pkl') 116 | torch.save(self.critic_network.state_dict(), model_path + '/' + 'critic_params.pkl') 117 | 118 | 119 | -------------------------------------------------------------------------------- /train/maddpg-v5/render.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | from pathlib import Path 5 | import socket 6 | import setproctitle 7 | import torch 8 | from config import get_config 9 | from offpolicy.utils.util import get_cent_act_dim, get_dim_from_space 10 | import formation_gym 11 | from offpolicy.envs.env_wrappers import DummyVecEnv, SubprocVecEnv 12 | 13 | 14 | def make_train_env(all_args): 15 | def get_env_fn(rank): 16 | def init_env(): 17 | print(all_args.env_name) 18 | if all_args.env_name == "formation": 19 | env = formation_gym.make_env(all_args.scenario_name, benchmark = False, num_agents = all_args.num_agents) 20 | else: 21 | print("Can not support the " + 22 | all_args.env_name + "environment.") 23 | raise NotImplementedError 24 | env.seed(all_args.seed + rank * 1000) 25 | return env 26 | return init_env 27 | if all_args.n_rollout_threads == 1: 28 | return DummyVecEnv([get_env_fn(0)]) 29 | else: 30 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 31 | 32 | 33 | def make_eval_env(all_args): 34 | def get_env_fn(rank): 35 | def init_env(): 36 | if all_args.env_name == "formation": 37 | env = formation_gym.make_env(all_args.scenario_name, benchmark = False, num_agents = all_args.num_agents) 38 | else: 39 | print("Can not support the " + 40 | all_args.env_name + "environment.") 41 | raise NotImplementedError 42 | env.seed(all_args.seed * 50000 + rank * 10000) 43 | return env 44 | return init_env 45 | if all_args.n_eval_rollout_threads == 1: 46 | return DummyVecEnv([get_env_fn(0)]) 47 | else: 48 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) 49 | 50 | 51 | def parse_args(args, parser): 52 | parser.add_argument('--scenario_name', type=str, 53 | default='formation_hd_env', help="Which scenario to run on") 54 | parser.add_argument("--num_landmarks", type=int, default=3) 55 | parser.add_argument('--num_agents', type=int, 56 | default=3, help="number of agents") 57 | parser.add_argument('--use_same_share_obs', action='store_false', 58 | default=True, help="Whether to use available actions") 59 | 60 | all_args = parser.parse_known_args(args)[0] 61 | 62 | return all_args 63 | 64 | 65 | def main(args): 66 | parser = get_config() 67 | all_args = parse_args(args, parser) 68 | 69 | # cuda and # threads 70 | if all_args.cuda and torch.cuda.is_available(): 71 | print("choose to use gpu...") 72 | device = torch.device("cuda:0") 73 | torch.set_num_threads(all_args.n_training_threads) 74 | if all_args.cuda_deterministic: 75 | torch.backends.cudnn.benchmark = False 76 | 
torch.backends.cudnn.deterministic = True 77 | else: 78 | print("choose to use cpu...") 79 | device = torch.device("cpu") 80 | torch.set_num_threads(all_args.n_training_threads) 81 | 82 | # setup file to output tensorboard, hyperparameters, and saved models 83 | run_dir = Path(os.path.dirname(os.path.abspath(__file__)) + "/results") / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 84 | 85 | # create env 86 | env = make_train_env(all_args) 87 | # env = formation_gym.make_env(all_args.scenario_name, False, all_args.num_agents) 88 | num_agents = all_args.num_agents 89 | 90 | # create policies and mapping fn 91 | if all_args.share_policy: 92 | policy_info = { 93 | 'policy_0': {"cent_obs_dim": get_dim_from_space(env.share_observation_space[0]), 94 | "cent_act_dim": get_cent_act_dim(env.action_space), 95 | "obs_space": env.observation_space[0], 96 | "share_obs_space": env.share_observation_space[0], 97 | "act_space": env.action_space[0]} 98 | } 99 | 100 | def policy_mapping_fn(id): return 'policy_0' 101 | else: 102 | policy_info = { 103 | 'policy_' + str(agent_id): {"cent_obs_dim": get_dim_from_space(env.share_observation_space[agent_id]), 104 | "cent_act_dim": get_cent_act_dim(env.action_space), 105 | "obs_space": env.observation_space[agent_id], 106 | "share_obs_space": env.share_observation_space[agent_id], 107 | "act_space": env.action_space[agent_id]} 108 | for agent_id in range(num_agents) 109 | } 110 | 111 | def policy_mapping_fn(agent_id): return 'policy_' + str(agent_id) 112 | 113 | # choose algo 114 | if all_args.algorithm_name in ["rmatd3", "rmaddpg", "rmasac", "qmix", "vdn"]: 115 | from offpolicy.runner.rnn.mpe_runner import MPERunner as Runner 116 | assert all_args.n_rollout_threads == 1, ( 117 | "only support 1 env in recurrent version.") 118 | eval_env = env 119 | elif all_args.algorithm_name in ["matd3", "maddpg", "masac", "mqmix", "mvdn"]: 120 | from offpolicy.runner.mlp.mpe_runner import MPERunner as Runner 121 | eval_env = make_eval_env(all_args) 122 | else: 123 | raise NotImplementedError 124 | 125 | config = {"args": all_args, 126 | "policy_info": policy_info, 127 | "policy_mapping_fn": policy_mapping_fn, 128 | "env": env, 129 | "eval_env": eval_env, 130 | "num_agents": num_agents, 131 | "device": device, 132 | "use_same_share_obs": all_args.use_same_share_obs, 133 | "run_dir": run_dir 134 | } 135 | 136 | runner = Runner(config=config) 137 | runner.eval(render = True) 138 | 139 | env.close() 140 | if all_args.use_eval and (eval_env is not env): 141 | eval_env.close() 142 | 143 | if __name__ == "__main__": 144 | main(sys.argv[1:]) 145 | -------------------------------------------------------------------------------- /formation_gym/envs/formation_hd_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from formation_gym.core import World, Agent, Landmark 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | ''' 11 | 12 | class Scenario(BaseScenario): 13 | def make_world(self, num_agents = 3, episode_length = 100): 14 | # world properties 15 | world = World() 16 | world.world_length = episode_length 17 | world.dim_c = 2 # communication channel 18 | world.collaborative = True 19 | self.num_agents = num_agents 20 | # agent properties 21 | world.agents = [Agent() for i in range(num_agents)] 22 | 
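Back in render.py above, the share_policy branch builds one policy_info entry per policy plus a mapping function from agent id to policy key. A stripped-down sketch of that pattern; the empty dicts stand in for the real observation/action space entries.

num_agents = 3
share_policy = False   # toggle to compare both behaviours

if share_policy:
    policy_info = {'policy_0': {}}                                  # one policy shared by all agents
    def policy_mapping_fn(agent_id): return 'policy_0'
else:
    policy_info = {'policy_%d' % i: {} for i in range(num_agents)}  # one policy per agent
    def policy_mapping_fn(agent_id): return 'policy_%d' % agent_id

print(sorted(policy_info), [policy_mapping_fn(i) for i in range(num_agents)])
# ['policy_0', 'policy_1', 'policy_2'] ['policy_0', 'policy_1', 'policy_2'] with share_policy = False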
for i, agent in enumerate(world.agents): 23 | agent.name = 'agent %d' % i 24 | agent.collide = True 25 | agent.silent = True 26 | agent.size = 0.03 27 | # landmark properties 28 | world.landmarks = [Landmark() for i in range(num_agents)] 29 | for i, landmark in enumerate(world.landmarks): 30 | landmark.name = 'landmarks %d' % i 31 | landmark.collide = False 32 | landmark.movable = False 33 | landmark.size = 0.01 34 | # initial conditions 35 | self.reset_world(world) 36 | return world 37 | 38 | def observation(self, agent, world): 39 | # change landmark pos for visualization (Note: not necessary for training) 40 | u = [a.state.p_pos for a in world.agents] 41 | v = [l.state.p_pos for l in world.landmarks] 42 | delta = np.mean(u,0) - np.mean(v,0) 43 | for l in world.landmarks: 44 | l.state.p_pos += delta 45 | # for i in range(3): 46 | # u = [world.agents[i].state.p_pos for i in range(i*3, (i+1)*3)] 47 | # v = [world.landmarks[i].state.p_pos for i in range(i*3, (i+1)*3)] 48 | # delta = np.mean(u, 0) - np.mean(v, 0) 49 | # for j in range(3): 50 | # world.landmarks[i*3+j].state.p_pos += delta # synchronize the center of landmarks and agents 51 | # agent pos & communication 52 | other_pos = np.array([]) 53 | comm = np.array([]) 54 | for other in world.agents: 55 | if other is agent: continue 56 | comm = np.append(comm, other.state.c) 57 | other_pos = np.append(other_pos, other.state.p_pos - agent.state.p_pos) 58 | foo = [world.agents[i].state.p_pos for i in range(0, 3)] 59 | return np.concatenate((agent.state.p_vel, other_pos, comm, self.ideal_shape.flatten(), self.ideal_vel)) 60 | 61 | def reward(self, agent, world): 62 | # part1: formation reward: define by hausdorff distance 63 | rew = 0 64 | agent_shape = [a.state.p_pos for a in world.agents] 65 | agent_shape = agent_shape - np.mean(agent_shape, 0) 66 | rew = -max(directed_hausdorff(agent_shape, self.ideal_shape)[0], directed_hausdorff(self.ideal_shape, agent_shape)[0]) 67 | # part2: velocity reward: define by overall velocity difference 68 | mean_vel = np.mean([a.state.p_vel for a in world.agents], axis = 0) 69 | rew -= np.linalg.norm(self.ideal_vel - mean_vel) 70 | # part3: collision 71 | if agent.collide: 72 | for a in world.agents: 73 | if agent!=a and self.is_collision(a, agent): 74 | rew -= 1 75 | return rew 76 | 77 | def reset_world(self, world): 78 | # agent 79 | for i, agent in enumerate(world.agents): 80 | agent.color = np.array([0.35, 0.35, 0.85]) 81 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 82 | agent.state.p_vel = np.zeros(world.dim_p) 83 | agent.state.c = np.zeros(world.dim_c) 84 | # landmark: use can use `generate_shape` to generate target shape 85 | # self.ideal_shape = self.generate_shape(3).reshape(-1,2) 86 | self.ideal_shape = [] 87 | for i, landmark in enumerate(world.landmarks): 88 | landmark.color = np.array([0.25, 0.25, 0.25]) 89 | pos = np.random.uniform(-1, +1, world.dim_p) 90 | self.ideal_shape.append(pos) 91 | landmark.state.p_pos = self.ideal_shape[i] 92 | landmark.state.p_vel = np.zeros(world.dim_p) 93 | self.ideal_shape = self.ideal_shape - np.mean(self.ideal_shape, 0) 94 | # ideal velocity 95 | self.ideal_vel = np.random.uniform(-1, +1, world.dim_p) 96 | 97 | def benchmark_data(self, agent, world): 98 | # get data to debug 99 | rew = self.reward(agent, world) 100 | collisions = 0 101 | if agent.collide: 102 | for a in world.agents: 103 | if self.is_collision(a, agent): 104 | collisions += 1 105 | min_dists = 0 106 | occupied_landmarks = 0 107 | for l in world.landmarks: 108 | dists = 
[np.linalg.norm(a.state.p_pos - l.state.p_pos) for a in world.agents] 109 | min_dists += min(dists) 110 | if min(dists) < 0.1: 111 | occupied_landmarks += 1 112 | return { 113 | 'reward': rew, 114 | 'collisions': collisions, 115 | 'min_dists': min_dists, 116 | 'occupied_landmarks': occupied_landmarks 117 | } 118 | 119 | def is_collision(self, agent1, agent2): 120 | dist = np.linalg.norm(agent1.state.p_pos - agent2.state.p_pos) 121 | return dist < (agent1.size + agent2.size)/2 122 | 123 | def generate_shape(self, layer, layer_shapes = None): 124 | # this is default shape 125 | layer_shapes = layer_shapes or np.array([ 126 | [[0, -1], [0.5, 0], [0, 1]], 127 | [[0, 1.6], [-1, 0], [1, 0]], 128 | [[1.5, 0], [0, 0], [-1.5, 0]], 129 | [[0, 0.6], [1, 0], [-1, 0]], 130 | ]) 131 | num_layers = layer_shapes.shape[0] 132 | assert layer < num_layers, 'Layer shape is not enough!' 133 | num_agents_per_layer = layer_shapes.shape[1] 134 | if layer == 0: 135 | return layer_shapes[0] 136 | else: 137 | old_shape = self.generate_shape(layer-1) 138 | shape = np.array([(layer_shapes[layer][i] + old_shape * 0.45) for i in range(num_agents_per_layer)]) 139 | return shape 140 | 141 | if __name__ == '__main__': 142 | s = Scenario() -------------------------------------------------------------------------------- /train/mappo/train_formation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import wandb 5 | import socket 6 | import setproctitle 7 | import numpy as np 8 | from pathlib import Path 9 | import torch 10 | from onpolicy.config import get_config 11 | import formation_gym 12 | from onpolicy.envs.env_wrappers import SubprocVecEnv, DummyVecEnv 13 | 14 | """Train script for formation control.""" 15 | 16 | def make_train_env(all_args): 17 | def get_env_fn(rank): 18 | def init_env(): 19 | if all_args.env_name == "MPE": 20 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 21 | else: 22 | print("Can not support the " + 23 | all_args.env_name + "environment.") 24 | raise NotImplementedError 25 | env.seed(all_args.seed + rank * 1000) 26 | return env 27 | return init_env 28 | if all_args.n_rollout_threads == 1: 29 | return DummyVecEnv([get_env_fn(0)]) 30 | else: 31 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)]) 32 | 33 | 34 | def make_eval_env(all_args): 35 | def get_env_fn(rank): 36 | def init_env(): 37 | if all_args.env_name == "MPE": 38 | env = formation_gym.make_env(all_args.scenario_name, False , all_args.num_agents) 39 | else: 40 | print("Can not support the " + 41 | all_args.env_name + "environment.") 42 | raise NotImplementedError 43 | env.seed(all_args.seed * 50000 + rank * 10000) 44 | return env 45 | return init_env 46 | if all_args.n_eval_rollout_threads == 1: 47 | return DummyVecEnv([get_env_fn(0)]) 48 | else: 49 | return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_eval_rollout_threads)]) 50 | 51 | 52 | def parse_args(args, parser): 53 | parser.add_argument('--scenario_name', type=str, 54 | default='simple_spread', help="Which scenario to run on") 55 | parser.add_argument("--num_landmarks", type=int, default=3) 56 | parser.add_argument('--num_agents', type=int, 57 | default=2, help="number of players") 58 | 59 | all_args = parser.parse_known_args(args)[0] 60 | 61 | return all_args 62 | 63 | 64 | def main(args): 65 | parser = get_config() 66 | all_args = parse_args(args, parser) 67 | 68 | if all_args.algorithm_name == "rmappo": 69 | 
assert (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!") 70 | elif all_args.algorithm_name == "mappo": 71 | assert (all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False), ("check recurrent policy!") 72 | else: 73 | raise NotImplementedError 74 | 75 | assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, ( 76 | "The simple_speaker_listener scenario can not use shared policy. Please check the config.py.") 77 | 78 | # cuda 79 | if all_args.cuda and torch.cuda.is_available(): 80 | print("choose to use gpu...") 81 | device = torch.device("cuda:0") 82 | torch.set_num_threads(all_args.n_training_threads) 83 | if all_args.cuda_deterministic: 84 | torch.backends.cudnn.benchmark = False 85 | torch.backends.cudnn.deterministic = True 86 | else: 87 | print("choose to use cpu...") 88 | device = torch.device("cpu") 89 | torch.set_num_threads(all_args.n_training_threads) 90 | 91 | # run dir 92 | run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[ 93 | 0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name 94 | if not run_dir.exists(): 95 | os.makedirs(str(run_dir)) 96 | 97 | # wandb 98 | if all_args.use_wandb: 99 | run = wandb.init(config=all_args, 100 | project=all_args.env_name, 101 | entity=all_args.user_name, 102 | notes=socket.gethostname(), 103 | name=str(all_args.algorithm_name) + "_" + 104 | str(all_args.experiment_name) + 105 | "_seed" + str(all_args.seed), 106 | group=all_args.scenario_name, 107 | dir=str(run_dir), 108 | job_type="training", 109 | reinit=True) 110 | else: 111 | if not run_dir.exists(): 112 | curr_run = 'run1' 113 | else: 114 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')] 115 | if len(exst_run_nums) == 0: 116 | curr_run = 'run1' 117 | else: 118 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 119 | run_dir = run_dir / curr_run 120 | if not run_dir.exists(): 121 | os.makedirs(str(run_dir)) 122 | 123 | setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \ 124 | str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name)) 125 | 126 | # seed 127 | torch.manual_seed(all_args.seed) 128 | torch.cuda.manual_seed_all(all_args.seed) 129 | np.random.seed(all_args.seed) 130 | 131 | # env init 132 | envs = make_train_env(all_args) 133 | eval_envs = make_eval_env(all_args) if all_args.use_eval else None 134 | num_agents = all_args.num_agents 135 | 136 | config = { 137 | "all_args": all_args, 138 | "envs": envs, 139 | "eval_envs": eval_envs, 140 | "num_agents": num_agents, 141 | "device": device, 142 | "run_dir": run_dir 143 | } 144 | 145 | # run experiments 146 | if all_args.share_policy: 147 | from onpolicy.runner.shared.mpe_runner import MPERunner as Runner 148 | else: 149 | from onpolicy.runner.separated.mpe_runner import MPERunner as Runner 150 | 151 | runner = Runner(config) 152 | runner.run() 153 | 154 | # post process 155 | envs.close() 156 | if all_args.use_eval and eval_envs is not envs: 157 | eval_envs.close() 158 | 159 | if all_args.use_wandb: 160 | run.finish() 161 | else: 162 | runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json')) 163 | runner.writter.close() 164 | 165 | 166 | if __name__ == "__main__": 167 | main(sys.argv[1:]) 168 | -------------------------------------------------------------------------------- 
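Both formation_hd_env.py above and the formation_hd_obs_env.py scenario that follows score a formation by the symmetric Hausdorff distance between the zero-centred agent positions and the target shape. A worked toy example with made-up coordinates:

import numpy as np
from scipy.spatial.distance import directed_hausdorff

agents = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
target = np.array([[0.1, 0.1], [1.0, 0.1], [0.0, 1.2]])

# centre both point sets, as reset_world()/reward() do in the scenarios
agents = agents - agents.mean(axis=0)
target = target - target.mean(axis=0)

# directed_hausdorff is asymmetric, so the reward takes the max of both directions
d = max(directed_hausdorff(agents, target)[0], directed_hausdorff(target, agents)[0])
reward = -d
print(round(d, 3))   # about 0.075 for these points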
/formation_gym/envs/formation_hd_obs_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial.distance import directed_hausdorff 3 | 4 | from formation_gym.scenario import BaseScenario 5 | from formation_gym.core import World, Agent, Landmark, Wall 6 | 7 | ''' 8 | use Hausdorff distance as reward function 9 | refer to https://www.wikiwand.com/en/Hausdorff_distance#/Applications 10 | add obstables into consideration 11 | ''' 12 | 13 | class Scenario(BaseScenario): 14 | def make_world(self, num_agents = 4, num_landmarks = 4, num_obstacles = 3, world_length = 50): 15 | self.num_agents = num_agents 16 | self.num_landmarks = num_landmarks 17 | self.num_obstacles = num_obstacles 18 | # world properties 19 | world = World() 20 | world.world_length = world_length 21 | world.dim_c = 2 # communication channel 22 | world.collaborative = True 23 | # agent properties 24 | world.agents = [Agent() for i in range(num_agents)] 25 | for i, agent in enumerate(world.agents): 26 | agent.name = 'agent %d' % i 27 | agent.collide = True 28 | agent.silent = True 29 | agent.size = 0.1 30 | # landmark and obstacles properties 31 | world.landmarks = [Landmark() for i in range(num_landmarks + num_obstacles)] 32 | for i, landmark in enumerate(world.landmarks): 33 | # setup landmarks 34 | if i < num_landmarks: 35 | landmark.name = 'landmarks %d' % i 36 | landmark.collide = False 37 | landmark.movable = False 38 | landmark.size = 0.02 39 | # setup obstacles 40 | else: 41 | landmark.name = 'obstacles %d' % (i - num_landmarks) 42 | landmark.collide = True 43 | landmark.movable = True 44 | landmark.size = 0.15 45 | # setup walls 46 | # world.walls = [] 47 | # world.walls.append(Wall(orient='H',axis_pos=2.6,endpoints=(-2.2, 2.2),width=0.2,hard=True)) 48 | # world.walls.append(Wall(orient='H',axis_pos=-2.6,endpoints=(-2.2, 2.2),width=0.2,hard=True)) 49 | # world.walls.append(Wall(orient='V',axis_pos=2.2,endpoints=(-10, 10),width=0.2,hard=True)) 50 | # world.walls.append(Wall(orient='V',axis_pos=-2.2,endpoints=(-10, 10),width=0.2,hard=True)) 51 | # initial conditions 52 | self.reset_world(world) 53 | return world 54 | 55 | def observation(self, agent, world): 56 | # landmark pos 57 | entity_pos = [] 58 | for entity in world.landmarks[:self.num_landmarks]: 59 | entity_pos.append(entity.state.p_pos) 60 | for entity in world.landmarks[self.num_landmarks:]: 61 | entity_pos.append(entity.state.p_pos - agent.state.p_pos) 62 | # agent pos & communication 63 | other_pos = [] 64 | comm = [] 65 | for other in world.agents: 66 | if other is agent: continue 67 | comm.append(other.state.c) 68 | other_pos.append(other.state.p_pos - agent.state.p_pos) 69 | return np.concatenate([agent.state.p_vel]+entity_pos + other_pos + comm) 70 | 71 | def reward(self, agent, world): 72 | rew = 0 73 | u = [a.state.p_pos for a in world.agents] 74 | v = [l.state.p_pos for l in world.landmarks[:self.num_landmarks]] 75 | delta = np.mean(u, 0) - np.mean(v, 0) 76 | u = u - np.mean(u, 0) 77 | v = v - np.mean(v, 0) 78 | rew = -max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 79 | # set boundary 80 | # self.set_bound(world) 81 | # change landmark pos and color 82 | for i, landmark in enumerate(world.landmarks): 83 | if i < self.num_landmarks: 84 | delta = [0, 0] 85 | landmark.state.p_pos += delta 86 | else: 87 | if landmark.state.p_pos[1] > -2.2: 88 | landmark.state.p_vel = np.array([0, -1]) 89 | else: landmark.state.p_vel = np.array([0, 0]) 90 | # dist = 
min([np.linalg.norm(a.state.p_pos - world.landmarks[i].state.p_pos) for a in world.agents]) 91 | # if dist <= 0.2: world.landmarks[i].color = np.array([0, 0.6, 0]) 92 | if agent.collide: 93 | for a in world.agents: 94 | if agent!=a and self.is_collision(a, agent): 95 | rew -= 2 96 | for l in world.landmarks[self.num_landmarks:]: 97 | if self.is_collision(l, agent): 98 | rew -= 2 99 | return rew 100 | 101 | def reset_world(self, world): 102 | # agent 103 | for agent in world.agents: 104 | agent.color = np.array([0.65, 0.65, 0.85]) 105 | agent.state.p_pos = np.random.uniform(-1, +1, world.dim_p) 106 | agent.state.p_vel = np.zeros(world.dim_p) 107 | agent.state.c = np.zeros(world.dim_c) 108 | # landmark 109 | for i, landmark in enumerate(world.landmarks): 110 | step = np.linspace(-1.8, 1.8, self.num_obstacles+1) 111 | # setup landmarks 112 | if i entity.max_speed: 167 | entity.state.p_vel = entity.state.p_vel / np.sqrt(np.square(entity.state.p_vel[0]) + 168 | np.square(entity.state.p_vel[1])) * entity.max_speed 169 | entity.state.p_pos += entity.state.p_vel * self.dt 170 | 171 | def update_agent_state(self, agent): 172 | # set communication state (directly for now) 173 | if agent.silent: 174 | agent.state.c = np.zeros(self.dim_c) 175 | else: 176 | noise = np.random.randn(*agent.action.c.shape) * agent.c_noise if agent.c_noise else 0.0 177 | agent.state.c = agent.action.c + noise 178 | 179 | # get collision forces for any contact between two entities 180 | def get_collision_force(self, entity_a, entity_b): 181 | if (not entity_a.collide) or (not entity_b.collide): 182 | return [None, None] # not a collider 183 | if (entity_a is entity_b): 184 | return [None, None] # don't collide against itself 185 | # compute actual distance between entities 186 | delta_pos = entity_a.state.p_pos - entity_b.state.p_pos 187 | dist = np.sqrt(np.sum(np.square(delta_pos))) 188 | # minimum allowable distance 189 | dist_min = entity_a.size + entity_b.size 190 | # softmax penetration 191 | k = self.contact_margin 192 | penetration = np.logaddexp(0, -(dist - dist_min)/k)*k 193 | force = self.contact_force * delta_pos / dist * penetration 194 | force_a = +force if entity_a.movable else None 195 | force_b = -force if entity_b.movable else None 196 | return [force_a, force_b] -------------------------------------------------------------------------------- /train/maddpg-v3/main.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from ray.tune import run_experiments 3 | from ray.tune.registry import register_trainable, register_env 4 | from env import MultiAgentParticleEnv, FormationEnv 5 | import ray.rllib.contrib.maddpg.maddpg as maddpg 6 | import argparse 7 | 8 | import os 9 | 10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 11 | 12 | 13 | class CustomStdOut(object): 14 | def _log_result(self, result): 15 | if result["training_iteration"] % 50 == 0: 16 | try: 17 | print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format( 18 | result["timesteps_total"], 19 | result["episodes_total"], 20 | result["episode_reward_mean"], 21 | result["policy_reward_mean"], 22 | round(result["time_total_s"] - self.cur_time, 3) 23 | )) 24 | except: 25 | pass 26 | 27 | self.cur_time = result["time_total_s"] 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser("MADDPG with OpenAI MPE") 32 | 33 | # Environment 34 | parser.add_argument("--scenario", type=str, default="formation_hd_env", 35 | help="name of the scenario script") 36 | 
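The physics fragment above computes collision forces with a softplus-shaped penetration term, penetration = logaddexp(0, -(dist - dist_min)/k) * k, so the repulsive force ramps up smoothly as two entities begin to overlap instead of switching on abruptly. A quick numeric check; the margin and radii below are illustrative values, not the environment defaults.

import numpy as np

k = 0.25        # contact margin (illustrative)
dist_min = 0.2  # sum of the two entity radii (illustrative)

for dist in (0.5, 0.25, 0.2, 0.1):
    penetration = np.logaddexp(0, -(dist - dist_min) / k) * k
    print(dist, round(penetration, 3))
# far apart -> nearly zero; just touching -> k*ln(2), about 0.173; overlapping -> grows roughly linearly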
parser.add_argument("--max-episode-len", type=int, default=25, 37 | help="maximum episode length") 38 | parser.add_argument("--num-episodes", type=int, default=60000, 39 | help="number of episodes") 40 | parser.add_argument("--num-adversaries", type=int, default=0,help="number of adversaries") 41 | parser.add_argument("--num-agents", type=int, default=3,help="number of agents") 42 | parser.add_argument("--good-policy", type=str, default="maddpg", 43 | help="policy for good agents") 44 | parser.add_argument("--adv-policy", type=str, default="maddpg", 45 | help="policy of adversaries") 46 | 47 | # Core training parameters 48 | parser.add_argument("--lr", type=float, default=1e-2, 49 | help="learning rate for Adam optimizer") 50 | parser.add_argument("--gamma", type=float, default=0.95, 51 | help="discount factor") 52 | # NOTE: 1 iteration = sample_batch_size * num_workers timesteps * num_envs_per_worker 53 | parser.add_argument("--sample-batch-size", type=int, default=25, 54 | help="number of data points sampled /update /worker") 55 | parser.add_argument("--train-batch-size", type=int, default=1024, 56 | help="number of data points /update") 57 | parser.add_argument("--n-step", type=int, default=1, 58 | help="length of multistep value backup") 59 | parser.add_argument("--num-units", type=int, default=64, 60 | help="number of units in the mlp") 61 | 62 | # Checkpoint 63 | parser.add_argument("--checkpoint-freq", type=int, default=7500, 64 | help="save model once every time this many iterations are completed") 65 | parser.add_argument("--local-dir", type=str, default="./ray_results", 66 | help="path to save checkpoints") 67 | parser.add_argument("--restore", type=str, default=None, 68 | help="directory in which training state and model are loaded") 69 | 70 | # Parallelism 71 | parser.add_argument("--num-workers", type=int, default=1) 72 | parser.add_argument("--num-envs-per-worker", type=int, default=4) 73 | parser.add_argument("--num-gpus", type=int, default=0) 74 | 75 | return parser.parse_args() 76 | 77 | 78 | def main(args): 79 | # ray.init(redis_max_memory=int(1e10), object_store_memory=int(3e9)) 80 | ray.init() 81 | MADDPGAgent = maddpg.MADDPGTrainer.with_updates( 82 | mixins=[CustomStdOut] 83 | ) 84 | register_trainable("MADDPG", MADDPGAgent) 85 | 86 | if 'formation' not in args.scenario: 87 | def env_creater(mpe_args): 88 | return MultiAgentParticleEnv(**mpe_args) 89 | 90 | register_env("mpe", env_creater) 91 | 92 | env = env_creater({ 93 | "scenario_name": args.scenario 94 | }) 95 | else: 96 | def env_creater(mpe_args): 97 | return FormationEnv(**mpe_args) 98 | 99 | register_env("mpe", env_creater) 100 | 101 | env = env_creater({ 102 | "scenario_name": args.scenario, 103 | 'benchmark': False, 104 | 'num_agents': args.num_agents 105 | }) 106 | 107 | def gen_policy(i): 108 | use_local_critic = [ 109 | args.adv_policy == "ddpg" if i < args.num_adversaries else 110 | args.good_policy == "ddpg" for i in range(env.num_agents) 111 | ] 112 | return ( 113 | None, 114 | env.observation_space_dict[i], 115 | env.action_space_dict[i], 116 | { 117 | "agent_id": i, 118 | "use_local_critic": use_local_critic[i], 119 | "obs_space_dict": env.observation_space_dict, 120 | "act_space_dict": env.action_space_dict, 121 | } 122 | ) 123 | 124 | policies = {"policy_%d" %i: gen_policy(i) for i in range(len(env.observation_space_dict))} 125 | policy_ids = list(policies.keys()) 126 | 127 | run_experiments({ 128 | "MADDPG_RLLib": { 129 | "run": "MADDPG", 130 | "env": "mpe", 131 | "stop": { 132 | "episodes_total": 
args.num_episodes, 133 | }, 134 | "checkpoint_freq": args.checkpoint_freq, 135 | "local_dir": args.local_dir, 136 | "restore": args.restore, 137 | "config": { 138 | # === Log === 139 | "log_level": "ERROR", 140 | 141 | # === Environment === 142 | "env_config": { 143 | "scenario_name": args.scenario, 144 | }, 145 | "num_envs_per_worker": args.num_envs_per_worker, 146 | "horizon": args.max_episode_len, 147 | 148 | # === Policy Config === 149 | # --- Model --- 150 | "good_policy": args.good_policy, 151 | "adv_policy": args.adv_policy, 152 | "actor_hiddens": [args.num_units] * 2, 153 | "actor_hidden_activation": "relu", 154 | "critic_hiddens": [args.num_units] * 2, 155 | "critic_hidden_activation": "relu", 156 | "n_step": args.n_step, 157 | "gamma": args.gamma, 158 | 159 | # --- Exploration --- 160 | "tau": 0.01, 161 | 162 | # --- Replay buffer --- 163 | "buffer_size": int(1e6), 164 | 165 | # --- Optimization --- 166 | "actor_lr": args.lr, 167 | "critic_lr": args.lr, 168 | "learning_starts": args.train_batch_size * args.max_episode_len, 169 | # "sample_batch_size": args.sample_batch_size, 170 | "train_batch_size": args.train_batch_size, 171 | "batch_mode": "truncate_episodes", 172 | 173 | # --- Parallelism --- 174 | "num_workers": args.num_workers, 175 | "num_gpus": args.num_gpus, 176 | "num_gpus_per_worker": 0, 177 | 178 | # === Multi-agent setting === 179 | "multiagent": { 180 | "policies": policies, 181 | "policy_mapping_fn": ray.tune.function( 182 | lambda i: policy_ids[i] 183 | ) 184 | }, 185 | }, 186 | }, 187 | }, verbose=0) 188 | 189 | 190 | if __name__ == '__main__': 191 | args = parse_args() 192 | main(args) 193 | -------------------------------------------------------------------------------- /train/maddpg-v2/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import time 4 | import os 5 | import numpy as np 6 | from gym.spaces import Box, Discrete 7 | from pathlib import Path 8 | from torch.autograd import Variable 9 | from tensorboardX import SummaryWriter 10 | from utils.make_env import make_env 11 | from utils.buffer import ReplayBuffer 12 | from utils.env_wrappers import SubprocVecEnv, DummyVecEnv 13 | from algorithms.maddpg import MADDPG 14 | 15 | import formation_gym 16 | 17 | USE_CUDA = False # torch.cuda.is_available() 18 | 19 | def make_parallel_env(env_id, n_rollout_threads, seed, agent_num): 20 | def get_env_fn(rank): 21 | def init_env(): 22 | env = formation_gym.make_env(env_id ,benchmark = False, num_agents = agent_num) 23 | env.seed(seed + rank * 1000) 24 | np.random.seed(seed + rank * 1000) 25 | return env 26 | return init_env 27 | if n_rollout_threads == 1: 28 | return DummyVecEnv([get_env_fn(0)]) 29 | else: 30 | return SubprocVecEnv([get_env_fn(i) for i in range(n_rollout_threads)]) 31 | 32 | def run(config): 33 | model_dir = Path('./models') / config.env_id / config.model_name 34 | if not model_dir.exists(): 35 | curr_run = 'run1' 36 | else: 37 | exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in 38 | model_dir.iterdir() if 39 | str(folder.name).startswith('run')] 40 | if len(exst_run_nums) == 0: 41 | curr_run = 'run1' 42 | else: 43 | curr_run = 'run%i' % (max(exst_run_nums) + 1) 44 | run_dir = model_dir / curr_run 45 | log_dir = run_dir / 'logs' 46 | os.makedirs(log_dir) 47 | logger = SummaryWriter(str(log_dir)) 48 | 49 | torch.manual_seed(config.seed) 50 | np.random.seed(config.seed) 51 | if not USE_CUDA: 52 | torch.set_num_threads(config.n_training_threads) 53 | env 
= make_parallel_env(config.env_id, config.n_rollout_threads, config.seed, 54 | config.agent_num) 55 | maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg, 56 | adversary_alg=config.adversary_alg, 57 | tau=config.tau, 58 | lr=config.lr, 59 | hidden_dim=config.hidden_dim) 60 | replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents, 61 | [obsp.shape[0] for obsp in env.observation_space], 62 | [acsp.shape[0] if isinstance(acsp, Box) else acsp.n 63 | for acsp in env.action_space]) 64 | t = 0 65 | for ep_i in range(0, config.n_episodes, config.n_rollout_threads): 66 | print("Episodes %i-%i of %i" % (ep_i + 1, 67 | ep_i + 1 + config.n_rollout_threads, 68 | config.n_episodes)) 69 | obs = env.reset() 70 | # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor 71 | maddpg.prep_rollouts(device='cpu') # gpu 72 | 73 | explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps 74 | maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining) 75 | maddpg.reset_noise() 76 | 77 | for et_i in range(config.episode_length): 78 | # rearrange observations to be per agent, and convert to torch Variable 79 | torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), 80 | requires_grad=False) 81 | for i in range(maddpg.nagents)] 82 | # get actions as torch Variables 83 | torch_agent_actions = maddpg.step(torch_obs, explore=True) 84 | # convert actions to numpy arrays 85 | agent_actions = [ac.data.numpy() for ac in torch_agent_actions] 86 | # rearrange actions to be per environment 87 | actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] 88 | next_obs, rewards, dones, infos = env.step(actions) 89 | replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) 90 | obs = next_obs 91 | t += config.n_rollout_threads 92 | if (len(replay_buffer) >= config.batch_size and 93 | (t % config.steps_per_update) < config.n_rollout_threads): 94 | if USE_CUDA: 95 | maddpg.prep_training(device='gpu') 96 | else: 97 | maddpg.prep_training(device='cpu') 98 | for u_i in range(config.n_rollout_threads): 99 | for a_i in range(maddpg.nagents): 100 | sample = replay_buffer.sample(config.batch_size, 101 | to_gpu=USE_CUDA) 102 | maddpg.update(sample, a_i, logger=logger) 103 | maddpg.update_all_targets() 104 | maddpg.prep_rollouts(device='cpu') # cpu 105 | ep_rews = replay_buffer.get_average_rewards( 106 | config.episode_length * config.n_rollout_threads) 107 | for a_i, a_ep_rew in enumerate(ep_rews): 108 | logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i) 109 | 110 | if ep_i % config.save_interval < config.n_rollout_threads: 111 | os.makedirs(run_dir / 'incremental', exist_ok=True) 112 | maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) 113 | maddpg.save(run_dir / 'model.pt') 114 | 115 | maddpg.save(run_dir / 'model.pt') 116 | env.close() 117 | logger.export_scalars_to_json(str(log_dir / 'summary.json')) 118 | logger.close() 119 | 120 | 121 | if __name__ == '__main__': 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument("--env_id", default='simple_spread', type = str, help="Name of environment", ) 124 | parser.add_argument("--model_name", default='model', type = str, help="Name of directory to store " + 125 | "model/training contents") 126 | parser.add_argument("--seed", default=1, type=int, help="Random seed") 127 | parser.add_argument("--n_rollout_threads", default=1, type=int) # 1 128 | 
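The training loop above anneals the exploration noise linearly from init_noise_scale down to final_noise_scale over n_exploration_eps episodes via explr_pct_remaining. A worked check of that schedule using the defaults from this script's argument list (0.3 to 0.0 over 25000 episodes):

def noise_scale(ep_i, n_exploration_eps=25000, init=0.3, final=0.0):
    # linear decay, clamped at the final value once the exploration episodes are used up
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final + (init - final) * pct_remaining

print(noise_scale(0), noise_scale(12500), noise_scale(25000), noise_scale(40000))
# 0.3, 0.15, 0.0, 0.0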
parser.add_argument("--n_training_threads", default=6, type=int) # 6 129 | parser.add_argument("--buffer_length", default=int(5e5), type=int) 130 | parser.add_argument("--n_episodes", default=25000, type=int) 131 | parser.add_argument("--episode_length", default=30, type=int) 132 | parser.add_argument("--steps_per_update", default=120, type=int) 133 | parser.add_argument("--batch_size", 134 | default=256, type=int, 135 | help="Batch size for model training") 136 | parser.add_argument("--n_exploration_eps", default=25000, type=int) 137 | parser.add_argument("--init_noise_scale", default=0.3, type=float) 138 | parser.add_argument("--final_noise_scale", default=0.0, type=float) 139 | parser.add_argument("--save_interval", default=10000, type=int) 140 | parser.add_argument("--hidden_dim", default=64, type=int) 141 | parser.add_argument("--lr", default=0.01, type=float) 142 | parser.add_argument("--tau", default=0.01, type=float) 143 | parser.add_argument("--agent_alg", 144 | default="MADDPG", type=str, 145 | choices=['MADDPG', 'DDPG']) 146 | parser.add_argument("--adversary_alg", 147 | default="MADDPG", type=str, 148 | choices=['MADDPG', 'DDPG']) 149 | parser.add_argument("--discrete_action", action='store_true') 150 | parser.add_argument("--agent-num", type=int, default = 9) 151 | 152 | config = parser.parse_args() 153 | 154 | run(config) 155 | -------------------------------------------------------------------------------- /train/maddpg-v5/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_config(): 5 | parser = argparse.ArgumentParser( 6 | description="OFF-POLICY", formatter_class=argparse.RawDescriptionHelpFormatter) 7 | 8 | # prepare parameters 9 | parser.add_argument("--algorithm_name", type=str, default="rmaddpg", choices=[ 10 | "rmatd3", "rmaddpg", "rmasac", "qmix", "vdn", "matd3", "maddpg", "masac", "mqmix", "mvdn"]) 11 | parser.add_argument("--experiment_name", type=str, default="debug") 12 | parser.add_argument("--seed", type=int, default=1, 13 | help="Random seed for numpy/torch") 14 | parser.add_argument("--cuda", action='store_false', default=True) 15 | parser.add_argument("--cuda_deterministic", 16 | action='store_false', default=True) 17 | parser.add_argument('--n_training_threads', type=int, 18 | default=1, help="Number of torch threads for training") 19 | parser.add_argument('--n_rollout_threads', type=int, default=1, 20 | help="Number of parallel envs for training rollout") 21 | parser.add_argument('--n_eval_rollout_threads', type=int, default=1, 22 | help="Number of parallel envs for evaluating rollout") 23 | parser.add_argument('--num_env_steps', type=int, 24 | default=2000000, help="Number of env steps to train for") 25 | parser.add_argument('--use_wandb', action='store_true', default=False, 26 | help="Whether to use weights&biases, if not, use tensorboardX instead") 27 | parser.add_argument('--user_name', type=str, default="zoeyuchao") 28 | 29 | # env parameters 30 | parser.add_argument('--env_name', type=str, default="formation") 31 | parser.add_argument("--use_obs_instead_of_state", action='store_true', 32 | default=False, help="Whether to use global state or concatenated obs") 33 | 34 | # replay buffer parameters 35 | parser.add_argument('--episode_length', type=int, 36 | default=25, help="Max length for any episode") 37 | parser.add_argument('--buffer_size', type=int, default=5000, 38 | help="Max # of transitions that replay buffer can contain") 39 | 
parser.add_argument('--use_reward_normalization', 40 | default=True, help="Whether to normalize rewards in replay buffer") 41 | parser.add_argument('--use_popart', default=False, 42 | help="Whether to use popart to normalize the target loss") 43 | parser.add_argument('--popart_update_interval_step', type=int, default=2, 44 | help="After how many train steps popart should be updated") 45 | 46 | # prioritized experience replay 47 | parser.add_argument('--use_per', action='store_true', default=False, 48 | help="Whether to use prioritized experience replay") 49 | parser.add_argument('--per_nu', type=float, default=0.9, 50 | help="Weight of max TD error in formation of PER weights") 51 | parser.add_argument('--per_alpha', type=float, default=0.6, 52 | help="Alpha term for prioritized experience replay") 53 | parser.add_argument('--per_eps', type=float, default=1e-6, 54 | help="Eps term for prioritized experience replay") 55 | parser.add_argument('--per_beta_start', type=float, default=0.4, 56 | help="Starting beta term for prioritized experience replay") 57 | 58 | # network parameters 59 | parser.add_argument("--use_centralized_Q", action='store_false', 60 | default=True, help="Whether to use centralized Q function") 61 | parser.add_argument('--share_policy', action='store_false', 62 | default=True, help="Whether agents share the same policy") 63 | parser.add_argument('--hidden_size', type=int, default=64, 64 | help="Dimension of hidden layers for actor/critic networks") 65 | parser.add_argument('--layer_N', type=int, default=1, 66 | help="Number of layers for actor/critic networks") 67 | parser.add_argument('--use_ReLU', action='store_false', 68 | default=True, help="Whether to use ReLU") 69 | parser.add_argument('--use_feature_normalization', action='store_false', 70 | default=True, help="Whether to apply layernorm to the inputs") 71 | parser.add_argument('--use_orthogonal', action='store_false', default=True, 72 | help="Whether to use Orthogonal initialization for weights and 0 initialization for biases") 73 | parser.add_argument("--gain", type=float, default=0.01, 74 | help="The gain # of last action layer") 75 | parser.add_argument("--use_conv1d", action='store_true', 76 | default=False, help="Whether to use conv1d") 77 | parser.add_argument("--stacked_frames", type=int, default=1, 78 | help="Dimension of hidden layers for actor/critic networks") 79 | 80 | # recurrent parameters 81 | parser.add_argument('--prev_act_inp', action='store_true', default=False, 82 | help="Whether the actor input takes in previous actions as part of its input") 83 | parser.add_argument("--use_rnn_layer", action='store_false', 84 | default=True, help='Whether to use a recurrent policy') 85 | parser.add_argument("--use_naive_recurrent_policy", action='store_false', 86 | default=True, help='Whether to use a naive recurrent policy') 87 | # TODO now only 1 is support 88 | parser.add_argument("--recurrent_N", type=int, default=1) 89 | parser.add_argument('--data_chunk_length', type=int, default=80, 90 | help="Time length of chunks used to train via BPTT") 91 | parser.add_argument('--burn_in_time', type=int, default=0, 92 | help="Length of burn in time for RNN training, see R2D2 paper") 93 | 94 | # attn parameters 95 | parser.add_argument("--attn", action='store_true', default=False) 96 | parser.add_argument("--attn_N", type=int, default=1) 97 | parser.add_argument("--attn_size", type=int, default=64) 98 | parser.add_argument("--attn_heads", type=int, default=4) 99 | parser.add_argument("--dropout", type=float, 
default=0.0) 100 | parser.add_argument("--use_average_pool", 101 | action='store_false', default=True) 102 | parser.add_argument("--use_cat_self", action='store_false', default=True) 103 | 104 | # optimizer parameters 105 | parser.add_argument('--lr', type=float, default=7e-4, 106 | help="Learning rate for Adam") 107 | parser.add_argument("--opti_eps", type=float, default=1e-5, 108 | help='RMSprop optimizer epsilon (default: 1e-5)') 109 | parser.add_argument("--weight_decay", type=float, default=0) 110 | 111 | # algo common parameters 112 | parser.add_argument('--batch_size', type=int, default=32, 113 | help="Number of buffer transitions to train on at once") 114 | parser.add_argument('--gamma', type=float, default=0.99, 115 | help="Discount factor for env") 116 | parser.add_argument("--use_max_grad_norm", 117 | action='store_false', default=True) 118 | parser.add_argument("--max_grad_norm", type=float, default=10.0, 119 | help='max norm of gradients (default: 0.5)') 120 | parser.add_argument('--use_huber_loss', action='store_true', 121 | default=False, help="Whether to use Huber loss for critic update") 122 | parser.add_argument("--huber_delta", type=float, default=10.0) 123 | 124 | # soft update parameters 125 | parser.add_argument('--use_soft_update', action='store_false', 126 | default=True, help="Whether to use soft update") 127 | parser.add_argument('--tau', type=float, default=0.005, 128 | help="Polyak update rate") 129 | # hard update parameters 130 | parser.add_argument('--hard_update_interval_episode', type=int, default=200, 131 | help="After how many episodes the lagging target should be updated") 132 | parser.add_argument('--hard_update_interval', type=int, default=200, 133 | help="After how many timesteps the lagging target should be updated") 134 | # rmatd3 parameters 135 | parser.add_argument("--target_action_noise_std", default=0.2, help="Target action smoothing noise for matd3") 136 | # rmasac parameters 137 | parser.add_argument('--alpha', type=float, default=1.0, 138 | help="Initial temperature") 139 | parser.add_argument('--target_entropy_coef', type=float, 140 | default=0.5, help="Initial temperature") 141 | parser.add_argument('--automatic_entropy_tune', action='store_false', 142 | default=True, help="Whether use a centralized critic") 143 | # qmix parameters 144 | parser.add_argument('--use_double_q', action='store_false', 145 | default=True, help="Whether to use double q learning") 146 | parser.add_argument('--hypernet_layers', type=int, default=2, 147 | help="Number of layers for hypernetworks. 
Must be either 1 or 2") 148 | parser.add_argument('--mixer_hidden_dim', type=int, default=32, 149 | help="Dimension of hidden layer of mixing network") 150 | parser.add_argument('--hypernet_hidden_dim', type=int, default=64, 151 | help="Dimension of hidden layer of hypernetwork (only applicable if hypernet_layers == 2") 152 | 153 | # exploration parameters 154 | parser.add_argument('--num_random_episodes', type=int, default=5, 155 | help="Number of episodes to add to buffer with purely random actions") 156 | parser.add_argument('--epsilon_start', type=float, default=1.0, 157 | help="Starting value for epsilon, for eps-greedy exploration") 158 | parser.add_argument('--epsilon_finish', type=float, default=0.05, 159 | help="Ending value for epsilon, for eps-greedy exploration") 160 | parser.add_argument('--epsilon_anneal_time', type=int, default=50000, 161 | help="Number of episodes until epsilon reaches epsilon_finish") 162 | parser.add_argument('--act_noise_std', type=float, 163 | default=0.1, help="Action noise") 164 | 165 | # train parameters 166 | parser.add_argument('--actor_train_interval_step', type=int, default=1, 167 | help="After how many critic updates actor should be updated") 168 | parser.add_argument('--train_interval_episode', type=int, default=1, 169 | help="Number of env steps between updates to actor/critic") 170 | parser.add_argument('--train_interval', type=int, default=100, 171 | help="Number of episodes between updates to actor/critic") 172 | parser.add_argument("--use_value_active_masks", 173 | action='store_true', default=False) 174 | 175 | # eval parameters 176 | parser.add_argument('--use_eval', action='store_false', 177 | default=True, help="Whether to conduct the evaluation") 178 | parser.add_argument('--eval_interval', type=int, default=10000, 179 | help="After how many episodes the policy should be evaled") 180 | parser.add_argument('--num_eval_episodes', type=int, default=32, 181 | help="How many episodes to collect for each eval") 182 | 183 | # save parameters 184 | parser.add_argument('--save_interval', type=int, default=100000, 185 | help="After how many episodes of training the policy model should be saved") 186 | 187 | # log parameters 188 | parser.add_argument('--log_interval', type=int, default=1000, 189 | help="After how many episodes of training the policy model should be saved") 190 | 191 | # pretained parameters 192 | parser.add_argument("--model_dir", type=str, default=None) 193 | 194 | return parser 195 | -------------------------------------------------------------------------------- /formation_gym/inbox/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym import error 15 | 16 | try: 17 | import pyglet 18 | except ImportError as e: 19 | raise ImportError("HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 20 | 21 | try: 22 | from pyglet.gl import * 23 | except ImportError as e: 24 | raise ImportError("Error occured while running `from pyglet.gl import * HINT: make sure you have OpenGL install. 
On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 25 | 26 | import math 27 | import numpy as np 28 | 29 | RAD2DEG = 57.29577951308232 30 | 31 | def get_display(spec): 32 | """Convert a display specification (such as :0) into an actual Display 33 | object. 34 | 35 | Pyglet only supports multiple Displays on Linux. 36 | """ 37 | if spec is None: 38 | return None 39 | elif isinstance(spec, six.string_types): 40 | return pyglet.canvas.Display(spec) 41 | else: 42 | raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec)) 43 | 44 | class Viewer(object): 45 | def __init__(self, width, height, display=None): 46 | display = get_display(display) 47 | 48 | self.width = width 49 | self.height = height 50 | 51 | self.window = pyglet.window.Window(width=width, height=height, display=display) 52 | self.window.on_close = self.window_closed_by_user 53 | self.geoms = [] 54 | self.onetime_geoms = [] 55 | self.transform = Transform() 56 | 57 | glEnable(GL_BLEND) 58 | # glEnable(GL_MULTISAMPLE) 59 | glEnable(GL_LINE_SMOOTH) 60 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 61 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 62 | glLineWidth(2.0) 63 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 64 | 65 | def close(self): 66 | self.window.close() 67 | 68 | def window_closed_by_user(self): 69 | self.close() 70 | 71 | def set_bounds(self, left, right, bottom, top): 72 | assert right > left and top > bottom 73 | scalex = self.width/(right-left) 74 | scaley = self.height/(top-bottom) 75 | self.transform = Transform( 76 | translation=(-left*scalex, -bottom*scaley), 77 | scale=(scalex, scaley)) 78 | 79 | def add_geom(self, geom): 80 | self.geoms.append(geom) 81 | 82 | def add_onetime(self, geom): 83 | self.onetime_geoms.append(geom) 84 | 85 | def render(self, return_rgb_array=False): 86 | glClearColor(1,1,1,1) 87 | self.window.clear() 88 | self.window.switch_to() 89 | self.window.dispatch_events() 90 | self.transform.enable() 91 | for geom in self.geoms: 92 | geom.render() 93 | for geom in self.onetime_geoms: 94 | geom.render() 95 | self.transform.disable() 96 | arr = None 97 | if return_rgb_array: 98 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 99 | image_data = buffer.get_image_data() 100 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 101 | # In https://github.com/openai/gym-http-api/issues/2, we 102 | # discovered that someone using Xmonad on Arch was having 103 | # a window of size 598 x 398, though a 600 x 400 window 104 | # was requested. (Guess Xmonad was preserving a pixel for 105 | # the boundary.) So we use the buffer height/width rather 106 | # than the requested one. 
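The next two lines turn the raw RGBA bytes read from the colour buffer into a conventional top-down RGB image: reshape to rows x cols x 4, flip vertically (OpenGL's origin is bottom-left), and drop the alpha channel. A numpy-only illustration with a tiny fake 2x2 buffer:

import numpy as np

h, w = 2, 2
flat = np.arange(h * w * 4, dtype=np.uint8)   # stand-in for the buffer bytes (RGBA)
img = flat.reshape(h, w, 4)                   # rows x cols x RGBA
img = img[::-1, :, 0:3]                       # flip vertically, keep only RGB
print(img.shape)                              # (2, 2, 3)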
107 | arr = arr.reshape(buffer.height, buffer.width, 4) 108 | arr = arr[::-1,:,0:3] 109 | self.window.flip() 110 | self.onetime_geoms = [] 111 | return arr 112 | 113 | # Convenience 114 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 115 | geom = make_circle(radius=radius, res=res, filled=filled) 116 | _add_attrs(geom, attrs) 117 | self.add_onetime(geom) 118 | return geom 119 | 120 | def draw_polygon(self, v, filled=True, **attrs): 121 | geom = make_polygon(v=v, filled=filled) 122 | _add_attrs(geom, attrs) 123 | self.add_onetime(geom) 124 | return geom 125 | 126 | def draw_polyline(self, v, **attrs): 127 | geom = make_polyline(v=v) 128 | _add_attrs(geom, attrs) 129 | self.add_onetime(geom) 130 | return geom 131 | 132 | def draw_line(self, start, end, **attrs): 133 | geom = Line(start, end) 134 | _add_attrs(geom, attrs) 135 | self.add_onetime(geom) 136 | return geom 137 | 138 | def get_array(self): 139 | self.window.flip() 140 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 141 | self.window.flip() 142 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 143 | arr = arr.reshape(self.height, self.width, 4) 144 | return arr[::-1,:,0:3] 145 | 146 | def _add_attrs(geom, attrs): 147 | if "color" in attrs: 148 | geom.set_color(*attrs["color"]) 149 | if "linewidth" in attrs: 150 | geom.set_linewidth(attrs["linewidth"]) 151 | 152 | class Geom(object): 153 | def __init__(self): 154 | self._color=Color((0, 0, 0, 1.0)) 155 | self.attrs = [self._color] 156 | def render(self): 157 | for attr in reversed(self.attrs): 158 | attr.enable() 159 | self.render1() 160 | for attr in self.attrs: 161 | attr.disable() 162 | def render1(self): 163 | raise NotImplementedError 164 | def add_attr(self, attr): 165 | self.attrs.append(attr) 166 | def set_color(self, r, g, b, alpha=1): 167 | self._color.vec4 = (r, g, b, alpha) 168 | 169 | class Attr(object): 170 | def enable(self): 171 | raise NotImplementedError 172 | def disable(self): 173 | pass 174 | 175 | class Transform(Attr): 176 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 177 | self.set_translation(*translation) 178 | self.set_rotation(rotation) 179 | self.set_scale(*scale) 180 | def enable(self): 181 | glPushMatrix() 182 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 183 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 184 | glScalef(self.scale[0], self.scale[1], 1) 185 | def disable(self): 186 | glPopMatrix() 187 | def set_translation(self, newx, newy): 188 | self.translation = (float(newx), float(newy)) 189 | def set_rotation(self, new): 190 | self.rotation = float(new) 191 | def set_scale(self, newx, newy): 192 | self.scale = (float(newx), float(newy)) 193 | 194 | class Color(Attr): 195 | def __init__(self, vec4): 196 | self.vec4 = vec4 197 | def enable(self): 198 | glColor4f(*self.vec4) 199 | 200 | class LineStyle(Attr): 201 | def __init__(self, style): 202 | self.style = style 203 | def enable(self): 204 | glEnable(GL_LINE_STIPPLE) 205 | glLineStipple(1, self.style) 206 | def disable(self): 207 | glDisable(GL_LINE_STIPPLE) 208 | 209 | class LineWidth(Attr): 210 | def __init__(self, stroke): 211 | self.stroke = stroke 212 | def enable(self): 213 | glLineWidth(self.stroke) 214 | 215 | class Point(Geom): 216 | def __init__(self): 217 | Geom.__init__(self) 218 | def render1(self): 219 | glBegin(GL_POINTS) # draw point 220 | glVertex3f(0.0, 0.0, 0.0) 221 | glEnd() 222 | 223 | class FilledPolygon(Geom): 224 | def __init__(self, v): 
225 | Geom.__init__(self) 226 | self.v = v 227 | def render1(self): 228 | if len(self.v) == 4 : glBegin(GL_QUADS) 229 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 230 | else: glBegin(GL_TRIANGLES) 231 | for p in self.v: 232 | glVertex3f(p[0], p[1],0) # draw each vertex 233 | glEnd() 234 | 235 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 236 | glColor4f(*color) 237 | glBegin(GL_LINE_LOOP) 238 | for p in self.v: 239 | glVertex3f(p[0], p[1],0) # draw each vertex 240 | glEnd() 241 | 242 | def make_circle(radius=10, res=30, filled=True): 243 | points = [] 244 | for i in range(res): 245 | ang = 2*math.pi*i / res 246 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 247 | if filled: 248 | return FilledPolygon(points) 249 | else: 250 | return PolyLine(points, True) 251 | 252 | def make_polygon(v, filled=True): 253 | if filled: return FilledPolygon(v) 254 | else: return PolyLine(v, True) 255 | 256 | def make_polyline(v): 257 | return PolyLine(v, False) 258 | 259 | def make_capsule(length, width): 260 | l, r, t, b = 0, length, width/2, -width/2 261 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 262 | circ0 = make_circle(width/2) 263 | circ1 = make_circle(width/2) 264 | circ1.add_attr(Transform(translation=(length, 0))) 265 | geom = Compound([box, circ0, circ1]) 266 | return geom 267 | 268 | class Compound(Geom): 269 | def __init__(self, gs): 270 | Geom.__init__(self) 271 | self.gs = gs 272 | for g in self.gs: 273 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 274 | def render1(self): 275 | for g in self.gs: 276 | g.render() 277 | 278 | class PolyLine(Geom): 279 | def __init__(self, v, close): 280 | Geom.__init__(self) 281 | self.v = v 282 | self.close = close 283 | self.linewidth = LineWidth(1) 284 | self.add_attr(self.linewidth) 285 | def render1(self): 286 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 287 | for p in self.v: 288 | glVertex3f(p[0], p[1],0) # draw each vertex 289 | glEnd() 290 | def set_linewidth(self, x): 291 | self.linewidth.stroke = x 292 | 293 | class Line(Geom): 294 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 295 | Geom.__init__(self) 296 | self.start = start 297 | self.end = end 298 | self.linewidth = LineWidth(1) 299 | self.add_attr(self.linewidth) 300 | 301 | def render1(self): 302 | glBegin(GL_LINES) 303 | glVertex2f(*self.start) 304 | glVertex2f(*self.end) 305 | glEnd() 306 | 307 | class Image(Geom): 308 | def __init__(self, fname, width, height): 309 | Geom.__init__(self) 310 | self.width = width 311 | self.height = height 312 | img = pyglet.image.load(fname) 313 | self.img = img 314 | self.flip = False 315 | def render1(self): 316 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 317 | 318 | # ================================================================ 319 | 320 | class SimpleImageViewer(object): 321 | def __init__(self, display=None): 322 | self.window = None 323 | self.isopen = False 324 | self.display = display 325 | def imshow(self, arr): 326 | if self.window is None: 327 | height, width, channels = arr.shape 328 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 329 | self.width = width 330 | self.height = height 331 | self.isopen = True 332 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 333 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 334 | 
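make_circle above approximates a circle by res points spaced evenly around the perimeter, which FilledPolygon then draws as a single GL_POLYGON. The vertex generation on its own, with a tiny res so the coordinates are easy to read (the helper name is illustrative):

import math

def circle_points(radius=10, res=30):
    # evenly spaced vertices on a circle of the given radius
    return [(math.cos(2 * math.pi * i / res) * radius,
             math.sin(2 * math.pi * i / res) * radius) for i in range(res)]

pts = circle_points(radius=1.0, res=4)
print([(round(x, 3), round(y, 3)) for x, y in pts])
# [(1.0, 0.0), (0.0, 1.0), (-1.0, 0.0), (-0.0, -1.0)]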
self.window.clear() 335 | self.window.switch_to() 336 | self.window.dispatch_events() 337 | image.blit(0,0) 338 | self.window.flip() 339 | def close(self): 340 | if self.isopen: 341 | self.window.close() 342 | self.isopen = False 343 | def __del__(self): 344 | self.close() -------------------------------------------------------------------------------- /formation_gym/rendering.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2D rendering framework 3 | """ 4 | from __future__ import division 5 | import os 6 | import six 7 | import sys 8 | 9 | if "Apple" in sys.version: 10 | if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ: 11 | os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib' 12 | # (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite 13 | 14 | from gym import error 15 | 16 | try: 17 | import pyglet 18 | except ImportError as e: 19 | raise ImportError("HINT: you can install pyglet directly via 'pip install pyglet'. But if you really just want to install all Gym dependencies and not have to think about it, 'pip install -e .[all]' or 'pip install gym[all]' will do it.") 20 | 21 | try: 22 | from pyglet.gl import * 23 | except ImportError as e: 24 | raise ImportError("Error occurred while running `from pyglet.gl import *`. HINT: make sure you have OpenGL installed. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s \"-screen 0 1400x900x24\" python '") 25 | 26 | import math 27 | import numpy as np 28 | 29 | RAD2DEG = 57.29577951308232 30 | 31 | def get_display(spec): 32 | """Convert a display specification (such as :0) into an actual Display 33 | object. 34 | 35 | Pyglet only supports multiple Displays on Linux. 36 | """ 37 | if spec is None: 38 | return None 39 | elif isinstance(spec, six.string_types): 40 | return pyglet.canvas.Display(spec) 41 | else: 42 | raise error.Error('Invalid display specification: {}.
(Must be a string like :0 or None.)'.format(spec)) 43 | 44 | class Viewer(object): 45 | def __init__(self, width, height, display=None): 46 | display = get_display(display) 47 | 48 | self.width = width 49 | self.height = height 50 | 51 | self.window = pyglet.window.Window(width=width, height=height, display=display) 52 | self.window.on_close = self.window_closed_by_user 53 | self.geoms = [] 54 | self.onetime_geoms = [] 55 | self.transform = Transform() 56 | 57 | glEnable(GL_BLEND) 58 | # glEnable(GL_MULTISAMPLE) 59 | glEnable(GL_LINE_SMOOTH) 60 | # glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE) 61 | glHint(GL_LINE_SMOOTH_HINT, GL_NICEST) 62 | glLineWidth(2.0) 63 | glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA) 64 | 65 | def close(self): 66 | self.window.close() 67 | 68 | def window_closed_by_user(self): 69 | self.close() 70 | 71 | def set_bounds(self, left, right, bottom, top): 72 | assert right > left and top > bottom 73 | scalex = self.width/(right-left) 74 | scaley = self.height/(top-bottom) 75 | self.transform = Transform( 76 | translation=(-left*scalex, -bottom*scaley), 77 | scale=(scalex, scaley)) 78 | 79 | def add_geom(self, geom): 80 | self.geoms.append(geom) 81 | 82 | def add_onetime(self, geom): 83 | self.onetime_geoms.append(geom) 84 | 85 | def render(self, return_rgb_array=False): 86 | glClearColor(1,1,1,1) 87 | self.window.clear() 88 | self.window.switch_to() 89 | self.window.dispatch_events() 90 | self.transform.enable() 91 | for geom in self.geoms: 92 | geom.render() 93 | for geom in self.onetime_geoms: 94 | geom.render() 95 | self.transform.disable() 96 | arr = None 97 | if return_rgb_array: 98 | buffer = pyglet.image.get_buffer_manager().get_color_buffer() 99 | image_data = buffer.get_image_data() 100 | arr = np.fromstring(image_data.get_data(), dtype=np.uint8, sep='') 101 | # In https://github.com/openai/gym-http-api/issues/2, we 102 | # discovered that someone using Xmonad on Arch was having 103 | # a window of size 598 x 398, though a 600 x 400 window 104 | # was requested. (Guess Xmonad was preserving a pixel for 105 | # the boundary.) So we use the buffer height/width rather 106 | # than the requested one. 
107 | arr = arr.reshape(buffer.height, buffer.width, 4) 108 | arr = arr[::-1,:,0:3] 109 | self.window.flip() 110 | self.onetime_geoms = [] 111 | return arr 112 | 113 | # Convenience 114 | def draw_circle(self, radius=10, res=30, filled=True, **attrs): 115 | geom = make_circle(radius=radius, res=res, filled=filled) 116 | _add_attrs(geom, attrs) 117 | self.add_onetime(geom) 118 | return geom 119 | 120 | def draw_polygon(self, v, filled=True, **attrs): 121 | geom = make_polygon(v=v, filled=filled) 122 | _add_attrs(geom, attrs) 123 | self.add_onetime(geom) 124 | return geom 125 | 126 | def draw_polyline(self, v, **attrs): 127 | geom = make_polyline(v=v) 128 | _add_attrs(geom, attrs) 129 | self.add_onetime(geom) 130 | return geom 131 | 132 | def draw_line(self, start, end, **attrs): 133 | geom = Line(start, end) 134 | _add_attrs(geom, attrs) 135 | self.add_onetime(geom) 136 | return geom 137 | 138 | def get_array(self): 139 | self.window.flip() 140 | image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data() 141 | self.window.flip() 142 | arr = np.fromstring(image_data.data, dtype=np.uint8, sep='') 143 | arr = arr.reshape(self.height, self.width, 4) 144 | return arr[::-1,:,0:3] 145 | 146 | def _add_attrs(geom, attrs): 147 | if "color" in attrs: 148 | geom.set_color(*attrs["color"]) 149 | if "linewidth" in attrs: 150 | geom.set_linewidth(attrs["linewidth"]) 151 | 152 | class Geom(object): 153 | def __init__(self): 154 | self._color=Color((0, 0, 0, 1.0)) 155 | self.attrs = [self._color] 156 | def render(self): 157 | for attr in reversed(self.attrs): 158 | attr.enable() 159 | self.render1() 160 | for attr in self.attrs: 161 | attr.disable() 162 | def render1(self): 163 | raise NotImplementedError 164 | def add_attr(self, attr): 165 | self.attrs.append(attr) 166 | def set_color(self, r, g, b, alpha=1): 167 | self._color.vec4 = (r, g, b, alpha) 168 | 169 | class Attr(object): 170 | def enable(self): 171 | raise NotImplementedError 172 | def disable(self): 173 | pass 174 | 175 | class Transform(Attr): 176 | def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1,1)): 177 | self.set_translation(*translation) 178 | self.set_rotation(rotation) 179 | self.set_scale(*scale) 180 | def enable(self): 181 | glPushMatrix() 182 | glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint 183 | glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0) 184 | glScalef(self.scale[0], self.scale[1], 1) 185 | def disable(self): 186 | glPopMatrix() 187 | def set_translation(self, newx, newy): 188 | self.translation = (float(newx), float(newy)) 189 | def set_rotation(self, new): 190 | self.rotation = float(new) 191 | def set_scale(self, newx, newy): 192 | self.scale = (float(newx), float(newy)) 193 | 194 | class Color(Attr): 195 | def __init__(self, vec4): 196 | self.vec4 = vec4 197 | def enable(self): 198 | glColor4f(*self.vec4) 199 | 200 | class LineStyle(Attr): 201 | def __init__(self, style): 202 | self.style = style 203 | def enable(self): 204 | glEnable(GL_LINE_STIPPLE) 205 | glLineStipple(1, self.style) 206 | def disable(self): 207 | glDisable(GL_LINE_STIPPLE) 208 | 209 | class LineWidth(Attr): 210 | def __init__(self, stroke): 211 | self.stroke = stroke 212 | def enable(self): 213 | glLineWidth(self.stroke) 214 | 215 | class Point(Geom): 216 | def __init__(self): 217 | Geom.__init__(self) 218 | def render1(self): 219 | glBegin(GL_POINTS) # draw point 220 | glVertex3f(0.0, 0.0, 0.0) 221 | glEnd() 222 | 223 | class FilledPolygon(Geom): 224 | def __init__(self, v): 
225 | Geom.__init__(self) 226 | self.v = v 227 | def render1(self): 228 | if len(self.v) == 4 : glBegin(GL_QUADS) 229 | elif len(self.v) > 4 : glBegin(GL_POLYGON) 230 | else: glBegin(GL_TRIANGLES) 231 | for p in self.v: 232 | glVertex3f(p[0], p[1],0) # draw each vertex 233 | glEnd() 234 | 235 | color = (self._color.vec4[0] * 0.5, self._color.vec4[1] * 0.5, self._color.vec4[2] * 0.5, self._color.vec4[3] * 0.5) 236 | glColor4f(*color) 237 | glBegin(GL_LINE_LOOP) 238 | for p in self.v: 239 | glVertex3f(p[0], p[1],0) # draw each vertex 240 | glEnd() 241 | 242 | def make_circle(radius=10, res=30, filled=True): 243 | points = [] 244 | for i in range(res): 245 | ang = 2*math.pi*i / res 246 | points.append((math.cos(ang)*radius, math.sin(ang)*radius)) 247 | if filled: 248 | return FilledPolygon(points) 249 | else: 250 | return PolyLine(points, True) 251 | 252 | def make_polygon(v, filled=True): 253 | if filled: return FilledPolygon(v) 254 | else: return PolyLine(v, True) 255 | 256 | def make_polyline(v): 257 | return PolyLine(v, False) 258 | 259 | def make_capsule(length, width): 260 | l, r, t, b = 0, length, width/2, -width/2 261 | box = make_polygon([(l,b), (l,t), (r,t), (r,b)]) 262 | circ0 = make_circle(width/2) 263 | circ1 = make_circle(width/2) 264 | circ1.add_attr(Transform(translation=(length, 0))) 265 | geom = Compound([box, circ0, circ1]) 266 | return geom 267 | 268 | class Compound(Geom): 269 | def __init__(self, gs): 270 | Geom.__init__(self) 271 | self.gs = gs 272 | for g in self.gs: 273 | g.attrs = [a for a in g.attrs if not isinstance(a, Color)] 274 | def render1(self): 275 | for g in self.gs: 276 | g.render() 277 | 278 | class PolyLine(Geom): 279 | def __init__(self, v, close): 280 | Geom.__init__(self) 281 | self.v = v 282 | self.close = close 283 | self.linewidth = LineWidth(1) 284 | self.add_attr(self.linewidth) 285 | def render1(self): 286 | glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP) 287 | for p in self.v: 288 | glVertex3f(p[0], p[1],0) # draw each vertex 289 | glEnd() 290 | def set_linewidth(self, x): 291 | self.linewidth.stroke = x 292 | 293 | class Line(Geom): 294 | def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)): 295 | Geom.__init__(self) 296 | self.start = start 297 | self.end = end 298 | self.linewidth = LineWidth(1) 299 | self.add_attr(self.linewidth) 300 | 301 | def render1(self): 302 | glBegin(GL_LINES) 303 | glVertex2f(*self.start) 304 | glVertex2f(*self.end) 305 | glEnd() 306 | 307 | class Image(Geom): 308 | def __init__(self, fname, width, height): 309 | Geom.__init__(self) 310 | self.width = width 311 | self.height = height 312 | img = pyglet.image.load(fname) 313 | self.img = img 314 | self.flip = False 315 | def render1(self): 316 | self.img.blit(-self.width/2, -self.height/2, width=self.width, height=self.height) 317 | 318 | # ================================================================ 319 | 320 | class SimpleImageViewer(object): 321 | def __init__(self, display=None): 322 | self.window = None 323 | self.isopen = False 324 | self.display = display 325 | def imshow(self, arr): 326 | if self.window is None: 327 | height, width, channels = arr.shape 328 | self.window = pyglet.window.Window(width=width, height=height, display=self.display) 329 | self.width = width 330 | self.height = height 331 | self.isopen = True 332 | assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape" 333 | image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3) 334 | 
self.window.clear() 335 | self.window.switch_to() 336 | self.window.dispatch_events() 337 | image.blit(0,0) 338 | self.window.flip() 339 | def close(self): 340 | if self.isopen: 341 | self.window.close() 342 | self.isopen = False 343 | def __del__(self): 344 | self.close() 345 | -------------------------------------------------------------------------------- /train/maddpg-v4/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | from multiprocessing import Process, Pipe 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | from gym.spaces import Box, Discrete, Tuple 9 | from wrapper import DummyVecEnv, SubprocVecEnv 10 | import formation_gym 11 | 12 | def make_train_env(config): 13 | def get_env_fn(rank): 14 | def init_env(): 15 | if config['env_name'] == "MPE": 16 | env = formation_gym.make_env(config['scenario_name'], benchmark = False, num_agents = config['num_agents']) 17 | else: 18 | print("Can not support the " + 19 | config['env_name'] + "environment.") 20 | raise NotImplementedError 21 | env.seed(config['seed'] + rank * 1000) 22 | return env 23 | return init_env 24 | if config['n_rollout_threads'] == 1: 25 | return DummyVecEnv([get_env_fn(0)]) 26 | else: 27 | return SubprocVecEnv([get_env_fn(i) for i in range(config['n_rollout_threads'])]) 28 | 29 | def to_torch(input): 30 | return torch.from_numpy(input) if type(input) == np.ndarray else input 31 | 32 | def get_config(): 33 | with open(os.path.dirname(__file__)+"/parameters.yaml", "r") as stream: 34 | try: 35 | config = yaml.safe_load(stream) 36 | except yaml.YAMLError as exc: 37 | print(exc) 38 | return config 39 | 40 | def get_dim_from_space(space): 41 | if isinstance(space, Box): 42 | dim = space.shape[0] 43 | elif isinstance(space, Discrete): 44 | dim = space.n 45 | elif isinstance(space, Tuple): 46 | dim = sum([get_dim_from_space(sp) for sp in space]) 47 | elif "MultiDiscrete" in space.__class__.__name__: 48 | return (space.high - space.low) + 1 49 | elif isinstance(space, list): 50 | dim = space[0] 51 | else: 52 | raise Exception("Unrecognized space: ", type(space)) 53 | return dim 54 | 55 | def get_cent_act_dim(action_space): 56 | cent_act_dim = 0 57 | for space in action_space: 58 | dim = get_dim_from_space(space) 59 | if isinstance(dim, np.ndarray): 60 | cent_act_dim += int(sum(dim)) 61 | else: 62 | cent_act_dim += dim 63 | return cent_act_dim 64 | 65 | def get_state_dim(observation_dict, action_dict): 66 | combined_obs_dim = sum([get_dim_from_space(space) 67 | for space in observation_dict.values()]) 68 | combined_act_dim = 0 69 | for space in action_dict.values(): 70 | dim = get_dim_from_space(space) 71 | if isinstance(dim, np.ndarray): 72 | combined_act_dim += int(sum(dim)) 73 | else: 74 | combined_act_dim += dim 75 | return combined_obs_dim, combined_act_dim, combined_obs_dim+combined_act_dim 76 | 77 | class DecayThenFlatSchedule(): 78 | def __init__(self, 79 | start, 80 | finish, 81 | time_length, 82 | decay="exp"): 83 | 84 | self.start = start 85 | self.finish = finish 86 | self.time_length = time_length 87 | self.delta = (self.start - self.finish) / self.time_length 88 | self.decay = decay 89 | 90 | if self.decay in ["exp"]: 91 | self.exp_scaling = (-1) * self.time_length / \ 92 | np.log(self.finish) if self.finish > 0 else 1 93 | 94 | def eval(self, T): 95 | if self.decay in ["linear"]: 96 | return max(self.finish, self.start - self.delta * T) 97 | elif self.decay in ["exp"]: 98 | return min(self.start, max(self.finish, 
np.exp(- T / self.exp_scaling))) 99 | pass 100 | 101 | class ACTLayer(nn.Module): 102 | def __init__(self, config, act_dim): 103 | super(ACTLayer, self).__init__() 104 | 105 | self.multi_discrete = False 106 | init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][config['use_orthogonal']] 107 | def init_(m): 108 | return init(m, init_method, lambda x: nn.init.constant_(x, 0), config['gain']) 109 | 110 | if isinstance(act_dim, np.ndarray): 111 | # MultiDiscrete setting: have n Linear layers for each action 112 | self.multi_discrete = True 113 | self.action_outs = nn.ModuleList([init_(nn.Linear(config['hidden_size'], a_dim)) for a_dim in act_dim]) 114 | else: 115 | self.action_out = init_(nn.Linear(config['hidden_size'], act_dim)) 116 | 117 | def forward(self, x, no_sequence=False): 118 | 119 | if self.multi_discrete: 120 | act_outs = [] 121 | for a_out in self.action_outs: 122 | act_out = a_out(x) 123 | if no_sequence: 124 | # remove the dummy first time dimension if the input didn't have a time dimension 125 | act_out = act_out[0, :, :] 126 | act_outs.append(act_out) 127 | else: 128 | act_outs = self.action_out(x) 129 | if no_sequence: 130 | # remove the dummy first time dimension if the input didn't have a time dimension 131 | act_outs = act_outs[0, :, :] 132 | 133 | return act_outs 134 | class PopArt(nn.Module): 135 | """ Normalize a vector of observations - across the first norm_axes dimensions""" 136 | 137 | def __init__(self, input_shape, norm_axes=1, beta=0.99999, per_element_update=False, epsilon=1e-5, device=torch.device("cpu")): 138 | super(PopArt, self).__init__() 139 | 140 | self.input_shape = input_shape 141 | self.norm_axes = norm_axes 142 | self.epsilon = epsilon 143 | self.beta = beta 144 | self.per_element_update = per_element_update 145 | self.device = device 146 | self.tpdv = dict(dtype=torch.float32, device=device) 147 | 148 | self.running_mean = nn.Parameter(torch.zeros(input_shape, dtype=torch.float), requires_grad=False).to(self.device) 149 | self.running_mean_sq = nn.Parameter(torch.zeros(input_shape, dtype=torch.float), requires_grad=False).to(self.device) 150 | self.debiasing_term = nn.Parameter(torch.tensor(0.0, dtype=torch.float), requires_grad=False).to(self.device) 151 | 152 | def reset_parameters(self): 153 | self.running_mean.zero_() 154 | self.running_mean_sq.zero_() 155 | self.debiasing_term.zero_() 156 | 157 | def running_mean_var(self): 158 | debiased_mean = self.running_mean / self.debiasing_term.clamp(min=self.epsilon) 159 | debiased_mean_sq = self.running_mean_sq / self.debiasing_term.clamp(min=self.epsilon) 160 | debiased_var = (debiased_mean_sq - debiased_mean ** 2).clamp(min=1e-2) 161 | return debiased_mean, debiased_var 162 | 163 | def forward(self, input_vector, train=True): 164 | # Make sure input is float32 165 | input_vector = input_vector.to(**self.tpdv) 166 | 167 | if train: 168 | # Detach input before adding it to running means to avoid backpropping through it on 169 | # subsequent batches. 
170 | 171 | detached_input = input_vector.detach() 172 | batch_mean = detached_input.mean(dim=tuple(range(self.norm_axes))) 173 | batch_sq_mean = (detached_input ** 2).mean(dim=tuple(range(self.norm_axes))) 174 | if self.per_element_update: 175 | batch_size = np.prod(detached_input.size()[:self.norm_axes]) 176 | weight = self.beta ** batch_size 177 | else: 178 | weight = self.beta 179 | 180 | self.running_mean.mul_(weight).add_(batch_mean * (1.0 - weight)) 181 | self.running_mean_sq.mul_(weight).add_(batch_sq_mean * (1.0 - weight)) 182 | self.debiasing_term.mul_(weight).add_(1.0 * (1.0 - weight)) 183 | 184 | mean, var = self.running_mean_var() 185 | out = (input_vector - mean[(None,) * self.norm_axes]) / torch.sqrt(var)[(None,) * self.norm_axes] 186 | return out 187 | 188 | def denormalize(self, input_vector): 189 | """ Transform normalized data back into original distribution """ 190 | input_vector = input_vector.to(**self.tpdv) 191 | 192 | mean, var = self.running_mean_var() 193 | out = input_vector * torch.sqrt(var)[(None,) * self.norm_axes] + mean[(None,) * self.norm_axes] 194 | return out 195 | 196 | class MLPBase(nn.Module): 197 | def __init__(self, config, inputs_dim): 198 | super(MLPBase, self).__init__() 199 | self.config = config 200 | if self.config['use_feature_normalization']: 201 | self.feature_norm = nn.LayerNorm(inputs_dim) 202 | 203 | if self.config['use_conv1d']: 204 | self.conv = CONVLayer(self.config['stacked_frames'], self.config['hidden_size'], self.config['use_orthogonal'], self.config['use_ReLU']) 205 | random_x = torch.FloatTensor(1, self.config['stacked_frames'], self.config['inputs_dim']) 206 | random_out = self.conv(random_x) 207 | assert len(random_out.shape)==3 208 | inputs_dim = random_out.size(-1) * random_out.size(-2) 209 | 210 | self.mlp = MLPLayer(inputs_dim, self.config['hidden_size'], 211 | self.config['layer_N'], self.config['use_orthogonal'], self.config['use_ReLU']) 212 | 213 | def forward(self, x): 214 | if self.config['use_feature_normalization']: 215 | x = self.feature_norm(x) 216 | 217 | if self.config['use_conv1d']: 218 | batch_size = x.size(0) 219 | x = x.view(batch_size, self.config['stacked_frames'], -1) 220 | x = self.conv(x) 221 | x = x.view(batch_size, -1) 222 | 223 | x = self.mlp(x) 224 | 225 | return x 226 | 227 | class MLPLayer(nn.Module): 228 | def __init__(self, input_dim, hidden_size, layer_N, use_orthogonal, use_ReLU): 229 | super(MLPLayer, self).__init__() 230 | self._layer_N = layer_N 231 | 232 | active_func = [nn.Tanh(), nn.ReLU()][use_ReLU] 233 | init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal] 234 | gain = nn.init.calculate_gain(['tanh', 'relu'][use_ReLU]) 235 | 236 | def init_(m): 237 | return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain) 238 | 239 | self.fc1 = nn.Sequential( 240 | init_(nn.Linear(input_dim, hidden_size)), active_func, nn.LayerNorm(hidden_size)) 241 | self.fc_h = nn.Sequential(init_( 242 | nn.Linear(hidden_size, hidden_size)), active_func, nn.LayerNorm(hidden_size)) 243 | self.fc2 = get_clones(self.fc_h, self._layer_N) 244 | 245 | def forward(self, x): 246 | x = self.fc1(x) 247 | for i in range(self._layer_N): 248 | x = self.fc2[i](x) 249 | return x 250 | 251 | class CONVLayer(nn.Module): 252 | def __init__(self, input_dim, hidden_size, use_orthogonal, use_ReLU): 253 | super(CONVLayer, self).__init__() 254 | 255 | active_func = [nn.Tanh(), nn.ReLU()][use_ReLU] 256 | init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal] 257 | gain = 
nn.init.calculate_gain(['tanh', 'relu'][use_ReLU]) 258 | 259 | def init_(m): 260 | return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain) 261 | 262 | self.conv = nn.Sequential( 263 | init_(nn.Conv1d(in_channels=input_dim, out_channels=hidden_size//4, kernel_size=3, stride=2, padding=0)), active_func, #nn.BatchNorm1d(hidden_size//4), 264 | init_(nn.Conv1d(in_channels=hidden_size//4, out_channels=hidden_size//2, kernel_size=3, stride=1, padding=1)), active_func, #nn.BatchNorm1d(hidden_size//2), 265 | init_(nn.Conv1d(in_channels=hidden_size//2, out_channels=hidden_size, kernel_size=3, stride=1, padding=1)), active_func)#, nn.BatchNorm1d(hidden_size)) 266 | 267 | def forward(self, x): 268 | x = self.conv(x) 269 | return x 270 | 271 | def tile_images(img_nhwc): 272 | """ 273 | Tile N images into one big PxQ image 274 | (P,Q) are chosen to be as close as possible, and if N 275 | is square, then P=Q. 276 | input: img_nhwc, list or array of images, ndim=4 once turned into array 277 | n = batch index, h = height, w = width, c = channel 278 | returns: 279 | bigim_HWc, ndarray with ndim=3 280 | """ 281 | img_nhwc = np.asarray(img_nhwc) 282 | N, h, w, c = img_nhwc.shape 283 | H = int(np.ceil(np.sqrt(N))) 284 | W = int(np.ceil(float(N)/H)) 285 | img_nhwc = np.array( 286 | list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 287 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 288 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 289 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 290 | return img_Hh_Ww_c --------------------------------------------------------------------------------
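Note on /train/maddpg-v4/utils.py: ACTLayer, MLPLayer, and CONVLayer call an `init(...)` helper, and MLPLayer calls `get_clones(...)`, but neither function is defined or imported in the file as dumped above. The sketch below shows the conventional definitions of these two helpers used by comparable MAPPO/off-policy codebases; it is an assumption about what the missing helpers look like, not code taken from this repository.

import copy
import torch.nn as nn

def init(module, weight_init, bias_init, gain=1):
    # Hypothetical sketch (not from this repo): apply the chosen weight
    # initializer (nn.init.xavier_uniform_ or nn.init.orthogonal_) with the
    # given gain, zero-initialize the bias, and return the module so the
    # init_(...) wrappers in utils.py can be used inline.
    weight_init(module.weight.data, gain=gain)
    bias_init(module.bias.data)
    return module

def get_clones(module, N):
    # Hypothetical sketch (not from this repo): N independent deep copies of a
    # layer, as MLPLayer expects when it stacks layer_N hidden blocks.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

With helpers of this shape in scope, `init_(nn.Linear(config['hidden_size'], act_dim))` in ACTLayer resolves to a linear layer with orthogonal or Xavier weights and a zeroed bias, and `get_clones(self.fc_h, self._layer_N)` in MLPLayer yields the stack of hidden layers iterated over in its forward pass.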