├── .gitignore ├── README.md ├── algos ├── __init__.py └── maTT │ ├── __init__.py │ ├── core.py │ ├── core_behavior.py │ ├── dql.py │ ├── evaluation.py │ ├── evaluation_behavior.py │ ├── modules.py │ ├── replay_buffer.py │ └── run_script.py ├── environment.yml ├── envs ├── __init__.py ├── maTTenv │ ├── __init__.py │ ├── agent_models.py │ ├── belief_tracker.py │ ├── display_wrapper.py │ ├── env │ │ ├── __init__.py │ │ ├── maTracking_Base.py │ │ ├── setTracking_model.py │ │ ├── setTracking_v0.py │ │ ├── setTracking_vGreedy.py │ │ ├── setTracking_vGru.py │ │ └── setTracking_vkGreedy.py │ ├── gen_init_pose.py │ ├── maps │ │ ├── empty100.cfg │ │ ├── empty100.yaml │ │ ├── empty1000.cfg │ │ ├── empty1000.yaml │ │ ├── empty20.cfg │ │ ├── empty20.yaml │ │ ├── empty8.cfg │ │ ├── empty8.yaml │ │ ├── emptyMed.cfg │ │ ├── emptyMed.yaml │ │ ├── emptySmall.cfg │ │ ├── emptySmall.yaml │ │ └── map_utils.py │ ├── metadata.py │ └── util.py ├── run_ma_tracking.py └── utilities │ ├── __init__.py │ └── ma_time_limit.py ├── requirements.txt ├── setup └── utils ├── __init__.py ├── logSpinUp.py ├── mpi_pytorch.py ├── mpi_tools.py └── serialization_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | #repo specific ignores 2 | results/ 3 | 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scalableMARL 2 | 3 | [Scalable Reinforcement Learning Policies for Multi-Agent Control](https://arxiv.org/abs/2011.08055) 4 | 5 | CD. Hsu, H. Jeong, GJ. Pappas, P. Chaudhari. "Scalable Reinforcement Learning Policies for Multi-Agent Control". IEEE International Conference on Intelligent Robots and Systems (IROS), Prague, Czech Republic, 2021. 6 | 7 | A Multi-Agent Reinforcement Learning method to learn scalable control policies for multi-agent target tracking. 8 | 9 | + Author: Christopher Hsu 10 | + Email: chsu8@seas.upenn.edu 11 | + Affiliation: 12 | - Department of Electrical and Systems Engineering 13 | - GRASP Laboratory 14 | - @ University of Pennsylvania 15 | 16 | Currently supports Python 3.8 and is developed on Ubuntu 20.04. 17 | 18 | ## scalableMARL file structure 19 | Within scalableMARL (highlighting the important files): 20 | ``` 21 | scalableMARL 22 | |___algos 23 | |___maTT #RL algorithm folder for the target tracking environment 24 | |___core #Self-Attention-based Model Architecture 25 | |___core_behavior #Used for further evaluation (Ablation D.2.) 26 | |___dql #Soft Double Q-Learning 27 | |___evaluation #Evaluation for Main Results 28 | |___evaluation_behavior #Used for further evaluation (Ablation D.2.) 29 | |___modules #Self-Attention blocks 30 | |___replay_buffer #RL replay buffer for sets 31 | |___run_script #**Main run script to do training and evaluation 32 | |___envs 33 | |___maTTenv #multi-agent target tracking 34 | |___env 35 | |___setTracking_v0 #Standard environment (i.e. 4a4t tasks) 36 | |___setTracking_vGreedy #Baseline Greedy Heuristic 37 | |___setTracking_vGru #Experiment with GRU (Ablation D.3) 38 | |___setTracking_vkGreedy #Experiment with Scalability and Heuristic Mask k=4 (Ablation D.1) 39 | |___run_ma_tracking #Example script to run the environment 40 | |___setup #set PYTHONPATH ($source setup) 41 | ``` 42 | 43 | + To set up scalableMARL, follow the instructions below.
44 | 45 | ## Set up python environment for the scalableMARL repository 46 | 47 | ### Install python3.8 (if it is not already installed) 48 | ``` 49 | #to check python version 50 | python3 -V 51 | 52 | sudo apt-get update 53 | sudo apt-get install python3.8-dev 54 | ``` 55 | 56 | ## Set up environment (conda or virtualenv) 57 | 58 | ### Set up with conda 59 | ``` 60 | conda env create -f environment.yml 61 | conda activate scalableMARL 62 | source setup 63 | ``` 64 | ### Set up virtualenv 65 | Python virtual environments are used to isolate package installation from the system 66 | 67 | Replace 'virtualenv name' with your choice of folder name 68 | ``` 69 | sudo apt-get install python3-venv 70 | 71 | python3 -m venv --system-site-packages ./'virtualenv name' 72 | # Activate the environment for use, any commands now will be done within this venv 73 | source ./'virtualenv name'/bin/activate 74 | 75 | # To deactivate (in terminal, exit out of venv), do not use during setup 76 | deactivate 77 | ``` 78 | Now that the virtualenv is activated, you can install packages that are isolated from your system 79 | 80 | When the venv is activated, you can now install packages and run scripts 81 | 82 | #### Install isolated packages in your venv 83 | ``` 84 | sudo apt-get install -y eog python3-tk python3-yaml python3-pip ssh git 85 | 86 | #This command will auto install packages from requirements.txt 87 | pip3 install --trusted-host pypi.python.org -r requirements.txt 88 | ``` 89 | 90 | ## Current workflow 91 | ### Setup repos 92 | ``` 93 | # activate env 94 | conda activate scalableMARL 95 | # or 96 | source ./'virtualenv name'/bin/activate 97 | # change directory to scalableMARL 98 | cd ./scalableMARL 99 | # setup repo ***important in order to set PYTHONPATH*** 100 | source setup 101 | ``` 102 | scalableMARL repo is ready to go 103 | 104 | ### Running an algorithm 105 | ``` 106 | # its best to run from the scalableMARL folder so that logging and saving is consistent 107 | cd ./scalableMARL 108 | # run the alg 109 | python3 algos/maTT/run_script.py 110 | 111 | # you can run the alg with different argument parameters. See within run_script for more options. 112 | # for example 113 | python3 algos/maTT/run_script.py --seed 0 --logdir ./results/maPredPrey --epochs 40 114 | ``` 115 | ### To test, evaluate, and render() 116 | ``` 117 | # for a general example 118 | python3 algos/maTT/run_script.py --mode test --render 1 --log_dir ./results/maTT/setTracking-v0_123456789/seed_0/ --nb_test_eps 50 119 | # for a saved policy in saved_results 120 | python3 algos/maTT/run_script.py --mode test --render 1 --log_dir ./saved_results/maTT/setTracking-v0_123456789/seed_0/ 121 | ``` 122 | ### To see training curves 123 | ``` 124 | tensorboard --logdir ./results/maTT/setTracking-v0_123456789/ 125 | ``` 126 | ### Citing scalableMARL 127 | If you reference or use scalableMARL in your research, please cite: 128 | ``` 129 | @misc{hsu2021scalable, 130 | title={Scalable Reinforcement Learning Policies for Multi-Agent Control}, 131 | author={Christopher D. Hsu and Heejin Jeong and George J. 
Pappas and Pratik Chaudhari}, 132 | year={2021}, 133 | eprint={2011.08055}, 134 | archivePrefix={arXiv}, 135 | primaryClass={cs.MA} 136 | } 137 | 138 | ``` 139 | 140 | 141 | -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/algos/__init__.py -------------------------------------------------------------------------------- /algos/maTT/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/algos/maTT/__init__.py -------------------------------------------------------------------------------- /algos/maTT/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from algos.maTT.modules import * 5 | 6 | __author__ = 'Christopher D Hsu' 7 | __copyright__ = '' 8 | __credits__ = ['Christopher D Hsu', 'SpinningUp'] 9 | __license__ = '' 10 | __version__ = '0.0.1' 11 | __maintainer__ = 'Christopher D Hsu' 12 | __email__ = 'chsu8@seas.upenn.edu' 13 | __status__ = 'Dev' 14 | 15 | def combined_shape(length, shape=None): 16 | if shape is None: 17 | return (length,) 18 | return (length, shape) if np.isscalar(shape) else (length, *shape) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | 24 | class SoftActionSelector(nn.Module): 25 | ''' 26 | Soft parameterization of the Q-value logits, 27 | pi_log = log[(1/Z) * e^(v(x) - min(v(x)))] 28 | If deterministic, take the max-value action; 29 | Else (stochastic), 30 | Sample from a multinomial over the soft logits.
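Example (illustrative sketch only, not part of the original code; act_dim=12 is a placeholder value):

        selector = SoftActionSelector(act_dim=12)
        q = torch.randn(1, 12)      # summed Q-values, shape [batch, act_dim]
        a, logp = selector(q)       # stochastic: sample from softmax(q - min(q))
        a_det, _ = selector(q, deterministic=True, with_logprob=False)  # greedy action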
31 | ''' 32 | def __init__(self, act_dim): 33 | super().__init__() 34 | self.act_dim = act_dim 35 | self.logsoftmax = nn.LogSoftmax(dim=1) 36 | 37 | 38 | def forward(self, q, deterministic=False, with_logprob=True): 39 | q_soft = q - torch.min(q) 40 | 41 | # Convert q values to log probability space 42 | try: 43 | pi_log = self.logsoftmax(q_soft) 44 | except: 45 | q_soft = q_soft.unsqueeze(0) 46 | pi_log = self.logsoftmax(q_soft) 47 | 48 | # Select action 49 | if deterministic: 50 | mu = torch.argmax(pi_log) 51 | pi_action = mu 52 | else: 53 | q_log_dist = torch.distributions.multinomial.Multinomial(1, logits=pi_log) 54 | action = q_log_dist.sample() 55 | pi_action = torch.argmax(action, dim=1, keepdim=True) 56 | 57 | # Calculate log probability if training 58 | if with_logprob: 59 | logp_pi = torch.gather(pi_log,1,pi_action) 60 | else: 61 | logp_pi = None 62 | 63 | return pi_action, logp_pi 64 | 65 | class DeepSetAttention(nn.Module): 66 | """ Written by Christopher Hsu: 67 | 68 | """ 69 | def __init__(self, dim_input, dim_output, num_outputs=1, 70 | dim_hidden=128, num_heads=4, ln=True): 71 | super().__init__() 72 | self.enc = nn.Sequential( 73 | SAB(dim_input, dim_hidden, num_heads, ln=ln), 74 | SAB(dim_hidden, dim_hidden, num_heads, ln=ln)) 75 | self.dec = nn.Sequential( 76 | SAB(dim_hidden, dim_hidden, num_heads, ln=ln), 77 | nn.Linear(dim_hidden, dim_output)) 78 | 79 | # v(x) 80 | def values(self, obs): 81 | v = self.enc(obs) 82 | v = v.sum(dim=1, keepdim=True) #pooling mechanism: sum, mean, max 83 | v = self.dec(v).squeeze() 84 | return v 85 | 86 | # q(x,a) 87 | def forward(self, obs, act): 88 | v = self.enc(obs) 89 | v = v.sum(dim=1, keepdim=True) #pooling mechanism: sum, mean, max 90 | v = self.dec(v).squeeze() 91 | q = torch.gather(v, 1, act) 92 | return q 93 | 94 | class DeepSetmodel(nn.Module): 95 | 96 | def __init__(self, observation_space, action_space, dim_hidden=128): 97 | super().__init__() 98 | 99 | obs_dim = observation_space.shape[0] 100 | act_dim = action_space.n 101 | 102 | # build policy and value functions 103 | self.pi = SoftActionSelector(act_dim) 104 | self.q1 = DeepSetAttention(dim_input=obs_dim, dim_output=act_dim, dim_hidden=dim_hidden) 105 | self.q2 = DeepSetAttention(dim_input=obs_dim, dim_output=act_dim, dim_hidden=dim_hidden) 106 | 107 | def act(self, obs, deterministic=False): 108 | with torch.no_grad(): 109 | v1 = self.q1.values(obs) 110 | v2 = self.q2.values(obs) 111 | 112 | a, _ = self.pi(v1+v2, deterministic, False) 113 | # Tensor to int 114 | return int(a) 115 | -------------------------------------------------------------------------------- /algos/maTT/core_behavior.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from algos.maTT.modules import * 5 | 6 | """ 7 | Used during evaluation of policies: 8 | in order to take a look at the q values 9 | Used to analyze weak form of cooperation 10 | Could be used to look at attention 11 | """ 12 | __author__ = 'Christopher D Hsu' 13 | __copyright__ = '' 14 | __credits__ = ['Christopher D Hsu'] 15 | __license__ = '' 16 | __version__ = '0.0.1' 17 | __maintainer__ = 'Christopher D Hsu' 18 | __email__ = 'chsu8@seas.upenn.edu' 19 | __status__ = 'Dev' 20 | 21 | 22 | def combined_shape(length, shape=None): 23 | if shape is None: 24 | return (length,) 25 | return (length, shape) if np.isscalar(shape) else (length, *shape) 26 | 27 | def count_vars(module): 28 | return sum([np.prod(p.shape) for p in
module.parameters()]) 29 | 30 | 31 | class SoftActionSelector(nn.Module): 32 | ''' 33 | Soft parameterization of q value logits, 34 | pi_log = (1/Z)*(e^(v(x)) - min(v(x)) 35 | If determinstic take max value as action, 36 | Else (stochastic), 37 | Sample from multinomial of the soft logits. 38 | ''' 39 | def __init__(self, act_dim): 40 | super().__init__() 41 | self.act_dim = act_dim 42 | self.logsoftmax = nn.LogSoftmax(dim=1) 43 | self.softmax = nn.Softmax(dim=1) 44 | 45 | 46 | def forward(self, q, deterministic=False, with_logprob=True): 47 | q_soft = q - torch.min(q) 48 | 49 | # Convert q values to log probability space 50 | try: 51 | pi_log = self.logsoftmax(q_soft) 52 | q_values = self.softmax(q_soft) 53 | except: 54 | q_soft = q_soft.unsqueeze(0) 55 | pi_log = self.logsoftmax(q_soft) 56 | q_values = self.softmax(q_soft) 57 | # Select action 58 | if deterministic: 59 | mu = torch.argmax(pi_log) 60 | pi_action = mu 61 | else: 62 | q_log_dist = torch.distributions.multinomial.Multinomial(1, logits=pi_log) 63 | action = q_log_dist.sample() 64 | pi_action = torch.argmax(action, dim=1, keepdim=True) 65 | 66 | # Calculate log probability if training 67 | if with_logprob: 68 | logp_pi = torch.gather(pi_log,1,pi_action) 69 | else: 70 | logp_pi = None 71 | 72 | return pi_action, logp_pi, q_values 73 | 74 | class DeepSetAttention(nn.Module): 75 | """ Written by Christopher Hsu: 76 | 77 | """ 78 | def __init__(self, dim_input, dim_output, num_outputs=1, 79 | dim_hidden=128, num_heads=4, ln=True): 80 | super().__init__() 81 | self.enc = nn.Sequential( 82 | SAB(dim_input, dim_hidden, num_heads, ln=ln), 83 | SAB(dim_hidden, dim_hidden, num_heads, ln=ln)) 84 | self.dec = nn.Sequential( 85 | SAB(dim_hidden, dim_hidden, num_heads, ln=ln), 86 | nn.Linear(dim_hidden, dim_output)) 87 | 88 | # v(x) 89 | def values(self, obs): 90 | v = self.enc(obs) 91 | v = v.sum(dim=1, keepdim=True) #pooling mechanism: sum, mean, max 92 | v = self.dec(v).squeeze() 93 | return v 94 | 95 | # q(x,a) 96 | def forward(self, obs, act): 97 | v = self.enc(obs) 98 | v = v.sum(dim=1, keepdim=True) #pooling mechanism: sum, mean, max 99 | v = self.dec(v).squeeze() 100 | q = torch.gather(v, 1, act) 101 | return q 102 | 103 | class DeepSetmodel(nn.Module): 104 | 105 | def __init__(self, observation_space, action_space, dim_hidden=128): 106 | super().__init__() 107 | 108 | obs_dim = observation_space.shape[0] 109 | act_dim = action_space.n 110 | 111 | # build policy and value functions 112 | self.pi = SoftActionSelector(act_dim) 113 | self.q1 = DeepSetAttention(dim_input=obs_dim, dim_output=act_dim, dim_hidden=dim_hidden) 114 | self.q2 = DeepSetAttention(dim_input=obs_dim, dim_output=act_dim, dim_hidden=dim_hidden) 115 | 116 | def act(self, obs, deterministic=False): 117 | with torch.no_grad(): 118 | v1 = self.q1.values(obs) 119 | v2 = self.q2.values(obs) 120 | 121 | a, _, q_values = self.pi(v1+v2, deterministic, False) 122 | # Tensor to int 123 | return int(a), q_values 124 | -------------------------------------------------------------------------------- /algos/maTT/dql.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import itertools 3 | import numpy as np 4 | import torch 5 | from torch.optim import Adam 6 | from torch.utils.tensorboard import SummaryWriter 7 | import gym 8 | from gym.spaces import Box, Discrete 9 | import time, os, random, pdb 10 | from utils.logSpinUp import EpochLogger 11 | import algos.maTT.core as core 12 | from algos.maTT.replay_buffer 
import ReplayBufferSet as ReplayBuffer 13 | 14 | __author__ = 'Christopher D Hsu' 15 | __copyright__ = '' 16 | __credits__ = ['Christopher D Hsu', 'SpinningUp'] 17 | __license__ = '' 18 | __version__ = '0.0.1' 19 | __maintainer__ = 'Christopher D Hsu' 20 | __email__ = 'chsu8@seas.upenn.edu' 21 | __status__ = 'Dev' 22 | 23 | 24 | SET_EVAL_v0 = [ 25 | {'nb_agents': 1, 'nb_targets': 1}, 26 | {'nb_agents': 2, 'nb_targets': 2}, 27 | {'nb_agents': 3, 'nb_targets': 3}, 28 | {'nb_agents': 4, 'nb_targets': 4}, 29 | ] 30 | 31 | def eval_set(num_agents, num_targets): 32 | agents = np.linspace(num_agents/2, num_agents, num=3, dtype=int) 33 | targets = np.linspace(num_agents/2, num_targets, num=3, dtype=int) 34 | params_set = [{'nb_agents':1, 'nb_targets':1}, 35 | {'nb_agents':4, 'nb_targets':4}] 36 | for a in agents: 37 | for t in targets: 38 | params_set.append({'nb_agents':a, 'nb_targets':t}) 39 | return params_set 40 | 41 | def test_agent(test_env, get_action, logger, num_test_episodes, 42 | num_agents, num_targets, render=False): 43 | """ Evaluate current policy over an environment set 44 | """ 45 | ## Either manually set evaluation set or auto fill 46 | params_set = SET_EVAL_v0 47 | # params_set = eval_set(num_agents, num_targets) 48 | 49 | for params in params_set: 50 | for j in range(num_test_episodes): 51 | done, ep_ret, ep_len = {'__all__':False}, 0, 0 52 | obs = test_env.reset(**params) 53 | while not done['__all__']: 54 | if render: 55 | test_env.render() 56 | action_dict = {} 57 | for agent_id, o in obs.items(): 58 | action_dict[agent_id] = get_action(o, deterministic=False) 59 | 60 | obs, rew, done, _ = test_env.step(action_dict) 61 | ep_ret += rew['__all__'] 62 | ep_len += 1 63 | logger.store(TestEpRet=ep_ret) 64 | 65 | 66 | def doubleQlearning(env_fn, model=core.DeepSetmodel, model_kwargs=dict(), seed=0, 67 | steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 68 | polyak=0.995, lr=1e-2, alpha=0.2, batch_size=100, start_steps=10000, 69 | update_after=4000, update_every=1, num_test_episodes=5, max_ep_len=200, 70 | logger_kwargs=dict(), save_freq=1, lr_period=0.7, grad_clip=5, render=False, 71 | torch_threads=1, amp=False): 72 | """ 73 | Soft Clipped Double Q-learning 74 | 75 | Written by Christopher Hsu 76 | based on SpinningUp structure 77 | 78 | Args: 79 | env_fn : A function which creates a copy of the environment. 80 | The environment must satisfy the OpenAI Gym API. 81 | 82 | policy: The constructor method for a PyTorch Module with an ``act`` 83 | method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. 84 | The ``act`` method and ``pi`` module should accept batches of 85 | observations as inputs, and ``q1`` and ``q2`` should accept a batch 86 | of observations. When called, ``act``, ``q1``, and ``q2`` should return: 87 | 88 | =========== ================ ====================================== 89 | Call Output Shape Description 90 | =========== ================ ====================================== 91 | ``act`` (act_dim) | Numpy array of actions for each 92 | | observation. 93 | ``q1`` (batch,) | Tensor containing one current estimate 94 | | of Q* for the provided observations 95 | | and action. 96 | ``q1.values``(batch,act_dim) | Tensor containing current estimate 97 | | of Q*'s' for observations. 98 | ``q2`` (batch,) | Tensor containing the other current 99 | | estimate of Q* for the provided observations 100 | | and action. 101 | ``q2.values``(batch,act_dim) | Tensor containing current estimate 102 | | of Q*'s' for observations. 
103 | =========== ================ ====================================== 104 | 105 | Calling ``pi`` should return: 106 | 107 | =========== ================ ====================================== 108 | Symbol Shape Description 109 | =========== ================ ====================================== 110 | ``a`` (batch, act_dim) | Tensor containing actions from policy 111 | | given observations. 112 | ``logp_pi`` (batch,) | Tensor containing log probabilities of 113 | | actions in ``a``. Importantly: gradients 114 | | should be able to flow back into ``a``. 115 | =========== ================ ====================================== 116 | 117 | model_kwargs (dict): Any kwargs appropriate for the MLPmodel object 118 | you provided to double q learning. 119 | 120 | seed (int): Seed for random number generators. 121 | 122 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 123 | for the agent and the environment in each epoch. 124 | 125 | epochs (int): Number of epochs to run and train agent. 126 | 127 | replay_size (int): Maximum length of replay buffer. 128 | 129 | gamma (float): Discount factor. (Always between 0 and 1.) 130 | 131 | polyak (float): Interpolation factor in polyak averaging for target 132 | networks. Target networks are updated towards main networks 133 | according to: 134 | 135 | .. math:: \\theta_{\\text{targ}} \\leftarrow 136 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 137 | 138 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 139 | close to 1.) 140 | 141 | lr (float): Learning rate (used for both policy and value learning). 142 | 143 | alpha (float): Entropy regularization coefficient. (Equivalent to 144 | inverse of reward scale in the original SAC paper.) 145 | 146 | batch_size (int): Minibatch size for SGD. 147 | 148 | start_steps (int): Number of steps for uniform-random action selection, 149 | before running real policy. Helps exploration. 150 | 151 | update_after (int): Number of env interactions to collect before 152 | starting to do gradient descent updates. Ensures replay buffer 153 | is full enough for useful updates. 154 | 155 | update_every (int): Number of env interactions that should elapse 156 | between gradient descent updates. Note: Regardless of how long 157 | you wait between updates, the ratio of env steps to gradient steps 158 | is locked to 1. 159 | 160 | num_test_episodes (int): Number of episodes to test the deterministic 161 | policy at the end of each epoch. 162 | 163 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 164 | 165 | logger_kwargs (dict): Keyword args for EpochLogger. 166 | 167 | save_freq (int): How often (in terms of gap between epochs) to save 168 | the current policy and value function. 
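lr_period (float): Fraction of total training steps, at the end of training, over which the cosine learning-rate schedule in ``update`` is applied; before that point the learning rate is held at its initial value.

grad_clip (float): Value used for element-wise gradient clipping via ``torch.nn.utils.clip_grad_value_`` before each optimizer step.

torch_threads (int): Number of CPU threads passed to ``torch.set_num_threads``.

amp (bool): If True, use automatic mixed precision (``torch.cuda.amp``) for the Q-loss forward and backward passes.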
169 | 170 | """ 171 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 172 | torch.set_num_threads(torch_threads) 173 | 174 | logger = EpochLogger(**logger_kwargs) 175 | logger.save_config(locals()) 176 | writer = SummaryWriter(logger_kwargs['output_dir']) 177 | 178 | torch.manual_seed(seed) 179 | np.random.seed(seed) 180 | 181 | # env, test_env = env_fn(), env_fn() #Can be used if env is official gym env 182 | env = env_fn() 183 | test_env = deepcopy(env) 184 | obs_dim = env.observation_space.shape[0] 185 | act_dim = env.action_space.n 186 | 187 | # Create actor-critic module and target networks 188 | policy = model(env.observation_space, env.action_space, **model_kwargs).to(device) 189 | policy_targ = deepcopy(policy).to(device) 190 | 191 | # Freeze target networks with respect to optimizers (only update via polyak averaging) 192 | for p in policy_targ.parameters(): 193 | p.requires_grad = False 194 | 195 | # List of parameters for both Q-networks (save this for convenience) 196 | q_params = itertools.chain(policy.q1.parameters(), policy.q2.parameters()) 197 | 198 | # Experience Replay Buffer 199 | replay_buffer = ReplayBuffer(replay_size, obs_dim, act_dim) 200 | 201 | # Count variables (protip: try to get a feel for how different size networks behave!) 202 | var_counts = tuple(core.count_vars(module) for module in [policy.q1, policy.q2]) 203 | logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n'%var_counts) 204 | 205 | # Set up optimizer 206 | q_optimizer = Adam(q_params, lr=lr) 207 | 208 | #AMP: automatic mixed precision: fp32 -> fp 16 209 | if amp: 210 | scaler = torch.cuda.amp.GradScaler() 211 | 212 | # Set up function for computing Q-losses 213 | def compute_loss_q(data): 214 | obs = data['obs'].to(device) 215 | act = data['act'].type(torch.LongTensor).to(device) 216 | rew = data['rew'].to(device) 217 | obs2 = data['obs2'].to(device) 218 | done = data['done'].type(torch.float32).to(device) 219 | 220 | q1 = policy.q1(obs,act) 221 | q2 = policy.q2(obs,act) 222 | 223 | # Bellman backup for Q functions 224 | with torch.cuda.amp.autocast(enabled=False): 225 | with torch.no_grad(): 226 | # Target actions come from *current* policy 227 | v1 = policy.q1.values(obs2) 228 | v2 = policy.q2.values(obs2) 229 | act2, logp_a2 = policy.pi(v1+v2) 230 | 231 | # Target Q-values 232 | q1_pi_targ = policy_targ.q1(obs2, act2) 233 | q2_pi_targ = policy_targ.q2(obs2, act2) 234 | q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) 235 | 236 | #Unsqueeze adds another dim, necessary to be column vectors 237 | # backup = r.unsqueeze(1) + gamma * (1 - done).unsqueeze(1) * (q_pi_targ - alpha * logp_a2) 238 | backup = rew.unsqueeze(1) + gamma * (q_pi_targ - alpha * logp_a2) 239 | 240 | # MSE loss against Bellman backup 241 | # loss_q1 = ((q1 - backup)**2).mean() 242 | # loss_q2 = ((q2 - backup)**2).mean() 243 | # Huber loss against Bellman backup 244 | huber = torch.nn.SmoothL1Loss() 245 | loss_q1 = huber(q1, backup) 246 | loss_q2 = huber(q2, backup) 247 | loss_q = loss_q1 + loss_q2 248 | 249 | # Useful info for logging, if training on gpu need to send to cpu to store 250 | try: 251 | q_info = dict(Q1Vals=q1.detach().numpy(), 252 | Q2Vals=q2.detach().numpy()) 253 | except: 254 | q_info = dict(Q1Vals=q1.cpu().detach().numpy(), 255 | Q2Vals=q2.cpu().detach().numpy()) 256 | 257 | return loss_q, q_info 258 | 259 | def update(data, lr_iter): 260 | # Update learning rate with cosine schedule 261 | lr = np.clip(0.0005*np.cos(np.pi*lr_iter/(total_steps*lr_period))+0.000501, 1e-5, 1e-3) 
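# Schedule note: lr_iter is 0 for the first (1 - lr_period) fraction of training, so the
# learning rate holds at the 1e-3 ceiling; over the final lr_period fraction the cosine
# term anneals it from ~1e-3 down to the 1e-5 floor enforced by np.clip.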
262 | q_optimizer.param_groups[0]['lr'] = lr 263 | 264 | # First run one gradient descent step for Q1 and Q2 265 | q_optimizer.zero_grad() 266 | 267 | # Automatic Mixed Precision, used for faster training with fp16 operations 268 | if amp: 269 | # Enables autocasting for the forward pass (model + loss) 270 | with torch.cuda.amp.autocast(): 271 | loss_q, q_info = compute_loss_q(data) 272 | # Calls backward() on scaled loss to create scaled gradients 273 | scaler.scale(loss_q).backward() 274 | # Unscales the gradients of optimizer's assigned params in-place 275 | scaler.unscale_(q_optimizer) 276 | # Clip gradient values 277 | torch.nn.utils.clip_grad_value_(policy.parameters(), grad_clip) 278 | # scaler.step() first unscales the gradients of the optimizer's assigned params. 279 | scaler.step(q_optimizer) 280 | # Updates the scale for next iteration. 281 | scaler.update() 282 | 283 | # Standard fp32 training operations 284 | else: 285 | loss_q, q_info = compute_loss_q(data) 286 | # Backprop 287 | loss_q.backward() 288 | # Clip gradient values 289 | torch.nn.utils.clip_grad_value_(policy.parameters(), grad_clip) 290 | # Gradient step 291 | q_optimizer.step() 292 | 293 | ## Record things 294 | logger.store(LossQ=loss_q.item(), **q_info) 295 | 296 | # Finally, update target networks by polyak averaging. 297 | with torch.no_grad(): 298 | for p, p_targ in zip(policy.parameters(), policy_targ.parameters()): 299 | # NB: We use an in-place operations "mul_", "add_" to update target 300 | # params, as opposed to "mul" and "add", which would make new tensors. 301 | p_targ.data.mul_(polyak) 302 | p_targ.data.add_((1 - polyak) * p.data) 303 | 304 | def get_action(obs, deterministic=False): 305 | # Unsqueeze obs to [1, n, d] 306 | return policy.act(torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0).to(device), deterministic) 307 | 308 | # Prepare for interaction with environment 309 | total_steps = steps_per_epoch * epochs 310 | start_time = time.time() 311 | ep_ret, ep_len, best_test_ret = 0, 0, 0 312 | obs = env.reset() 313 | 314 | # Main loop: collect experience in env and update/log each epoch 315 | for t in range(total_steps): 316 | 317 | # Until start_steps have elapsed, randomly sample actions 318 | # from a uniform distribution for better exploration. Afterwards, 319 | # use the learned policy. 320 | action_dict = {} 321 | if t > start_steps: 322 | for agent_id, o in obs.items(): 323 | action_dict[agent_id] = get_action(o, deterministic=False) 324 | else: 325 | for agent_id, o in obs.items(): 326 | action_dict[agent_id] = env.action_space.sample() 327 | 328 | # Step the env 329 | obs2, rew, done, info = env.step(action_dict) 330 | ep_ret += rew['__all__'] 331 | ep_len += 1 332 | 333 | # Ignore the "done" signal if it comes from hitting the time 334 | # horizon (that is, when it's an artificial terminal signal 335 | # that isn't based on the agent's state) 336 | done['__all__'] = False if ep_len==max_ep_len else False 337 | 338 | # Store experience to replay buffer seperately for each agent 339 | for agent_id, o in obs.items(): 340 | replay_buffer.store(o, action_dict[agent_id], rew['__all__'], 341 | obs2[agent_id], float(done['__all__']))#, env.nb_targets) 342 | 343 | # Super critical, easy to overlook step: make sure to update 344 | # most recent observation! 
345 | obs = obs2 346 | 347 | # End of trajectory handling 348 | if done['__all__'] or (ep_len == max_ep_len): 349 | logger.store(EpRet=ep_ret, EpLen=ep_len) 350 | ep_ret, ep_len = 0, 0 351 | obs = env.reset() 352 | 353 | # Update handling 354 | if t >= update_after and t % update_every == 0: 355 | for j in range(update_every): 356 | #Cosine learning rate schedule 357 | if t < total_steps*(1-lr_period): 358 | lr_iter = 0 359 | else: 360 | lr_iter = t-total_steps*(1-lr_period) 361 | 362 | batch = replay_buffer.sample_batch(batch_size)#, env.num_targets) 363 | update(data=batch, lr_iter=lr_iter) 364 | 365 | # End of epoch handling 366 | if (t+1) % steps_per_epoch == 0: 367 | epoch = (t+1) // steps_per_epoch 368 | logger.log_tabular('Epoch', epoch) 369 | 370 | # Test the performance of the deterministic version of the agent. 371 | test_agent(test_env, get_action, logger, num_test_episodes, env.num_agents, env.num_targets) 372 | # Averages test ep returns 373 | logger.log_tabular('TestEpRet', with_min_and_max=True) 374 | 375 | # Save model 376 | # if (epoch % save_freq == 0) or (epoch == epochs): 377 | # torch.save(policy.state_dict(), fpath+'model%d.pt'%epoch) 378 | 379 | # Save model based on best test episode return 380 | if logger.log_current_row['AverageTestEpRet'] > best_test_ret: 381 | logger.log('Saving model, AverageTestEpRet increase %d -> %d'% 382 | (best_test_ret, logger.log_current_row['AverageTestEpRet'])) 383 | 384 | fpath = logger_kwargs['output_dir']+'/state_dict/' 385 | os.makedirs(fpath, exist_ok=True) 386 | torch.save(policy.state_dict(), fpath+'model.pt') 387 | best_test_ret = logger.log_current_row['AverageTestEpRet'] 388 | 389 | 390 | # Log info about epoch 391 | logger.log_tabular('EpRet', with_min_and_max=True) 392 | logger.log_tabular('EpLen', average_only=True) 393 | logger.log_tabular('TotalEnvInteracts', t) 394 | logger.log_tabular('Q1Vals', with_min_and_max=True) 395 | logger.log_tabular('Q2Vals', with_min_and_max=True) 396 | logger.log_tabular('LossQ', average_only=True) 397 | logger.log_tabular('LR', q_optimizer.param_groups[0]['lr']) 398 | logger.log_tabular('Time', time.time()-start_time) 399 | 400 | # Tensorboard logger 401 | writer.add_scalar('AverageEpRet', logger.log_current_row['AverageEpRet'],t) 402 | writer.add_scalar('AverageTestEpRet', logger.log_current_row['AverageTestEpRet'],t) 403 | writer.add_scalar('AverageQ1Vals', logger.log_current_row['AverageQ1Vals'],t) 404 | writer.add_scalar('AverageQ2Vals', logger.log_current_row['AverageQ2Vals'],t) 405 | writer.add_scalar('HuberLossQ', logger.log_current_row['LossQ'],t) 406 | writer.add_scalar('LearningRate', logger.log_current_row['LR'],t) 407 | 408 | logger.dump_tabular() 409 | 410 | 411 | -------------------------------------------------------------------------------- /algos/maTT/evaluation.py: -------------------------------------------------------------------------------- 1 | import datetime, json, os, argparse, time 2 | import pickle, tabulate 3 | import matplotlib 4 | from matplotlib import pyplot as plt 5 | import numpy as np 6 | import os.path as osp 7 | import torch 8 | 9 | __author__ = 'Christopher D Hsu' 10 | __copyright__ = '' 11 | __credits__ = ['Christopher D Hsu'] 12 | __license__ = '' 13 | __version__ = '0.0.1' 14 | __maintainer__ = 'Christopher D Hsu' 15 | __email__ = 'chsu8@seas.upenn.edu' 16 | __status__ = 'Dev' 17 | 18 | def load_pytorch_policy(fpath, fname, model, deterministic=True): 19 | fname = osp.join(fpath,'state_dict/',fname) 20 | map_location = torch.device('cuda') if 
torch.cuda.is_available() else torch.device('cpu') 21 | model.load_state_dict(torch.load(fname, map_location)) 22 | 23 | # make function for producing an action given a single state 24 | def get_action(x, deterministic=True): 25 | with torch.no_grad(): 26 | x = torch.as_tensor(x, dtype=torch.float32).unsqueeze(0) 27 | action = model.act(x, deterministic) 28 | return action 29 | 30 | return get_action 31 | 32 | def eval_set(num_agents, num_targets): 33 | agents = np.linspace(num_agents/2, num_agents, num=3, dtype=int) 34 | targets = np.linspace(num_agents/2, num_targets, num=3, dtype=int) 35 | params_set = [{'nb_agents':1, 'nb_targets':1}, 36 | {'nb_agents':4, 'nb_targets':4}] 37 | for a in agents: 38 | for t in targets: 39 | params_set.append({'nb_agents':a, 'nb_targets':t}) 40 | return params_set 41 | 42 | class Test: 43 | def __init__(self): 44 | pass 45 | 46 | def test(self, args, env, act, torch_threads=1): 47 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 48 | torch.set_num_threads(torch_threads) 49 | 50 | seed = args.seed 51 | env.seed(seed) 52 | torch.manual_seed(seed) 53 | np.random.seed(seed) 54 | 55 | set_eval = eval_set(env.num_agents, env.num_targets) 56 | if args.eval_type == 'random': 57 | params_set = [{}] 58 | elif args.eval_type == 'fixed_nb': 59 | ## Either manually set evaluation set or auto fill 60 | params_set = SET_EVAL_v0 61 | # params_set = set_eval 62 | else: 63 | raise ValueError("Wrong evaluation type for ttenv.") 64 | 65 | timelimit_env = env 66 | while( not hasattr(timelimit_env, '_elapsed_steps')): 67 | timelimit_env = timelimit_env.env 68 | 69 | total_nlogdetcov = [] 70 | total_intruders = [] 71 | for params in params_set: 72 | ep = 0 73 | ep_nlogdetcov = [] #'Episode nLogDetCov' 74 | ep_intruders = [] 75 | time_elapsed = ['Elapsed Time (sec)'] 76 | 77 | while(ep < args.nb_test_eps): # test episode 78 | ep += 1 79 | s_time = time.time() 80 | episode_rew, nlogdetcov, ep_len, intruders = 0, 0, 0, 0 81 | done = {'__all__':False} 82 | obs = env.reset(**params) 83 | 84 | while not done['__all__']: 85 | if args.render: 86 | env.render() 87 | action_dict = {} 88 | for agent_id, o in obs.items(): 89 | action_dict[agent_id] = act(o, deterministic=False) 90 | obs, rew, done, info = env.step(action_dict) 91 | episode_rew += rew['__all__'] 92 | nlogdetcov += info['mean_nlogdetcov'] 93 | ep_len += 1 94 | 95 | time_elapsed.append(time.time() - s_time) 96 | ep_nlogdetcov.append(nlogdetcov) 97 | 98 | if args.render: 99 | print("Ep.%d - Episode reward : %.2f, Episode nLogDetCov : %.2f"%(ep, episode_rew, nlogdetcov)) 100 | if ep % 50 == 0: 101 | print("Ep.%d - Episode reward : %.2f, Episode nLogDetCov : %.2f"%(ep, episode_rew, nlogdetcov)) 102 | 103 | if args.record : 104 | env.moviewriter.finish() 105 | if args.ros_log : 106 | ros_log.save(args.log_dir) 107 | 108 | # Stats 109 | meanofeps = np.mean(ep_nlogdetcov) 110 | total_nlogdetcov.append(meanofeps) 111 | # Eval plots and saves 112 | if args.env == 'setTracking-vGreedy': 113 | eval_dir = os.path.join(os.path.split(args.log_dir)[0], 'greedy_eval_seed%d_'%(seed)+args.map) 114 | else: 115 | eval_dir = os.path.join(os.path.split(args.log_dir)[0], 'eval_seed%d_'%(seed)+args.map) 116 | model_seed = os.path.split(args.log_dir)[-1] 117 | # eval_dir = os.path.join(args.log_dir, 'eval_seed%d_'%(seed)+args.map) 118 | # model_seed = os.path.split(args.log_fname)[0] 119 | if not os.path.exists(eval_dir): 120 | os.makedirs(eval_dir) 121 | matplotlib.use('Agg') 122 | f0, ax0 = plt.subplots() 123 | 
_ = ax0.plot(ep_nlogdetcov, '.') 124 | _ = ax0.set_title(args.env) 125 | _ = ax0.set_xlabel('episode number') 126 | _ = ax0.set_ylabel('mean nlogdetcov') 127 | _ = ax0.axhline(y=meanofeps, color='r', linestyle='-', label='mean over episodes: %.2f'%(meanofeps)) 128 | _ = ax0.legend() 129 | _ = ax0.grid() 130 | _ = f0.savefig(os.path.join(eval_dir, "%da%dt_%d_eval_"%(env.nb_agents, env.nb_targets, args.nb_test_eps) 131 | +model_seed+".png")) 132 | plt.close() 133 | pickle.dump(ep_nlogdetcov, open(os.path.join(eval_dir,"%da%dt_%d_eval_"%(env.nb_agents, env.nb_targets, args.nb_test_eps)) 134 | +model_seed+".pkl", 'wb')) 135 | 136 | #Plot over all example episode sets 137 | f1, ax1 = plt.subplots() 138 | _ = ax1.plot(total_nlogdetcov, '.') 139 | _ = ax1.set_title(args.env) 140 | _ = ax1.set_xlabel('example episode set number') 141 | _ = ax1.set_ylabel('mean nlogdetcov over episodes') 142 | _ = ax1.grid() 143 | _ = f1.savefig(os.path.join(eval_dir,'all_%d_eval'%(args.nb_test_eps)+model_seed+'.png')) 144 | plt.close() 145 | pickle.dump(total_nlogdetcov, open(os.path.join(eval_dir,'all_%d_eval'%(args.nb_test_eps))+model_seed+'%da%dt'%(args.nb_agents,args.nb_targets)+'.pkl', 'wb')) 146 | 147 | SET_EVAL_v0 = [ 148 | # {'nb_agents': 1, 'nb_targets': 1}, 149 | # {'nb_agents': 2, 'nb_targets': 1}, 150 | # {'nb_agents': 3, 'nb_targets': 1}, 151 | # {'nb_agents': 4, 'nb_targets': 1}, 152 | # {'nb_agents': 1, 'nb_targets': 2}, 153 | # {'nb_agents': 2, 'nb_targets': 2}, 154 | # {'nb_agents': 3, 'nb_targets': 2}, 155 | # {'nb_agents': 4, 'nb_targets': 2}, 156 | # {'nb_agents': 1, 'nb_targets': 3}, 157 | # {'nb_agents': 2, 'nb_targets': 3}, 158 | # {'nb_agents': 3, 'nb_targets': 3}, 159 | # {'nb_agents': 4, 'nb_targets': 3}, 160 | # {'nb_agents': 1, 'nb_targets': 4}, 161 | # {'nb_agents': 2, 'nb_targets': 4}, 162 | # {'nb_agents': 3, 'nb_targets': 4}, 163 | {'nb_agents': 20, 'nb_targets': 20}, 164 | ] -------------------------------------------------------------------------------- /algos/maTT/evaluation_behavior.py: -------------------------------------------------------------------------------- 1 | import datetime, json, os, argparse, time, pdb 2 | import pickle, tabulate 3 | import matplotlib 4 | from matplotlib import pyplot as plt 5 | import numpy as np 6 | import os.path as osp 7 | import torch 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | __author__ = 'Christopher D Hsu' 11 | __copyright__ = '' 12 | __credits__ = ['Christopher D Hsu'] 13 | __license__ = '' 14 | __version__ = '0.0.1' 15 | __maintainer__ = 'Christopher D Hsu' 16 | __email__ = 'chsu8@seas.upenn.edu' 17 | __status__ = 'Dev' 18 | 19 | def load_pytorch_policy(fpath, fname, model, deterministic=True): 20 | fname = osp.join(fpath,'state_dict/',fname) 21 | map_location = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 22 | model.load_state_dict(torch.load(fname, map_location)) 23 | 24 | # make function for producing an action given a single state 25 | def get_action(x, deterministic=True): 26 | with torch.no_grad(): 27 | x = torch.as_tensor(x, dtype=torch.float32).unsqueeze(0) 28 | action = model.act(x, deterministic) 29 | return action 30 | 31 | return get_action 32 | 33 | 34 | def get_init_pose_list(nb_test_eps, eval_type): 35 | init_pose_list = [] 36 | if eval_type == 'fixed_2': 37 | for ii in range(nb_test_eps): 38 | left = np.random.uniform(20,30) 39 | Lyaxis = np.random.uniform(25,35) 40 | right = np.random.uniform(30,40) 41 | Ryaxis = np.random.uniform(35,45) 42 | 
init_pose_list.append({'agents':[[24.5, 15.5, 1.57], [26.5, 15.5, 1.57]], 43 | 'targets':[[left, Lyaxis, 0, 0],[right, Ryaxis, 0, 0]], 44 | 'belief_targets':[[left, Lyaxis, 0, 0], [right, Ryaxis, 0, 0]]}) 45 | else: 46 | for ii in range(nb_test_eps): 47 | xone = np.random.uniform(20,30) 48 | yone = np.random.uniform(15,25) 49 | xtwo = np.random.uniform(30,40) 50 | ytwo = np.random.uniform(25,35) 51 | xthree = np.random.uniform(20,30) 52 | ythree = np.random.uniform(35,45) 53 | xfour = np.random.uniform(10,20) 54 | yfour = np.random.uniform(35,45) 55 | init_pose_list.append({'agents':[[24.5, 10, 1.57], [26.5, 10, 1.57], 56 | [22.5, 10, 1.57], [28.5, 10, 1.57]], 57 | 'targets':[[xone, yone, 0, 0],[xtwo, ytwo, 0, 0], 58 | [xthree, ythree, 0, 0],[xfour, yfour, 0, 0]], 59 | 'belief_targets':[[xone, yone, 0, 0], [xtwo, ytwo, 0, 0], 60 | [xthree, ythree, 0, 0], [xfour, yfour, 0, 0]]}) 61 | 62 | return init_pose_list 63 | 64 | class TestBehavior: 65 | def __init__(self): 66 | pass 67 | 68 | def test(self, args, env, act, torch_threads=1): 69 | device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 70 | torch.set_num_threads(torch_threads) 71 | 72 | seed = args.seed 73 | env.seed(seed) 74 | torch.manual_seed(seed) 75 | np.random.seed(seed) 76 | 77 | eval_dir = os.path.join(os.path.split(args.log_dir)[0], 'behave_seed%d_'%(seed)+args.map) 78 | model_seed = os.path.split(args.log_dir)[-1] 79 | if not os.path.exists(eval_dir): 80 | os.makedirs(eval_dir) 81 | 82 | behave_dir = os.path.join(os.path.split(args.log_dir)[0], 'behave_seed%d_'%(seed)+args.map) 83 | model_seed = os.path.split(args.log_dir)[-1] 84 | if not os.path.exists(behave_dir): 85 | os.makedirs(behave_dir) 86 | # writer = SummaryWriter(behave_dir) 87 | 88 | if args.eval_type == 'random': 89 | params_set = [{}] 90 | elif args.eval_type == 'fixed_2': 91 | params_set = EVAL_BEHAVIOR_2 92 | tot_eplen = 60 93 | elif args.eval_type == 'fixed_4': 94 | params_set = EVAL_BEHAVIOR_4 95 | tot_eplen = 100 96 | else: 97 | raise ValueError("Wrong evaluation type for ttenv.") 98 | 99 | timelimit_env = env 100 | while( not hasattr(timelimit_env, '_elapsed_steps')): 101 | timelimit_env = timelimit_env.env 102 | 103 | if args.ros_log: 104 | from envs.target_tracking.ros_wrapper import RosLog 105 | ros_log = RosLog(num_targets=args.nb_targets, wrapped_num=args.ros + args.render + args.record + 1) 106 | 107 | init_pose_list = get_init_pose_list(args.nb_test_eps, args.eval_type) 108 | 109 | total_nlogdetcov = [] 110 | for params in params_set: 111 | ep = 0 112 | ep_nlogdetcov = [] #'Episode nLogDetCov' 113 | time_elapsed = ['Elapsed Time (sec)'] 114 | test_observations = np.zeros(args.nb_test_eps) 115 | 116 | while(ep < args.nb_test_eps): # test episode 117 | ep += 1 118 | s_time = time.time() 119 | episode_rew, nlogdetcov, ep_len = 0, 0, 0 120 | done = {'__all__':False} 121 | obs = env.reset(init_pose_list=init_pose_list, **params) 122 | 123 | all_observations = np.zeros(env.nb_targets, dtype=bool) 124 | 125 | 126 | bigq0 = [] 127 | bigq1 = [] 128 | 129 | while ep_len < tot_eplen: 130 | if args.render: 131 | env.render() 132 | if args.ros_log: 133 | ros_log.log(env) 134 | action_dict = {} 135 | q_dict = {} 136 | for agent_id, o in obs.items(): 137 | action_dict[agent_id], q_dict[agent_id] = act(o, deterministic=False) 138 | # record target observations 139 | observed = np.zeros(env.nb_targets, dtype=bool) 140 | all_observations = np.logical_or(all_observations, o[:,5].astype(bool)) 141 | 142 | if all(all_observations) == True: 
143 | test_observations[ep-1] = 1 144 | 145 | obs, rew, done, info = env.step(action_dict) 146 | episode_rew += rew['__all__'] 147 | nlogdetcov += info['mean_nlogdetcov'] 148 | # log q values 149 | rearrange = [0,3,6,9,1,4,7,10,2,5,8,11] 150 | q0 = np.zeros((12)) 151 | q1 = np.zeros((12)) 152 | for ii, val in enumerate(rearrange): 153 | qs0 = q_dict['agent-0'].squeeze(0) 154 | qs1 = q_dict['agent-1'].squeeze(0) 155 | q0[ii] = qs0[val] 156 | q1[ii] = qs1[val] 157 | 158 | bigq0.append(q0) 159 | bigq1.append(q1) 160 | ep_len += 1 161 | 162 | bigq0 = np.asarray(bigq0) 163 | bigq1 = np.asarray(bigq1) 164 | 165 | 166 | time_elapsed.append(time.time() - s_time) 167 | ep_nlogdetcov.append(nlogdetcov) 168 | if args.render: 169 | print("Ep.%d - Episode reward : %.2f, Episode nLogDetCov : %.2f, ep len : %d"%(ep, episode_rew, nlogdetcov, ep_len)) 170 | if ep % 50 == 0: 171 | print("Ep.%d - Episode reward : %.2f, Episode nLogDetCov : %.2f"%(ep, episode_rew, nlogdetcov)) 172 | 173 | if args.record : 174 | env.moviewriter.finish() 175 | if args.ros_log : 176 | ros_log.save(args.log_dir) 177 | 178 | print(test_observations) 179 | print("Cooperation ratio over total evals: %.2f"%(np.sum(test_observations)/args.nb_test_eps)) 180 | 181 | # plot actions for agents in 3d bar graph 182 | f2 = plt.figure() 183 | ax2 = f2.add_subplot(121,projection='3d') 184 | ax3 = f2.add_subplot(122,projection='3d') 185 | 186 | 187 | lx = len(bigq0[0]) 188 | ly = len(bigq0[:,0]) 189 | xpos = np.arange(0,lx,1) 190 | ypos = np.arange(0,ly,1) 191 | xpos, ypos = np.meshgrid(xpos+0.25, ypos+0.25) 192 | 193 | xpos = xpos.flatten() 194 | ypos = ypos.flatten() 195 | zpos = np.zeros(lx*ly) 196 | 197 | dx = 0.5 *np.ones_like(zpos) 198 | dy = dx.copy() 199 | dz0 = bigq0.flatten() 200 | dz1 = bigq1.flatten() 201 | 202 | cs = ['r', 'r', 'r', 'r', 'g', 'g', 'g', 'g','b','b','b','b'] * ly 203 | 204 | ax2.bar3d(xpos,ypos,zpos, dx, dy, dz0, color=cs) 205 | ax3.bar3d(xpos,ypos,zpos, dx, dy, dz1, color=cs) 206 | # plt.show() 207 | 208 | EVAL_BEHAVIOR_2 = [ 209 | {'nb_agents': 2, 'nb_targets': 2}, 210 | ] 211 | EVAL_BEHAVIOR_4 = [ 212 | {'nb_agents': 4, 'nb_targets': 4}, 213 | ] 214 | -------------------------------------------------------------------------------- /algos/maTT/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | __author__ = 'Christopher D Hsu' 7 | __copyright__ = '' 8 | __credits__ = ['Christopher D Hsu'] 9 | __license__ = '' 10 | __version__ = '0.0.1' 11 | __maintainer__ = 'Christopher D Hsu' 12 | __email__ = 'chsu8@seas.upenn.edu' 13 | __status__ = 'Dev' 14 | 15 | """ 16 | @InProceedings{lee2019set, 17 | title={Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks}, 18 | author={Lee, Juho and Lee, Yoonho and Kim, Jungtaek and Kosiorek, Adam and Choi, Seungjin and Teh, Yee Whye}, 19 | booktitle={Proceedings of the 36th International Conference on Machine Learning}, 20 | pages={3744--3753}, 21 | year={2019} 22 | } 23 | """ 24 | 25 | class MAB(nn.Module): 26 | def __init__(self, dim_Q, dim_K, dim_V, num_heads, ln=False): 27 | super(MAB, self).__init__() 28 | self.dim_V = dim_V 29 | self.num_heads = num_heads 30 | self.fc_q = nn.Linear(dim_Q, dim_V) 31 | self.fc_k = nn.Linear(dim_K, dim_V) 32 | self.fc_v = nn.Linear(dim_K, dim_V) 33 | if ln: 34 | self.ln0 = nn.LayerNorm(dim_V) 35 | self.ln1 = nn.LayerNorm(dim_V) 36 | self.fc_o = nn.Linear(dim_V, dim_V) 37 | 38 
| def forward(self, Q, K): 39 | Q = self.fc_q(Q) 40 | K, V = self.fc_k(K), self.fc_v(K) 41 | 42 | dim_split = self.dim_V // self.num_heads 43 | Q_ = torch.cat(Q.split(dim_split, 2), 0) 44 | K_ = torch.cat(K.split(dim_split, 2), 0) 45 | V_ = torch.cat(V.split(dim_split, 2), 0) 46 | 47 | A = torch.softmax(Q_.bmm(K_.transpose(1,2))/math.sqrt(self.dim_V), 2) 48 | O = torch.cat((Q_ + A.bmm(V_)).split(Q.size(0), 0), 2) 49 | O = O if getattr(self, 'ln0', None) is None else self.ln0(O) 50 | O = O + F.relu(self.fc_o(O)) 51 | O = O if getattr(self, 'ln1', None) is None else self.ln1(O) 52 | return O 53 | 54 | 55 | class SAB(nn.Module): 56 | def __init__(self, dim_in, dim_out, num_heads, ln=False): 57 | super(SAB, self).__init__() 58 | self.mab = MAB(dim_in, dim_in, dim_out, num_heads, ln=ln) 59 | 60 | def forward(self, X): 61 | return self.mab(X, X) 62 | 63 | 64 | class ISAB(nn.Module): 65 | def __init__(self, dim_in, dim_out, num_heads, num_inds, ln=False): 66 | super(ISAB, self).__init__() 67 | self.I = nn.Parameter(torch.Tensor(1, num_inds, dim_out)) 68 | nn.init.xavier_uniform_(self.I) 69 | self.mab0 = MAB(dim_out, dim_in, dim_out, num_heads, ln=ln) 70 | self.mab1 = MAB(dim_in, dim_out, dim_out, num_heads, ln=ln) 71 | 72 | def forward(self, X): 73 | H = self.mab0(self.I.repeat(X.size(0), 1, 1), X) 74 | return self.mab1(X, H) 75 | 76 | 77 | class PMA(nn.Module): 78 | def __init__(self, dim, num_heads, num_seeds, ln=False): 79 | super(PMA, self).__init__() 80 | self.S = nn.Parameter(torch.Tensor(1, num_seeds, dim)) 81 | nn.init.xavier_uniform_(self.S) 82 | self.mab = MAB(dim, dim, dim, num_heads, ln=ln) 83 | 84 | def forward(self, X): 85 | return self.mab(self.S.repeat(X.size(0), 1, 1), X) -------------------------------------------------------------------------------- /algos/maTT/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import algos.maTT.core as core 4 | 5 | __author__ = 'Christopher D Hsu' 6 | __copyright__ = '' 7 | __credits__ = ['Christopher D Hsu', 'SpinningUp'] 8 | __license__ = '' 9 | __version__ = '0.0.1' 10 | __maintainer__ = 'Christopher D Hsu' 11 | __email__ = 'chsu8@seas.upenn.edu' 12 | __status__ = 'Dev' 13 | 14 | class ReplayBufferSet(object): 15 | def __init__(self, size, obs_dim, act_dim): 16 | """Create Replay buffer. edited to include nb_targets_idx buffer 17 | 18 | Parameters 19 | ---------- 20 | size: int 21 | Max number of transitions to store in the buffer. When the buffer 22 | overflows the old memories are dropped. 
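Note: transitions are kept in separate buckets keyed by the number of targets in the stored observation (obs_t.shape[0] in store()), so each batch drawn by sample_batch() stacks observations of a single set size into a fixed [batch, nb_targets, obs_dim] shape.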
23 | """ 24 | self._storage = {} 25 | self._maxsize = size 26 | self._next_idx = 0 27 | self._obs_dim = obs_dim 28 | self._act_dim = act_dim 29 | 30 | def __len__(self): 31 | return len(self._storage) 32 | 33 | def store(self, obs_t, action, reward, obs_tp1, done):#, nb_targets): 34 | nb_targets = obs_t.shape[0] 35 | try: 36 | len(self._storage[nb_targets]) 37 | except: 38 | self._storage[nb_targets] = [] 39 | 40 | ## Combine into a tuple and append to storage 41 | data = (obs_t, obs_tp1, action, reward, done) 42 | 43 | if self._next_idx >= len(self._storage[nb_targets]): 44 | self._storage[nb_targets].append(data) 45 | else: 46 | self._storage[nb_targets][self._next_idx] = data 47 | 48 | self._next_idx = int((self._next_idx + 1) % self._maxsize) 49 | 50 | def _encode_sample(self, size, idxes, nb_targets): 51 | """Given indexes, return a batch of data 52 | action data is outputted as an column vector 53 | """ 54 | batch = dict(obs=np.zeros((size, nb_targets, self._obs_dim)), 55 | obs2=np.zeros((size, nb_targets, self._obs_dim)), 56 | act=np.zeros((size, 1)), 57 | rew=np.zeros((size)), 58 | done=np.zeros((size))) 59 | 60 | for i, idx in enumerate(idxes): 61 | data = self._storage[nb_targets][idx] 62 | obs_t, obs_tp1, action, reward, done = data 63 | 64 | batch['obs'][i] = obs_t 65 | batch['obs2'][i] = obs_tp1 66 | batch['act'][i] = action 67 | batch['rew'][i] = reward 68 | batch['done'][i] = done 69 | 70 | batch['obs'] = torch.as_tensor(batch['obs'], dtype=torch.float32) 71 | batch['obs2'] = torch.as_tensor(batch['obs2'], dtype=torch.float32) 72 | batch['act'] = torch.as_tensor(batch['act']) 73 | batch['rew'] = torch.as_tensor(batch['rew'], dtype=torch.float32) 74 | batch['done'] = torch.as_tensor(batch['done']) 75 | return batch 76 | 77 | def sample_batch(self, batch_size):#, num_targets): 78 | """Sample a batch of experiences. Randomly sample a batch of experiences all 79 | with the same number of targets. Number of targets is sampled from the range [1,num_targets] 80 | 81 | Parameters 82 | ---------- 83 | batch_size: int 84 | How many transitions to sample. 85 | 86 | Returns 87 | ------- 88 | obs_batch: np.array [batch, nb_targets, obs_dim] 89 | batch of observations 90 | act_batch: np.array [batch, 1] 91 | batch of actions executed given obs_batch 92 | rew_batch: np.array [batch] 93 | rewards received as results of executing act_batch 94 | next_obs_batch: np.array [batch, nb_targets, obs_dim] 95 | next set of observations seen after executing act_batch 96 | done_mask: np.array [batch] 97 | done_mask[i] = 1 if executing act_batch[i] resulted in 98 | the end of an episode and 0 otherwise. 
99 | """ 100 | 101 | # nb_targets = np.random.random_integers(1, num_targets) 102 | nb_targets = np.random.randint(1, len(self._storage)) 103 | idxes = [np.random.randint(0, len(self._storage[nb_targets]) - 1) for _ in range(batch_size)] 104 | return self._encode_sample(batch_size, idxes, nb_targets) -------------------------------------------------------------------------------- /algos/maTT/run_script.py: -------------------------------------------------------------------------------- 1 | import pdb, argparse, os, datetime, json, pickle 2 | import torch 3 | import torch.nn as nn 4 | 5 | import gym 6 | from gym import wrappers 7 | 8 | from algos.maTT.dql import doubleQlearning 9 | import algos.maTT.core as core 10 | 11 | import envs 12 | 13 | __author__ = 'Christopher D Hsu' 14 | __copyright__ = '' 15 | __credits__ = ['Christopher D Hsu'] 16 | __license__ = '' 17 | __version__ = '0.0.1' 18 | __maintainer__ = 'Christopher D Hsu' 19 | __email__ = 'chsu8@seas.upenn.edu' 20 | __status__ = 'Dev' 21 | 22 | 23 | os.environ['OPENBLAS_NUM_THREADS'] = '1' 24 | os.environ['OMP_NUM_THREADS'] = '1' 25 | os.environ['MKL_NUM_THREADS'] = '1' 26 | 27 | BASE_DIR = os.path.dirname('/'.join(str.split(os.path.realpath(__file__),'/')[:-2])) 28 | 29 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 30 | parser.add_argument('--env', help='environment ID', default='setTracking-v0') 31 | parser.add_argument('--map', type=str, default="emptyMed") 32 | parser.add_argument('--nb_agents', type=int, default=4) 33 | parser.add_argument('--nb_targets', type=int, default=4) 34 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 35 | parser.add_argument('--mode', choices=['train', 'test', 'test-behavior'], default='train') 36 | parser.add_argument('--steps_per_epoch', type=int, default=25000) 37 | parser.add_argument('--epochs', type=int, default=20) 38 | parser.add_argument('--batch_size', type=int, default=256) 39 | parser.add_argument('--alpha', type=float, default=0.4) 40 | parser.add_argument('--gamma', type=float, default=.99) 41 | parser.add_argument('--polyak', type=float, default=0.999) #tau in polyak averaging 42 | parser.add_argument('--hiddens', type=int, default=128) 43 | parser.add_argument('--learning_rate', type=float, default=0.001) 44 | parser.add_argument('--learning_rate_period', type=float, default=0.7) #Back half portion with cosine lr schedule 45 | parser.add_argument('--grad_clip', type=int, default=0.2) 46 | parser.add_argument('--start_steps', type=int, default=20000) 47 | parser.add_argument('--update_after', type=int, default=20000) 48 | parser.add_argument('--num_eval_episodes', type=int, default=2) #During training 49 | parser.add_argument('--replay_size', type=int, default=int(1e6)) 50 | parser.add_argument('--max_ep_len', type=int, default=200) 51 | parser.add_argument('--checkpoint_freq', type=int, default=1) 52 | 53 | parser.add_argument('--record',type=int, default=0) 54 | parser.add_argument('--render', type=int, default=0) 55 | parser.add_argument('--nb_test_eps',type=int, default=50) 56 | parser.add_argument('--log_dir', type=str, default='./results/maTT') 57 | parser.add_argument('--log_fname', type=str, default='model.pt') 58 | parser.add_argument('--repeat', type=int, default=1) 59 | parser.add_argument('--eval_type', choices=['random', 'fixed_4', 60 | 'fixed_2', 'fixed_nb'], default='fixed_nb') 61 | 62 | parser.add_argument('--torch_threads', type=int, default=1) 63 | parser.add_argument('--amp', type=int, default=0) 64 | 65 
| args = parser.parse_args() 66 | 67 | 68 | def train(seed, save_dir): 69 | save_dir_0 = os.path.join(save_dir, 'seed_%d'%seed) 70 | 71 | env = envs.make(args.env, 72 | 'ma_target_tracking', 73 | render=bool(args.render), 74 | record=bool(args.record), 75 | directory=save_dir_0, 76 | map_name=args.map, 77 | num_agents=args.nb_agents, 78 | num_targets=args.nb_targets, 79 | is_training=True, 80 | ) 81 | 82 | # Create env function 83 | env_fn = lambda : env 84 | 85 | #Training function 86 | model_kwargs = dict(dim_hidden=args.hiddens) 87 | logger_kwargs = dict(output_dir=save_dir_0, exp_name=save_dir_0) 88 | model = core.DeepSetmodel 89 | 90 | doubleQlearning( 91 | env_fn=env_fn, 92 | model=model, 93 | model_kwargs=model_kwargs, 94 | seed=seed, 95 | steps_per_epoch=args.steps_per_epoch, 96 | epochs=args.epochs, 97 | gamma=args.gamma, 98 | polyak=args.polyak, 99 | lr=args.learning_rate, 100 | lr_period=args.learning_rate_period, 101 | alpha=args.alpha, 102 | grad_clip=args.grad_clip, 103 | batch_size=args.batch_size, 104 | start_steps=args.start_steps, 105 | update_after=args.update_after, 106 | num_test_episodes=args.num_eval_episodes, 107 | replay_size=args.replay_size, 108 | max_ep_len=args.max_ep_len, 109 | logger_kwargs=logger_kwargs, 110 | save_freq=args.checkpoint_freq, 111 | render=bool(args.render), 112 | torch_threads=args.torch_threads, 113 | amp=bool(args.amp) 114 | ) 115 | 116 | def test(seed): 117 | from algos.maTT.evaluation import Test, load_pytorch_policy 118 | 119 | env = envs.make(args.env, 120 | 'ma_target_tracking', 121 | render=bool(args.render), 122 | record=bool(args.record), 123 | directory=args.log_dir, 124 | map_name=args.map, 125 | num_agents=args.nb_agents, 126 | num_targets=args.nb_targets, 127 | is_training=False, 128 | ) 129 | 130 | # Load saved policy 131 | model_kwargs = dict(dim_hidden=args.hiddens) 132 | model = core.DeepSetmodel(env.observation_space, env.action_space, **model_kwargs) 133 | policy = load_pytorch_policy(args.log_dir, args.log_fname, model) 134 | 135 | # Testing environment 136 | Eval = Test() 137 | Eval.test(args, env, policy) 138 | 139 | def testbehavior(seed): 140 | from algos.maTT.evaluation_behavior import TestBehavior, load_pytorch_policy 141 | import algos.maTT.core_behavior as core_behavior 142 | 143 | env = envs.make(args.env, 144 | 'ma_target_tracking', 145 | render=bool(args.render), 146 | record=bool(args.record), 147 | directory=args.log_dir, 148 | map_name=args.map, 149 | num_agents=args.nb_agents, 150 | num_targets=args.nb_targets, 151 | is_training=False, 152 | ) 153 | 154 | # Load saved policy 155 | model_kwargs = dict(dim_hidden=args.hiddens) 156 | model = core_behavior.DeepSetmodel(env.observation_space, env.action_space, **model_kwargs) 157 | policy = load_pytorch_policy(args.log_dir, args.log_fname, model) 158 | 159 | # Testing environment 160 | Eval = TestBehavior() 161 | Eval.test(args, env, policy) 162 | 163 | 164 | if __name__ == '__main__': 165 | if args.mode == 'train': 166 | save_dir = os.path.join(args.log_dir, '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")])) 167 | if not os.path.exists(save_dir): 168 | os.makedirs(save_dir) 169 | else: 170 | ValueError("The directory already exists...", save_dir) 171 | 172 | notes = input("Any notes for this experiment? 
: ") 173 | f = open(os.path.join(save_dir, "notes.txt"), 'w') 174 | f.write(notes) 175 | f.close() 176 | 177 | seed = args.seed 178 | list_records = [] 179 | for _ in range(args.repeat): 180 | print("===== TRAIN A TARGET TRACKING RL AGENT : SEED %d ====="%seed) 181 | results = train(seed, save_dir) 182 | json.dump(vars(args), open(os.path.join(save_dir, "seed_%d"%seed, 'learning_prop.json'), 'w')) 183 | seed += 1 184 | args.seed += 1 185 | 186 | elif args.mode =='test': 187 | test(args.seed) 188 | 189 | elif args.mode =='test-behavior': 190 | testbehavior(args.seed) 191 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: scalableMARL 2 | channels: 3 | - pytorch 4 | dependencies: 5 | - python=3.8 6 | - pip 7 | - numpy 8 | - nvidia::cudatoolkit=11.3 9 | - pytorch::pytorch 10 | - wheel 11 | - h5py 12 | - typing-extensions 13 | - cython #==0.29.0 14 | - numpy #==1.19.0 15 | - psutil ==5.8.0 16 | - tabulate ==0.8.3 17 | - llvmlite 18 | - numba #==0.52.0 19 | - cloudpickle ==1.6.0 20 | - joblib #==0.13.2 21 | - dill ==0.3.3 22 | - scikit-learn #==0.24.0 23 | - scipy #==1.5.4 24 | - pandas #==1.1.5 25 | - requests 26 | - lz4 27 | - pyyaml 28 | - mpi4py 29 | - tensorboard 30 | - seaborn 31 | - pillow 32 | - pip: 33 | - filterpy 34 | - opencv-python 35 | - librosa==0.4.2 36 | - tqdm==4.32.2 37 | - progressbar2==3.53.1 38 | - gym[atari] -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | """ envs/ folder is for openAIgym-like simulation environments 2 | To use, 3 | >>> import envs 4 | >>> env = envs.make("NAME_OF_ENV") 5 | 6 | """ 7 | import gym 8 | 9 | def make(env_name, type, render=False, record=False, directory='', **kwargs): 10 | """ 11 | env_name : str 12 | name of an environment. (e.g. 'Cartpole-v0') 13 | type : str 14 | type of an environment. One of ['atari', 'classic_control', 15 | 'classic_mdp','target_tracking'] 16 | """ 17 | 18 | if type == 'ma_target_tracking': 19 | import envs.maTTenv 20 | env = maTTenv.make(env_name, render=render, record=record, 21 | directory=directory, **kwargs) 22 | 23 | else: 24 | raise ValueError('Designate the right type of the environment.') 25 | 26 | return env 27 | -------------------------------------------------------------------------------- /envs/maTTenv/__init__.py: -------------------------------------------------------------------------------- 1 | from envs.utilities.ma_time_limit import maTimeLimit 2 | 3 | def make(env_name, render=False, figID=0, record=False, directory='', 4 | T_steps=None, num_agents=2, num_targets=1, **kwargs): 5 | """ 6 | env_name : str 7 | name of an environment. (e.g. 'Cartpole-v0') 8 | type : str 9 | type of an environment. 
One of ['atari', 'classic_control', 10 | 'classic_mdp','target_tracking'] 11 | """ 12 | if T_steps is None: 13 | T_steps = 200 14 | 15 | if env_name == 'setTracking-v0': 16 | from envs.maTTenv.env.setTracking_v0 import setTrackingEnv0 17 | env0 = setTrackingEnv0(num_agents=num_agents, num_targets=num_targets, **kwargs) 18 | elif env_name == 'setTracking-vGreedy': 19 | from envs.maTTenv.env.setTracking_vGreedy import setTrackingEnvGreedy 20 | env0 = setTrackingEnvGreedy(num_agents=num_agents, num_targets=num_targets, **kwargs) 21 | elif env_name == 'setTracking-vkGreedy': 22 | from envs.maTTenv.env.setTracking_vkGreedy import setTrackingEnvkGreedy 23 | env0 = setTrackingEnvkGreedy(num_agents=num_agents, num_targets=num_targets, **kwargs) 24 | elif env_name == 'setTracking-model': 25 | from envs.maTTenv.env.setTracking_model import setTrackingEnvModel 26 | env0 = setTrackingEnvModel(num_agents=num_agents, num_targets=num_targets, **kwargs) 27 | elif env_name == 'setTracking-vGru': 28 | from envs.maTTenv.env.setTracking_vGru import setTrackingEnvGru 29 | env0 = setTrackingEnvGru(num_agents=num_agents, num_targets=num_targets, **kwargs) 30 | else: 31 | raise ValueError('No such environment exists.') 32 | 33 | env = maTimeLimit(env0, max_episode_steps=T_steps) 34 | 35 | if render: 36 | from envs.maTTenv.display_wrapper import Display2D 37 | env = Display2D(env, figID=figID) 38 | if record: 39 | from envs.maTTenv.display_wrapper import Video2D 40 | env = Video2D(env, dirname = directory) 41 | 42 | return env -------------------------------------------------------------------------------- /envs/maTTenv/agent_models.py: -------------------------------------------------------------------------------- 1 | """Dynamic Object Models 2 | 3 | AgentDoubleInt2D : Double Integrator Model in 2D 4 | state: x,y,xdot,ydot 5 | AgentSE2 : SE2 Model 6 | state x,y,theta 7 | 8 | Agent2DFixedPath : Model with a pre-defined path 9 | Agent_InfoPlanner : Model from the InfoPlanner repository 10 | 11 | SE2Dynamics : update dynamics function with a control input -- linear, angular velocities 12 | SEDynamicsVel : update dynamics function for contant linear and angular velocities 13 | 14 | edited by christopher-hsu from coco66 for multi_agent 15 | """ 16 | 17 | import numpy as np 18 | from envs.maTTenv.metadata import METADATA 19 | import envs.maTTenv.util as util 20 | 21 | class Agent(object): 22 | def __init__(self, agent_id, dim, sampling_period, limit, collision_func, margin=METADATA['margin']): 23 | self.agent_id = agent_id 24 | self.dim = dim 25 | self.sampling_period = sampling_period 26 | self.limit = limit 27 | self.collision_func = collision_func 28 | self.margin = margin 29 | 30 | def range_check(self): 31 | self.state = np.clip(self.state, self.limit[0], self.limit[1]) 32 | 33 | def collision_check(self, pos): 34 | return self.collision_func(pos[:2]) 35 | 36 | def margin_check(self, pos, target_pos): 37 | return any(np.sqrt(np.sum((pos - target_pos)**2, axis=1)) < self.margin) # no update 38 | 39 | def reset(self, init_state): 40 | self.state = init_state 41 | 42 | class AgentSE2(Agent): 43 | def __init__(self, agent_id, dim, sampling_period, limit, collision_func, 44 | margin=METADATA['margin'], policy=None): 45 | super().__init__(agent_id, dim, sampling_period, limit, collision_func, margin=margin) 46 | self.policy = policy 47 | 48 | def reset(self, init_state): 49 | super().reset(init_state) 50 | if self.policy: 51 | self.policy.reset(init_state) 52 | 53 | def update(self, control_input=None, margin_pos=None, 
col=False): 54 | """ 55 | control_input : [linear_velocity, angular_velocity] 56 | margin_pos : a minimum distance to a target 57 | """ 58 | if control_input is None: 59 | control_input = self.policy.get_control(self.state) 60 | 61 | new_state = SE2Dynamics(self.state, self.sampling_period, control_input) 62 | is_col = 0 63 | if self.collision_check(new_state[:2]): 64 | is_col = 1 65 | new_state[:2] = self.state[:2] 66 | if self.policy is not None: 67 | # self.policy.collision(new_state) 68 | corrected_policy = self.policy.collision(new_state) 69 | if corrected_policy is not None: 70 | new_state = SE2DynamicsVel(self.state, 71 | self.sampling_period, corrected_policy) 72 | 73 | elif margin_pos is not None: 74 | if self.margin_check(new_state[:2], margin_pos): 75 | new_state[:2] = self.state[:2] 76 | 77 | self.state = new_state 78 | self.range_check() 79 | 80 | return is_col 81 | 82 | def SE2Dynamics(x, dt, u): 83 | assert(len(x)==3) 84 | tw = dt * u[1] 85 | # Update the agent state 86 | if abs(tw) < 0.001: 87 | diff = np.array([dt*u[0]*np.cos(x[2]+tw/2), 88 | dt*u[0]*np.sin(x[2]+tw/2), 89 | tw]) 90 | else: 91 | diff = np.array([u[0]/u[1]*(np.sin(x[2]+tw) - np.sin(x[2])), 92 | u[0]/u[1]*(np.cos(x[2]) - np.cos(x[2]+tw)), 93 | tw]) 94 | new_x = x + diff 95 | new_x[2] = util.wrap_around(new_x[2]) 96 | return new_x 97 | 98 | def SE2DynamicsVel(x, dt, u=None): 99 | assert(len(x)==5) # x = [x,y,theta,v,w] 100 | if u is None: 101 | u = x[-2:] 102 | odom = SE2Dynamics(x[:3], dt, u) 103 | return np.concatenate((odom, u)) 104 | 105 | class AgentDoubleInt2D(Agent): 106 | def __init__(self, agent_id, dim, sampling_period, limit, collision_func, 107 | margin=METADATA['margin'], A=None, W=None): 108 | super().__init__(agent_id, dim, sampling_period, limit, collision_func, margin=margin) 109 | self.A = np.eye(self.dim) if A is None else A 110 | self.W = W 111 | 112 | def update(self, margin_pos=None): 113 | new_state = np.matmul(self.A, self.state) 114 | if self.W is not None: 115 | noise_sample = np.random.multivariate_normal(np.zeros(self.dim,), self.W) 116 | new_state += noise_sample 117 | if self.collision_check(new_state[:2]): 118 | new_state = self.collision_control(new_state) 119 | 120 | self.state = new_state 121 | # self.range_check() 122 | 123 | def collision_control(self, new_state): 124 | new_state[0] = self.state[0] 125 | new_state[1] = self.state[1] 126 | if self.dim > 2: 127 | new_state[2] = -2 * .2 * new_state[2] + np.random.normal(0.0, 0.2)#-0.001*np.sign(new_state[2]) 128 | new_state[3] = -2 * .2 * new_state[3] + np.random.normal(0.0, 0.2)#-0.001*np.sign(new_state[3]) 129 | return new_state 130 | 131 | class AgentDoubleInt2D_Avoidance(AgentDoubleInt2D): 132 | def __init__(self, agent_id, dim, sampling_period, limit, collision_func, 133 | margin=METADATA['margin'], A=None, W=None, obs_check_func=None): 134 | super().__init__(agent_id, dim, sampling_period, limit, 135 | collision_func, margin=margin, A=A, W=W) 136 | self.obs_check_func = obs_check_func 137 | 138 | def update(self, margin_pos=None): 139 | new_state = np.matmul(self.A, self.state[:self.dim]) 140 | if self.W is not None: 141 | noise_sample = np.random.multivariate_normal(np.zeros(self.dim,), self.W) 142 | new_state += noise_sample 143 | 144 | is_col = 0 145 | if self.collision_check(new_state[:2]): 146 | new_state = self.collision_control() 147 | is_col = 1 148 | 149 | if self.obs_check_func is not None: 150 | del_vx, del_vy = self.obstacle_detour_maneuver( 151 | r_margin=METADATA['target_speed_limit']*self.sampling_period*2) 
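As a quick check on the `SE2Dynamics` function defined earlier in this file (exact arc integration of the unicycle model, with a straight-line fallback when |dt*w| is tiny), here is a single step with the 0.5 s sampling period the environments use; the numbers are only for illustration.

```python
import numpy as np
from envs.maTTenv.agent_models import SE2Dynamics

x = np.array([0.0, 0.0, 0.0])   # [x, y, theta]
u = (1.0, 0.5)                  # 1 m/s forward, 0.5 rad/s turn
dt = 0.5                        # sampling period used by the envs

# Arc branch: x' = x + v/w*(sin(th+dt*w)-sin(th)), y' = y + v/w*(cos(th)-cos(th+dt*w))
x_next = SE2Dynamics(x, dt, u)  # approx [0.495, 0.062, 0.25]
```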
152 | new_state[2] += del_vx 153 | new_state[3] += del_vy 154 | 155 | self.state = new_state 156 | self.range_check() 157 | return is_col 158 | 159 | def range_check(self): 160 | """ 161 | Limit the position and the velocity. 162 | self.limit[:][2] = self.limit[:][3] = speed limit. The velocity components 163 | are clipped proportional to the original values. 164 | """ 165 | self.state[:2] = np.clip(self.state[:2], self.limit[0][:2], self.limit[1][:2]) 166 | v_square = self.state[2:]**2 167 | del_v = np.sum(v_square) - self.limit[1][2]**2 168 | if del_v > 0.0: 169 | self.state[2] = np.sign(self.state[2]) * np.sqrt(max(0.0, 170 | v_square[0] - del_v * v_square[0] / (v_square[0] + v_square[1]))) 171 | self.state[3] = np.sign(self.state[3]) * np.sqrt(max(0.0, 172 | v_square[1] - del_v * v_square[1] / (v_square[0] + v_square[1]))) 173 | 174 | def collision_control(self): 175 | """ 176 | Assigns a new velocity deviating the agent with an angle (pi/2, pi) from 177 | the closest obstacle point. 178 | """ 179 | odom = [self.state[0], self.state[1], np.arctan2(self.state[3], self.state[2])] 180 | # obs_pos = self.obs_check_func(odom) 181 | v = np.sqrt(np.sum(np.square(self.state[2:]))) + np.random.normal(0.0,1.0) 182 | if obs_pos[1] >= 0: 183 | th = obs_pos[1] - (1 + np.random.random()) * np.pi/2 184 | else: 185 | th = obs_pos[1] + (1 + np.random.random()) * np.pi/2 186 | 187 | state = np.array([self.state[0], self.state[1], v * np.cos(th + odom[2]), v * np.sin(th + odom[2])]) 188 | return state 189 | 190 | def obstacle_detour_maneuver(self, r_margin=1.0): 191 | """ 192 | Returns del_vx, del_vy which will be added to the new state. 193 | This provides a repultive force from the closest obstacle point based 194 | on the current velocity, a linear distance, and an angular distance. 195 | 196 | Parameters: 197 | ---------- 198 | r_margin : float. A margin from an obstalce that you want to consider 199 | as the minimum distance the target can get close to the obstacle. 200 | """ 201 | odom = [self.state[0], self.state[1], np.arctan2(self.state[3], self.state[2])] 202 | # obs_pos = self.obs_check_func(odom) 203 | speed = np.sqrt(np.sum(self.state[2:]**2)) 204 | rot_ang = np.pi/2 * (1. + 1./(1. + np.exp(-(speed-0.5*METADATA['target_speed_limit'])))) 205 | if obs_pos is not None: 206 | acc = max(0.0, speed * np.cos(obs_pos[1])) / max(METADATA['margin2wall'], obs_pos[0] - r_margin) 207 | th = obs_pos[1] - rot_ang if obs_pos[1] >= 0 else obs_pos[1] + rot_ang 208 | del_vx = acc * np.cos(th + odom[2]) * self.sampling_period 209 | del_vy = acc * np.sin(th + odom[2]) * self.sampling_period 210 | return del_vx, del_vy 211 | else: 212 | return 0., 0. 
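The `range_check` above caps the target speed by shrinking each velocity component in proportion to its share of the squared speed, which works out to rescaling the velocity vector onto the speed limit. A small numeric check of that equivalence; the values are arbitrary.

```python
import numpy as np

v = np.array([3.0, 4.0])   # current velocity, speed 5.0
v_limit = 2.0              # speed limit, i.e. self.limit[1][2]

# proportional-shrink form used in range_check (only applied when del_v > 0)
v_sq = v**2
del_v = v_sq.sum() - v_limit**2
clipped = np.sign(v) * np.sqrt(np.maximum(0.0, v_sq - del_v * v_sq / v_sq.sum()))

# equivalent direct rescale onto the limit
rescaled = v * v_limit / np.linalg.norm(v)

assert np.allclose(clipped, rescaled)   # both give [1.2, 1.6]
```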
213 | 214 | 215 | class Agent2DFixedPath(Agent): 216 | def __init__(self, dim, sampling_period, limit, collision_func, path, margin=METADATA['margin']): 217 | Agent.__init__(self, dim, sampling_period, limit, collision_func, margin=margin) 218 | self.path = path 219 | 220 | def update(self, margin_pos=None): 221 | # fixed policy for now 222 | self.t += 1 223 | new_state = np.concatenate((self.path[self.t][:2], self.path[self.t][-2:])) 224 | if margin_pos is not None: 225 | if type(margin_pos) != list: 226 | margin_pos = [margin_pos] 227 | in_margin = False 228 | while(True): 229 | in_margin = self.margin_check(new_state[:2], margin_pos) 230 | if in_margin: 231 | new_state[:2] = new_state[:2] + 0.01*(np.random.random((2,))-0.5) 232 | else: 233 | break 234 | self.state = new_state 235 | 236 | def reset(self, init_state): 237 | self.t = 0 238 | self.state = np.concatenate((self.path[self.t][:2], self.path[self.t][-2:])) 239 | 240 | 241 | class AgentDoubleInt2D_Nonlinear(AgentDoubleInt2D): 242 | def __init__(self, dim, sampling_period, limit, collision_func, 243 | margin=METADATA['margin'], A=None, W=None, obs_check_func=None): 244 | AgentDoubleInt2D.__init__(self, dim, sampling_period, limit, 245 | collision_func, margin=margin, A=A, W=W) 246 | self.obs_check_func = obs_check_func 247 | 248 | def update(self, margin_pos=None): 249 | new_state = np.matmul(self.A, self.state[:self.dim]) 250 | if self.W is not None: 251 | noise_sample = np.random.multivariate_normal(np.zeros(self.dim,), self.W) 252 | new_state += noise_sample 253 | 254 | is_col = 0 255 | if self.collision_check(new_state[:2]): 256 | new_state = self.collision_control() 257 | is_col = 1 258 | 259 | if self.obs_check_func is not None: 260 | del_vx, del_vy = self.obstacle_detour_maneuver( 261 | r_margin=METADATA['target_speed_limit']*self.sampling_period*2) 262 | new_state[2] += del_vx 263 | new_state[3] += del_vy 264 | 265 | self.state = new_state 266 | self.range_check() 267 | return is_col 268 | 269 | def range_check(self): 270 | """ 271 | Limit the position and the velocity. 272 | self.limit[:][2] = self.limit[:][3] = speed limit. The velocity components 273 | are clipped proportional to the original values. 274 | """ 275 | self.state[:2] = np.clip(self.state[:2], self.limit[0][:2], self.limit[1][:2]) 276 | v_square = self.state[2:]**2 277 | del_v = np.sum(v_square) - self.limit[1][2]**2 278 | if del_v > 0.0: 279 | self.state[2] = np.sign(self.state[2]) * np.sqrt(max(0.0, 280 | v_square[0] - del_v * v_square[0] / (v_square[0] + v_square[1]))) 281 | self.state[3] = np.sign(self.state[3]) * np.sqrt(max(0.0, 282 | v_square[1] - del_v * v_square[1] / (v_square[0] + v_square[1]))) 283 | 284 | def collision_control(self): 285 | """ 286 | Assigns a new velocity deviating the agent with an angle (pi/2, pi) from 287 | the closest obstacle point. 288 | """ 289 | odom = [self.state[0], self.state[1], np.arctan2(self.state[3], self.state[2])] 290 | obs_pos = self.obs_check_func(odom) 291 | v = np.sqrt(np.sum(np.square(self.state[2:]))) + np.random.normal(0.0,1.0) 292 | if obs_pos[1] >= 0: 293 | th = obs_pos[1] - (1 + np.random.random()) * np.pi/2 294 | else: 295 | th = obs_pos[1] + (1 + np.random.random()) * np.pi/2 296 | 297 | state = np.array([self.state[0], self.state[1], v * np.cos(th + odom[2]), v * np.sin(th + odom[2])]) 298 | return state 299 | 300 | def obstacle_detour_maneuver(self, r_margin=1.0): 301 | """ 302 | Returns del_vx, del_vy which will be added to the new state. 
303 | This provides a repultive force from the closest obstacle point based 304 | on the current velocity, a linear distance, and an angular distance. 305 | 306 | Parameters: 307 | ---------- 308 | r_margin : float. A margin from an obstalce that you want to consider 309 | as the minimum distance the target can get close to the obstacle. 310 | """ 311 | odom = [self.state[0], self.state[1], np.arctan2(self.state[3], self.state[2])] 312 | obs_pos = self.obs_check_func(odom) 313 | speed = np.sqrt(np.sum(self.state[2:]**2)) 314 | rot_ang = np.pi/2 * (1. + 1./(1. + np.exp(-(speed-0.5*METADATA['target_speed_limit'])))) 315 | if obs_pos is not None: 316 | acc = max(0.0, speed * np.cos(obs_pos[1])) / max(METADATA['margin2wall'], obs_pos[0] - r_margin) 317 | th = obs_pos[1] - rot_ang if obs_pos[1] >= 0 else obs_pos[1] + rot_ang 318 | del_vx = acc * np.cos(th + odom[2]) * self.sampling_period 319 | del_vy = acc * np.sin(th + odom[2]) * self.sampling_period 320 | return del_vx, del_vy 321 | else: 322 | return 0., 0. -------------------------------------------------------------------------------- /envs/maTTenv/belief_tracker.py: -------------------------------------------------------------------------------- 1 | """Belief Trackers 2 | 3 | KFbelief : Belief Update using Kalman Filter 4 | UKFbelief : Belief Update using Unscented Kalman Filter using filterpy library 5 | """ 6 | import numpy as np 7 | from numpy import linalg as LA 8 | import envs.maTTenv.util as util 9 | 10 | from filterpy.kalman import JulierSigmaPoints, UnscentedKalmanFilter, ExtendedKalmanFilter 11 | 12 | class KFbelief(object): 13 | """ 14 | Kalman Filter for the target tracking problem. 15 | 16 | state : target state 17 | x : agent state 18 | z : observation (r, alpha) 19 | """ 20 | def __init__(self, agent_id, dim, limit, dim_z=2, A=None, W=None, 21 | obs_noise_func=None, collision_func=None): 22 | """ 23 | dim : dimension of state 24 | limit : An array of two vectors. 25 | limit[0] = minimum values for the state, 26 | limit[1] = maximum value for the state 27 | dim_z : dimension of observation, 28 | A : state transition matrix 29 | W : state noise matrix 30 | obs_noise_func : observation noise matrix function of z 31 | collision_func : collision checking function 32 | """ 33 | self.agent_id = agent_id 34 | self.dim = dim 35 | self.limit = limit 36 | self.A = np.eye(self.dim) if A is None else A 37 | self.W = W if W is not None else np.zeros((self.dim, self.dim)) 38 | self.obs_noise_func = obs_noise_func 39 | self.collision_func = collision_func 40 | self.cov = np.eye(self.dim) 41 | 42 | def reset(self, init_state, init_cov): 43 | self.state = init_state 44 | self.cov = init_cov*np.eye(self.dim) 45 | 46 | def predict(self): 47 | # Prediction 48 | state_new = np.matmul(self.A, self.state) 49 | cov_new = np.matmul(np.matmul(self.A, self.cov), self.A.T) + self.W 50 | if True: # LA.det(cov_new) < 1e6: 51 | self.cov = cov_new 52 | self.state = np.clip(state_new, self.limit[0], self.limit[1]) 53 | 54 | def update(self, z_t, x_t): 55 | """ 56 | Parameters 57 | -------- 58 | z_t : observation - radial and angular distances from the agent. 59 | x_t : agent state (x, y, orientation) in the global frame. 
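Before the update math below, it may help to see how the environments drive a `KFbelief` each step: predict every step, correct only when the sensor actually observed the target. A minimal sketch; the arguments are placeholders for values produced by `env.observation()`.

```python
import numpy as np

def belief_step(belief, observed, z, agent_state):
    """One tracking step for a single target belief, mirroring the env loop.

    belief      : a KFbelief instance (see class above)
    observed    : bool, whether the sensor saw the target this step
    z           : np.array([r, alpha]) range/bearing measurement, agent frame
    agent_state : np.array([x, y, theta]) of the observing agent
    """
    belief.predict()                    # propagate mean and covariance through A and W
    if observed:
        belief.update(z, agent_state)   # range/bearing correction
    # log-determinant of the covariance is the uncertainty term fed to the policy
    return np.log(np.linalg.det(belief.cov))
```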
60 | """ 61 | # Kalman Filter Update 62 | r_pred, alpha_pred = util.relative_distance_polar( 63 | self.state[:2], x_t[:2], x_t[2]) 64 | diff_pred = self.state[:2] - x_t[:2] 65 | if self.dim == 2: 66 | Hmat = np.array([[diff_pred[0],diff_pred[1]], 67 | [-diff_pred[1]/r_pred, diff_pred[0]/r_pred]])/r_pred 68 | elif self.dim == 4: 69 | Hmat = np.array([[diff_pred[0], diff_pred[1], 0.0, 0.0], 70 | [-diff_pred[1]/r_pred, diff_pred[0]/r_pred, 0.0, 0.0]])/r_pred 71 | else: 72 | raise ValueError('target dimension for KF must be either 2 or 4') 73 | innov = z_t - np.array([r_pred, alpha_pred]) 74 | innov[1] = util.wrap_around(innov[1]) 75 | 76 | R = np.matmul(np.matmul(Hmat, self.cov), Hmat.T) \ 77 | + self.obs_noise_func((r_pred, alpha_pred)) 78 | K = np.matmul(np.matmul(self.cov, Hmat.T), LA.inv(R)) 79 | C = np.eye(self.dim) - np.matmul(K, Hmat) 80 | 81 | cov_new = np.matmul(C, self.cov) 82 | state_new = self.state + np.matmul(K, innov) 83 | 84 | if True: #LA.det(cov_new) < 1e6: 85 | self.cov = cov_new 86 | self.state = np.clip(state_new, self.limit[0], self.limit[1]) 87 | 88 | 89 | class UKFbelief(object): 90 | """ 91 | Unscented Kalman Filter from filterpy 92 | """ 93 | def __init__(self, dim, limit, dim_z=2, fx=None, W=None, obs_noise_func=None, 94 | collision_func=None, sampling_period=0.5, kappa=1): 95 | """ 96 | dim : dimension of state 97 | ***Assuming dim==3: (x,y,theta), dim==4: (x,y,xdot,ydot), dim==5: (x,y,theta,v,w) 98 | limit : An array of two vectors. limit[0] = minimum values for the state, 99 | limit[1] = maximum value for the state 100 | dim_z : dimension of observation, 101 | fx : x_tp1 = fx(x_t, dt), state dynamic function 102 | W : state noise matrix 103 | obs_noise_func : observation noise matrix function of z 104 | collision_func : collision checking function 105 | n : the number of sigma points 106 | """ 107 | self.dim = dim 108 | self.limit = limit 109 | self.W = W if W is not None else np.zeros((self.dim, self.dim)) 110 | self.obs_noise_func = obs_noise_func 111 | self.collision_func = collision_func 112 | 113 | def hx(y, agent_state, measure_func=util.relative_distance_polar): 114 | r_pred, alpha_pred = measure_func(y[:2], agent_state[:2], 115 | agent_state[2]) 116 | return np.array([r_pred, alpha_pred]) 117 | 118 | def x_mean_fn_(sigmas, Wm): 119 | if dim == 3: 120 | x = np.zeros(dim) 121 | sum_sin, sum_cos = 0., 0. 122 | for i in range(len(sigmas)): 123 | s = sigmas[i] 124 | x[0] += s[0] * Wm[i] 125 | x[1] += s[1] * Wm[i] 126 | sum_sin += np.sin(s[2])*Wm[i] 127 | sum_cos += np.cos(s[2])*Wm[i] 128 | x[2] = np.arctan2(sum_sin, sum_cos) 129 | return x 130 | elif dim == 5: 131 | x = np.zeros(dim) 132 | sum_sin, sum_cos = 0., 0. 133 | for i in range(len(sigmas)): 134 | s = sigmas[i] 135 | x[0] += s[0] * Wm[i] 136 | x[1] += s[1] * Wm[i] 137 | x[3] += s[3] * Wm[i] 138 | x[4] += s[4] * Wm[i] 139 | sum_sin += np.sin(s[2])*Wm[i] 140 | sum_cos += np.cos(s[2])*Wm[i] 141 | x[2] = np.arctan2(sum_sin, sum_cos) 142 | return x 143 | else: 144 | return None 145 | 146 | def z_mean_fn_(sigmas, Wm): 147 | x = np.zeros(dim_z) 148 | sum_sin, sum_cos = 0., 0. 
149 | for i in range(len(sigmas)): 150 | s = sigmas[i] 151 | x[0] += s[0] * Wm[i] 152 | sum_sin += np.sin(s[1])*Wm[i] 153 | sum_cos += np.cos(s[1])*Wm[i] 154 | x[1] = np.arctan2(sum_sin, sum_cos) 155 | return x 156 | 157 | def residual_x_(x, xp): 158 | """ 159 | x : state, [x, y, theta] 160 | xp : predicted state 161 | """ 162 | if dim == 3 or dim == 5: 163 | r_x = x - xp 164 | r_x[2] = util.wrap_around(r_x[2]) 165 | return r_x 166 | else: 167 | return None 168 | 169 | def residual_z_(z, zp): 170 | """ 171 | z : observation, [r, alpha] 172 | zp : predicted observation 173 | """ 174 | r_z = z - zp 175 | r_z[1] = util.wrap_around(r_z[1]) 176 | return r_z 177 | 178 | sigmas = JulierSigmaPoints(n=dim, kappa=kappa) 179 | 180 | self.ukf = UnscentedKalmanFilter(dim, dim_z, sampling_period, fx=fx, 181 | hx=hx, points=sigmas, x_mean_fn=x_mean_fn_, 182 | z_mean_fn=z_mean_fn_, residual_x=residual_x_, 183 | residual_z=residual_z_) 184 | 185 | def reset(self, init_state, init_cov): 186 | self.state = init_state 187 | self.cov = init_cov*np.eye(self.dim) 188 | self.ukf.x = self.state 189 | self.ukf.P = self.cov 190 | self.ukf.Q = self.W # process noise matrix 191 | 192 | def update(self, observed, z_t, x_t, u_t=None): 193 | # Kalman Filter Update 194 | self.ukf.predict(u=u_t) 195 | 196 | if observed: 197 | r_pred, alpha_pred = util.relative_distance_polar(self.ukf.x[:2], x_t[:2], x_t[2]) 198 | self.ukf.update(z_t, R=self.obs_noise_func((r_pred, alpha_pred)), 199 | agent_state=x_t) 200 | 201 | cov_new = self.ukf.P 202 | state_new = self.ukf.x 203 | 204 | if LA.det(cov_new) < 1e6: 205 | self.cov = cov_new 206 | if not(self.collision_func(state_new[:2])): 207 | self.state = np.clip(state_new, self.limit[0], self.limit[1]) 208 | -------------------------------------------------------------------------------- /envs/maTTenv/display_wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | edited by christopher-hsu from coco66 for multi_agent 3 | """ 4 | from gym import Wrapper 5 | import numpy as np 6 | from numpy import linalg as LA 7 | 8 | import pdb, os 9 | import matplotlib 10 | matplotlib.use('TkAgg') 11 | from matplotlib import pyplot as plt 12 | from matplotlib import patches 13 | from matplotlib import animation 14 | from envs.maTTenv.metadata import * 15 | 16 | class Display2D(Wrapper): 17 | def __init__(self, env, figID = 0, skip = 1, confidence=0.95): 18 | super(Display2D,self).__init__(env) 19 | self.figID = figID # figID = 0 : train, figID = 1 : test 20 | """ If used in run_maTracking to debug: self.env_core = env 21 | if used in visualize_ma for normal use: self.env_core = env.env""" 22 | self.env_core = env.env 23 | self.bin = self.env_core.MAP.mapres 24 | if self.env_core.MAP.map is None: 25 | self.map = np.zeros(self.env_core.MAP.mapdim) 26 | else: 27 | self.map = self.env_core.MAP.map 28 | self.mapmin = self.env_core.MAP.mapmin 29 | self.mapmax = self.env_core.MAP.mapmax 30 | self.mapres = self.env_core.MAP.mapres 31 | self.fig = plt.figure(self.figID) 32 | self.n_frames = 0 33 | self.skip = skip 34 | self.c_cf = np.sqrt(-2*np.log(1-confidence)) 35 | 36 | def pos2map(self, obs, sd): 37 | x = obs[:,0] 38 | y = obs[:,1] 39 | 40 | def close(self): 41 | plt.close(self.fig) 42 | 43 | def render(self, mode='empty', record=False, traj_num=0, batch_outputs=None): 44 | if not hasattr(self, 'traj'): 45 | raise ValueError('Must do a env.reset() first before calling env.render()') 46 | 47 | # num_agents = len(self.traj) 48 | num_agents = 
self.env_core.nb_agents 49 | if type(self.env_core.agents) == list: 50 | agent_pos = [self.env_core.agents[i].state for i in range(num_agents)] 51 | else: 52 | agent_pos = self.env_core.agents.state 53 | 54 | # num_targets = len(self.traj_y) 55 | num_targets = self.env_core.nb_targets 56 | if type(self.env_core.targets) == list: 57 | target_true_pos = [self.env_core.targets[i].state[:2] for i in range(num_targets)] 58 | target_b_state = [self.env_core.belief_targets[i].state for i in range(num_targets)] # state[3:5] 59 | target_cov = [self.env_core.belief_targets[i].cov for i in range(num_targets)] 60 | else: 61 | target_true_pos = self.env_core.targets.state[:,:2] 62 | target_b_state = self.env_core.belief_targets.state[:,:2] # state[3:5] 63 | target_cov = self.env_core.belief_targets.cov 64 | 65 | if self.n_frames%self.skip == 0: 66 | self.fig.clf() 67 | ax = self.fig.subplots() 68 | im = None 69 | if mode == 'empty': 70 | im = ax.imshow(self.map, cmap='gray_r', origin='lower', 71 | extent=[self.mapmin[0], self.mapmax[0], self.mapmin[1], self.mapmax[1]]) 72 | 73 | for ii in range(num_agents): 74 | #agents positions 75 | ax.plot(agent_pos[ii][0], agent_pos[ii][1], marker=(3, 0, agent_pos[ii][2]/np.pi*180-90), 76 | markersize=10, linestyle='None', markerfacecolor='b', markeredgecolor='b') 77 | ax.plot(self.traj[ii][0], self.traj[ii][1], 'b.', markersize=2) 78 | #agents sensor indicators 79 | sensor_arc = patches.Arc((agent_pos[ii][0], agent_pos[ii][1]), METADATA['sensor_r']*2, METADATA['sensor_r']*2, 80 | angle = agent_pos[ii][2]/np.pi*180, theta1 = -METADATA['fov']/2, theta2 = METADATA['fov']/2, facecolor='gray') 81 | ax.add_patch(sensor_arc) 82 | ax.plot([agent_pos[ii][0], agent_pos[ii][0]+METADATA['sensor_r']*np.cos(agent_pos[ii][2]+0.5*METADATA['fov']/180.0*np.pi)], 83 | [agent_pos[ii][1], agent_pos[ii][1]+METADATA['sensor_r']*np.sin(agent_pos[ii][2]+0.5*METADATA['fov']/180.0*np.pi)],'k', linewidth=0.5) 84 | ax.plot([agent_pos[ii][0], agent_pos[ii][0]+METADATA['sensor_r']*np.cos(agent_pos[ii][2]-0.5*METADATA['fov']/180.0*np.pi)], 85 | [agent_pos[ii][1], agent_pos[ii][1]+METADATA['sensor_r']*np.sin(agent_pos[ii][2]-0.5*METADATA['fov']/180.0*np.pi)],'k', linewidth=0.5) 86 | self.traj[ii][0].append(agent_pos[ii][0]) 87 | self.traj[ii][1].append(agent_pos[ii][1]) 88 | 89 | for jj in range(num_targets): 90 | ax.plot(self.traj_y[jj][0], self.traj_y[jj][1], 'r.', markersize=2) 91 | ax.plot(target_true_pos[jj][0], target_true_pos[jj][1], marker='o', markersize=5, 92 | linestyle='None', markerfacecolor='r', markeredgecolor='r') 93 | # Belief on target 94 | ax.plot(target_b_state[jj][0], target_b_state[jj][1], marker='o', markersize=10, 95 | linewidth=5, markerfacecolor='none', markeredgecolor='g') 96 | eig_val, eig_vec = LA.eig(target_cov[jj][:2,:2]) 97 | belief_target = patches.Ellipse((target_b_state[jj][0], target_b_state[jj][1]), 98 | 2*np.sqrt(eig_val[0])*self.c_cf, 2*np.sqrt(eig_val[1])*self.c_cf, 99 | angle = 180/np.pi*np.arctan2(eig_vec[0][1],eig_vec[0][0]) ,fill=True, 100 | zorder=2, facecolor='g', alpha=0.5) 101 | ax.add_patch(belief_target) 102 | self.traj_y[jj][0].append(target_true_pos[jj][0]) 103 | self.traj_y[jj][1].append(target_true_pos[jj][1]) 104 | 105 | ax.set_xlim((self.mapmin[0], self.mapmax[0])) 106 | ax.set_ylim((self.mapmin[1], self.mapmax[1])) 107 | ax.set_aspect('equal','box') 108 | ax.grid() 109 | ax.set_title(' '.join([mode.upper(),': Trajectory',str(traj_num)])) 110 | 111 | if not record : 112 | plt.draw() 113 | plt.pause(0.0005) 114 | self.n_frames += 1 115 | 116 
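The belief ellipse drawn in `render` comes from an eigendecomposition of the 2x2 position covariance, scaled by c_cf = sqrt(-2*ln(1-confidence)) to get a confidence region. A standalone sketch of that computation, with the matplotlib patch construction left out; the example covariance is made up.

```python
import numpy as np
from numpy import linalg as LA

def belief_ellipse(cov_xy, confidence=0.95):
    """Width, height, and angle (deg) of the confidence ellipse for a 2x2 covariance."""
    c_cf = np.sqrt(-2.0 * np.log(1.0 - confidence))   # ~2.45 for 95%
    eig_val, eig_vec = LA.eig(cov_xy)
    width = 2.0 * np.sqrt(eig_val[0]) * c_cf
    height = 2.0 * np.sqrt(eig_val[1]) * c_cf
    angle = np.degrees(np.arctan2(eig_vec[0][1], eig_vec[0][0]))
    return width, height, angle

print(belief_ellipse(np.array([[4.0, 0.0], [0.0, 1.0]])))   # approx (9.79, 4.90, 0.0)
```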
| def reset(self, **kwargs): 117 | self.traj = [[[],[]]]*self.env_core.num_agents 118 | self.traj_y = [[[],[]]]*self.env_core.num_targets 119 | return self.env.reset(**kwargs) 120 | 121 | class Video2D(Wrapper): 122 | def __init__(self, env, dirname = '', skip = 1, dpi=80, local_view=False): 123 | super(Video2D, self).__init__(env) 124 | self.local_view = local_view 125 | self.skip = skip 126 | self.moviewriter = animation.FFMpegWriter() 127 | fname = os.path.join(dirname, 'eval_%da%dt.mp4'%(env.nb_agents,env.nb_targets)) 128 | self.moviewriter.setup(fig=env.fig, outfile=fname, dpi=dpi) 129 | if self.local_view: 130 | self.moviewriter0 = animation.FFMpegWriter() 131 | fname0 = os.path.join(dirname, 'train_local_%d.mp4'%np.random.randint(0,20)) 132 | self.moviewriter0.setup(fig=env.fig0, outfile=fname0, dpi=dpi) 133 | self.n_frames = 0 134 | 135 | def render(self, *args, **kwargs): 136 | if self.n_frames % self.skip == 0: 137 | #if traj_num % self.skip == 0: 138 | self.env.render(record=True, *args, **kwargs) 139 | self.moviewriter.grab_frame() 140 | if self.local_view: 141 | self.moviewriter0.grab_frame() 142 | self.n_frames += 1 143 | 144 | def reset(self, **kwargs): 145 | return self.env.reset(**kwargs) 146 | 147 | -------------------------------------------------------------------------------- /envs/maTTenv/env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/env/__init__.py -------------------------------------------------------------------------------- /envs/maTTenv/env/maTracking_Base.py: -------------------------------------------------------------------------------- 1 | import os, copy, pdb 2 | import numpy as np 3 | from numpy import linalg as LA 4 | import gym 5 | from gym import spaces, logger 6 | from gym.utils import seeding 7 | from envs.maTTenv.maps import map_utils 8 | import envs.maTTenv.util as util 9 | from envs.maTTenv.metadata import METADATA 10 | 11 | 12 | class maTrackingBase(gym.Env): 13 | def __init__(self, num_agents=2, num_targets=1, map_name='empty', 14 | is_training=True, known_noise=True, **kwargs): 15 | self.seed() #used with gym 16 | self.id = 'maTracking-base' 17 | self.action_space = spaces.Discrete(len(METADATA['action_v']) * \ 18 | len(METADATA['action_w'])) 19 | self.action_map = {} 20 | for (i,v) in enumerate(METADATA['action_v']): 21 | for (j,w) in enumerate(METADATA['action_w']): 22 | self.action_map[len(METADATA['action_w'])*i+j] = (v,w) 23 | assert(len(self.action_map.keys())==self.action_space.n) 24 | 25 | self.agent_dim = 3 26 | self.target_dim = 2 27 | self.num_agents = num_agents 28 | self.nb_agents = num_agents 29 | self.num_targets = num_targets 30 | self.nb_targets = num_targets 31 | self.viewer = None 32 | self.is_training = is_training 33 | 34 | self.sampling_period = 0.5 # sec 35 | self.sensor_r_sd = METADATA['sensor_r_sd'] 36 | self.sensor_b_sd = METADATA['sensor_b_sd'] 37 | self.sensor_r = METADATA['sensor_r'] 38 | self.fov = METADATA['fov'] 39 | map_dir_path = '/'.join(map_utils.__file__.split('/')[:-1]) 40 | self.MAP = map_utils.GridMap( 41 | map_path=os.path.join(map_dir_path, map_name), 42 | r_max = self.sensor_r, fov = self.fov/180.0*np.pi, 43 | margin2wall = METADATA['margin2wall']) 44 | 45 | self.agent_init_pos = np.array([self.MAP.origin[0], self.MAP.origin[1], 0.0]) 46 | self.target_init_pos = np.array(self.MAP.origin) 47 | self.target_init_cov = METADATA['target_init_cov'] 
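The discrete action space built in `__init__` above is the cross product of the METADATA linear and angular velocity choices, so each integer action decodes to a (v, w) pair. A small sketch of that indexing; the velocity sets here are hypothetical, the real ones live in `envs/maTTenv/metadata.py`.

```python
import numpy as np

action_v = [3.0, 2.0, 1.0, 0.0]        # hypothetical linear velocities (m/s)
action_w = [np.pi/2, 0.0, -np.pi/2]    # hypothetical angular velocities (rad/s)

action_map = {}
for i, v in enumerate(action_v):
    for j, w in enumerate(action_w):
        action_map[len(action_w) * i + j] = (v, w)

# 12 discrete actions; e.g. action 0 -> (3.0, pi/2), action 4 -> (2.0, 0.0)
assert len(action_map) == len(action_v) * len(action_w)
```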
48 | 49 | self.reset_num = 0 50 | #needed for gym/core.py wrappers 51 | self.metadata = {'render.modes': []} 52 | self.reward_range = (-float('inf'), float('inf')) 53 | self.spec = None 54 | 55 | def seed(self, seed=None): 56 | '''EXTREMELY IMPORTANT for reproducability in env randomness 57 | RNG is set and every call of the function will produce the same results 58 | ''' 59 | self.np_random, seed = seeding.np_random(seed) 60 | return [seed] 61 | 62 | def setup_agents(self): 63 | """Construct all the agents for the environment""" 64 | raise NotImplementedError 65 | 66 | def setup_targets(self): 67 | """Construct all the targets for the environment""" 68 | raise NotImplementedError 69 | 70 | def setup_belief_targets(self): 71 | """Construct all the target beliefs for the environment""" 72 | raise NotImplementedError 73 | 74 | def reset(self, init_random=True): 75 | """Reset the state of the environment.""" 76 | raise NotImplementedError 77 | 78 | def step(self, action_dict): 79 | """Takes in dict of action and coverts them to map updates (obs, rewards)""" 80 | raise NotImplementedError 81 | 82 | def observation(self, target, agent): 83 | r, alpha = util.relative_distance_polar(target.state[:2], 84 | xy_base=agent.state[:2], 85 | theta_base=agent.state[2]) 86 | observed = (r <= self.sensor_r) \ 87 | & (abs(alpha) <= self.fov/2/180*np.pi) \ 88 | & (not(map_utils.is_blocked(self.MAP, agent.state, target.state))) 89 | z = None 90 | if observed: 91 | z = np.array([r, alpha]) 92 | # z += np.random.multivariate_normal(np.zeros(2,), self.observation_noise(z)) 93 | z += self.np_random.multivariate_normal(np.zeros(2,), self.observation_noise(z)) 94 | '''For some reason, self.np_random is needed only here instead of np.random in order for the 95 | RNG seed to work, if used in the gen_rand_pose functions RNG seed will NOT work ''' 96 | return observed, z 97 | 98 | def observation_noise(self, z): 99 | obs_noise_cov = np.array([[self.sensor_r_sd * self.sensor_r_sd, 0.0], #z[0]/self.sensor_r * self.sensor_r_sd, 0.0], 100 | [0.0, self.sensor_b_sd * self.sensor_b_sd]]) 101 | return obs_noise_cov 102 | 103 | def get_reward(self, obstacles_pt, observed, is_training=True): 104 | return reward_fun(self.belief_targets, is_training) 105 | 106 | def gen_rand_pose(self, o_xy, c_theta, min_lin_dist, max_lin_dist, min_ang_dist, max_ang_dist): 107 | """Generates random position and yaw. 108 | Parameters 109 | -------- 110 | o_xy : xy position of a point in the global frame which we compute a distance from. 111 | c_theta : angular position of a point in the global frame which we compute an angular distance from. 112 | min_lin_dist : the minimum linear distance from o_xy to a sample point. 113 | max_lin_dist : the maximum linear distance from o_xy to a sample point. 114 | min_ang_dist : the minimum angular distance (counter clockwise direction) from c_theta to a sample point. 115 | max_ang_dist : the maximum angular distance (counter clockwise direction) from c_theta to a sample point. 
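The `observation` method above gates a range/bearing measurement on sensor radius, field of view, and line of sight, then corrupts it with the diagonal noise from `observation_noise`. A condensed sketch of that gating; the polar-coordinate computation and all numeric values are stand-ins, not the repository's `util` functions or METADATA entries.

```python
import numpy as np

def noisy_range_bearing(target_xy, agent_xy, agent_theta,
                        sensor_r=10.0, fov_deg=120.0,
                        r_sd=0.2, b_sd=0.01, blocked=False):
    """Return (observed, z) in the spirit of env.observation(); values are placeholders."""
    diff = np.asarray(target_xy) - np.asarray(agent_xy)
    r = np.linalg.norm(diff)
    alpha = np.arctan2(diff[1], diff[0]) - agent_theta
    alpha = (alpha + np.pi) % (2 * np.pi) - np.pi          # wrap to [-pi, pi)

    observed = (r <= sensor_r) and (abs(alpha) <= np.radians(fov_deg) / 2) and not blocked
    z = None
    if observed:
        z = np.array([r, alpha]) + np.random.multivariate_normal(
            np.zeros(2), np.diag([r_sd**2, b_sd**2]))
    return observed, z
```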
116 | """ 117 | if max_ang_dist < min_ang_dist: 118 | max_ang_dist += 2*np.pi 119 | rand_ang = util.wrap_around(np.random.rand() * \ 120 | (max_ang_dist - min_ang_dist) + min_ang_dist + c_theta) 121 | 122 | rand_r = np.random.rand() * (max_lin_dist - min_lin_dist) + min_lin_dist 123 | rand_xy = np.array([rand_r*np.cos(rand_ang), rand_r*np.sin(rand_ang)]) + o_xy 124 | is_valid = not(map_utils.is_collision(self.MAP, rand_xy)) 125 | return is_valid, [rand_xy[0], rand_xy[1], rand_ang] 126 | 127 | def get_init_pose(self, init_pose_list=[], **kwargs): 128 | """Generates initial positions for the agent, targets, and target beliefs. 129 | Parameters 130 | --------- 131 | init_pose_list : a list of dictionaries with pre-defined initial positions. 132 | lin_dist_range_target : a tuple of the minimum and maximum distance of a 133 | target from the agent. 134 | ang_dist_range_target : a tuple of the minimum and maximum angular 135 | distance (counter clockwise) of a target from the 136 | agent. -pi <= x <= pi 137 | lin_dist_range_belief : a tuple of the minimum and maximum distance of a 138 | belief from the target. 139 | ang_dist_range_belief : a tuple of the minimum and maximum angular 140 | distance (counter clockwise) of a belief from the 141 | target. -pi <= x <= pi 142 | blocked : True if there is an obstacle between a target and the agent. 143 | """ 144 | if init_pose_list: 145 | self.reset_num += 1 146 | return init_pose_list[self.reset_num-1] 147 | else: 148 | return self.get_init_pose_random(**kwargs) 149 | 150 | def get_init_pose_random(self, 151 | lin_dist_range_target=(METADATA['init_distance_min'], METADATA['init_distance_max']), 152 | ang_dist_range_target=(-np.pi, np.pi), 153 | lin_dist_range_belief=(METADATA['init_belief_distance_min'], METADATA['init_belief_distance_max']), 154 | ang_dist_range_belief=(-np.pi, np.pi), 155 | blocked=False, 156 | **kwargs): 157 | is_agent_valid = False 158 | init_pose = {} 159 | init_pose['agents'] = [] 160 | for ii in range(self.nb_agents): 161 | is_agent_valid = False 162 | if self.MAP.map is None and ii==0: 163 | if blocked: 164 | raise ValueError('Unable to find a blocked initial condition. 
There is no obstacle in this map.') 165 | a_init = self.agent_init_pos[:2] 166 | else: 167 | while(not is_agent_valid): 168 | a_init = np.random.random((2,)) * (self.MAP.mapmax-self.MAP.mapmin) + self.MAP.mapmin 169 | is_agent_valid = not(map_utils.is_collision(self.MAP, a_init)) 170 | init_pose_agent = [a_init[0], a_init[1], np.random.random() * 2 * np.pi - np.pi] 171 | init_pose['agents'].append(init_pose_agent) 172 | 173 | init_pose['targets'], init_pose['belief_targets'] = [], [] 174 | for jj in range(self.nb_targets): 175 | is_target_valid = False 176 | while(not is_target_valid): 177 | rand_agent = np.random.randint(self.nb_agents) 178 | is_target_valid, init_pose_target = self.gen_rand_pose( 179 | init_pose['agents'][rand_agent][:2], init_pose['agents'][rand_agent][2], 180 | lin_dist_range_target[0], lin_dist_range_target[1], 181 | ang_dist_range_target[0], ang_dist_range_target[1]) 182 | init_pose['targets'].append(init_pose_target) 183 | 184 | is_belief_valid, init_pose_belief = False, np.zeros((2,)) 185 | while((not is_belief_valid) and is_target_valid): 186 | is_belief_valid, init_pose_belief = self.gen_rand_pose( 187 | init_pose['targets'][jj][:2], init_pose['targets'][jj][2], 188 | lin_dist_range_belief[0], lin_dist_range_belief[1], 189 | ang_dist_range_belief[0], ang_dist_range_belief[1]) 190 | init_pose['belief_targets'].append(init_pose_belief) 191 | return init_pose 192 | 193 | def reward_fun(belief_targets, is_training=True, c_mean=0.1): 194 | 195 | detcov = [LA.det(b_target.cov) for b_target in belief_targets] 196 | r_detcov_mean = - np.mean(np.log(detcov)) 197 | reward = c_mean * r_detcov_mean 198 | 199 | mean_nlogdetcov = None 200 | if not(is_training): 201 | logdetcov = [np.log(LA.det(b_target.cov)) for b_target in belief_targets] 202 | mean_nlogdetcov = -np.mean(logdetcov) 203 | return reward, False, mean_nlogdetcov -------------------------------------------------------------------------------- /envs/maTTenv/env/setTracking_model.py: -------------------------------------------------------------------------------- 1 | import os, copy, pdb 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from gym import spaces, logger 5 | from envs.maTTenv.maps import map_utils 6 | import envs.maTTenv.util as util 7 | from envs.maTTenv.agent_models import * 8 | from envs.maTTenv.belief_tracker import KFbelief 9 | from envs.maTTenv.metadata import METADATA 10 | from envs.maTTenv.env.maTracking_Base import maTrackingBase 11 | 12 | """ 13 | Target Tracking Environments for Reinforcement Learning. 14 | [Variables] 15 | 16 | d: radial coordinate of a belief target in the learner frame 17 | alpha : angular coordinate of a belief target in the learner frame 18 | ddot : radial velocity of a belief target in the learner frame 19 | alphadot : angular velocity of a belief target in the learner frame 20 | Sigma : Covariance of a belief target 21 | 22 | [Environment Description] 23 | Varying number of agents, varying number of randomly moving targets 24 | No obstacles 25 | Greedy target assignment, k targets are shown to agent. 
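The shared reward returned by `reward_fun` above is a scaled negative mean log-determinant of the target belief covariances, so it grows as the team's uncertainty shrinks. For a feel of its sign and scale, with toy covariances:

```python
import numpy as np
from numpy import linalg as LA

c_mean = 0.1
belief_covs = [np.eye(4) * s for s in (0.5, 2.0, 10.0)]   # toy 4x4 belief covariances

reward = c_mean * -np.mean([np.log(LA.det(cov)) for cov in belief_covs])
# log-dets are 4*ln(0.5), 4*ln(2), 4*ln(10); mean ~ 3.07, so reward ~ -0.307
print(round(reward, 3))
```

Shrinking every covariance below identity pushes the log-determinants negative and the reward positive.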
26 | 27 | setTrackingEnvkGreedy : Double Integrator Target model with KF belief tracker 28 | obs state: [d, alpha, ddot, alphadot, logdet(Sigma), observed] *nb_targets 29 | where nb_targets and nb_agents vary between a range 30 | num_targets describes the upperbound on possible number of targets in env 31 | num_agents describes the upperbound on possible number of agents in env 32 | Target : Double Integrator model, [x,y,xdot,ydot] 33 | Belief Target : KF, Double Integrator model 34 | 35 | """ 36 | 37 | class setTrackingEnvModel(maTrackingBase): 38 | 39 | def __init__(self, num_agents=1, num_targets=2, map_name='empty', 40 | is_training=True, known_noise=True, **kwargs): 41 | super().__init__(num_agents=num_agents, num_targets=num_targets, 42 | map_name=map_name, is_training=is_training) 43 | 44 | self.id = 'setTracking-model' 45 | ###=======================================================### 46 | # Assign agents to closest k targets, if less targets than k, consider all targets 47 | self.k = 4 48 | ###=======================================================### 49 | self.nb_agents = num_agents #only for init, will change with reset() 50 | self.nb_targets = num_targets #only for init, will change with reset() 51 | self.agent_dim = 3 52 | self.target_dim = 4 53 | self.target_init_vel = METADATA['target_init_vel']*np.ones((2,)) 54 | # LIMIT 55 | self.limit = {} # 0: low, 1:highs 56 | self.limit['agent'] = [np.concatenate((self.MAP.mapmin,[-np.pi])), np.concatenate((self.MAP.mapmax, [np.pi]))] 57 | self.limit['target'] = [np.concatenate((self.MAP.mapmin,[-METADATA['target_vel_limit'], -METADATA['target_vel_limit']])), 58 | np.concatenate((self.MAP.mapmax, [METADATA['target_vel_limit'], METADATA['target_vel_limit']]))] 59 | rel_vel_limit = METADATA['target_vel_limit'] + METADATA['action_v'][0] # Maximum relative speed 60 | self.limit['state'] = [np.array(([0.0, -np.pi, -rel_vel_limit, -10*np.pi, -50.0, 0.0, 0.0, -np.pi ])), 61 | np.array(([600.0, np.pi, rel_vel_limit, 10*np.pi, 50.0, 2.0, self.sensor_r, np.pi]))] 62 | self.observation_space = spaces.Box(self.limit['state'][0], self.limit['state'][1], dtype=np.float32) 63 | self.targetA = np.concatenate((np.concatenate((np.eye(2), self.sampling_period*np.eye(2)), axis=1), 64 | [[0,0,1,0],[0,0,0,1]])) 65 | self.target_noise_cov = METADATA['const_q'] * np.concatenate(( 66 | np.concatenate((self.sampling_period**3/3*np.eye(2), self.sampling_period**2/2*np.eye(2)), axis=1), 67 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 68 | if known_noise: 69 | self.target_true_noise_sd = self.target_noise_cov 70 | else: 71 | self.target_true_noise_sd = METADATA['const_q_true'] * np.concatenate(( 72 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period/2*np.eye(2)), axis=1), 73 | np.concatenate((self.sampling_period/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 74 | 75 | # Build a robot 76 | self.setup_agents() 77 | # Build a target 78 | self.setup_targets() 79 | self.setup_belief_targets() 80 | # Use custom reward 81 | self.get_reward() 82 | 83 | def setup_agents(self): 84 | self.agents = [AgentSE2(agent_id = 'agent-' + str(i), 85 | dim=self.agent_dim, sampling_period=self.sampling_period, 86 | limit=self.limit['agent'], 87 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 88 | for i in range(self.num_agents)] 89 | 90 | def setup_targets(self): 91 | self.targets = [AgentDoubleInt2D(agent_id = 'target-' + str(i), 92 | dim=self.target_dim, 
sampling_period=self.sampling_period, 93 | limit=self.limit['target'], 94 | collision_func=lambda x: map_utils.is_collision(self.MAP, x), 95 | A=self.targetA, W=self.target_true_noise_sd) 96 | for i in range(self.num_targets)] 97 | 98 | def setup_belief_targets(self): 99 | self.belief_targets = [KFbelief(agent_id = 'target-' + str(i), 100 | dim=self.target_dim, limit=self.limit['target'], A=self.targetA, 101 | W=self.target_noise_cov, obs_noise_func=self.observation_noise, 102 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 103 | for i in range(self.num_targets)] 104 | 105 | def get_reward(self, obstacles_pt=None, observed=None, is_training=True): 106 | return reward_fun(self.nb_targets, self.belief_targets, is_training) 107 | 108 | def reset(self,**kwargs): 109 | """ 110 | Random initialization a number of agents and targets at the reset of the env epsiode. 111 | Agents are given random positions in the map, targets are given random positions near a random agent. 112 | Return an observation state dict with agent ids (keys) that refer to their observation 113 | """ 114 | try: 115 | self.nb_agents = kwargs['nb_agents'] 116 | self.nb_targets = kwargs['nb_targets'] 117 | except: 118 | self.nb_agents = np.random.random_integers(1, self.num_agents) 119 | self.nb_targets = np.random.random_integers(1, self.num_targets) 120 | obs_dict = {} 121 | # init_pose = self.get_init_pose(**kwargs) 122 | 123 | # Initialize agents 124 | for ii in range(self.nb_agents): 125 | self.agents[ii].reset(init_pose['agents'][ii]) 126 | obs_dict[self.agents[ii].agent_id] = [] 127 | 128 | # Initialize targets and beliefs 129 | for nn in range(self.nb_targets): 130 | self.belief_targets[nn].reset( 131 | init_state=np.concatenate((init_pose['belief_targets'][nn][:2], np.zeros(2))), 132 | init_cov=self.target_init_cov) 133 | self.targets[nn].reset(np.concatenate((init_pose['targets'][nn][:2], self.target_init_vel))) 134 | # For nb agents calculate belief of targets assigned 135 | for jj in range(self.nb_targets): 136 | for kk in range(self.nb_agents): 137 | r, alpha = util.relative_distance_polar(self.belief_targets[jj].state[:2], 138 | xy_base=self.agents[kk].state[:2], 139 | theta_base=self.agents[kk].state[2]) 140 | logdetcov = np.log(LA.det(self.belief_targets[jj].cov)) 141 | obs_dict[self.agents[kk].agent_id].append([r, alpha, 0.0, 0.0, logdetcov, 0.0, self.sensor_r, np.pi]) 142 | 143 | if self.nb_targets > self.k: 144 | for agent_id in obs_dict: 145 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 146 | close = np.argpartition(obs_dict[agent_id][:,0], self.k) 147 | obs_dict[agent_id] = obs_dict[agent_id][close[:self.k]] 148 | else: 149 | for agent_id in obs_dict: 150 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 151 | 152 | return obs_dict 153 | 154 | def step(self, action_dict): 155 | obs_dict = {} 156 | reward_dict = {} 157 | done_dict = {'__all__':False} 158 | info_dict = {} 159 | 160 | # Targets move (t -> t+1) 161 | for n in range(self.nb_targets): 162 | # self.targets[n].update() 163 | self.belief_targets[n].predict() # Belief state at t+1 164 | # Agents move (t -> t+1) and observe the targets 165 | for ii, agent_id in enumerate(action_dict): 166 | obs_dict[self.agents[ii].agent_id] = [] 167 | reward_dict[self.agents[ii].agent_id] = [] 168 | done_dict[self.agents[ii].agent_id] = [] 169 | 170 | action_vw = self.action_map[action_dict[agent_id]] 171 | 172 | # Locations of all targets and agents in order to maintain a margin between them 173 | margin_pos = [t.state[:2] for t in 
self.targets[:self.nb_targets]] 174 | for p, ids in enumerate(action_dict): 175 | if agent_id != ids: 176 | margin_pos.append(np.array(self.agents[p].state[:2])) 177 | _ = self.agents[ii].update(action_vw, margin_pos) 178 | 179 | # Target and map observations 180 | observed = np.zeros(self.nb_targets, dtype=bool) 181 | # obstacles_pt = map_utils.get_closest_obstacle(self.MAP, self.agents[ii].state) 182 | # if obstacles_pt is None: 183 | obstacles_pt = (self.sensor_r, np.pi) 184 | 185 | # Update beliefs of targets 186 | for jj in range(self.nb_targets): 187 | # Observe 188 | obs, z_t = self.observation(self.targets[jj], self.agents[ii]) 189 | observed[jj] = obs 190 | if obs: # if observed, update the target belief. 191 | self.belief_targets[jj].update(z_t, self.agents[ii].state) 192 | 193 | r_b, alpha_b = util.relative_distance_polar(self.belief_targets[jj].state[:2], 194 | xy_base=self.agents[ii].state[:2], 195 | theta_base=self.agents[ii].state[-1]) 196 | r_dot_b, alpha_dot_b = util.relative_velocity_polar( 197 | self.belief_targets[jj].state[:2], 198 | self.belief_targets[jj].state[2:], 199 | self.agents[ii].state[:2], self.agents[ii].state[-1], 200 | action_vw[0], action_vw[1]) 201 | obs_dict[agent_id].append([r_b, alpha_b, r_dot_b, alpha_dot_b, 202 | np.log(LA.det(self.belief_targets[jj].cov)), float(obs), self.sensor_r, np.pi]) 203 | # Assign agents to closest k targets, if less targets than k, consider all targets 204 | if self.nb_targets > self.k: 205 | for agent_id in obs_dict: 206 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 207 | close = np.argpartition(obs_dict[agent_id][:,0], self.k) 208 | obs_dict[agent_id] = obs_dict[agent_id][close[:self.k]] 209 | else: 210 | for agent_id in obs_dict: 211 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 212 | # Get all rewards after all agents and targets move (t -> t+1) 213 | reward, done, mean_nlogdetcov = self.get_reward(obstacles_pt, observed, self.is_training) 214 | reward_dict['__all__'], done_dict['__all__'], info_dict['mean_nlogdetcov'] = reward, done, mean_nlogdetcov 215 | return obs_dict, reward_dict, done_dict, info_dict 216 | 217 | def reward_fun(nb_targets, belief_targets, is_training=True, c_mean=0.1): 218 | detcov = [LA.det(b_target.cov) for b_target in belief_targets[:nb_targets]] 219 | r_detcov_mean = - np.mean(np.log(detcov)) 220 | reward = c_mean * r_detcov_mean 221 | 222 | mean_nlogdetcov = None 223 | if not(is_training): 224 | logdetcov = [np.log(LA.det(b_target.cov)) for b_target in belief_targets[:nb_targets]] 225 | mean_nlogdetcov = -np.mean(logdetcov) 226 | return reward, False, mean_nlogdetcov -------------------------------------------------------------------------------- /envs/maTTenv/env/setTracking_v0.py: -------------------------------------------------------------------------------- 1 | import os, copy, pdb 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from gym import spaces, logger 5 | from envs.maTTenv.maps import map_utils 6 | import envs.maTTenv.util as util 7 | from envs.maTTenv.agent_models import * 8 | from envs.maTTenv.belief_tracker import KFbelief 9 | from envs.maTTenv.metadata import METADATA 10 | from envs.maTTenv.env.maTracking_Base import maTrackingBase 11 | 12 | """ 13 | Target Tracking Environments for Reinforcement Learning. 
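The heuristic mask used in the reset and step code above keeps only the k closest targets per agent by partially sorting on the range column with `np.argpartition`. An isolated example of that selection; the observation rows are made up.

```python
import numpy as np

k = 2
# each row: [r, alpha, rdot, alphadot, logdetcov, observed]  (toy values)
obs = np.array([[12.0,  0.1, 0.0, 0.0, 1.0, 0.0],
                [ 3.5, -0.4, 0.0, 0.0, 2.0, 1.0],
                [ 8.2,  1.2, 0.0, 0.0, 0.5, 0.0],
                [ 1.1,  0.3, 0.0, 0.0, 3.0, 1.0]])

close = np.argpartition(obs[:, 0], k)   # the k smallest ranges land first, unordered
masked = obs[close[:k]]                 # keeps the rows with r = 1.1 and 3.5
print(masked[:, 0])
```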
14 | [Variables] 15 | 16 | d: radial coordinate of a belief target in the learner frame 17 | alpha : angular coordinate of a belief target in the learner frame 18 | ddot : radial velocity of a belief target in the learner frame 19 | alphadot : angular velocity of a belief target in the learner frame 20 | Sigma : Covariance of a belief target 21 | 22 | [Environment Description] 23 | Varying number of agents, varying number of randomly moving targets 24 | No obstacles 25 | 26 | setTrackingEnv0 : Double Integrator Target model with KF belief tracker 27 | obs state: [d, alpha, ddot, alphadot, logdet(Sigma), observed] *nb_targets 28 | where nb_targets and nb_agents vary between a range 29 | num_targets describes the upperbound on possible number of targets in env 30 | num_agents describes the upperbound on possible number of agents in env 31 | Target : Double Integrator model, [x,y,xdot,ydot] 32 | Belief Target : KF, Double Integrator model 33 | 34 | """ 35 | 36 | class setTrackingEnv0(maTrackingBase): 37 | 38 | def __init__(self, num_agents=1, num_targets=2, map_name='empty', 39 | is_training=True, known_noise=True, **kwargs): 40 | super().__init__(num_agents=num_agents, num_targets=num_targets, 41 | map_name=map_name, is_training=is_training) 42 | 43 | self.id = 'setTracking-v0' 44 | self.nb_agents = num_agents #only for init, will change with reset() 45 | self.nb_targets = num_targets #only for init, will change with reset() 46 | self.agent_dim = 3 47 | self.target_dim = 4 48 | self.target_init_vel = METADATA['target_init_vel']*np.ones((2,)) 49 | # LIMIT 50 | self.limit = {} # 0: low, 1:highs 51 | self.limit['agent'] = [np.concatenate((self.MAP.mapmin,[-np.pi])), np.concatenate((self.MAP.mapmax, [np.pi]))] 52 | self.limit['target'] = [np.concatenate((self.MAP.mapmin,[-METADATA['target_vel_limit'], -METADATA['target_vel_limit']])), 53 | np.concatenate((self.MAP.mapmax, [METADATA['target_vel_limit'], METADATA['target_vel_limit']]))] 54 | rel_vel_limit = METADATA['target_vel_limit'] + METADATA['action_v'][0] # Maximum relative speed 55 | self.limit['state'] = [np.array(([0.0, -np.pi, -rel_vel_limit, -10*np.pi, -50.0, 0.0])), 56 | np.array(([600.0, np.pi, rel_vel_limit, 10*np.pi, 50.0, 2.0]))] 57 | self.observation_space = spaces.Box(self.limit['state'][0], self.limit['state'][1], dtype=np.float32) 58 | self.targetA = np.concatenate((np.concatenate((np.eye(2), self.sampling_period*np.eye(2)), axis=1), 59 | [[0,0,1,0],[0,0,0,1]])) 60 | self.target_noise_cov = METADATA['const_q'] * np.concatenate(( 61 | np.concatenate((self.sampling_period**3/3*np.eye(2), self.sampling_period**2/2*np.eye(2)), axis=1), 62 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 63 | if known_noise: 64 | self.target_true_noise_sd = self.target_noise_cov 65 | else: 66 | self.target_true_noise_sd = METADATA['const_q_true'] * np.concatenate(( 67 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period/2*np.eye(2)), axis=1), 68 | np.concatenate((self.sampling_period/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 69 | 70 | # Build a robot 71 | self.setup_agents() 72 | # Build a target 73 | self.setup_targets() 74 | self.setup_belief_targets() 75 | # Use custom reward 76 | self.get_reward() 77 | 78 | def setup_agents(self): 79 | self.agents = [AgentSE2(agent_id = 'agent-' + str(i), 80 | dim=self.agent_dim, sampling_period=self.sampling_period, 81 | limit=self.limit['agent'], 82 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 83 | for i in 
range(self.num_agents)] 84 | 85 | def setup_targets(self): 86 | self.targets = [AgentDoubleInt2D(agent_id = 'target-' + str(i), 87 | dim=self.target_dim, sampling_period=self.sampling_period, 88 | limit=self.limit['target'], 89 | collision_func=lambda x: map_utils.is_collision(self.MAP, x), 90 | A=self.targetA, W=self.target_true_noise_sd) 91 | for i in range(self.num_targets)] 92 | 93 | def setup_belief_targets(self): 94 | self.belief_targets = [KFbelief(agent_id = 'target-' + str(i), 95 | dim=self.target_dim, limit=self.limit['target'], A=self.targetA, 96 | W=self.target_noise_cov, obs_noise_func=self.observation_noise, 97 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 98 | for i in range(self.num_targets)] 99 | 100 | def get_reward(self, observed=None, is_training=True): 101 | return reward_fun(self.nb_targets, self.belief_targets, is_training) 102 | 103 | def reset(self,**kwargs): 104 | """ 105 | Random initialization a number of agents and targets at the reset of the env epsiode. 106 | Agents are given random positions in the map, targets are given random positions near a random agent. 107 | Return an observation state dict with agent ids (keys) that refer to their observation 108 | """ 109 | self.rng = np.random.default_rng() 110 | try: 111 | self.nb_agents = kwargs['nb_agents'] 112 | self.nb_targets = kwargs['nb_targets'] 113 | except: 114 | self.nb_agents = np.random.randint(1, self.num_agents) 115 | self.nb_targets = np.random.randint(1, self.num_targets) 116 | obs_dict = {} 117 | init_pose = self.get_init_pose(**kwargs) 118 | # Initialize agents 119 | for ii in range(self.nb_agents): 120 | self.agents[ii].reset(init_pose['agents'][ii]) 121 | obs_dict[self.agents[ii].agent_id] = [] 122 | 123 | # Initialize targets and beliefs 124 | for nn in range(self.nb_targets): 125 | self.belief_targets[nn].reset( 126 | init_state=np.concatenate((init_pose['belief_targets'][nn][:2], np.zeros(2))), 127 | init_cov=self.target_init_cov) 128 | self.targets[nn].reset(np.concatenate((init_pose['targets'][nn][:2], self.target_init_vel))) 129 | # For nb agents calculate belief of targets assigned 130 | for jj in range(self.nb_targets): 131 | for kk in range(self.nb_agents): 132 | r, alpha = util.relative_distance_polar(self.belief_targets[jj].state[:2], 133 | xy_base=self.agents[kk].state[:2], 134 | theta_base=self.agents[kk].state[2]) 135 | logdetcov = np.log(LA.det(self.belief_targets[jj].cov)) 136 | obs_dict[self.agents[kk].agent_id].append([r, alpha, 0.0, 0.0, logdetcov, 0.0]) 137 | for agent_id in obs_dict: 138 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 139 | return obs_dict 140 | 141 | def step(self, action_dict): 142 | obs_dict = {} 143 | reward_dict = {} 144 | done_dict = {'__all__':False} 145 | info_dict = {} 146 | 147 | # Targets move (t -> t+1) 148 | for n in range(self.nb_targets): 149 | self.targets[n].update() 150 | self.belief_targets[n].predict() # Belief state at t+1 151 | # Agents move (t -> t+1) and observe the targets 152 | for ii, agent_id in enumerate(action_dict): 153 | obs_dict[self.agents[ii].agent_id] = [] 154 | reward_dict[self.agents[ii].agent_id] = [] 155 | done_dict[self.agents[ii].agent_id] = [] 156 | 157 | action_vw = self.action_map[action_dict[agent_id]] 158 | 159 | # Locations of all targets and agents in order to maintain a margin between them 160 | margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]] 161 | for p, ids in enumerate(action_dict): 162 | if agent_id != ids: 163 | 
margin_pos.append(np.array(self.agents[p].state[:2])) 164 | _ = self.agents[ii].update(action_vw, margin_pos) 165 | 166 | # Target and map observations 167 | observed = np.zeros(self.nb_targets, dtype=bool) 168 | 169 | # Update beliefs of targets 170 | for jj in range(self.nb_targets): 171 | # Observe 172 | obs, z_t = self.observation(self.targets[jj], self.agents[ii]) 173 | observed[jj] = obs 174 | if obs: # if observed, update the target belief. 175 | self.belief_targets[jj].update(z_t, self.agents[ii].state) 176 | 177 | r_b, alpha_b = util.relative_distance_polar(self.belief_targets[jj].state[:2], 178 | xy_base=self.agents[ii].state[:2], 179 | theta_base=self.agents[ii].state[-1]) 180 | r_dot_b, alpha_dot_b = util.relative_velocity_polar( 181 | self.belief_targets[jj].state[:2], 182 | self.belief_targets[jj].state[2:], 183 | self.agents[ii].state[:2], self.agents[ii].state[-1], 184 | action_vw[0], action_vw[1]) 185 | obs_dict[agent_id].append([r_b, alpha_b, r_dot_b, alpha_dot_b, 186 | np.log(LA.det(self.belief_targets[jj].cov)), float(obs)]) 187 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 188 | # shuffle obs to promote permutation invariance 189 | self.rng.shuffle(obs_dict[agent_id]) 190 | # Get all rewards after all agents and targets move (t -> t+1) 191 | reward, done, mean_nlogdetcov = self.get_reward(observed, self.is_training) 192 | reward_dict['__all__'], done_dict['__all__'], info_dict['mean_nlogdetcov'] = reward, done, mean_nlogdetcov 193 | return obs_dict, reward_dict, done_dict, info_dict 194 | 195 | def reward_fun(nb_targets, belief_targets, is_training=True, c_mean=0.1): 196 | detcov = [LA.det(b_target.cov) for b_target in belief_targets[:nb_targets]] 197 | r_detcov_mean = - np.mean(np.log(detcov)) 198 | reward = c_mean * r_detcov_mean 199 | 200 | mean_nlogdetcov = None 201 | if not(is_training): 202 | logdetcov = [np.log(LA.det(b_target.cov)) for b_target in belief_targets[:nb_targets]] 203 | mean_nlogdetcov = -np.mean(logdetcov) 204 | return reward, False, mean_nlogdetcov -------------------------------------------------------------------------------- /envs/maTTenv/env/setTracking_vGreedy.py: -------------------------------------------------------------------------------- 1 | import os, copy, pdb 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from gym import spaces, logger 5 | from envs.maTTenv.maps import map_utils 6 | import envs.maTTenv.util as util 7 | from envs.maTTenv.agent_models import * 8 | from envs.maTTenv.belief_tracker import KFbelief 9 | from envs.maTTenv.metadata import METADATA 10 | from envs.maTTenv.env.maTracking_Base import maTrackingBase 11 | 12 | """ 13 | Target Tracking Environments for Reinforcement Learning. 
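[Reward]
All the setTracking variants shown here use the same reward_fun (defined at the
bottom of each file): reward = c_mean * mean_i( -log det(Sigma_i) ) over the
currently active targets, with c_mean = 0.1. Illustrative example (made-up
numbers, not from the paper): if two belief covariances have det(Sigma) = e^-2
and e^-4, the reward is 0.1 * mean([2, 4]) = 0.3, so tighter beliefs (smaller
det(Sigma)) yield a larger reward.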
14 | [Variables] 15 | 16 | d: radial coordinate of a belief target in the learner frame 17 | alpha : angular coordinate of a belief target in the learner frame 18 | ddot : radial velocity of a belief target in the learner frame 19 | alphadot : angular velocity of a belief target in the learner frame 20 | Sigma : Covariance of a belief target 21 | o_d : linear distance to the closet obstacle point 22 | o_alpha : angular distance to the closet obstacle point 23 | 24 | [Environment Description] 25 | Varying number of agents, varying number of randomly moving targets 26 | No obstacles 27 | VERY Greedy target assignment, always goes to the closest target 28 | 29 | setTrackingEnvGreedy : Double Integrator Target model with KF belief tracker 30 | obs state: [d, alpha, ddot, alphadot, logdet(Sigma), observed] *nb_targets 31 | where nb_targets and nb_agents vary between a range 32 | num_targets describes the upperbound on possible number of targets in env 33 | num_agents describes the upperbound on possible number of agents in env 34 | Target : Double Integrator model, [x,y,xdot,ydot] 35 | Belief Target : KF, Double Integrator model 36 | 37 | """ 38 | 39 | class setTrackingEnvGreedy(maTrackingBase): 40 | 41 | def __init__(self, num_agents=1, num_targets=2, map_name='empty', 42 | is_training=True, known_noise=True, **kwargs): 43 | super().__init__(num_agents=num_agents, num_targets=num_targets, 44 | map_name=map_name, is_training=is_training) 45 | 46 | self.id = 'setTracking-vGreedy' 47 | self.nb_agents = num_agents #only for init, will change with reset() 48 | self.nb_targets = num_targets #only for init, will change with reset() 49 | self.agent_dim = 3 50 | self.target_dim = 4 51 | self.target_init_vel = METADATA['target_init_vel']*np.ones((2,)) 52 | # LIMIT 53 | self.limit = {} # 0: low, 1:highs 54 | self.limit['agent'] = [np.concatenate((self.MAP.mapmin,[-np.pi])), np.concatenate((self.MAP.mapmax, [np.pi]))] 55 | self.limit['target'] = [np.concatenate((self.MAP.mapmin,[-METADATA['target_vel_limit'], -METADATA['target_vel_limit']])), 56 | np.concatenate((self.MAP.mapmax, [METADATA['target_vel_limit'], METADATA['target_vel_limit']]))] 57 | rel_vel_limit = METADATA['target_vel_limit'] + METADATA['action_v'][0] # Maximum relative speed 58 | self.limit['state'] = [np.array(([0.0, -np.pi, -rel_vel_limit, -10*np.pi, -50.0, 0.0])), 59 | np.array(([600.0, np.pi, rel_vel_limit, 10*np.pi, 50.0, 2.0]))] 60 | self.observation_space = spaces.Box(self.limit['state'][0], self.limit['state'][1], dtype=np.float32) 61 | self.targetA = np.concatenate((np.concatenate((np.eye(2), self.sampling_period*np.eye(2)), axis=1), 62 | [[0,0,1,0],[0,0,0,1]])) 63 | self.target_noise_cov = METADATA['const_q'] * np.concatenate(( 64 | np.concatenate((self.sampling_period**3/3*np.eye(2), self.sampling_period**2/2*np.eye(2)), axis=1), 65 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 66 | if known_noise: 67 | self.target_true_noise_sd = self.target_noise_cov 68 | else: 69 | self.target_true_noise_sd = METADATA['const_q_true'] * np.concatenate(( 70 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period/2*np.eye(2)), axis=1), 71 | np.concatenate((self.sampling_period/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 72 | 73 | # Build a robot 74 | self.setup_agents() 75 | # Build a target 76 | self.setup_targets() 77 | self.setup_belief_targets() 78 | # Use custom reward 79 | self.get_reward() 80 | 81 | def setup_agents(self): 82 | self.agents = 
[AgentSE2(agent_id = 'agent-' + str(i), 83 | dim=self.agent_dim, sampling_period=self.sampling_period, 84 | limit=self.limit['agent'], 85 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 86 | for i in range(self.num_agents)] 87 | 88 | def setup_targets(self): 89 | self.targets = [AgentDoubleInt2D(agent_id = 'target-' + str(i), 90 | dim=self.target_dim, sampling_period=self.sampling_period, 91 | limit=self.limit['target'], 92 | collision_func=lambda x: map_utils.is_collision(self.MAP, x), 93 | A=self.targetA, W=self.target_true_noise_sd) 94 | for i in range(self.num_targets)] 95 | 96 | def setup_belief_targets(self): 97 | self.belief_targets = [KFbelief(agent_id = 'target-' + str(i), 98 | dim=self.target_dim, limit=self.limit['target'], A=self.targetA, 99 | W=self.target_noise_cov, obs_noise_func=self.observation_noise, 100 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 101 | for i in range(self.num_targets)] 102 | 103 | def get_reward(self, observed=None, is_training=True): 104 | return reward_fun(self.nb_targets, self.belief_targets, is_training) 105 | 106 | def reset(self,**kwargs): 107 | """ 108 | Random initialization a number of agents and targets at the reset of the env epsiode. 109 | Agents are given random positions in the map, targets are given random positions near a random agent. 110 | Return an observation state dict with agent ids (keys) that refer to their observation 111 | """ 112 | try: 113 | self.nb_agents = kwargs['nb_agents'] 114 | self.nb_targets = kwargs['nb_targets'] 115 | except: 116 | self.nb_agents = np.random.randint(1, self.num_agents) 117 | self.nb_targets = np.random.randint(1, self.num_targets) 118 | obs_dict = {} 119 | init_pose = self.get_init_pose(**kwargs) 120 | # Initialize agents 121 | for ii in range(self.nb_agents): 122 | self.agents[ii].reset(init_pose['agents'][ii]) 123 | obs_dict[self.agents[ii].agent_id] = [] 124 | 125 | # Initialize targets and beliefs 126 | for nn in range(self.nb_targets): 127 | self.belief_targets[nn].reset( 128 | init_state=np.concatenate((init_pose['belief_targets'][nn][:2], np.zeros(2))), 129 | init_cov=self.target_init_cov) 130 | self.targets[nn].reset(np.concatenate((init_pose['targets'][nn][:2], self.target_init_vel))) 131 | # For nb agents calculate belief of targets assigned 132 | for jj in range(self.nb_targets): 133 | for kk in range(self.nb_agents): 134 | r, alpha = util.relative_distance_polar(self.belief_targets[jj].state[:2], 135 | xy_base=self.agents[kk].state[:2], 136 | theta_base=self.agents[kk].state[2]) 137 | logdetcov = np.log(LA.det(self.belief_targets[jj].cov)) 138 | obs_dict[self.agents[kk].agent_id].append([r, alpha, 0.0, 0.0, logdetcov, 0.0]) 139 | # Greedily assign agents to closest target 140 | for agent_id in obs_dict: 141 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 142 | close = np.argmin(obs_dict[agent_id][:,0]) 143 | obs_dict[agent_id] = obs_dict[agent_id][None,close] 144 | return obs_dict 145 | 146 | def step(self, action_dict): 147 | obs_dict = {} 148 | reward_dict = {} 149 | done_dict = {'__all__':False} 150 | info_dict = {} 151 | 152 | # Targets move (t -> t+1) 153 | for n in range(self.nb_targets): 154 | self.targets[n].update() 155 | self.belief_targets[n].predict() # Belief state at t+1 156 | # Agents move (t -> t+1) and observe the targets 157 | for ii, agent_id in enumerate(action_dict): 158 | obs_dict[self.agents[ii].agent_id] = [] 159 | reward_dict[self.agents[ii].agent_id] = [] 160 | done_dict[self.agents[ii].agent_id] = [] 161 | 162 | 
action_vw = self.action_map[action_dict[agent_id]] 163 | 164 | # Locations of all targets and agents in order to maintain a margin between them 165 | margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]] 166 | for p, ids in enumerate(action_dict): 167 | if agent_id != ids: 168 | margin_pos.append(np.array(self.agents[p].state[:2])) 169 | _ = self.agents[ii].update(action_vw, margin_pos) 170 | 171 | # Target and map observations 172 | observed = np.zeros(self.nb_targets, dtype=bool) 173 | 174 | # Update beliefs of targets 175 | for jj in range(self.nb_targets): 176 | # Observe 177 | obs, z_t = self.observation(self.targets[jj], self.agents[ii]) 178 | observed[jj] = obs 179 | if obs: # if observed, update the target belief. 180 | self.belief_targets[jj].update(z_t, self.agents[ii].state) 181 | 182 | r_b, alpha_b = util.relative_distance_polar(self.belief_targets[jj].state[:2], 183 | xy_base=self.agents[ii].state[:2], 184 | theta_base=self.agents[ii].state[-1]) 185 | r_dot_b, alpha_dot_b = util.relative_velocity_polar( 186 | self.belief_targets[jj].state[:2], 187 | self.belief_targets[jj].state[2:], 188 | self.agents[ii].state[:2], self.agents[ii].state[-1], 189 | action_vw[0], action_vw[1]) 190 | obs_dict[agent_id].append([r_b, alpha_b, r_dot_b, alpha_dot_b, 191 | np.log(LA.det(self.belief_targets[jj].cov)), float(obs)]) 192 | # Greedily assign agents to closest target 193 | for agent_id in obs_dict: 194 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 195 | close = np.argmin(obs_dict[agent_id][:,0]) 196 | obs_dict[agent_id] = obs_dict[agent_id][None,close] 197 | # Get all rewards after all agents and targets move (t -> t+1) 198 | reward, done, mean_nlogdetcov = self.get_reward(observed, self.is_training) 199 | reward_dict['__all__'], done_dict['__all__'], info_dict['mean_nlogdetcov'] = reward, done, mean_nlogdetcov 200 | return obs_dict, reward_dict, done_dict, info_dict 201 | 202 | def reward_fun(nb_targets, belief_targets, is_training=True, c_mean=0.1): 203 | detcov = [LA.det(b_target.cov) for b_target in belief_targets[:nb_targets]] 204 | r_detcov_mean = - np.mean(np.log(detcov)) 205 | reward = c_mean * r_detcov_mean 206 | 207 | mean_nlogdetcov = None 208 | if not(is_training): 209 | logdetcov = [np.log(LA.det(b_target.cov)) for b_target in belief_targets[:nb_targets]] 210 | mean_nlogdetcov = -np.mean(logdetcov) 211 | return reward, False, mean_nlogdetcov -------------------------------------------------------------------------------- /envs/maTTenv/env/setTracking_vGru.py: -------------------------------------------------------------------------------- 1 | import os, copy, pdb 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from gym import spaces, logger 5 | from envs.maTTenv.maps import map_utils 6 | import envs.maTTenv.util as util 7 | from envs.maTTenv.agent_models import * 8 | from envs.maTTenv.belief_tracker import KFbelief 9 | from envs.maTTenv.metadata import METADATA 10 | from envs.maTTenv.env.maTracking_Base import maTrackingBase 11 | 12 | """ 13 | Target Tracking Environments for Reinforcement Learning. 
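[Observation note]
Compared to setTracking-v0, the logdet(Sigma) entry is dropped here, so each
target contributes a 5-dim row [d, alpha, ddot, alphadot, observed], and rows
for unobserved targets are zero-filled in step() below. Illustrative rows
(made-up numbers):

    observed target   : [3.2, -0.4, 0.10, 0.02, 1.0]
    unobserved target : [0.0,  0.0, 0.00, 0.00, 0.0]

The KF beliefs are still maintained, but only to compute the reward.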
14 | [Variables] 15 | 16 | d: radial coordinate of a belief target in the learner frame 17 | alpha : angular coordinate of a belief target in the learner frame 18 | ddot : radial velocity of a belief target in the learner frame 19 | alphadot : angular velocity of a belief target in the learner frame 20 | Sigma : Covariance of a belief target 21 | 22 | [Environment Description] 23 | Use a gru to predict the belief of the target 24 | The kf is used as a tool for the reward 25 | 26 | Varying number of agents, varying number of randomly moving targets 27 | No obstacles 28 | 29 | setTrackingEnvGru : Double Integrator Target model with KF belief tracker 30 | obs state: [d, alpha, ddot, alphadot, observed] *nb_targets 31 | where nb_targets and nb_agents vary between a range 32 | num_targets describes the upperbound on possible number of targets in env 33 | num_agents describes the upperbound on possible number of agents in env 34 | Target : Double Integrator model, [x,y,xdot,ydot] 35 | Belief Target : KF, Double Integrator model 36 | 37 | """ 38 | 39 | class setTrackingEnvGru(maTrackingBase): 40 | 41 | def __init__(self, num_agents=1, num_targets=2, map_name='empty', 42 | is_training=True, known_noise=True, **kwargs): 43 | super().__init__(num_agents=num_agents, num_targets=num_targets, 44 | map_name=map_name, is_training=is_training) 45 | 46 | self.id = 'setTracking-vGru' 47 | self.nb_agents = num_agents #only for init, will change with reset() 48 | self.nb_targets = num_targets #only for init, will change with reset() 49 | self.agent_dim = 3 50 | self.target_dim = 4 51 | self.target_init_vel = METADATA['target_init_vel']*np.ones((2,)) 52 | # LIMIT 53 | self.limit = {} # 0: low, 1:highs 54 | self.limit['agent'] = [np.concatenate((self.MAP.mapmin,[-np.pi])), np.concatenate((self.MAP.mapmax, [np.pi]))] 55 | self.limit['target'] = [np.concatenate((self.MAP.mapmin,[-METADATA['target_vel_limit'], -METADATA['target_vel_limit']])), 56 | np.concatenate((self.MAP.mapmax, [METADATA['target_vel_limit'], METADATA['target_vel_limit']]))] 57 | rel_vel_limit = METADATA['target_vel_limit'] + METADATA['action_v'][0] # Maximum relative speed 58 | self.limit['state'] = [np.array(([0.0, -np.pi, -rel_vel_limit, -10*np.pi, -50.0, 0.0])), 59 | np.array(([600.0, np.pi, rel_vel_limit, 10*np.pi, 50.0, 2.0]))] 60 | self.observation_space = spaces.Box(self.limit['state'][0], self.limit['state'][1], dtype=np.float32) 61 | self.targetA = np.concatenate((np.concatenate((np.eye(2), self.sampling_period*np.eye(2)), axis=1), 62 | [[0,0,1,0],[0,0,0,1]])) 63 | self.target_noise_cov = METADATA['const_q'] * np.concatenate(( 64 | np.concatenate((self.sampling_period**3/3*np.eye(2), self.sampling_period**2/2*np.eye(2)), axis=1), 65 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 66 | if known_noise: 67 | self.target_true_noise_sd = self.target_noise_cov 68 | else: 69 | self.target_true_noise_sd = METADATA['const_q_true'] * np.concatenate(( 70 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period/2*np.eye(2)), axis=1), 71 | np.concatenate((self.sampling_period/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 72 | 73 | # Build a robot 74 | self.setup_agents() 75 | # Build a target 76 | self.setup_targets() 77 | self.setup_belief_targets() 78 | # Use custom reward 79 | self.get_reward() 80 | 81 | def setup_agents(self): 82 | self.agents = [AgentSE2(agent_id = 'agent-' + str(i), 83 | dim=self.agent_dim, sampling_period=self.sampling_period, 84 | 
limit=self.limit['agent'], 85 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 86 | for i in range(self.num_agents)] 87 | 88 | def setup_targets(self): 89 | self.targets = [AgentDoubleInt2D(agent_id = 'target-' + str(i), 90 | dim=self.target_dim, sampling_period=self.sampling_period, 91 | limit=self.limit['target'], 92 | collision_func=lambda x: map_utils.is_collision(self.MAP, x), 93 | A=self.targetA, W=self.target_true_noise_sd) 94 | for i in range(self.num_targets)] 95 | 96 | def setup_belief_targets(self): 97 | self.belief_targets = [KFbelief(agent_id = 'target-' + str(i), 98 | dim=self.target_dim, limit=self.limit['target'], A=self.targetA, 99 | W=self.target_noise_cov, obs_noise_func=self.observation_noise, 100 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 101 | for i in range(self.num_targets)] 102 | 103 | def get_reward(self, observed=None, is_training=True): 104 | return reward_fun(self.nb_targets, self.belief_targets, is_training) 105 | 106 | def reset(self,**kwargs): 107 | """ 108 | Random initialization a number of agents and targets at the reset of the env epsiode. 109 | Agents are given random positions in the map, targets are given random positions near a random agent. 110 | Return an observation state dict with agent ids (keys) that refer to their observation 111 | """ 112 | self.rng = np.random.default_rng() 113 | try: 114 | self.nb_agents = kwargs['nb_agents'] 115 | self.nb_targets = kwargs['nb_targets'] 116 | except: 117 | self.nb_agents = np.random.randint(1, self.num_agents) 118 | self.nb_targets = np.random.randint(1, self.num_targets) 119 | obs_dict = {} 120 | init_pose = self.get_init_pose(**kwargs) 121 | # Initialize agents 122 | for ii in range(self.nb_agents): 123 | self.agents[ii].reset(init_pose['agents'][ii]) 124 | obs_dict[self.agents[ii].agent_id] = [] 125 | 126 | # Initialize targets and beliefs 127 | for nn in range(self.nb_targets): 128 | self.belief_targets[nn].reset( 129 | init_state=np.concatenate((init_pose['belief_targets'][nn][:2], np.zeros(2))), 130 | init_cov=self.target_init_cov) 131 | self.targets[nn].reset(np.concatenate((init_pose['targets'][nn][:2], self.target_init_vel))) 132 | # For nb agents calculate belief of targets assigned 133 | for jj in range(self.nb_targets): 134 | for kk in range(self.nb_agents): 135 | r, alpha = util.relative_distance_polar(self.belief_targets[jj].state[:2], 136 | xy_base=self.agents[kk].state[:2], 137 | theta_base=self.agents[kk].state[2]) 138 | logdetcov = np.log(LA.det(self.belief_targets[jj].cov)) 139 | obs_dict[self.agents[kk].agent_id].append([r, alpha, 0.0, 0.0, 0.0]) 140 | for agent_id in obs_dict: 141 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 142 | self._obs_dict = obs_dict 143 | return obs_dict 144 | 145 | def step(self, action_dict): 146 | obs_dict = {} 147 | reward_dict = {} 148 | done_dict = {'__all__':False} 149 | info_dict = {} 150 | 151 | # Targets move (t -> t+1) 152 | for n in range(self.nb_targets): 153 | self.targets[n].update() 154 | self.belief_targets[n].predict() # Belief state at t+1 155 | # Agents move (t -> t+1) and observe the targets 156 | for ii, agent_id in enumerate(action_dict): 157 | obs_dict[self.agents[ii].agent_id] = [] 158 | reward_dict[self.agents[ii].agent_id] = [] 159 | done_dict[self.agents[ii].agent_id] = [] 160 | 161 | action_vw = self.action_map[action_dict[agent_id]] 162 | 163 | # Locations of all targets and agents in order to maintain a margin between them 164 | margin_pos = [t.state[:2] for t in 
self.targets[:self.nb_targets]] 165 | for p, ids in enumerate(action_dict): 166 | if agent_id != ids: 167 | margin_pos.append(np.array(self.agents[p].state[:2])) 168 | _ = self.agents[ii].update(action_vw, margin_pos) 169 | 170 | # Target and map observations 171 | observed = np.zeros(self.nb_targets, dtype=bool) 172 | # obstacles_pt = map_utils.get_closest_obstacle(self.MAP, self.agents[ii].state) 173 | # if obstacles_pt is None: 174 | obstacles_pt = (self.sensor_r, np.pi) 175 | 176 | # Update beliefs of targets 177 | for jj in range(self.nb_targets): 178 | # Observe 179 | obs, z_t = self.observation(self.targets[jj], self.agents[ii]) 180 | observed[jj] = obs 181 | if obs: # if observed, update the target belief. 182 | self.belief_targets[jj].update(z_t, self.agents[ii].state) 183 | 184 | r_b, alpha_b = util.relative_distance_polar(self.targets[jj].state[:2], 185 | xy_base=self.agents[ii].state[:2], 186 | theta_base=self.agents[ii].state[-1]) 187 | r_dot_b, alpha_dot_b = util.relative_velocity_polar( 188 | self.targets[jj].state[:2], 189 | self.targets[jj].state[2:], 190 | self.agents[ii].state[:2], self.agents[ii].state[-1], 191 | action_vw[0], action_vw[1]) 192 | obs_dict[agent_id].append([r_b, alpha_b, r_dot_b, alpha_dot_b, float(obs)]) 193 | else: 194 | ## if no obs 195 | obs_dict[agent_id].append([0.0, 0.0, 0.0, 0.0, float(obs)]) 196 | 197 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 198 | # store obs dict before shuffle 199 | self._obs_dict[agent_id] = obs_dict[agent_id] 200 | # shuffle obs to promote permutation invariance 201 | self.rng.shuffle(obs_dict[agent_id]) 202 | # Get all rewards after all agents and targets move (t -> t+1) 203 | reward, done, mean_nlogdetcov = self.get_reward(observed, self.is_training) 204 | reward_dict['__all__'], done_dict['__all__'], info_dict['mean_nlogdetcov'] = reward, done, mean_nlogdetcov 205 | return obs_dict, reward_dict, done_dict, info_dict 206 | 207 | def reward_fun(nb_targets, belief_targets, is_training=True, c_mean=0.1): 208 | detcov = [LA.det(b_target.cov) for b_target in belief_targets[:nb_targets]] 209 | r_detcov_mean = - np.mean(np.log(detcov)) 210 | reward = c_mean * r_detcov_mean 211 | 212 | mean_nlogdetcov = None 213 | if not(is_training): 214 | logdetcov = [np.log(LA.det(b_target.cov)) for b_target in belief_targets[:nb_targets]] 215 | mean_nlogdetcov = -np.mean(logdetcov) 216 | return reward, False, mean_nlogdetcov -------------------------------------------------------------------------------- /envs/maTTenv/env/setTracking_vkGreedy.py: -------------------------------------------------------------------------------- 1 | import os, copy, pdb 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from gym import spaces, logger 5 | from envs.maTTenv.maps import map_utils 6 | import envs.maTTenv.util as util 7 | from envs.maTTenv.agent_models import * 8 | from envs.maTTenv.belief_tracker import KFbelief 9 | from envs.maTTenv.metadata import METADATA 10 | from envs.maTTenv.env.maTracking_Base import maTrackingBase 11 | 12 | """ 13 | Target Tracking Environments for Reinforcement Learning. 
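[Target masking note]
Each agent is only shown the k closest target rows (k = 4, set in __init__);
when nb_targets <= k, all targets are kept. Minimal sketch of the selection used
in reset() and step() below, assuming obs is one agent's (nb_targets, 6) array
and np is numpy:

    close = np.argpartition(obs[:, 0], k)  # indices of the k smallest distances d (unordered)
    obs = obs[close[:k]]                   # keep only those k rows

np.argpartition avoids a full sort and leaves the kept rows unordered, which is
acceptable for a permutation-invariant policy.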
14 | [Variables] 15 | 16 | d: radial coordinate of a belief target in the learner frame 17 | alpha : angular coordinate of a belief target in the learner frame 18 | ddot : radial velocity of a belief target in the learner frame 19 | alphadot : angular velocity of a belief target in the learner frame 20 | Sigma : Covariance of a belief target 21 | 22 | [Environment Description] 23 | Varying number of agents, varying number of randomly moving targets 24 | No obstacles 25 | Greedy target assignment, k targets are shown to agent. 26 | 27 | setTrackingEnvkGreedy : Double Integrator Target model with KF belief tracker 28 | obs state: [d, alpha, ddot, alphadot, logdet(Sigma), observed] *nb_targets 29 | where nb_targets and nb_agents vary between a range 30 | num_targets describes the upperbound on possible number of targets in env 31 | num_agents describes the upperbound on possible number of agents in env 32 | Target : Double Integrator model, [x,y,xdot,ydot] 33 | Belief Target : KF, Double Integrator model 34 | 35 | """ 36 | 37 | class setTrackingEnvkGreedy(maTrackingBase): 38 | 39 | def __init__(self, num_agents=1, num_targets=2, map_name='empty', 40 | is_training=True, known_noise=True, **kwargs): 41 | super().__init__(num_agents=num_agents, num_targets=num_targets, 42 | map_name=map_name, is_training=is_training) 43 | 44 | self.id = 'setTracking-vkGreedy' 45 | ###=======================================================### 46 | # Assign agents to closest k targets, if less targets than k, consider all targets 47 | self.k = 4 48 | ###=======================================================### 49 | self.nb_agents = num_agents #only for init, will change with reset() 50 | self.nb_targets = num_targets #only for init, will change with reset() 51 | self.agent_dim = 3 52 | self.target_dim = 4 53 | self.target_init_vel = METADATA['target_init_vel']*np.ones((2,)) 54 | # LIMIT 55 | self.limit = {} # 0: low, 1:highs 56 | self.limit['agent'] = [np.concatenate((self.MAP.mapmin,[-np.pi])), np.concatenate((self.MAP.mapmax, [np.pi]))] 57 | self.limit['target'] = [np.concatenate((self.MAP.mapmin,[-METADATA['target_vel_limit'], -METADATA['target_vel_limit']])), 58 | np.concatenate((self.MAP.mapmax, [METADATA['target_vel_limit'], METADATA['target_vel_limit']]))] 59 | rel_vel_limit = METADATA['target_vel_limit'] + METADATA['action_v'][0] # Maximum relative speed 60 | self.limit['state'] = [np.array(([0.0, -np.pi, -rel_vel_limit, -10*np.pi, -50.0, 0.0])), 61 | np.array(([600.0, np.pi, rel_vel_limit, 10*np.pi, 50.0, 2.0]))] 62 | self.observation_space = spaces.Box(self.limit['state'][0], self.limit['state'][1], dtype=np.float32) 63 | self.targetA = np.concatenate((np.concatenate((np.eye(2), self.sampling_period*np.eye(2)), axis=1), 64 | [[0,0,1,0],[0,0,0,1]])) 65 | self.target_noise_cov = METADATA['const_q'] * np.concatenate(( 66 | np.concatenate((self.sampling_period**3/3*np.eye(2), self.sampling_period**2/2*np.eye(2)), axis=1), 67 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 68 | if known_noise: 69 | self.target_true_noise_sd = self.target_noise_cov 70 | else: 71 | self.target_true_noise_sd = METADATA['const_q_true'] * np.concatenate(( 72 | np.concatenate((self.sampling_period**2/2*np.eye(2), self.sampling_period/2*np.eye(2)), axis=1), 73 | np.concatenate((self.sampling_period/2*np.eye(2), self.sampling_period*np.eye(2)),axis=1) )) 74 | 75 | # Build a robot 76 | self.setup_agents() 77 | # Build a target 78 | self.setup_targets() 79 | self.setup_belief_targets() 
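        # Note on self.target_noise_cov above: it is the standard discretized
        # white-noise-acceleration process noise for a double-integrator target,
        #   Q = const_q * [[T^3/3 * I2, T^2/2 * I2],
        #                  [T^2/2 * I2, T     * I2]]
        # with T = self.sampling_period, so const_q controls how quickly the KF
        # belief covariance grows between successful observations.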
80 | # Use custom reward 81 | self.get_reward() 82 | 83 | def setup_agents(self): 84 | self.agents = [AgentSE2(agent_id = 'agent-' + str(i), 85 | dim=self.agent_dim, sampling_period=self.sampling_period, 86 | limit=self.limit['agent'], 87 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 88 | for i in range(self.num_agents)] 89 | 90 | def setup_targets(self): 91 | self.targets = [AgentDoubleInt2D(agent_id = 'target-' + str(i), 92 | dim=self.target_dim, sampling_period=self.sampling_period, 93 | limit=self.limit['target'], 94 | collision_func=lambda x: map_utils.is_collision(self.MAP, x), 95 | A=self.targetA, W=self.target_true_noise_sd) 96 | for i in range(self.num_targets)] 97 | 98 | def setup_belief_targets(self): 99 | self.belief_targets = [KFbelief(agent_id = 'target-' + str(i), 100 | dim=self.target_dim, limit=self.limit['target'], A=self.targetA, 101 | W=self.target_noise_cov, obs_noise_func=self.observation_noise, 102 | collision_func=lambda x: map_utils.is_collision(self.MAP, x)) 103 | for i in range(self.num_targets)] 104 | 105 | def get_reward(self, obstacles_pt=None, observed=None, is_training=True): 106 | return reward_fun(self.nb_targets, self.belief_targets, is_training) 107 | 108 | def reset(self,**kwargs): 109 | """ 110 | Random initialization a number of agents and targets at the reset of the env epsiode. 111 | Agents are given random positions in the map, targets are given random positions near a random agent. 112 | Return an observation state dict with agent ids (keys) that refer to their observation 113 | """ 114 | try: 115 | self.nb_agents = kwargs['nb_agents'] 116 | self.nb_targets = kwargs['nb_targets'] 117 | except: 118 | self.nb_agents = np.random.randint(1, self.num_agents) 119 | self.nb_targets = np.random.randint(1, self.num_targets) 120 | obs_dict = {} 121 | init_pose = self.get_init_pose(**kwargs) 122 | # Initialize agents 123 | for ii in range(self.nb_agents): 124 | self.agents[ii].reset(init_pose['agents'][ii]) 125 | obs_dict[self.agents[ii].agent_id] = [] 126 | 127 | # Initialize targets and beliefs 128 | for nn in range(self.nb_targets): 129 | self.belief_targets[nn].reset( 130 | init_state=np.concatenate((init_pose['belief_targets'][nn][:2], np.zeros(2))), 131 | init_cov=self.target_init_cov) 132 | self.targets[nn].reset(np.concatenate((init_pose['targets'][nn][:2], self.target_init_vel))) 133 | # For nb agents calculate belief of targets assigned 134 | for jj in range(self.nb_targets): 135 | for kk in range(self.nb_agents): 136 | r, alpha = util.relative_distance_polar(self.belief_targets[jj].state[:2], 137 | xy_base=self.agents[kk].state[:2], 138 | theta_base=self.agents[kk].state[2]) 139 | logdetcov = np.log(LA.det(self.belief_targets[jj].cov)) 140 | obs_dict[self.agents[kk].agent_id].append([r, alpha, 0.0, 0.0, logdetcov, 0.0]) 141 | 142 | if self.nb_targets > self.k: 143 | for agent_id in obs_dict: 144 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 145 | close = np.argpartition(obs_dict[agent_id][:,0], self.k) 146 | obs_dict[agent_id] = obs_dict[agent_id][close[:self.k]] 147 | else: 148 | for agent_id in obs_dict: 149 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 150 | 151 | return obs_dict 152 | 153 | def step(self, action_dict): 154 | obs_dict = {} 155 | reward_dict = {} 156 | done_dict = {'__all__':False} 157 | info_dict = {} 158 | 159 | # Targets move (t -> t+1) 160 | for n in range(self.nb_targets): 161 | self.targets[n].update() 162 | self.belief_targets[n].predict() # Belief state at t+1 163 | # Agents move (t -> 
t+1) and observe the targets 164 | for ii, agent_id in enumerate(action_dict): 165 | obs_dict[self.agents[ii].agent_id] = [] 166 | reward_dict[self.agents[ii].agent_id] = [] 167 | done_dict[self.agents[ii].agent_id] = [] 168 | 169 | action_vw = self.action_map[action_dict[agent_id]] 170 | 171 | # Locations of all targets and agents in order to maintain a margin between them 172 | margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]] 173 | for p, ids in enumerate(action_dict): 174 | if agent_id != ids: 175 | margin_pos.append(np.array(self.agents[p].state[:2])) 176 | _ = self.agents[ii].update(action_vw, margin_pos) 177 | 178 | # Target and map observations 179 | observed = np.zeros(self.nb_targets, dtype=bool) 180 | # obstacles_pt = map_utils.get_closest_obstacle(self.MAP, self.agents[ii].state) 181 | # if obstacles_pt is None: 182 | obstacles_pt = (self.sensor_r, np.pi) 183 | 184 | # Update beliefs of targets 185 | for jj in range(self.nb_targets): 186 | # Observe 187 | obs, z_t = self.observation(self.targets[jj], self.agents[ii]) 188 | observed[jj] = obs 189 | if obs: # if observed, update the target belief. 190 | self.belief_targets[jj].update(z_t, self.agents[ii].state) 191 | 192 | r_b, alpha_b = util.relative_distance_polar(self.belief_targets[jj].state[:2], 193 | xy_base=self.agents[ii].state[:2], 194 | theta_base=self.agents[ii].state[-1]) 195 | r_dot_b, alpha_dot_b = util.relative_velocity_polar( 196 | self.belief_targets[jj].state[:2], 197 | self.belief_targets[jj].state[2:], 198 | self.agents[ii].state[:2], self.agents[ii].state[-1], 199 | action_vw[0], action_vw[1]) 200 | obs_dict[agent_id].append([r_b, alpha_b, r_dot_b, alpha_dot_b, 201 | np.log(LA.det(self.belief_targets[jj].cov)), float(obs)]) 202 | # Assign agents to closest k targets, if less targets than k, consider all targets 203 | if self.nb_targets > self.k: 204 | for agent_id in obs_dict: 205 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 206 | close = np.argpartition(obs_dict[agent_id][:,0], self.k) 207 | obs_dict[agent_id] = obs_dict[agent_id][close[:self.k]] 208 | else: 209 | for agent_id in obs_dict: 210 | obs_dict[agent_id] = np.asarray(obs_dict[agent_id]) 211 | # Get all rewards after all agents and targets move (t -> t+1) 212 | reward, done, mean_nlogdetcov = self.get_reward(obstacles_pt, observed, self.is_training) 213 | reward_dict['__all__'], done_dict['__all__'], info_dict['mean_nlogdetcov'] = reward, done, mean_nlogdetcov 214 | return obs_dict, reward_dict, done_dict, info_dict 215 | 216 | def reward_fun(nb_targets, belief_targets, is_training=True, c_mean=0.1): 217 | detcov = [LA.det(b_target.cov) for b_target in belief_targets[:nb_targets]] 218 | r_detcov_mean = - np.mean(np.log(detcov)) 219 | reward = c_mean * r_detcov_mean 220 | 221 | mean_nlogdetcov = None 222 | if not(is_training): 223 | logdetcov = [np.log(LA.det(b_target.cov)) for b_target in belief_targets[:nb_targets]] 224 | mean_nlogdetcov = -np.mean(logdetcov) 225 | return reward, False, mean_nlogdetcov -------------------------------------------------------------------------------- /envs/maTTenv/gen_init_pose.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates a set of initial positions of targets and the agent in ttenv. 3 | If you want to have more conditions to generate initial positions other than 4 | the current metadata version, provide values for the additional variables to 5 | the reset function. 
For example, 6 | ex_var = {'init_distance_min':10.0, 7 | 'init_distacne_max':15.0, 8 | 'target_direction':False, 9 | 'belief_direction':False, 10 | 'blocked':True } 11 | obs.reset(**ex_var) 12 | """ 13 | import numpy as np 14 | import envs 15 | import argparse 16 | import pickle 17 | import os, time 18 | 19 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 20 | parser.add_argument('--env', help='environment ID', default='maTracking-v2') 21 | parser.add_argument('--render', type=int, default=1) 22 | parser.add_argument('--map', type=str, default="emptyMed") 23 | parser.add_argument('--nb_agents', type=int, default=2) 24 | parser.add_argument('--nb_targets', type=int, default=2) 25 | parser.add_argument('--nb_init_pose', type=int, default=10) 26 | parser.add_argument('--log_dir', type=str, default='.') 27 | 28 | args = parser.parse_args() 29 | 30 | def main(): 31 | env = envs.make(args.env, 32 | 'ma_target_tracking', 33 | render=bool(args.render), 34 | directory=args.log_dir, 35 | map_name=args.map, 36 | num_agents=args.nb_agents, 37 | num_targets=args.nb_targets, 38 | is_training=False, 39 | ) 40 | timelimit_env = env 41 | while( not hasattr(timelimit_env, '_elapsed_steps')): 42 | timelimit_env = timelimit_env.env 43 | init_pose = [] 44 | while(len(init_pose) < args.nb_init_pose): # test episode 45 | obs, done = env.reset(), False 46 | if args.render: 47 | env.render() 48 | notes = input("%d, Pass? y/n"%len(init_pose)) 49 | if notes == "y": 50 | init_pose.append({'agents':[timelimit_env.env.agents[i].state for i in range(args.nb_agents)], 51 | 'targets':[timelimit_env.env.targets[i].state for i in range(args.nb_targets)], 52 | 'belief_targets':[timelimit_env.env.belief_targets[i].state for i in range(args.nb_targets)]}) 53 | 54 | pickle.dump(init_pose, open(os.path.join(args.log_dir,'init_pose_random_1015.pkl'), 'wb')) 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty100.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/maps/empty100.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty100.yaml: -------------------------------------------------------------------------------- 1 | # Empty Map 2 | mapmin: [0, 0] 3 | mapmax: [250, 250] 4 | mapres: [.5, .5] 5 | mapdim: [500, 500] 6 | origin: [125, 125] 7 | origincells: [125, 125] 8 | datatype: c 9 | storage: colmajor 10 | mappath: emptyMap.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty1000.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/maps/empty1000.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty1000.yaml: -------------------------------------------------------------------------------- 1 | # Empty Map 2 | mapmin: [0, 0] 3 | mapmax: [790, 790] 4 | mapres: [.5, .5] 5 | mapdim: [1580, 1580] 6 | origin: [395, 395] 7 | origincells: [395, 395] 8 | datatype: c 9 | storage: colmajor 10 | mappath: emptyMap.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty20.cfg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/maps/empty20.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty20.yaml: -------------------------------------------------------------------------------- 1 | # Empty Map 2 | mapmin: [0, 0] 3 | mapmax: [112, 112] 4 | mapres: [.5, .5] 5 | mapdim: [224, 224] 6 | origin: [56, 56] 7 | origincells: [56, 56] 8 | datatype: c 9 | storage: colmajor 10 | mappath: emptyMap.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty8.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/maps/empty8.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/empty8.yaml: -------------------------------------------------------------------------------- 1 | # Empty Map 2 | mapmin: [0, 0] 3 | mapmax: [70, 70] 4 | mapres: [.5, .5] 5 | mapdim: [140, 140] 6 | origin: [35, 35] 7 | origincells: [35, 35] 8 | datatype: c 9 | storage: colmajor 10 | mappath: emptyMap.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/emptyMed.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/maps/emptyMed.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/emptyMed.yaml: -------------------------------------------------------------------------------- 1 | # Empty Map 2 | mapmin: [0, 0] 3 | mapmax: [51, 51] 4 | mapres: [.5, .5] 5 | mapdim: [101, 101] 6 | origin: [25.5, 25.5] 7 | origincells: [50, 50] 8 | datatype: c 9 | storage: colmajor 10 | mappath: emptyMap.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/emptySmall.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/maTTenv/maps/emptySmall.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/emptySmall.yaml: -------------------------------------------------------------------------------- 1 | # Empty Map 2 | mapmin: [0, 0] 3 | mapmax: [27, 27] 4 | mapres: [.2, .2] 5 | mapdim: [135, 135] 6 | origin: [13.5, 13.5] 7 | origincells: [67, 67] 8 | datatype: c 9 | storage: colmajor 10 | mappath: emptyMap.cfg -------------------------------------------------------------------------------- /envs/maTTenv/maps/map_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions / Objects for occupancy grid maps. 3 | 4 | !!IMPORTANT!! xy-rc coordinate relation: 5 | 2D map. (xmin, ymin) is located at the bottom left corner. 6 | rows are counted from the bottom to top. 7 | columns are counted from left to right. 8 | 9 | Matplotlib.pyplot displays a matrix with (r,c)=(0,0) located at the top left. 10 | Therefore, in the display_wrapper, the map is flipped. 
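Worked example (illustrative numbers, assuming mapmin = [0, 0] and
mapres = [.5, .5] as in the empty map configs above), for a GridMap instance m:
    m.se2_to_cell([1.2, 3.0]) -> (2, 6)         # first index from x, second from y
    m.cell_to_se2((2, 6))     -> [1.25, 3.25]   # cell center, not the original point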
11 | """ 12 | import numpy as np 13 | import yaml 14 | 15 | def round(x): 16 | if x >= 0: 17 | return int(x+0.5) 18 | else: 19 | return int(x-0.5) 20 | 21 | class GridMap(object): 22 | def __init__(self, map_path, r_max=1.0, fov=np.pi, margin2wall=0.5): 23 | map_config = yaml.safe_load(open(map_path+".yaml", "r")) 24 | self.map = np.loadtxt(map_path+".cfg") 25 | if 'empty' in map_path: 26 | self.map = None 27 | else: 28 | self.map_linear = np.squeeze(self.map.astype(np.int8).reshape(-1, 1)) 29 | self.mapdim = map_config['mapdim'] 30 | self.mapres = np.array(map_config['mapres']) 31 | self.mapmin = np.array(map_config['mapmin']) 32 | self.mapmax = np.array(map_config['mapmax']) 33 | self.margin2wall = margin2wall 34 | self.origin = map_config['origin'] 35 | self.r_max = r_max 36 | self.fov = fov 37 | 38 | def se2_to_cell(self, pos): 39 | pos = pos[:2] 40 | cell_idx = (pos - self.mapmin)/self.mapres - 0.5 41 | return round(cell_idx[0]), round(cell_idx[1]) 42 | 43 | def cell_to_se2(self, cell_idx): 44 | return ( np.array(cell_idx) + 0.5 ) * self.mapres + self.mapmin 45 | 46 | def bresenham2D(sx, sy, ex, ey): 47 | """ 48 | Bresenham's ray tracing algorithm in 2D from ESE650 2017 TA resources 49 | Inputs: 50 | (sx, sy) start point of ray 51 | (ex, ey) end point of ray 52 | Outputs: 53 | Indicies for x-axis and y-axis 54 | """ 55 | sx = int(round(sx)) 56 | sy = int(round(sy)) 57 | ex = int(round(ex)) 58 | ey = int(round(ey)) 59 | dx = abs(ex-sx) 60 | dy = abs(ey-sy) 61 | steep = abs(dy)>abs(dx) 62 | if steep: 63 | dx,dy = dy,dx # swap 64 | 65 | if dy == 0: 66 | q = np.zeros((dx+1,1)) 67 | else: 68 | q = np.append(0,np.greater_equal(np.diff(np.mod(np.arange( np.floor(dx/2), -dy*dx+np.floor(dx/2)-1,-dy),dx)),0)) 69 | if steep: 70 | if sy <= ey: 71 | y = np.arange(sy,ey+1) 72 | else: 73 | y = np.arange(sy,ey-1,-1) 74 | if sx <= ex: 75 | x = sx + np.cumsum(q) 76 | else: 77 | x = sx - np.cumsum(q) 78 | else: 79 | if sx <= ex: 80 | x = np.arange(sx,ex+1) 81 | else: 82 | x = np.arange(sx,ex-1,-1) 83 | if sy <= ey: 84 | y = sy + np.cumsum(q) 85 | else: 86 | y = sy - np.cumsum(q) 87 | return np.vstack((x,y)).astype(np.int16) 88 | 89 | def coord_change2g(vec, ang): 90 | assert(len(vec) == 2) 91 | # R * v 92 | return np.array([[np.cos(ang), -np.sin(ang)], [np.sin(ang), np.cos(ang)]])@vec 93 | 94 | def is_collision_ray_cell(map_obj, cell): 95 | """ 96 | cell : cell r, c index from left bottom. 
97 | """ 98 | idx = cell[0] + map_obj.mapdim[0] * cell[1] 99 | if (cell[0] < 0) or (cell[1] < 0) or (cell[0] >= map_obj.mapdim[0]) or (cell[1] >= map_obj.mapdim[1]): 100 | return True 101 | elif (map_obj.map is not None) and map_obj.map_linear[idx] == 1: 102 | return True 103 | else: 104 | return False 105 | def is_blocked(map_obj, start_pos, end_pos): 106 | if map_obj.map is None: 107 | return False 108 | start_rc = map_obj.se2_to_cell(start_pos) 109 | end_rc = map_obj.se2_to_cell(end_pos) 110 | ray_cells = bresenham2D(start_rc[0], start_rc[1], end_rc[0], end_rc[1]) 111 | i = 0 112 | while(i < ray_cells.shape[-1]): 113 | if is_collision_ray_cell(map_obj, ray_cells[:,i]): 114 | return True 115 | i += 1 116 | return False 117 | 118 | def get_front_obstacle(map_obj, odom, **kwargs): 119 | ro_min_t = None 120 | start_rc = map_obj.se2_to_cell(odom[:2]) 121 | end_pt_global_frame = coord_change2g(np.array([map_obj.r_max*np.cos(0.0), map_obj.r_max*np.sin(0.0)]), odom[-1]) + odom[:2] 122 | if map_obj.map is None: 123 | if not(in_bound(map_obj, end_pt_global_frame)): 124 | end_rc = map_obj.se2_to_cell(end_pt_global_frame) 125 | ray_cells = bresenham2D(start_rc[0], start_rc[1], end_rc[0], end_rc[1]) 126 | i = 0 127 | while(i < ray_cells.shape[-1]): 128 | pt = map_obj.cell_to_se2(ray_cells[:,i]) 129 | if not(in_bound(map_obj, pt)): 130 | break 131 | i += 1 132 | if i < ray_cells.shape[-1]: # break! 133 | ro_min_t = np.sqrt(np.sum(np.square(pt - odom[:2]))) 134 | 135 | else: 136 | end_rc = map_obj.se2_to_cell(end_pt_global_frame) 137 | ray_cells = bresenham2D(start_rc[0], start_rc[1], end_rc[0], end_rc[1]) 138 | i = 0 139 | while(i < ray_cells.shape[-1]): # break! 140 | if is_collision_ray_cell(map_obj, ray_cells[:,i]): 141 | break 142 | i += 1 143 | if i < ray_cells.shape[-1]: 144 | ro_min_t = np.sqrt(np.sum(np.square(map_obj.cell_to_se2(ray_cells[:,i]) - odom[:2]))) 145 | 146 | if ro_min_t is None: 147 | return None 148 | else: 149 | return ro_min_t, 0.0 150 | 151 | def get_closest_obstacle(map_obj, odom, ang_res=0.05): 152 | """ 153 | Return the closest obstacle/boundary cell 154 | """ 155 | ang_grid = np.arange(-.5*map_obj.fov, .5*map_obj.fov, ang_res) 156 | closest_obstacle = (map_obj.r_max, 0.0) 157 | start_rc = map_obj.se2_to_cell(odom[:2]) 158 | for ang in ang_grid: 159 | end_pt_global_frame = coord_change2g(np.array([map_obj.r_max*np.cos(ang), map_obj.r_max*np.sin(ang)]), odom[-1]) + odom[:2] 160 | 161 | if map_obj.map is None: 162 | if not(in_bound(map_obj, end_pt_global_frame)): 163 | end_rc = map_obj.se2_to_cell(end_pt_global_frame) 164 | ray_cells = bresenham2D(start_rc[0], start_rc[1], end_rc[0], end_rc[1]) 165 | i = 0 166 | while(i < ray_cells.shape[-1]): 167 | pt = map_obj.cell_to_se2(ray_cells[:,i]) 168 | if not(in_bound(map_obj, pt)): 169 | break 170 | i += 1 171 | if i < ray_cells.shape[-1]: # break! 172 | ro_min_t = np.sqrt(np.sum(np.square(pt - odom[:2]))) 173 | if ro_min_t < closest_obstacle[0]: 174 | closest_obstacle = (ro_min_t, ang) 175 | 176 | else: 177 | end_rc = map_obj.se2_to_cell(end_pt_global_frame) 178 | ray_cells = bresenham2D(start_rc[0], start_rc[1], end_rc[0], end_rc[1]) 179 | i = 0 180 | while(i < ray_cells.shape[-1]): # break! 
181 | if is_collision_ray_cell(map_obj, ray_cells[:,i]): 182 | break 183 | i += 1 184 | if i < ray_cells.shape[-1]: 185 | ro_min_t = np.sqrt(np.sum(np.square(map_obj.cell_to_se2(ray_cells[:,i]) - odom[:2]))) 186 | if ro_min_t < closest_obstacle[0]: 187 | closest_obstacle = (ro_min_t, ang) 188 | 189 | if closest_obstacle[0] == map_obj.r_max: 190 | return None 191 | else: 192 | return closest_obstacle 193 | 194 | def is_collision(map_obj, pos): 195 | if not(in_bound(map_obj, pos)): 196 | return True 197 | else: 198 | if map_obj.map is not None: 199 | n = np.ceil(map_obj.margin2wall/map_obj.mapres).astype(np.int16) 200 | cell = np.minimum([map_obj.mapdim[0]-1,map_obj.mapdim[1]-1] , map_obj.se2_to_cell(pos)) 201 | for r_add in np.arange(-n[1],n[1],1): 202 | for c_add in np.arange(-n[0],n[0],1): 203 | x_c = np.clip(cell[0]+r_add, 0, map_obj.mapdim[0]-1).astype(np.int16) 204 | y_c = np.clip(cell[1]+c_add,0,map_obj.mapdim[1]-1).astype(np.int16) 205 | idx = x_c + map_obj.mapdim[0] * y_c 206 | if map_obj.map_linear[idx] == 1: 207 | return True 208 | return False 209 | 210 | def in_bound(map_obj, pos): 211 | return not((pos[0] < map_obj.mapmin[0] + map_obj.margin2wall) 212 | or (pos[0] > map_obj.mapmax[0] - map_obj.margin2wall) 213 | or (pos[1] < map_obj.mapmin[1] + map_obj.margin2wall) 214 | or (pos[1] > map_obj.mapmax[1] - map_obj.margin2wall)) 215 | 216 | def local_map(map_obj, im_size, odom): 217 | """ 218 | im_size : the number of rows/columns 219 | """ 220 | R=np.array([[np.cos(odom[2] - np.pi/2), -np.sin(odom[2] - np.pi/2)], 221 | [np.sin(odom[2] - np.pi/2), np.cos(odom[2] - np.pi/2)]]) 222 | 223 | local_map = np.zeros((im_size, im_size)) 224 | local_mapmin = np.array([-im_size/2*map_obj.mapres[0], 0.0]) 225 | for r in range(im_size): 226 | for c in range(im_size): 227 | xy_local = cell_to_se2([r,c], local_mapmin, map_obj.mapres) 228 | xy_global = np.matmul(R, xy_local) + odom[:2] 229 | local_map[c,r] = int(is_collision_ray_cell(map_obj, map_obj.se2_to_cell(xy_global))) 230 | local_mapmin_g = np.matmul(R, local_mapmin) + odom[:2] 231 | # indvec = np.reshape([[[r,c] for r in range(im_size)] for c in range(im_size)], (-1,2)) 232 | # xy_local = cell_to_se2_batch(indvec, local_mapmin, map_obj.mapres) 233 | # xy_global = np.add(np.matmul(R, xy_local.T).T odom[:2]) 234 | 235 | return local_map, local_mapmin_g 236 | 237 | def se2_to_cell(pos, mapmin, mapres): 238 | pos = pos[:2] 239 | cell_idx = (pos - mapmin)/mapres - 0.5 240 | return round(cell_idx[0]), round(cell_idx[1]) 241 | 242 | def cell_to_se2(cell_idx, mapmin, mapres): 243 | return ( np.array(cell_idx) + 0.5 ) * mapres + mapmin 244 | 245 | def se2_to_cell_batch(pos, mapmin, mapres): 246 | """ 247 | Coversion for Batch input : pos = [batch_size, 2 or 3] 248 | 249 | OUTPUT: [batch_size,], [batch_size,] 250 | """ 251 | return round((pos[:,0] - mapmin[0])/mapres[0] - 0.5), round((pos[:,1] - mapmin[1])/mapres[1] - 0.5) 252 | 253 | def cell_to_se2_batch(cell_idx, mapmin, mapres): 254 | """ 255 | Coversion for Batch input : cell_idx = [batch_size, 2] 256 | 257 | OUTPUT: [batch_size, 2] 258 | """ 259 | return (cell_idx[:,0] + 0.5) * mapres[0] + mapmin[0], (cell_idx[:,1] + 0.5) * mapres[1] + mapmin[1] 260 | 261 | def generate_trajectory(map_obj): 262 | import matplotlib 263 | matplotlib.use('TkAgg') 264 | from matplotlib import pyplot as plt 265 | from scipy.interpolate import interp1d 266 | 267 | ax = plt.gca() 268 | fig = plt.gcf() 269 | implot = ax.imshow(map_obj.map, cmap='gray_r', origin='lower', extent=[map_obj.mapmin[0], map_obj.mapmax[0], 
map_obj.mapmin[1], map_obj.mapmax[1]]) 270 | 271 | path_points = [] 272 | def onclick(event): 273 | if event.xdata != None and event.ydata != None: 274 | ax.plot(event.xdata, event.ydata, 'ro') 275 | plt.draw() 276 | path_points.append([event.xdata, event.ydata]) 277 | cid = fig.canvas.mpl_connect('button_press_event', onclick) 278 | 279 | plt.show() 280 | print("%d points selected"%len(path_points)) 281 | path_points = np.array(path_points) 282 | interpol_num = 5 283 | passed = False 284 | vel = 0.2 285 | T_step = 100 286 | while(not passed): 287 | ax = plt.gca() 288 | fig = plt.gcf() 289 | implot = ax.imshow(map_obj.map, cmap='gray_r', origin='lower', extent=[map_obj.mapmin[0], map_obj.mapmax[0], map_obj.mapmin[1], map_obj.mapmax[1]]) 290 | 291 | sorted_paths = [] 292 | newx = [] 293 | newy = [] 294 | 295 | t_done = False 296 | t = 1 297 | while(not t_done): 298 | tmp = [path_points[t-1]] 299 | tmp_x = [] 300 | while((path_points[t-1,0] < path_points[t,0])): 301 | tmp.append(path_points[t]) 302 | distance = np.sqrt(np.sum(np.square( path_points[t] - path_points[t-1] ))) 303 | num = distance / (0.5 * vel) 304 | tmp_x.extend(np.linspace(path_points[t-1,0], path_points[t,0], num=num, endpoint=False)) 305 | t += 1 306 | if t >= len(path_points): 307 | t_done = True 308 | break 309 | if len(tmp) > 1: 310 | tmp = np.array(tmp) 311 | f0 = interp1d(tmp[:,0], tmp[:,1], kind='cubic') 312 | newx.extend(tmp_x) 313 | newy.extend(f0(np.array(tmp_x))) 314 | sorted_paths.append(tmp) 315 | if t_done: 316 | break 317 | 318 | tmp = [path_points[t-1]] 319 | tmp_x = [] 320 | while((path_points[t-1,0] >= path_points[t,0])): 321 | tmp.append(path_points[t]) 322 | distance = np.sqrt(np.sum(np.square( path_points[t] - path_points[t-1] ))) 323 | num = distance / (0.5 * vel) 324 | tmp_x.extend(np.linspace(path_points[t-1,0], path_points[t,0], num=num, endpoint=False)) 325 | t += 1 326 | if t >= len(path_points): 327 | t_done = True 328 | break 329 | tmp = np.array(tmp) 330 | f0 = interp1d(np.flip(tmp[:,0], -1), np.flip(tmp[:,1], -1), kind='cubic') 331 | newx.extend(tmp_x) 332 | newy.extend(f0(np.array(tmp_x))) 333 | sorted_paths.append(tmp) 334 | 335 | ax.plot(path_points[:,0], path_points[:,1],'ro') 336 | ax.plot(newx, newy,'b.') 337 | print("Total point number: %d"%len(newx)) 338 | plt.show() 339 | if len(newx) > T_step: 340 | passed = True 341 | else: 342 | interpol_num += 1 343 | newx = np.array(newx) 344 | newy = np.array(newy) 345 | x_vel = (newx[1:]-newx[:-1])/0.5 346 | y_vel = (newy[1:]-newy[:-1])/0.5 347 | theta = np.arctan2(newy[1:]-newy[:-1], newx[1:]-newx[:-1]) 348 | 349 | final = np.concatenate((newx[:-1, np.newaxis], newy[:-1, np.newaxis], theta[:,np.newaxis], x_vel[:,np.newaxis], y_vel[:,np.newaxis]), axis=1) 350 | np.save(open("path_sh_1.npy", "wb"), final) 351 | 352 | 353 | def generate_map(mapname, mapdim=(8,4), mapres=0.1): 354 | new_map = np.zeros((int(mapdim[0]/mapres), int(mapdim[1]/mapres)), dtype=np.int8) 355 | """ 356 | 0 0 0 0 357 | 0 1 0 0 358 | 0 0 0 0 359 | 0 0 1 0 360 | 0 0 0 0 361 | 0 1 0 0 362 | 0 1 0 0 363 | 0 0 0 0 364 | """ 365 | # Boundary 366 | new_map[0,:] = 1.0 367 | new_map[:,0] = 1.0 368 | new_map[-1,:] = 1.0 369 | new_map[:,-1] = 1.0 370 | 371 | # Obstacles 372 | new_map[int(1.0/mapres):int(2.0/mapres), int(1.0/mapres):int(2.0/mapres)] = 1.0 373 | new_map[int(3.0/mapres):int(4.0/mapres), int(2.0/mapres):int(3.0/mapres)] = 1.0 374 | new_map[int(5.0/mapres):int(7.0/mapres), int(1.0/mapres):int(2.0/mapres)] = 1.0 375 | 376 | new_map = new_map.astype(np.int8) 377 | 
np.savetxt(mapname, new_map, fmt='%d') 378 | -------------------------------------------------------------------------------- /envs/maTTenv/metadata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | ##Beliefs are initialized near target 4 | METADATA={ 5 | 'version' : 1, 6 | 'sensor_r': 10.0, 7 | 'fov' : 90, 8 | 'sensor_r_sd': 0.2, # sensor range noise. 9 | 'sensor_b_sd': 0.01, # sensor bearing noise. 10 | 'target_init_cov': 30.0, # initial target diagonal Covariance. 11 | 'target_init_vel': 0.0, # target's initial velocity. 12 | 'target_vel_limit': 2.0, # velocity limit of targets. 13 | 'init_distance_min': 5.0, # the minimum distance btw targets and the agent. 14 | 'init_distance_max': 10.0, # the maximum distance btw targets and the agent. 15 | 'init_belief_distance_min': 0.0, # the minimum distance btw belief and the target. 16 | 'init_belief_distance_max': 5.0, # the maximum distance btw belief and the target. 17 | 'margin': 1.0, # a marginal distance btw targets and the agent. 18 | 'margin2wall': 0.5, # a marginal distance from a wall. 19 | 'action_v': [2, 1.33, 0.67, 0], # action primitives - linear velocities. 20 | 'action_w': [np.pi/2, 0, -np.pi/2], # action primitives - angular velocities. 21 | 'const_q': 0.001, # target noise constant in beliefs. 22 | 'const_q_true': 0.01, # target noise constant of actual targets. 23 | } -------------------------------------------------------------------------------- /envs/maTTenv/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def wrap_around(x): 3 | # x \in [-pi,pi) 4 | if x >= np.pi: 5 | return x - 2*np.pi 6 | elif x < -np.pi: 7 | return x + 2*np.pi 8 | else: 9 | return x 10 | 11 | def vectorized_wrap_around(x): 12 | x[x>=np.pi] -= 2*np.pi 13 | x[x<-np.pi] += 2*np.pi 14 | return x 15 | 16 | def global_relative_measure(x_target, x_main): 17 | diff = x_target - x_main 18 | r = np.sqrt(np.sum(diff**2, axis=1)) 19 | alpha = vectorized_wrap_around(np.arctan2(diff[:,1],diff[:,0])) 20 | #Rearrange to [r,alpha]*(nb_agents,nb_targets) 21 | global_state = np.ndarray.flatten(np.vstack((r,alpha)).T) 22 | return global_state 23 | 24 | def cartesian2polar(xy): 25 | r = np.sqrt(np.sum(xy**2)) 26 | alpha = np.arctan2(xy[1], xy[0]) 27 | return r, alpha 28 | 29 | def cartesian2polar_dot(x, y, x_dot, y_dot): 30 | r2 = x*x + y*y 31 | if r2 == 0.0: 32 | return 0.0, 0.0 33 | r_dot = (x*x_dot + y*y_dot)/np.sqrt(r2) 34 | alpha_dot = (x*y_dot - x_dot*y)/r2 35 | return r_dot, alpha_dot 36 | 37 | def transform_2d(vec, theta_base, xy_base = [0.0, 0.0]): 38 | """ 39 | Both vec and frame_xy are in the global coordinate. vec is a vector 40 | you want to transform with respect to a certain frame which is located at 41 | frame_xy with ang. 42 | R^T * (vec - frame_xy). 43 | R is a rotation matrix of the frame w.r.t the global frame. 44 | """ 45 | assert(len(vec) == 2) 46 | return np.matmul([[np.cos(theta_base), np.sin(theta_base)], [-np.sin(theta_base), np.cos(theta_base)]], 47 | vec - np.array(xy_base)) 48 | 49 | def rotation_2d_dot(xy_target, xy_dot_target, theta_base, theta_dot_base): 50 | """ 51 | Cartesian velocity in a rotating frame. 
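The angle and polar helpers above feed the environment's relative-state computations. A quick usage sketch (the import assumes the repo root is on PYTHONPATH, e.g. via `source setup`; the sample positions are made up):

```python
import numpy as np
from envs.maTTenv.util import wrap_around, cartesian2polar, global_relative_measure

# Angles are kept in [-pi, pi).
print(wrap_around(3*np.pi/2))        # -> -pi/2 (about -1.5708)

# Cartesian offset -> (range, bearing).
r, alpha = cartesian2polar(np.array([3.0, 4.0]))
print(r, alpha)                      # -> 5.0, ~0.927 rad

# Range/bearing from one agent to every target, flattened as [r, alpha, r, alpha, ...].
x_targets = np.array([[5.0, 0.0], [0.0, 5.0]])
x_agent = np.array([0.0, 0.0])
print(global_relative_measure(x_targets, x_agent))   # -> [5., 0., 5., ~1.5708]
```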
52 | """ 53 | s_b = np.sin(theta_base) 54 | c_b = np.cos(theta_base) 55 | x_dot_target_bframe = (-s_b * xy_target[0] + c_b * xy_target[1]) * theta_dot_base + \ 56 | c_b * xy_dot_target[0] + s_b * xy_dot_target[1] 57 | y_dot_target_bframe = - (c_b * xy_target[0] + s_b * xy_target[1]) * theta_dot_base - \ 58 | s_b * xy_dot_target[0] +c_b * xy_dot_target[1] 59 | return x_dot_target_bframe, y_dot_target_bframe 60 | 61 | def transform_2d_dot(xy_target, xy_dot_target, theta_base, theta_dot_base, xy_base, xy_dot_base): 62 | """ 63 | Cartesian velocity in a rotating and translating frame. 64 | """ 65 | rotated_xy_dot_target_bframe = rotation_2d_dot(xy_target, xy_dot_target, theta_base, theta_dot_base) 66 | rotated_xy_dot_base_bframe = rotation_2d_dot(xy_base, xy_dot_base, theta_base, theta_dot_base) 67 | return np.array(rotated_xy_dot_target_bframe) - np.array(rotated_xy_dot_base_bframe) 68 | 69 | def relative_distance_polar(xy_target, xy_base, theta_base): 70 | xy_target_base = transform_2d(xy_target, theta_base, xy_base) 71 | return cartesian2polar(xy_target_base) 72 | 73 | def relative_velocity_polar(xy_target, xy_dot_target, xy_base, theta_base, v_base, w_base): 74 | """ 75 | Relative velocity in a given polar coordinate (radial velocity, angular velocity). 76 | 77 | Parameters: 78 | --------- 79 | xy_target : xy coordinate of a target in the global frame. 80 | xy_dot_target : xy velocity of a target in the global frame. 81 | xy_base : xy coordinate of the origin of a base frame in the global frame. 82 | theta_base : orientation of a base frame in the global frame. 83 | v_base : translational velocity of a base frame in the global frame. 84 | w_base : rotational velocity of a base frame in the global frame. 85 | """ 86 | xy_dot_base = vw_to_xydot(v_base, w_base, theta_base) 87 | xy_target_base = transform_2d(xy_target, theta_base, xy_base) 88 | xy_dot_target_base = transform_2d_dot(xy_target, xy_dot_target, theta_base, 89 | w_base, xy_base, xy_dot_base) 90 | r_dot_b, alpha_dot_b = cartesian2polar_dot(xy_target_base[0], 91 | xy_target_base[1], xy_dot_target_base[0], xy_dot_target_base[1]) 92 | return r_dot_b, alpha_dot_b 93 | 94 | def relative_velocity_polar_se2(xyth_target, vw_target, xyth_base, vw_base): 95 | """ 96 | Radial and angular velocity of the target with respect to the base frame 97 | located at xy_b with a rotation of theta_b, moving with a translational 98 | velocity v and rotational velocity w. This function is designed specifically 99 | for the SE2 Agent and SE2 Target target case. 100 | 101 | Parameters 102 | --------- 103 | xyth_target : (x, y, orientation) of a target in the global frame. 104 | vw_target : translational and rotational velocity of a target in the global frame. 105 | xyth_base : (x, y, orientation) of a base frame in the global frame. 106 | vw_base : translational and rotational velocity of a base frame in the global frame. 107 | """ 108 | xy_dot_target = vw_to_xydot(vw_target[0], vw_target[1], xyth_base[2]) 109 | return relative_velocity_polar(xyth_target[:2], xy_dot_target, xyth_base[:2], 110 | xyth_base[2], vw_base[0], vw_base[1]) 111 | 112 | def vw_to_xydot(v, w, theta): 113 | """ 114 | Conversion from translational and rotational velocity to cartesian velocity 115 | in the global frame according to differential-drive dynamics. 116 | 117 | Parameters 118 | --------- 119 | v : translational velocity. 120 | w : rotational velocity. 121 | theta : orientation of a base object in the global frame. 
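To make the frame conventions in these helpers concrete, here is a small sketch computing the range/bearing of a target and their rates as seen from a moving agent (all poses and velocities are invented for illustration; the import assumes the repo root is on PYTHONPATH):

```python
import numpy as np
from envs.maTTenv.util import relative_distance_polar, relative_velocity_polar

# Agent (base frame): at the origin, facing +y, driving straight ahead at 1 m/s.
xy_base, theta_base = np.array([0.0, 0.0]), np.pi/2
v_base, w_base = 1.0, 0.0

# Target: 3 m directly ahead of the agent, drifting sideways at 0.5 m/s.
xy_target = np.array([0.0, 3.0])
xy_dot_target = np.array([0.5, 0.0])

r, alpha = relative_distance_polar(xy_target, xy_base, theta_base)
r_dot, alpha_dot = relative_velocity_polar(xy_target, xy_dot_target,
                                           xy_base, theta_base, v_base, w_base)
print(r, alpha)          # -> 3.0, ~0.0    (dead ahead in the agent frame)
print(r_dot, alpha_dot)  # -> -1.0, ~-0.17 (range closing; bearing decreasing)
```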
122 | """ 123 | if w < 0.001: 124 | x_dot = v * np.cos(theta + w/2) 125 | y_dot = v * np.sin(theta + w/2) 126 | else: 127 | x_dot = v/w * (np.sin(theta + w) - np.sin(theta)) 128 | y_dot = v/w * (np.cos(theta) - np.cos(theta + w)) 129 | return x_dot, y_dot 130 | -------------------------------------------------------------------------------- /envs/run_ma_tracking.py: -------------------------------------------------------------------------------- 1 | import maTTenv 2 | import numpy as np 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 6 | parser.add_argument('--env', help='environment ID', type=str, default='setTracking-v0') 7 | parser.add_argument('--render', help='whether to render', type=int, default=0) 8 | parser.add_argument('--record', help='whether to record', type=int, default=0) 9 | parser.add_argument('--nb_agents', help='the number of agents', type=int, default=4) 10 | parser.add_argument('--nb_targets', help='the number of targets', type=int, default=4) 11 | parser.add_argument('--log_dir', help='a path to a directory to log your data', type=str, default='.') 12 | parser.add_argument('--map', type=str, default="emptyMed") 13 | 14 | args = parser.parse_args() 15 | 16 | # @profile 17 | def main(): 18 | env = maTTenv.make(args.env, 19 | render=args.render, 20 | record=args.record, 21 | directory=args.log_dir, 22 | map_name=args.map, 23 | num_agents=args.nb_agents, 24 | num_targets=args.nb_targets, 25 | is_training=False, 26 | ) 27 | nlogdetcov = [] 28 | action_dict = {} 29 | done = {'__all__':False} 30 | 31 | obs = env.reset() 32 | # See below why this check is needed for training or eval loop 33 | while not done['__all__']: 34 | if args.render: 35 | env.render() 36 | 37 | for agent_id, o in obs.items(): 38 | action_dict[agent_id] = env.action_space.sample() 39 | 40 | obs, rew, done, info = env.step(action_dict) 41 | nlogdetcov.append(info['mean_nlogdetcov']) 42 | 43 | print("Sum of negative logdet of the target belief covariances : %.2f"%np.sum(nlogdetcov)) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | """ 48 | To use line_profiler 49 | add @profile before a function to profile 50 | kernprof -l run_ma_example.py --env setTracking-v3 --nb_agents 4 --nb_targets 4 --render 0 51 | python -m line_profiler run_ma_example.py.lprof 52 | 53 | Examples: 54 | >>> env = MyMultiAgentEnv() 55 | >>> obs = env.reset() 56 | >>> print(obs) 57 | { 58 | "agent_0": [2.4, 1.6], 59 | "agent_1": [3.4, -3.2], 60 | } 61 | >>> obs, rewards, dones, infos = env.step( 62 | action_dict={ 63 | "agent_0": 1, "agent_1": 0, 64 | }) 65 | >>> print(rew) 66 | { 67 | "agent_0": 3, 68 | "agent_1": -1, 69 | "__all__": 2, 70 | } 71 | >>> print(done) 72 | #Due to gym wrapper, done at TimeLimit is bool, True. 73 | #During episode, it is a dict so.. 
74 | #While done is a dict keep running 75 | { 76 | "agent_0": False, # agent_0 is still running 77 | "agent_1": True, # agent_1 is done 78 | "__all__": False, # the env is not done 79 | } 80 | >>> print(info) 81 | { 82 | "agent_0": {}, # info for agent_0 83 | "agent_1": {}, # info for agent_1 84 | } 85 | """ -------------------------------------------------------------------------------- /envs/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/envs/utilities/__init__.py -------------------------------------------------------------------------------- /envs/utilities/ma_time_limit.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | '''Main difference from gym wrappers time_limit.py is that it updates a 4 | done dict in step rather than a done boolean 5 | Edited by Christopher Hsu 6 | ''' 7 | 8 | class maTimeLimit(gym.Wrapper): 9 | def __init__(self, env, max_episode_steps=None): 10 | super(maTimeLimit, self).__init__(env) 11 | if max_episode_steps is None and self.env.spec is not None: 12 | max_episode_steps = env.spec.max_episode_steps 13 | if self.env.spec is not None: 14 | self.env.spec.max_episode_steps = max_episode_steps 15 | self._max_episode_steps = max_episode_steps 16 | self._elapsed_steps = None 17 | 18 | def step(self, action): 19 | assert self._elapsed_steps is not None, "Cannot call env.step() before calling reset()" 20 | observation, reward, done, info = self.env.step(action) 21 | self._elapsed_steps += 1 22 | if self._elapsed_steps >= self._max_episode_steps: 23 | info['TimeLimit.truncated'] = not done 24 | done['__all__'] = True 25 | # done = True #og gym version 26 | return observation, reward, done, info 27 | 28 | def reset(self, **kwargs): 29 | self._elapsed_steps = 0 30 | return self.env.reset(**kwargs) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | h5py 3 | typing-extensions 4 | gym[atari] 5 | cython #==0.29.0 6 | numpy #==1.19.0 7 | librosa ==0.4.2 8 | psutil ==5.8.0 9 | opencv-python #==4.1.0.25 10 | tabulate ==0.8.3 11 | llvmlite 12 | numba #==0.52.0 13 | cloudpickle ==1.6.0 14 | tqdm ==4.32.2 15 | joblib #==0.13.2 16 | dill ==0.3.3 17 | progressbar2 ==3.53.1 18 | scikit-learn #==0.24.0 19 | scipy #==1.5.4 20 | pandas #==1.1.5 21 | requests 22 | lz4 23 | filterpy 24 | torch ==1.6.0 25 | # torchvision 26 | pyyaml 27 | mpi4py 28 | tensorboard 29 | seaborn 30 | pillow -------------------------------------------------------------------------------- /setup: -------------------------------------------------------------------------------- 1 | PYTHONPATH="${PYTHONPATH}:$PWD/." 2 | export PYTHONPATH 3 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grasp-lyrl/scalableMARL/01642fb5275494694db48deb9bbd66fab82ed3a5/utils/__init__.py -------------------------------------------------------------------------------- /utils/logSpinUp.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 
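Back to the `maTimeLimit` wrapper above: its only job is to flip `done['__all__']` once `max_episode_steps` is reached (and to record a `TimeLimit.truncated` entry in `info`). A toy sketch with a stand-in environment, hypothetical and written against the older gym API this repo targets:

```python
import gym
from envs.utilities.ma_time_limit import maTimeLimit

class DummyMAEnv(gym.Env):
    """Stand-in multi-agent env that never terminates on its own (illustration only)."""
    def reset(self):
        return {'agent_0': 0.0}
    def step(self, action_dict):
        obs = {'agent_0': 0.0}
        rew = {'agent_0': 0.0}
        done = {'agent_0': False, '__all__': False}
        return obs, rew, done, {}

env = maTimeLimit(DummyMAEnv(), max_episode_steps=5)
obs = env.reset()
done, t = {'__all__': False}, 0
while not done['__all__']:
    obs, rew, done, info = env.step({'agent_0': 0})
    t += 1
print(t)  # -> 5: the wrapper forces done['__all__'] = True at the step limit
```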
4 | 5 | Logs to a tab-separated-values file (path/to/output_directory/progress.txt) 6 | 7 | """ 8 | import json 9 | import joblib 10 | import shutil 11 | import numpy as np 12 | # import tensorflow as tf 13 | import torch 14 | import os.path as osp, time, atexit, os 15 | import warnings 16 | from utils.mpi_tools import proc_id, mpi_statistics_scalar 17 | from utils.serialization_utils import convert_json 18 | 19 | color2num = dict( 20 | gray=30, 21 | red=31, 22 | green=32, 23 | yellow=33, 24 | blue=34, 25 | magenta=35, 26 | cyan=36, 27 | white=37, 28 | crimson=38 29 | ) 30 | 31 | def colorize(string, color, bold=False, highlight=False): 32 | """ 33 | Colorize a string. 34 | 35 | This function was originally written by John Schulman. 36 | """ 37 | attr = [] 38 | num = color2num[color] 39 | if highlight: num += 10 40 | attr.append(str(num)) 41 | if bold: attr.append('1') 42 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 43 | 44 | def restore_tf_graph(sess, fpath): 45 | """ 46 | Loads graphs saved by Logger. 47 | 48 | Will output a dictionary whose keys and values are from the 'inputs' 49 | and 'outputs' dict you specified with logger.setup_tf_saver(). 50 | 51 | Args: 52 | sess: A Tensorflow session. 53 | fpath: Filepath to save directory. 54 | 55 | Returns: 56 | A dictionary mapping from keys to tensors in the computation graph 57 | loaded from ``fpath``. 58 | """ 59 | tf.saved_model.loader.load( 60 | sess, 61 | [tf.saved_model.tag_constants.SERVING], 62 | fpath 63 | ) 64 | model_info = joblib.load(osp.join(fpath, 'model_info.pkl')) 65 | graph = tf.get_default_graph() 66 | model = dict() 67 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['inputs'].items()}) 68 | model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['outputs'].items()}) 69 | return model 70 | 71 | class Logger: 72 | """ 73 | A general-purpose logger. 74 | 75 | Makes it easy to save diagnostics, hyperparameter configurations, the 76 | state of a training run, and the trained model. 77 | """ 78 | 79 | def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None): 80 | """ 81 | Initialize a Logger. 82 | 83 | Args: 84 | output_dir (string): A directory for saving results to. If 85 | ``None``, defaults to a temp directory of the form 86 | ``/tmp/experiments/somerandomnumber``. 87 | 88 | output_fname (string): Name for the tab-separated-value file 89 | containing metrics logged throughout a training run. 90 | Defaults to ``progress.txt``. 91 | 92 | exp_name (string): Experiment name. If you run multiple training 93 | runs and give them all the same ``exp_name``, the plotter 94 | will know to group them. (Use case: if you run the same 95 | hyperparameter configuration with multiple random seeds, you 96 | should give them all the same ``exp_name``.) 97 | """ 98 | if proc_id()==0: 99 | self.output_dir = output_dir or "/tmp/experiments/%i"%int(time.time()) 100 | if osp.exists(self.output_dir): 101 | print("Warning: Log dir %s already exists! 
Storing info there anyway."%self.output_dir) 102 | else: 103 | os.makedirs(self.output_dir) 104 | self.output_file = open(osp.join(self.output_dir, output_fname), 'w') 105 | atexit.register(self.output_file.close) 106 | print(colorize("Logging data to %s"%self.output_file.name, 'green', bold=True)) 107 | else: 108 | self.output_dir = None 109 | self.output_file = None 110 | self.first_row=True 111 | self.log_headers = [] 112 | self.log_current_row = {} 113 | self.exp_name = exp_name 114 | 115 | def log(self, msg, color='green'): 116 | """Print a colorized message to stdout.""" 117 | if proc_id()==0: 118 | print(colorize(msg, color, bold=True)) 119 | 120 | def log_tabular(self, key, val): 121 | """ 122 | Log a value of some diagnostic. 123 | 124 | Call this only once for each diagnostic quantity, each iteration. 125 | After using ``log_tabular`` to store values for each diagnostic, 126 | make sure to call ``dump_tabular`` to write them out to file and 127 | stdout (otherwise they will not get saved anywhere). 128 | """ 129 | if self.first_row: 130 | self.log_headers.append(key) 131 | else: 132 | assert key in self.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 133 | assert key not in self.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 134 | self.log_current_row[key] = val 135 | 136 | def save_config(self, config): 137 | """ 138 | Log an experiment configuration. 139 | 140 | Call this once at the top of your experiment, passing in all important 141 | config vars as a dict. This will serialize the config to JSON, while 142 | handling anything which can't be serialized in a graceful way (writing 143 | as informative a string as possible). 144 | 145 | Example use: 146 | 147 | .. code-block:: python 148 | 149 | logger = EpochLogger(**logger_kwargs) 150 | logger.save_config(locals()) 151 | """ 152 | config_json = convert_json(config) 153 | if self.exp_name is not None: 154 | config_json['exp_name'] = self.exp_name 155 | if proc_id()==0: 156 | output = json.dumps(config_json, separators=(',',':\t'), indent=4, sort_keys=True) 157 | print(colorize('Saving config:\n', color='cyan', bold=True)) 158 | print(output) 159 | with open(osp.join(self.output_dir, "config.json"), 'w') as out: 160 | out.write(output) 161 | 162 | def save_state(self, state_dict, itr=None): 163 | """ 164 | Saves the state of an experiment. 165 | 166 | To be clear: this is about saving *state*, not logging diagnostics. 167 | All diagnostic logging is separate from this function. This function 168 | will save whatever is in ``state_dict``---usually just a copy of the 169 | environment---and the most recent parameters for the model you 170 | previously set up saving for with ``setup_tf_saver``. 171 | 172 | Call with any frequency you prefer. If you only want to maintain a 173 | single state and overwrite it at each call with the most recent 174 | version, leave ``itr=None``. If you want to keep all of the states you 175 | save, provide unique (increasing) values for 'itr'. 176 | 177 | Args: 178 | state_dict (dict): Dictionary containing essential elements to 179 | describe the current state of training. 180 | 181 | itr: An int, or None. Current iteration of training. 
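Putting the pieces of `Logger` seen so far together: create it once, dump the config, then log one row of diagnostics per iteration. A minimal sketch (the output directory and metric values are made up; `dump_tabular` is defined further down in this file):

```python
from utils.logSpinUp import Logger

logger = Logger(output_dir='./results/logger_demo', exp_name='logger_demo')
logger.save_config({'lr': 1e-3, 'seed': 0})             # -> config.json in the output dir

for epoch in range(3):
    logger.log_tabular('Epoch', epoch)
    logger.log_tabular('AverageReturn', 10.0 * epoch)   # placeholder metric
    logger.dump_tabular()   # prints a table and appends a row to progress.txt
```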
182 | """ 183 | if proc_id()==0: 184 | fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr 185 | try: 186 | joblib.dump(state_dict, osp.join(self.output_dir, fname)) 187 | except: 188 | self.log('Warning: could not pickle state_dict.', color='red') 189 | if hasattr(self, 'tf_saver_elements'): 190 | self._tf_simple_save(itr) 191 | if hasattr(self, 'pytorch_saver_elements'): 192 | self._pytorch_simple_save(itr) 193 | 194 | def setup_tf_saver(self, sess, inputs, outputs): 195 | """ 196 | Set up easy model saving for tensorflow. 197 | 198 | Call once, after defining your computation graph but before training. 199 | 200 | Args: 201 | sess: The Tensorflow session in which you train your computation 202 | graph. 203 | 204 | inputs (dict): A dictionary that maps from keys of your choice 205 | to the tensorflow placeholders that serve as inputs to the 206 | computation graph. Make sure that *all* of the placeholders 207 | needed for your outputs are included! 208 | 209 | outputs (dict): A dictionary that maps from keys of your choice 210 | to the outputs from your computation graph. 211 | """ 212 | self.tf_saver_elements = dict(session=sess, inputs=inputs, outputs=outputs) 213 | self.tf_saver_info = {'inputs': {k:v.name for k,v in inputs.items()}, 214 | 'outputs': {k:v.name for k,v in outputs.items()}} 215 | 216 | def _tf_simple_save(self, itr=None): 217 | """ 218 | Uses simple_save to save a trained model, plus info to make it easy 219 | to associated tensors to variables after restore. 220 | """ 221 | if proc_id()==0: 222 | assert hasattr(self, 'tf_saver_elements'), \ 223 | "First have to setup saving with self.setup_tf_saver" 224 | fpath = 'tf1_save' + ('%d'%itr if itr is not None else '') 225 | fpath = osp.join(self.output_dir, fpath) 226 | if osp.exists(fpath): 227 | # simple_save refuses to be useful if fpath already exists, 228 | # so just delete fpath if it's there. 229 | shutil.rmtree(fpath) 230 | tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements) 231 | joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl')) 232 | 233 | 234 | def setup_pytorch_saver(self, what_to_save): 235 | """ 236 | Set up easy model saving for a single PyTorch model. 237 | 238 | Because PyTorch saving and loading is especially painless, this is 239 | very minimal; we just need references to whatever we would like to 240 | pickle. This is integrated into the logger because the logger 241 | knows where the user would like to save information about this 242 | training run. 243 | 244 | Args: 245 | what_to_save: Any PyTorch model or serializable object containing 246 | PyTorch models. 247 | """ 248 | self.pytorch_saver_elements = what_to_save 249 | 250 | def _pytorch_simple_save(self, itr=None): 251 | """ 252 | Saves the PyTorch model (or models). 253 | """ 254 | if proc_id()==0: 255 | assert hasattr(self, 'pytorch_saver_elements'), \ 256 | "First have to setup saving with self.setup_pytorch_saver" 257 | fpath = 'pyt_save' 258 | fpath = osp.join(self.output_dir, fpath) 259 | fname = 'model' + ('%d'%itr if itr is not None else '') + '.pt' 260 | fname = osp.join(fpath, fname) 261 | os.makedirs(fpath, exist_ok=True) 262 | with warnings.catch_warnings(): 263 | warnings.simplefilter("ignore") 264 | # We are using a non-recommended way of saving PyTorch models, 265 | # by pickling whole objects (which are dependent on the exact 266 | # directory structure at the time of saving) as opposed to 267 | # just saving network weights. 
This works sufficiently well 268 | # for the purposes of Spinning Up, but you may want to do 269 | # something different for your personal PyTorch project. 270 | # We use a catch_warnings() context to avoid the warnings about 271 | # not being able to save the source code. 272 | torch.save(self.pytorch_saver_elements, fname) 273 | 274 | 275 | def dump_tabular(self): 276 | """ 277 | Write all of the diagnostics from the current iteration. 278 | 279 | Writes both to stdout, and to the output file. 280 | """ 281 | if proc_id()==0: 282 | vals = [] 283 | key_lens = [len(key) for key in self.log_headers] 284 | max_key_len = max(15,max(key_lens)) 285 | keystr = '%'+'%d'%max_key_len 286 | fmt = "| " + keystr + "s | %15s |" 287 | n_slashes = 22 + max_key_len 288 | print("-"*n_slashes) 289 | for key in self.log_headers: 290 | val = self.log_current_row.get(key, "") 291 | valstr = "%8.3g"%val if hasattr(val, "__float__") else val 292 | print(fmt%(key, valstr)) 293 | vals.append(val) 294 | print("-"*n_slashes, flush=True) 295 | if self.output_file is not None: 296 | if self.first_row: 297 | self.output_file.write("\t".join(self.log_headers)+"\n") 298 | self.output_file.write("\t".join(map(str,vals))+"\n") 299 | self.output_file.flush() 300 | self.log_current_row.clear() 301 | self.first_row=False 302 | 303 | class EpochLogger(Logger): 304 | """ 305 | A variant of Logger tailored for tracking average values over epochs. 306 | 307 | Typical use case: there is some quantity which is calculated many times 308 | throughout an epoch, and at the end of the epoch, you would like to 309 | report the average / std / min / max value of that quantity. 310 | 311 | With an EpochLogger, each time the quantity is calculated, you would 312 | use 313 | 314 | .. code-block:: python 315 | 316 | epoch_logger.store(NameOfQuantity=quantity_value) 317 | 318 | to load it into the EpochLogger's state. Then at the end of the epoch, you 319 | would use 320 | 321 | .. code-block:: python 322 | 323 | epoch_logger.log_tabular(NameOfQuantity, **options) 324 | 325 | to record the desired values. 326 | """ 327 | 328 | def __init__(self, *args, **kwargs): 329 | super().__init__(*args, **kwargs) 330 | self.epoch_dict = dict() 331 | 332 | def store(self, **kwargs): 333 | """ 334 | Save something into the epoch_logger's current state. 335 | 336 | Provide an arbitrary number of keyword arguments with numerical 337 | values. 338 | """ 339 | for k,v in kwargs.items(): 340 | if not(k in self.epoch_dict.keys()): 341 | self.epoch_dict[k] = [] 342 | self.epoch_dict[k].append(v) 343 | 344 | def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False): 345 | """ 346 | Log a value or possibly the mean/std/min/max values of a diagnostic. 347 | 348 | Args: 349 | key (string): The name of the diagnostic. If you are logging a 350 | diagnostic whose state has previously been saved with 351 | ``store``, the key here has to match the key you used there. 352 | 353 | val: A value for the diagnostic. If you have previously saved 354 | values for this key via ``store``, do *not* provide a ``val`` 355 | here. 356 | 357 | with_min_and_max (bool): If true, log min and max values of the 358 | diagnostic over the epoch. 359 | 360 | average_only (bool): If true, do not log the standard deviation 361 | of the diagnostic over the epoch. 
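Model checkpointing follows the same pattern: register what to save once with `setup_pytorch_saver`, then call `save_state` as often as you like. A short sketch with a stand-in network (the run script would register its own model; paths and iteration counts are illustrative):

```python
import torch.nn as nn
from utils.logSpinUp import EpochLogger

logger = EpochLogger(output_dir='./results/saver_demo', exp_name='saver_demo')

model = nn.Linear(4, 2)              # stand-in for the repo's actual networks
logger.setup_pytorch_saver(model)    # register once, before training

for itr in range(2):
    # ... training updates would go here ...
    logger.save_state({'itr': itr}, itr=itr)   # writes vars<itr>.pkl and pyt_save/model<itr>.pt
```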
362 | """ 363 | if val is not None: 364 | super().log_tabular(key,val) 365 | else: 366 | v = self.epoch_dict[key] 367 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 368 | stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max) 369 | super().log_tabular(key if average_only else 'Average' + key, stats[0]) 370 | if not(average_only): 371 | super().log_tabular('Std'+key, stats[1]) 372 | if with_min_and_max: 373 | super().log_tabular('Max'+key, stats[3]) 374 | super().log_tabular('Min'+key, stats[2]) 375 | self.epoch_dict[key] = [] 376 | 377 | def get_stats(self, key): 378 | """ 379 | Lets an algorithm ask the logger for mean/std/min/max of a diagnostic. 380 | """ 381 | v = self.epoch_dict[key] 382 | vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v 383 | return mpi_statistics_scalar(vals) -------------------------------------------------------------------------------- /utils/mpi_pytorch.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import numpy as np 3 | import os 4 | import torch 5 | from mpi4py import MPI 6 | from utils.mpi_tools import broadcast, mpi_avg, num_procs, proc_id 7 | 8 | def setup_pytorch_for_mpi(): 9 | """ 10 | Avoid slowdowns caused by each separate process's PyTorch using 11 | more than its fair share of CPU resources. 12 | """ 13 | #print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 14 | if torch.get_num_threads()==1: 15 | return 16 | fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) 17 | torch.set_num_threads(fair_num_threads) 18 | #print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 19 | 20 | def mpi_avg_grads(module): 21 | """ Average contents of gradient buffers across MPI processes. """ 22 | if num_procs()==1: 23 | return 24 | for p in module.parameters(): 25 | p_grad_numpy = p.grad.numpy() # numpy view of tensor data 26 | avg_p_grad = mpi_avg(p.grad) 27 | p_grad_numpy[:] = avg_p_grad[:] 28 | 29 | def sync_params(module): 30 | """ Sync all parameters of module across all MPI processes. """ 31 | if num_procs()==1: 32 | return 33 | for p in module.parameters(): 34 | p_numpy = p.data.numpy() 35 | broadcast(p_numpy) -------------------------------------------------------------------------------- /utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 
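The MPI utilities in mpi_pytorch.py and mpi_tools.py are meant to be used together in a data-parallel training loop: fork workers, cap per-process threads, sync initial weights, then average gradients before every optimizer step. A compact sketch with a toy model (the network and data are placeholders, and `mpirun` must be on your PATH for `mpi_fork` to do anything):

```python
import torch
import torch.nn as nn
from utils.mpi_tools import mpi_fork, proc_id, num_procs, mpi_avg
from utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads

mpi_fork(2)                # relaunch this script under `mpirun -np 2`; no-op for n <= 1
setup_pytorch_for_mpi()    # keep each rank from grabbing all torch threads

model = nn.Linear(4, 1)    # stand-in for the actual policy/critic networks
sync_params(model)         # every rank starts from rank 0's weights
opt = torch.optim.SGD(model.parameters(), lr=1e-2)

x = torch.randn(8, 4)      # each rank would see different data in practice
loss = model(x).pow(2).mean()
opt.zero_grad()
loss.backward()
mpi_avg_grads(model)       # average gradients across ranks before stepping
opt.step()

print('rank %d of %d, avg loss %.3f' % (proc_id(), num_procs(), mpi_avg(loss.item())))
```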
21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. """ 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | return {convert_json(k): convert_json(v) 10 | for k,v in obj.items()} 11 | 12 | elif isinstance(obj, tuple): 13 | return (convert_json(x) for x in obj) 14 | 15 | elif isinstance(obj, list): 16 | return [convert_json(x) for x in obj] 17 | 18 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 19 | return convert_json(obj.__name__) 20 | 21 | elif hasattr(obj,'__dict__') and obj.__dict__: 22 | obj_dict = {convert_json(k): convert_json(v) 23 | for k,v in obj.__dict__.items()} 24 | return {str(obj): obj_dict} 25 | 26 | return str(obj) 27 | 28 | def is_json_serializable(v): 29 | try: 30 | json.dumps(v) 31 | return True 32 | except: 33 | return False --------------------------------------------------------------------------------
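Finally, `convert_json` is what lets `Logger.save_config` above accept arbitrary experiment configs: values JSON already understands pass through, callables are reduced to their `__name__`, and anything else falls back to a readable string. A quick sketch (keys and values are invented):

```python
import json
import numpy as np
from utils.serialization_utils import convert_json

config = {
    'lr': 1e-3,                 # already JSON-serializable: passed through
    'hidden_sizes': [64, 64],
    'activation': np.tanh,      # callable: reduced to its __name__ ('tanh')
    'obs_mean': np.zeros(2),    # numpy array: falls back to str(obj)
}
print(json.dumps(convert_json(config), indent=2, sort_keys=True))
```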