├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── SAC_agents.py ├── benchmark_agent.py ├── control_test.py ├── drone_env.py ├── images ├── E1000_n10_DiscretePolicy4_b02.png ├── E1500_n5_DiscretePolicy8_b02.png ├── collisions_hist.pdf └── delta_effect.pdf ├── learning_Q_test.py ├── matlab ├── cost_field.m ├── derivations_22ndPol.m ├── derivations_2ndPol.m ├── derivations_check.m ├── distance_def.m ├── images │ ├── dij.eps │ └── dij.fig ├── normal_multivariate_pdf.m ├── optimal_traj.m └── sol.mat ├── models ├── E1000_n10_DiscretePolicy4_b02-A2Cactors.pth ├── E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth ├── E500_M30_LR1e4_badInitialState-actors.pth ├── E500_M30_LR1e4_badInitialState-critics.pth ├── cont_preloaded-A2Cactors.pth ├── cont_preloaded-A2Ccritics.pth ├── deltas │ ├── deltas0.01_softmax16-A2Cactors.pth │ ├── deltas0.01_softmax16-A2Ccritics.pth │ ├── deltas0.1_softmax16-A2Cactors.pth │ ├── deltas0.1_softmax16-A2Ccritics.pth │ ├── deltas0.2_softmax16-A2Cactors.pth │ ├── deltas0.2_softmax16-A2Ccritics.pth │ ├── deltas0.5_softmax16-A2Cactors.pth │ ├── deltas0.5_softmax16-A2Ccritics.pth │ ├── deltas0.8_softmax16-A2Cactors.pth │ ├── deltas0.8_softmax16-A2Ccritics.pth │ ├── deltas1.5_softmax16-A2Cactors.pth │ ├── deltas1.5_softmax16-A2Ccritics.pth │ ├── deltas1_softmax16-A2Cactors.pth │ ├── deltas1_softmax16-A2Ccritics.pth │ ├── deltas2.43_softmax16-A2Cactors.pth │ ├── deltas2.43_softmax16-A2Ccritics.pth │ ├── deltas2.5_softmax16-A2Cactors-old (1).pth │ ├── deltas2.5_softmax16-A2Ccritics-old (2).pth │ ├── deltas2_softmax16-A2Cactors.pth │ └── deltas2_softmax16-A2Ccritics.pth ├── deltas2.5_softmax16-A2Cactors.pth ├── deltas2.5_softmax16-A2Ccritics.pth ├── discrete-A2Cactors.pth ├── discrete-A2Ccritics.pth ├── final │ ├── cont_n5-A2Cactors.pth │ ├── cont_n5-A2Ccritics.pth │ ├── simple-A2Cactors.pth │ ├── simple-A2Ccritics.pth │ ├── softmax8_n4-A2Cactors.pth │ ├── softmax8_n4-A2Ccritics.pth │ ├── softmax8_n5-A2Cactors.pth │ ├── softmax8_n5-A2Ccritics.pth │ ├── softmax8_n8-A2Cactors.pth │ └── softmax8_n8-A2Ccritics.pth ├── n5_E1500_Advantage-actors.pth ├── n5_E1500_Advantage-critics.pth ├── softmax8_n5-A2Cactors.pth ├── softmax8_n5-A2Ccritics.pth └── trained_E1000_M50_LR001-actors.pth ├── policy_performance_variables_1 ├── policy_performance_variables_2 ├── profile.bat ├── spec-file.txt ├── train_problem.py ├── utils.py ├── variables_with_delta_change_1 ├── variables_with_delta_change_2 ├── variables_with_delta_change_3 └── variables_with_delta_change_4 /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # my own thoings 2 | notes.md 3 | 4 | # Folders 5 | videos/ 6 | *.prof 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | 148 | # pytype static type analyzer 149 | .pytype/ 150 | 151 | # Cython debug symbols 152 | cython_debug/ 153 | 154 | # PyCharm 155 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 156 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 157 | # and can be added to the global gitignore or merged into this file. For a more nuclear 158 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
159 | #.idea/ 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Andreu Matoses 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scalable Collision Avoidance RL 2 | 3 | 4 | Multi-agent scalable reinforcement learning for formation control with collision avoidance. The paper and more information can be found on the [website 🌍](https://andreumatoses.github.io/research/msc-thesis-marl) 5 | 6 | 7 | 8 | 9 | This is part of my Master's Thesis at the Royal Institute of Technology (KTH), Stockholm, Sweden (URL available soon). The scalable part of this work is inspired by the paper [Scalable Reinforcement Learning for Multi-Agent Networked Systems](https://arxiv.org/abs/1912.02906). The approach presented in the thesis exploits the structure of the designed reward to construct a ∆-disk local approximation of the individual Q functions and policy gradients. 10 | 11 | ## Important Scripts 12 | 13 | - [drone_env.py](drone_env.py): Script containing the environment class and its methods. The main methods follow the structure of the OpenAI Gym RL environments, such as `.step()` and `.reset()`. 14 | - [train_problem.py](train_problem.py): Script containing the training of the main problem. 15 | - [SAC_agents.py](SAC_agents.py): Script containing the agent classes and their policy classes. 16 | - [benchmark_agent.py](benchmark_agent.py): Script to run trained agents and benchmark their performance. 17 | 18 | ## Training of the scalable agents 19 | The schema of the algorithm used is presented below. A scalable actor-critic agent is trained for each robotic agent, for a total of *n* agents.
20 | 21 | ![image](https://user-images.githubusercontent.com/46297629/216669445-a07214a4-08e5-46d8-85e5-f30855f3e8fc.png) 22 | 23 | ## Structure of the training script 24 | ![image](https://user-images.githubusercontent.com/46297629/216669814-5e9465ef-f0a8-46cb-a53a-35645a799e70.png) 25 | 26 | ## Relevant results 27 | ### Softmax discrete policy 28 | The individual policy for each agent has the form: 29 | 30 | ![image](https://user-images.githubusercontent.com/46297629/216673328-1cea1dc8-26fe-4b50-9618-ccf30043801e.png) 31 | 32 | Some examples of trajectories obtained after successful training are shown below: 33 | 34 | ![image](https://user-images.githubusercontent.com/46297629/216674048-68c2473c-b398-4d60-8bfc-de6049065911.png) 35 | 36 | ### Continuous normally distributed policy 37 | The individual policy for each agent has the form: 38 | 39 | ![image](https://user-images.githubusercontent.com/46297629/216673469-c5b07220-ee01-4d7f-a1bf-010b21619b19.png) 40 | 41 | Some examples of trajectories obtained after successful training are shown below: 42 | 43 | ![image](https://user-images.githubusercontent.com/46297629/216673912-91e69f5a-f6dc-49b3-ac77-fed1a8cecec5.png) 44 | 45 | ## Comparison of tested policies 46 | The number of collisions shown in the results is defined as an agent intersecting a neighbour’s collision radius in a given time step. Each agent counts collisions separately, so two agents colliding is counted as two different collisions, and a collision that lasts for several time steps is counted as several distinct collisions. 47 | 48 | Percentage of simulations that achieve each number of collisions for each of the three tested policies, n = 5. The percentages for more than 14 collisions (under 1%) have been omitted. 49 | 50 | ![image](https://user-images.githubusercontent.com/46297629/216673529-5bf6a5be-c149-43cd-b0d7-ead7099d29dd.png) 51 | 52 | Effect of the ∆-disk radius (definition 12 in the thesis) on the global reward and number of collisions, averaged over 2000 runs of the trained policies, for the discrete softmax NN policy. 53 | 54 | ![image](https://user-images.githubusercontent.com/46297629/216673581-b48750c7-eedc-4e04-92d4-21ade45c398a.png) 55 | 56 | The results also show that the average reward starts to decrease after a certain value of ∆i, in this case around 1, and the average number of collisions increases sharply back towards the values obtained when the agent has nearly no vision. This unexpected behaviour is the result of a significant increase in the complexity of the maximization problem that the policy gradient is trying to solve. Taking into account an increasing number of neighbours, at larger distances, increases the variance of the estimated approximate gradient, and as a result the policy is not able to find improvements. Indeed, for the final case of ∆i ≈ dˆi, training does not converge.
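As a rough illustration of the two definitions used above, the sketch below computes, for a single time step, each agent's collision count and its ∆-disk neighbourhood from positions, radii and sensing radii. This is a simplified example, not the code used in [drone_env.py](drone_env.py); the function and variable names (`collisions_and_neighbours`, `x`, `l`, `delta`) are illustrative only.

```python
import numpy as np

def collisions_and_neighbours(x, l, delta):
    """x: (n, 2) positions, l: (n,) agent radii, delta: (n,) sensing radii.
    Returns per-agent collision counts and Delta-disk neighbour lists
    (each agent includes itself), following the definitions above."""
    n = len(x)
    collisions = np.zeros(n, dtype=int)
    neighbours = [[i] for i in range(n)]
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            # surface-to-surface distance between agents i and j
            d_ij = np.linalg.norm(x[i] - x[j]) - l[i] - l[j]
            if d_ij <= 0:
                collisions[i] += 1       # counted separately by each agent
            if d_ij <= delta[i]:
                neighbours[i].append(j)  # j lies inside agent i's Delta-disk
    return collisions, neighbours

# Example: two of three agents overlap, so each of them records one collision.
x = np.array([[0.0, 0.0], [0.15, 0.0], [2.0, 2.0]])
l = np.array([0.1, 0.1, 0.1])
delta = np.array([1.0, 1.0, 1.0])
print(collisions_and_neighbours(x, l, delta))
```

In the repository itself these quantities are produced by the environment (`drone_env.drones.step()` returns the collision count of the new state together with the localized states), so the sketch is only meant to make the counting convention explicit.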
57 | 58 | -------------------------------------------------------------------------------- /SAC_agents.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from collections import deque, namedtuple 7 | from utils import * 8 | 9 | class RandomAgent: 10 | ''' Agent taking actions uniformly at random, child of the class Agent''' 11 | def __init__(self, n_actions: int): 12 | # super(RandomAgent, self).__init__(n_actions) 13 | self.n_actions = n_actions 14 | 15 | def forward(self, state: np.ndarray) -> np.ndarray: 16 | ''' Compute a random action in [-1, 1] 17 | 18 | Returns: 19 | action (np.ndarray): array of float values containing the 20 | action. 21 | ''' 22 | return np.clip(-1 + 2 * np.random.rand(self.n_actions), -1, 1) 23 | 24 | class TrainedAgent: 25 | ''' Agent that loads and follows a learned policy/critic 26 | ''' 27 | def __init__(self, critics_name:str, actors_name:str, n_agents = "auto", discount = 0.99): 28 | 29 | file_name_critics = os.path.join("models", critics_name) 30 | # Load critic 31 | try: 32 | criticsNN = torch.load(file_name_critics) 33 | print(f'Loaded Critic, n_agents = {len(criticsNN)}, discount = {discount}. Network model[0]: {criticsNN[0]}') 34 | except: 35 | print(f'File {file_name_critics} not found!') 36 | exit(-1) 37 | 38 | self.criticsNN = criticsNN 39 | self.critics_name = critics_name 40 | 41 | if n_agents == "auto": 42 | self.n_agents = len(criticsNN) 43 | else: 44 | self.n_agents = n_agents 45 | 46 | # load actor 47 | file_name_actors = os.path.join("models", actors_name) 48 | 49 | try: 50 | actors = torch.load(file_name_actors) 51 | print(f'Loaded actors, n_agents = {len(criticsNN)}, discount = {discount}. Type: {type(actors[0])}') 52 | except: 53 | print(f'File {file_name_actors} not found!') 54 | exit(-1) 55 | self.actors = actors 56 | self.actors_name = actors_name 57 | 58 | self.discount = discount # to benchmark the critic 59 | 60 | def forward(self, z_states: list, N:list): 61 | """ Returns: 62 | action (np.ndarray): array of float values containing the 63 | action. The dimensionality is equal to self.n_actions from 64 | the parent class Agent 65 | """ 66 | actions = [] 67 | if type(self.actors[0]) is NormalPolicy or type(self.actors[0]) is NormalActorNN or type(self.actors[0]) is DiscreteSoftmaxNN: 68 | # z_state in this case 69 | for i in range(self.n_agents): 70 | z_state = z_states[i].flatten() 71 | Ni = N[i] 72 | if i < len(self.actors): 73 | actor = self.actors[i] 74 | else: 75 | actor = self.actors[0] 76 | 77 | action = actor.sample_action(z_state, Ni) 78 | actions.append(action) 79 | else: 80 | print(f"Error type of policy {type(self.actors[0])}") 81 | 82 | return actions 83 | 84 | def benchmark_cirtic(self, buffers: deque, only_one_NN = False): 85 | 86 | Gts = deque() # for debug, delete after 87 | V_approxs = deque() # for debug, delete after 88 | criticNN= self.criticsNN[0] 89 | 90 | for i in range(self.n_agents): 91 | # NN for this agent: 92 | if not only_one_NN: 93 | if i < len(self.criticsNN): 94 | criticNN = self.criticsNN[i] 95 | else: 96 | criticNN = self.criticsNN[0] 97 | 98 | # separate data from experience buffer 99 | buffer = buffers.buffers[i] 100 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 101 | 102 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. 
If instead V(s), input = [s] 103 | # inputs = np.column_stack((states,actions)) 104 | # inputs = torch.tensor(inputs, dtype=torch.float32) 105 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 106 | 107 | # Calculate the simulated Q value (target), Monte carlo Gt 108 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 109 | T = len(rewards) 110 | Gt_array = np.zeros([T]) 111 | Gt_array[-1] = rewards[-1] 112 | for t in range(T-2,-1,-1): 113 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 114 | 115 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze() 116 | 117 | # value function: # calculate the approximated Q(s,a) = NN(input) 118 | V_approx = criticNN(inputs).squeeze() 119 | 120 | V_approxs.append(V_approx.detach().numpy()) 121 | Gts.append(Gt_array) # for debug 122 | 123 | # Gts is the simulated Q(st,at) values for each agent. Q(s,a)-V(s) ~= Gt-V(st) = A(s,a) 124 | return Gts, V_approxs 125 | 126 | class SA2CAgents: 127 | 128 | def __init__(self, n_agents, dim_local_state, dim_local_action, discount, epochs, learning_rate_critic = 10**(-3), learning_rate_actor = 10**(-3)) -> None: 129 | '''* dim_local_state is the total size of the localized vector that the input of the Q and pi approximations use, i.e (k+1)*dim''' 130 | 131 | self.n_agents = n_agents 132 | self.dim_local_state = dim_local_state 133 | self.dim_local_action = dim_local_action 134 | # self.policy_type = policy_type # What kind of policy (NN, stochastic normal dist, etc...) 135 | self.discount = discount 136 | self.epochs = epochs 137 | 138 | # preload_NN = "models\\final\\cont_n5" 139 | preload_NN = None 140 | # Define policy (actor) 141 | if preload_NN is None: 142 | # self.actors = [NormalPolicy(dim_local_state,dim_local_action) for i in range(n_agents)] 143 | self.actors = [DiscreteSoftmaxNN(dim_local_state, lr = learning_rate_actor, n_actions=16) for i in range(n_agents)] 144 | # self.actors = [NormalActorNN(dim_local_state, lr = learning_rate_actor, dim_action=dim_local_action) for i in range(n_agents)] 145 | self.learning_rate_actor = learning_rate_actor 146 | 147 | # List of NN that estimate Q (or V if we use advantage) 148 | # self.criticsNN = [CriticNN(dim_local_state + dim_local_action, output_size=1) for i in range(n_agents)] 149 | self.criticsNN = [CriticNN(dim_local_state, output_size=1) for i in range(n_agents)] 150 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)] 151 | else: 152 | try: 153 | actors = torch.load(preload_NN + "-A2Cactors.pth") 154 | print(f'Loaded actors, n_agents = {len(actors)}, discount = {discount}. Type: {type(actors[0])}') 155 | except: 156 | print(f'File {preload_NN + "-A2Cactors.pth"} not found!') 157 | exit(-1) 158 | self.actors = actors 159 | self.learning_rate_actor = learning_rate_actor 160 | try: 161 | criticsNN = torch.load(preload_NN + "-A2Ccritics.pth") 162 | print(f'Loaded Critic, n_agents = {len(criticsNN)}, discount = {discount}. 
Network model[0]: {criticsNN[0]}') 163 | except: 164 | print(f'File {preload_NN + "-A2Ccritics.pth"} not found!') 165 | exit(-1) 166 | self.criticsNN = criticsNN 167 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)] 168 | 169 | 170 | def forward(self, z_states, N) -> list: 171 | ''' Function that calculates the actions to take from the z_states list (control law) 172 | actions: list of row vectors [u1^T, u2^T,...]''' 173 | 174 | actions = deque() 175 | for i in range(self.n_agents): 176 | z_state = z_states[i].flatten() 177 | actions.append(self.actors[i].sample_action(z_state, N[i])) 178 | # actions.append(self.actors[i].sample_action(z_state, [1])) 179 | 180 | return actions 181 | 182 | def train_designed_policy(self, buffers: deque, actor_lr = None, return_grads = False): 183 | epochs = self.epochs 184 | 185 | if actor_lr is not None: 186 | self.learning_rate_actor = actor_lr 187 | 188 | Gts = deque() # for debug, delete after -> acces data Gts[i][t] 189 | 190 | # CRITIC LOOP 191 | for i in range(self.n_agents): 192 | # NN for this agent: 193 | criticNN = self.criticsNN[i] 194 | critic_optimizer = self.critic_optimizers[i] 195 | 196 | # separate data from experience buffer 197 | buffer = buffers.buffers[i] 198 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 199 | 200 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s] 201 | # inputs = np.column_stack((states,actions)) 202 | # inputs = torch.tensor(inputs, dtype=torch.float32) 203 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 204 | 205 | # Calculate the simulated Q value (target), Monte carlo Gt 206 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 207 | T = len(rewards) 208 | Gt_array = np.zeros([T]) 209 | Gt_array[-1] = rewards[-1] 210 | for t in range(T-2,-1,-1): 211 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 212 | 213 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze() 214 | Gts.append(Gt_array) # for debug 215 | 216 | ### Perfrom omega (critic) update: 217 | # Set gradient to zero 218 | critic_optimizer.zero_grad() 219 | # value function: # calculate the approximated V(s) = NN(input) 220 | V_approx = criticNN(inputs).squeeze() 221 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0 222 | loss = nn.functional.mse_loss(V_approx, Gt) 223 | # Compute gradient 224 | loss.backward() 225 | # Clip gradient norm to avoid infinite gradient 226 | nn.utils.clip_grad_norm_(criticNN.parameters(), max_norm=10) 227 | # Update 228 | critic_optimizer.step() 229 | 230 | # ACTOR LOOP 231 | grad_norms = [] 232 | gi_norms = [] 233 | for i in range(self.n_agents): 234 | # to access buffer data: buffers.buffers[i][t].action, namedtuple('experience', ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']) 235 | # Gt: Gts[i][t] 236 | actor = self.actors[i] 237 | gi = 0 #initialize to 0 238 | 239 | for t in range(T): 240 | zit = buffers.buffers[i][t].z_state 241 | ait = buffers.buffers[i][t].action 242 | Nit = buffers.buffers[i][t].Ni 243 | 244 | grad_actor = actor.compute_grad(zit,ait, Nit) 245 | # PUT Nit HERE INSTEAD of [1,2,3] 246 | # grad_actor = actor.clip_grad_norm(grad_actor,clip_norm=100) 247 | 248 | # Qj_sum = 0 249 | Advantage_j_sum = 0 250 | input_tensor = torch.tensor(zit, dtype=torch.float32) 251 | # Baseline is the Vi(s) for current agent. 
reduce variance and complexity 252 | Vi_baseline = self.criticsNN[i](input_tensor).detach().numpy()[0] 253 | for j in Nit: # i included here 254 | zjt = buffers.buffers[j][t].z_state 255 | ajt = buffers.buffers[j][t].action 256 | # Q_input_tensor = torch.tensor(np.hstack((zjt,ajt)), dtype=torch.float32) 257 | # Qj = self.criticsNN[j](Q_input_tensor).detach().numpy()[0] 258 | # input_tensor = torch.tensor(zjt, dtype=torch.float32) 259 | # Vj = self.criticsNN[j](input_tensor).detach().numpy()[0] 260 | Advantage_j_sum += (Gts[j][t] - Vi_baseline) 261 | # Qj_sum += Gts[j][t] 262 | 263 | # gi += self.discount**t * 1/self.n_agents* grad_actor * Qj_sum 264 | gi += self.discount**t * 1/self.n_agents* grad_actor * Advantage_j_sum 265 | 266 | # Update policy parameters with approx gradient gi (clipped to avoid infinity gradients) 267 | gi = actor.clip_grad_norm(gi, clip_norm=100) 268 | # MAKE SURE TO CLIP THE PARAMS from 0 to 2*Pi 269 | actor.parameters = actor.parameters + self.learning_rate_actor*gi 270 | # actor.parameters = np.clip(actor.parameters + self.learning_rate_actor*gi, -2*np.pi, 2*np.pi) 271 | 272 | # print(f"grad norms gi={np.linalg.norm(gi.flatten())}") 273 | if return_grads: 274 | grad_norms.append(np.linalg.norm(grad_actor.flatten())) 275 | gi_norms.append(np.linalg.norm(gi.flatten())) 276 | 277 | if return_grads: 278 | return grad_norms, gi_norms 279 | 280 | def train_NN(self, buffers: deque, actor_lr = None): 281 | epochs = self.epochs 282 | 283 | if actor_lr is not None: 284 | self.learning_rate_actor = actor_lr 285 | 286 | Gts = deque() # for debug, delete after -> acces data Gts[i][t] 287 | T = len(buffers) 288 | 289 | # CRITIC LOOP 290 | for i in range(self.n_agents): 291 | # NN for this agent: 292 | criticNN = self.criticsNN[i] 293 | critic_optimizer = self.critic_optimizers[i] 294 | 295 | # separate data from experience buffer 296 | buffer = buffers.buffers[i] 297 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 298 | 299 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. 
If instead V(s), input = [s] 300 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 301 | 302 | # Calculate the simulated Q value (target), Monte carlo Gt 303 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 304 | Gt_array = np.zeros([T]) 305 | Gt_array[-1] = rewards[-1] 306 | for t in range(T-2,-1,-1): 307 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 308 | 309 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze() 310 | Gts.append(Gt_array) # for debug 311 | 312 | ## Perfrom omega (critic) update: 313 | # Set gradient to zero 314 | critic_optimizer.zero_grad() 315 | # value function: # calculate the approximated V(s) = NN(input) 316 | V_approx = criticNN(inputs).squeeze() 317 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0 318 | loss = nn.functional.mse_loss(V_approx, Gt) 319 | # Compute gradient 320 | loss.backward() 321 | # Clip gradient norm to avoid infinite gradient 322 | nn.utils.clip_grad_norm_(criticNN.parameters(), max_norm=10) 323 | # Update 324 | critic_optimizer.step() 325 | 326 | # ACTOR LOOP 327 | for i in range(self.n_agents): 328 | # to access buffer data: buffers.buffers[i][t].action, namedtuple('experience', ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']) 329 | # Gt: Gts[i][t] 330 | actor = self.actors[i] 331 | actor_loss = torch.tensor(0, dtype=torch.float32) 332 | 333 | for t in range(T): 334 | zit = buffers.buffers[i][t].z_state 335 | ait = buffers.buffers[i][t].action 336 | Nit = buffers.buffers[i][t].Ni 337 | 338 | log_prob_tensor = actor.log_p_of_a(zit,ait) 339 | 340 | Advantage_j_sum = 0 341 | input_tensor = torch.tensor(zit, dtype=torch.float32) 342 | # Baseline is the Vi(s) for current agent. reduce variance and complexity 343 | Vi_baseline = self.criticsNN[i](input_tensor).detach().numpy()[0] 344 | # Advantage_j_sum += (Gts[i][t] - Vi_baseline) 345 | for j in Nit: # i included here 346 | Advantage_j_sum += (Gts[j][t] - Vi_baseline) 347 | # Advantage_j_sum += (Gts[j][t]) 348 | 349 | # gi += self.discount**t * 1/self.n_agents* grad_actor * Qj_sum 350 | # actor_loss = actor_loss - self.discount**t * 1/self.n_agents* log_prob_tensor * Advantage_j_sum 351 | actor_loss = actor_loss -log_prob_tensor * 1/self.n_agents * self.discount**t * Advantage_j_sum 352 | 353 | # Update policy parameters 354 | actor.optimizer.zero_grad() 355 | actor_loss.backward() 356 | nn.utils.clip_grad_norm_(actor.parameters(), max_norm=10) 357 | actor.optimizer.step() 358 | 359 | 360 | def benchmark_cirtic(self, buffers: deque, only_one_NN = False): 361 | 362 | Gts = deque() # for debug, delete after 363 | V_approxs = deque() # for debug, delete after 364 | criticNN= self.criticsNN[0] 365 | 366 | for i in range(self.n_agents): 367 | # NN for this agent: 368 | if not only_one_NN: 369 | criticNN = self.criticsNN[i] 370 | 371 | # separate data from experience buffer 372 | buffer = buffers.buffers[i] 373 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 374 | 375 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) 376 | # inputs = np.column_stack((states,actions)) 377 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 378 | ## Actor update 379 | 380 | # Calculate the simulated Q value (target), Monte carlo Gt 381 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 382 | T = len(rewards) 383 | Gt_array = np.zeros([T]) 384 | Gt_array[-1] = rewards[-1] 385 | for t in range(T-2,-1,-1): 386 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 387 | 388 | Gt = 
torch.tensor(Gt_array, dtype=torch.float32).squeeze() 389 | 390 | # value function: # calculate the approximated Q(s,a) ~= Gt = V(s) + A(s,a) => Gt-V(s) = A(s,a) 391 | V_approx = criticNN(inputs).squeeze() 392 | V_approxs.append(V_approx.detach().numpy()) 393 | # Q_approxs.append(Gt_array) 394 | Gts.append(Gt_array) # for debug 395 | 396 | # Gts is the simulated Q(st,at) values for each agent. Q(s,a)-V(s) ~= Gt-V(st) = A(s,a) 397 | return Gts, V_approxs 398 | 399 | def save(self,filename = "network"): 400 | folder ="models" 401 | cirtic_name = filename + "-A2Ccritics.pth" 402 | actors_name = filename + "-A2Cactors.pth" 403 | 404 | torch.save(self.criticsNN, os.path.join(folder,cirtic_name)) 405 | print(f'Saved Critic NNs as {cirtic_name}') 406 | torch.save(self.actors, os.path.join(folder,actors_name)) 407 | print(f'Saved Actors List as {actors_name}') 408 | 409 | 410 | class SPPOAgents: 411 | def __init__(self, n_agents, dim_local_state, dim_local_action, discount, epochs, learning_rate_critic = 10**(-3), learning_rate_actor = 10**(-3), epsilon = 0.2) -> None: 412 | '''* dim_local_state is the total size of the localized vector that the input of the Q and pi approximations use, i.e (k+1)*dim''' 413 | 414 | self.n_agents = n_agents 415 | self.dim_local_state = dim_local_state 416 | self.dim_local_action = dim_local_action 417 | # self.policy_type = policy_type # What kind of policy (NN, stochastic normal dist, etc...) 418 | self.discount = discount 419 | self.epochs = epochs 420 | self.epsilon = epsilon 421 | 422 | # Define actor networks 423 | self.actorsNN = [NormalActorNN(dim_local_state, dim_action=dim_local_action) for i in range(n_agents)] 424 | self.learning_rate_actor = learning_rate_actor 425 | self.actor_optimizers = [optim.Adam(self.actorsNN[i].parameters(),lr = learning_rate_actor) for i in range(n_agents)] 426 | 427 | 428 | # List of NN that estimate Q (or V if we use advantage) 429 | # self.criticsNN = [CriticNN(dim_local_state + dim_local_action, output_size=1) for i in range(n_agents)] 430 | self.criticsNN = [CriticNN(dim_local_state, output_size=1) for i in range(n_agents)] 431 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)] 432 | 433 | def forward(self, z_states, N) -> list: 434 | ''' Function that calculates the actions to take from the z_states list (control law) 435 | actions: list of row vectors [u1^T, u2^T,...]''' 436 | 437 | actions = deque() 438 | for i in range(self.n_agents): 439 | z_state = z_states[i].flatten() 440 | actorNN = self.actorsNN[i] 441 | 442 | state_tensor = torch.tensor(z_state, dtype=torch.float32) 443 | mu_tensor,sigma_tensor = actorNN(state_tensor) 444 | 445 | # Normally distributed value with the mu and sigma (std^2) from ActorNN 446 | std = np.sqrt(sigma_tensor.detach().numpy()) 447 | action = np.random.normal(mu_tensor.detach().numpy(),std) 448 | 449 | # Acion must be between -1,1 450 | actions.append(np.clip(action,-1,1)) 451 | 452 | return actions 453 | 454 | def train(self, buffers: deque, actor_lr = None, return_grads = False): 455 | epochs = self.epochs 456 | 457 | if actor_lr is not None: 458 | self.learning_rate_actor = actor_lr 459 | 460 | T = len(buffers) 461 | # Gts = deque() # for debug, delete after -> acces data Gts[i][t] 462 | Git = np.zeros([self.n_agents, T]) # Git[i,t] 463 | Qjsum_estim = np.zeros([self.n_agents, T]) # Qjsum_estim[i,t] 464 | p_old_it = np.zeros([self.n_agents, T]) # p_old_it[i,t] 465 | 466 | 467 | # Agents LOOP, to create required 
variables 468 | for i in range(self.n_agents): 469 | # separate data from experience buffer 470 | buffer = buffers.buffers[i] 471 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 472 | 473 | # Calculate the simulated Q value (target), Monte carlo Gt 474 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 475 | # T = len(rewards) 476 | Gt_array = np.zeros([T]) 477 | Gt_array[-1] = rewards[-1] 478 | for t in range(T-2,-1,-1): 479 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 480 | 481 | Git[i,:] = Gt_array 482 | 483 | # Calculate the advantage estimator (attempt), and old p of a 484 | for i in range(self.n_agents): 485 | # separate data from experience buffer 486 | buffer = buffers.buffers[i] 487 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 488 | 489 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s] 490 | # inputs = torch.tensor(np.column_stack((states,actions)), dtype=torch.float32) 491 | states_tensor = torch.tensor(np.array(states), dtype=torch.float32) 492 | actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32) 493 | 494 | p_old_it[i,:] = self.probability_of_ai(states_tensor, actions_tensor, i).detach().numpy() 495 | Vi_baselines = self.criticsNN[i](states_tensor).squeeze().detach().numpy() 496 | # advantage_estim[i,:] = -Vi_baselines 497 | 498 | for t in range(T): 499 | Nit = buffers.buffers[i][t].Ni 500 | for j in Nit: # i included here, only uses local i 501 | Qjsum_estim[i,t] += Git[j,t] 502 | 503 | 504 | ### Training LOOP, per agent: ### 505 | for i in range(self.n_agents): 506 | buffer = buffers.buffers[i] 507 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 508 | Gt = torch.tensor(Git[i,:],dtype=torch.float32) 509 | states_tensor = torch.tensor(np.array(states), dtype=torch.float32) 510 | actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32) 511 | 512 | Vi_baselines = self.criticsNN[i](states_tensor).squeeze() 513 | Qjsum = torch.tensor(Qjsum_estim[i,:], dtype=torch.float32) 514 | Adv = Qjsum - Vi_baselines.detach() # advantage estimate; baseline detached so the actor loss does not backpropagate into the critic 515 | 516 | # criticNN = self.criticsNN[i] 517 | # critic_optimizer = self.critic_optimizers[i] 518 | # actorNN = self.actorsNN[i] 519 | # actor_optimizer = self.actor_optimizers[i] 520 | 521 | # Perform training for epochs 522 | for m in range(epochs): 523 | 524 | ### CRITIC update: 525 | # Set gradient to zero 526 | self.critic_optimizers[i].zero_grad() 527 | # value function: # calculate the approximated V(s) = NN(input) 528 | V_approx = self.criticsNN[i](states_tensor).squeeze() 529 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0 530 | loss = nn.functional.mse_loss(V_approx, Gt) 531 | # Compute gradient 532 | loss.backward() 533 | # Clip gradient norm to avoid infinite gradient 534 | nn.utils.clip_grad_norm_(self.criticsNN[i].parameters(), max_norm=10) 535 | # Update 536 | self.critic_optimizers[i].step() 537 | 538 | ### ACTOR update 539 | # Set gradient to zero 540 | self.actor_optimizers[i].zero_grad() 541 | pi_old = torch.tensor(p_old_it[i,:], dtype=torch.float32) 542 | # Compute new advantage with updated critic 543 | # Compute r_theta ratio of probabilities 544 | current_pi = self.probability_of_ai(states_tensor,actions_tensor, i) 545 | r_theta = current_pi/pi_old 546 | # Compute loss function - 1/N (min...) 
547 | left_min = r_theta * Adv 548 | right_min = torch.clamp(r_theta, 1 - self.epsilon, 1 + self.epsilon) * Adv #clipping method for tensors 549 | loss = - torch.mean(torch.min(left_min,right_min)) 550 | # compute gradient 551 | loss.backward() 552 | # Clip gradient norm to avoid infinite gradient 553 | nn.utils.clip_grad_norm_(self.actorsNN[i].parameters(), max_norm=10) 554 | # Update 555 | self.actor_optimizers[i].step() 556 | 557 | 558 | def probability_of_ai(self,states_i,actions_i, agent:int): 559 | """ all variables are tensors. the time vector for current agent=i 560 | assume that the two action components are not correlated, 561 | thus P of both happening is p1*p2 562 | """ 563 | i = agent 564 | mu , sigma = self.actorsNN[i](states_i) 565 | 566 | # p_tensor = (2*np.pi*sigma**2)**(-1/2) * torch.exp(-(actions-mu)**2/(2*sigma**2)) 567 | # p = p_tensor[:,0]*p_tensor[:,1] # assuming uncorrelated action 568 | 569 | p1 = torch.pow(2 * np.pi * sigma[:,0], -0.5) * torch.exp(-(actions_i[:,0] - mu[:,0])**2 / (2 * sigma[:,0])) 570 | p2 = torch.pow(2 * np.pi * sigma[:,1], -0.5) * torch.exp(-(actions_i[:,1] - mu[:,1])**2 / (2 * sigma[:,1])) 571 | p = p1*p2 572 | 573 | return p 574 | 575 | def save(self,filename = "network"): 576 | folder ="models" 577 | cirtic_name = filename + "-PP0critics.pth" 578 | actors_name = filename + "-PP0actors.pth" 579 | 580 | torch.save(self.criticsNN, os.path.join(folder,cirtic_name)) 581 | print(f'Saved Critic NNs as {cirtic_name}') 582 | torch.save(self.actorsNN, os.path.join(folder,actors_name)) 583 | print(f'Saved Actors List as {actors_name}') -------------------------------------------------------------------------------- /benchmark_agent.py: -------------------------------------------------------------------------------- 1 | from collections import deque, namedtuple 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib.ticker import PercentFormatter 5 | import drone_env 6 | from drone_env import running_average, plot_rewards, plot_grads 7 | from tqdm import tqdm, trange 8 | from SAC_agents import * 9 | from utils import ExperienceBuffers 10 | 11 | plt.style.use('seaborn-dark-palette') 12 | tex_fonts = { 13 | # Use LaTeX to write all text 14 | # "text.usetex": True, 15 | "font.family": "sans-serif", 16 | # Use 10pt font in plots, to match 10pt font in document 17 | "axes.labelsize": 10, 18 | "font.size": 10, 19 | # Make the legend/label fonts a little smaller 20 | "legend.fontsize": 10, 21 | "xtick.labelsize": 10, 22 | "ytick.labelsize": 10 23 | } 24 | plt.rcParams.update(tex_fonts) 25 | 26 | ### Set up parameters ### 27 | model_name = "final\\softmax8_n8" 28 | n_agents = 8 29 | deltas = np.ones(n_agents)*1 30 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True) 31 | env.collision_weight = 0.2 # old 0.2 32 | print(env) 33 | # env.show() 34 | 35 | N_Episodes = 1500 36 | episodes_to_plot = [0] 37 | 38 | # Initialize variables 39 | total_collisions_per_episode = deque() 40 | total_reward_per_episode = deque() 41 | total_true_reward_per_episode = deque() 42 | total_t = deque() 43 | mean_advantage = np.zeros([env.n_agents, N_Episodes]) 44 | 45 | # times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 46 | 47 | agents = TrainedAgent(critics_name=model_name+"-A2Ccritics.pth", actors_name=model_name+"-A2Cactors.pth", n_agents=env.n_agents) 48 | print("### Running Trained agent (no learning)") 49 | print(f"Episodes = {N_Episodes}, max Time iterations = 
{drone_env.max_time_steps} (T = {drone_env.max_time_steps * drone_env.dt}s, dt = {drone_env.dt}s)") 50 | print(f"N of agents = {env.n_agents}, collision weight b = {env.collision_weight}") 51 | 52 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True) 53 | for episode in EPISODES: 54 | 55 | if episode+1 in episodes_to_plot: 56 | # reward_history = np.zeros([len(times), env.n_agents]) 57 | trajectory = [env.state.copy()] 58 | z_trajectory = [env.z_states] 59 | total_episode_reward = 0 60 | total_true_episode_reward = 0 61 | total_episode_collisions = 0 62 | # env.show() 63 | 64 | buffers = ExperienceBuffers(env.n_agents) 65 | 66 | # SIMULATION OVER T 67 | t_iter = 0 68 | finished = False 69 | while not finished: 70 | 71 | state = env.state 72 | z_states = env.z_states 73 | Ni = env.Ni 74 | 75 | # calculate actions based on current state 76 | # actions = drone_env.gradient_control(state, env) 77 | # actions = drone_env.proportional_control(state, env) 78 | actions = agents.forward(z_states, Ni) 79 | 80 | # Update environment one time step with the actions 81 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions) 82 | # EXPERIECE: [z_state, action, reward, next_z, finished] 83 | buffers.append(z_states, actions, rewards, new_z, Ni, finished) 84 | 85 | total_episode_reward += np.mean(rewards) 86 | total_true_episode_reward += np.mean(true_rewards) 87 | total_episode_collisions += n_collisions 88 | 89 | if episode+1 in episodes_to_plot: 90 | # reward_history[t_iter,:] = reward 91 | trajectory.append(new_state.copy()) 92 | z_trajectory.append(new_z) 93 | 94 | t_iter +=1 95 | 96 | # END OF EPISODE 97 | # Append episode reward 98 | total_reward_per_episode.append(total_episode_reward) 99 | total_true_reward_per_episode.append(total_true_episode_reward) 100 | total_collisions_per_episode.append(total_episode_collisions) 101 | total_t.append(t_iter) 102 | 103 | # Test Critic values 104 | Q_simulated, V_approx = agents.benchmark_cirtic(buffers, only_one_NN=False) 105 | advantage = [np.mean(np.power(Q_simulated[i]-V_approx[i],1)) for i in range(env.n_agents)] 106 | mean_advantage[:,episode] = np.array([advantage]) 107 | 108 | # print(f"Episode collisions = {total_episode_collisions}") 109 | # env.animate(trajectory,frame_time=0.1) 110 | 111 | # RESET ENVIRONMENT 112 | env.reset(renew_obstacles=False) 113 | 114 | # Set progress bar description with information 115 | average_reward = running_average(total_reward_per_episode, 50)[-1] 116 | average_true_reward = running_average(total_true_reward_per_episode, 50)[-1] 117 | average_collisions = running_average(total_collisions_per_episode, 50)[-1] 118 | average_t = running_average(total_t, 50)[-1] 119 | EPISODES.set_description( 120 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter} - Average: {average_reward:.1f}/{average_collisions:.2f}/{average_t}. 
True r={average_true_reward:.1f}.") 121 | 122 | # Plot current trajectory 123 | 124 | if episode+1 in episodes_to_plot: 125 | env.plot(trajectory, episode) 126 | env.animate(trajectory, z_trajectory, deltas, episode, name=f"trained-E{episode+1}", format="mp4") 127 | times = np.arange(0, t_iter)*drone_env.dt 128 | plt.figure() 129 | for i in range(env.n_agents): 130 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 131 | plt.plot(times,Q_simulated[i], label=f"i={i}, simulated Q (Gt)", color = agent_color) 132 | plt.plot(times,V_approx[i],"--" , label=f"i={i}, approx V", color = tuple(0.9*x for x in agent_color)) 133 | plt.legend() 134 | plt.show() 135 | 136 | plot_rewards(total_reward_per_episode, total_true_reward_per_episode, total_collisions_per_episode, n_ep_running_average=50) 137 | 138 | plt.figure() 139 | for i in range(env.n_agents): 140 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 141 | plt.plot(range(N_Episodes),mean_advantage[i,:], label=f"i={i}", color = agent_color) 142 | plt.xlabel("Episodes") 143 | plt.ylabel("trajectory 1/T * [Q(s,a)-V(s)] = mean_T A(s,a)") 144 | plt.legend() 145 | plt.grid() 146 | plt.show() 147 | 148 | plt.figure() 149 | # plt.gca().set_size_inches(4.5, 3.5) 150 | counts, bins, bars = plt.hist(total_collisions_per_episode, range(min(total_collisions_per_episode), max(total_collisions_per_episode) + 1, 2), weights=np.ones(len(total_collisions_per_episode)) / len(total_collisions_per_episode)) 151 | print(f"Runs with 0 coll. = {counts[0]*100:.2f}%. 2 coll. = {counts[1]*100:.2f}%") 152 | plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) 153 | plt.title("Collision performance") 154 | plt.xlabel("Number of collisions") 155 | plt.ylabel("Frequency of Simulations") 156 | plt.show() 157 | -------------------------------------------------------------------------------- /control_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import drone_env 4 | from drone_env import running_average, plot_rewards 5 | from tqdm import tqdm, trange 6 | 7 | env = drone_env.drones(n_agents=5, n_obstacles=2, grid=[5, 5], end_formation="O") 8 | print(env) 9 | # env.show() 10 | 11 | T_Episodes = 2 12 | # Simulate for T seconds (default dt = drone_env.dt = 0.01s) 13 | T = 5 14 | 15 | # Initialize variables 16 | total_collisions_list = [] 17 | total_reward_list = [] 18 | times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 19 | EPISODES = trange(T_Episodes, desc='Episode: ', leave=True) 20 | 21 | for episode in EPISODES: 22 | 23 | # reward_history = np.zeros([len(times), env.n_agents]) 24 | trajectory = [env.state.copy()] 25 | total_episode_reward = 0 26 | total_episode_collisions = 0 27 | # env.show() 28 | 29 | for t_iter, t in enumerate(times): 30 | # Simple gradient controller u_i = -grad_i, assuming Nj = V 31 | state = env.state 32 | 33 | # calculate actions based on current state 34 | # actions = drone_env.gradient_control(state, env) 35 | actions = drone_env.proportional_control(state, env) 36 | 37 | # Update environment one time step with the actions 38 | new_state, new_z, reward, n_collisions, finished = env.step(actions) 39 | 40 | total_episode_reward += np.sum(reward) 41 | total_episode_collisions += n_collisions 42 | 43 | # reward_history[t_iter,:] = reward 44 | trajectory.append(new_state.copy()) 45 | 46 | # Append episode reward 47 | total_reward_list.append(total_episode_reward) 48 | total_collisions_list.append(total_episode_collisions) 49 | 50 | 
# print(f"Episode collisions = {total_episode_collisions}") 51 | # env.animate(trajectory,frame_time=0.1) 52 | 53 | # RESET ENVIRONMENT 54 | env.reset(renew_obstacles=False) 55 | 56 | # Set progress bar description with information 57 | average_reward = running_average(total_reward_list, 50)[-1] 58 | average_collisions = running_average(total_collisions_list, 50)[-1] 59 | EPISODES.set_description( 60 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter+1} - Average: {average_reward:.1f}/{average_collisions:.2f}/{t_iter+1}") 61 | 62 | # Plot current trajectory 63 | env.plot(trajectory) 64 | 65 | plot_rewards(total_reward_list,total_collisions_list, n_ep_running_average=5) -------------------------------------------------------------------------------- /drone_env.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import os 4 | import random 5 | import time 6 | from distutils.log import error 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import markers, animation 10 | import numpy as np 11 | from IPython import display 12 | 13 | ### TO DO ############ 14 | """ 15 | - Put back collision cost b = 16 | - Change back from not using Critic NN in training(),benchmark_critic()SACAgent (now trying Gt) 17 | - TrainedAgent for the actor loading 18 | - Add collisions to the animation. Add action arrows to animation? 19 | - (inactive now) Remove fixed random seed to initialize agents 20 | - 21 | - Plot of the Q function field for a fixed kth satates (varying xi) 22 | - Maybe clip the reward for near collision distances 23 | """ 24 | ###################### 25 | 26 | # Spatial dimension 27 | dim = 2 28 | # time step in seconds 29 | dt = 0.05 30 | max_time_steps = 200 31 | # Some colours 32 | LIGHT_RED = '#FFC4CC'; 33 | LIGHT_GREEN = '#95FD99'; 34 | BLACK = '#000000'; 35 | WHITE = '#FFFFFF'; 36 | LIGHT_PURPLE = '#E8D0FF'; 37 | LIGHT_ORANGE = '#FAE0C3'; 38 | YELLOW = "#FFFF00"; 39 | BLUE = '#98F5FF'; 40 | 41 | def num_to_rgb(val, max_val): 42 | if (val > max_val): 43 | raise ValueError("val must not be greater than max_val") 44 | if (val < 0 or max_val < 0): 45 | raise ValueError("arguments may not be negative") 46 | 47 | i = (val * 255 / max_val); 48 | r = round(math.sin(0.024 * i + 0) * 127 + 128)/255; 49 | g = round(math.sin(0.024 * i + 2) * 127 + 128)/255; 50 | b = round(math.sin(0.024 * i + 4) * 127 + 128)/255; 51 | return (r,g,b) 52 | 53 | class drones: 54 | 55 | def __init__(self, n_agents: int, n_obstacles: int, grid: list, end_formation: str, k_closest = 2, deltas: np.ndarray = None, simplify_zstate = False) -> None: 56 | """[summary] 57 | 58 | Args: 59 | i_agents (int): number of agents, integer 60 | n_obstacles (int): number of obstacles integer 61 | grid (list): list of the two axis limits delimiting the grid 62 | end_formation(str): what formation to reach e.g "O" is a circle 63 | k_closest(int): how many agents to account for in the Delta disk, for having a finite localized state (implementation) 64 | deltas(ndarray, column vector): vector of Deltas for each agent sensing radius 65 | """ 66 | self.n_agents = n_agents 67 | self.grid = grid 68 | self.goal = self.grid 69 | self.k_closest = k_closest 70 | self.simplify_zstate = simplify_zstate 71 | self.internal_t = 0 72 | self.collision_weight = 0.2 # Weight of collision cost per unit of time. 
-r = q|xi-xF|^2 + b log(d_i/d_ij) 73 | 74 | # Other geometry parameters 75 | self.drone_radius = np.ones(n_agents)*0.1 # radius of each drone in m 76 | 77 | # State space dynamics. For now, all agents have same A,B 78 | self.A = np.eye(dim) 79 | self.B = np.eye(dim)*dt 80 | 81 | # Initialize agents and obstacles, and check if delta is a correct value 82 | self.obstacles = self.create_obstacles(n_obstacles) 83 | self.end_points, self.d_safety = self.generate_formation(end_formation) 84 | 85 | if deltas is None: 86 | # In case of no deltas, we assume Dleta = d_safety, i.e no simplification (maximum deltas allowed) 87 | self.deltas = self.d_safety 88 | else: 89 | self.deltas = np.minimum(deltas, self.d_safety) 90 | if not np.all(deltas <= self.d_safety): 91 | print("Some deltas are greater than the final minimum distance between end positions. Using minimum distance between end positions for those cases instead.",f"deltas = {self.deltas}") 92 | 93 | self.state, self.z_states = self.init_agents(n_agents) 94 | 95 | # self.trajectory = [] 96 | # self.trajectory.append(self.state.copy()) 97 | 98 | def reset(self, renew_obstacles = True): 99 | self.state, self.z_states = self.init_agents(self.n_agents) 100 | self.internal_t = 0 101 | if renew_obstacles == True: 102 | self.obstacles = self.create_obstacles(self.n_obstacles) 103 | 104 | 105 | def __str__(self): 106 | print("Grid size: [x_lim, y_lim]\n",self.grid) 107 | print("State: [x, y, vx, vy, r]\n", self.state) 108 | print(f"z_sattes for k_closest = {self.k_closest}: simplify? {self.simplify_zstate}") 109 | print("safety distance for each agent:\n", self.d_safety) 110 | print("Deltas disk radius for each agent: \n", self.deltas) 111 | # print("Obstacles [x, y, r]:\n",self.obstacles) 112 | print(f"Collision cost weight (per unit of time) = {self.collision_weight} ") 113 | return "" 114 | 115 | def generate_formation(self,end_formation): 116 | """generate end point coordinates from a description of the end formation shape 117 | 118 | Args: 119 | end_formation (str): a label given to a shape of final formation, i.e "O" for circle 120 | 121 | Returns: 122 | formation: [xF1^T, xF2^T,...]^T. Column vector 123 | """ 124 | if end_formation == "O": 125 | size = min(self.grid)/2 126 | angle_step = 2*np.pi/self.n_agents 127 | formation = np.zeros([self.n_agents*dim,1]) 128 | 129 | for i in range(self.n_agents): 130 | formation[dim*i,0] = np.cos(i * angle_step)*0.9*self.grid[0]/2 + self.grid[0]/2 131 | formation[dim*i+1,0] = np.sin(i * angle_step)*0.9*self.grid[1]/2 + self.grid[1]/2 132 | 133 | else: 134 | error(str(end_formation) +" is Not a valid end formation identifier") 135 | 136 | # find maximum allowed d_safety (\hat{d}_i): di <= min(|| xFi -xFj || - li - lj) for all j!=i 137 | d_safety = np.zeros(self.n_agents) 138 | 139 | for i in range(self.n_agents): 140 | xFi = formation[dim*i:(i+1)*dim,0] 141 | li = self.drone_radius[i] 142 | d_i = np.infty 143 | for j in range(self.n_agents): 144 | if j != i: 145 | xFj = formation[dim*j:(j+1)*dim,0] 146 | lj = self.drone_radius[j] 147 | d_ij = np.linalg.norm(xFi-xFj) -li -lj 148 | d_i = min([d_i,d_ij]) 149 | 150 | d_safety[i] = d_i 151 | 152 | # formation: [xF1^T, xF2^T,...]^T. 
Column vector, d_safety (distance to closest end position from agent's end position) 153 | return formation, np.floor(d_safety*100)/100 154 | 155 | def create_obstacles(self,n_obstacles): 156 | self.n_obstacles = n_obstacles 157 | 158 | max_size = 0.1*np.max(self.grid) 159 | min_size = 0.05*max_size 160 | 161 | # generate random obstacles 162 | # Generate the obstacle coordinates and size 163 | # first column: x, second: y, third: r 164 | obstacles = np.random.rand(n_obstacles,dim+1) 165 | obstacles[:,0] = obstacles[:,0]*self.grid[0] 166 | obstacles[:,1] = obstacles[:,1]*self.grid[1] 167 | obstacles[:,dim] = obstacles[:,dim]*(max_size-min_size) + min_size 168 | 169 | return obstacles 170 | 171 | def init_agents(self,n_agents): 172 | # initialize state array: 173 | # column 1: x, col 2: y, col 3: vx, col 4 vy, col 5: l 174 | l = 0.1 # 10 cm 175 | self.n_agents = n_agents 176 | self.global_state_space = n_agents*(2*dim + 1) # x,y, vx,vy, radius 177 | 178 | grid = self.grid 179 | 180 | if self.simplify_zstate: 181 | # Only take into account position x,y (remove vx vy l) 182 | self.local_state_space = (dim)*(1+self.k_closest) 183 | else: 184 | self.local_state_space = (2*dim+1)*(1+self.k_closest) 185 | 186 | self.global_action_space = n_agents*dim # vx,vy (or ax,ay in the future) 187 | self.local_action_space = dim 188 | 189 | state = np.zeros([n_agents,5]) 190 | state[:,4] = l 191 | 192 | # Create a grid and choose random nodes without repetition 193 | delta_l = 2*1.1*l 194 | divisions = np.floor(np.array(grid)/delta_l) 195 | possible_coord = [] 196 | 197 | for idx in range(int(divisions[0])): 198 | for jdx in range(int(divisions[1])): 199 | coord = [idx*delta_l, jdx*delta_l] 200 | possible_coord.append(coord) 201 | 202 | # REMOVE THIS seed FOR true random initial state 203 | # random.seed(1) 204 | random_coord = np.array(random.sample(possible_coord, n_agents)) 205 | state[:,0:dim] = random_coord 206 | 207 | # Calculate localized states z (uing the reward funciton) 208 | _, _, z_states, Ni, _ = self.rewards(state, self.end_points, self.n_agents, self.d_safety, self.deltas) 209 | # Update the Ni graph 210 | self.Ni = Ni 211 | 212 | return state, z_states 213 | 214 | def step(self,actions): 215 | """_summary_ 216 | 217 | Args: 218 | actions (_type_):List of actions, each entry one agent [u1^T, u2^T, ...].Assuming each actions is a row vector with an entry for each dimension for each agent. 219 | 220 | Returns: 221 | state: new state after applying action 222 | z_states (list): localized states (from the new state) 223 | r_vec (row vector): vector for each localized reward for each agent. Average is total reward 224 | """ 225 | 226 | # Update state: s -> s' with the system dynamics 227 | for i in range(self.n_agents): 228 | 229 | Ai = self.A 230 | Bi = self.B 231 | 232 | xi = np.transpose(self.state[i,0:dim]) 233 | ui = actions[i] 234 | 235 | next_xi = np.matmul(Ai,xi) + np.matmul(Bi,ui) 236 | 237 | self.state[i,0:dim] = np.transpose(next_xi) 238 | self.state[i,dim:2*dim] = np.transpose(ui) 239 | 240 | 241 | # Calculate new individual reward [r1(s,a), r2,...] 
vector, plus related distance dependent values 242 | r_vec, n_collisions, z_states, Ni, true_r_vec = self.rewards(self.state, self.end_points, self.n_agents, self.d_safety, self.deltas) 243 | # Update the z and Ni graph 244 | self.z_states = z_states 245 | self.Ni = Ni 246 | 247 | # SHould return (s', r(s,a), n_collisions(s') ,finished) 248 | end_points = np.reshape(self.end_points,np.shape(self.state[:,0:dim])) 249 | error_from_end = np.linalg.norm(end_points-self.state[:,0:dim],axis = 1) 250 | 251 | if np.all(error_from_end <=0.2) or self.internal_t>= max_time_steps-1: 252 | finished = True 253 | else: 254 | finished = False 255 | 256 | self.internal_t += 1 257 | # TO DO: Proper is_finished 258 | return self.state, z_states, r_vec, n_collisions, finished, true_r_vec 259 | 260 | def rewards(self, state, end_points, n_agents, d_safety, deltas): 261 | ''' 262 | state: [column 1: x, col 2: y, col 3: vx, col 4 vy, col 5: r ] np.array[i,2*dim+1] 263 | end_points: column [x1, y1, x2, y2, ... ]^T 264 | d_safety: column d_i [d1, d2, ...]^T 265 | ''' 266 | n_agents = np.size(state,0) 267 | 268 | # weights: q|xi-xF|^2 + b log(d_i/d_ij). I multiply per dt as i assume is cost/time 269 | q = 2*dt 270 | b = self.collision_weight*dt 271 | 272 | xF = np.reshape(end_points,[n_agents,dim]) 273 | xi = state[:,0:dim] 274 | 275 | # row vector, q|xi-xF|^2 276 | to_goal_cost = q*np.power(np.linalg.norm(xF-xi,axis=1),2) 277 | 278 | # Collision cost 279 | 280 | d_ij, log_d, N_delta, collisions = self.distance_data(state,deltas,d_safety) 281 | 282 | collision_cost = b*np.sum(log_d*N_delta,1) 283 | real_collision_cost = b*np.sum(log_d,1) 284 | n_collisions = np.sum(collisions) 285 | 286 | # These are the approximated localized rewards 287 | reward_vector = -np.nan_to_num(to_goal_cost+collision_cost) 288 | true_reward_vector = -np.nan_to_num(to_goal_cost+real_collision_cost) 289 | 290 | # Calculate localized z states 291 | z_states, Ni = self.localized_states(state, end_points, N_delta, d_ij) 292 | 293 | return reward_vector, n_collisions, z_states, Ni, true_reward_vector 294 | 295 | def distance_data(self,state,deltas,d_safety): 296 | '''Return matrix of clipped distances matrix d[ij] 297 | Also returns normalized distance matrix 298 | Also returns collision binary graph 299 | Also Delta proximity neighbourhood 300 | 301 | deltas must be a column! 
302 | graph N includes i as neighbour 303 | ''' 304 | 305 | n_agents = np.size(state,0) 306 | d_ij = np.zeros([n_agents,n_agents]) 307 | d_ij_norm = np.zeros_like(d_ij) 308 | 309 | for i in range(n_agents): 310 | xi = state[i,0:dim] 311 | li = self.drone_radius[i] 312 | for j in range(n_agents): 313 | if j != i: 314 | xj = state[j,0:dim] 315 | lj = self.drone_radius[j] 316 | 317 | # Calculate agents relative distance 318 | d_ij[i,j] = min(np.linalg.norm(xi-xj) -li -lj, d_safety[i]) 319 | if d_ij[i,j] == 0: # Handle unlikely case of exactly 0, if not then the coming division would be error 320 | d_ij[i,j] = -10**-6 321 | d_ij_norm[i,j] = d_safety[i]/d_ij[i,j] 322 | else: 323 | d_ij[i,j] = min(-li -li, d_safety[i]) 324 | # Just to be safe, the distance to itself in the normalized case i make it =1, as log(1)=0 so it is neutral 325 | d_ij_norm[i,j] = 1 326 | 327 | collisions = d_ij_norm <= 0 328 | N_delta = d_ij <= deltas 329 | # Handling negative logarithms (only for d normalized, to use in logarithms) 330 | d_ij_norm[collisions] = 9.99E3 331 | log_d = np.log(d_ij_norm) 332 | log_d[collisions] = 9.99E3 333 | 334 | return d_ij, log_d, N_delta, collisions 335 | 336 | def localized_states(self, state, end_points, N_delta, d_ij): 337 | n_agents = np.size(state,0) 338 | sorted_idx = np.argsort(d_ij,1) 339 | k = self.k_closest 340 | 341 | z = [] 342 | Ni_list = [] 343 | 344 | for i in range(n_agents): 345 | # How many agents are in Delta range, minus itself 346 | in_range = np.sum(N_delta[i,:])-1 347 | sorted_agents = sorted_idx[i,:] 348 | Ni = [i] 349 | 350 | xi = state[i,0:dim] 351 | xFi = end_points[i*dim:(i+1)*dim] 352 | 353 | # Adding zii as the first row 354 | Zi = np.zeros([k+1,2*dim+1]) 355 | Zi[0,:] = state[i,:].copy() 356 | # print(Zi,xFi.flaten() ,xi) 357 | Zi[0,0:dim] = -(xFi.flatten() - xi) 358 | 359 | for kth in range(1,k+1): 360 | # kth = 1,2,...k 361 | 362 | if kth <= in_range: 363 | # There exist a kth neighbour inside Delta 364 | j = sorted_agents[kth] 365 | Ni.append(j) 366 | xj = state[j,0:dim].copy() 367 | zj = state[j,:].copy() 368 | zj[0:dim] = xj-xi 369 | # zj[0:dim] = xj-xFi.flatten() 370 | # print(f"{kth}th closest agent is {j}, coord {xj}, rel coord {xj-xi}") 371 | 372 | else: 373 | # There is no neigbhour, thus using a null state (or state that should almost not add value) 374 | # I try for now to just add the next state in order, as if to just add the two closest even if outside the Delta range 375 | # Hopping that the NN learns that agents outside delta do not contribute to Q 376 | # Probably, the proper thing would be to project this next closest to the Delta boundary 377 | # j = sorted_agents[kth] 378 | # xj = state[j,0:dim].copy() 379 | # zj = state[j,:].copy() 380 | # zj[0:dim] = xj-xi 381 | 382 | # Create a "ghost" agent that is just behind agent, at a distance 1.1*Delta in the direction to the goal 383 | j = sorted_agents[kth] 384 | zi = Zi[0,0:dim] 385 | zj = state[j,:].copy() 386 | zj[0:dim] = zi/np.linalg.norm(zi) * self.deltas[i]*1.1 387 | 388 | Zi[kth,:] = zj 389 | 390 | if self.simplify_zstate: 391 | # Remove parts of the satte that overcomplicate: 392 | # No (vx,vy,l) 393 | z.append(Zi[:,0:dim]) 394 | else: 395 | z.append(Zi) 396 | 397 | Ni_list.append(Ni) 398 | 399 | # z is a list of arrays. 
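        # Layout of each Zi (summarised): row 0 is x_i - x_F,i (position relative
        # to the agent's own goal); rows 1..k are x_j - x_i for the k closest
        # agents inside Delta, padded with "ghost" entries placed at distance
        # 1.1*Delta_i behind the agent (along x_i - x_F,i) whenever fewer than k
        # neighbours are in range.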
Each array is the localized Delta state for each agent 400 | # Add: Ni_list = list of neighbours for each agent 401 | return z, Ni_list 402 | 403 | # %% Plotting methods 404 | def show(self, state = None, not_animate = True): 405 | 406 | if not_animate: 407 | state = self.state 408 | 409 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot 410 | # (or if you have an existing figure) 411 | # fig = plt.gcf() 412 | # ax = fig.gca() 413 | 414 | ax.set_xlim((0, self.grid[0])); 415 | ax.set_ylim((0, self.grid[1])); 416 | ax.grid(True); 417 | 418 | for i in range(self.n_obstacles): 419 | circle = plt.Circle((self.obstacles[i,0], self.obstacles[i,1]), self.obstacles[i,2], color='black'); 420 | ax.add_patch(circle); 421 | 422 | for i in range(self.n_agents): 423 | agent_color = num_to_rgb(i,self.n_agents-1) 424 | circle = plt.Circle((state[i,0], state[i,1]), state[i,4], color=agent_color, fill=False, label=f"{i+1}"); 425 | ax.add_patch(circle); 426 | # end point in plot 427 | ax.plot(self.end_points[i*dim,0],self.end_points[i*dim+1,0],color=agent_color,marker = "*"); 428 | 429 | ax.legend() 430 | 431 | if not_animate: 432 | plt.show() 433 | else: 434 | return fig; 435 | 436 | def animate_basic(self, trajectory,frame_time = 0.2, frames = 20): 437 | 438 | good_frame = 0 439 | each_frame = len(trajectory)/frames 440 | for n_frame,state in enumerate(trajectory): 441 | if each_frame > 1 and round(good_frame) == n_frame: 442 | good_frame+=each_frame 443 | else: 444 | continue 445 | fig = self.show(state, not_animate=False); 446 | display.display(fig); 447 | display.clear_output(wait=True) 448 | time.sleep(frame_time) 449 | 450 | def plot(self, trajectory, episode = None): 451 | 452 | # Create trajectory matrices -> time, [x(t=0), x(t+1), x(t+2)], | agent 453 | times = len(trajectory) 454 | x_cord = np.zeros([self.n_agents, times]) 455 | y_cord = np.zeros([self.n_agents, times]) 456 | collision_table = np.full([self.n_agents, times], False) 457 | 458 | for t,state in enumerate(trajectory): 459 | x_cord[:,t] = state[:,0] 460 | y_cord[:,t] = state[:,1] 461 | 462 | # Collision calc 463 | for i in range(self.n_agents): 464 | xi = state[i,0:dim] 465 | li = state[i,2*dim] 466 | for j in range(self.n_agents): 467 | if j != i: 468 | xj = state[j,0:dim] 469 | lj = state[j,2*dim] 470 | 471 | dij = np.linalg.norm(xi-xj) -li -lj 472 | if dij<=0: 473 | collision_table[i,t] = True 474 | 475 | collisions = np.sum(collision_table) 476 | 477 | # Plot obstacles and final state of agents 478 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot 479 | # (or if you have an existing figure) 480 | # fig = plt.gcf() 481 | # ax = fig.gca() 482 | fig.set_size_inches(4.5, 3.5) 483 | fig.tight_layout() 484 | 485 | # ax.set_xlim((0, self.grid[0])); 486 | # ax.set_ylim((0, self.grid[1])); 487 | ax.grid(True); 488 | 489 | # Plot obstacles 490 | for i in range(self.n_obstacles): 491 | circle = plt.Circle((self.obstacles[i,0], self.obstacles[i,1]), self.obstacles[i,2], color='black'); 492 | ax.add_patch(circle); 493 | 494 | # Plot agents and collisions 495 | for i in range(self.n_agents): 496 | agent_color = num_to_rgb(i,self.n_agents-1) 497 | circle = plt.Circle((state[i,0], state[i,1]), state[i,4], color=agent_color, fill=False, label=f"{i+1}"); 498 | ax.add_patch(circle); 499 | 500 | ax.plot(x_cord[i,:],y_cord[i,:],color=agent_color); 501 | ax.plot(self.end_points[i*dim,0],self.end_points[i*dim+1,0],color=agent_color,marker = "*"); 502 | 503 | collisions_xcord = 
x_cord[i,collision_table[i,:]] 504 | collisions_ycord = y_cord[i,collision_table[i,:]] 505 | total_markers = len(collisions_xcord) 506 | # problem when it is 0, markevery=np.floor(total_markers/2)) 507 | ax.plot(collisions_xcord,collisions_ycord,color=agent_color, marker = "v",fillstyle = "none", markevery=2); 508 | 509 | if episode is None: 510 | ax.set_title(f"{self.n_agents} agents, collisions = {collisions}") 511 | else: 512 | ax.set_title(f"Episode {episode+1} , {self.n_agents} agents, collisions = {collisions}") 513 | ax.legend(title="Agents") 514 | plt.show() 515 | 516 | def animate(self, trajectory, z_trajectory , deltas, episode, name = "test", format ="gif"): 517 | 518 | if format == "mp4": 519 | # plt.rcParams['animation.ffmpeg_path'] ='D:\\Programes portables\\ffmpeg\\bin\\ffmpeg.exe' 520 | plt.rcParams['animation.ffmpeg_path'] ='C:\\Users\\Andreu\\OneDrive - KTH\\programes\\ffmpeg\\bin\\ffmpeg.exe' 521 | 522 | 523 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot 524 | ax.set_xlim((-1, self.grid[0]+1)); 525 | ax.set_ylim((-1, self.grid[1]+1)); 526 | # ax.grid(True) 527 | ax.set_title(f"Delta = {deltas[0]}"); 528 | circles = [] 529 | d_circles = [] 530 | arrows = [] 531 | 532 | states = trajectory[0] 533 | z_states = z_trajectory[0] 534 | for i in range(self.n_agents): 535 | xi = states[i,0:dim] 536 | xFi = self.end_points[i*dim:(i+1)*dim,0].flatten() 537 | agent_color = num_to_rgb(i,self.n_agents-1) 538 | ax.plot(xFi[0],xFi[1],color=agent_color,marker = "*"); 539 | circle = plt.Circle((states[i,0], states[i,1]), states[i,4], color=agent_color, fill=False, label=f"{i+1}"); 540 | circles.append(ax.add_patch(circle)) 541 | 542 | delta_circle = plt.Circle((states[i,0], states[i,1]), states[i,4] + deltas[i], color="red", fill=False, ls = "--", alpha = 0.5); 543 | d_circles.append(ax.add_patch(delta_circle)) 544 | 545 | z_size = np.size(z_states[i],0) 546 | z_state = z_states[i] 547 | arrows_i = [] 548 | for k in range(z_size): 549 | if k == 0: 550 | star = xFi[0:dim] 551 | fini = xFi[0:dim] + z_state[k,0:dim] 552 | coords = np.array([star,fini]) 553 | arrows_i.append(ax.plot(coords[:,0], coords[:,1] , color = agent_color, lw = 0.5, alpha = 0.3)) 554 | else: 555 | star = xi[0:dim] 556 | fini = xi[0:dim] + z_state[k,0:dim] 557 | coords = np.array([star,fini]) 558 | arrows_i.append(ax.plot(coords[:,0], coords[:,1] , color = agent_color, lw = 0.5, alpha = 0.6)) 559 | arrows.append(arrows_i) 560 | 561 | plt.legend(loc = "upper right") 562 | 563 | def update_objects(t:int): 564 | states = trajectory[t] 565 | z_states = z_trajectory[t] 566 | ax.set_title(f"Episode {episode+1} .Deltas = {deltas[0]}. 
Time = {t*dt:.1f}s") 567 | 568 | for i in range(self.n_agents): 569 | xi = states[i,0:dim] 570 | xFi = self.end_points[i*dim:(i+1)*dim,0].flatten() 571 | agent_color = num_to_rgb(i,self.n_agents-1) 572 | 573 | z_size = np.size(z_states[i],0) 574 | z_state = z_states[i] 575 | for k in range(z_size): 576 | if k == 0: 577 | star = xFi[0:dim] 578 | fini = xFi[0:dim] + z_state[k,0:dim] 579 | coords = np.array([star,fini]) 580 | arrows[i][k][0].set_data(coords[:,0], coords[:,1]) 581 | pass 582 | else: 583 | star = xi[0:dim] 584 | fini = xi[0:dim] + z_state[k,0:dim] 585 | coords = np.array([star,fini]) 586 | arrows[i][k][0].set_data(coords[:,0], coords[:,1]) 587 | pass 588 | circles[i].center = states[i,0], states[i,1] 589 | d_circles[i].center = states[i,0], states[i,1] 590 | 591 | return circles, d_circles, arrows 592 | 593 | print("\nSaving animation...") 594 | anim = animation.FuncAnimation(fig, update_objects, len(trajectory), interval=dt) 595 | 596 | if format == "gif": 597 | writergif = animation.PillowWriter(fps=30) 598 | full_name = os.path.join("videos", name + ".gif") 599 | anim.save(full_name, writer=writergif) 600 | elif format == "mp4": 601 | FFwriter = animation.FFMpegWriter(fps=30) 602 | full_name = os.path.join("videos", name + ".mp4") 603 | anim.save(full_name, writer = FFwriter) 604 | else: 605 | print(f"format{format} not valid") 606 | 607 | print(f"Animation saved as {full_name}") 608 | 609 | 610 | 611 | # other control functions 612 | def gradient_control(state,env, u_max = 1): 613 | """Given a global state, calculates the direction of gradient of the cost (logarithm barrier) 614 | for each agent separately, taking into account all neighbours 615 | 616 | Args: 617 | state (np.array): each row is an agent state (x,y, vx,vy, l) 618 | env (_type_): drone environment 619 | 620 | Returns: 621 | actions: list of row vectors [u1^T, u2^T,...] 
622 | """ 623 | 624 | # maximum value of control component 625 | # u_max m/s 626 | b = 0.1 627 | q = 1 628 | actions = [] 629 | 630 | for i in range(env.n_agents): 631 | xi = np.transpose(state[i,0:dim]) 632 | ri = state[i,4] 633 | xF = env.end_points[i*dim:(i+1)*dim,0] 634 | di_hat = env.d_safety[i] 635 | 636 | term1 = 2*(xi-xF) 637 | 638 | Ni = [j for j in range(env.n_agents) if j != i] # Complete graph (except itself), global knowledge 639 | # print(Ni) 640 | term2 = np.zeros_like(xi) 641 | for j in Ni: 642 | xj = np.transpose(state[j,0:2]) 643 | rj = state[j,4] 644 | dij = np.linalg.norm(xi-xj)-ri-rj 645 | 646 | if dij <= di_hat: 647 | term2+= (xi-xj) / (dij*np.linalg.norm(xi-xj)) 648 | 649 | grad = q*term1 - b*term2 650 | ui = np.clip(-grad , -u_max,u_max) 651 | actions.append(ui) 652 | 653 | return actions 654 | 655 | def proportional_control(state,env): 656 | 657 | # maximum value of control component 658 | u_max = 1 # m/s 659 | k_gain = 1 660 | actions = [] 661 | 662 | for i in range(env.n_agents): 663 | xi = np.transpose(state[i,0:dim]) 664 | ri = state[i,4] 665 | xF = env.end_points[i*dim:(i+1)*dim,0] 666 | # di_hat = env.d_safety[i] 667 | 668 | # zi = xF_i - xi (vector to goal) 669 | zi = xF-xi 670 | ui = k_gain * zi 671 | 672 | ui_norm = np.linalg.norm(ui) 673 | if ui_norm > u_max: 674 | # Cap the control norm to u_max 675 | ui = ui/ui_norm*u_max 676 | 677 | actions.append(ui) 678 | 679 | return actions 680 | 681 | # Plotting functions 682 | def running_average(x, N = 50): 683 | ''' Function used to compute the running average 684 | of the last N elements of a vector x 685 | ''' 686 | if len(x) >= N: 687 | y = np.copy(x) 688 | y[N-1:] = np.convolve(x, np.ones((N, )) / N, mode='valid') 689 | else: 690 | y = np.zeros_like(x) 691 | return y 692 | 693 | # Plot Rewards and steps 694 | def plot_rewards(episode_reward_list, episode_true_reward_list, collision_list, n_ep_running_average=50): 695 | episodes = [i for i in range(1, len(episode_reward_list)+1)] 696 | # Plot Rewards and steps 697 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4.5)) 698 | # ax[0].plot(episodes, episode_reward_list, label='Approx. reward', color = "orange", alpha = 0.5) 699 | ax[0].plot(episodes, episode_reward_list, label='Global reward', color = "cyan", alpha = 0.5) 700 | # ax[0].plot(episodes, running_average(episode_reward_list, n_ep_running_average), label='Avg. approx. reward', color="red") 701 | ax[0].plot(episodes, running_average(episode_true_reward_list, n_ep_running_average), label='Avg. global reward', color="blue") 702 | 703 | ax[0].set_xlabel('Episodes') 704 | ax[0].set_ylabel('Total reward') 705 | ax[0].set_title('Total Reward vs Episodes') 706 | ax[0].legend() 707 | ax[0].grid(alpha=0.3) 708 | 709 | ax[1].plot(episodes,collision_list, label='Collisions per episode', color="cyan", alpha = 0.5) 710 | ax[1].plot(episodes, running_average(collision_list, n_ep_running_average), label='Avg. 
number of collisions per episode',color="blue") 711 | ax[1].set_xlabel('Episodes') 712 | ax[1].set_ylabel('Total number of collisions') 713 | ax[1].set_title('Total number of collisions vs Episodes') 714 | ax[1].legend() 715 | ax[1].grid(alpha=0.3) 716 | plt.show() 717 | 718 | def plot_grads(grad_per_episode:np.ndarray, gi_per_episode:np.ndarray): 719 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9)) 720 | 721 | n_agents = np.size(grad_per_episode,1) 722 | episode_variable = [e for e in range(1, len(grad_per_episode)+1)] 723 | 724 | for i in range(n_agents): 725 | agent_color = num_to_rgb(i,n_agents-1) 726 | ax[0].plot(episode_variable, grad_per_episode[:,i], label=f"Agent {i+1}", color = agent_color) 727 | ax[0].set_xlabel('Episodes') 728 | ax[0].set_ylabel('Score function gradient') 729 | # ax[0].set_title('Total Reward vs Episodes') 730 | ax[0].legend() 731 | ax[0].grid(alpha=0.3) 732 | 733 | for i in range(n_agents): 734 | agent_color = num_to_rgb(i,n_agents-1) 735 | ax[1].plot(episode_variable, gi_per_episode[:,i], label=f"Agent {i+1}", color = agent_color) 736 | ax[1].set_xlabel('Episodes') 737 | ax[1].set_ylabel('Approximated gi gradient (max norm = 100)') 738 | # ax[1].set_title('Total number of collisions vs Episodes') 739 | ax[1].legend() 740 | ax[1].grid(alpha=0.3) 741 | plt.show() 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | -------------------------------------------------------------------------------- /images/E1000_n10_DiscretePolicy4_b02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/E1000_n10_DiscretePolicy4_b02.png -------------------------------------------------------------------------------- /images/E1500_n5_DiscretePolicy8_b02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/E1500_n5_DiscretePolicy8_b02.png -------------------------------------------------------------------------------- /images/collisions_hist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/collisions_hist.pdf -------------------------------------------------------------------------------- /images/delta_effect.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/delta_effect.pdf -------------------------------------------------------------------------------- /learning_Q_test.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import deque, namedtuple 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import drone_env 6 | from drone_env import running_average, plot_rewards 7 | from tqdm import tqdm, trange 8 | from SAC_agents import SACAgents, ExperienceBuffers 9 | 10 | ### Set up parameters ### 11 | n_agents = 5 12 | deltas = np.ones(n_agents)*1.5 13 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True) 14 | print(env) 15 | # env.show() 16 | 17 | N_Episodes = 2 18 
| 19 | T = 5 # Simulate for T seconds (default dt = drone_env.dt = 0.01s) t_iter t=500 20 | discount_factor = 0.99 21 | alpha_critic = 10**-3 22 | alpha_actor = 10**-2 23 | M = 30 # Epochs, i.e steps of the SDG for the critic NN 24 | dim_z = env.local_state_space # Dimension of the localized z_state space 25 | dim_a = env.local_action_space # Dimension of the local action space 26 | 27 | ### 28 | 29 | # Initialize variables 30 | total_collisions_list = [] 31 | total_reward_list = [] 32 | Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done']) 33 | 34 | times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 35 | 36 | 37 | agents = SACAgents(n_agents=env.n_agents, dim_local_state = dim_z, dim_local_action=dim_a, discount=discount_factor, epochs=M, learning_rate_critic=alpha_critic, learning_rate_actor=alpha_critic) 38 | print("### Running Scalable-Actor-Critic with params: ###") 39 | print(f"Episodes = {N_Episodes}, Time iterations = {len(times)} (T = {T}s, dt = {drone_env.dt}s)") 40 | print(f"N of agents = {env.n_agents}, structure of critic NN = {agents.criticsNN[0].input_size}x{agents.criticsNN[0].L1}x{agents.criticsNN[0].L2}x{agents.criticsNN[0].output_size}") 41 | print(f"Discount = {discount_factor}, lr for NN critical = {alpha_critic}, lr for actor = {alpha_actor}, epochs M = {M}") 42 | 43 | 44 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True) 45 | for episode in EPISODES: 46 | 47 | # reward_history = np.zeros([len(times), env.n_agents]) 48 | trajectory = [env.state.copy()] 49 | total_episode_reward = 0 50 | total_episode_collisions = 0 51 | # env.show() 52 | 53 | buffers = ExperienceBuffers(env.n_agents) 54 | # SIMULATION OVER T 55 | for t_iter, time in enumerate(times): 56 | # Simple gradient controller u_i = -grad_i, assuming Nj = V 57 | state = env.state 58 | z_states = env.z_states 59 | Ni = env.Ni 60 | 61 | # calculate actions based on current state 62 | actions = drone_env.gradient_control(state, env) 63 | # actions = drone_env.proportional_control(state, env) 64 | # actions = agents.forward(z_states) 65 | 66 | # Update environment one time step with the actions 67 | new_state, new_z, rewards, n_collisions, finished = env.step(actions) 68 | # EXPERIECE: [z_state, action, reward, next_z, finished] 69 | buffers.append(z_states, actions, rewards,new_z, Ni, finished) 70 | 71 | total_episode_reward += np.mean(rewards) 72 | total_episode_collisions += n_collisions 73 | 74 | # reward_history[t_iter,:] = reward 75 | trajectory.append(new_state.copy()) 76 | 77 | # END OF EPISODE 78 | # Append episode reward 79 | total_reward_list.append(total_episode_reward) 80 | total_collisions_list.append(total_episode_collisions) 81 | 82 | # Train of critic with the data of the episode 83 | agents.train(buffers) 84 | Q_simulated, Q_approx = agents.benchmark_cirtic(buffers, only_one_NN=False) 85 | 86 | # print(f"Episode collisions = {total_episode_collisions}") 87 | # env.animate(trajectory,frame_time=0.1) 88 | 89 | # RESET ENVIRONMENT 90 | env.reset(renew_obstacles=False) 91 | 92 | # Set progress bar description with information 93 | average_reward = running_average(total_reward_list, 50)[-1] 94 | average_collisions = running_average(total_collisions_list, 50)[-1] 95 | EPISODES.set_description( 96 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter+1} - Average: {average_reward:.1f}/{average_collisions:.2f}/{t_iter+1}") 97 | 98 | # Plot current trajectory 99 | 100 | if episode >= 
N_Episodes-2: 101 | env.plot(trajectory) 102 | 103 | plt.figure() 104 | for i in range(env.n_agents): 105 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 106 | plt.plot(times,Q_approx[i], label=f"i={i}, approx Q") 107 | plt.plot(times,Q_simulated[i], "--", label=f"i={i}, simulated Q") 108 | plt.legend() 109 | plt.show() 110 | 111 | agents.save(filename="Q_test") 112 | 113 | plot_rewards(total_reward_list,total_collisions_list, n_ep_running_average=5) -------------------------------------------------------------------------------- /matlab/cost_field.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | close all; 3 | 4 | global ri rj a xB 5 | 6 | xB = [5 5]'; 7 | % xi = [1 1]'; 8 | xj = [0.5 0.5 3; 9 | 3 1 1]; 10 | ri = 0.1; 11 | rj = 0.1; 12 | a = 5; 13 | 14 | [X Y] = meshgrid(linspace(0,5,100)); 15 | cost_fun = 0*X; 16 | gradU_fun = cost_fun; 17 | gradV_fun = cost_fun; 18 | 19 | for i=1:size(X,2) 20 | for j=1:size(X,1) 21 | xi = [X(i,j) Y(i,j)]'; 22 | cost_fun(i,j) = cost(xi,xj); 23 | g = -grad(xi,xj); 24 | gradU_fun(i,j) = g(1); 25 | gradV_fun(i,j) = g(2); 26 | end 27 | end 28 | 29 | 30 | %% plots 31 | figure; 32 | surf(X,Y,cost_fun,'EdgeAlpha',0) 33 | hold on 34 | plot(xB(1),xB(2),'or','LineWidth',2) 35 | for j=1:size(xj,2) 36 | plot(xj(1,j),xj(2,j),'ob','LineWidth',2) 37 | end 38 | % plot(xi(1),xi(2),'ok','LineWidth',2) 39 | % xlim([0 6]); ylim([0 6]); 40 | % zlim([-inf,cost_fun(1,1)*1.1]) 41 | grid on; 42 | 43 | figure; 44 | % quiver(X,Y,gradU_fun,gradV_fun) 45 | streamslice(X,Y,gradU_fun,gradV_fun) 46 | 47 | %% Fnctions 48 | 49 | function g = grad(xi,Xj) 50 | global ri rj a xB 51 | 52 | term1 = 2*(xi-xB); 53 | term2 = Xj*0; 54 | 55 | for j=1:size(Xj,2) 56 | xj = Xj(:,j); 57 | 58 | term2(:,j) = 1/(norm(xi-xj)-ri-rj) * (xi-xj)/norm(xi-xj); 59 | end 60 | term2 = -a*sum(term2,2); 61 | g = term1+term2; 62 | g = term2; 63 | 64 | % g = g/norm(g); 65 | end 66 | 67 | function c = cost(xi,Xj) 68 | global ri rj a xB 69 | dij = vecnorm(xi-Xj)-ri-rj; 70 | dij(dij<0)=0; 71 | c = norm(xi-xB)^2 - a*sum(log(dij)); 72 | 73 | end 74 | -------------------------------------------------------------------------------- /matlab/derivations_22ndPol.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | rng(1) 4 | 5 | z_dim = 6; 6 | dim = 2; 7 | z = sym('z%d%d', [z_dim 1],'real'); 8 | z1 = z(1:2); z2 = z(3:4); z3 = z(5:6); 9 | zValues = rand(size(z))*5; 10 | a = sym('a%d%d', [2 1],'real'); 11 | aValues = rand(size(a)); 12 | theta = sym('theta%d%d', [1 z_dim/dim],'real'); 13 | thetaValues = rand(size(theta)); 14 | 15 | R = sym('R%d%d', [dim z_dim],'real'); 16 | % R1div = Rderiv(theta(1)); R2div = Rderiv(theta(2)); R3div = Rderiv(theta(3)); 17 | % Rdiv = [R1div R2div R3div]; 18 | Sigma = sym('Sigma%d%d', [2 2],'real'); 19 | Sigma(2,1) = Sigma(1,2); 20 | % Sigma = inv(Sigma); 21 | SigmaValues = [0.1, 0.01;0.01,0.15]; -------------------------------------------------------------------------------- /matlab/derivations_2ndPol.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | rng(1) 4 | 5 | z_dim = 6; 6 | dim = 2; 7 | z = sym('z%d%d', [z_dim 1],'real'); 8 | z1 = z(1:2); z2 = z(3:4); z3 = z(5:6); 9 | zValues = rand(size(z))*5; 10 | a = sym('a%d%d', [2 1],'real'); 11 | aValues = rand(size(a)); 12 | theta = sym('theta%d%d', [1 z_dim/dim],'real'); 13 | thetaValues = rand(size(theta)); 14 | R2d = @(angle) [cos(angle) -sin(angle); sin(angle) cos(angle)]; 15 | 
Rderiv = @(angle) [-sin(angle) -cos(angle); cos(angle) -sin(angle)]; 16 | R1 = R2d(theta(1)); R2 = R2d(theta(2)); R3 = R2d(theta(3)); 17 | R = [R1 R2 R3]; 18 | R1div = Rderiv(theta(1)); R2div = Rderiv(theta(2)); R3div = Rderiv(theta(3)); 19 | Rdiv = [R1div R2div R3div]; 20 | Sigma = sym('Sigma1%d%d', [2 2],'real'); 21 | Sigma(2,1) = Sigma(1,2); 22 | Sigma1 = inv(Sigma); 23 | SigmaValues = [0.1, 0.01;0.01,0.15]; 24 | 25 | val = @(eqq) double(subs(eqq,[z(:)',theta(:)',a(:)',Sigma(:)'],[zValues(:)',thetaValues(:)',aValues(:)',SigmaValues(:)'])); 26 | 27 | d = 2; 28 | original = log(1/((2*pi)^(d/2)*sqrt(det(Sigma1)))*exp(-1/2*(a-R*z)'*(Sigma1)*(a-R*z))); 29 | 30 | eqq1 = -1/2 * (a-R*z)' * (Sigma1) * (a-R*z); 31 | eqq2 =-1/2*z'*R'*(Sigma1)*R*z + a'*(Sigma1)*R*z - 1/2*a'*(Sigma1)*a; 32 | I = -1/2*z'*R'*(Sigma1)*R*z; 33 | II = a'*(Sigma1)*R*z; 34 | 35 | % for m=1:z_dim 36 | % dum = theta(:,m); 37 | % eval(['theta' num2str(m) ' = dum;']); 38 | % end 39 | 40 | Vgra = @(scalar) arrayfun(@(s,v) val(diff(s,v)),scalar*ones(size(theta)),theta); 41 | gra = @(scalar) arrayfun(@(s,v) diff(s,v),scalar*ones(size(theta)),theta); 42 | 43 | 44 | % 45 | grad1 = Vgra(eqq1); 46 | grad2 = Vgra(eqq2); 47 | 48 | % IIgrad = [val(a'*Sigma1*R1div*z1), val(a'*Sigma1*R2div*z2), val(a'*Sigma1*R3div*z3); Vgra(II)] 49 | % Igrad = val(-(Sigma1)*theta*z*z'); 50 | % total_grad = val((Sigma1)*(a-theta*z)*z') 51 | grad1 52 | 53 | 54 | %% loops 55 | % tot = 0; 56 | % l = 2; 57 | % zz = rand(l,1); 58 | % RR = rand(l,l); 59 | % SS = rand(l,l); 60 | % 61 | % for i=1:l 62 | % for j =1:l 63 | % tot = tot + z1(i)*R1(i,j)*Sigma1(i,j)*R1(j,i)*z1(j); 64 | % end 65 | % end 66 | % tot 67 | % 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /matlab/derivations_check.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | rng(1) 4 | 5 | z_dim = 6; 6 | z = sym('z%d%d', [z_dim 1],'real'); 7 | % zValues = rand(size(z))*5; 8 | zValues = [1,2,3,4,5,6]'; 9 | a = sym('a%d%d', [2 1],'real'); 10 | % aValues = rand(size(a)); 11 | aValues = [0.5,1]'; 12 | theta = sym('theta%d%d', [2 z_dim],'real'); 13 | % thetaValues = rand(size(theta)); 14 | % thetaValues(:,5:6)=0; 15 | thetaValues = [1,0.5,1,0.5,1,0.5;2,0.6,2,0.6,2,0.6]; 16 | Sigma = sym('Sigma%d%d', [2 2],'real'); 17 | SigmaValues = [1,0.01;0.01,1.5]; 18 | val = @(eqq) double(subs(eqq,[z(:)',theta(:)',a(:)',Sigma(:)'],[zValues(:)',thetaValues(:)',aValues(:)',SigmaValues(:)'])); 19 | 20 | d = 2; 21 | original = log(1/((2*pi)^(d/2)*sqrt(det(Sigma)))*exp(-1/2*(a-theta*z)'*inv(Sigma)*(a-theta*z))); 22 | 23 | eqq1 = -1/2 * (a-theta*z)' * inv(Sigma) * (a-theta*z); 24 | eqq2 =-1/2*z'*theta'*inv(Sigma)*theta*z + a'*inv(Sigma)*theta*z - 1/2*a'*inv(Sigma)*a; 25 | I = -1/2*z'*theta'*inv(Sigma)*theta*z; 26 | II = a'*inv(Sigma)*theta*z; 27 | 28 | % for m=1:z_dim 29 | % dum = theta(:,m); 30 | % eval(['theta' num2str(m) ' = dum;']); 31 | % end 32 | 33 | gra = @(scalar) arrayfun(@(s,v) val(diff(s,v)),scalar*ones(size(theta)),theta); 34 | 35 | % 36 | grad1 = gra(eqq1); 37 | grad2 = gra(eqq2); 38 | 39 | IIgrad = val(inv(Sigma)*a*z'); 40 | Igrad = val(-inv(Sigma)*theta*z*z'); 41 | total_grad = val(inv(Sigma)*(a-theta*z)*z') 42 | grad1 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /matlab/distance_def.m: -------------------------------------------------------------------------------- 1 | %% Visualization of the distance between agents (variables) 2 | 
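% The quantities plotted below mirror the clipped distance used in drone_env.py:
%   d_ij = min(d_i_hat, ||x_i - x_j|| - l_i - l_j)
% so log(d_i_hat / d_ij) is zero once agents are farther apart than d_i_hat and
% grows without bound as the gap between them closes.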
3 | clear all; 4 | 5 | di_hat = 5; 6 | max_dist = 7; 7 | x_dist = linspace(0,max_dist,100); 8 | % capping to d_hat, i,e dij = min(di_hat, ||xi-xj||-li-lj) 9 | dij = x_dist; 10 | dij(x_dist>di_hat)=di_hat; 11 | dij_norm = di_hat./dij; 12 | log_d = log(dij_norm); 13 | 14 | %% plots 15 | figure; 16 | grid on; 17 | subplot(3,1,1) 18 | plot(x_dist,dij) 19 | title(['$\hat{d}_i =' num2str(di_hat) '$']) 20 | xlabel('$||x_i-x_j||-l_i-l_j$') 21 | ylabel('$d_{ij} = min(\hat{d}_i, ||x_i-x_j||-l_i-l_j)$') 22 | grid on; 23 | 24 | subplot(3,1,2) 25 | plot(x_dist,dij_norm) 26 | xlabel('$||x_i-x_j||-l_i-l_j$') 27 | ylabel('$\frac{\hat{d}_i}{d_{ij}}$') 28 | grid on; 29 | 30 | subplot(3,1,3) 31 | plot(x_dist,log_d) 32 | xlabel('$||x_i-x_j||-l_i-l_j$') 33 | ylabel('$log(\frac{\hat{d}_i}{d_{ij}})$') 34 | grid on; 35 | 36 | -------------------------------------------------------------------------------- /matlab/images/dij.fig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/matlab/images/dij.fig -------------------------------------------------------------------------------- /matlab/normal_multivariate_pdf.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | mu = [0 0]; 4 | sigma = 0.1; 5 | Sigma = [sigma 0; 0 sigma]; 6 | 7 | 8 | delta_x = 0.05; 9 | 10 | x1 = -3:delta_x:3; 11 | x2 = -3:delta_x:3; 12 | [X1,X2] = meshgrid(x1,x2); 13 | X = [X1(:) X2(:)]; 14 | y = mvnpdf(X,mu,Sigma); 15 | y = reshape(y,length(x2),length(x1)); 16 | surf(x1,x2,y,'EdgeAlpha',0.1) 17 | caxis([min(y(:))-0.5*range(y(:)),max(y(:))]) 18 | % axis([-3 3 -3 3 0 0.4]) 19 | xlabel('x1') 20 | ylabel('x2') 21 | zlabel('Probability Density') 22 | 23 | volumen = sum(sum(y*delta_x^2)) 24 | -------------------------------------------------------------------------------- /matlab/optimal_traj.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | close all; 3 | 4 | T = 30; 5 | i_agents = 3; 6 | ri =[1 1 1]*0; % drone diameter 7 | x_size = 2*i_agents; % dim* n_agents 8 | X_size = T*x_size; 9 | xF = [1 0 4 0 6 0]'; 10 | x0 = [-1 0 -2 0 -3 0]'; 11 | 12 | normalizing_r = ((xF(1:2:end)-xF(1:2:end)').^2+(xF(2:2:end)-xF(2:2:end)').^2).^0.5; 13 | normalizing_r = mink(normalizing_r,2,2); 14 | normalizing_r = normalizing_r(:,2); 15 | 16 | XB=kron(ones(T,1),xF); 17 | ineqA = eye(T) - (triu(ones(T),-1)-triu(ones(T))); 18 | ineqA = kron(ineqA,eye(x_size)); 19 | ineqA = [ineqA;-ineqA]; 20 | 21 | u_max = 0.4; 22 | ineqB = ones(X_size*2,1)*u_max; 23 | ineqB(1:x_size) = u_max + x0; 24 | ineqB(X_size+1:X_size+x_size) = u_max - x0; 25 | 26 | X0 = rand(X_size,1); 27 | % load sol 28 | % X0 = sol; 29 | 30 | fun = @(x) f(x,XB,x_size,T,normalizing_r,ri); 31 | 32 | [sol,fval] = fmincon(fun,X0,ineqA,ineqB) 33 | 34 | %% separate solution: 35 | sol_re = reshape(sol,x_size,T); 36 | x_sol = sol_re(1:2:end,:); 37 | y_sol = sol_re(2:2:end,:); 38 | 39 | for t=2:size(x_sol,2) 40 | figure(1) 41 | plot([x0(1:2:end)';x_sol(:,1:t-1)'],[x0(2:2:end)';y_sol(:,1:t-1)'],'-^') 42 | hold on 43 | plot(xF(1:2:end)',xF(2:2:end)','x') 44 | grid on; 45 | plot(x0(1:2:end),x0(2:2:end),'o') 46 | hold off; 47 | 48 | pause(0.2) 49 | drawnow 50 | end 51 | 52 | 53 | 54 | %% cost function 55 | function obj = f(X,XF,x_size,T,normalazing_r,ri) 56 | 57 | b = 0.4; 58 | 59 | % goal cost 60 | obj_end = (X-XF)'*(X-XF); 61 | 62 | % collision cost 63 | xt = 
reshape(X,x_size,T); 64 | x_t = xt(1:2:end,:); 65 | y_t = xt(2:2:end,:); 66 | 67 | obj_colision = 0; 68 | for t = 1:size(xt,2) 69 | x_diff = x_t(:,t)-x_t(:,t)'; 70 | y_diff = y_t(:,t)-y_t(:,t)'; 71 | 72 | D = (x_diff.^2 + y_diff.^2).^0.5 + (-ri-ri').*(1-eye(3)); 73 | D_clip = min(1,D./normalazing_r)+eye(size(D,1)); 74 | 75 | obj_colision = obj_colision -b*sum(sum(log(D_clip))); 76 | 77 | end 78 | 79 | obj=obj_end + obj_colision; 80 | end -------------------------------------------------------------------------------- /matlab/sol.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/matlab/sol.mat -------------------------------------------------------------------------------- /models/E1000_n10_DiscretePolicy4_b02-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E1000_n10_DiscretePolicy4_b02-A2Cactors.pth -------------------------------------------------------------------------------- /models/E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth -------------------------------------------------------------------------------- /models/E500_M30_LR1e4_badInitialState-actors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E500_M30_LR1e4_badInitialState-actors.pth -------------------------------------------------------------------------------- /models/E500_M30_LR1e4_badInitialState-critics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E500_M30_LR1e4_badInitialState-critics.pth -------------------------------------------------------------------------------- /models/cont_preloaded-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/cont_preloaded-A2Cactors.pth -------------------------------------------------------------------------------- /models/cont_preloaded-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/cont_preloaded-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.01_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.01_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.01_softmax16-A2Ccritics.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.01_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.1_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.1_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.1_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.1_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.2_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.2_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.2_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.2_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.5_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.5_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.5_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.5_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.8_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.8_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.8_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.8_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas1.5_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1.5_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas1.5_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1.5_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas1_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas1_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas2.43_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.43_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas2.43_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.43_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas2.5_softmax16-A2Cactors-old (1).pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.5_softmax16-A2Cactors-old (1).pth -------------------------------------------------------------------------------- /models/deltas/deltas2.5_softmax16-A2Ccritics-old (2).pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.5_softmax16-A2Ccritics-old (2).pth -------------------------------------------------------------------------------- /models/deltas/deltas2_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas2_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas2.5_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas2.5_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas2.5_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas2.5_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/discrete-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/discrete-A2Cactors.pth -------------------------------------------------------------------------------- /models/discrete-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/discrete-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/cont_n5-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/cont_n5-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/cont_n5-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/cont_n5-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/simple-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/simple-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/simple-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/simple-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/softmax8_n4-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n4-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/softmax8_n4-A2Ccritics.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n4-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/softmax8_n5-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n5-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/softmax8_n5-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n5-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/softmax8_n8-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n8-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/softmax8_n8-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n8-A2Ccritics.pth -------------------------------------------------------------------------------- /models/n5_E1500_Advantage-actors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/n5_E1500_Advantage-actors.pth -------------------------------------------------------------------------------- /models/n5_E1500_Advantage-critics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/n5_E1500_Advantage-critics.pth -------------------------------------------------------------------------------- /models/softmax8_n5-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/softmax8_n5-A2Cactors.pth -------------------------------------------------------------------------------- /models/softmax8_n5-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/softmax8_n5-A2Ccritics.pth -------------------------------------------------------------------------------- /models/trained_E1000_M50_LR001-actors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/trained_E1000_M50_LR001-actors.pth 
-------------------------------------------------------------------------------- /policy_performance_variables_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/policy_performance_variables_1 -------------------------------------------------------------------------------- /policy_performance_variables_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/policy_performance_variables_2 -------------------------------------------------------------------------------- /profile.bat: -------------------------------------------------------------------------------- 1 | @REM Simple bat to profile scripts: profile file.py 2 | 3 | @echo off 4 | python -m cProfile -o dump.prof %1 5 | snakeviz dump.prof 6 | del dump.prof -------------------------------------------------------------------------------- /spec-file.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | @EXPLICIT 5 | https://conda.anaconda.org/conda-forge/win-64/nodejs-16.12.0-h57928b3_0.tar.bz2 6 | https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2022.6.15-h5b45459_0.tar.bz2 7 | https://conda.anaconda.org/conda-forge/win-64/intel-openmp-2021.4.0-h57928b3_3556.tar.bz2 8 | https://conda.anaconda.org/conda-forge/win-64/mkl-include-2021.4.0-h0e2418a_729.tar.bz2 9 | https://conda.anaconda.org/conda-forge/win-64/msys2-conda-epoch-20160418-1.tar.bz2 10 | https://conda.anaconda.org/conda-forge/win-64/pandoc-2.16.1-h8ffe710_0.tar.bz2 11 | https://conda.anaconda.org/pytorch/noarch/pytorch-mutex-1.0-cpu.tar.bz2 12 | https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.20348.0-h57928b3_0.tar.bz2 13 | https://conda.anaconda.org/conda-forge/win-64/winpty-0.4.3-4.tar.bz2 14 | https://conda.anaconda.org/pytorch/noarch/cpuonly-2.0-0.tar.bz2 15 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gmp-6.1.0-2.tar.bz2 16 | https://conda.anaconda.org/conda-forge/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2 17 | https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.29.30037-h902a5da_5.tar.bz2 18 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2 19 | https://conda.anaconda.org/conda-forge/win-64/vc-14.2-hb210afc_5.tar.bz2 20 | https://conda.anaconda.org/conda-forge/win-64/ffmpeg-4.3.1-ha925a31_0.tar.bz2 21 | https://conda.anaconda.org/conda-forge/win-64/icu-68.2-h0e60522_0.tar.bz2 22 | https://conda.anaconda.org/conda-forge/win-64/jbig-2.1-h8d14728_2003.tar.bz2 23 | https://conda.anaconda.org/conda-forge/win-64/jpeg-9d-h8ffe710_0.tar.bz2 24 | https://conda.anaconda.org/conda-forge/win-64/lerc-3.0-h0e60522_0.tar.bz2 25 | https://conda.anaconda.org/conda-forge/win-64/libclang-11.1.0-default_h5c34c98_1.tar.bz2 26 | https://conda.anaconda.org/conda-forge/win-64/libdeflate-1.8-h8ffe710_0.tar.bz2 27 | https://conda.anaconda.org/conda-forge/win-64/libsodium-1.0.18-h8d14728_1.tar.bz2 28 | https://conda.anaconda.org/conda-forge/win-64/libuv-1.42.0-h8ffe710_0.tar.bz2 29 | https://conda.anaconda.org/conda-forge/win-64/libzlib-1.2.11-h8ffe710_1013.tar.bz2 30 | https://conda.anaconda.org/conda-forge/win-64/lz4-c-1.9.3-h8ffe710_1.tar.bz2 31 | 
https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2 32 | https://conda.anaconda.org/conda-forge/win-64/openssl-1.1.1q-h8ffe710_0.tar.bz2 33 | https://conda.anaconda.org/conda-forge/win-64/sqlite-3.36.0-h8ffe710_2.tar.bz2 34 | https://conda.anaconda.org/conda-forge/win-64/tbb-2021.4.0-h2d74725_1.tar.bz2 35 | https://conda.anaconda.org/conda-forge/win-64/tk-8.6.11-h8ffe710_1.tar.bz2 36 | https://conda.anaconda.org/conda-forge/win-64/xz-5.2.5-h62dcd97_1.tar.bz2 37 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2 38 | https://conda.anaconda.org/conda-forge/win-64/mkl-2021.4.0-h0e2418a_729.tar.bz2 39 | https://conda.anaconda.org/conda-forge/win-64/python-3.8.12-h7840368_2_cpython.tar.bz2 40 | https://conda.anaconda.org/conda-forge/win-64/zeromq-4.3.4-h0e60522_1.tar.bz2 41 | https://conda.anaconda.org/conda-forge/win-64/zlib-1.2.11-h8ffe710_1013.tar.bz2 42 | https://conda.anaconda.org/conda-forge/noarch/async_generator-1.10-py_0.tar.bz2 43 | https://conda.anaconda.org/conda-forge/noarch/attrs-21.2.0-pyhd8ed1ab_0.tar.bz2 44 | https://conda.anaconda.org/conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2 45 | https://conda.anaconda.org/conda-forge/noarch/backports-1.0-py_2.tar.bz2 46 | https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.0.0-pyhd8ed1ab_0.tar.bz2 47 | https://conda.anaconda.org/conda-forge/noarch/cloudpickle-2.0.0-pyhd8ed1ab_0.tar.bz2 48 | https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.4-pyh9f0ad1d_0.tar.bz2 49 | https://conda.anaconda.org/conda-forge/noarch/cycler-0.11.0-pyhd8ed1ab_0.tar.bz2 50 | https://conda.anaconda.org/conda-forge/noarch/decorator-5.1.0-pyhd8ed1ab_0.tar.bz2 51 | https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2 52 | https://conda.anaconda.org/conda-forge/noarch/entrypoints-0.3-pyhd8ed1ab_1003.tar.bz2 53 | https://conda.anaconda.org/conda-forge/noarch/idna-3.1-pyhd3deb0d_0.tar.bz2 54 | https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2 55 | https://conda.anaconda.org/conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2 56 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_widgets-1.0.2-pyhd8ed1ab_0.tar.bz2 57 | https://conda.anaconda.org/conda-forge/win-64/libblas-3.9.0-12_win64_mkl.tar.bz2 58 | https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.37-h1d00b33_2.tar.bz2 59 | https://conda.anaconda.org/conda-forge/win-64/mkl-devel-2021.4.0-h57928b3_730.tar.bz2 60 | https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.5.1-pyhd8ed1ab_0.tar.bz2 61 | https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2 62 | https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2 63 | https://conda.anaconda.org/conda-forge/noarch/parso-0.8.2-pyhd8ed1ab_0.tar.bz2 64 | https://conda.anaconda.org/conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2 65 | https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.12.0-pyhd8ed1ab_0.tar.bz2 66 | https://conda.anaconda.org/conda-forge/noarch/pycodestyle-2.8.0-pyhd8ed1ab_0.tar.bz2 67 | https://conda.anaconda.org/conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2 68 | https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.0.5-pyhd8ed1ab_0.tar.bz2 69 | https://conda.anaconda.org/conda-forge/win-64/python_abi-3.8-2_cp38.tar.bz2 70 | https://conda.anaconda.org/conda-forge/noarch/pytz-2021.3-pyhd8ed1ab_0.tar.bz2 71 | https://conda.anaconda.org/conda-forge/noarch/qtpy-1.11.2-pyhd8ed1ab_0.tar.bz2 72 | 
https://conda.anaconda.org/conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2 73 | https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 74 | https://conda.anaconda.org/conda-forge/noarch/testpath-0.5.0-pyhd8ed1ab_0.tar.bz2 75 | https://conda.anaconda.org/conda-forge/noarch/textwrap3-0.9.2-py_0.tar.bz2 76 | https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2 77 | https://conda.anaconda.org/conda-forge/noarch/traitlets-5.1.1-pyhd8ed1ab_0.tar.bz2 78 | https://conda.anaconda.org/conda-forge/noarch/typing_extensions-3.10.0.2-pyha770c72_0.tar.bz2 79 | https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2 80 | https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.0-pyhd8ed1ab_1.tar.bz2 81 | https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2 82 | https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.0-h6255e5f_0.tar.bz2 83 | https://conda.anaconda.org/conda-forge/noarch/autopep8-1.6.0-pyhd8ed1ab_1.tar.bz2 84 | https://conda.anaconda.org/conda-forge/noarch/babel-2.9.1-pyh44b312d_0.tar.bz2 85 | https://conda.anaconda.org/conda-forge/win-64/certifi-2022.6.15-py38haa244fe_0.tar.bz2 86 | https://conda.anaconda.org/conda-forge/win-64/cffi-1.15.0-py38hd8c33c5_0.tar.bz2 87 | https://conda.anaconda.org/conda-forge/win-64/chardet-4.0.0-py38haa244fe_2.tar.bz2 88 | https://conda.anaconda.org/conda-forge/win-64/debugpy-1.5.1-py38h885f38d_0.tar.bz2 89 | https://conda.anaconda.org/conda-forge/win-64/freetype-2.10.4-h546665d_1.tar.bz2 90 | https://conda.anaconda.org/conda-forge/win-64/future-0.18.2-py38haa244fe_4.tar.bz2 91 | https://conda.anaconda.org/conda-forge/win-64/importlib-metadata-4.8.2-py38haa244fe_0.tar.bz2 92 | https://conda.anaconda.org/conda-forge/noarch/importlib_resources-5.4.0-pyhd8ed1ab_0.tar.bz2 93 | https://conda.anaconda.org/conda-forge/win-64/jedi-0.18.0-py38haa244fe_3.tar.bz2 94 | https://conda.anaconda.org/conda-forge/win-64/kiwisolver-1.3.2-py38hbd9d945_1.tar.bz2 95 | https://conda.anaconda.org/conda-forge/win-64/libcblas-3.9.0-12_win64_mkl.tar.bz2 96 | https://conda.anaconda.org/conda-forge/win-64/liblapack-3.9.0-12_win64_mkl.tar.bz2 97 | https://conda.anaconda.org/conda-forge/win-64/libtiff-4.3.0-hd413186_2.tar.bz2 98 | https://conda.anaconda.org/conda-forge/win-64/markupsafe-2.0.1-py38h294d835_1.tar.bz2 99 | https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2 100 | https://conda.anaconda.org/conda-forge/win-64/mistune-0.8.4-py38h294d835_1005.tar.bz2 101 | https://conda.anaconda.org/conda-forge/noarch/packaging-21.0-pyhd8ed1ab_0.tar.bz2 102 | https://conda.anaconda.org/conda-forge/win-64/pybox2d-2.3.10-py38h885f38d_1.tar.bz2 103 | https://conda.anaconda.org/conda-forge/win-64/pyqt5-sip-4.19.18-py38h885f38d_7.tar.bz2 104 | https://conda.anaconda.org/conda-forge/win-64/pyrsistent-0.18.0-py38h294d835_0.tar.bz2 105 | https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2 106 | https://conda.anaconda.org/conda-forge/win-64/pywin32-302-py38h294d835_2.tar.bz2 107 | https://conda.anaconda.org/conda-forge/win-64/pywinpty-1.1.5-py38hd3f51b4_1.tar.bz2 108 | https://conda.anaconda.org/conda-forge/win-64/pyzmq-22.3.0-py38h09162b1_1.tar.bz2 109 | https://conda.anaconda.org/conda-forge/win-64/qt-5.12.9-h5909a2a_4.tar.bz2 110 | https://conda.anaconda.org/conda-forge/win-64/setuptools-58.5.3-py38haa244fe_0.tar.bz2 111 | https://conda.anaconda.org/conda-forge/win-64/sniffio-1.2.0-py38haa244fe_2.tar.bz2 112 | 
https://conda.anaconda.org/conda-forge/win-64/tornado-6.1-py38h294d835_2.tar.bz2 113 | https://conda.anaconda.org/conda-forge/noarch/tqdm-4.62.3-pyhd8ed1ab_0.tar.bz2 114 | https://conda.anaconda.org/conda-forge/win-64/websocket-client-0.57.0-py38haa244fe_6.tar.bz2 115 | https://conda.anaconda.org/conda-forge/win-64/win_inet_pton-1.1.0-py38haa244fe_3.tar.bz2 116 | https://conda.anaconda.org/conda-forge/win-64/anyio-3.3.4-py38haa244fe_0.tar.bz2 117 | https://conda.anaconda.org/conda-forge/win-64/argon2-cffi-21.1.0-py38h294d835_2.tar.bz2 118 | https://conda.anaconda.org/conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2 119 | https://conda.anaconda.org/conda-forge/noarch/bleach-4.1.0-pyhd8ed1ab_0.tar.bz2 120 | https://conda.anaconda.org/conda-forge/win-64/brotlipy-0.7.0-py38h294d835_1003.tar.bz2 121 | https://conda.anaconda.org/conda-forge/win-64/cryptography-35.0.0-py38hb7941b4_2.tar.bz2 122 | https://conda.anaconda.org/conda-forge/noarch/jinja2-3.0.2-pyhd8ed1ab_0.tar.bz2 123 | https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.2.1-pyhd8ed1ab_0.tar.bz2 124 | https://conda.anaconda.org/conda-forge/win-64/jupyter_core-4.9.1-py38haa244fe_0.tar.bz2 125 | https://conda.anaconda.org/conda-forge/win-64/lcms2-2.12-h2a16943_0.tar.bz2 126 | https://conda.anaconda.org/conda-forge/win-64/liblapacke-3.9.0-12_win64_mkl.tar.bz2 127 | https://conda.anaconda.org/conda-forge/win-64/numpy-1.21.4-py38h089cfbf_0.tar.bz2 128 | https://conda.anaconda.org/conda-forge/win-64/openjpeg-2.4.0-hb211442_1.tar.bz2 129 | https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2 130 | https://conda.anaconda.org/conda-forge/win-64/pyglet-1.5.16-py38haa244fe_0.tar.bz2 131 | https://conda.anaconda.org/conda-forge/noarch/pygments-2.10.0-pyhd8ed1ab_0.tar.bz2 132 | https://conda.anaconda.org/conda-forge/win-64/pyqt-impl-5.12.3-py38h885f38d_7.tar.bz2 133 | https://conda.anaconda.org/conda-forge/win-64/pysocks-1.7.1-py38haa244fe_4.tar.bz2 134 | https://conda.anaconda.org/conda-forge/noarch/snakeviz-2.1.1-pyhd8ed1ab_0.tar.bz2 135 | https://conda.anaconda.org/conda-forge/win-64/terminado-0.12.1-py38haa244fe_1.tar.bz2 136 | https://conda.anaconda.org/conda-forge/win-64/blas-devel-3.9.0-12_win64_mkl.tar.bz2 137 | https://conda.anaconda.org/conda-forge/win-64/gym-0.21.0-py38h595d716_0.tar.bz2 138 | https://conda.anaconda.org/conda-forge/noarch/jupyter_client-7.0.6-pyhd8ed1ab_0.tar.bz2 139 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_pygments-0.1.2-pyh9f0ad1d_0.tar.bz2 140 | https://conda.anaconda.org/conda-forge/noarch/nbformat-5.1.3-pyhd8ed1ab_0.tar.bz2 141 | https://conda.anaconda.org/conda-forge/win-64/pandas-1.3.4-py38h5d928e2_1.tar.bz2 142 | https://conda.anaconda.org/conda-forge/win-64/pillow-8.3.2-py38h794f750_0.tar.bz2 143 | https://conda.anaconda.org/conda-forge/noarch/pyopenssl-21.0.0-pyhd8ed1ab_0.tar.bz2 144 | https://conda.anaconda.org/conda-forge/win-64/pyqtchart-5.12-py38h885f38d_7.tar.bz2 145 | https://conda.anaconda.org/conda-forge/win-64/pyqtwebengine-5.12.1-py38h885f38d_7.tar.bz2 146 | https://conda.anaconda.org/conda-forge/win-64/scipy-1.7.2-py38ha1292f7_0.tar.bz2 147 | https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2 148 | https://conda.anaconda.org/conda-forge/noarch/autograd-1.4-pyhd8ed1ab_0.tar.bz2 149 | https://conda.anaconda.org/conda-forge/win-64/blas-2.112-mkl.tar.bz2 150 | https://conda.anaconda.org/conda-forge/win-64/matplotlib-base-3.4.3-py38h1f000d6_1.tar.bz2 151 | 
https://conda.anaconda.org/conda-forge/noarch/nbclient-0.5.5-pyhd8ed1ab_0.tar.bz2 152 | https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.22-pyha770c72_0.tar.bz2 153 | https://conda.anaconda.org/conda-forge/win-64/pyqt-5.12.3-py38haa244fe_7.tar.bz2 154 | https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.7-pyhd8ed1ab_0.tar.bz2 155 | https://conda.anaconda.org/conda-forge/win-64/ipython-7.29.0-py38h595d716_1.tar.bz2 156 | https://conda.anaconda.org/conda-forge/win-64/matplotlib-3.4.3-py38haa244fe_1.tar.bz2 157 | https://conda.anaconda.org/conda-forge/win-64/nbconvert-6.2.0-py38haa244fe_0.tar.bz2 158 | https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-3.0.22-hd8ed1ab_0.tar.bz2 159 | https://conda.anaconda.org/pytorch/win-64/pytorch-1.10.0-py3.8_cpu_0.tar.bz2 160 | https://conda.anaconda.org/conda-forge/noarch/requests-2.26.0-pyhd8ed1ab_0.tar.bz2 161 | https://conda.anaconda.org/conda-forge/win-64/ipykernel-6.4.2-py38h595d716_0.tar.bz2 162 | https://conda.anaconda.org/conda-forge/noarch/jupyter_server-1.11.2-pyhd8ed1ab_0.tar.bz2 163 | https://conda.anaconda.org/pytorch/win-64/torchvision-0.11.1-py38_cpu.tar.bz2 164 | https://conda.anaconda.org/conda-forge/noarch/jupyter_console-6.4.0-pyhd8ed1ab_0.tar.bz2 165 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-2.8.2-pyhd8ed1ab_0.tar.bz2 166 | https://conda.anaconda.org/conda-forge/noarch/notebook-6.4.5-pyha770c72_0.tar.bz2 167 | https://conda.anaconda.org/conda-forge/noarch/qtconsole-5.2.0-pyhd8ed1ab_0.tar.bz2 168 | https://conda.anaconda.org/conda-forge/noarch/nbclassic-0.3.4-pyhd8ed1ab_0.tar.bz2 169 | https://conda.anaconda.org/conda-forge/win-64/widgetsnbextension-3.5.2-py38haa244fe_0.tar.bz2 170 | https://conda.anaconda.org/conda-forge/noarch/ipywidgets-7.6.5-pyhd8ed1ab_0.tar.bz2 171 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab-3.2.2-pyhd8ed1ab_0.tar.bz2 172 | https://conda.anaconda.org/conda-forge/win-64/jupyter-1.0.0-py38haa244fe_6.tar.bz2 173 | -------------------------------------------------------------------------------- /train_problem.py: -------------------------------------------------------------------------------- 1 | from collections import deque, namedtuple 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import drone_env 5 | from drone_env import running_average, plot_rewards, plot_grads 6 | from tqdm import tqdm, trange 7 | from SAC_agents import SA2CAgents, RandomAgent, TrainedAgent, SPPOAgents 8 | from utils import ExperienceBuffers, DiscreteSoftmaxNN, NormalPolicy 9 | 10 | plt.style.use('seaborn-dark-palette') 11 | tex_fonts = { 12 | # Use LaTeX to write all text 13 | # "text.usetex": True, 14 | "font.family": "sans-serif", 15 | # Use 10pt font in plots, to match 10pt font in document 16 | "axes.labelsize": 10, 17 | "font.size": 10, 18 | # Make the legend/label fonts a little smaller 19 | "legend.fontsize": 10, 20 | "xtick.labelsize": 10, 21 | "ytick.labelsize": 10 22 | } 23 | plt.rcParams.update(tex_fonts) 24 | 25 | 26 | ### Set up parameters ### 27 | n_agents = 5 28 | deltas = np.ones(n_agents)*2.43 29 | # deltas = None 30 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True) 31 | env.collision_weight = 0.2 # old 0.2 32 | print(env) 33 | # env.show() 34 | 35 | N_Episodes = 3000 36 | episodes_to_plot = [3000] 37 | # episodes_to_plot = [1500] 38 | save_name = "deltas2.5_softmax16" 39 | 40 | discount_factor = 0.99 41 | alpha_critic = 10**-3 42 | alpha_actor = 10**-3 43 | M 
= 10 # Epochs, i.e. steps of the SGD for the actor-critic NN in PPO variant 44 | dim_z = env.local_state_space # Dimension of the localized z_state space 45 | dim_a = env.local_action_space # Dimension of the local action space 46 | 47 | 48 | # Initialize variables 49 | total_collisions_per_episode = [] 50 | total_reward_per_episode = [] 51 | total_true_reward_per_episode =[] 52 | total_t = [] 53 | grad_per_episode = np.zeros([N_Episodes, n_agents]) 54 | gi_per_episode = np.zeros_like(grad_per_episode) 55 | 56 | # times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 57 | 58 | 59 | agents = SA2CAgents(n_agents=env.n_agents, dim_local_state = dim_z, dim_local_action=dim_a, discount=discount_factor, epochs=M, learning_rate_critic=alpha_critic, learning_rate_actor=alpha_critic) 60 | print(f"### Running {type(agents)}, actor: {type(agents.actors[0])} with params: ###") 61 | print(f"Episodes = {N_Episodes}, max Time iterations = {drone_env.max_time_steps} (T = {drone_env.max_time_steps * drone_env.dt}s, dt = {drone_env.dt}s)") 62 | print(f"N of agents = {env.n_agents}, structure of critic NN = {agents.criticsNN[0].input_size}x{agents.criticsNN[0].L1}x{agents.criticsNN[0].L2}x{agents.criticsNN[0].output_size}") 63 | print(f"Discount = {discount_factor}, lr for critic NN = {alpha_critic}, lr for actor = {alpha_actor}, collision weight b = {env.collision_weight}") 64 | 65 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True) 66 | for episode in EPISODES: 67 | 68 | if episode+1 in episodes_to_plot: 69 | # reward_history = np.zeros([len(times), env.n_agents]) 70 | trajectory = [env.state.copy()] 71 | z_trajectory = [env.z_states] 72 | total_episode_reward = 0 73 | total_true_episode_reward = 0 74 | total_episode_collisions = 0 75 | # env.show() 76 | 77 | buffers = ExperienceBuffers(env.n_agents) 78 | 79 | # SIMULATION OVER T 80 | t_iter = 0 81 | finished = False 82 | while not finished: 83 | 84 | state = env.state 85 | z_states = env.z_states 86 | Ni = env.Ni 87 | 88 | # calculate actions based on current state 89 | # actions = drone_env.gradient_control(state, env) 90 | # actions = drone_env.proportional_control(state, env) 91 | actions = agents.forward(z_states, Ni) 92 | 93 | # Update environment one time step with the actions 94 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions) 95 | # EXPERIENCE: [z_state, action, reward, next_z, finished] 96 | buffers.append(z_states, actions, rewards, new_z, Ni, finished) 97 | 98 | total_episode_reward += np.mean(rewards) 99 | total_true_episode_reward += np.mean(true_rewards) 100 | total_episode_collisions += n_collisions 101 | 102 | if episode+1 in episodes_to_plot: 103 | # reward_history[t_iter,:] = reward 104 | trajectory.append(new_state.copy()) 105 | z_trajectory.append(new_z) 106 | 107 | t_iter +=1 108 | 109 | ### END OF EPISODE 110 | # Train the critic with the data of the episode 111 | # current_grad_norms, current_gi_norms = agents.train(buffers, actor_lr = alpha_actor, return_grads=True) 112 | if type(agents.actors[0]) is NormalPolicy: 113 | agents.train_designed_policy(buffers, actor_lr = alpha_actor, return_grads=False) 114 | else: 115 | agents.train_NN(buffers, actor_lr = alpha_actor) 116 | 117 | # Append episodic variables/logs 118 | total_reward_per_episode.append(total_episode_reward) 119 | total_true_reward_per_episode.append(total_true_episode_reward) 120 | total_collisions_per_episode.append(total_episode_collisions) 121 | total_t.append(t_iter) 122 | # grad_per_episode[episode,:] =
np.array(current_grad_norms) 123 | # gi_per_episode[episode,:] = np.array(current_gi_norms) 124 | 125 | if episode+1 in episodes_to_plot: 126 | Q_simulated, V_approx = agents.benchmark_cirtic(buffers, only_one_NN=False) 127 | 128 | # print(f"Episode collisions = {total_episode_collisions}") 129 | # env.animate(trajectory,frame_time=0.1) 130 | 131 | # RESET ENVIRONMENT 132 | env.reset(renew_obstacles=False) 133 | 134 | # Set progress bar description with information 135 | average_reward = running_average(total_reward_per_episode, 50)[-1] 136 | average_true_reward = running_average(total_true_reward_per_episode, 50)[-1] 137 | average_collisions = running_average(total_collisions_per_episode, 50)[-1] 138 | average_t = running_average(total_t, 50)[-1] 139 | EPISODES.set_description( 140 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter} - Average: {average_reward:.1f}/{average_collisions:.2f}/{average_t}. True r={average_true_reward:.1f}.") 141 | 142 | # Plot current trajectory 143 | 144 | if episode+1 in episodes_to_plot: 145 | env.plot(trajectory, episode) 146 | env.animate(trajectory, z_trajectory, deltas, episode, name=f"training-E{episode+1}", format="mp4") 147 | times = np.arange(0, t_iter)*drone_env.dt 148 | plt.figure() 149 | for i in range(env.n_agents): 150 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 151 | plt.plot(times,Q_simulated[i], label=f"i={i}, simulated Q (Gt)", color = agent_color) 152 | plt.plot(times,V_approx[i],"--" , label=f"i={i}, approx V", color = tuple(0.9*x for x in agent_color)) 153 | if type(agents.actors[0]) is NormalPolicy: 154 | print(f"Agent {i} params = {agents.actors[i].parameters}") 155 | plt.legend() 156 | plt.show() 157 | 158 | agents.save(filename=save_name) 159 | 160 | plot_rewards(total_reward_per_episode, total_true_reward_per_episode, total_collisions_per_episode, n_ep_running_average=50) 161 | # plt.savefig("images/reward_training.pdf",format='pdf', bbox_inches='tight') 162 | # plot_grads(grad_per_episode,gi_per_episode) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | # from turtle import forward  # unused accidental auto-import 3 | import numpy as np 4 | from autograd import numpy as anp 5 | from autograd import grad 6 | import torch 7 | # import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from collections import deque, namedtuple 11 | 12 | """ Classes used for agents and other useful functions """ 13 | 14 | class CriticNN(nn.Module): 15 | """ Create local critic network 16 | """ 17 | # NN sizes: define size of hidden layer 18 | L1 = 200 19 | L2 = 200 20 | 21 | def __init__(self, input_size, output_size = 1): 22 | super().__init__() 23 | 24 | self.input_size = input_size 25 | self.output_size = output_size 26 | 27 | # Create input layer with ReLU activation 28 | self.input_layer = nn.Linear(input_size, self.L1) 29 | self.input_layer_activation = nn.ReLU() 30 | 31 | # Create hidden layer 1 32 | self.hidden_layer1 = nn.Linear(self.L1, self.L2) 33 | self.hidden_layer1_activation = nn.ReLU() 34 | 35 | # Create output layer.
NO ACTIVATION 36 | self.output_layer = nn.Linear(self.L2, output_size) 37 | 38 | def forward(self, z): 39 | '''z must be a properly formatted z vector (torch tensor)''' 40 | # Function used to compute the forward pass 41 | 42 | # Compute first layer 43 | l1 = self.input_layer(z) 44 | l1 = self.input_layer_activation(l1) 45 | 46 | # Compute hidden layers 47 | l2 = self.hidden_layer1(l1) 48 | l2 = self.hidden_layer1_activation(l2) 49 | 50 | # Compute output layer 51 | out = self.output_layer(l2) 52 | 53 | return out 54 | 55 | class NormalActorNN(nn.Module): 56 | """ NN for a policy that takes the z state as input and outputs the 2D means and sigma^2 of independent normal distributions 57 | In this case: z[1x6] -> mu[1x2], sigma^2[1x2] 58 | """ 59 | def __init__(self, input_size, lr ,dim_action): 60 | super().__init__() 61 | 62 | self.dim_action = dim_action 63 | # NN sizes: define size of layers 64 | Ls = 400 65 | hidden_1= 200 66 | hidden_2= 200 67 | 68 | # Ls, Create input layer with ReLU activation 69 | self.input_layer = nn.Linear(input_size, Ls) 70 | self.input_layer_activation = nn.ReLU() 71 | 72 | # Create hidden layer, Ls -> head1 73 | self.hidden_layer1 = nn.Linear(Ls, hidden_1) 74 | self.hidden_layer1_activation = nn.ReLU() 75 | # Create hidden layer, Ls -> head2 76 | self.hidden_layer2 = nn.Linear(Ls, hidden_2) 77 | self.hidden_layer2_activation = nn.ReLU() 78 | 79 | # Create output layers for each head: out_1: means (tanh in [-1,1]). out_2: sigma^2 (sigmoid in [0,1]) 80 | self.out_1 = nn.Linear(hidden_1,dim_action) 81 | self.out_1_activation = nn.Tanh() 82 | self.out_2 = nn.Linear(hidden_2,dim_action) 83 | self.out_2_activation = nn.Sigmoid() 84 | 85 | # Use Adam optimizer 86 | self.optimizer = optim.Adam(self.parameters(), lr = lr) 87 | 88 | def forward(self, z): 89 | # Compute first layer 90 | l1 = self.input_layer(z) 91 | l1 = self.input_layer_activation(l1) 92 | 93 | # Compute hidden layers 94 | l2_head1 = self.hidden_layer1(l1) 95 | l2_head1 = self.hidden_layer1_activation(l2_head1) 96 | 97 | l2_head2 = self.hidden_layer2(l1) 98 | l2_head2 = self.hidden_layer2_activation(l2_head2) 99 | 100 | # Compute output layers 101 | out_1 = self.out_1(l2_head1) 102 | out_1 = self.out_1_activation(out_1) 103 | 104 | out_2 = self.out_2(l2_head2) 105 | out_2 = self.out_2_activation(out_2) 106 | 107 | # out_1 = mu, out_2 = sigma^2 108 | return out_1,out_2 109 | 110 | def sample_action(self,z:np.ndarray, N=None): 111 | state_tensor = torch.tensor(z, dtype=torch.float32) 112 | mu_tensor,sigma_tensor = self.forward(state_tensor) 113 | # Normally distributed value with the mu and sigma^2 (variance) from the ActorNN 114 | std = np.sqrt(sigma_tensor.detach().numpy()) 115 | action = np.random.normal(mu_tensor.detach().numpy(),std) 116 | # action = np.clip(action,-1,1) 117 | return action 118 | 119 | def log_p_of_a(self,z:np.ndarray, a:np.ndarray): 120 | state_tensor = torch.tensor(z, dtype=torch.float32) 121 | mu , sigma = self.forward(state_tensor) 122 | if mu.dim() == 1: 123 | p1 = torch.pow(2 * np.pi * sigma[0], -0.5) * torch.exp(-(a[0] - mu[0])**2 / (2 * sigma[0])) 124 | p2 = torch.pow(2 * np.pi * sigma[1], -0.5) * torch.exp(-(a[1] - mu[1])**2 / (2 * sigma[1])) 125 | else: 126 | p1 = torch.pow(2 * np.pi * sigma[:,0], -0.5) * torch.exp(-(a[:,0] - mu[:,0])**2 / (2 * sigma[:,0])) 127 | p2 = torch.pow(2 * np.pi * sigma[:,1], -0.5) * torch.exp(-(a[:,1] - mu[:,1])**2 / (2 * sigma[:,1])) 128 | 129 | p = p1*p2 130 | return torch.log(p) 131 | 132 | class NormalPolicy: 133 | """Policy that uses a multivariate
normal distribution. 134 | The parameters are theta, which are the angles of the rotation matrices for each vector: 135 | mean: mu = R(theta) * z 136 | gradient: w.r.t. theta 137 | covariance matrix: Constant for now, not a parameter 138 | 139 | CAREFUL: Changes the shape of z,a inputs to columns 140 | 141 | NOTICE: individual values of p(a|z) can be greater than 1, as this is a density function (pdf, continuous) 142 | the pdf at a single point makes no sense, it needs to be over a differential of a (i.e. the pdf is per unit length) 143 | """ 144 | def __init__(self, input_size, output_size = 2, Sigma = None) -> None: 145 | 146 | self.take_all_states = False 147 | self.dim = output_size 148 | self.z_dim = input_size 149 | 150 | # param =anp.array([-1.6,-1.6,-1.6]) 151 | param =-anp.ones(int(self.z_dim/self.dim))*0 152 | 153 | self.parameters = param 154 | if Sigma is None: 155 | self.Sigma = anp.eye(self.dim)*0.3 156 | else: 157 | self.Sigma = Sigma 158 | 159 | def p_of_a(self, z:np.ndarray, a:np.ndarray) -> np.ndarray: 160 | ''' a needs to be a row vector (1D flat) 161 | z needs to be a row vector (1D flat) 162 | ''' 163 | pass 164 | 165 | def compute_grad(self, z:np.ndarray, a:np.ndarray, Ni): 166 | ''' a needs to be a row vector (1D flat) 167 | z needs to be a row vector (1D flat) 168 | Ni indicates the states that are neighbors 169 | ''' 170 | # Make vectors proper shape (column, for math) 171 | z.shape = (np.size(z),1) 172 | a.shape = (np.size(a),1) 173 | 174 | # Used to only calculate the gradient of the states that actually count 175 | if self.take_all_states: 176 | idx = anp.ones(int(self.z_dim/self.dim)) 177 | else: 178 | idx = np.arange(1,int(self.z_dim/self.dim+1))<=len(Ni) 179 | 180 | # Define scalar function to which autograd is applied: https://github.com/HIPS/autograd/blob/master/docs/tutorial.md 181 | def my_fun(variable): 182 | R0 = anp.array([[anp.cos(variable[0]), -anp.sin(variable[0])],[anp.sin(variable[0]),anp.cos(variable[0])]])*idx[0] 183 | R1 = anp.array([[anp.cos(variable[1]), -anp.sin(variable[1])],[anp.sin(variable[1]),anp.cos(variable[1])]])*idx[1] 184 | R2 = anp.array([[anp.cos(variable[2]), -anp.sin(variable[2])],[anp.sin(variable[2]),anp.cos(variable[2])]])*idx[2] 185 | R = anp.concatenate((R0,R1,R2),1) 186 | 187 | return (-1/2*(a- R @ z).T @ np.linalg.inv(self.Sigma) @ (a- R @ z))[0,0] 188 | 189 | grad_fun = grad(my_fun) 190 | self.grad = grad_fun(self.parameters) 191 | z.shape = (np.size(z),) 192 | a.shape = (np.size(a),) 193 | 194 | return self.grad 195 | 196 | def clip_grad_norm(self, grad:np.ndarray, clip_norm:float): 197 | # If the gradient norm is to be clipped to a value: 198 | grad_norm = np.linalg.norm(grad.flatten()) 199 | # If the current norm is less than the clipping, do nothing.
If more, rescale so the norm equals clip_norm 200 | if grad_norm <= clip_norm: 201 | return grad 202 | else: 203 | return grad * clip_norm/grad_norm 204 | 205 | 206 | def sample_action(self, z:np.ndarray, Ni): 207 | # Maybe add a mask so that null states are not accounted for 208 | z.shape = (np.size(z),1) 209 | 210 | # Used to only calculate the gradient of the states that actually count 211 | if self.take_all_states: 212 | idx = anp.ones(int(self.z_dim/self.dim)) 213 | else: 214 | idx = np.arange(1,int(self.z_dim/self.dim+1))<=len(Ni) 215 | 216 | variable = self.parameters 217 | R0 = anp.array([[anp.cos(variable[0]), -anp.sin(variable[0])],[anp.sin(variable[0]),anp.cos(variable[0])]])*idx[0] 218 | R1 = anp.array([[anp.cos(variable[1]), -anp.sin(variable[1])],[anp.sin(variable[1]),anp.cos(variable[1])]])*idx[1] 219 | R2 = anp.array([[anp.cos(variable[2]), -anp.sin(variable[2])],[anp.sin(variable[2]),anp.cos(variable[2])]])*idx[2] 220 | # R3 = anp.array([[anp.cos(variable[3]), -anp.sin(variable[3])],[anp.sin(variable[3]),anp.cos(variable[3])]])*idx[3] 221 | R = anp.concatenate((R0,R1,R2),1) 222 | 223 | mu = (R @ z).flatten() 224 | 225 | z.shape = (np.size(z),) 226 | a = np.random.multivariate_normal(mu, self.Sigma) 227 | 228 | # Clip the action to avoid unbounded actions 229 | return np.clip(a,-2,+2) 230 | 231 | 232 | class ExperienceBuffers: 233 | """ List of buffers for each agent. 234 | Each agent has its own buffer: i: ['z_state', 'action', 'local_reward', 'next_z', 'is_finished'] 235 | to get data, example: buffers.buffers[i][t].action 236 | """ 237 | def __init__(self, n_agents): 238 | # Create buffer for each agent 239 | self.buffers = [deque() for i in range(n_agents)] 240 | self.n_agents = n_agents 241 | self.experience = namedtuple('experience', 242 | ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']) 243 | 244 | def append(self,z_states, actions, rewards, new_z, Ni, finished): 245 | # Append experience to the buffer 246 | for i in range(self.n_agents): 247 | # Create localized experience tuple.
Also, flatten state and action vectors 248 | exp = self.experience(z_states[i].flatten(), actions[i].flatten(), rewards[i], new_z[i].flatten(), Ni[i], finished) 249 | self.buffers[i].append(exp) 250 | 251 | def __len__(self): 252 | # Overload the len operator 253 | return len(self.buffers[0]) 254 | 255 | class DiscreteSoftmaxNN(nn.Module): 256 | """ 257 | NN for the policy with finite actions, in which the input is the state and the output is the softmax probability of each action 258 | """ 259 | def __init__(self, input_size, lr , n_actions = 8): 260 | super().__init__() 261 | 262 | # Create action dictionary action_arg -> 2D continuous action array 263 | action_norm = 1 264 | action_list = [] 265 | for n_action in range(n_actions): 266 | ax = np.cos(n_action/n_actions * 2*np.pi)*action_norm 267 | ay = np.sin(n_action/n_actions * 2*np.pi)*action_norm 268 | action_list.append(np.array([ax,ay])) 269 | self.action_list = np.array(action_list) 270 | 271 | # Structure z -> Ls[ReLU] -> hidden1[ReLU] -> out[softmax] 272 | Ls = 300 273 | hidden_1 = 300 274 | self.n_actions = n_actions 275 | 276 | # Ls, Create input layer with ReLU activation 277 | self.input_layer = nn.Linear(input_size, Ls) 278 | self.input_layer_activation = nn.ReLU() 279 | # Create hidden layer, Ls -> head1 280 | self.hidden_layer1 = nn.Linear(Ls, hidden_1) 281 | self.hidden_layer1_activation = nn.ReLU() 282 | # Create output layer with softmax -> probabilities 283 | self.out_1 = nn.Linear(hidden_1,n_actions) 284 | self.out_1_activation = nn.Softmax(dim=0) 285 | 286 | # Use Adam optimizer 287 | self.optimizer = optim.Adam(self.parameters(), lr = lr) 288 | 289 | def forward(self,z): 290 | # Compute first layer 291 | l1 = self.input_layer(z) 292 | l1 = self.input_layer_activation(l1) 293 | 294 | # Compute hidden layers 295 | l2 = self.hidden_layer1(l1) 296 | l2 = self.hidden_layer1_activation(l2) 297 | 298 | # Compute output layers 299 | out_1 = self.out_1(l2) 300 | out_1 = self.out_1_activation(out_1) 301 | 302 | return out_1 303 | 304 | def sample_action(self,z:np.ndarray, N=None): 305 | state_tensor = torch.tensor(z, dtype=torch.float32) 306 | probs = self.forward(state_tensor) 307 | chosen_action_arg = np.random.choice(self.n_actions, p = probs.squeeze().detach().numpy()) 308 | 309 | return self.action_list[chosen_action_arg] 310 | 311 | def log_p_of_a(self,z:np.ndarray, a:np.ndarray): 312 | # Find the argument corresponding to the a array 313 | action_arg = np.where((self.action_list[:,0] == a[0]) & (self.action_list[:,1]==a[1]))[0][0] 314 | 315 | state_tensor = torch.tensor(z, dtype=torch.float32) 316 | probs = self.forward(state_tensor) 317 | log_prob = torch.log(probs.squeeze()[action_arg]) 318 | return log_prob 319 | -------------------------------------------------------------------------------- /variables_with_delta_change_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_1 -------------------------------------------------------------------------------- /variables_with_delta_change_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_2 -------------------------------------------------------------------------------- /variables_with_delta_change_3:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_3 -------------------------------------------------------------------------------- /variables_with_delta_change_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_4 --------------------------------------------------------------------------------
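For quick reference, a minimal standalone sketch (not one of the repository files) of the action model implemented by NormalPolicy in utils.py: the mean is built by stacking one 2x2 rotation block per neighbor state, the action is drawn from a fixed-covariance Gaussian (Sigma = 0.3*I, as in the class), then clipped to [-2, 2]. The numeric values of theta and z below are illustrative only.

import numpy as np

def rot(theta):
    # 2x2 rotation matrix R(theta)
    return np.array([[np.cos(theta), -np.sin(theta)],
                     [np.sin(theta),  np.cos(theta)]])

theta = np.array([0.0, 0.5, -0.5])                    # policy parameters (angles), example values
z = np.array([1.0, 0.0, 0.5, 0.5, -1.0, 0.2])         # flattened local state, 3 blocks of 2
R = np.concatenate([rot(t) for t in theta], axis=1)   # 2x6 block matrix [R0 R1 R2]
mu = R @ z                                            # mean of the action distribution
Sigma = np.eye(2) * 0.3                               # constant covariance, as in NormalPolicy
a = np.clip(np.random.multivariate_normal(mu, Sigma), -2, 2)
print(mu, a)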