├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── SAC_agents.py ├── benchmark_agent.py ├── control_test.py ├── drone_env.py ├── images ├── E1000_n10_DiscretePolicy4_b02.png ├── E1500_n5_DiscretePolicy8_b02.png ├── collisions_hist.pdf └── delta_effect.pdf ├── learning_Q_test.py ├── matlab ├── cost_field.m ├── derivations_22ndPol.m ├── derivations_2ndPol.m ├── derivations_check.m ├── distance_def.m ├── images │ ├── dij.eps │ └── dij.fig ├── normal_multivariate_pdf.m ├── optimal_traj.m └── sol.mat ├── models ├── E1000_n10_DiscretePolicy4_b02-A2Cactors.pth ├── E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth ├── E500_M30_LR1e4_badInitialState-actors.pth ├── E500_M30_LR1e4_badInitialState-critics.pth ├── cont_preloaded-A2Cactors.pth ├── cont_preloaded-A2Ccritics.pth ├── deltas │ ├── deltas0.01_softmax16-A2Cactors.pth │ ├── deltas0.01_softmax16-A2Ccritics.pth │ ├── deltas0.1_softmax16-A2Cactors.pth │ ├── deltas0.1_softmax16-A2Ccritics.pth │ ├── deltas0.2_softmax16-A2Cactors.pth │ ├── deltas0.2_softmax16-A2Ccritics.pth │ ├── deltas0.5_softmax16-A2Cactors.pth │ ├── deltas0.5_softmax16-A2Ccritics.pth │ ├── deltas0.8_softmax16-A2Cactors.pth │ ├── deltas0.8_softmax16-A2Ccritics.pth │ ├── deltas1.5_softmax16-A2Cactors.pth │ ├── deltas1.5_softmax16-A2Ccritics.pth │ ├── deltas1_softmax16-A2Cactors.pth │ ├── deltas1_softmax16-A2Ccritics.pth │ ├── deltas2.43_softmax16-A2Cactors.pth │ ├── deltas2.43_softmax16-A2Ccritics.pth │ ├── deltas2.5_softmax16-A2Cactors-old (1).pth │ ├── deltas2.5_softmax16-A2Ccritics-old (2).pth │ ├── deltas2_softmax16-A2Cactors.pth │ └── deltas2_softmax16-A2Ccritics.pth ├── deltas2.5_softmax16-A2Cactors.pth ├── deltas2.5_softmax16-A2Ccritics.pth ├── discrete-A2Cactors.pth ├── discrete-A2Ccritics.pth ├── final │ ├── cont_n5-A2Cactors.pth │ ├── cont_n5-A2Ccritics.pth │ ├── simple-A2Cactors.pth │ ├── simple-A2Ccritics.pth │ ├── softmax8_n4-A2Cactors.pth │ ├── softmax8_n4-A2Ccritics.pth │ ├── softmax8_n5-A2Cactors.pth │ ├── softmax8_n5-A2Ccritics.pth │ ├── softmax8_n8-A2Cactors.pth │ └── softmax8_n8-A2Ccritics.pth ├── n5_E1500_Advantage-actors.pth ├── n5_E1500_Advantage-critics.pth ├── softmax8_n5-A2Cactors.pth ├── softmax8_n5-A2Ccritics.pth └── trained_E1000_M50_LR001-actors.pth ├── policy_performance_variables_1 ├── policy_performance_variables_2 ├── profile.bat ├── spec-file.txt ├── train_problem.py ├── utils.py ├── variables_with_delta_change_1 ├── variables_with_delta_change_2 ├── variables_with_delta_change_3 └── variables_with_delta_change_4 /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # my own thoings 2 | notes.md 3 | 4 | # Folders 5 | videos/ 6 | *.prof 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | 148 | # pytype static type analyzer 149 | .pytype/ 150 | 151 | # Cython debug symbols 152 | cython_debug/ 153 | 154 | # PyCharm 155 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 156 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 157 | # and can be added to the global gitignore or merged into this file. For a more nuclear 158 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
159 | #.idea/ 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Andreu Matoses 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scalable Collision Avoidance RL 2 | 3 | 4 | Multi-agent scalable reinforcement learning for formation control with collision avoidance. The paper and more information can be found on the [website 🌍](https://andreumatoses.github.io/research/msc-thesis-marl) 5 | 6 | 7 | 8 | 9 | This is part of my Master's Thesis at the Royal Institute of Technology (KTH), Stockholm, Sweden (URL available soon). The scalable part of this work is inspired by the paper [Scalable Reinforcement Learning for Multi-Agent Networked Systems](https://arxiv.org/abs/1912.02906). The approach presented in the thesis exploits the structure of the designed reward to construct a ∆-disk local approximation of the individual Q functions and policy gradients. 10 | 11 | ## Important Scripts 12 | 13 | - [drone_env.py](drone_env.py): Script containing the environment class and its methods. The main methods follow the structure of the OpenAI Gym RL environments, such as `.step()` and `.reset()`. 14 | - [train_problem.py](train_problem.py): Script containing the training of the main problem. 15 | - [SAC_agents.py](SAC_agents.py): Script containing the agent classes and their policy classes. 16 | - [benchmark_agent.py](benchmark_agent.py): Script to run trained agents and benchmark their performance. 17 | 18 | ## Training of the scalable agents 19 | The schema of the algorithm used is presented below. A scalable actor-critic agent is trained for each robotic agent, for a total of *n* agents.
20 | 21 | ![image](https://user-images.githubusercontent.com/46297629/216669445-a07214a4-08e5-46d8-85e5-f30855f3e8fc.png) 22 | 23 | ## Structure of the training script 24 | ![image](https://user-images.githubusercontent.com/46297629/216669814-5e9465ef-f0a8-46cb-a53a-35645a799e70.png) 25 | 26 | ## Relevant results 27 | ### Softmax discrete policy 28 | The individual policy for each agent has the form: 29 | 30 | ![image](https://user-images.githubusercontent.com/46297629/216673328-1cea1dc8-26fe-4b50-9618-ccf30043801e.png) 31 | 32 | Some examples of trajectories obtained after successful training are shown below: 33 | 34 | ![image](https://user-images.githubusercontent.com/46297629/216674048-68c2473c-b398-4d60-8bfc-de6049065911.png) 35 | 36 | ### Continuous normally distributed policy 37 | The individual policy for each agent has the form: 38 | 39 | ![image](https://user-images.githubusercontent.com/46297629/216673469-c5b07220-ee01-4d7f-a1bf-010b21619b19.png) 40 | 41 | Some examples of trajectories obtained after successful training are shown below: 42 | 43 | ![image](https://user-images.githubusercontent.com/46297629/216673912-91e69f5a-f6dc-49b3-ac77-fed1a8cecec5.png) 44 | 45 | ## Comparison of tested policies 46 | The number of collisions shown in the results is defined as an agent intersecting a neighbour’s collision radius in a given time step. Each agent counts collisions separately, so two agents colliding is counted as two different collisions, and a collision that lasts for several time steps is counted as several distinct collisions. 47 | 48 | Percentage of simulations that achieve each number of collisions for each of the three tested policies, n = 5. The percentages for more than 14 collisions (under 1%) have been omitted. 49 | 50 | ![image](https://user-images.githubusercontent.com/46297629/216673529-5bf6a5be-c149-43cd-b0d7-ead7099d29dd.png) 51 | 52 | Effect of the ∆-disk radius (definition 12 in the thesis) on the global reward and number of collisions, averaged over 2000 runs of the trained policies, for the discrete softmax NN policy. 53 | 54 | ![image](https://user-images.githubusercontent.com/46297629/216673581-b48750c7-eedc-4e04-92d4-21ade45c398a.png) 55 | 56 | The results also show that the average reward starts to decrease after a certain value of ∆i, in this case around 1, and the average number of collisions increases sharply back towards the values obtained when the agent has nearly no vision. This unexpected behaviour is the result of a significant increase in the complexity of the maximization problem that the policy gradient is trying to solve. Taking into account an increasing number of neighbours, at larger distances, increases the variance of the estimated approximate gradient, and as a result the policy is not able to find improvements. Indeed, for the final case of ∆i ≈ dˆi, training does not converge.
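As a rough illustration of the two definitions used above, the sketch below computes, for a single time step, each agent's collision count and its ∆-disk neighbourhood from positions, radii and sensing radii. This is a simplified example, not the code used in [drone_env.py](drone_env.py); the function and variable names (`collisions_and_neighbours`, `x`, `l`, `delta`) are illustrative only.

```python
import numpy as np

def collisions_and_neighbours(x, l, delta):
    """x: (n, 2) positions, l: (n,) agent radii, delta: (n,) sensing radii.
    Returns per-agent collision counts and Delta-disk neighbour lists
    (each agent includes itself), following the definitions above."""
    n = len(x)
    collisions = np.zeros(n, dtype=int)
    neighbours = [[i] for i in range(n)]
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            # surface-to-surface distance between agents i and j
            d_ij = np.linalg.norm(x[i] - x[j]) - l[i] - l[j]
            if d_ij <= 0:
                collisions[i] += 1       # counted separately by each agent
            if d_ij <= delta[i]:
                neighbours[i].append(j)  # j lies inside agent i's Delta-disk
    return collisions, neighbours

# Example: two of three agents overlap, so each of them records one collision.
x = np.array([[0.0, 0.0], [0.15, 0.0], [2.0, 2.0]])
l = np.array([0.1, 0.1, 0.1])
delta = np.array([1.0, 1.0, 1.0])
print(collisions_and_neighbours(x, l, delta))
```

In the repository itself these quantities are produced by the environment (`drone_env.drones.step()` returns the collision count of the new state together with the localized states), so the sketch is only meant to make the counting convention explicit.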
57 | 58 | -------------------------------------------------------------------------------- /SAC_agents.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | from collections import deque, namedtuple 7 | from utils import * 8 | 9 | class RandomAgent: 10 | ''' Agent taking actions uniformly at random, child of the class Agent''' 11 | def __init__(self, n_actions: int): 12 | # super(RandomAgent, self).__init__(n_actions) 13 | self.n_actions = n_actions 14 | 15 | def forward(self, state: np.ndarray) -> np.ndarray: 16 | ''' Compute a random action in [-1, 1] 17 | 18 | Returns: 19 | action (np.ndarray): array of float values containing the 20 | action. 21 | ''' 22 | return np.clip(-1 + 2 * np.random.rand(self.n_actions), -1, 1) 23 | 24 | class TrainedAgent: 25 | ''' Agent that loads and follows a learned policy/critic 26 | ''' 27 | def __init__(self, critics_name:str, actors_name:str, n_agents = "auto", discount = 0.99): 28 | 29 | file_name_critics = os.path.join("models", critics_name) 30 | # Load critic 31 | try: 32 | criticsNN = torch.load(file_name_critics) 33 | print(f'Loaded Critic, n_agents = {len(criticsNN)}, discount = {discount}. Network model[0]: {criticsNN[0]}') 34 | except: 35 | print(f'File {file_name_critics} not found!') 36 | exit(-1) 37 | 38 | self.criticsNN = criticsNN 39 | self.critics_name = critics_name 40 | 41 | if n_agents == "auto": 42 | self.n_agents = len(criticsNN) 43 | else: 44 | self.n_agents = n_agents 45 | 46 | # load actor 47 | file_name_actors = os.path.join("models", actors_name) 48 | 49 | try: 50 | actors = torch.load(file_name_actors) 51 | print(f'Loaded actors, n_agents = {len(criticsNN)}, discount = {discount}. Type: {type(actors[0])}') 52 | except: 53 | print(f'File {file_name_actors} not found!') 54 | exit(-1) 55 | self.actors = actors 56 | self.actors_name = actors_name 57 | 58 | self.discount = discount # to benchmark the critic 59 | 60 | def forward(self, z_states: list, N:list): 61 | """ Returns: 62 | action (np.ndarray): array of float values containing the 63 | action. The dimensionality is equal to self.n_actions from 64 | the parent class Agent 65 | """ 66 | actions = [] 67 | if type(self.actors[0]) is NormalPolicy or type(self.actors[0]) is NormalActorNN or type(self.actors[0]) is DiscreteSoftmaxNN: 68 | # z_state in this case 69 | for i in range(self.n_agents): 70 | z_state = z_states[i].flatten() 71 | Ni = N[i] 72 | if i < len(self.actors): 73 | actor = self.actors[i] 74 | else: 75 | actor = self.actors[0] 76 | 77 | action = actor.sample_action(z_state, Ni) 78 | actions.append(action) 79 | else: 80 | print(f"Error type of policy {type(self.actors[0])}") 81 | 82 | return actions 83 | 84 | def benchmark_cirtic(self, buffers: deque, only_one_NN = False): 85 | 86 | Gts = deque() # for debug, delete after 87 | V_approxs = deque() # for debug, delete after 88 | criticNN= self.criticsNN[0] 89 | 90 | for i in range(self.n_agents): 91 | # NN for this agent: 92 | if not only_one_NN: 93 | if i < len(self.criticsNN): 94 | criticNN = self.criticsNN[i] 95 | else: 96 | criticNN = self.criticsNN[0] 97 | 98 | # separate data from experience buffer 99 | buffer = buffers.buffers[i] 100 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 101 | 102 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. 
If instead V(s), input = [s] 103 | # inputs = np.column_stack((states,actions)) 104 | # inputs = torch.tensor(inputs, dtype=torch.float32) 105 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 106 | 107 | # Calculate the simulated Q value (target), Monte carlo Gt 108 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 109 | T = len(rewards) 110 | Gt_array = np.zeros([T]) 111 | Gt_array[-1] = rewards[-1] 112 | for t in range(T-2,-1,-1): 113 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 114 | 115 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze() 116 | 117 | # value function: # calculate the approximated Q(s,a) = NN(input) 118 | V_approx = criticNN(inputs).squeeze() 119 | 120 | V_approxs.append(V_approx.detach().numpy()) 121 | Gts.append(Gt_array) # for debug 122 | 123 | # Gts is the simulated Q(st,at) values for each agent. Q(s,a)-V(s) ~= Gt-V(st) = A(s,a) 124 | return Gts, V_approxs 125 | 126 | class SA2CAgents: 127 | 128 | def __init__(self, n_agents, dim_local_state, dim_local_action, discount, epochs, learning_rate_critic = 10**(-3), learning_rate_actor = 10**(-3)) -> None: 129 | '''* dim_local_state is the total size of the localized vector that the input of the Q and pi approximations use, i.e (k+1)*dim''' 130 | 131 | self.n_agents = n_agents 132 | self.dim_local_state = dim_local_state 133 | self.dim_local_action = dim_local_action 134 | # self.policy_type = policy_type # What kind of policy (NN, stochastic normal dist, etc...) 135 | self.discount = discount 136 | self.epochs = epochs 137 | 138 | # preload_NN = "models\\final\\cont_n5" 139 | preload_NN = None 140 | # Define policy (actor) 141 | if preload_NN is None: 142 | # self.actors = [NormalPolicy(dim_local_state,dim_local_action) for i in range(n_agents)] 143 | self.actors = [DiscreteSoftmaxNN(dim_local_state, lr = learning_rate_actor, n_actions=16) for i in range(n_agents)] 144 | # self.actors = [NormalActorNN(dim_local_state, lr = learning_rate_actor, dim_action=dim_local_action) for i in range(n_agents)] 145 | self.learning_rate_actor = learning_rate_actor 146 | 147 | # List of NN that estimate Q (or V if we use advantage) 148 | # self.criticsNN = [CriticNN(dim_local_state + dim_local_action, output_size=1) for i in range(n_agents)] 149 | self.criticsNN = [CriticNN(dim_local_state, output_size=1) for i in range(n_agents)] 150 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)] 151 | else: 152 | try: 153 | actors = torch.load(preload_NN + "-A2Cactors.pth") 154 | print(f'Loaded actors, n_agents = {len(actors)}, discount = {discount}. Type: {type(actors[0])}') 155 | except: 156 | print(f'File {preload_NN + "-A2Cactors.pth"} not found!') 157 | exit(-1) 158 | self.actors = actors 159 | self.learning_rate_actor = learning_rate_actor 160 | try: 161 | criticsNN = torch.load(preload_NN + "-A2Ccritics.pth") 162 | print(f'Loaded Critic, n_agents = {len(criticsNN)}, discount = {discount}. 
Network model[0]: {criticsNN[0]}') 163 | except: 164 | print(f'File {preload_NN + "-A2Ccritics.pth"} not found!') 165 | exit(-1) 166 | self.criticsNN = criticsNN 167 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)] 168 | 169 | 170 | def forward(self, z_states, N) -> list: 171 | ''' Function that calculates the actions to take from the z_states list (control law) 172 | actions: list of row vectors [u1^T, u2^T,...]''' 173 | 174 | actions = deque() 175 | for i in range(self.n_agents): 176 | z_state = z_states[i].flatten() 177 | actions.append(self.actors[i].sample_action(z_state, N[i])) 178 | # actions.append(self.actors[i].sample_action(z_state, [1])) 179 | 180 | return actions 181 | 182 | def train_designed_policy(self, buffers: deque, actor_lr = None, return_grads = False): 183 | epochs = self.epochs 184 | 185 | if actor_lr is not None: 186 | self.learning_rate_actor = actor_lr 187 | 188 | Gts = deque() # for debug, delete after -> acces data Gts[i][t] 189 | 190 | # CRITIC LOOP 191 | for i in range(self.n_agents): 192 | # NN for this agent: 193 | criticNN = self.criticsNN[i] 194 | critic_optimizer = self.critic_optimizers[i] 195 | 196 | # separate data from experience buffer 197 | buffer = buffers.buffers[i] 198 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 199 | 200 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s] 201 | # inputs = np.column_stack((states,actions)) 202 | # inputs = torch.tensor(inputs, dtype=torch.float32) 203 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 204 | 205 | # Calculate the simulated Q value (target), Monte carlo Gt 206 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 207 | T = len(rewards) 208 | Gt_array = np.zeros([T]) 209 | Gt_array[-1] = rewards[-1] 210 | for t in range(T-2,-1,-1): 211 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 212 | 213 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze() 214 | Gts.append(Gt_array) # for debug 215 | 216 | ### Perfrom omega (critic) update: 217 | # Set gradient to zero 218 | critic_optimizer.zero_grad() 219 | # value function: # calculate the approximated V(s) = NN(input) 220 | V_approx = criticNN(inputs).squeeze() 221 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0 222 | loss = nn.functional.mse_loss(V_approx, Gt) 223 | # Compute gradient 224 | loss.backward() 225 | # Clip gradient norm to avoid infinite gradient 226 | nn.utils.clip_grad_norm_(criticNN.parameters(), max_norm=10) 227 | # Update 228 | critic_optimizer.step() 229 | 230 | # ACTOR LOOP 231 | grad_norms = [] 232 | gi_norms = [] 233 | for i in range(self.n_agents): 234 | # to access buffer data: buffers.buffers[i][t].action, namedtuple('experience', ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']) 235 | # Gt: Gts[i][t] 236 | actor = self.actors[i] 237 | gi = 0 #initialize to 0 238 | 239 | for t in range(T): 240 | zit = buffers.buffers[i][t].z_state 241 | ait = buffers.buffers[i][t].action 242 | Nit = buffers.buffers[i][t].Ni 243 | 244 | grad_actor = actor.compute_grad(zit,ait, Nit) 245 | # PUT Nit HERE INSTEAD of [1,2,3] 246 | # grad_actor = actor.clip_grad_norm(grad_actor,clip_norm=100) 247 | 248 | # Qj_sum = 0 249 | Advantage_j_sum = 0 250 | input_tensor = torch.tensor(zit, dtype=torch.float32) 251 | # Baseline is the Vi(s) for current agent. 
reduce variance and complexity 252 | Vi_baseline = self.criticsNN[i](input_tensor).detach().numpy()[0] 253 | for j in Nit: # i included here 254 | zjt = buffers.buffers[j][t].z_state 255 | ajt = buffers.buffers[j][t].action 256 | # Q_input_tensor = torch.tensor(np.hstack((zjt,ajt)), dtype=torch.float32) 257 | # Qj = self.criticsNN[j](Q_input_tensor).detach().numpy()[0] 258 | # input_tensor = torch.tensor(zjt, dtype=torch.float32) 259 | # Vj = self.criticsNN[j](input_tensor).detach().numpy()[0] 260 | Advantage_j_sum += (Gts[j][t] - Vi_baseline) 261 | # Qj_sum += Gts[j][t] 262 | 263 | # gi += self.discount**t * 1/self.n_agents* grad_actor * Qj_sum 264 | gi += self.discount**t * 1/self.n_agents* grad_actor * Advantage_j_sum 265 | 266 | # Update policy parameters with approx gradient gi (clipped to avoid infinity gradients) 267 | gi = actor.clip_grad_norm(gi, clip_norm=100) 268 | # MAKE SURE TO CLIP THE PARAMS from 0 to 2*Pi 269 | actor.parameters = actor.parameters + self.learning_rate_actor*gi 270 | # actor.parameters = np.clip(actor.parameters + self.learning_rate_actor*gi, -2*np.pi, 2*np.pi) 271 | 272 | # print(f"grad norms gi={np.linalg.norm(gi.flatten())}") 273 | if return_grads: 274 | grad_norms.append(np.linalg.norm(grad_actor.flatten())) 275 | gi_norms.append(np.linalg.norm(gi.flatten())) 276 | 277 | if return_grads: 278 | return grad_norms, gi_norms 279 | 280 | def train_NN(self, buffers: deque, actor_lr = None): 281 | epochs = self.epochs 282 | 283 | if actor_lr is not None: 284 | self.learning_rate_actor = actor_lr 285 | 286 | Gts = deque() # for debug, delete after -> acces data Gts[i][t] 287 | T = len(buffers) 288 | 289 | # CRITIC LOOP 290 | for i in range(self.n_agents): 291 | # NN for this agent: 292 | criticNN = self.criticsNN[i] 293 | critic_optimizer = self.critic_optimizers[i] 294 | 295 | # separate data from experience buffer 296 | buffer = buffers.buffers[i] 297 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 298 | 299 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. 
If instead V(s), input = [s] 300 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 301 | 302 | # Calculate the simulated Q value (target), Monte carlo Gt 303 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 304 | Gt_array = np.zeros([T]) 305 | Gt_array[-1] = rewards[-1] 306 | for t in range(T-2,-1,-1): 307 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 308 | 309 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze() 310 | Gts.append(Gt_array) # for debug 311 | 312 | ## Perfrom omega (critic) update: 313 | # Set gradient to zero 314 | critic_optimizer.zero_grad() 315 | # value function: # calculate the approximated V(s) = NN(input) 316 | V_approx = criticNN(inputs).squeeze() 317 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0 318 | loss = nn.functional.mse_loss(V_approx, Gt) 319 | # Compute gradient 320 | loss.backward() 321 | # Clip gradient norm to avoid infinite gradient 322 | nn.utils.clip_grad_norm_(criticNN.parameters(), max_norm=10) 323 | # Update 324 | critic_optimizer.step() 325 | 326 | # ACTOR LOOP 327 | for i in range(self.n_agents): 328 | # to access buffer data: buffers.buffers[i][t].action, namedtuple('experience', ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']) 329 | # Gt: Gts[i][t] 330 | actor = self.actors[i] 331 | actor_loss = torch.tensor(0, dtype=torch.float32) 332 | 333 | for t in range(T): 334 | zit = buffers.buffers[i][t].z_state 335 | ait = buffers.buffers[i][t].action 336 | Nit = buffers.buffers[i][t].Ni 337 | 338 | log_prob_tensor = actor.log_p_of_a(zit,ait) 339 | 340 | Advantage_j_sum = 0 341 | input_tensor = torch.tensor(zit, dtype=torch.float32) 342 | # Baseline is the Vi(s) for current agent. reduce variance and complexity 343 | Vi_baseline = self.criticsNN[i](input_tensor).detach().numpy()[0] 344 | # Advantage_j_sum += (Gts[i][t] - Vi_baseline) 345 | for j in Nit: # i included here 346 | Advantage_j_sum += (Gts[j][t] - Vi_baseline) 347 | # Advantage_j_sum += (Gts[j][t]) 348 | 349 | # gi += self.discount**t * 1/self.n_agents* grad_actor * Qj_sum 350 | # actor_loss = actor_loss - self.discount**t * 1/self.n_agents* log_prob_tensor * Advantage_j_sum 351 | actor_loss = actor_loss -log_prob_tensor * 1/self.n_agents * self.discount**t * Advantage_j_sum 352 | 353 | # Update policy parameters 354 | actor.optimizer.zero_grad() 355 | actor_loss.backward() 356 | nn.utils.clip_grad_norm_(actor.parameters(), max_norm=10) 357 | actor.optimizer.step() 358 | 359 | 360 | def benchmark_cirtic(self, buffers: deque, only_one_NN = False): 361 | 362 | Gts = deque() # for debug, delete after 363 | V_approxs = deque() # for debug, delete after 364 | criticNN= self.criticsNN[0] 365 | 366 | for i in range(self.n_agents): 367 | # NN for this agent: 368 | if not only_one_NN: 369 | criticNN = self.criticsNN[i] 370 | 371 | # separate data from experience buffer 372 | buffer = buffers.buffers[i] 373 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 374 | 375 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) 376 | # inputs = np.column_stack((states,actions)) 377 | inputs = torch.tensor(np.array(states), dtype=torch.float32) 378 | ## Actor update 379 | 380 | # Calculate the simulated Q value (target), Monte carlo Gt 381 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 382 | T = len(rewards) 383 | Gt_array = np.zeros([T]) 384 | Gt_array[-1] = rewards[-1] 385 | for t in range(T-2,-1,-1): 386 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 387 | 388 | Gt = 
torch.tensor(Gt_array, dtype=torch.float32).squeeze() 389 | 390 | # value function: # calculate the approximated Q(s,a) ~= Gt = V(s) + A(s,a) => Gt-V(s) = A(s,a) 391 | V_approx = criticNN(inputs).squeeze() 392 | V_approxs.append(V_approx.detach().numpy()) 393 | # Q_approxs.append(Gt_array) 394 | Gts.append(Gt_array) # for debug 395 | 396 | # Gts is the simulated Q(st,at) values for each agent. Q(s,a)-V(s) ~= Gt-V(st) = A(s,a) 397 | return Gts, V_approxs 398 | 399 | def save(self,filename = "network"): 400 | folder ="models" 401 | cirtic_name = filename + "-A2Ccritics.pth" 402 | actors_name = filename + "-A2Cactors.pth" 403 | 404 | torch.save(self.criticsNN, os.path.join(folder,cirtic_name)) 405 | print(f'Saved Critic NNs as {cirtic_name}') 406 | torch.save(self.actors, os.path.join(folder,actors_name)) 407 | print(f'Saved Actors List as {actors_name}') 408 | 409 | 410 | class SPPOAgents: 411 | def __init__(self, n_agents, dim_local_state, dim_local_action, discount, epochs, learning_rate_critic = 10**(-3), learning_rate_actor = 10**(-3), epsilon = 0.2) -> None: 412 | '''* dim_local_state is the total size of the localized vector that the input of the Q and pi approximations use, i.e (k+1)*dim''' 413 | 414 | self.n_agents = n_agents 415 | self.dim_local_state = dim_local_state 416 | self.dim_local_action = dim_local_action 417 | # self.policy_type = policy_type # What kind of policy (NN, stochastic normal dist, etc...) 418 | self.discount = discount 419 | self.epochs = epochs 420 | self.epsilon = epsilon 421 | 422 | # Define actor networks 423 | self.actorsNN = [NormalActorNN(dim_local_state, dim_action=dim_local_action) for i in range(n_agents)] 424 | self.learning_rate_actor = learning_rate_actor 425 | self.actor_optimizers = [optim.Adam(self.actorsNN[i].parameters(),lr = learning_rate_actor) for i in range(n_agents)] 426 | 427 | 428 | # List of NN that estimate Q (or V if we use advantage) 429 | # self.criticsNN = [CriticNN(dim_local_state + dim_local_action, output_size=1) for i in range(n_agents)] 430 | self.criticsNN = [CriticNN(dim_local_state, output_size=1) for i in range(n_agents)] 431 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)] 432 | 433 | def forward(self, z_states, N) -> list: 434 | ''' Function that calculates the actions to take from the z_states list (control law) 435 | actions: list of row vectors [u1^T, u2^T,...]''' 436 | 437 | actions = deque() 438 | for i in range(self.n_agents): 439 | z_state = z_states[i].flatten() 440 | actorNN = self.actorsNN[i] 441 | 442 | state_tensor = torch.tensor(z_state, dtype=torch.float32) 443 | mu_tensor,sigma_tensor = actorNN(state_tensor) 444 | 445 | # Normally distributed value with the mu and sigma (std^2) from ActorNN 446 | std = np.sqrt(sigma_tensor.detach().numpy()) 447 | action = np.random.normal(mu_tensor.detach().numpy(),std) 448 | 449 | # Acion must be between -1,1 450 | actions.append(np.clip(action,-1,1)) 451 | 452 | return actions 453 | 454 | def train(self, buffers: deque, actor_lr = None, return_grads = False): 455 | epochs = self.epochs 456 | 457 | if actor_lr is not None: 458 | self.learning_rate_actor = actor_lr 459 | 460 | T = len(buffers) 461 | # Gts = deque() # for debug, delete after -> acces data Gts[i][t] 462 | Git = np.zeros([self.n_agents, T]) # Git[i,t] 463 | Qjsum_estim = np.zeros([self.n_agents, T]) # Qjsum_estim[i,t] 464 | p_old_it = np.zeros([self.n_agents, T]) # p_old_it[i,t] 465 | 466 | 467 | # Agents LOOP, to create required 
variables 468 | for i in range(self.n_agents): 469 | # separate data from experience buffer 470 | buffer = buffers.buffers[i] 471 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 472 | 473 | # Calculate the simulated Q value (target), Monte carlo Gt 474 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T) 475 | # T = len(rewards) 476 | Gt_array = np.zeros([T]) 477 | Gt_array[-1] = rewards[-1] 478 | for t in range(T-2,-1,-1): 479 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t] 480 | 481 | Git[i,:] = Gt_array 482 | 483 | # Calculate the advantage estimator (attempt), and old p of a 484 | for i in range(self.n_agents): 485 | # separate data from experience buffer 486 | buffer = buffers.buffers[i] 487 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 488 | 489 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s] 490 | # inputs = torch.tensor(np.column_stack((states,actions)), dtype=torch.float32) 491 | states_tensor = torch.tensor(np.array(states), dtype=torch.float32) 492 | actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32) 493 | 494 | p_old_it[i,:] = self.probability_of_ai(states_tensor, actions_tensor, i).detach().numpy() 495 | Vi_baselines = self.criticsNN[i](states_tensor).squeeze().detach().numpy() 496 | # advantage_estim[i,:] = -Vi_baselines 497 | 498 | for t in range(T): 499 | Nit = buffers.buffers[i][t].Ni 500 | for j in Nit: # i included here, only uses local i 501 | Qjsum_estim[i,t] += Git[j,t] 502 | 503 | 504 | ### Training LOOP, per agent: ### 505 | for i in range(self.n_agents): 506 | buffer = buffers.buffers[i] 507 | states, actions, rewards, new_states, Ni, finished = zip(*buffer) 508 | Gt = torch.tensor(Git[i,:],dtype=torch.float32) 509 | states_tensor = torch.tensor(np.array(states), dtype=torch.float32) 510 | actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32) 511 | 512 | Vi_baselines = self.criticsNN[i](states_tensor).squeeze() 513 | Qjsum = torch.tensor(Qjsum_estim[i,:], dtype=torch.float32) 514 | Adv = Qjsum - Vi_baselines.detach() # advantage estimate; baseline detached so the actor loss does not backpropagate into the critic 515 | 516 | # criticNN = self.criticsNN[i] 517 | # critic_optimizer = self.critic_optimizers[i] 518 | # actorNN = self.actorsNN[i] 519 | # actor_optimizer = self.actor_optimizers[i] 520 | 521 | # Perform training for epochs 522 | for m in range(epochs): 523 | 524 | ### CRITIC update: 525 | # Set gradient to zero 526 | self.critic_optimizers[i].zero_grad() 527 | # value function: # calculate the approximated V(s) = NN(input) 528 | V_approx = self.criticsNN[i](states_tensor).squeeze() 529 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0 530 | loss = nn.functional.mse_loss(V_approx, Gt) 531 | # Compute gradient 532 | loss.backward() 533 | # Clip gradient norm to avoid infinite gradient 534 | nn.utils.clip_grad_norm_(self.criticsNN[i].parameters(), max_norm=10) 535 | # Update 536 | self.critic_optimizers[i].step() 537 | 538 | ### ACTOR update 539 | # Set gradient to zero 540 | self.actor_optimizers[i].zero_grad() 541 | pi_old = torch.tensor(p_old_it[i,:], dtype=torch.float32) 542 | # Compute new advantage with updated critic 543 | # Compute r_theta ratio of probabilities 544 | current_pi = self.probability_of_ai(states_tensor,actions_tensor, i) 545 | r_theta = current_pi/pi_old 546 | # Compute loss function - 1/N (min...) 
547 | left_min = r_theta * Adv 548 | right_min = torch.clamp(r_theta, 1 - self.epsilon, 1 + self.epsilon) * Adv #clipping method for tensors 549 | loss = - torch.mean(torch.min(left_min,right_min)) 550 | # compute gradient 551 | loss.backward() 552 | # Clip gradient norm to avoid infinite gradient 553 | nn.utils.clip_grad_norm_(self.actorsNN[i].parameters(), max_norm=10) 554 | # Update 555 | self.actor_optimizers[i].step() 556 | 557 | 558 | def probability_of_ai(self,states_i,actions_i, agent:int): 559 | """ all variables are tensors. the time vector for current agent=i 560 | assume that the two action components are not correlated, 561 | thus P of both happening is p1*p2 562 | """ 563 | i = agent 564 | mu , sigma = self.actorsNN[i](states_i) 565 | 566 | # p_tensor = (2*np.pi*sigma**2)**(-1/2) * torch.exp(-(actions-mu)**2/(2*sigma**2)) 567 | # p = p_tensor[:,0]*p_tensor[:,1] # assuming uncorrelated action 568 | 569 | p1 = torch.pow(2 * np.pi * sigma[:,0], -0.5) * torch.exp(-(actions_i[:,0] - mu[:,0])**2 / (2 * sigma[:,0])) 570 | p2 = torch.pow(2 * np.pi * sigma[:,1], -0.5) * torch.exp(-(actions_i[:,1] - mu[:,1])**2 / (2 * sigma[:,1])) 571 | p = p1*p2 572 | 573 | return p 574 | 575 | def save(self,filename = "network"): 576 | folder ="models" 577 | cirtic_name = filename + "-PP0critics.pth" 578 | actors_name = filename + "-PP0actors.pth" 579 | 580 | torch.save(self.criticsNN, os.path.join(folder,cirtic_name)) 581 | print(f'Saved Critic NNs as {cirtic_name}') 582 | torch.save(self.actorsNN, os.path.join(folder,actors_name)) 583 | print(f'Saved Actors List as {actors_name}') -------------------------------------------------------------------------------- /benchmark_agent.py: -------------------------------------------------------------------------------- 1 | from collections import deque, namedtuple 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib.ticker import PercentFormatter 5 | import drone_env 6 | from drone_env import running_average, plot_rewards, plot_grads 7 | from tqdm import tqdm, trange 8 | from SAC_agents import * 9 | from utils import ExperienceBuffers 10 | 11 | plt.style.use('seaborn-dark-palette') 12 | tex_fonts = { 13 | # Use LaTeX to write all text 14 | # "text.usetex": True, 15 | "font.family": "sans-serif", 16 | # Use 10pt font in plots, to match 10pt font in document 17 | "axes.labelsize": 10, 18 | "font.size": 10, 19 | # Make the legend/label fonts a little smaller 20 | "legend.fontsize": 10, 21 | "xtick.labelsize": 10, 22 | "ytick.labelsize": 10 23 | } 24 | plt.rcParams.update(tex_fonts) 25 | 26 | ### Set up parameters ### 27 | model_name = "final\\softmax8_n8" 28 | n_agents = 8 29 | deltas = np.ones(n_agents)*1 30 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True) 31 | env.collision_weight = 0.2 # old 0.2 32 | print(env) 33 | # env.show() 34 | 35 | N_Episodes = 1500 36 | episodes_to_plot = [0] 37 | 38 | # Initialize variables 39 | total_collisions_per_episode = deque() 40 | total_reward_per_episode = deque() 41 | total_true_reward_per_episode = deque() 42 | total_t = deque() 43 | mean_advantage = np.zeros([env.n_agents, N_Episodes]) 44 | 45 | # times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 46 | 47 | agents = TrainedAgent(critics_name=model_name+"-A2Ccritics.pth", actors_name=model_name+"-A2Cactors.pth", n_agents=env.n_agents) 48 | print("### Running Trained agent (no learning)") 49 | print(f"Episodes = {N_Episodes}, max Time iterations = 
{drone_env.max_time_steps} (T = {drone_env.max_time_steps * drone_env.dt}s, dt = {drone_env.dt}s)") 50 | print(f"N of agents = {env.n_agents}, collision weight b = {env.collision_weight}") 51 | 52 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True) 53 | for episode in EPISODES: 54 | 55 | if episode+1 in episodes_to_plot: 56 | # reward_history = np.zeros([len(times), env.n_agents]) 57 | trajectory = [env.state.copy()] 58 | z_trajectory = [env.z_states] 59 | total_episode_reward = 0 60 | total_true_episode_reward = 0 61 | total_episode_collisions = 0 62 | # env.show() 63 | 64 | buffers = ExperienceBuffers(env.n_agents) 65 | 66 | # SIMULATION OVER T 67 | t_iter = 0 68 | finished = False 69 | while not finished: 70 | 71 | state = env.state 72 | z_states = env.z_states 73 | Ni = env.Ni 74 | 75 | # calculate actions based on current state 76 | # actions = drone_env.gradient_control(state, env) 77 | # actions = drone_env.proportional_control(state, env) 78 | actions = agents.forward(z_states, Ni) 79 | 80 | # Update environment one time step with the actions 81 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions) 82 | # EXPERIECE: [z_state, action, reward, next_z, finished] 83 | buffers.append(z_states, actions, rewards, new_z, Ni, finished) 84 | 85 | total_episode_reward += np.mean(rewards) 86 | total_true_episode_reward += np.mean(true_rewards) 87 | total_episode_collisions += n_collisions 88 | 89 | if episode+1 in episodes_to_plot: 90 | # reward_history[t_iter,:] = reward 91 | trajectory.append(new_state.copy()) 92 | z_trajectory.append(new_z) 93 | 94 | t_iter +=1 95 | 96 | # END OF EPISODE 97 | # Append episode reward 98 | total_reward_per_episode.append(total_episode_reward) 99 | total_true_reward_per_episode.append(total_true_episode_reward) 100 | total_collisions_per_episode.append(total_episode_collisions) 101 | total_t.append(t_iter) 102 | 103 | # Test Critic values 104 | Q_simulated, V_approx = agents.benchmark_cirtic(buffers, only_one_NN=False) 105 | advantage = [np.mean(np.power(Q_simulated[i]-V_approx[i],1)) for i in range(env.n_agents)] 106 | mean_advantage[:,episode] = np.array([advantage]) 107 | 108 | # print(f"Episode collisions = {total_episode_collisions}") 109 | # env.animate(trajectory,frame_time=0.1) 110 | 111 | # RESET ENVIRONMENT 112 | env.reset(renew_obstacles=False) 113 | 114 | # Set progress bar description with information 115 | average_reward = running_average(total_reward_per_episode, 50)[-1] 116 | average_true_reward = running_average(total_true_reward_per_episode, 50)[-1] 117 | average_collisions = running_average(total_collisions_per_episode, 50)[-1] 118 | average_t = running_average(total_t, 50)[-1] 119 | EPISODES.set_description( 120 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter} - Average: {average_reward:.1f}/{average_collisions:.2f}/{average_t}. 
True r={average_true_reward:.1f}.") 121 | 122 | # Plot current trajectory 123 | 124 | if episode+1 in episodes_to_plot: 125 | env.plot(trajectory, episode) 126 | env.animate(trajectory, z_trajectory, deltas, episode, name=f"trained-E{episode+1}", format="mp4") 127 | times = np.arange(0, t_iter)*drone_env.dt 128 | plt.figure() 129 | for i in range(env.n_agents): 130 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 131 | plt.plot(times,Q_simulated[i], label=f"i={i}, simulated Q (Gt)", color = agent_color) 132 | plt.plot(times,V_approx[i],"--" , label=f"i={i}, approx V", color = tuple(0.9*x for x in agent_color)) 133 | plt.legend() 134 | plt.show() 135 | 136 | plot_rewards(total_reward_per_episode, total_true_reward_per_episode, total_collisions_per_episode, n_ep_running_average=50) 137 | 138 | plt.figure() 139 | for i in range(env.n_agents): 140 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 141 | plt.plot(range(N_Episodes),mean_advantage[i,:], label=f"i={i}", color = agent_color) 142 | plt.xlabel("Episodes") 143 | plt.ylabel("trajectory 1/T * [Q(s,a)-V(s)] = mean_T A(s,a)") 144 | plt.legend() 145 | plt.grid() 146 | plt.show() 147 | 148 | plt.figure() 149 | # plt.gca().set_size_inches(4.5, 3.5) 150 | counts, bins, bars = plt.hist(total_collisions_per_episode, range(min(total_collisions_per_episode), max(total_collisions_per_episode) + 1, 2), weights=np.ones(len(total_collisions_per_episode)) / len(total_collisions_per_episode)) 151 | print(f"Runs with 0 coll. = {counts[0]*100:.2f}%. 2 coll. = {counts[1]*100:.2f}%") 152 | plt.gca().yaxis.set_major_formatter(PercentFormatter(1)) 153 | plt.title("Collision performance") 154 | plt.xlabel("Number of collisions") 155 | plt.ylabel("Frequency of Simulations") 156 | plt.show() 157 | -------------------------------------------------------------------------------- /control_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import drone_env 4 | from drone_env import running_average, plot_rewards 5 | from tqdm import tqdm, trange 6 | 7 | env = drone_env.drones(n_agents=5, n_obstacles=2, grid=[5, 5], end_formation="O") 8 | print(env) 9 | # env.show() 10 | 11 | T_Episodes = 2 12 | # Simulate for T seconds (default dt = drone_env.dt = 0.01s) 13 | T = 5 14 | 15 | # Initialize variables 16 | total_collisions_list = [] 17 | total_reward_list = [] 18 | times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 19 | EPISODES = trange(T_Episodes, desc='Episode: ', leave=True) 20 | 21 | for episode in EPISODES: 22 | 23 | # reward_history = np.zeros([len(times), env.n_agents]) 24 | trajectory = [env.state.copy()] 25 | total_episode_reward = 0 26 | total_episode_collisions = 0 27 | # env.show() 28 | 29 | for t_iter, t in enumerate(times): 30 | # Simple gradient controller u_i = -grad_i, assuming Nj = V 31 | state = env.state 32 | 33 | # calculate actions based on current state 34 | # actions = drone_env.gradient_control(state, env) 35 | actions = drone_env.proportional_control(state, env) 36 | 37 | # Update environment one time step with the actions 38 | new_state, new_z, reward, n_collisions, finished = env.step(actions) 39 | 40 | total_episode_reward += np.sum(reward) 41 | total_episode_collisions += n_collisions 42 | 43 | # reward_history[t_iter,:] = reward 44 | trajectory.append(new_state.copy()) 45 | 46 | # Append episode reward 47 | total_reward_list.append(total_episode_reward) 48 | total_collisions_list.append(total_episode_collisions) 49 | 50 | 
# print(f"Episode collisions = {total_episode_collisions}") 51 | # env.animate(trajectory,frame_time=0.1) 52 | 53 | # RESET ENVIRONMENT 54 | env.reset(renew_obstacles=False) 55 | 56 | # Set progress bar description with information 57 | average_reward = running_average(total_reward_list, 50)[-1] 58 | average_collisions = running_average(total_collisions_list, 50)[-1] 59 | EPISODES.set_description( 60 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter+1} - Average: {average_reward:.1f}/{average_collisions:.2f}/{t_iter+1}") 61 | 62 | # Plot current trajectory 63 | env.plot(trajectory) 64 | 65 | plot_rewards(total_reward_list,total_collisions_list, n_ep_running_average=5) -------------------------------------------------------------------------------- /drone_env.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | import os 4 | import random 5 | import time 6 | from distutils.log import error 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import markers, animation 10 | import numpy as np 11 | from IPython import display 12 | 13 | ### TO DO ############ 14 | """ 15 | - Put back collision cost b = 16 | - Change back from not using Critic NN in training(),benchmark_critic()SACAgent (now trying Gt) 17 | - TrainedAgent for the actor loading 18 | - Add collisions to the animation. Add action arrows to animation? 19 | - (inactive now) Remove fixed random seed to initialize agents 20 | - 21 | - Plot of the Q function field for a fixed kth satates (varying xi) 22 | - Maybe clip the reward for near collision distances 23 | """ 24 | ###################### 25 | 26 | # Spatial dimension 27 | dim = 2 28 | # time step in seconds 29 | dt = 0.05 30 | max_time_steps = 200 31 | # Some colours 32 | LIGHT_RED = '#FFC4CC'; 33 | LIGHT_GREEN = '#95FD99'; 34 | BLACK = '#000000'; 35 | WHITE = '#FFFFFF'; 36 | LIGHT_PURPLE = '#E8D0FF'; 37 | LIGHT_ORANGE = '#FAE0C3'; 38 | YELLOW = "#FFFF00"; 39 | BLUE = '#98F5FF'; 40 | 41 | def num_to_rgb(val, max_val): 42 | if (val > max_val): 43 | raise ValueError("val must not be greater than max_val") 44 | if (val < 0 or max_val < 0): 45 | raise ValueError("arguments may not be negative") 46 | 47 | i = (val * 255 / max_val); 48 | r = round(math.sin(0.024 * i + 0) * 127 + 128)/255; 49 | g = round(math.sin(0.024 * i + 2) * 127 + 128)/255; 50 | b = round(math.sin(0.024 * i + 4) * 127 + 128)/255; 51 | return (r,g,b) 52 | 53 | class drones: 54 | 55 | def __init__(self, n_agents: int, n_obstacles: int, grid: list, end_formation: str, k_closest = 2, deltas: np.ndarray = None, simplify_zstate = False) -> None: 56 | """[summary] 57 | 58 | Args: 59 | i_agents (int): number of agents, integer 60 | n_obstacles (int): number of obstacles integer 61 | grid (list): list of the two axis limits delimiting the grid 62 | end_formation(str): what formation to reach e.g "O" is a circle 63 | k_closest(int): how many agents to account for in the Delta disk, for having a finite localized state (implementation) 64 | deltas(ndarray, column vector): vector of Deltas for each agent sensing radius 65 | """ 66 | self.n_agents = n_agents 67 | self.grid = grid 68 | self.goal = self.grid 69 | self.k_closest = k_closest 70 | self.simplify_zstate = simplify_zstate 71 | self.internal_t = 0 72 | self.collision_weight = 0.2 # Weight of collision cost per unit of time. 
-r = q|xi-xF|^2 + b log(d_i/d_ij) 73 | 74 | # Other geometry parameters 75 | self.drone_radius = np.ones(n_agents)*0.1 # radius of each drone in m 76 | 77 | # State space dynamics. For now, all agents have same A,B 78 | self.A = np.eye(dim) 79 | self.B = np.eye(dim)*dt 80 | 81 | # Initialize agents and obstacles, and check if delta is a correct value 82 | self.obstacles = self.create_obstacles(n_obstacles) 83 | self.end_points, self.d_safety = self.generate_formation(end_formation) 84 | 85 | if deltas is None: 86 | # In case of no deltas, we assume Dleta = d_safety, i.e no simplification (maximum deltas allowed) 87 | self.deltas = self.d_safety 88 | else: 89 | self.deltas = np.minimum(deltas, self.d_safety) 90 | if not np.all(deltas <= self.d_safety): 91 | print("Some deltas are greater than the final minimum distance between end positions. Using minimum distance between end positions for those cases instead.",f"deltas = {self.deltas}") 92 | 93 | self.state, self.z_states = self.init_agents(n_agents) 94 | 95 | # self.trajectory = [] 96 | # self.trajectory.append(self.state.copy()) 97 | 98 | def reset(self, renew_obstacles = True): 99 | self.state, self.z_states = self.init_agents(self.n_agents) 100 | self.internal_t = 0 101 | if renew_obstacles == True: 102 | self.obstacles = self.create_obstacles(self.n_obstacles) 103 | 104 | 105 | def __str__(self): 106 | print("Grid size: [x_lim, y_lim]\n",self.grid) 107 | print("State: [x, y, vx, vy, r]\n", self.state) 108 | print(f"z_sattes for k_closest = {self.k_closest}: simplify? {self.simplify_zstate}") 109 | print("safety distance for each agent:\n", self.d_safety) 110 | print("Deltas disk radius for each agent: \n", self.deltas) 111 | # print("Obstacles [x, y, r]:\n",self.obstacles) 112 | print(f"Collision cost weight (per unit of time) = {self.collision_weight} ") 113 | return "" 114 | 115 | def generate_formation(self,end_formation): 116 | """generate end point coordinates from a description of the end formation shape 117 | 118 | Args: 119 | end_formation (str): a label given to a shape of final formation, i.e "O" for circle 120 | 121 | Returns: 122 | formation: [xF1^T, xF2^T,...]^T. Column vector 123 | """ 124 | if end_formation == "O": 125 | size = min(self.grid)/2 126 | angle_step = 2*np.pi/self.n_agents 127 | formation = np.zeros([self.n_agents*dim,1]) 128 | 129 | for i in range(self.n_agents): 130 | formation[dim*i,0] = np.cos(i * angle_step)*0.9*self.grid[0]/2 + self.grid[0]/2 131 | formation[dim*i+1,0] = np.sin(i * angle_step)*0.9*self.grid[1]/2 + self.grid[1]/2 132 | 133 | else: 134 | error(str(end_formation) +" is Not a valid end formation identifier") 135 | 136 | # find maximum allowed d_safety (\hat{d}_i): di <= min(|| xFi -xFj || - li - lj) for all j!=i 137 | d_safety = np.zeros(self.n_agents) 138 | 139 | for i in range(self.n_agents): 140 | xFi = formation[dim*i:(i+1)*dim,0] 141 | li = self.drone_radius[i] 142 | d_i = np.infty 143 | for j in range(self.n_agents): 144 | if j != i: 145 | xFj = formation[dim*j:(j+1)*dim,0] 146 | lj = self.drone_radius[j] 147 | d_ij = np.linalg.norm(xFi-xFj) -li -lj 148 | d_i = min([d_i,d_ij]) 149 | 150 | d_safety[i] = d_i 151 | 152 | # formation: [xF1^T, xF2^T,...]^T. 
Column vector, d_safety (distance to closest end position from agent's end position) 153 | return formation, np.floor(d_safety*100)/100 154 | 155 | def create_obstacles(self,n_obstacles): 156 | self.n_obstacles = n_obstacles 157 | 158 | max_size = 0.1*np.max(self.grid) 159 | min_size = 0.05*max_size 160 | 161 | # generate random obstacles 162 | # Generate the obstacle coordinates and size 163 | # first column: x, second: y, third: r 164 | obstacles = np.random.rand(n_obstacles,dim+1) 165 | obstacles[:,0] = obstacles[:,0]*self.grid[0] 166 | obstacles[:,1] = obstacles[:,1]*self.grid[1] 167 | obstacles[:,dim] = obstacles[:,dim]*(max_size-min_size) + min_size 168 | 169 | return obstacles 170 | 171 | def init_agents(self,n_agents): 172 | # initialize state array: 173 | # column 1: x, col 2: y, col 3: vx, col 4 vy, col 5: l 174 | l = 0.1 # 10 cm 175 | self.n_agents = n_agents 176 | self.global_state_space = n_agents*(2*dim + 1) # x,y, vx,vy, radius 177 | 178 | grid = self.grid 179 | 180 | if self.simplify_zstate: 181 | # Only take into account position x,y (remove vx vy l) 182 | self.local_state_space = (dim)*(1+self.k_closest) 183 | else: 184 | self.local_state_space = (2*dim+1)*(1+self.k_closest) 185 | 186 | self.global_action_space = n_agents*dim # vx,vy (or ax,ay in the future) 187 | self.local_action_space = dim 188 | 189 | state = np.zeros([n_agents,5]) 190 | state[:,4] = l 191 | 192 | # Create a grid and choose random nodes without repetition 193 | delta_l = 2*1.1*l 194 | divisions = np.floor(np.array(grid)/delta_l) 195 | possible_coord = [] 196 | 197 | for idx in range(int(divisions[0])): 198 | for jdx in range(int(divisions[1])): 199 | coord = [idx*delta_l, jdx*delta_l] 200 | possible_coord.append(coord) 201 | 202 | # REMOVE THIS seed FOR true random initial state 203 | # random.seed(1) 204 | random_coord = np.array(random.sample(possible_coord, n_agents)) 205 | state[:,0:dim] = random_coord 206 | 207 | # Calculate localized states z (uing the reward funciton) 208 | _, _, z_states, Ni, _ = self.rewards(state, self.end_points, self.n_agents, self.d_safety, self.deltas) 209 | # Update the Ni graph 210 | self.Ni = Ni 211 | 212 | return state, z_states 213 | 214 | def step(self,actions): 215 | """_summary_ 216 | 217 | Args: 218 | actions (_type_):List of actions, each entry one agent [u1^T, u2^T, ...].Assuming each actions is a row vector with an entry for each dimension for each agent. 219 | 220 | Returns: 221 | state: new state after applying action 222 | z_states (list): localized states (from the new state) 223 | r_vec (row vector): vector for each localized reward for each agent. Average is total reward 224 | """ 225 | 226 | # Update state: s -> s' with the system dynamics 227 | for i in range(self.n_agents): 228 | 229 | Ai = self.A 230 | Bi = self.B 231 | 232 | xi = np.transpose(self.state[i,0:dim]) 233 | ui = actions[i] 234 | 235 | next_xi = np.matmul(Ai,xi) + np.matmul(Bi,ui) 236 | 237 | self.state[i,0:dim] = np.transpose(next_xi) 238 | self.state[i,dim:2*dim] = np.transpose(ui) 239 | 240 | 241 | # Calculate new individual reward [r1(s,a), r2,...] 
vector, plus related distance dependent values 242 | r_vec, n_collisions, z_states, Ni, true_r_vec = self.rewards(self.state, self.end_points, self.n_agents, self.d_safety, self.deltas) 243 | # Update the z and Ni graph 244 | self.z_states = z_states 245 | self.Ni = Ni 246 | 247 | # SHould return (s', r(s,a), n_collisions(s') ,finished) 248 | end_points = np.reshape(self.end_points,np.shape(self.state[:,0:dim])) 249 | error_from_end = np.linalg.norm(end_points-self.state[:,0:dim],axis = 1) 250 | 251 | if np.all(error_from_end <=0.2) or self.internal_t>= max_time_steps-1: 252 | finished = True 253 | else: 254 | finished = False 255 | 256 | self.internal_t += 1 257 | # TO DO: Proper is_finished 258 | return self.state, z_states, r_vec, n_collisions, finished, true_r_vec 259 | 260 | def rewards(self, state, end_points, n_agents, d_safety, deltas): 261 | ''' 262 | state: [column 1: x, col 2: y, col 3: vx, col 4 vy, col 5: r ] np.array[i,2*dim+1] 263 | end_points: column [x1, y1, x2, y2, ... ]^T 264 | d_safety: column d_i [d1, d2, ...]^T 265 | ''' 266 | n_agents = np.size(state,0) 267 | 268 | # weights: q|xi-xF|^2 + b log(d_i/d_ij). I multiply per dt as i assume is cost/time 269 | q = 2*dt 270 | b = self.collision_weight*dt 271 | 272 | xF = np.reshape(end_points,[n_agents,dim]) 273 | xi = state[:,0:dim] 274 | 275 | # row vector, q|xi-xF|^2 276 | to_goal_cost = q*np.power(np.linalg.norm(xF-xi,axis=1),2) 277 | 278 | # Collision cost 279 | 280 | d_ij, log_d, N_delta, collisions = self.distance_data(state,deltas,d_safety) 281 | 282 | collision_cost = b*np.sum(log_d*N_delta,1) 283 | real_collision_cost = b*np.sum(log_d,1) 284 | n_collisions = np.sum(collisions) 285 | 286 | # These are the approximated localized rewards 287 | reward_vector = -np.nan_to_num(to_goal_cost+collision_cost) 288 | true_reward_vector = -np.nan_to_num(to_goal_cost+real_collision_cost) 289 | 290 | # Calculate localized z states 291 | z_states, Ni = self.localized_states(state, end_points, N_delta, d_ij) 292 | 293 | return reward_vector, n_collisions, z_states, Ni, true_reward_vector 294 | 295 | def distance_data(self,state,deltas,d_safety): 296 | '''Return matrix of clipped distances matrix d[ij] 297 | Also returns normalized distance matrix 298 | Also returns collision binary graph 299 | Also Delta proximity neighbourhood 300 | 301 | deltas must be a column! 
302 | graph N includes i as neighbour 303 | ''' 304 | 305 | n_agents = np.size(state,0) 306 | d_ij = np.zeros([n_agents,n_agents]) 307 | d_ij_norm = np.zeros_like(d_ij) 308 | 309 | for i in range(n_agents): 310 | xi = state[i,0:dim] 311 | li = self.drone_radius[i] 312 | for j in range(n_agents): 313 | if j != i: 314 | xj = state[j,0:dim] 315 | lj = self.drone_radius[j] 316 | 317 | # Calculate agents relative distance 318 | d_ij[i,j] = min(np.linalg.norm(xi-xj) -li -lj, d_safety[i]) 319 | if d_ij[i,j] == 0: # Handle unlikely case of exactly 0, if not then the coming division would be error 320 | d_ij[i,j] = -10**-6 321 | d_ij_norm[i,j] = d_safety[i]/d_ij[i,j] 322 | else: 323 | d_ij[i,j] = min(-li -li, d_safety[i]) 324 | # Just to be safe, the distance to itself in the normalized case i make it =1, as log(1)=0 so it is neutral 325 | d_ij_norm[i,j] = 1 326 | 327 | collisions = d_ij_norm <= 0 328 | N_delta = d_ij <= deltas 329 | # Handling negative logarithms (only for d normalized, to use in logarithms) 330 | d_ij_norm[collisions] = 9.99E3 331 | log_d = np.log(d_ij_norm) 332 | log_d[collisions] = 9.99E3 333 | 334 | return d_ij, log_d, N_delta, collisions 335 | 336 | def localized_states(self, state, end_points, N_delta, d_ij): 337 | n_agents = np.size(state,0) 338 | sorted_idx = np.argsort(d_ij,1) 339 | k = self.k_closest 340 | 341 | z = [] 342 | Ni_list = [] 343 | 344 | for i in range(n_agents): 345 | # How many agents are in Delta range, minus itself 346 | in_range = np.sum(N_delta[i,:])-1 347 | sorted_agents = sorted_idx[i,:] 348 | Ni = [i] 349 | 350 | xi = state[i,0:dim] 351 | xFi = end_points[i*dim:(i+1)*dim] 352 | 353 | # Adding zii as the first row 354 | Zi = np.zeros([k+1,2*dim+1]) 355 | Zi[0,:] = state[i,:].copy() 356 | # print(Zi,xFi.flaten() ,xi) 357 | Zi[0,0:dim] = -(xFi.flatten() - xi) 358 | 359 | for kth in range(1,k+1): 360 | # kth = 1,2,...k 361 | 362 | if kth <= in_range: 363 | # There exist a kth neighbour inside Delta 364 | j = sorted_agents[kth] 365 | Ni.append(j) 366 | xj = state[j,0:dim].copy() 367 | zj = state[j,:].copy() 368 | zj[0:dim] = xj-xi 369 | # zj[0:dim] = xj-xFi.flatten() 370 | # print(f"{kth}th closest agent is {j}, coord {xj}, rel coord {xj-xi}") 371 | 372 | else: 373 | # There is no neigbhour, thus using a null state (or state that should almost not add value) 374 | # I try for now to just add the next state in order, as if to just add the two closest even if outside the Delta range 375 | # Hopping that the NN learns that agents outside delta do not contribute to Q 376 | # Probably, the proper thing would be to project this next closest to the Delta boundary 377 | # j = sorted_agents[kth] 378 | # xj = state[j,0:dim].copy() 379 | # zj = state[j,:].copy() 380 | # zj[0:dim] = xj-xi 381 | 382 | # Create a "ghost" agent that is just behind agent, at a distance 1.1*Delta in the direction to the goal 383 | j = sorted_agents[kth] 384 | zi = Zi[0,0:dim] 385 | zj = state[j,:].copy() 386 | zj[0:dim] = zi/np.linalg.norm(zi) * self.deltas[i]*1.1 387 | 388 | Zi[kth,:] = zj 389 | 390 | if self.simplify_zstate: 391 | # Remove parts of the satte that overcomplicate: 392 | # No (vx,vy,l) 393 | z.append(Zi[:,0:dim]) 394 | else: 395 | z.append(Zi) 396 | 397 | Ni_list.append(Ni) 398 | 399 | # z is a list of arrays. 
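        # Layout of each Zi (summarised): row 0 is x_i - x_F,i (position relative
        # to the agent's own goal); rows 1..k are x_j - x_i for the k closest
        # agents inside Delta, padded with "ghost" entries placed at distance
        # 1.1*Delta_i behind the agent (along x_i - x_F,i) whenever fewer than k
        # neighbours are in range.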
Each array is the localized Delta state for each agent 400 | # Add: Ni_list = list of neighbours for each agent 401 | return z, Ni_list 402 | 403 | # %% Plotting methods 404 | def show(self, state = None, not_animate = True): 405 | 406 | if not_animate: 407 | state = self.state 408 | 409 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot 410 | # (or if you have an existing figure) 411 | # fig = plt.gcf() 412 | # ax = fig.gca() 413 | 414 | ax.set_xlim((0, self.grid[0])); 415 | ax.set_ylim((0, self.grid[1])); 416 | ax.grid(True); 417 | 418 | for i in range(self.n_obstacles): 419 | circle = plt.Circle((self.obstacles[i,0], self.obstacles[i,1]), self.obstacles[i,2], color='black'); 420 | ax.add_patch(circle); 421 | 422 | for i in range(self.n_agents): 423 | agent_color = num_to_rgb(i,self.n_agents-1) 424 | circle = plt.Circle((state[i,0], state[i,1]), state[i,4], color=agent_color, fill=False, label=f"{i+1}"); 425 | ax.add_patch(circle); 426 | # end point in plot 427 | ax.plot(self.end_points[i*dim,0],self.end_points[i*dim+1,0],color=agent_color,marker = "*"); 428 | 429 | ax.legend() 430 | 431 | if not_animate: 432 | plt.show() 433 | else: 434 | return fig; 435 | 436 | def animate_basic(self, trajectory,frame_time = 0.2, frames = 20): 437 | 438 | good_frame = 0 439 | each_frame = len(trajectory)/frames 440 | for n_frame,state in enumerate(trajectory): 441 | if each_frame > 1 and round(good_frame) == n_frame: 442 | good_frame+=each_frame 443 | else: 444 | continue 445 | fig = self.show(state, not_animate=False); 446 | display.display(fig); 447 | display.clear_output(wait=True) 448 | time.sleep(frame_time) 449 | 450 | def plot(self, trajectory, episode = None): 451 | 452 | # Create trajectory matrices -> time, [x(t=0), x(t+1), x(t+2)], | agent 453 | times = len(trajectory) 454 | x_cord = np.zeros([self.n_agents, times]) 455 | y_cord = np.zeros([self.n_agents, times]) 456 | collision_table = np.full([self.n_agents, times], False) 457 | 458 | for t,state in enumerate(trajectory): 459 | x_cord[:,t] = state[:,0] 460 | y_cord[:,t] = state[:,1] 461 | 462 | # Collision calc 463 | for i in range(self.n_agents): 464 | xi = state[i,0:dim] 465 | li = state[i,2*dim] 466 | for j in range(self.n_agents): 467 | if j != i: 468 | xj = state[j,0:dim] 469 | lj = state[j,2*dim] 470 | 471 | dij = np.linalg.norm(xi-xj) -li -lj 472 | if dij<=0: 473 | collision_table[i,t] = True 474 | 475 | collisions = np.sum(collision_table) 476 | 477 | # Plot obstacles and final state of agents 478 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot 479 | # (or if you have an existing figure) 480 | # fig = plt.gcf() 481 | # ax = fig.gca() 482 | fig.set_size_inches(4.5, 3.5) 483 | fig.tight_layout() 484 | 485 | # ax.set_xlim((0, self.grid[0])); 486 | # ax.set_ylim((0, self.grid[1])); 487 | ax.grid(True); 488 | 489 | # Plot obstacles 490 | for i in range(self.n_obstacles): 491 | circle = plt.Circle((self.obstacles[i,0], self.obstacles[i,1]), self.obstacles[i,2], color='black'); 492 | ax.add_patch(circle); 493 | 494 | # Plot agents and collisions 495 | for i in range(self.n_agents): 496 | agent_color = num_to_rgb(i,self.n_agents-1) 497 | circle = plt.Circle((state[i,0], state[i,1]), state[i,4], color=agent_color, fill=False, label=f"{i+1}"); 498 | ax.add_patch(circle); 499 | 500 | ax.plot(x_cord[i,:],y_cord[i,:],color=agent_color); 501 | ax.plot(self.end_points[i*dim,0],self.end_points[i*dim+1,0],color=agent_color,marker = "*"); 502 | 503 | collisions_xcord = 
x_cord[i,collision_table[i,:]] 504 | collisions_ycord = y_cord[i,collision_table[i,:]] 505 | total_markers = len(collisions_xcord) 506 | # problem when it is 0, markevery=np.floor(total_markers/2)) 507 | ax.plot(collisions_xcord,collisions_ycord,color=agent_color, marker = "v",fillstyle = "none", markevery=2); 508 | 509 | if episode is None: 510 | ax.set_title(f"{self.n_agents} agents, collisions = {collisions}") 511 | else: 512 | ax.set_title(f"Episode {episode+1} , {self.n_agents} agents, collisions = {collisions}") 513 | ax.legend(title="Agents") 514 | plt.show() 515 | 516 | def animate(self, trajectory, z_trajectory , deltas, episode, name = "test", format ="gif"): 517 | 518 | if format == "mp4": 519 | # plt.rcParams['animation.ffmpeg_path'] ='D:\\Programes portables\\ffmpeg\\bin\\ffmpeg.exe' 520 | plt.rcParams['animation.ffmpeg_path'] ='C:\\Users\\Andreu\\OneDrive - KTH\\programes\\ffmpeg\\bin\\ffmpeg.exe' 521 | 522 | 523 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot 524 | ax.set_xlim((-1, self.grid[0]+1)); 525 | ax.set_ylim((-1, self.grid[1]+1)); 526 | # ax.grid(True) 527 | ax.set_title(f"Delta = {deltas[0]}"); 528 | circles = [] 529 | d_circles = [] 530 | arrows = [] 531 | 532 | states = trajectory[0] 533 | z_states = z_trajectory[0] 534 | for i in range(self.n_agents): 535 | xi = states[i,0:dim] 536 | xFi = self.end_points[i*dim:(i+1)*dim,0].flatten() 537 | agent_color = num_to_rgb(i,self.n_agents-1) 538 | ax.plot(xFi[0],xFi[1],color=agent_color,marker = "*"); 539 | circle = plt.Circle((states[i,0], states[i,1]), states[i,4], color=agent_color, fill=False, label=f"{i+1}"); 540 | circles.append(ax.add_patch(circle)) 541 | 542 | delta_circle = plt.Circle((states[i,0], states[i,1]), states[i,4] + deltas[i], color="red", fill=False, ls = "--", alpha = 0.5); 543 | d_circles.append(ax.add_patch(delta_circle)) 544 | 545 | z_size = np.size(z_states[i],0) 546 | z_state = z_states[i] 547 | arrows_i = [] 548 | for k in range(z_size): 549 | if k == 0: 550 | star = xFi[0:dim] 551 | fini = xFi[0:dim] + z_state[k,0:dim] 552 | coords = np.array([star,fini]) 553 | arrows_i.append(ax.plot(coords[:,0], coords[:,1] , color = agent_color, lw = 0.5, alpha = 0.3)) 554 | else: 555 | star = xi[0:dim] 556 | fini = xi[0:dim] + z_state[k,0:dim] 557 | coords = np.array([star,fini]) 558 | arrows_i.append(ax.plot(coords[:,0], coords[:,1] , color = agent_color, lw = 0.5, alpha = 0.6)) 559 | arrows.append(arrows_i) 560 | 561 | plt.legend(loc = "upper right") 562 | 563 | def update_objects(t:int): 564 | states = trajectory[t] 565 | z_states = z_trajectory[t] 566 | ax.set_title(f"Episode {episode+1} .Deltas = {deltas[0]}. 
Time = {t*dt:.1f}s") 567 | 568 | for i in range(self.n_agents): 569 | xi = states[i,0:dim] 570 | xFi = self.end_points[i*dim:(i+1)*dim,0].flatten() 571 | agent_color = num_to_rgb(i,self.n_agents-1) 572 | 573 | z_size = np.size(z_states[i],0) 574 | z_state = z_states[i] 575 | for k in range(z_size): 576 | if k == 0: 577 | star = xFi[0:dim] 578 | fini = xFi[0:dim] + z_state[k,0:dim] 579 | coords = np.array([star,fini]) 580 | arrows[i][k][0].set_data(coords[:,0], coords[:,1]) 581 | pass 582 | else: 583 | star = xi[0:dim] 584 | fini = xi[0:dim] + z_state[k,0:dim] 585 | coords = np.array([star,fini]) 586 | arrows[i][k][0].set_data(coords[:,0], coords[:,1]) 587 | pass 588 | circles[i].center = states[i,0], states[i,1] 589 | d_circles[i].center = states[i,0], states[i,1] 590 | 591 | return circles, d_circles, arrows 592 | 593 | print("\nSaving animation...") 594 | anim = animation.FuncAnimation(fig, update_objects, len(trajectory), interval=dt) 595 | 596 | if format == "gif": 597 | writergif = animation.PillowWriter(fps=30) 598 | full_name = os.path.join("videos", name + ".gif") 599 | anim.save(full_name, writer=writergif) 600 | elif format == "mp4": 601 | FFwriter = animation.FFMpegWriter(fps=30) 602 | full_name = os.path.join("videos", name + ".mp4") 603 | anim.save(full_name, writer = FFwriter) 604 | else: 605 | print(f"format{format} not valid") 606 | 607 | print(f"Animation saved as {full_name}") 608 | 609 | 610 | 611 | # other control functions 612 | def gradient_control(state,env, u_max = 1): 613 | """Given a global state, calculates the direction of gradient of the cost (logarithm barrier) 614 | for each agent separately, taking into account all neighbours 615 | 616 | Args: 617 | state (np.array): each row is an agent state (x,y, vx,vy, l) 618 | env (_type_): drone environment 619 | 620 | Returns: 621 | actions: list of row vectors [u1^T, u2^T,...] 
622 | """ 623 | 624 | # maximum value of control component 625 | # u_max m/s 626 | b = 0.1 627 | q = 1 628 | actions = [] 629 | 630 | for i in range(env.n_agents): 631 | xi = np.transpose(state[i,0:dim]) 632 | ri = state[i,4] 633 | xF = env.end_points[i*dim:(i+1)*dim,0] 634 | di_hat = env.d_safety[i] 635 | 636 | term1 = 2*(xi-xF) 637 | 638 | Ni = [j for j in range(env.n_agents) if j != i] # Complete graph (except itself), global knowledge 639 | # print(Ni) 640 | term2 = np.zeros_like(xi) 641 | for j in Ni: 642 | xj = np.transpose(state[j,0:2]) 643 | rj = state[j,4] 644 | dij = np.linalg.norm(xi-xj)-ri-rj 645 | 646 | if dij <= di_hat: 647 | term2+= (xi-xj) / (dij*np.linalg.norm(xi-xj)) 648 | 649 | grad = q*term1 - b*term2 650 | ui = np.clip(-grad , -u_max,u_max) 651 | actions.append(ui) 652 | 653 | return actions 654 | 655 | def proportional_control(state,env): 656 | 657 | # maximum value of control component 658 | u_max = 1 # m/s 659 | k_gain = 1 660 | actions = [] 661 | 662 | for i in range(env.n_agents): 663 | xi = np.transpose(state[i,0:dim]) 664 | ri = state[i,4] 665 | xF = env.end_points[i*dim:(i+1)*dim,0] 666 | # di_hat = env.d_safety[i] 667 | 668 | # zi = xF_i - xi (vector to goal) 669 | zi = xF-xi 670 | ui = k_gain * zi 671 | 672 | ui_norm = np.linalg.norm(ui) 673 | if ui_norm > u_max: 674 | # Cap the control norm to u_max 675 | ui = ui/ui_norm*u_max 676 | 677 | actions.append(ui) 678 | 679 | return actions 680 | 681 | # Plotting functions 682 | def running_average(x, N = 50): 683 | ''' Function used to compute the running average 684 | of the last N elements of a vector x 685 | ''' 686 | if len(x) >= N: 687 | y = np.copy(x) 688 | y[N-1:] = np.convolve(x, np.ones((N, )) / N, mode='valid') 689 | else: 690 | y = np.zeros_like(x) 691 | return y 692 | 693 | # Plot Rewards and steps 694 | def plot_rewards(episode_reward_list, episode_true_reward_list, collision_list, n_ep_running_average=50): 695 | episodes = [i for i in range(1, len(episode_reward_list)+1)] 696 | # Plot Rewards and steps 697 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4.5)) 698 | # ax[0].plot(episodes, episode_reward_list, label='Approx. reward', color = "orange", alpha = 0.5) 699 | ax[0].plot(episodes, episode_reward_list, label='Global reward', color = "cyan", alpha = 0.5) 700 | # ax[0].plot(episodes, running_average(episode_reward_list, n_ep_running_average), label='Avg. approx. reward', color="red") 701 | ax[0].plot(episodes, running_average(episode_true_reward_list, n_ep_running_average), label='Avg. global reward', color="blue") 702 | 703 | ax[0].set_xlabel('Episodes') 704 | ax[0].set_ylabel('Total reward') 705 | ax[0].set_title('Total Reward vs Episodes') 706 | ax[0].legend() 707 | ax[0].grid(alpha=0.3) 708 | 709 | ax[1].plot(episodes,collision_list, label='Collisions per episode', color="cyan", alpha = 0.5) 710 | ax[1].plot(episodes, running_average(collision_list, n_ep_running_average), label='Avg. 
number of collisions per episode',color="blue") 711 | ax[1].set_xlabel('Episodes') 712 | ax[1].set_ylabel('Total number of collisions') 713 | ax[1].set_title('Total number of collisions vs Episodes') 714 | ax[1].legend() 715 | ax[1].grid(alpha=0.3) 716 | plt.show() 717 | 718 | def plot_grads(grad_per_episode:np.ndarray, gi_per_episode:np.ndarray): 719 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9)) 720 | 721 | n_agents = np.size(grad_per_episode,1) 722 | episode_variable = [e for e in range(1, len(grad_per_episode)+1)] 723 | 724 | for i in range(n_agents): 725 | agent_color = num_to_rgb(i,n_agents-1) 726 | ax[0].plot(episode_variable, grad_per_episode[:,i], label=f"Agent {i+1}", color = agent_color) 727 | ax[0].set_xlabel('Episodes') 728 | ax[0].set_ylabel('Score function gradient') 729 | # ax[0].set_title('Total Reward vs Episodes') 730 | ax[0].legend() 731 | ax[0].grid(alpha=0.3) 732 | 733 | for i in range(n_agents): 734 | agent_color = num_to_rgb(i,n_agents-1) 735 | ax[1].plot(episode_variable, gi_per_episode[:,i], label=f"Agent {i+1}", color = agent_color) 736 | ax[1].set_xlabel('Episodes') 737 | ax[1].set_ylabel('Approximated gi gradient (max norm = 100)') 738 | # ax[1].set_title('Total number of collisions vs Episodes') 739 | ax[1].legend() 740 | ax[1].grid(alpha=0.3) 741 | plt.show() 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | -------------------------------------------------------------------------------- /images/E1000_n10_DiscretePolicy4_b02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/E1000_n10_DiscretePolicy4_b02.png -------------------------------------------------------------------------------- /images/E1500_n5_DiscretePolicy8_b02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/E1500_n5_DiscretePolicy8_b02.png -------------------------------------------------------------------------------- /images/collisions_hist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/collisions_hist.pdf -------------------------------------------------------------------------------- /images/delta_effect.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/delta_effect.pdf -------------------------------------------------------------------------------- /learning_Q_test.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import deque, namedtuple 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import drone_env 6 | from drone_env import running_average, plot_rewards 7 | from tqdm import tqdm, trange 8 | from SAC_agents import SACAgents, ExperienceBuffers 9 | 10 | ### Set up parameters ### 11 | n_agents = 5 12 | deltas = np.ones(n_agents)*1.5 13 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True) 14 | print(env) 15 | # env.show() 16 | 17 | N_Episodes = 2 18 
| 19 | T = 5 # Simulate for T seconds (default dt = drone_env.dt = 0.01s) t_iter t=500 20 | discount_factor = 0.99 21 | alpha_critic = 10**-3 22 | alpha_actor = 10**-2 23 | M = 30 # Epochs, i.e steps of the SDG for the critic NN 24 | dim_z = env.local_state_space # Dimension of the localized z_state space 25 | dim_a = env.local_action_space # Dimension of the local action space 26 | 27 | ### 28 | 29 | # Initialize variables 30 | total_collisions_list = [] 31 | total_reward_list = [] 32 | Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done']) 33 | 34 | times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 35 | 36 | 37 | agents = SACAgents(n_agents=env.n_agents, dim_local_state = dim_z, dim_local_action=dim_a, discount=discount_factor, epochs=M, learning_rate_critic=alpha_critic, learning_rate_actor=alpha_critic) 38 | print("### Running Scalable-Actor-Critic with params: ###") 39 | print(f"Episodes = {N_Episodes}, Time iterations = {len(times)} (T = {T}s, dt = {drone_env.dt}s)") 40 | print(f"N of agents = {env.n_agents}, structure of critic NN = {agents.criticsNN[0].input_size}x{agents.criticsNN[0].L1}x{agents.criticsNN[0].L2}x{agents.criticsNN[0].output_size}") 41 | print(f"Discount = {discount_factor}, lr for NN critical = {alpha_critic}, lr for actor = {alpha_actor}, epochs M = {M}") 42 | 43 | 44 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True) 45 | for episode in EPISODES: 46 | 47 | # reward_history = np.zeros([len(times), env.n_agents]) 48 | trajectory = [env.state.copy()] 49 | total_episode_reward = 0 50 | total_episode_collisions = 0 51 | # env.show() 52 | 53 | buffers = ExperienceBuffers(env.n_agents) 54 | # SIMULATION OVER T 55 | for t_iter, time in enumerate(times): 56 | # Simple gradient controller u_i = -grad_i, assuming Nj = V 57 | state = env.state 58 | z_states = env.z_states 59 | Ni = env.Ni 60 | 61 | # calculate actions based on current state 62 | actions = drone_env.gradient_control(state, env) 63 | # actions = drone_env.proportional_control(state, env) 64 | # actions = agents.forward(z_states) 65 | 66 | # Update environment one time step with the actions 67 | new_state, new_z, rewards, n_collisions, finished = env.step(actions) 68 | # EXPERIECE: [z_state, action, reward, next_z, finished] 69 | buffers.append(z_states, actions, rewards,new_z, Ni, finished) 70 | 71 | total_episode_reward += np.mean(rewards) 72 | total_episode_collisions += n_collisions 73 | 74 | # reward_history[t_iter,:] = reward 75 | trajectory.append(new_state.copy()) 76 | 77 | # END OF EPISODE 78 | # Append episode reward 79 | total_reward_list.append(total_episode_reward) 80 | total_collisions_list.append(total_episode_collisions) 81 | 82 | # Train of critic with the data of the episode 83 | agents.train(buffers) 84 | Q_simulated, Q_approx = agents.benchmark_cirtic(buffers, only_one_NN=False) 85 | 86 | # print(f"Episode collisions = {total_episode_collisions}") 87 | # env.animate(trajectory,frame_time=0.1) 88 | 89 | # RESET ENVIRONMENT 90 | env.reset(renew_obstacles=False) 91 | 92 | # Set progress bar description with information 93 | average_reward = running_average(total_reward_list, 50)[-1] 94 | average_collisions = running_average(total_collisions_list, 50)[-1] 95 | EPISODES.set_description( 96 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter+1} - Average: {average_reward:.1f}/{average_collisions:.2f}/{t_iter+1}") 97 | 98 | # Plot current trajectory 99 | 100 | if episode >= 
N_Episodes-2: 101 | env.plot(trajectory) 102 | 103 | plt.figure() 104 | for i in range(env.n_agents): 105 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 106 | plt.plot(times,Q_approx[i], label=f"i={i}, approx Q") 107 | plt.plot(times,Q_simulated[i], "--", label=f"i={i}, simulated Q") 108 | plt.legend() 109 | plt.show() 110 | 111 | agents.save(filename="Q_test") 112 | 113 | plot_rewards(total_reward_list,total_collisions_list, n_ep_running_average=5) -------------------------------------------------------------------------------- /matlab/cost_field.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | close all; 3 | 4 | global ri rj a xB 5 | 6 | xB = [5 5]'; 7 | % xi = [1 1]'; 8 | xj = [0.5 0.5 3; 9 | 3 1 1]; 10 | ri = 0.1; 11 | rj = 0.1; 12 | a = 5; 13 | 14 | [X Y] = meshgrid(linspace(0,5,100)); 15 | cost_fun = 0*X; 16 | gradU_fun = cost_fun; 17 | gradV_fun = cost_fun; 18 | 19 | for i=1:size(X,2) 20 | for j=1:size(X,1) 21 | xi = [X(i,j) Y(i,j)]'; 22 | cost_fun(i,j) = cost(xi,xj); 23 | g = -grad(xi,xj); 24 | gradU_fun(i,j) = g(1); 25 | gradV_fun(i,j) = g(2); 26 | end 27 | end 28 | 29 | 30 | %% plots 31 | figure; 32 | surf(X,Y,cost_fun,'EdgeAlpha',0) 33 | hold on 34 | plot(xB(1),xB(2),'or','LineWidth',2) 35 | for j=1:size(xj,2) 36 | plot(xj(1,j),xj(2,j),'ob','LineWidth',2) 37 | end 38 | % plot(xi(1),xi(2),'ok','LineWidth',2) 39 | % xlim([0 6]); ylim([0 6]); 40 | % zlim([-inf,cost_fun(1,1)*1.1]) 41 | grid on; 42 | 43 | figure; 44 | % quiver(X,Y,gradU_fun,gradV_fun) 45 | streamslice(X,Y,gradU_fun,gradV_fun) 46 | 47 | %% Fnctions 48 | 49 | function g = grad(xi,Xj) 50 | global ri rj a xB 51 | 52 | term1 = 2*(xi-xB); 53 | term2 = Xj*0; 54 | 55 | for j=1:size(Xj,2) 56 | xj = Xj(:,j); 57 | 58 | term2(:,j) = 1/(norm(xi-xj)-ri-rj) * (xi-xj)/norm(xi-xj); 59 | end 60 | term2 = -a*sum(term2,2); 61 | g = term1+term2; 62 | g = term2; 63 | 64 | % g = g/norm(g); 65 | end 66 | 67 | function c = cost(xi,Xj) 68 | global ri rj a xB 69 | dij = vecnorm(xi-Xj)-ri-rj; 70 | dij(dij<0)=0; 71 | c = norm(xi-xB)^2 - a*sum(log(dij)); 72 | 73 | end 74 | -------------------------------------------------------------------------------- /matlab/derivations_22ndPol.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | rng(1) 4 | 5 | z_dim = 6; 6 | dim = 2; 7 | z = sym('z%d%d', [z_dim 1],'real'); 8 | z1 = z(1:2); z2 = z(3:4); z3 = z(5:6); 9 | zValues = rand(size(z))*5; 10 | a = sym('a%d%d', [2 1],'real'); 11 | aValues = rand(size(a)); 12 | theta = sym('theta%d%d', [1 z_dim/dim],'real'); 13 | thetaValues = rand(size(theta)); 14 | 15 | R = sym('R%d%d', [dim z_dim],'real'); 16 | % R1div = Rderiv(theta(1)); R2div = Rderiv(theta(2)); R3div = Rderiv(theta(3)); 17 | % Rdiv = [R1div R2div R3div]; 18 | Sigma = sym('Sigma%d%d', [2 2],'real'); 19 | Sigma(2,1) = Sigma(1,2); 20 | % Sigma = inv(Sigma); 21 | SigmaValues = [0.1, 0.01;0.01,0.15]; -------------------------------------------------------------------------------- /matlab/derivations_2ndPol.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | rng(1) 4 | 5 | z_dim = 6; 6 | dim = 2; 7 | z = sym('z%d%d', [z_dim 1],'real'); 8 | z1 = z(1:2); z2 = z(3:4); z3 = z(5:6); 9 | zValues = rand(size(z))*5; 10 | a = sym('a%d%d', [2 1],'real'); 11 | aValues = rand(size(a)); 12 | theta = sym('theta%d%d', [1 z_dim/dim],'real'); 13 | thetaValues = rand(size(theta)); 14 | R2d = @(angle) [cos(angle) -sin(angle); sin(angle) cos(angle)]; 15 | 
Rderiv = @(angle) [-sin(angle) -cos(angle); cos(angle) -sin(angle)]; 16 | R1 = R2d(theta(1)); R2 = R2d(theta(2)); R3 = R2d(theta(3)); 17 | R = [R1 R2 R3]; 18 | R1div = Rderiv(theta(1)); R2div = Rderiv(theta(2)); R3div = Rderiv(theta(3)); 19 | Rdiv = [R1div R2div R3div]; 20 | Sigma = sym('Sigma1%d%d', [2 2],'real'); 21 | Sigma(2,1) = Sigma(1,2); 22 | Sigma1 = inv(Sigma); 23 | SigmaValues = [0.1, 0.01;0.01,0.15]; 24 | 25 | val = @(eqq) double(subs(eqq,[z(:)',theta(:)',a(:)',Sigma(:)'],[zValues(:)',thetaValues(:)',aValues(:)',SigmaValues(:)'])); 26 | 27 | d = 2; 28 | original = log(1/((2*pi)^(d/2)*sqrt(det(Sigma1)))*exp(-1/2*(a-R*z)'*(Sigma1)*(a-R*z))); 29 | 30 | eqq1 = -1/2 * (a-R*z)' * (Sigma1) * (a-R*z); 31 | eqq2 =-1/2*z'*R'*(Sigma1)*R*z + a'*(Sigma1)*R*z - 1/2*a'*(Sigma1)*a; 32 | I = -1/2*z'*R'*(Sigma1)*R*z; 33 | II = a'*(Sigma1)*R*z; 34 | 35 | % for m=1:z_dim 36 | % dum = theta(:,m); 37 | % eval(['theta' num2str(m) ' = dum;']); 38 | % end 39 | 40 | Vgra = @(scalar) arrayfun(@(s,v) val(diff(s,v)),scalar*ones(size(theta)),theta); 41 | gra = @(scalar) arrayfun(@(s,v) diff(s,v),scalar*ones(size(theta)),theta); 42 | 43 | 44 | % 45 | grad1 = Vgra(eqq1); 46 | grad2 = Vgra(eqq2); 47 | 48 | % IIgrad = [val(a'*Sigma1*R1div*z1), val(a'*Sigma1*R2div*z2), val(a'*Sigma1*R3div*z3); Vgra(II)] 49 | % Igrad = val(-(Sigma1)*theta*z*z'); 50 | % total_grad = val((Sigma1)*(a-theta*z)*z') 51 | grad1 52 | 53 | 54 | %% loops 55 | % tot = 0; 56 | % l = 2; 57 | % zz = rand(l,1); 58 | % RR = rand(l,l); 59 | % SS = rand(l,l); 60 | % 61 | % for i=1:l 62 | % for j =1:l 63 | % tot = tot + z1(i)*R1(i,j)*Sigma1(i,j)*R1(j,i)*z1(j); 64 | % end 65 | % end 66 | % tot 67 | % 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /matlab/derivations_check.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | rng(1) 4 | 5 | z_dim = 6; 6 | z = sym('z%d%d', [z_dim 1],'real'); 7 | % zValues = rand(size(z))*5; 8 | zValues = [1,2,3,4,5,6]'; 9 | a = sym('a%d%d', [2 1],'real'); 10 | % aValues = rand(size(a)); 11 | aValues = [0.5,1]'; 12 | theta = sym('theta%d%d', [2 z_dim],'real'); 13 | % thetaValues = rand(size(theta)); 14 | % thetaValues(:,5:6)=0; 15 | thetaValues = [1,0.5,1,0.5,1,0.5;2,0.6,2,0.6,2,0.6]; 16 | Sigma = sym('Sigma%d%d', [2 2],'real'); 17 | SigmaValues = [1,0.01;0.01,1.5]; 18 | val = @(eqq) double(subs(eqq,[z(:)',theta(:)',a(:)',Sigma(:)'],[zValues(:)',thetaValues(:)',aValues(:)',SigmaValues(:)'])); 19 | 20 | d = 2; 21 | original = log(1/((2*pi)^(d/2)*sqrt(det(Sigma)))*exp(-1/2*(a-theta*z)'*inv(Sigma)*(a-theta*z))); 22 | 23 | eqq1 = -1/2 * (a-theta*z)' * inv(Sigma) * (a-theta*z); 24 | eqq2 =-1/2*z'*theta'*inv(Sigma)*theta*z + a'*inv(Sigma)*theta*z - 1/2*a'*inv(Sigma)*a; 25 | I = -1/2*z'*theta'*inv(Sigma)*theta*z; 26 | II = a'*inv(Sigma)*theta*z; 27 | 28 | % for m=1:z_dim 29 | % dum = theta(:,m); 30 | % eval(['theta' num2str(m) ' = dum;']); 31 | % end 32 | 33 | gra = @(scalar) arrayfun(@(s,v) val(diff(s,v)),scalar*ones(size(theta)),theta); 34 | 35 | % 36 | grad1 = gra(eqq1); 37 | grad2 = gra(eqq2); 38 | 39 | IIgrad = val(inv(Sigma)*a*z'); 40 | Igrad = val(-inv(Sigma)*theta*z*z'); 41 | total_grad = val(inv(Sigma)*(a-theta*z)*z') 42 | grad1 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /matlab/distance_def.m: -------------------------------------------------------------------------------- 1 | %% Visualization of the distance between agents (variables) 2 | 
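% The quantities plotted below mirror the clipped distance used in drone_env.py:
%   d_ij = min(d_i_hat, ||x_i - x_j|| - l_i - l_j)
% so log(d_i_hat / d_ij) is zero once agents are farther apart than d_i_hat and
% grows without bound as the gap between them closes.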
3 | clear all; 4 | 5 | di_hat = 5; 6 | max_dist = 7; 7 | x_dist = linspace(0,max_dist,100); 8 | % capping to d_hat, i,e dij = min(di_hat, ||xi-xj||-li-lj) 9 | dij = x_dist; 10 | dij(x_dist>di_hat)=di_hat; 11 | dij_norm = di_hat./dij; 12 | log_d = log(dij_norm); 13 | 14 | %% plots 15 | figure; 16 | grid on; 17 | subplot(3,1,1) 18 | plot(x_dist,dij) 19 | title(['$\hat{d}_i =' num2str(di_hat) '$']) 20 | xlabel('$||x_i-x_j||-l_i-l_j$') 21 | ylabel('$d_{ij} = min(\hat{d}_i, ||x_i-x_j||-l_i-l_j)$') 22 | grid on; 23 | 24 | subplot(3,1,2) 25 | plot(x_dist,dij_norm) 26 | xlabel('$||x_i-x_j||-l_i-l_j$') 27 | ylabel('$\frac{\hat{d}_i}{d_{ij}}$') 28 | grid on; 29 | 30 | subplot(3,1,3) 31 | plot(x_dist,log_d) 32 | xlabel('$||x_i-x_j||-l_i-l_j$') 33 | ylabel('$log(\frac{\hat{d}_i}{d_{ij}})$') 34 | grid on; 35 | 36 | -------------------------------------------------------------------------------- /matlab/images/dij.fig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/matlab/images/dij.fig -------------------------------------------------------------------------------- /matlab/normal_multivariate_pdf.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | 3 | mu = [0 0]; 4 | sigma = 0.1; 5 | Sigma = [sigma 0; 0 sigma]; 6 | 7 | 8 | delta_x = 0.05; 9 | 10 | x1 = -3:delta_x:3; 11 | x2 = -3:delta_x:3; 12 | [X1,X2] = meshgrid(x1,x2); 13 | X = [X1(:) X2(:)]; 14 | y = mvnpdf(X,mu,Sigma); 15 | y = reshape(y,length(x2),length(x1)); 16 | surf(x1,x2,y,'EdgeAlpha',0.1) 17 | caxis([min(y(:))-0.5*range(y(:)),max(y(:))]) 18 | % axis([-3 3 -3 3 0 0.4]) 19 | xlabel('x1') 20 | ylabel('x2') 21 | zlabel('Probability Density') 22 | 23 | volumen = sum(sum(y*delta_x^2)) 24 | -------------------------------------------------------------------------------- /matlab/optimal_traj.m: -------------------------------------------------------------------------------- 1 | clear all; 2 | close all; 3 | 4 | T = 30; 5 | i_agents = 3; 6 | ri =[1 1 1]*0; % drone diameter 7 | x_size = 2*i_agents; % dim* n_agents 8 | X_size = T*x_size; 9 | xF = [1 0 4 0 6 0]'; 10 | x0 = [-1 0 -2 0 -3 0]'; 11 | 12 | normalizing_r = ((xF(1:2:end)-xF(1:2:end)').^2+(xF(2:2:end)-xF(2:2:end)').^2).^0.5; 13 | normalizing_r = mink(normalizing_r,2,2); 14 | normalizing_r = normalizing_r(:,2); 15 | 16 | XB=kron(ones(T,1),xF); 17 | ineqA = eye(T) - (triu(ones(T),-1)-triu(ones(T))); 18 | ineqA = kron(ineqA,eye(x_size)); 19 | ineqA = [ineqA;-ineqA]; 20 | 21 | u_max = 0.4; 22 | ineqB = ones(X_size*2,1)*u_max; 23 | ineqB(1:x_size) = u_max + x0; 24 | ineqB(X_size+1:X_size+x_size) = u_max - x0; 25 | 26 | X0 = rand(X_size,1); 27 | % load sol 28 | % X0 = sol; 29 | 30 | fun = @(x) f(x,XB,x_size,T,normalizing_r,ri); 31 | 32 | [sol,fval] = fmincon(fun,X0,ineqA,ineqB) 33 | 34 | %% separate solution: 35 | sol_re = reshape(sol,x_size,T); 36 | x_sol = sol_re(1:2:end,:); 37 | y_sol = sol_re(2:2:end,:); 38 | 39 | for t=2:size(x_sol,2) 40 | figure(1) 41 | plot([x0(1:2:end)';x_sol(:,1:t-1)'],[x0(2:2:end)';y_sol(:,1:t-1)'],'-^') 42 | hold on 43 | plot(xF(1:2:end)',xF(2:2:end)','x') 44 | grid on; 45 | plot(x0(1:2:end),x0(2:2:end),'o') 46 | hold off; 47 | 48 | pause(0.2) 49 | drawnow 50 | end 51 | 52 | 53 | 54 | %% cost function 55 | function obj = f(X,XF,x_size,T,normalazing_r,ri) 56 | 57 | b = 0.4; 58 | 59 | % goal cost 60 | obj_end = (X-XF)'*(X-XF); 61 | 62 | % collision cost 63 | xt = 
reshape(X,x_size,T); 64 | x_t = xt(1:2:end,:); 65 | y_t = xt(2:2:end,:); 66 | 67 | obj_colision = 0; 68 | for t = 1:size(xt,2) 69 | x_diff = x_t(:,t)-x_t(:,t)'; 70 | y_diff = y_t(:,t)-y_t(:,t)'; 71 | 72 | D = (x_diff.^2 + y_diff.^2).^0.5 + (-ri-ri').*(1-eye(3)); 73 | D_clip = min(1,D./normalazing_r)+eye(size(D,1)); 74 | 75 | obj_colision = obj_colision -b*sum(sum(log(D_clip))); 76 | 77 | end 78 | 79 | obj=obj_end + obj_colision; 80 | end -------------------------------------------------------------------------------- /matlab/sol.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/matlab/sol.mat -------------------------------------------------------------------------------- /models/E1000_n10_DiscretePolicy4_b02-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E1000_n10_DiscretePolicy4_b02-A2Cactors.pth -------------------------------------------------------------------------------- /models/E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth -------------------------------------------------------------------------------- /models/E500_M30_LR1e4_badInitialState-actors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E500_M30_LR1e4_badInitialState-actors.pth -------------------------------------------------------------------------------- /models/E500_M30_LR1e4_badInitialState-critics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E500_M30_LR1e4_badInitialState-critics.pth -------------------------------------------------------------------------------- /models/cont_preloaded-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/cont_preloaded-A2Cactors.pth -------------------------------------------------------------------------------- /models/cont_preloaded-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/cont_preloaded-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.01_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.01_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.01_softmax16-A2Ccritics.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.01_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.1_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.1_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.1_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.1_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.2_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.2_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.2_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.2_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.5_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.5_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.5_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.5_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.8_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.8_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas0.8_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.8_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas1.5_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1.5_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas1.5_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1.5_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas1_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas1_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas2.43_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.43_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas2.43_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.43_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas/deltas2.5_softmax16-A2Cactors-old (1).pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.5_softmax16-A2Cactors-old (1).pth -------------------------------------------------------------------------------- /models/deltas/deltas2.5_softmax16-A2Ccritics-old (2).pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.5_softmax16-A2Ccritics-old (2).pth -------------------------------------------------------------------------------- /models/deltas/deltas2_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas/deltas2_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/deltas2.5_softmax16-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas2.5_softmax16-A2Cactors.pth -------------------------------------------------------------------------------- /models/deltas2.5_softmax16-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas2.5_softmax16-A2Ccritics.pth -------------------------------------------------------------------------------- /models/discrete-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/discrete-A2Cactors.pth -------------------------------------------------------------------------------- /models/discrete-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/discrete-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/cont_n5-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/cont_n5-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/cont_n5-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/cont_n5-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/simple-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/simple-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/simple-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/simple-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/softmax8_n4-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n4-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/softmax8_n4-A2Ccritics.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n4-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/softmax8_n5-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n5-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/softmax8_n5-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n5-A2Ccritics.pth -------------------------------------------------------------------------------- /models/final/softmax8_n8-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n8-A2Cactors.pth -------------------------------------------------------------------------------- /models/final/softmax8_n8-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n8-A2Ccritics.pth -------------------------------------------------------------------------------- /models/n5_E1500_Advantage-actors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/n5_E1500_Advantage-actors.pth -------------------------------------------------------------------------------- /models/n5_E1500_Advantage-critics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/n5_E1500_Advantage-critics.pth -------------------------------------------------------------------------------- /models/softmax8_n5-A2Cactors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/softmax8_n5-A2Cactors.pth -------------------------------------------------------------------------------- /models/softmax8_n5-A2Ccritics.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/softmax8_n5-A2Ccritics.pth -------------------------------------------------------------------------------- /models/trained_E1000_M50_LR001-actors.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/trained_E1000_M50_LR001-actors.pth 
-------------------------------------------------------------------------------- /policy_performance_variables_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/policy_performance_variables_1 -------------------------------------------------------------------------------- /policy_performance_variables_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/policy_performance_variables_2 -------------------------------------------------------------------------------- /profile.bat: -------------------------------------------------------------------------------- 1 | @REM Simple bat to profile scripts: profile file.py 2 | 3 | @echo off 4 | python -m cProfile -o dump.prof %1 5 | snakeviz dump.prof 6 | del dump.prof -------------------------------------------------------------------------------- /spec-file.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | @EXPLICIT 5 | https://conda.anaconda.org/conda-forge/win-64/nodejs-16.12.0-h57928b3_0.tar.bz2 6 | https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2022.6.15-h5b45459_0.tar.bz2 7 | https://conda.anaconda.org/conda-forge/win-64/intel-openmp-2021.4.0-h57928b3_3556.tar.bz2 8 | https://conda.anaconda.org/conda-forge/win-64/mkl-include-2021.4.0-h0e2418a_729.tar.bz2 9 | https://conda.anaconda.org/conda-forge/win-64/msys2-conda-epoch-20160418-1.tar.bz2 10 | https://conda.anaconda.org/conda-forge/win-64/pandoc-2.16.1-h8ffe710_0.tar.bz2 11 | https://conda.anaconda.org/pytorch/noarch/pytorch-mutex-1.0-cpu.tar.bz2 12 | https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.20348.0-h57928b3_0.tar.bz2 13 | https://conda.anaconda.org/conda-forge/win-64/winpty-0.4.3-4.tar.bz2 14 | https://conda.anaconda.org/pytorch/noarch/cpuonly-2.0-0.tar.bz2 15 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gmp-6.1.0-2.tar.bz2 16 | https://conda.anaconda.org/conda-forge/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2 17 | https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.29.30037-h902a5da_5.tar.bz2 18 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2 19 | https://conda.anaconda.org/conda-forge/win-64/vc-14.2-hb210afc_5.tar.bz2 20 | https://conda.anaconda.org/conda-forge/win-64/ffmpeg-4.3.1-ha925a31_0.tar.bz2 21 | https://conda.anaconda.org/conda-forge/win-64/icu-68.2-h0e60522_0.tar.bz2 22 | https://conda.anaconda.org/conda-forge/win-64/jbig-2.1-h8d14728_2003.tar.bz2 23 | https://conda.anaconda.org/conda-forge/win-64/jpeg-9d-h8ffe710_0.tar.bz2 24 | https://conda.anaconda.org/conda-forge/win-64/lerc-3.0-h0e60522_0.tar.bz2 25 | https://conda.anaconda.org/conda-forge/win-64/libclang-11.1.0-default_h5c34c98_1.tar.bz2 26 | https://conda.anaconda.org/conda-forge/win-64/libdeflate-1.8-h8ffe710_0.tar.bz2 27 | https://conda.anaconda.org/conda-forge/win-64/libsodium-1.0.18-h8d14728_1.tar.bz2 28 | https://conda.anaconda.org/conda-forge/win-64/libuv-1.42.0-h8ffe710_0.tar.bz2 29 | https://conda.anaconda.org/conda-forge/win-64/libzlib-1.2.11-h8ffe710_1013.tar.bz2 30 | https://conda.anaconda.org/conda-forge/win-64/lz4-c-1.9.3-h8ffe710_1.tar.bz2 31 | 
https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2 32 | https://conda.anaconda.org/conda-forge/win-64/openssl-1.1.1q-h8ffe710_0.tar.bz2 33 | https://conda.anaconda.org/conda-forge/win-64/sqlite-3.36.0-h8ffe710_2.tar.bz2 34 | https://conda.anaconda.org/conda-forge/win-64/tbb-2021.4.0-h2d74725_1.tar.bz2 35 | https://conda.anaconda.org/conda-forge/win-64/tk-8.6.11-h8ffe710_1.tar.bz2 36 | https://conda.anaconda.org/conda-forge/win-64/xz-5.2.5-h62dcd97_1.tar.bz2 37 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2 38 | https://conda.anaconda.org/conda-forge/win-64/mkl-2021.4.0-h0e2418a_729.tar.bz2 39 | https://conda.anaconda.org/conda-forge/win-64/python-3.8.12-h7840368_2_cpython.tar.bz2 40 | https://conda.anaconda.org/conda-forge/win-64/zeromq-4.3.4-h0e60522_1.tar.bz2 41 | https://conda.anaconda.org/conda-forge/win-64/zlib-1.2.11-h8ffe710_1013.tar.bz2 42 | https://conda.anaconda.org/conda-forge/noarch/async_generator-1.10-py_0.tar.bz2 43 | https://conda.anaconda.org/conda-forge/noarch/attrs-21.2.0-pyhd8ed1ab_0.tar.bz2 44 | https://conda.anaconda.org/conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2 45 | https://conda.anaconda.org/conda-forge/noarch/backports-1.0-py_2.tar.bz2 46 | https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.0.0-pyhd8ed1ab_0.tar.bz2 47 | https://conda.anaconda.org/conda-forge/noarch/cloudpickle-2.0.0-pyhd8ed1ab_0.tar.bz2 48 | https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.4-pyh9f0ad1d_0.tar.bz2 49 | https://conda.anaconda.org/conda-forge/noarch/cycler-0.11.0-pyhd8ed1ab_0.tar.bz2 50 | https://conda.anaconda.org/conda-forge/noarch/decorator-5.1.0-pyhd8ed1ab_0.tar.bz2 51 | https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2 52 | https://conda.anaconda.org/conda-forge/noarch/entrypoints-0.3-pyhd8ed1ab_1003.tar.bz2 53 | https://conda.anaconda.org/conda-forge/noarch/idna-3.1-pyhd3deb0d_0.tar.bz2 54 | https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2 55 | https://conda.anaconda.org/conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2 56 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_widgets-1.0.2-pyhd8ed1ab_0.tar.bz2 57 | https://conda.anaconda.org/conda-forge/win-64/libblas-3.9.0-12_win64_mkl.tar.bz2 58 | https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.37-h1d00b33_2.tar.bz2 59 | https://conda.anaconda.org/conda-forge/win-64/mkl-devel-2021.4.0-h57928b3_730.tar.bz2 60 | https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.5.1-pyhd8ed1ab_0.tar.bz2 61 | https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2 62 | https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2 63 | https://conda.anaconda.org/conda-forge/noarch/parso-0.8.2-pyhd8ed1ab_0.tar.bz2 64 | https://conda.anaconda.org/conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2 65 | https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.12.0-pyhd8ed1ab_0.tar.bz2 66 | https://conda.anaconda.org/conda-forge/noarch/pycodestyle-2.8.0-pyhd8ed1ab_0.tar.bz2 67 | https://conda.anaconda.org/conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2 68 | https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.0.5-pyhd8ed1ab_0.tar.bz2 69 | https://conda.anaconda.org/conda-forge/win-64/python_abi-3.8-2_cp38.tar.bz2 70 | https://conda.anaconda.org/conda-forge/noarch/pytz-2021.3-pyhd8ed1ab_0.tar.bz2 71 | https://conda.anaconda.org/conda-forge/noarch/qtpy-1.11.2-pyhd8ed1ab_0.tar.bz2 72 | 
https://conda.anaconda.org/conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2 73 | https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 74 | https://conda.anaconda.org/conda-forge/noarch/testpath-0.5.0-pyhd8ed1ab_0.tar.bz2 75 | https://conda.anaconda.org/conda-forge/noarch/textwrap3-0.9.2-py_0.tar.bz2 76 | https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2 77 | https://conda.anaconda.org/conda-forge/noarch/traitlets-5.1.1-pyhd8ed1ab_0.tar.bz2 78 | https://conda.anaconda.org/conda-forge/noarch/typing_extensions-3.10.0.2-pyha770c72_0.tar.bz2 79 | https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2 80 | https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.0-pyhd8ed1ab_1.tar.bz2 81 | https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2 82 | https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.0-h6255e5f_0.tar.bz2 83 | https://conda.anaconda.org/conda-forge/noarch/autopep8-1.6.0-pyhd8ed1ab_1.tar.bz2 84 | https://conda.anaconda.org/conda-forge/noarch/babel-2.9.1-pyh44b312d_0.tar.bz2 85 | https://conda.anaconda.org/conda-forge/win-64/certifi-2022.6.15-py38haa244fe_0.tar.bz2 86 | https://conda.anaconda.org/conda-forge/win-64/cffi-1.15.0-py38hd8c33c5_0.tar.bz2 87 | https://conda.anaconda.org/conda-forge/win-64/chardet-4.0.0-py38haa244fe_2.tar.bz2 88 | https://conda.anaconda.org/conda-forge/win-64/debugpy-1.5.1-py38h885f38d_0.tar.bz2 89 | https://conda.anaconda.org/conda-forge/win-64/freetype-2.10.4-h546665d_1.tar.bz2 90 | https://conda.anaconda.org/conda-forge/win-64/future-0.18.2-py38haa244fe_4.tar.bz2 91 | https://conda.anaconda.org/conda-forge/win-64/importlib-metadata-4.8.2-py38haa244fe_0.tar.bz2 92 | https://conda.anaconda.org/conda-forge/noarch/importlib_resources-5.4.0-pyhd8ed1ab_0.tar.bz2 93 | https://conda.anaconda.org/conda-forge/win-64/jedi-0.18.0-py38haa244fe_3.tar.bz2 94 | https://conda.anaconda.org/conda-forge/win-64/kiwisolver-1.3.2-py38hbd9d945_1.tar.bz2 95 | https://conda.anaconda.org/conda-forge/win-64/libcblas-3.9.0-12_win64_mkl.tar.bz2 96 | https://conda.anaconda.org/conda-forge/win-64/liblapack-3.9.0-12_win64_mkl.tar.bz2 97 | https://conda.anaconda.org/conda-forge/win-64/libtiff-4.3.0-hd413186_2.tar.bz2 98 | https://conda.anaconda.org/conda-forge/win-64/markupsafe-2.0.1-py38h294d835_1.tar.bz2 99 | https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2 100 | https://conda.anaconda.org/conda-forge/win-64/mistune-0.8.4-py38h294d835_1005.tar.bz2 101 | https://conda.anaconda.org/conda-forge/noarch/packaging-21.0-pyhd8ed1ab_0.tar.bz2 102 | https://conda.anaconda.org/conda-forge/win-64/pybox2d-2.3.10-py38h885f38d_1.tar.bz2 103 | https://conda.anaconda.org/conda-forge/win-64/pyqt5-sip-4.19.18-py38h885f38d_7.tar.bz2 104 | https://conda.anaconda.org/conda-forge/win-64/pyrsistent-0.18.0-py38h294d835_0.tar.bz2 105 | https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2 106 | https://conda.anaconda.org/conda-forge/win-64/pywin32-302-py38h294d835_2.tar.bz2 107 | https://conda.anaconda.org/conda-forge/win-64/pywinpty-1.1.5-py38hd3f51b4_1.tar.bz2 108 | https://conda.anaconda.org/conda-forge/win-64/pyzmq-22.3.0-py38h09162b1_1.tar.bz2 109 | https://conda.anaconda.org/conda-forge/win-64/qt-5.12.9-h5909a2a_4.tar.bz2 110 | https://conda.anaconda.org/conda-forge/win-64/setuptools-58.5.3-py38haa244fe_0.tar.bz2 111 | https://conda.anaconda.org/conda-forge/win-64/sniffio-1.2.0-py38haa244fe_2.tar.bz2 112 | 
https://conda.anaconda.org/conda-forge/win-64/tornado-6.1-py38h294d835_2.tar.bz2 113 | https://conda.anaconda.org/conda-forge/noarch/tqdm-4.62.3-pyhd8ed1ab_0.tar.bz2 114 | https://conda.anaconda.org/conda-forge/win-64/websocket-client-0.57.0-py38haa244fe_6.tar.bz2 115 | https://conda.anaconda.org/conda-forge/win-64/win_inet_pton-1.1.0-py38haa244fe_3.tar.bz2 116 | https://conda.anaconda.org/conda-forge/win-64/anyio-3.3.4-py38haa244fe_0.tar.bz2 117 | https://conda.anaconda.org/conda-forge/win-64/argon2-cffi-21.1.0-py38h294d835_2.tar.bz2 118 | https://conda.anaconda.org/conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2 119 | https://conda.anaconda.org/conda-forge/noarch/bleach-4.1.0-pyhd8ed1ab_0.tar.bz2 120 | https://conda.anaconda.org/conda-forge/win-64/brotlipy-0.7.0-py38h294d835_1003.tar.bz2 121 | https://conda.anaconda.org/conda-forge/win-64/cryptography-35.0.0-py38hb7941b4_2.tar.bz2 122 | https://conda.anaconda.org/conda-forge/noarch/jinja2-3.0.2-pyhd8ed1ab_0.tar.bz2 123 | https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.2.1-pyhd8ed1ab_0.tar.bz2 124 | https://conda.anaconda.org/conda-forge/win-64/jupyter_core-4.9.1-py38haa244fe_0.tar.bz2 125 | https://conda.anaconda.org/conda-forge/win-64/lcms2-2.12-h2a16943_0.tar.bz2 126 | https://conda.anaconda.org/conda-forge/win-64/liblapacke-3.9.0-12_win64_mkl.tar.bz2 127 | https://conda.anaconda.org/conda-forge/win-64/numpy-1.21.4-py38h089cfbf_0.tar.bz2 128 | https://conda.anaconda.org/conda-forge/win-64/openjpeg-2.4.0-hb211442_1.tar.bz2 129 | https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2 130 | https://conda.anaconda.org/conda-forge/win-64/pyglet-1.5.16-py38haa244fe_0.tar.bz2 131 | https://conda.anaconda.org/conda-forge/noarch/pygments-2.10.0-pyhd8ed1ab_0.tar.bz2 132 | https://conda.anaconda.org/conda-forge/win-64/pyqt-impl-5.12.3-py38h885f38d_7.tar.bz2 133 | https://conda.anaconda.org/conda-forge/win-64/pysocks-1.7.1-py38haa244fe_4.tar.bz2 134 | https://conda.anaconda.org/conda-forge/noarch/snakeviz-2.1.1-pyhd8ed1ab_0.tar.bz2 135 | https://conda.anaconda.org/conda-forge/win-64/terminado-0.12.1-py38haa244fe_1.tar.bz2 136 | https://conda.anaconda.org/conda-forge/win-64/blas-devel-3.9.0-12_win64_mkl.tar.bz2 137 | https://conda.anaconda.org/conda-forge/win-64/gym-0.21.0-py38h595d716_0.tar.bz2 138 | https://conda.anaconda.org/conda-forge/noarch/jupyter_client-7.0.6-pyhd8ed1ab_0.tar.bz2 139 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_pygments-0.1.2-pyh9f0ad1d_0.tar.bz2 140 | https://conda.anaconda.org/conda-forge/noarch/nbformat-5.1.3-pyhd8ed1ab_0.tar.bz2 141 | https://conda.anaconda.org/conda-forge/win-64/pandas-1.3.4-py38h5d928e2_1.tar.bz2 142 | https://conda.anaconda.org/conda-forge/win-64/pillow-8.3.2-py38h794f750_0.tar.bz2 143 | https://conda.anaconda.org/conda-forge/noarch/pyopenssl-21.0.0-pyhd8ed1ab_0.tar.bz2 144 | https://conda.anaconda.org/conda-forge/win-64/pyqtchart-5.12-py38h885f38d_7.tar.bz2 145 | https://conda.anaconda.org/conda-forge/win-64/pyqtwebengine-5.12.1-py38h885f38d_7.tar.bz2 146 | https://conda.anaconda.org/conda-forge/win-64/scipy-1.7.2-py38ha1292f7_0.tar.bz2 147 | https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2 148 | https://conda.anaconda.org/conda-forge/noarch/autograd-1.4-pyhd8ed1ab_0.tar.bz2 149 | https://conda.anaconda.org/conda-forge/win-64/blas-2.112-mkl.tar.bz2 150 | https://conda.anaconda.org/conda-forge/win-64/matplotlib-base-3.4.3-py38h1f000d6_1.tar.bz2 151 | 
https://conda.anaconda.org/conda-forge/noarch/nbclient-0.5.5-pyhd8ed1ab_0.tar.bz2 152 | https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.22-pyha770c72_0.tar.bz2 153 | https://conda.anaconda.org/conda-forge/win-64/pyqt-5.12.3-py38haa244fe_7.tar.bz2 154 | https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.7-pyhd8ed1ab_0.tar.bz2 155 | https://conda.anaconda.org/conda-forge/win-64/ipython-7.29.0-py38h595d716_1.tar.bz2 156 | https://conda.anaconda.org/conda-forge/win-64/matplotlib-3.4.3-py38haa244fe_1.tar.bz2 157 | https://conda.anaconda.org/conda-forge/win-64/nbconvert-6.2.0-py38haa244fe_0.tar.bz2 158 | https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-3.0.22-hd8ed1ab_0.tar.bz2 159 | https://conda.anaconda.org/pytorch/win-64/pytorch-1.10.0-py3.8_cpu_0.tar.bz2 160 | https://conda.anaconda.org/conda-forge/noarch/requests-2.26.0-pyhd8ed1ab_0.tar.bz2 161 | https://conda.anaconda.org/conda-forge/win-64/ipykernel-6.4.2-py38h595d716_0.tar.bz2 162 | https://conda.anaconda.org/conda-forge/noarch/jupyter_server-1.11.2-pyhd8ed1ab_0.tar.bz2 163 | https://conda.anaconda.org/pytorch/win-64/torchvision-0.11.1-py38_cpu.tar.bz2 164 | https://conda.anaconda.org/conda-forge/noarch/jupyter_console-6.4.0-pyhd8ed1ab_0.tar.bz2 165 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-2.8.2-pyhd8ed1ab_0.tar.bz2 166 | https://conda.anaconda.org/conda-forge/noarch/notebook-6.4.5-pyha770c72_0.tar.bz2 167 | https://conda.anaconda.org/conda-forge/noarch/qtconsole-5.2.0-pyhd8ed1ab_0.tar.bz2 168 | https://conda.anaconda.org/conda-forge/noarch/nbclassic-0.3.4-pyhd8ed1ab_0.tar.bz2 169 | https://conda.anaconda.org/conda-forge/win-64/widgetsnbextension-3.5.2-py38haa244fe_0.tar.bz2 170 | https://conda.anaconda.org/conda-forge/noarch/ipywidgets-7.6.5-pyhd8ed1ab_0.tar.bz2 171 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab-3.2.2-pyhd8ed1ab_0.tar.bz2 172 | https://conda.anaconda.org/conda-forge/win-64/jupyter-1.0.0-py38haa244fe_6.tar.bz2 173 | -------------------------------------------------------------------------------- /train_problem.py: -------------------------------------------------------------------------------- 1 | from collections import deque, namedtuple 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import drone_env 5 | from drone_env import running_average, plot_rewards, plot_grads 6 | from tqdm import tqdm, trange 7 | from SAC_agents import SA2CAgents, RandomAgent, TrainedAgent, SPPOAgents 8 | from utils import ExperienceBuffers, DiscreteSoftmaxNN, NormalPolicy 9 | 10 | plt.style.use('seaborn-dark-palette') 11 | tex_fonts = { 12 | # Use LaTeX to write all text 13 | # "text.usetex": True, 14 | "font.family": "sans-serif", 15 | # Use 10pt font in plots, to match 10pt font in document 16 | "axes.labelsize": 10, 17 | "font.size": 10, 18 | # Make the legend/label fonts a little smaller 19 | "legend.fontsize": 10, 20 | "xtick.labelsize": 10, 21 | "ytick.labelsize": 10 22 | } 23 | plt.rcParams.update(tex_fonts) 24 | 25 | 26 | ### Set up parameters ### 27 | n_agents = 5 28 | deltas = np.ones(n_agents)*2.43 29 | # deltas = None 30 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True) 31 | env.collision_weight = 0.2 # old 0.2 32 | print(env) 33 | # env.show() 34 | 35 | N_Episodes = 3000 36 | episodes_to_plot = [3000] 37 | # episodes_to_plot = [1500] 38 | save_name = "deltas2.5_softmax16" 39 | 40 | discount_factor = 0.99 41 | alpha_critic = 10**-3 42 | alpha_actor = 10**-3 43 | M 
= 10 # Epochs, i.e. steps of the SGD for the actor-critic NN in PPO variant 44 | dim_z = env.local_state_space # Dimension of the localized z_state space 45 | dim_a = env.local_action_space # Dimension of the local action space 46 | 47 | 48 | # Initialize variables 49 | total_collisions_per_episode = [] 50 | total_reward_per_episode = [] 51 | total_true_reward_per_episode =[] 52 | total_t = [] 53 | grad_per_episode = np.zeros([N_Episodes, n_agents]) 54 | gi_per_episode = np.zeros_like(grad_per_episode) 55 | 56 | # times = np.arange(0, T, step=drone_env.dt) + drone_env.dt 57 | 58 | 59 | agents = SA2CAgents(n_agents=env.n_agents, dim_local_state = dim_z, dim_local_action=dim_a, discount=discount_factor, epochs=M, learning_rate_critic=alpha_critic, learning_rate_actor=alpha_critic) 60 | print(f"### Running {type(agents)}, actor: {type(agents.actors[0])} with params: ###") 61 | print(f"Episodes = {N_Episodes}, max Time iterations = {drone_env.max_time_steps} (T = {drone_env.max_time_steps * drone_env.dt}s, dt = {drone_env.dt}s)") 62 | print(f"N of agents = {env.n_agents}, structure of critic NN = {agents.criticsNN[0].input_size}x{agents.criticsNN[0].L1}x{agents.criticsNN[0].L2}x{agents.criticsNN[0].output_size}") 63 | print(f"Discount = {discount_factor}, lr for critic NN = {alpha_critic}, lr for actor = {alpha_actor}, collision weight b = {env.collision_weight}") 64 | 65 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True) 66 | for episode in EPISODES: 67 | 68 | if episode+1 in episodes_to_plot: 69 | # reward_history = np.zeros([len(times), env.n_agents]) 70 | trajectory = [env.state.copy()] 71 | z_trajectory = [env.z_states] 72 | total_episode_reward = 0 73 | total_true_episode_reward = 0 74 | total_episode_collisions = 0 75 | # env.show() 76 | 77 | buffers = ExperienceBuffers(env.n_agents) 78 | 79 | # SIMULATION OVER T 80 | t_iter = 0 81 | finished = False 82 | while not finished: 83 | 84 | state = env.state 85 | z_states = env.z_states 86 | Ni = env.Ni 87 | 88 | # calculate actions based on current state 89 | # actions = drone_env.gradient_control(state, env) 90 | # actions = drone_env.proportional_control(state, env) 91 | actions = agents.forward(z_states, Ni) 92 | 93 | # Update environment one time step with the actions 94 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions) 95 | # EXPERIENCE: [z_state, action, reward, next_z, finished] 96 | buffers.append(z_states, actions, rewards, new_z, Ni, finished) 97 | 98 | total_episode_reward += np.mean(rewards) 99 | total_true_episode_reward += np.mean(true_rewards) 100 | total_episode_collisions += n_collisions 101 | 102 | if episode+1 in episodes_to_plot: 103 | # reward_history[t_iter,:] = reward 104 | trajectory.append(new_state.copy()) 105 | z_trajectory.append(new_z) 106 | 107 | t_iter +=1 108 | 109 | ### END OF EPISODE 110 | # Train the critic with the data of the episode 111 | # current_grad_norms, current_gi_norms = agents.train(buffers, actor_lr = alpha_actor, return_grads=True) 112 | if type(agents.actors[0]) is NormalPolicy: 113 | agents.train_designed_policy(buffers, actor_lr = alpha_actor, return_grads=False) 114 | else: 115 | agents.train_NN(buffers, actor_lr = alpha_actor) 116 | 117 | # Append episodic variables/logs 118 | total_reward_per_episode.append(total_episode_reward) 119 | total_true_reward_per_episode.append(total_true_episode_reward) 120 | total_collisions_per_episode.append(total_episode_collisions) 121 | total_t.append(t_iter) 122 | # grad_per_episode[episode,:] =
np.array(current_grad_norms) 123 | # gi_per_episode[episode,:] = np.array(current_gi_norms) 124 | 125 | if episode+1 in episodes_to_plot: 126 | Q_simulated, V_approx = agents.benchmark_cirtic(buffers, only_one_NN=False) 127 | 128 | # print(f"Episode collisions = {total_episode_collisions}") 129 | # env.animate(trajectory,frame_time=0.1) 130 | 131 | # RESET ENVIRONMENT 132 | env.reset(renew_obstacles=False) 133 | 134 | # Set progress bar description with information 135 | average_reward = running_average(total_reward_per_episode, 50)[-1] 136 | average_true_reward = running_average(total_true_reward_per_episode, 50)[-1] 137 | average_collisions = running_average(total_collisions_per_episode, 50)[-1] 138 | average_t = running_average(total_t, 50)[-1] 139 | EPISODES.set_description( 140 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter} - Average: {average_reward:.1f}/{average_collisions:.2f}/{average_t}. True r={average_true_reward:.1f}.") 141 | 142 | # Plot current trajectory 143 | 144 | if episode+1 in episodes_to_plot: 145 | env.plot(trajectory, episode) 146 | env.animate(trajectory, z_trajectory, deltas, episode, name=f"training-E{episode+1}", format="mp4") 147 | times = np.arange(0, t_iter)*drone_env.dt 148 | plt.figure() 149 | for i in range(env.n_agents): 150 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1) 151 | plt.plot(times,Q_simulated[i], label=f"i={i}, simulated Q (Gt)", color = agent_color) 152 | plt.plot(times,V_approx[i],"--" , label=f"i={i}, approx V", color = tuple(0.9*x for x in agent_color)) 153 | if type(agents.actors[0]) is NormalPolicy: 154 | print(f"Agent {i} params = {agents.actors[i].parameters}") 155 | plt.legend() 156 | plt.show() 157 | 158 | agents.save(filename=save_name) 159 | 160 | plot_rewards(total_reward_per_episode, total_true_reward_per_episode, total_collisions_per_episode, n_ep_running_average=50) 161 | # plt.savefig("images/reward_training.pdf",format='pdf', bbox_inches='tight') 162 | # plot_grads(grad_per_episode,gi_per_episode) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | # from turtle import forward  # unused accidental auto-import 3 | import numpy as np 4 | from autograd import numpy as anp 5 | from autograd import grad 6 | import torch 7 | # import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | from collections import deque, namedtuple 11 | 12 | """ Classes used for agents and other useful functions """ 13 | 14 | class CriticNN(nn.Module): 15 | """ Create local critic network 16 | """ 17 | # NN sizes: define size of hidden layer 18 | L1 = 200 19 | L2 = 200 20 | 21 | def __init__(self, input_size, output_size = 1): 22 | super().__init__() 23 | 24 | self.input_size = input_size 25 | self.output_size = output_size 26 | 27 | # Create input layer with ReLU activation 28 | self.input_layer = nn.Linear(input_size, self.L1) 29 | self.input_layer_activation = nn.ReLU() 30 | 31 | # Create hidden layer 1 32 | self.hidden_layer1 = nn.Linear(self.L1, self.L2) 33 | self.hidden_layer1_activation = nn.ReLU() 34 | 35 | # Create output layer.
NO ACTIVATION 36 | self.output_layer = nn.Linear(self.L2, output_size) 37 | 38 | def forward(self, z): 39 | '''z must be a properly formatted z vector (torch tensor)''' 40 | # Function used to compute the forward pass 41 | 42 | # Compute first layer 43 | l1 = self.input_layer(z) 44 | l1 = self.input_layer_activation(l1) 45 | 46 | # Compute hidden layers 47 | l2 = self.hidden_layer1(l1) 48 | l2 = self.hidden_layer1_activation(l2) 49 | 50 | # Compute output layer 51 | out = self.output_layer(l2) 52 | 53 | return out 54 | 55 | class NormalActorNN(nn.Module): 56 | """ NN for a policy that takes the z state as input and outputs the 2D means and sigma^2 of independent normal distributions 57 | In this case: z[1x6] -> mu[1x2], sigma^2[1x2] 58 | """ 59 | def __init__(self, input_size, lr ,dim_action): 60 | super().__init__() 61 | 62 | self.dim_action = dim_action 63 | # NN sizes: define size of layers 64 | Ls = 400 65 | hidden_1= 200 66 | hidden_2= 200 67 | 68 | # Ls, Create input layer with ReLU activation 69 | self.input_layer = nn.Linear(input_size, Ls) 70 | self.input_layer_activation = nn.ReLU() 71 | 72 | # Create hidden layer, Ls -> head1 73 | self.hidden_layer1 = nn.Linear(Ls, hidden_1) 74 | self.hidden_layer1_activation = nn.ReLU() 75 | # Create hidden layer, Ls -> head2 76 | self.hidden_layer2 = nn.Linear(Ls, hidden_2) 77 | self.hidden_layer2_activation = nn.ReLU() 78 | 79 | # Create output layers for each head: out_1: means (tanh in [-1,1]). out_2: sigma^2 (sigmoid in [0,1]) 80 | self.out_1 = nn.Linear(hidden_1,dim_action) 81 | self.out_1_activation = nn.Tanh() 82 | self.out_2 = nn.Linear(hidden_2,dim_action) 83 | self.out_2_activation = nn.Sigmoid() 84 | 85 | # Use Adam optimizer 86 | self.optimizer = optim.Adam(self.parameters(), lr = lr) 87 | 88 | def forward(self, z): 89 | # Compute first layer 90 | l1 = self.input_layer(z) 91 | l1 = self.input_layer_activation(l1) 92 | 93 | # Compute hidden layers 94 | l2_head1 = self.hidden_layer1(l1) 95 | l2_head1 = self.hidden_layer1_activation(l2_head1) 96 | 97 | l2_head2 = self.hidden_layer2(l1) 98 | l2_head2 = self.hidden_layer2_activation(l2_head2) 99 | 100 | # Compute output layers 101 | out_1 = self.out_1(l2_head1) 102 | out_1 = self.out_1_activation(out_1) 103 | 104 | out_2 = self.out_2(l2_head2) 105 | out_2 = self.out_2_activation(out_2) 106 | 107 | # out_1 = mu, out_2 = sigma^2 108 | return out_1,out_2 109 | 110 | def sample_action(self,z:np.ndarray, N=None): 111 | state_tensor = torch.tensor(z, dtype=torch.float32) 112 | mu_tensor,sigma_tensor = self.forward(state_tensor) 113 | # Normally distributed value with the mu and sigma^2 (variance) from the ActorNN 114 | std = np.sqrt(sigma_tensor.detach().numpy()) 115 | action = np.random.normal(mu_tensor.detach().numpy(),std) 116 | # action = np.clip(action,-1,1) 117 | return action 118 | 119 | def log_p_of_a(self,z:np.ndarray, a:np.ndarray): 120 | state_tensor = torch.tensor(z, dtype=torch.float32) 121 | mu , sigma = self.forward(state_tensor) 122 | if mu.dim() == 1: 123 | p1 = torch.pow(2 * np.pi * sigma[0], -0.5) * torch.exp(-(a[0] - mu[0])**2 / (2 * sigma[0])) 124 | p2 = torch.pow(2 * np.pi * sigma[1], -0.5) * torch.exp(-(a[1] - mu[1])**2 / (2 * sigma[1])) 125 | else: 126 | p1 = torch.pow(2 * np.pi * sigma[:,0], -0.5) * torch.exp(-(a[:,0] - mu[:,0])**2 / (2 * sigma[:,0])) 127 | p2 = torch.pow(2 * np.pi * sigma[:,1], -0.5) * torch.exp(-(a[:,1] - mu[:,1])**2 / (2 * sigma[:,1])) 128 | 129 | p = p1*p2 130 | return torch.log(p) 131 | 132 | class NormalPolicy: 133 | """Policy that uses a multivariate
normal distribution. 134 | The parameters are theta, which are the angles of the rotation matrices for each vector: 135 | mean: mu = R(theta) * z 136 | gradient: w.r.t. theta 137 | covariance matrix: Constant for now, not a parameter 138 | 139 | CAREFUL: Changes the shape of z,a inputs to columns 140 | 141 | NOTICE: individual values of p(a|z) can be greater than 1, as this is a density function (pdf, continuous) 142 | the pdf at a single point makes no sense, it needs to be over a differential of a (i.e. the pdf is per unit length) 143 | """ 144 | def __init__(self, input_size, output_size = 2, Sigma = None) -> None: 145 | 146 | self.take_all_states = False 147 | self.dim = output_size 148 | self.z_dim = input_size 149 | 150 | # param =anp.array([-1.6,-1.6,-1.6]) 151 | param =-anp.ones(int(self.z_dim/self.dim))*0 152 | 153 | self.parameters = param 154 | if Sigma is None: 155 | self.Sigma = anp.eye(self.dim)*0.3 156 | else: 157 | self.Sigma = Sigma 158 | 159 | def p_of_a(self, z:np.ndarray, a:np.ndarray) -> np.ndarray: 160 | ''' a needs to be a row vector (1D flat) 161 | z needs to be a row vector (1D flat) 162 | ''' 163 | pass 164 | 165 | def compute_grad(self, z:np.ndarray, a:np.ndarray, Ni): 166 | ''' a needs to be a row vector (1D flat) 167 | z needs to be a row vector (1D flat) 168 | Ni indicates the states that are neighbors 169 | ''' 170 | # Make vectors proper shape (column, for math) 171 | z.shape = (np.size(z),1) 172 | a.shape = (np.size(a),1) 173 | 174 | # Used to only calculate the gradient of the states that actually count 175 | if self.take_all_states: 176 | idx = anp.ones(int(self.z_dim/self.dim)) 177 | else: 178 | idx = np.arange(1,int(self.z_dim/self.dim+1))<=len(Ni) 179 | 180 | # Define scalar function to which autograd is applied: https://github.com/HIPS/autograd/blob/master/docs/tutorial.md 181 | def my_fun(variable): 182 | R0 = anp.array([[anp.cos(variable[0]), -anp.sin(variable[0])],[anp.sin(variable[0]),anp.cos(variable[0])]])*idx[0] 183 | R1 = anp.array([[anp.cos(variable[1]), -anp.sin(variable[1])],[anp.sin(variable[1]),anp.cos(variable[1])]])*idx[1] 184 | R2 = anp.array([[anp.cos(variable[2]), -anp.sin(variable[2])],[anp.sin(variable[2]),anp.cos(variable[2])]])*idx[2] 185 | R = anp.concatenate((R0,R1,R2),1) 186 | 187 | return (-1/2*(a- R @ z).T @ np.linalg.inv(self.Sigma) @ (a- R @ z))[0,0] 188 | 189 | grad_fun = grad(my_fun) 190 | self.grad = grad_fun(self.parameters) 191 | z.shape = (np.size(z),) 192 | a.shape = (np.size(a),) 193 | 194 | return self.grad 195 | 196 | def clip_grad_norm(self, grad:np.ndarray, clip_norm:float): 197 | # If the gradient norm is to be clipped to a value: 198 | grad_norm = np.linalg.norm(grad.flatten()) 199 | # If the current norm is less than the clipping, do nothing.
If more, rescale so the norm equals clip_norm 200 | if grad_norm <= clip_norm: 201 | return grad 202 | else: 203 | return grad * clip_norm/grad_norm 204 | 205 | 206 | def sample_action(self, z:np.ndarray, Ni): 207 | # Maybe add a mask so that null states are not accounted for 208 | z.shape = (np.size(z),1) 209 | 210 | # Used to only calculate the gradient of the states that actually count 211 | if self.take_all_states: 212 | idx = anp.ones(int(self.z_dim/self.dim)) 213 | else: 214 | idx = np.arange(1,int(self.z_dim/self.dim+1))<=len(Ni) 215 | 216 | variable = self.parameters 217 | R0 = anp.array([[anp.cos(variable[0]), -anp.sin(variable[0])],[anp.sin(variable[0]),anp.cos(variable[0])]])*idx[0] 218 | R1 = anp.array([[anp.cos(variable[1]), -anp.sin(variable[1])],[anp.sin(variable[1]),anp.cos(variable[1])]])*idx[1] 219 | R2 = anp.array([[anp.cos(variable[2]), -anp.sin(variable[2])],[anp.sin(variable[2]),anp.cos(variable[2])]])*idx[2] 220 | # R3 = anp.array([[anp.cos(variable[3]), -anp.sin(variable[3])],[anp.sin(variable[3]),anp.cos(variable[3])]])*idx[3] 221 | R = anp.concatenate((R0,R1,R2),1) 222 | 223 | mu = (R @ z).flatten() 224 | 225 | z.shape = (np.size(z),) 226 | a = np.random.multivariate_normal(mu, self.Sigma) 227 | 228 | # Clip the action to avoid unbounded actions 229 | return np.clip(a,-2,+2) 230 | 231 | 232 | class ExperienceBuffers: 233 | """ List of buffers for each agent. 234 | Each agent has its own buffer: i: ['z_state', 'action', 'local_reward', 'next_z', 'is_finished'] 235 | to get data, example: buffers.buffers[i][t].action 236 | """ 237 | def __init__(self, n_agents): 238 | # Create buffer for each agent 239 | self.buffers = [deque() for i in range(n_agents)] 240 | self.n_agents = n_agents 241 | self.experience = namedtuple('experience', 242 | ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']) 243 | 244 | def append(self,z_states, actions, rewards, new_z, Ni, finished): 245 | # Append experience to the buffer 246 | for i in range(self.n_agents): 247 | # Create localized experience tuple.
Also, flatten state and action vectors 248 | exp = self.experience(z_states[i].flatten(), actions[i].flatten(), rewards[i], new_z[i].flatten(), Ni[i], finished) 249 | self.buffers[i].append(exp) 250 | 251 | def __len__(self): 252 | # Overload the len operator 253 | return len(self.buffers[0]) 254 | 255 | class DiscreteSoftmaxNN(nn.Module): 256 | """ 257 | NN for the policy with finite actions, in which the input is the state and the output is the softmax probability of each action 258 | """ 259 | def __init__(self, input_size, lr , n_actions = 8): 260 | super().__init__() 261 | 262 | # Create action dictionary action_arg -> 2D continuous action array 263 | action_norm = 1 264 | action_list = [] 265 | for n_action in range(n_actions): 266 | ax = np.cos(n_action/n_actions * 2*np.pi)*action_norm 267 | ay = np.sin(n_action/n_actions * 2*np.pi)*action_norm 268 | action_list.append(np.array([ax,ay])) 269 | self.action_list = np.array(action_list) 270 | 271 | # Structure z -> Ls[ReLU] -> hidden1[ReLU] -> out[softmax] 272 | Ls = 300 273 | hidden_1 = 300 274 | self.n_actions = n_actions 275 | 276 | # Ls, Create input layer with ReLU activation 277 | self.input_layer = nn.Linear(input_size, Ls) 278 | self.input_layer_activation = nn.ReLU() 279 | # Create hidden layer, Ls -> head1 280 | self.hidden_layer1 = nn.Linear(Ls, hidden_1) 281 | self.hidden_layer1_activation = nn.ReLU() 282 | # Create output layer with softmax -> probabilities 283 | self.out_1 = nn.Linear(hidden_1,n_actions) 284 | self.out_1_activation = nn.Softmax(dim=0) 285 | 286 | # Use Adam optimizer 287 | self.optimizer = optim.Adam(self.parameters(), lr = lr) 288 | 289 | def forward(self,z): 290 | # Compute first layer 291 | l1 = self.input_layer(z) 292 | l1 = self.input_layer_activation(l1) 293 | 294 | # Compute hidden layers 295 | l2 = self.hidden_layer1(l1) 296 | l2 = self.hidden_layer1_activation(l2) 297 | 298 | # Compute output layers 299 | out_1 = self.out_1(l2) 300 | out_1 = self.out_1_activation(out_1) 301 | 302 | return out_1 303 | 304 | def sample_action(self,z:np.ndarray, N=None): 305 | state_tensor = torch.tensor(z, dtype=torch.float32) 306 | probs = self.forward(state_tensor) 307 | chosen_action_arg = np.random.choice(self.n_actions, p = probs.squeeze().detach().numpy()) 308 | 309 | return self.action_list[chosen_action_arg] 310 | 311 | def log_p_of_a(self,z:np.ndarray, a:np.ndarray): 312 | # Find the argument corresponding to the a array 313 | action_arg = np.where((self.action_list[:,0] == a[0]) & (self.action_list[:,1]==a[1]))[0][0] 314 | 315 | state_tensor = torch.tensor(z, dtype=torch.float32) 316 | probs = self.forward(state_tensor) 317 | log_prob = torch.log(probs.squeeze()[action_arg]) 318 | return log_prob 319 | -------------------------------------------------------------------------------- /variables_with_delta_change_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_1 -------------------------------------------------------------------------------- /variables_with_delta_change_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_2 -------------------------------------------------------------------------------- /variables_with_delta_change_3:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_3 -------------------------------------------------------------------------------- /variables_with_delta_change_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_4 --------------------------------------------------------------------------------
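For quick reference, a minimal standalone sketch (not one of the repository files) of the action model implemented by NormalPolicy in utils.py: the mean is built by stacking one 2x2 rotation block per neighbor state, the action is drawn from a fixed-covariance Gaussian (Sigma = 0.3*I, as in the class), then clipped to [-2, 2]. The numeric values of theta and z below are illustrative only.

import numpy as np

def rot(theta):
    # 2x2 rotation matrix R(theta)
    return np.array([[np.cos(theta), -np.sin(theta)],
                     [np.sin(theta),  np.cos(theta)]])

theta = np.array([0.0, 0.5, -0.5])                    # policy parameters (angles), example values
z = np.array([1.0, 0.0, 0.5, 0.5, -1.0, 0.2])         # flattened local state, 3 blocks of 2
R = np.concatenate([rot(t) for t in theta], axis=1)   # 2x6 block matrix [R0 R1 R2]
mu = R @ z                                            # mean of the action distribution
Sigma = np.eye(2) * 0.3                               # constant covariance, as in NormalPolicy
a = np.clip(np.random.multivariate_normal(mu, Sigma), -2, 2)
print(mu, a)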