├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── SAC_agents.py
├── benchmark_agent.py
├── control_test.py
├── drone_env.py
├── images
│   ├── E1000_n10_DiscretePolicy4_b02.png
│   ├── E1500_n5_DiscretePolicy8_b02.png
│   ├── collisions_hist.pdf
│   └── delta_effect.pdf
├── learning_Q_test.py
├── matlab
│   ├── cost_field.m
│   ├── derivations_22ndPol.m
│   ├── derivations_2ndPol.m
│   ├── derivations_check.m
│   ├── distance_def.m
│   ├── images
│   │   ├── dij.eps
│   │   └── dij.fig
│   ├── normal_multivariate_pdf.m
│   ├── optimal_traj.m
│   └── sol.mat
├── models
│   ├── E1000_n10_DiscretePolicy4_b02-A2Cactors.pth
│   ├── E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth
│   ├── E500_M30_LR1e4_badInitialState-actors.pth
│   ├── E500_M30_LR1e4_badInitialState-critics.pth
│   ├── cont_preloaded-A2Cactors.pth
│   ├── cont_preloaded-A2Ccritics.pth
│   ├── deltas
│   │   ├── deltas0.01_softmax16-A2Cactors.pth
│   │   ├── deltas0.01_softmax16-A2Ccritics.pth
│   │   ├── deltas0.1_softmax16-A2Cactors.pth
│   │   ├── deltas0.1_softmax16-A2Ccritics.pth
│   │   ├── deltas0.2_softmax16-A2Cactors.pth
│   │   ├── deltas0.2_softmax16-A2Ccritics.pth
│   │   ├── deltas0.5_softmax16-A2Cactors.pth
│   │   ├── deltas0.5_softmax16-A2Ccritics.pth
│   │   ├── deltas0.8_softmax16-A2Cactors.pth
│   │   ├── deltas0.8_softmax16-A2Ccritics.pth
│   │   ├── deltas1.5_softmax16-A2Cactors.pth
│   │   ├── deltas1.5_softmax16-A2Ccritics.pth
│   │   ├── deltas1_softmax16-A2Cactors.pth
│   │   ├── deltas1_softmax16-A2Ccritics.pth
│   │   ├── deltas2.43_softmax16-A2Cactors.pth
│   │   ├── deltas2.43_softmax16-A2Ccritics.pth
│   │   ├── deltas2.5_softmax16-A2Cactors-old (1).pth
│   │   ├── deltas2.5_softmax16-A2Ccritics-old (2).pth
│   │   ├── deltas2_softmax16-A2Cactors.pth
│   │   └── deltas2_softmax16-A2Ccritics.pth
│   ├── deltas2.5_softmax16-A2Cactors.pth
│   ├── deltas2.5_softmax16-A2Ccritics.pth
│   ├── discrete-A2Cactors.pth
│   ├── discrete-A2Ccritics.pth
│   ├── final
│   │   ├── cont_n5-A2Cactors.pth
│   │   ├── cont_n5-A2Ccritics.pth
│   │   ├── simple-A2Cactors.pth
│   │   ├── simple-A2Ccritics.pth
│   │   ├── softmax8_n4-A2Cactors.pth
│   │   ├── softmax8_n4-A2Ccritics.pth
│   │   ├── softmax8_n5-A2Cactors.pth
│   │   ├── softmax8_n5-A2Ccritics.pth
│   │   ├── softmax8_n8-A2Cactors.pth
│   │   └── softmax8_n8-A2Ccritics.pth
│   ├── n5_E1500_Advantage-actors.pth
│   ├── n5_E1500_Advantage-critics.pth
│   ├── softmax8_n5-A2Cactors.pth
│   ├── softmax8_n5-A2Ccritics.pth
│   └── trained_E1000_M50_LR001-actors.pth
├── policy_performance_variables_1
├── policy_performance_variables_2
├── profile.bat
├── spec-file.txt
├── train_problem.py
├── utils.py
├── variables_with_delta_change_1
├── variables_with_delta_change_2
├── variables_with_delta_change_3
└── variables_with_delta_change_4
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # my own things
2 | notes.md
3 |
4 | # Folders
5 | videos/
6 | *.prof
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | *.py,cover
57 | .hypothesis/
58 | .pytest_cache/
59 | cover/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 |
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 |
75 | # Scrapy stuff:
76 | .scrapy
77 |
78 | # Sphinx documentation
79 | docs/_build/
80 |
81 | # PyBuilder
82 | .pybuilder/
83 | target/
84 |
85 | # Jupyter Notebook
86 | .ipynb_checkpoints
87 |
88 | # IPython
89 | profile_default/
90 | ipython_config.py
91 |
92 | # pyenv
93 | # For a library or package, you might want to ignore these files since the code is
94 | # intended to run in multiple environments; otherwise, check them in:
95 | # .python-version
96 |
97 | # pipenv
98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
101 | # install all needed dependencies.
102 | #Pipfile.lock
103 |
104 | # poetry
105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
106 | # This is especially recommended for binary packages to ensure reproducibility, and is more
107 | # commonly ignored for libraries.
108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
109 | #poetry.lock
110 |
111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
112 | __pypackages__/
113 |
114 | # Celery stuff
115 | celerybeat-schedule
116 | celerybeat.pid
117 |
118 | # SageMath parsed files
119 | *.sage.py
120 |
121 | # Environments
122 | .env
123 | .venv
124 | env/
125 | venv/
126 | ENV/
127 | env.bak/
128 | venv.bak/
129 |
130 | # Spyder project settings
131 | .spyderproject
132 | .spyproject
133 |
134 | # Rope project settings
135 | .ropeproject
136 |
137 | # mkdocs documentation
138 | /site
139 |
140 | # mypy
141 | .mypy_cache/
142 | .dmypy.json
143 | dmypy.json
144 |
145 | # Pyre type checker
146 | .pyre/
147 |
148 | # pytype static type analyzer
149 | .pytype/
150 |
151 | # Cython debug symbols
152 | cython_debug/
153 |
154 | # PyCharm
155 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
156 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
157 | # and can be added to the global gitignore or merged into this file. For a more nuclear
158 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
159 | #.idea/
160 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Andreu Matoses
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scalable Collision Avoidance RL
2 |
3 |
4 | Multi-agent scalable reinforcement learning for formation control with collision avoidance. The paper and more information can be found on the [website 🌍](https://andreumatoses.github.io/research/msc-thesis-marl)
5 |
6 |
7 |
8 |
9 | This is part of my Master's Thesis at the Royal Institute of Technology (KTH), Stockholm, Sweden (URL available soon). The scalable part of this work is inspired by the paper [Scalable Reinforcement Learning for Multi-Agent Networked Systems](https://arxiv.org/abs/1912.02906). The approach presented in the thesis exploits the structure of the designed reward to build a ∆-disk local approximation of the individual Q functions and policy gradients.
10 |
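In compact form, the localized policy-gradient estimate implemented in [SAC_agents.py](SAC_agents.py) (method `train_designed_policy`, assuming `compute_grad` in [utils.py](utils.py) returns the score function $\nabla_{\theta_i}\log\pi_{\theta_i}$) is, roughly,

$$
\hat g_i \;=\; \sum_{t=0}^{T-1} \gamma^{t}\, \nabla_{\theta_i}\log \pi_{\theta_i}\!\big(a_i^t \mid z_i^t\big)\;\frac{1}{n}\sum_{j \in \mathcal{N}_i^{\Delta}(t)} \Big(G_j^t - V_{\omega_i}(z_i^t)\Big),
$$

where $z_i$ is the localized ∆-disk state of agent $i$, $\mathcal{N}_i^{\Delta}(t)$ its ∆-disk neighbourhood (agent $i$ included), $G_j^t$ the Monte Carlo return of neighbour $j$, and agent $i$'s own critic $V_{\omega_i}$ is used as baseline.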
11 | ## Important Scripts
12 |
13 | - [drone_env.py](drone_env.py): Script containing the environment class and its methods. The main methods follow the structure of the OpenAI Gym RL environments, such as `.step()` and `.reset()`.
14 | - [train_problem.py](train_problem.py): Script containing the training of the main problem.
15 | - [SAC_agents.py](SAC_agents.py): Script containing the agent classes and their policy classes.
16 | - [benchmark_agent.py](benchmark_agent.py): Script to run trained agents and benchmark their performance.
17 |
18 | ## Training of the scalable agents
19 | The schema of the algorithm used is presented below. One scalable actor-critic agent is trained for each robotic agent, for a total of *n* agents.
20 |
21 | 
22 |
23 | ## Structure of the training script
24 | 
25 |
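For reference, below is a minimal sketch of that training loop using the interfaces defined in this repository (`drone_env.drones`, `SA2CAgents`, `ExperienceBuffers`). The hyperparameter values are illustrative only; the full version is [train_problem.py](train_problem.py).

```python
import numpy as np
import drone_env
from SAC_agents import SA2CAgents
from utils import ExperienceBuffers

n_agents = 5
deltas = np.ones(n_agents) * 1.0                       # ∆-disk radius per agent
env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5],
                       end_formation="O", deltas=deltas, simplify_zstate=True)
agents = SA2CAgents(n_agents=env.n_agents,
                    dim_local_state=env.local_state_space,
                    dim_local_action=env.local_action_space,
                    discount=0.99, epochs=1)

for episode in range(1500):
    buffers = ExperienceBuffers(env.n_agents)          # one experience buffer per agent
    finished = False
    while not finished:
        z_states, Ni = env.z_states, env.Ni            # localized states and neighbour sets
        actions = agents.forward(z_states, Ni)         # one action per agent
        _, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions)
        buffers.append(z_states, actions, rewards, new_z, Ni, finished)
    agents.train_NN(buffers)                           # critic + actor updates from the episode
    env.reset(renew_obstacles=False)

agents.save("my_run")   # writes models/my_run-A2Cactors.pth and models/my_run-A2Ccritics.pth
```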
26 | ## Relevant results
27 | ### Softmax discrete policy
28 | The individual policy for each agent has the form:
29 |
30 | 
31 |
32 | Some examples of trajectories obtained after successful training are shown below:
33 |
34 | 
35 |
36 | ### Continuous normally distributed policy
37 | The individual policy for each agent has the form:
38 |
39 | 
40 |
41 | Some examples of trajectories obtained after successful training are shown below:
42 |
43 | 
44 |
45 | ## Comparison of tested policies
46 | The number of collisions displayed in the results is defined as an agent intersecting a neighbour's collision radius in a given time step. Each agent counts collisions separately, so two agents colliding is counted as two different collisions, and a collision that lasts for several time steps is also counted as several collisions.
47 |
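The sketch below reproduces just that counting rule (the full logic, including the clipping against $\hat d_i$, lives in `distance_data` of [drone_env.py](drone_env.py)); the positions are illustrative:

```python
import numpy as np

def count_collisions(x, radii):
    """Collisions at one time step: agent i registers a collision with j when
    ||x_i - x_j|| - l_i - l_j <= 0. Ordered pairs are counted separately, so a
    single contact between two agents adds 2 to the total."""
    n = len(x)
    collisions = 0
    for i in range(n):
        for j in range(n):
            if i != j and np.linalg.norm(x[i] - x[j]) - radii[i] - radii[j] <= 0:
                collisions += 1
    return collisions

# Illustrative example: agents 0 and 1 overlap, agent 2 is clear
x = np.array([[1.0, 1.0], [1.15, 1.0], [3.0, 3.0]])
radii = np.full(3, 0.1)   # 10 cm drones, as in drone_env.init_agents
print(count_collisions(x, radii))   # -> 2
```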
48 | Percentage of simulations that achieve each number of collisions for each of the three tested policies, n = 5. The percentages for more than 14 collisions (under 1%) have been omitted.
49 |
50 | 
51 |
52 | Effect of the ∆-disk radius (definition 12 in the thesis) on the global reward and number of collisions, averaged over 2000 runs of the trained policies, for the discrete softmax NN policy.
53 |
54 | 
55 |
56 | The results also show that the average reward starts to decrease after a certain value of ∆i, in this case around 1, and the average number of collisions increases sharply back towards the values obtained when the agent has nearly no vision. This unexpected behaviour is the result of a significant increase in the complexity of the maximization problem that the policy gradient is trying to solve: taking into account more neighbours, at larger distances, increases the variance of the approximated gradient estimate and, as a result, the policy is not able to find improvements. Indeed, for the final case of ∆i ≈ d̂i the policy is not able to converge during training.
57 |
58 |
--------------------------------------------------------------------------------
/SAC_agents.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | import torch.optim as optim
6 | from collections import deque, namedtuple
7 | from utils import *
8 |
9 | class RandomAgent:
10 | ''' Agent taking actions uniformly at random '''
11 | def __init__(self, n_actions: int):
12 | # super(RandomAgent, self).__init__(n_actions)
13 | self.n_actions = n_actions
14 |
15 | def forward(self, state: np.ndarray) -> np.ndarray:
16 | ''' Compute a random action in [-1, 1]
17 |
18 | Returns:
19 | action (np.ndarray): array of float values containing the
20 | action.
21 | '''
22 | return np.clip(-1 + 2 * np.random.rand(self.n_actions), -1, 1)
23 |
24 | class TrainedAgent:
25 | ''' Agent that loads and follows a learned policy/critic
26 | '''
27 | def __init__(self, critics_name:str, actors_name:str, n_agents = "auto", discount = 0.99):
28 |
29 | file_name_critics = os.path.join("models", critics_name)
30 | # Load critic
31 | try:
32 | criticsNN = torch.load(file_name_critics)
33 | print(f'Loaded Critic, n_agents = {len(criticsNN)}, discount = {discount}. Network model[0]: {criticsNN[0]}')
34 | except:
35 | print(f'File {file_name_critics} not found!')
36 | exit(-1)
37 |
38 | self.criticsNN = criticsNN
39 | self.critics_name = critics_name
40 |
41 | if n_agents == "auto":
42 | self.n_agents = len(criticsNN)
43 | else:
44 | self.n_agents = n_agents
45 |
46 | # load actor
47 | file_name_actors = os.path.join("models", actors_name)
48 |
49 | try:
50 | actors = torch.load(file_name_actors)
51 | print(f'Loaded actors, n_agents = {len(actors)}, discount = {discount}. Type: {type(actors[0])}')
52 | except:
53 | print(f'File {file_name_actors} not found!')
54 | exit(-1)
55 | self.actors = actors
56 | self.actors_name = actors_name
57 |
58 | self.discount = discount # to benchmark the critic
59 |
60 | def forward(self, z_states: list, N:list):
61 | """ Returns:
62 | actions (list): one action (np.ndarray) per agent, sampled from
63 | the loaded actors given the localized states z_i and the
64 | neighbour sets N_i
65 | """
66 | actions = []
67 | if type(self.actors[0]) is NormalPolicy or type(self.actors[0]) is NormalActorNN or type(self.actors[0]) is DiscreteSoftmaxNN:
68 | # z_state in this case
69 | for i in range(self.n_agents):
70 | z_state = z_states[i].flatten()
71 | Ni = N[i]
72 | if i < len(self.actors):
73 | actor = self.actors[i]
74 | else:
75 | actor = self.actors[0]
76 |
77 | action = actor.sample_action(z_state, Ni)
78 | actions.append(action)
79 | else:
80 | print(f"Error type of policy {type(self.actors[0])}")
81 |
82 | return actions
83 |
84 | def benchmark_cirtic(self, buffers: deque, only_one_NN = False):
85 |
86 | Gts = deque() # for debug, delete after
87 | V_approxs = deque() # for debug, delete after
88 | criticNN= self.criticsNN[0]
89 |
90 | for i in range(self.n_agents):
91 | # NN for this agent:
92 | if not only_one_NN:
93 | if i < len(self.criticsNN):
94 | criticNN = self.criticsNN[i]
95 | else:
96 | criticNN = self.criticsNN[0]
97 |
98 | # separate data from experience buffer
99 | buffer = buffers.buffers[i]
100 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
101 |
102 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s]
103 | # inputs = np.column_stack((states,actions))
104 | # inputs = torch.tensor(inputs, dtype=torch.float32)
105 | inputs = torch.tensor(np.array(states), dtype=torch.float32)
106 |
107 | # Calculate the simulated Q value (target), Monte carlo Gt
108 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T)
109 | T = len(rewards)
110 | Gt_array = np.zeros([T])
111 | Gt_array[-1] = rewards[-1]
112 | for t in range(T-2,-1,-1):
113 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t]
114 |
115 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze()
116 |
117 | # value function: # calculate the approximated Q(s,a) = NN(input)
118 | V_approx = criticNN(inputs).squeeze()
119 |
120 | V_approxs.append(V_approx.detach().numpy())
121 | Gts.append(Gt_array) # for debug
122 |
123 | # Gts is the simulated Q(st,at) values for each agent. Q(s,a)-V(s) ~= Gt-V(st) = A(s,a)
124 | return Gts, V_approxs
125 |
126 | class SA2CAgents:
127 |
128 | def __init__(self, n_agents, dim_local_state, dim_local_action, discount, epochs, learning_rate_critic = 10**(-3), learning_rate_actor = 10**(-3)) -> None:
129 | '''* dim_local_state is the total size of the localized vector that the input of the Q and pi approximations use, i.e (k+1)*dim'''
130 |
131 | self.n_agents = n_agents
132 | self.dim_local_state = dim_local_state
133 | self.dim_local_action = dim_local_action
134 | # self.policy_type = policy_type # What kind of policy (NN, stochastic normal dist, etc...)
135 | self.discount = discount
136 | self.epochs = epochs
137 |
138 | # preload_NN = "models\\final\\cont_n5"
139 | preload_NN = None
140 | # Define policy (actor)
141 | if preload_NN is None:
142 | # self.actors = [NormalPolicy(dim_local_state,dim_local_action) for i in range(n_agents)]
143 | self.actors = [DiscreteSoftmaxNN(dim_local_state, lr = learning_rate_actor, n_actions=16) for i in range(n_agents)]
144 | # self.actors = [NormalActorNN(dim_local_state, lr = learning_rate_actor, dim_action=dim_local_action) for i in range(n_agents)]
145 | self.learning_rate_actor = learning_rate_actor
146 |
147 | # List of NN that estimate Q (or V if we use advantage)
148 | # self.criticsNN = [CriticNN(dim_local_state + dim_local_action, output_size=1) for i in range(n_agents)]
149 | self.criticsNN = [CriticNN(dim_local_state, output_size=1) for i in range(n_agents)]
150 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)]
151 | else:
152 | try:
153 | actors = torch.load(preload_NN + "-A2Cactors.pth")
154 | print(f'Loaded actors, n_agents = {len(actors)}, discount = {discount}. Type: {type(actors[0])}')
155 | except:
156 | print(f'File {preload_NN + "-A2Cactors.pth"} not found!')
157 | exit(-1)
158 | self.actors = actors
159 | self.learning_rate_actor = learning_rate_actor
160 | try:
161 | criticsNN = torch.load(preload_NN + "-A2Ccritics.pth")
162 | print(f'Loaded Critic, n_agents = {len(criticsNN)}, discount = {discount}. Network model[0]: {criticsNN[0]}')
163 | except:
164 | print(f'File {preload_NN + "-A2Ccritics.pth"} not found!')
165 | exit(-1)
166 | self.criticsNN = criticsNN
167 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)]
168 |
169 |
170 | def forward(self, z_states, N) -> list:
171 | ''' Function that calculates the actions to take from the z_states list (control law)
172 | actions: list of row vectors [u1^T, u2^T,...]'''
173 |
174 | actions = deque()
175 | for i in range(self.n_agents):
176 | z_state = z_states[i].flatten()
177 | actions.append(self.actors[i].sample_action(z_state, N[i]))
178 | # actions.append(self.actors[i].sample_action(z_state, [1]))
179 |
180 | return actions
181 |
182 | def train_designed_policy(self, buffers: deque, actor_lr = None, return_grads = False):
183 | epochs = self.epochs
184 |
185 | if actor_lr is not None:
186 | self.learning_rate_actor = actor_lr
187 |
188 | Gts = deque() # for debug, delete after -> access data Gts[i][t]
189 |
190 | # CRITIC LOOP
191 | for i in range(self.n_agents):
192 | # NN for this agent:
193 | criticNN = self.criticsNN[i]
194 | critic_optimizer = self.critic_optimizers[i]
195 |
196 | # separate data from experience buffer
197 | buffer = buffers.buffers[i]
198 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
199 |
200 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s]
201 | # inputs = np.column_stack((states,actions))
202 | # inputs = torch.tensor(inputs, dtype=torch.float32)
203 | inputs = torch.tensor(np.array(states), dtype=torch.float32)
204 |
205 | # Calculate the simulated Q value (target), Monte carlo Gt
206 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T)
207 | T = len(rewards)
208 | Gt_array = np.zeros([T])
209 | Gt_array[-1] = rewards[-1]
210 | for t in range(T-2,-1,-1):
211 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t]
212 |
213 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze()
214 | Gts.append(Gt_array) # for debug
215 |
216 | ### Perform omega (critic) update:
217 | # Set gradient to zero
218 | critic_optimizer.zero_grad()
219 | # value function: # calculate the approximated V(s) = NN(input)
220 | V_approx = criticNN(inputs).squeeze()
221 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0
222 | loss = nn.functional.mse_loss(V_approx, Gt)
223 | # Compute gradient
224 | loss.backward()
225 | # Clip gradient norm to avoid infinite gradient
226 | nn.utils.clip_grad_norm_(criticNN.parameters(), max_norm=10)
227 | # Update
228 | critic_optimizer.step()
229 |
230 | # ACTOR LOOP
231 | grad_norms = []
232 | gi_norms = []
233 | for i in range(self.n_agents):
234 | # to access buffer data: buffers.buffers[i][t].action, namedtuple('experience', ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished'])
235 | # Gt: Gts[i][t]
236 | actor = self.actors[i]
237 | gi = 0 #initialize to 0
238 |
239 | for t in range(T):
240 | zit = buffers.buffers[i][t].z_state
241 | ait = buffers.buffers[i][t].action
242 | Nit = buffers.buffers[i][t].Ni
243 |
244 | grad_actor = actor.compute_grad(zit,ait, Nit)
245 | # PUT Nit HERE INSTEAD of [1,2,3]
246 | # grad_actor = actor.clip_grad_norm(grad_actor,clip_norm=100)
247 |
248 | # Qj_sum = 0
249 | Advantage_j_sum = 0
250 | input_tensor = torch.tensor(zit, dtype=torch.float32)
251 | # Baseline is the Vi(s) for current agent. reduce variance and complexity
252 | Vi_baseline = self.criticsNN[i](input_tensor).detach().numpy()[0]
253 | for j in Nit: # i included here
254 | zjt = buffers.buffers[j][t].z_state
255 | ajt = buffers.buffers[j][t].action
256 | # Q_input_tensor = torch.tensor(np.hstack((zjt,ajt)), dtype=torch.float32)
257 | # Qj = self.criticsNN[j](Q_input_tensor).detach().numpy()[0]
258 | # input_tensor = torch.tensor(zjt, dtype=torch.float32)
259 | # Vj = self.criticsNN[j](input_tensor).detach().numpy()[0]
260 | Advantage_j_sum += (Gts[j][t] - Vi_baseline)
261 | # Qj_sum += Gts[j][t]
262 |
263 | # gi += self.discount**t * 1/self.n_agents* grad_actor * Qj_sum
264 | gi += self.discount**t * 1/self.n_agents* grad_actor * Advantage_j_sum
265 |
266 | # Update policy parameters with approx gradient gi (clipped to avoid infinity gradients)
267 | gi = actor.clip_grad_norm(gi, clip_norm=100)
268 | # MAKE SURE TO CLIP THE PARAMS from 0 to 2*Pi
269 | actor.parameters = actor.parameters + self.learning_rate_actor*gi
270 | # actor.parameters = np.clip(actor.parameters + self.learning_rate_actor*gi, -2*np.pi, 2*np.pi)
271 |
272 | # print(f"grad norms gi={np.linalg.norm(gi.flatten())}")
273 | if return_grads:
274 | grad_norms.append(np.linalg.norm(grad_actor.flatten()))
275 | gi_norms.append(np.linalg.norm(gi.flatten()))
276 |
277 | if return_grads:
278 | return grad_norms, gi_norms
279 |
280 | def train_NN(self, buffers: deque, actor_lr = None):
281 | epochs = self.epochs
282 |
283 | if actor_lr is not None:
284 | self.learning_rate_actor = actor_lr
285 |
286 | Gts = deque() # for debug, delete after -> access data Gts[i][t]
287 | T = len(buffers)
288 |
289 | # CRITIC LOOP
290 | for i in range(self.n_agents):
291 | # NN for this agent:
292 | criticNN = self.criticsNN[i]
293 | critic_optimizer = self.critic_optimizers[i]
294 |
295 | # separate data from experience buffer
296 | buffer = buffers.buffers[i]
297 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
298 |
299 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s]
300 | inputs = torch.tensor(np.array(states), dtype=torch.float32)
301 |
302 | # Calculate the simulated Q value (target), Monte carlo Gt
303 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T)
304 | Gt_array = np.zeros([T])
305 | Gt_array[-1] = rewards[-1]
306 | for t in range(T-2,-1,-1):
307 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t]
308 |
309 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze()
310 | Gts.append(Gt_array) # for debug
311 |
312 | ## Perform omega (critic) update:
313 | # Set gradient to zero
314 | critic_optimizer.zero_grad()
315 | # value function: # calculate the approximated V(s) = NN(input)
316 | V_approx = criticNN(inputs).squeeze()
317 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0
318 | loss = nn.functional.mse_loss(V_approx, Gt)
319 | # Compute gradient
320 | loss.backward()
321 | # Clip gradient norm to avoid infinite gradient
322 | nn.utils.clip_grad_norm_(criticNN.parameters(), max_norm=10)
323 | # Update
324 | critic_optimizer.step()
325 |
326 | # ACTOR LOOP
327 | for i in range(self.n_agents):
328 | # to access buffer data: buffers.buffers[i][t].action, namedtuple('experience', ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished'])
329 | # Gt: Gts[i][t]
330 | actor = self.actors[i]
331 | actor_loss = torch.tensor(0, dtype=torch.float32)
332 |
333 | for t in range(T):
334 | zit = buffers.buffers[i][t].z_state
335 | ait = buffers.buffers[i][t].action
336 | Nit = buffers.buffers[i][t].Ni
337 |
338 | log_prob_tensor = actor.log_p_of_a(zit,ait)
339 |
340 | Advantage_j_sum = 0
341 | input_tensor = torch.tensor(zit, dtype=torch.float32)
342 | # Baseline is the Vi(s) for current agent. reduce variance and complexity
343 | Vi_baseline = self.criticsNN[i](input_tensor).detach().numpy()[0]
344 | # Advantage_j_sum += (Gts[i][t] - Vi_baseline)
345 | for j in Nit: # i included here
346 | Advantage_j_sum += (Gts[j][t] - Vi_baseline)
347 | # Advantage_j_sum += (Gts[j][t])
348 |
349 | # gi += self.discount**t * 1/self.n_agents* grad_actor * Qj_sum
350 | # actor_loss = actor_loss - self.discount**t * 1/self.n_agents* log_prob_tensor * Advantage_j_sum
351 | actor_loss = actor_loss -log_prob_tensor * 1/self.n_agents * self.discount**t * Advantage_j_sum
352 |
353 | # Update policy parameters
354 | actor.optimizer.zero_grad()
355 | actor_loss.backward()
356 | nn.utils.clip_grad_norm_(actor.parameters(), max_norm=10)
357 | actor.optimizer.step()
358 |
359 |
360 | def benchmark_cirtic(self, buffers: deque, only_one_NN = False):
361 |
362 | Gts = deque() # for debug, delete after
363 | V_approxs = deque() # for debug, delete after
364 | criticNN= self.criticsNN[0]
365 |
366 | for i in range(self.n_agents):
367 | # NN for this agent:
368 | if not only_one_NN:
369 | criticNN = self.criticsNN[i]
370 |
371 | # separate data from experience buffer
372 | buffer = buffers.buffers[i]
373 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
374 |
375 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a)
376 | # inputs = np.column_stack((states,actions))
377 | inputs = torch.tensor(np.array(states), dtype=torch.float32)
378 | ## Actor update
379 |
380 | # Calculate the simulated Q value (target), Monte carlo Gt
381 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T)
382 | T = len(rewards)
383 | Gt_array = np.zeros([T])
384 | Gt_array[-1] = rewards[-1]
385 | for t in range(T-2,-1,-1):
386 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t]
387 |
388 | Gt = torch.tensor(Gt_array, dtype=torch.float32).squeeze()
389 |
390 | # value function: # calculate the approximated Q(s,a) ~= Gt = V(s) + A(s,a) => Gt-V(s) = A(s,a)
391 | V_approx = criticNN(inputs).squeeze()
392 | V_approxs.append(V_approx.detach().numpy())
393 | # Q_approxs.append(Gt_array)
394 | Gts.append(Gt_array) # for debug
395 |
396 | # Gts is the simulated Q(st,at) values for each agent. Q(s,a)-V(s) ~= Gt-V(st) = A(s,a)
397 | return Gts, V_approxs
398 |
399 | def save(self,filename = "network"):
400 | folder ="models"
401 | cirtic_name = filename + "-A2Ccritics.pth"
402 | actors_name = filename + "-A2Cactors.pth"
403 |
404 | torch.save(self.criticsNN, os.path.join(folder,cirtic_name))
405 | print(f'Saved Critic NNs as {cirtic_name}')
406 | torch.save(self.actors, os.path.join(folder,actors_name))
407 | print(f'Saved Actors List as {actors_name}')
408 |
409 |
410 | class SPPOAgents:
411 | def __init__(self, n_agents, dim_local_state, dim_local_action, discount, epochs, learning_rate_critic = 10**(-3), learning_rate_actor = 10**(-3), epsilon = 0.2) -> None:
412 | '''* dim_local_state is the total size of the localized vector that the input of the Q and pi approximations use, i.e (k+1)*dim'''
413 |
414 | self.n_agents = n_agents
415 | self.dim_local_state = dim_local_state
416 | self.dim_local_action = dim_local_action
417 | # self.policy_type = policy_type # What kind of policy (NN, stochastic normal dist, etc...)
418 | self.discount = discount
419 | self.epochs = epochs
420 | self.epsilon = epsilon
421 |
422 | # Define actor networks
423 | self.actorsNN = [NormalActorNN(dim_local_state, dim_action=dim_local_action) for i in range(n_agents)]
424 | self.learning_rate_actor = learning_rate_actor
425 | self.actor_optimizers = [optim.Adam(self.actorsNN[i].parameters(),lr = learning_rate_actor) for i in range(n_agents)]
426 |
427 |
428 | # List of NN that estimate Q (or V if we use advantage)
429 | # self.criticsNN = [CriticNN(dim_local_state + dim_local_action, output_size=1) for i in range(n_agents)]
430 | self.criticsNN = [CriticNN(dim_local_state, output_size=1) for i in range(n_agents)]
431 | self.critic_optimizers = [optim.Adam(self.criticsNN[i].parameters(),lr = learning_rate_critic) for i in range(n_agents)]
432 |
433 | def forward(self, z_states, N) -> list:
434 | ''' Function that calculates the actions to take from the z_states list (control law)
435 | actions: list of row vectors [u1^T, u2^T,...]'''
436 |
437 | actions = deque()
438 | for i in range(self.n_agents):
439 | z_state = z_states[i].flatten()
440 | actorNN = self.actorsNN[i]
441 |
442 | state_tensor = torch.tensor(z_state, dtype=torch.float32)
443 | mu_tensor,sigma_tensor = actorNN(state_tensor)
444 |
445 | # Normally distributed value with the mu and sigma (std^2) from ActorNN
446 | std = np.sqrt(sigma_tensor.detach().numpy())
447 | action = np.random.normal(mu_tensor.detach().numpy(),std)
448 |
449 | # Action must be between -1, 1
450 | actions.append(np.clip(action,-1,1))
451 |
452 | return actions
453 |
454 | def train(self, buffers: deque, actor_lr = None, return_grads = False):
455 | epochs = self.epochs
456 |
457 | if actor_lr is not None:
458 | self.learning_rate_actor = actor_lr
459 |
460 | T = len(buffers)
461 | # Gts = deque() # for debug, delete after -> acces data Gts[i][t]
462 | Git = np.zeros([self.n_agents, T]) # Git[i,t]
463 | Qjsum_estim = np.zeros([self.n_agents, T]) # Qjsum_estim[i,t]
464 | p_old_it = np.zeros([self.n_agents, T]) # p_old_it[i,t]
465 |
466 |
467 | # Agents LOOP, to create required variables
468 | for i in range(self.n_agents):
469 | # separate data from experience buffer
470 | buffer = buffers.buffers[i]
471 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
472 |
473 | # Calculate the simulated Q value (target), Monte carlo Gt
474 | # Going backwards, G(t) = gamma * G(t+1) + r(t), with G(T)=r(T)
475 | # T = len(rewards)
476 | Gt_array = np.zeros([T])
477 | Gt_array[-1] = rewards[-1]
478 | for t in range(T-2,-1,-1):
479 | Gt_array[t] = Gt_array[t+1]*self.discount + rewards[t]
480 |
481 | Git[i,:] = Gt_array
482 |
483 | # Calculate the advantage estimator (attempt), and old p of a
484 | for i in range(self.n_agents):
485 | # separate data from experience buffer
486 | buffer = buffers.buffers[i]
487 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
488 |
489 | # Create input tensor, This one: input = [s,a] tensor -> Q(s,a) [200x8]. If instead V(s), input = [s]
490 | # inputs = torch.tensor(np.column_stack((states,actions)), dtype=torch.float32)
491 | states_tensor = torch.tensor(np.array(states), dtype=torch.float32)
492 | actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32)
493 |
494 | p_old_it[i,:] = self.probability_of_ai(states_tensor, actions_tensor, i).detach().numpy()
495 | Vi_baselines = self.criticsNN[i](states_tensor).squeeze().detach().numpy()
496 | # advantage_estim[i,:] = -Vi_baselines
497 |
498 | for t in range(T):
499 | Nit = buffers.buffers[i][t].Ni
500 | for j in Nit: # i included here, only uses local i
501 | Qjsum_estim[i,t] += Git[j,t]
502 |
503 |
504 | ### Training LOOP, per agent: ###
505 | for i in range(self.n_agents):
506 | buffer = buffers.buffers[i]
507 | states, actions, rewards, new_states, Ni, finished = zip(*buffer)
508 | Gt = torch.tensor(Git[i,:],dtype=torch.float32)
509 | states_tensor = torch.tensor(np.array(states), dtype=torch.float32)
510 | actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32)
511 |
512 | Vi_baselines = self.criticsNN[i](states_tensor).squeeze()
513 | Qjsum = torch.tensor(Qjsum_estim[i,:], dtype=torch.float32)
514 | Adv = (Qjsum - Vi_baselines).detach() # advantage estimate; detached so the actor update treats it as a constant
515 |
516 | # criticNN = self.criticsNN[i]
517 | # critic_optimizer = self.critic_optimizers[i]
518 | # actorNN = self.actorsNN[i]
519 | # actor_optimizer = self.actor_optimizers[i]
520 |
521 | # Perform training for epochs
522 | for m in range(epochs):
523 |
524 | ### CRITIC update:
525 | # Set gradient to zero
526 | self.critic_optimizers[i].zero_grad()
527 | # value function: # calculate the approximated V(s) = NN(input)
528 | V_approx = self.criticsNN[i](states_tensor).squeeze()
529 | # Compute MSE loss, as E[Gt-V(s) = A(s,a)] = 0
530 | loss = nn.functional.mse_loss(V_approx, Gt)
531 | # Compute gradient
532 | loss.backward()
533 | # Clip gradient norm to avoid infinite gradient
534 | nn.utils.clip_grad_norm_(self.criticsNN[i].parameters(), max_norm=10)
535 | # Update
536 | self.critic_optimizers[i].step()
537 |
538 | ### ACTOR update
539 | # Set gradient to zero
540 | self.actor_optimizers[i].zero_grad()
541 | pi_old = torch.tensor(p_old_it[i,:], dtype=torch.float32)
542 | # Compute new advantage with updated critic
543 | # Compute r_theta ratio of probabilities
544 | current_pi = self.probability_of_ai(states_tensor,actions_tensor, i)
545 | r_theta = current_pi/pi_old
546 | # Compute loss function - 1/N (min...)
547 | left_min = r_theta * Adv
548 | right_min = torch.clamp(r_theta, 1 - self.epsilon, 1 + self.epsilon) * Adv #clipping method for tensors
549 | loss = - torch.mean(torch.min(left_min,right_min))
550 | # compute gradient
551 | loss.backward()
552 | # Clip gradient norm to avoid infinite gradient
553 | nn.utils.clip_grad_norm_(self.actorsNN[i].parameters(), max_norm=10)
554 | # Update
555 | self.actor_optimizers[i].step()
556 |
557 |
558 | def probability_of_ai(self,states_i,actions_i, agent:int):
559 | """ all variables are tensors. the time vector for current agent=i
560 | assume that the two action components are not correlated,
561 | thus P of both happening is p1*p2
562 | """
563 | i = agent
564 | mu , sigma = self.actorsNN[i](states_i)
565 |
566 | # p_tensor = (2*np.pi*sigma**2)**(-1/2) * torch.exp(-(actions-mu)**2/(2*sigma**2))
567 | # p = p_tensor[:,0]*p_tensor[:,1] # assuming uncorrelated action
568 |
569 | p1 = torch.pow(2 * np.pi * sigma[:,0], -0.5) * torch.exp(-(actions_i[:,0] - mu[:,0])**2 / (2 * sigma[:,0]))
570 | p2 = torch.pow(2 * np.pi * sigma[:,1], -0.5) * torch.exp(-(actions_i[:,1] - mu[:,1])**2 / (2 * sigma[:,1]))
571 | p = p1*p2
572 |
573 | return p
574 |
575 | def save(self,filename = "network"):
576 | folder ="models"
577 | cirtic_name = filename + "-PP0critics.pth"
578 | actors_name = filename + "-PP0actors.pth"
579 |
580 | torch.save(self.criticsNN, os.path.join(folder,cirtic_name))
581 | print(f'Saved Critic NNs as {cirtic_name}')
582 | torch.save(self.actorsNN, os.path.join(folder,actors_name))
583 | print(f'Saved Actors List as {actors_name}')
--------------------------------------------------------------------------------
/benchmark_agent.py:
--------------------------------------------------------------------------------
1 | from collections import deque, namedtuple
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from matplotlib.ticker import PercentFormatter
5 | import drone_env
6 | from drone_env import running_average, plot_rewards, plot_grads
7 | from tqdm import tqdm, trange
8 | from SAC_agents import *
9 | from utils import ExperienceBuffers
10 |
11 | plt.style.use('seaborn-dark-palette')
12 | tex_fonts = {
13 | # Use LaTeX to write all text
14 | # "text.usetex": True,
15 | "font.family": "sans-serif",
16 | # Use 10pt font in plots, to match 10pt font in document
17 | "axes.labelsize": 10,
18 | "font.size": 10,
19 | # Make the legend/label fonts a little smaller
20 | "legend.fontsize": 10,
21 | "xtick.labelsize": 10,
22 | "ytick.labelsize": 10
23 | }
24 | plt.rcParams.update(tex_fonts)
25 |
26 | ### Set up parameters ###
27 | model_name = "final\\softmax8_n8"
28 | n_agents = 8
29 | deltas = np.ones(n_agents)*1
30 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True)
31 | env.collision_weight = 0.2 # old 0.2
32 | print(env)
33 | # env.show()
34 |
35 | N_Episodes = 1500
36 | episodes_to_plot = [0]
37 |
38 | # Initialize variables
39 | total_collisions_per_episode = deque()
40 | total_reward_per_episode = deque()
41 | total_true_reward_per_episode = deque()
42 | total_t = deque()
43 | mean_advantage = np.zeros([env.n_agents, N_Episodes])
44 |
45 | # times = np.arange(0, T, step=drone_env.dt) + drone_env.dt
46 |
47 | agents = TrainedAgent(critics_name=model_name+"-A2Ccritics.pth", actors_name=model_name+"-A2Cactors.pth", n_agents=env.n_agents)
48 | print("### Running Trained agent (no learning)")
49 | print(f"Episodes = {N_Episodes}, max Time iterations = {drone_env.max_time_steps} (T = {drone_env.max_time_steps * drone_env.dt}s, dt = {drone_env.dt}s)")
50 | print(f"N of agents = {env.n_agents}, collision weight b = {env.collision_weight}")
51 |
52 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True)
53 | for episode in EPISODES:
54 |
55 | if episode+1 in episodes_to_plot:
56 | # reward_history = np.zeros([len(times), env.n_agents])
57 | trajectory = [env.state.copy()]
58 | z_trajectory = [env.z_states]
59 | total_episode_reward = 0
60 | total_true_episode_reward = 0
61 | total_episode_collisions = 0
62 | # env.show()
63 |
64 | buffers = ExperienceBuffers(env.n_agents)
65 |
66 | # SIMULATION OVER T
67 | t_iter = 0
68 | finished = False
69 | while not finished:
70 |
71 | state = env.state
72 | z_states = env.z_states
73 | Ni = env.Ni
74 |
75 | # calculate actions based on current state
76 | # actions = drone_env.gradient_control(state, env)
77 | # actions = drone_env.proportional_control(state, env)
78 | actions = agents.forward(z_states, Ni)
79 |
80 | # Update environment one time step with the actions
81 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions)
82 | # EXPERIENCE: [z_state, action, reward, next_z, Ni, finished]
83 | buffers.append(z_states, actions, rewards, new_z, Ni, finished)
84 |
85 | total_episode_reward += np.mean(rewards)
86 | total_true_episode_reward += np.mean(true_rewards)
87 | total_episode_collisions += n_collisions
88 |
89 | if episode+1 in episodes_to_plot:
90 | # reward_history[t_iter,:] = reward
91 | trajectory.append(new_state.copy())
92 | z_trajectory.append(new_z)
93 |
94 | t_iter +=1
95 |
96 | # END OF EPISODE
97 | # Append episode reward
98 | total_reward_per_episode.append(total_episode_reward)
99 | total_true_reward_per_episode.append(total_true_episode_reward)
100 | total_collisions_per_episode.append(total_episode_collisions)
101 | total_t.append(t_iter)
102 |
103 | # Test Critic values
104 | Q_simulated, V_approx = agents.benchmark_cirtic(buffers, only_one_NN=False)
105 | advantage = [np.mean(np.power(Q_simulated[i]-V_approx[i],1)) for i in range(env.n_agents)]
106 | mean_advantage[:,episode] = np.array([advantage])
107 |
108 | # print(f"Episode collisions = {total_episode_collisions}")
109 | # env.animate(trajectory,frame_time=0.1)
110 |
111 | # RESET ENVIRONMENT
112 | env.reset(renew_obstacles=False)
113 |
114 | # Set progress bar description with information
115 | average_reward = running_average(total_reward_per_episode, 50)[-1]
116 | average_true_reward = running_average(total_true_reward_per_episode, 50)[-1]
117 | average_collisions = running_average(total_collisions_per_episode, 50)[-1]
118 | average_t = running_average(total_t, 50)[-1]
119 | EPISODES.set_description(
120 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter} - Average: {average_reward:.1f}/{average_collisions:.2f}/{average_t}. True r={average_true_reward:.1f}.")
121 |
122 | # Plot current trajectory
123 |
124 | if episode+1 in episodes_to_plot:
125 | env.plot(trajectory, episode)
126 | env.animate(trajectory, z_trajectory, deltas, episode, name=f"trained-E{episode+1}", format="mp4")
127 | times = np.arange(0, t_iter)*drone_env.dt
128 | plt.figure()
129 | for i in range(env.n_agents):
130 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1)
131 | plt.plot(times,Q_simulated[i], label=f"i={i}, simulated Q (Gt)", color = agent_color)
132 | plt.plot(times,V_approx[i],"--" , label=f"i={i}, approx V", color = tuple(0.9*x for x in agent_color))
133 | plt.legend()
134 | plt.show()
135 |
136 | plot_rewards(total_reward_per_episode, total_true_reward_per_episode, total_collisions_per_episode, n_ep_running_average=50)
137 |
138 | plt.figure()
139 | for i in range(env.n_agents):
140 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1)
141 | plt.plot(range(N_Episodes),mean_advantage[i,:], label=f"i={i}", color = agent_color)
142 | plt.xlabel("Episodes")
143 | plt.ylabel("trajectory 1/T * [Q(s,a)-V(s)] = mean_T A(s,a)")
144 | plt.legend()
145 | plt.grid()
146 | plt.show()
147 |
148 | plt.figure()
149 | # plt.gca().set_size_inches(4.5, 3.5)
150 | counts, bins, bars = plt.hist(total_collisions_per_episode, range(min(total_collisions_per_episode), max(total_collisions_per_episode) + 1, 2), weights=np.ones(len(total_collisions_per_episode)) / len(total_collisions_per_episode))
151 | print(f"Runs with 0 coll. = {counts[0]*100:.2f}%. 2 coll. = {counts[1]*100:.2f}%")
152 | plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
153 | plt.title("Collision performance")
154 | plt.xlabel("Number of collisions")
155 | plt.ylabel("Frequency of Simulations")
156 | plt.show()
157 |
--------------------------------------------------------------------------------
/control_test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import drone_env
4 | from drone_env import running_average, plot_rewards
5 | from tqdm import tqdm, trange
6 |
7 | env = drone_env.drones(n_agents=5, n_obstacles=2, grid=[5, 5], end_formation="O")
8 | print(env)
9 | # env.show()
10 |
11 | T_Episodes = 2
12 | # Simulate for T seconds (default dt = drone_env.dt = 0.01s)
13 | T = 5
14 |
15 | # Initialize variables
16 | total_collisions_list = []
17 | total_reward_list = []
18 | times = np.arange(0, T, step=drone_env.dt) + drone_env.dt
19 | EPISODES = trange(T_Episodes, desc='Episode: ', leave=True)
20 |
21 | for episode in EPISODES:
22 |
23 | # reward_history = np.zeros([len(times), env.n_agents])
24 | trajectory = [env.state.copy()]
25 | total_episode_reward = 0
26 | total_episode_collisions = 0
27 | # env.show()
28 |
29 | for t_iter, t in enumerate(times):
30 | # Simple gradient controller u_i = -grad_i, assuming Nj = V
31 | state = env.state
32 |
33 | # calculate actions based on current state
34 | # actions = drone_env.gradient_control(state, env)
35 | actions = drone_env.proportional_control(state, env)
36 |
37 | # Update environment one time step with the actions
38 | new_state, new_z, reward, n_collisions, finished, true_reward = env.step(actions)
39 |
40 | total_episode_reward += np.sum(reward)
41 | total_episode_collisions += n_collisions
42 |
43 | # reward_history[t_iter,:] = reward
44 | trajectory.append(new_state.copy())
45 |
46 | # Append episode reward
47 | total_reward_list.append(total_episode_reward)
48 | total_collisions_list.append(total_episode_collisions)
49 |
50 | # print(f"Episode collisions = {total_episode_collisions}")
51 | # env.animate(trajectory,frame_time=0.1)
52 |
53 | # RESET ENVIRONMENT
54 | env.reset(renew_obstacles=False)
55 |
56 | # Set progress bar description with information
57 | average_reward = running_average(total_reward_list, 50)[-1]
58 | average_collisions = running_average(total_collisions_list, 50)[-1]
59 | EPISODES.set_description(
60 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter+1} - Average: {average_reward:.1f}/{average_collisions:.2f}/{t_iter+1}")
61 |
62 | # Plot current trajectory
63 | env.plot(trajectory)
64 |
65 | plot_rewards(total_reward_list,total_collisions_list, n_ep_running_average=5)
--------------------------------------------------------------------------------
/drone_env.py:
--------------------------------------------------------------------------------
1 |
2 | import math
3 | import os
4 | import random
5 | import time
6 | from distutils.log import error
7 |
8 | import matplotlib.pyplot as plt
9 | from matplotlib import markers, animation
10 | import numpy as np
11 | from IPython import display
12 |
13 | ### TO DO ############
14 | """
15 | - Put back collision cost b =
16 | - Change back from not using Critic NN in training(),benchmark_critic()SACAgent (now trying Gt)
17 | - TrainedAgent for the actor loading
18 | - Add collisions to the animation. Add action arrows to animation?
19 | - (inactive now) Remove fixed random seed to initialize agents
20 | -
21 | - Plot of the Q function field for a fixed kth states (varying xi)
22 | - Maybe clip the reward for near collision distances
23 | """
24 | ######################
25 |
26 | # Spatial dimension
27 | dim = 2
28 | # time step in seconds
29 | dt = 0.05
30 | max_time_steps = 200
31 | # Some colours
32 | LIGHT_RED = '#FFC4CC';
33 | LIGHT_GREEN = '#95FD99';
34 | BLACK = '#000000';
35 | WHITE = '#FFFFFF';
36 | LIGHT_PURPLE = '#E8D0FF';
37 | LIGHT_ORANGE = '#FAE0C3';
38 | YELLOW = "#FFFF00";
39 | BLUE = '#98F5FF';
40 |
41 | def num_to_rgb(val, max_val):
42 | if (val > max_val):
43 | raise ValueError("val must not be greater than max_val")
44 | if (val < 0 or max_val < 0):
45 | raise ValueError("arguments may not be negative")
46 |
47 | i = (val * 255 / max_val);
48 | r = round(math.sin(0.024 * i + 0) * 127 + 128)/255;
49 | g = round(math.sin(0.024 * i + 2) * 127 + 128)/255;
50 | b = round(math.sin(0.024 * i + 4) * 127 + 128)/255;
51 | return (r,g,b)
52 |
53 | class drones:
54 |
55 | def __init__(self, n_agents: int, n_obstacles: int, grid: list, end_formation: str, k_closest = 2, deltas: np.ndarray = None, simplify_zstate = False) -> None:
56 | """[summary]
57 |
58 | Args:
59 | n_agents (int): number of agents, integer
60 | n_obstacles (int): number of obstacles, integer
61 | grid (list): list of the two axis limits delimiting the grid
62 | end_formation(str): what formation to reach e.g "O" is a circle
63 | k_closest(int): how many agents to account for in the Delta disk, for having a finite localized state (implementation)
64 | deltas(ndarray, column vector): vector of Deltas for each agent sensing radius
65 | """
66 | self.n_agents = n_agents
67 | self.grid = grid
68 | self.goal = self.grid
69 | self.k_closest = k_closest
70 | self.simplify_zstate = simplify_zstate
71 | self.internal_t = 0
72 | self.collision_weight = 0.2 # Weight of collision cost per unit of time. -r = q|xi-xF|^2 + b log(d_i/d_ij)
73 |
74 | # Other geometry parameters
75 | self.drone_radius = np.ones(n_agents)*0.1 # radius of each drone in m
76 |
77 | # State space dynamics. For now, all agents have same A,B
78 | self.A = np.eye(dim)
79 | self.B = np.eye(dim)*dt
80 |
81 | # Initialize agents and obstacles, and check if delta is a correct value
82 | self.obstacles = self.create_obstacles(n_obstacles)
83 | self.end_points, self.d_safety = self.generate_formation(end_formation)
84 |
85 | if deltas is None:
86 | # In case of no deltas, we assume Delta = d_safety, i.e. no simplification (maximum deltas allowed)
87 | self.deltas = self.d_safety
88 | else:
89 | self.deltas = np.minimum(deltas, self.d_safety)
90 | if not np.all(deltas <= self.d_safety):
91 | print("Some deltas are greater than the final minimum distance between end positions. Using minimum distance between end positions for those cases instead.",f"deltas = {self.deltas}")
92 |
93 | self.state, self.z_states = self.init_agents(n_agents)
94 |
95 | # self.trajectory = []
96 | # self.trajectory.append(self.state.copy())
97 |
98 | def reset(self, renew_obstacles = True):
99 | self.state, self.z_states = self.init_agents(self.n_agents)
100 | self.internal_t = 0
101 | if renew_obstacles == True:
102 | self.obstacles = self.create_obstacles(self.n_obstacles)
103 |
104 |
105 | def __str__(self):
106 | print("Grid size: [x_lim, y_lim]\n",self.grid)
107 | print("State: [x, y, vx, vy, r]\n", self.state)
108 | print(f"z_sattes for k_closest = {self.k_closest}: simplify? {self.simplify_zstate}")
109 | print("safety distance for each agent:\n", self.d_safety)
110 | print("Deltas disk radius for each agent: \n", self.deltas)
111 | # print("Obstacles [x, y, r]:\n",self.obstacles)
112 | print(f"Collision cost weight (per unit of time) = {self.collision_weight} ")
113 | return ""
114 |
115 | def generate_formation(self,end_formation):
116 | """generate end point coordinates from a description of the end formation shape
117 |
118 | Args:
119 | end_formation (str): a label given to a shape of final formation, i.e "O" for circle
120 |
121 | Returns:
122 | formation: [xF1^T, xF2^T,...]^T. Column vector
123 | """
124 | if end_formation == "O":
125 | size = min(self.grid)/2
126 | angle_step = 2*np.pi/self.n_agents
127 | formation = np.zeros([self.n_agents*dim,1])
128 |
129 | for i in range(self.n_agents):
130 | formation[dim*i,0] = np.cos(i * angle_step)*0.9*self.grid[0]/2 + self.grid[0]/2
131 | formation[dim*i+1,0] = np.sin(i * angle_step)*0.9*self.grid[1]/2 + self.grid[1]/2
132 |
133 | else:
134 | raise ValueError(str(end_formation) + " is not a valid end formation identifier")
135 |
136 | # find maximum allowed d_safety (\hat{d}_i): di <= min(|| xFi -xFj || - li - lj) for all j!=i
137 | d_safety = np.zeros(self.n_agents)
138 |
139 | for i in range(self.n_agents):
140 | xFi = formation[dim*i:(i+1)*dim,0]
141 | li = self.drone_radius[i]
142 | d_i = np.infty
143 | for j in range(self.n_agents):
144 | if j != i:
145 | xFj = formation[dim*j:(j+1)*dim,0]
146 | lj = self.drone_radius[j]
147 | d_ij = np.linalg.norm(xFi-xFj) -li -lj
148 | d_i = min([d_i,d_ij])
149 |
150 | d_safety[i] = d_i
151 |
152 | # formation: [xF1^T, xF2^T,...]^T. Column vector, d_safety (distance to closest end position from agent's end position)
153 | return formation, np.floor(d_safety*100)/100
154 |
155 | def create_obstacles(self,n_obstacles):
156 | self.n_obstacles = n_obstacles
157 |
158 | max_size = 0.1*np.max(self.grid)
159 | min_size = 0.05*max_size
160 |
161 | # generate random obstacles
162 | # Generate the obstacle coordinates and size
163 | # first column: x, second: y, third: r
164 | obstacles = np.random.rand(n_obstacles,dim+1)
165 | obstacles[:,0] = obstacles[:,0]*self.grid[0]
166 | obstacles[:,1] = obstacles[:,1]*self.grid[1]
167 | obstacles[:,dim] = obstacles[:,dim]*(max_size-min_size) + min_size
168 |
169 | return obstacles
170 |
171 | def init_agents(self,n_agents):
172 | # initialize state array:
173 | # column 1: x, col 2: y, col 3: vx, col 4 vy, col 5: l
174 | l = 0.1 # 10 cm
175 | self.n_agents = n_agents
176 | self.global_state_space = n_agents*(2*dim + 1) # x,y, vx,vy, radius
177 |
178 | grid = self.grid
179 |
180 | if self.simplify_zstate:
181 | # Only take into account position x,y (remove vx vy l)
182 | self.local_state_space = (dim)*(1+self.k_closest)
183 | else:
184 | self.local_state_space = (2*dim+1)*(1+self.k_closest)
185 |
186 | self.global_action_space = n_agents*dim # vx,vy (or ax,ay in the future)
187 | self.local_action_space = dim
188 |
189 | state = np.zeros([n_agents,5])
190 | state[:,4] = l
191 |
192 | # Create a grid and choose random nodes without repetition
193 | delta_l = 2*1.1*l
194 | divisions = np.floor(np.array(grid)/delta_l)
195 | possible_coord = []
196 |
197 | for idx in range(int(divisions[0])):
198 | for jdx in range(int(divisions[1])):
199 | coord = [idx*delta_l, jdx*delta_l]
200 | possible_coord.append(coord)
201 |
202 | # REMOVE THIS seed FOR true random initial state
203 | # random.seed(1)
204 | random_coord = np.array(random.sample(possible_coord, n_agents))
205 | state[:,0:dim] = random_coord
206 |
207 | # Calculate localized states z (using the reward function)
208 | _, _, z_states, Ni, _ = self.rewards(state, self.end_points, self.n_agents, self.d_safety, self.deltas)
209 | # Update the Ni graph
210 | self.Ni = Ni
211 |
212 | return state, z_states
213 |
214 | def step(self,actions):
215 | """_summary_
216 |
217 | Args:
218 | actions (list): list of actions, one entry per agent [u1^T, u2^T, ...]. Each action is a row vector with one entry per spatial dimension.
219 |
220 | Returns:
221 | state: new state after applying action
222 | z_states (list): localized states (from the new state)
223 | r_vec (row vector): localized reward for each agent; the average over agents is the global reward. Also returned: n_collisions, the finished flag and the true (non-localized) reward vector
224 | """
225 |
226 | # Update state: s -> s' with the system dynamics
227 | for i in range(self.n_agents):
228 |
229 | Ai = self.A
230 | Bi = self.B
231 |
232 | xi = np.transpose(self.state[i,0:dim])
233 | ui = actions[i]
234 |
235 | next_xi = np.matmul(Ai,xi) + np.matmul(Bi,ui)
236 |
237 | self.state[i,0:dim] = np.transpose(next_xi)
238 | self.state[i,dim:2*dim] = np.transpose(ui)
239 |
240 |
241 | # Calculate new individual reward [r1(s,a), r2,...] vector, plus related distance dependent values
242 | r_vec, n_collisions, z_states, Ni, true_r_vec = self.rewards(self.state, self.end_points, self.n_agents, self.d_safety, self.deltas)
243 | # Update the z and Ni graph
244 | self.z_states = z_states
245 | self.Ni = Ni
246 |
247 | # Should return (s', r(s,a), n_collisions(s'), finished)
248 | end_points = np.reshape(self.end_points,np.shape(self.state[:,0:dim]))
249 | error_from_end = np.linalg.norm(end_points-self.state[:,0:dim],axis = 1)
250 |
251 | if np.all(error_from_end <=0.2) or self.internal_t>= max_time_steps-1:
252 | finished = True
253 | else:
254 | finished = False
255 |
256 | self.internal_t += 1
257 | # TO DO: Proper is_finished
258 | return self.state, z_states, r_vec, n_collisions, finished, true_r_vec
259 |
260 | def rewards(self, state, end_points, n_agents, d_safety, deltas):
261 | '''
262 | state: [column 1: x, col 2: y, col 3: vx, col 4 vy, col 5: r ] np.array[i,2*dim+1]
263 | end_points: column [x1, y1, x2, y2, ... ]^T
264 | d_safety: column d_i [d1, d2, ...]^T
265 | '''
266 | n_agents = np.size(state,0)
267 |
268 | # weights: q|xi-xF|^2 + b log(d_i/d_ij). Multiplied by dt since the cost is per unit of time
269 | q = 2*dt
270 | b = self.collision_weight*dt
271 |
272 | xF = np.reshape(end_points,[n_agents,dim])
273 | xi = state[:,0:dim]
274 |
275 | # row vector, q|xi-xF|^2
276 | to_goal_cost = q*np.power(np.linalg.norm(xF-xi,axis=1),2)
277 |
278 | # Collision cost
279 |
280 | d_ij, log_d, N_delta, collisions = self.distance_data(state,deltas,d_safety)
281 |
282 | collision_cost = b*np.sum(log_d*N_delta,1)
283 | real_collision_cost = b*np.sum(log_d,1)
284 | n_collisions = np.sum(collisions)
285 |
286 | # These are the approximated localized rewards
287 | reward_vector = -np.nan_to_num(to_goal_cost+collision_cost)
288 | true_reward_vector = -np.nan_to_num(to_goal_cost+real_collision_cost)
289 |
290 | # Calculate localized z states
291 | z_states, Ni = self.localized_states(state, end_points, N_delta, d_ij)
292 |
293 | return reward_vector, n_collisions, z_states, Ni, true_reward_vector
294 |
295 | def distance_data(self,state,deltas,d_safety):
296 | '''Return matrix of clipped distances matrix d[ij]
297 | Also returns normalized distance matrix
298 | Also returns collision binary graph
299 | Also Delta proximity neighbourhood
300 |
301 | deltas must be a column!
302 | graph N includes i as neighbour
303 | '''
304 |
305 | n_agents = np.size(state,0)
306 | d_ij = np.zeros([n_agents,n_agents])
307 | d_ij_norm = np.zeros_like(d_ij)
308 |
309 | for i in range(n_agents):
310 | xi = state[i,0:dim]
311 | li = self.drone_radius[i]
312 | for j in range(n_agents):
313 | if j != i:
314 | xj = state[j,0:dim]
315 | lj = self.drone_radius[j]
316 |
317 | # Calculate agents relative distance
318 | d_ij[i,j] = min(np.linalg.norm(xi-xj) -li -lj, d_safety[i])
319 | if d_ij[i,j] == 0: # Handle the unlikely case of exactly 0; otherwise the division below would fail
320 | d_ij[i,j] = -10**-6
321 | d_ij_norm[i,j] = d_safety[i]/d_ij[i,j]
322 | else:
323 | d_ij[i,j] = min(-li -li, d_safety[i])
324 | # For safety, the normalized distance to itself is set to 1, as log(1)=0 so it is neutral
325 | d_ij_norm[i,j] = 1
326 |
327 | collisions = d_ij_norm <= 0
328 | N_delta = d_ij <= deltas
329 | # Handle non-positive arguments of the logarithm (collisions) by assigning a large penalty value
330 | d_ij_norm[collisions] = 9.99E3
331 | log_d = np.log(d_ij_norm)
332 | log_d[collisions] = 9.99E3
333 |
334 | return d_ij, log_d, N_delta, collisions
335 |
336 | def localized_states(self, state, end_points, N_delta, d_ij):
337 | n_agents = np.size(state,0)
338 | sorted_idx = np.argsort(d_ij,1)
339 | k = self.k_closest
340 |
341 | z = []
342 | Ni_list = []
343 |
344 | for i in range(n_agents):
345 | # How many agents are in Delta range, minus itself
346 | in_range = np.sum(N_delta[i,:])-1
347 | sorted_agents = sorted_idx[i,:]
348 | Ni = [i]
349 |
350 | xi = state[i,0:dim]
351 | xFi = end_points[i*dim:(i+1)*dim]
352 |
353 | # Adding zii as the first row
354 | Zi = np.zeros([k+1,2*dim+1])
355 | Zi[0,:] = state[i,:].copy()
356 | # print(Zi,xFi.flaten() ,xi)
357 | Zi[0,0:dim] = -(xFi.flatten() - xi)
358 |
359 | for kth in range(1,k+1):
360 | # kth = 1,2,...k
361 |
362 | if kth <= in_range:
363 | # A kth neighbour exists inside the Delta range
364 | j = sorted_agents[kth]
365 | Ni.append(j)
366 | xj = state[j,0:dim].copy()
367 | zj = state[j,:].copy()
368 | zj[0:dim] = xj-xi
369 | # zj[0:dim] = xj-xFi.flatten()
370 | # print(f"{kth}th closest agent is {j}, coord {xj}, rel coord {xj-xi}")
371 |
372 | else:
373 | # No kth neighbour inside Delta, so use a placeholder state (one that should add almost no value)
374 | # One option is to simply add the next closest agent in order, even if it lies outside the Delta range,
375 | # hoping that the NN learns that agents outside Delta do not contribute to Q
376 | # A more principled choice would probably be to project that next-closest agent onto the Delta boundary
377 | # j = sorted_agents[kth]
378 | # xj = state[j,0:dim].copy()
379 | # zj = state[j,:].copy()
380 | # zj[0:dim] = xj-xi
381 |
382 | # Instead, create a "ghost" agent placed just behind agent i (opposite to its goal direction), at relative distance 1.1*Delta
383 | j = sorted_agents[kth]
384 | zi = Zi[0,0:dim]
385 | zj = state[j,:].copy()
386 | zj[0:dim] = zi/np.linalg.norm(zi) * self.deltas[i]*1.1
387 |
388 | Zi[kth,:] = zj
389 |
390 | if self.simplify_zstate:
391 | # Remove the parts of the state that overcomplicate learning:
392 | # No (vx,vy,l)
393 | z.append(Zi[:,0:dim])
394 | else:
395 | z.append(Zi)
396 |
397 | Ni_list.append(Ni)
398 |
399 | # z is a list of arrays; each array is the localized Delta-state of one agent
400 | # Ni_list is the list of neighbours for each agent
401 | return z, Ni_list
402 |
403 | # %% Plotting methods
404 | def show(self, state = None, not_animate = True):
405 |
406 | if not_animate:
407 | state = self.state
408 |
409 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot
410 | # (or if you have an existing figure)
411 | # fig = plt.gcf()
412 | # ax = fig.gca()
413 |
414 | ax.set_xlim((0, self.grid[0]));
415 | ax.set_ylim((0, self.grid[1]));
416 | ax.grid(True);
417 |
418 | for i in range(self.n_obstacles):
419 | circle = plt.Circle((self.obstacles[i,0], self.obstacles[i,1]), self.obstacles[i,2], color='black');
420 | ax.add_patch(circle);
421 |
422 | for i in range(self.n_agents):
423 | agent_color = num_to_rgb(i,self.n_agents-1)
424 | circle = plt.Circle((state[i,0], state[i,1]), state[i,4], color=agent_color, fill=False, label=f"{i+1}");
425 | ax.add_patch(circle);
426 | # end point in plot
427 | ax.plot(self.end_points[i*dim,0],self.end_points[i*dim+1,0],color=agent_color,marker = "*");
428 |
429 | ax.legend()
430 |
431 | if not_animate:
432 | plt.show()
433 | else:
434 | return fig;
435 |
436 | def animate_basic(self, trajectory,frame_time = 0.2, frames = 20):
437 |
438 | good_frame = 0
439 | each_frame = len(trajectory)/frames
440 | for n_frame,state in enumerate(trajectory):
441 | if each_frame > 1 and round(good_frame) == n_frame:
442 | good_frame+=each_frame
443 | else:
444 | continue
445 | fig = self.show(state, not_animate=False);
446 | display.display(fig);
447 | display.clear_output(wait=True)
448 | time.sleep(frame_time)
449 |
450 | def plot(self, trajectory, episode = None):
451 |
452 | # Create trajectory matrices -> time, [x(t=0), x(t+1), x(t+2)], | agent
453 | times = len(trajectory)
454 | x_cord = np.zeros([self.n_agents, times])
455 | y_cord = np.zeros([self.n_agents, times])
456 | collision_table = np.full([self.n_agents, times], False)
457 |
458 | for t,state in enumerate(trajectory):
459 | x_cord[:,t] = state[:,0]
460 | y_cord[:,t] = state[:,1]
461 |
462 | # Collision calc
463 | for i in range(self.n_agents):
464 | xi = state[i,0:dim]
465 | li = state[i,2*dim]
466 | for j in range(self.n_agents):
467 | if j != i:
468 | xj = state[j,0:dim]
469 | lj = state[j,2*dim]
470 |
471 | dij = np.linalg.norm(xi-xj) -li -lj
472 | if dij<=0:
473 | collision_table[i,t] = True
474 |
475 | collisions = np.sum(collision_table)
476 |
477 | # Plot obstacles and final state of agents
478 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot
479 | # (or if you have an existing figure)
480 | # fig = plt.gcf()
481 | # ax = fig.gca()
482 | fig.set_size_inches(4.5, 3.5)
483 | fig.tight_layout()
484 |
485 | # ax.set_xlim((0, self.grid[0]));
486 | # ax.set_ylim((0, self.grid[1]));
487 | ax.grid(True);
488 |
489 | # Plot obstacles
490 | for i in range(self.n_obstacles):
491 | circle = plt.Circle((self.obstacles[i,0], self.obstacles[i,1]), self.obstacles[i,2], color='black');
492 | ax.add_patch(circle);
493 |
494 | # Plot agents and collisions
495 | for i in range(self.n_agents):
496 | agent_color = num_to_rgb(i,self.n_agents-1)
497 | circle = plt.Circle((state[i,0], state[i,1]), state[i,4], color=agent_color, fill=False, label=f"{i+1}");
498 | ax.add_patch(circle);
499 |
500 | ax.plot(x_cord[i,:],y_cord[i,:],color=agent_color);
501 | ax.plot(self.end_points[i*dim,0],self.end_points[i*dim+1,0],color=agent_color,marker = "*");
502 |
503 | collisions_xcord = x_cord[i,collision_table[i,:]]
504 | collisions_ycord = y_cord[i,collision_table[i,:]]
505 | total_markers = len(collisions_xcord)
506 | # markevery=np.floor(total_markers/2) fails when total_markers is 0, so a fixed markevery is used instead
507 | ax.plot(collisions_xcord,collisions_ycord,color=agent_color, marker = "v",fillstyle = "none", markevery=2);
508 |
509 | if episode is None:
510 | ax.set_title(f"{self.n_agents} agents, collisions = {collisions}")
511 | else:
512 | ax.set_title(f"Episode {episode+1} , {self.n_agents} agents, collisions = {collisions}")
513 | ax.legend(title="Agents")
514 | plt.show()
515 |
516 | def animate(self, trajectory, z_trajectory , deltas, episode, name = "test", format ="gif"):
517 |
518 | if format == "mp4":
519 | # plt.rcParams['animation.ffmpeg_path'] ='D:\\Programes portables\\ffmpeg\\bin\\ffmpeg.exe'
520 | plt.rcParams['animation.ffmpeg_path'] ='C:\\Users\\Andreu\\OneDrive - KTH\\programes\\ffmpeg\\bin\\ffmpeg.exe'
521 |
522 |
523 | fig, ax = plt.subplots(); # note we must use plt.subplots, not plt.subplot
524 | ax.set_xlim((-1, self.grid[0]+1));
525 | ax.set_ylim((-1, self.grid[1]+1));
526 | # ax.grid(True)
527 | ax.set_title(f"Delta = {deltas[0]}");
528 | circles = []
529 | d_circles = []
530 | arrows = []
531 |
532 | states = trajectory[0]
533 | z_states = z_trajectory[0]
534 | for i in range(self.n_agents):
535 | xi = states[i,0:dim]
536 | xFi = self.end_points[i*dim:(i+1)*dim,0].flatten()
537 | agent_color = num_to_rgb(i,self.n_agents-1)
538 | ax.plot(xFi[0],xFi[1],color=agent_color,marker = "*");
539 | circle = plt.Circle((states[i,0], states[i,1]), states[i,4], color=agent_color, fill=False, label=f"{i+1}");
540 | circles.append(ax.add_patch(circle))
541 |
542 | delta_circle = plt.Circle((states[i,0], states[i,1]), states[i,4] + deltas[i], color="red", fill=False, ls = "--", alpha = 0.5);
543 | d_circles.append(ax.add_patch(delta_circle))
544 |
545 | z_size = np.size(z_states[i],0)
546 | z_state = z_states[i]
547 | arrows_i = []
548 | for k in range(z_size):
549 | if k == 0:
550 | star = xFi[0:dim]
551 | fini = xFi[0:dim] + z_state[k,0:dim]
552 | coords = np.array([star,fini])
553 | arrows_i.append(ax.plot(coords[:,0], coords[:,1] , color = agent_color, lw = 0.5, alpha = 0.3))
554 | else:
555 | star = xi[0:dim]
556 | fini = xi[0:dim] + z_state[k,0:dim]
557 | coords = np.array([star,fini])
558 | arrows_i.append(ax.plot(coords[:,0], coords[:,1] , color = agent_color, lw = 0.5, alpha = 0.6))
559 | arrows.append(arrows_i)
560 |
561 | plt.legend(loc = "upper right")
562 |
563 | def update_objects(t:int):
564 | states = trajectory[t]
565 | z_states = z_trajectory[t]
566 | ax.set_title(f"Episode {episode+1} .Deltas = {deltas[0]}. Time = {t*dt:.1f}s")
567 |
568 | for i in range(self.n_agents):
569 | xi = states[i,0:dim]
570 | xFi = self.end_points[i*dim:(i+1)*dim,0].flatten()
571 | agent_color = num_to_rgb(i,self.n_agents-1)
572 |
573 | z_size = np.size(z_states[i],0)
574 | z_state = z_states[i]
575 | for k in range(z_size):
576 | if k == 0:
577 | star = xFi[0:dim]
578 | fini = xFi[0:dim] + z_state[k,0:dim]
579 | coords = np.array([star,fini])
580 | arrows[i][k][0].set_data(coords[:,0], coords[:,1])
581 | pass
582 | else:
583 | star = xi[0:dim]
584 | fini = xi[0:dim] + z_state[k,0:dim]
585 | coords = np.array([star,fini])
586 | arrows[i][k][0].set_data(coords[:,0], coords[:,1])
587 | pass
588 | circles[i].center = states[i,0], states[i,1]
589 | d_circles[i].center = states[i,0], states[i,1]
590 |
591 | return circles, d_circles, arrows
592 |
593 | print("\nSaving animation...")
594 | anim = animation.FuncAnimation(fig, update_objects, len(trajectory), interval=dt)
595 |
596 | if format == "gif":
597 | writergif = animation.PillowWriter(fps=30)
598 | full_name = os.path.join("videos", name + ".gif")
599 | anim.save(full_name, writer=writergif)
600 | elif format == "mp4":
601 | FFwriter = animation.FFMpegWriter(fps=30)
602 | full_name = os.path.join("videos", name + ".mp4")
603 | anim.save(full_name, writer = FFwriter)
604 | else:
605 | print(f"format{format} not valid")
606 |
607 | print(f"Animation saved as {full_name}")
608 |
609 |
610 |
611 | # other control functions
612 | def gradient_control(state,env, u_max = 1):
613 | """Given a global state, calculates the direction of gradient of the cost (logarithm barrier)
614 | for each agent separately, taking into account all neighbours
615 |
616 | Args:
617 | state (np.array): each row is an agent state (x,y, vx,vy, l)
618 | env (_type_): drone environment
619 |
620 | Returns:
621 | actions: list of row vectors [u1^T, u2^T,...]
622 | """
623 |
624 | # maximum value of control component
625 | # u_max m/s
626 | b = 0.1
627 | q = 1
628 | actions = []
629 |
630 | for i in range(env.n_agents):
631 | xi = np.transpose(state[i,0:dim])
632 | ri = state[i,4]
633 | xF = env.end_points[i*dim:(i+1)*dim,0]
634 | di_hat = env.d_safety[i]
635 |
636 | term1 = 2*(xi-xF)
637 |
638 | Ni = [j for j in range(env.n_agents) if j != i] # Complete graph (except itself), global knowledge
639 | # print(Ni)
640 | term2 = np.zeros_like(xi)
641 | for j in Ni:
642 | xj = np.transpose(state[j,0:2])
643 | rj = state[j,4]
644 | dij = np.linalg.norm(xi-xj)-ri-rj
645 |
646 | if dij <= di_hat:
647 | term2+= (xi-xj) / (dij*np.linalg.norm(xi-xj))
648 |
649 | grad = q*term1 - b*term2
650 | ui = np.clip(-grad , -u_max,u_max)
651 | actions.append(ui)
652 |
653 | return actions
654 |
655 | def proportional_control(state,env):
656 |
657 | # maximum value of control component
658 | u_max = 1 # m/s
659 | k_gain = 1
660 | actions = []
661 |
662 | for i in range(env.n_agents):
663 | xi = np.transpose(state[i,0:dim])
664 | ri = state[i,4]
665 | xF = env.end_points[i*dim:(i+1)*dim,0]
666 | # di_hat = env.d_safety[i]
667 |
668 | # zi = xF_i - xi (vector to goal)
669 | zi = xF-xi
670 | ui = k_gain * zi
671 |
672 | ui_norm = np.linalg.norm(ui)
673 | if ui_norm > u_max:
674 | # Cap the control norm to u_max
675 | ui = ui/ui_norm*u_max
676 |
677 | actions.append(ui)
678 |
679 | return actions
680 |
681 | # Plotting functions
682 | def running_average(x, N = 50):
683 | ''' Compute the running average of a vector x over a
684 | sliding window of N elements
685 | '''
686 | if len(x) >= N:
687 | y = np.copy(x)
688 | y[N-1:] = np.convolve(x, np.ones((N, )) / N, mode='valid')
689 | else:
690 | y = np.zeros_like(x)
691 | return y
692 |
693 | # Plot Rewards and steps
694 | def plot_rewards(episode_reward_list, episode_true_reward_list, collision_list, n_ep_running_average=50):
695 | episodes = [i for i in range(1, len(episode_reward_list)+1)]
696 | # Plot Rewards and steps
697 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4.5))
698 | # ax[0].plot(episodes, episode_reward_list, label='Approx. reward', color = "orange", alpha = 0.5)
699 | ax[0].plot(episodes, episode_reward_list, label='Global reward', color = "cyan", alpha = 0.5)
700 | # ax[0].plot(episodes, running_average(episode_reward_list, n_ep_running_average), label='Avg. approx. reward', color="red")
701 | ax[0].plot(episodes, running_average(episode_true_reward_list, n_ep_running_average), label='Avg. global reward', color="blue")
702 |
703 | ax[0].set_xlabel('Episodes')
704 | ax[0].set_ylabel('Total reward')
705 | ax[0].set_title('Total Reward vs Episodes')
706 | ax[0].legend()
707 | ax[0].grid(alpha=0.3)
708 |
709 | ax[1].plot(episodes,collision_list, label='Collisions per episode', color="cyan", alpha = 0.5)
710 | ax[1].plot(episodes, running_average(collision_list, n_ep_running_average), label='Avg. number of collisions per episode',color="blue")
711 | ax[1].set_xlabel('Episodes')
712 | ax[1].set_ylabel('Total number of collisions')
713 | ax[1].set_title('Total number of collisions vs Episodes')
714 | ax[1].legend()
715 | ax[1].grid(alpha=0.3)
716 | plt.show()
717 |
718 | def plot_grads(grad_per_episode:np.ndarray, gi_per_episode:np.ndarray):
719 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 9))
720 |
721 | n_agents = np.size(grad_per_episode,1)
722 | episode_variable = [e for e in range(1, len(grad_per_episode)+1)]
723 |
724 | for i in range(n_agents):
725 | agent_color = num_to_rgb(i,n_agents-1)
726 | ax[0].plot(episode_variable, grad_per_episode[:,i], label=f"Agent {i+1}", color = agent_color)
727 | ax[0].set_xlabel('Episodes')
728 | ax[0].set_ylabel('Score function gradient')
729 | # ax[0].set_title('Total Reward vs Episodes')
730 | ax[0].legend()
731 | ax[0].grid(alpha=0.3)
732 |
733 | for i in range(n_agents):
734 | agent_color = num_to_rgb(i,n_agents-1)
735 | ax[1].plot(episode_variable, gi_per_episode[:,i], label=f"Agent {i+1}", color = agent_color)
736 | ax[1].set_xlabel('Episodes')
737 | ax[1].set_ylabel('Approximated gi gradient (max norm = 100)')
738 | # ax[1].set_title('Total number of collisions vs Episodes')
739 | ax[1].legend()
740 | ax[1].grid(alpha=0.3)
741 | plt.show()
742 |
743 |
744 |
745 |
746 |
747 |
748 |
749 |
750 |
751 |
752 |
753 |
754 |
755 |
--------------------------------------------------------------------------------
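A minimal standalone sketch of the per-agent reward computed by drone_env.rewards() and distance_data() above, for readers who want the formula without the environment bookkeeping. The function name per_agent_reward and the toy numbers (including the q, b defaults, which in the environment are scaled by dt and collision_weight) are illustrative only, not part of the module.

import numpy as np

def per_agent_reward(x, goals, radii, d_hat, delta, q=2.0, b=0.2):
    """r_i = -( q*|x_i - xF_i|^2 + b * sum_{j in Delta-range} log(d_hat_i / d_ij) )."""
    n = x.shape[0]
    rewards = np.zeros(n)
    for i in range(n):
        goal_cost = q * np.linalg.norm(goals[i] - x[i]) ** 2
        barrier = 0.0
        for j in range(n):
            if j == i:
                continue
            # clipped inter-agent distance, as in distance_data()
            d_ij = min(np.linalg.norm(x[i] - x[j]) - radii[i] - radii[j], d_hat[i])
            if d_ij <= delta[i]:  # only Delta-neighbours contribute to the barrier
                barrier += np.log(d_hat[i] / max(d_ij, 1e-6))
        rewards[i] = -(goal_cost + b * barrier)
    return rewards

# Two agents heading to opposite corners of a 5x5 grid:
x = np.array([[1.0, 1.0], [1.5, 1.2]])
goals = np.array([[5.0, 5.0], [0.0, 0.0]])
print(per_agent_reward(x, goals, radii=[0.1, 0.1], d_hat=[1.0, 1.0], delta=[1.5, 1.5]))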
/images/E1000_n10_DiscretePolicy4_b02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/E1000_n10_DiscretePolicy4_b02.png
--------------------------------------------------------------------------------
/images/E1500_n5_DiscretePolicy8_b02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/E1500_n5_DiscretePolicy8_b02.png
--------------------------------------------------------------------------------
/images/collisions_hist.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/collisions_hist.pdf
--------------------------------------------------------------------------------
/images/delta_effect.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/images/delta_effect.pdf
--------------------------------------------------------------------------------
/learning_Q_test.py:
--------------------------------------------------------------------------------
1 |
2 | from collections import deque, namedtuple
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import drone_env
6 | from drone_env import running_average, plot_rewards
7 | from tqdm import tqdm, trange
8 | from SAC_agents import SACAgents, ExperienceBuffers
9 |
10 | ### Set up parameters ###
11 | n_agents = 5
12 | deltas = np.ones(n_agents)*1.5
13 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas ,simplify_zstate = True)
14 | print(env)
15 | # env.show()
16 |
17 | N_Episodes = 2
18 |
19 | T = 5 # Simulate for T seconds (default dt = drone_env.dt = 0.01 s), i.e. 500 time iterations
20 | discount_factor = 0.99
21 | alpha_critic = 10**-3
22 | alpha_actor = 10**-2
23 | M = 30 # Epochs, i.e. steps of SGD for the critic NN
24 | dim_z = env.local_state_space # Dimension of the localized z_state space
25 | dim_a = env.local_action_space # Dimension of the local action space
26 |
27 | ###
28 |
29 | # Initialize variables
30 | total_collisions_list = []
31 | total_reward_list = []
32 | Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])
33 |
34 | times = np.arange(0, T, step=drone_env.dt) + drone_env.dt
35 |
36 |
37 | agents = SACAgents(n_agents=env.n_agents, dim_local_state = dim_z, dim_local_action=dim_a, discount=discount_factor, epochs=M, learning_rate_critic=alpha_critic, learning_rate_actor=alpha_actor)
38 | print("### Running Scalable-Actor-Critic with params: ###")
39 | print(f"Episodes = {N_Episodes}, Time iterations = {len(times)} (T = {T}s, dt = {drone_env.dt}s)")
40 | print(f"N of agents = {env.n_agents}, structure of critic NN = {agents.criticsNN[0].input_size}x{agents.criticsNN[0].L1}x{agents.criticsNN[0].L2}x{agents.criticsNN[0].output_size}")
41 | print(f"Discount = {discount_factor}, lr for NN critical = {alpha_critic}, lr for actor = {alpha_actor}, epochs M = {M}")
42 |
43 |
44 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True)
45 | for episode in EPISODES:
46 |
47 | # reward_history = np.zeros([len(times), env.n_agents])
48 | trajectory = [env.state.copy()]
49 | total_episode_reward = 0
50 | total_episode_collisions = 0
51 | # env.show()
52 |
53 | buffers = ExperienceBuffers(env.n_agents)
54 | # SIMULATION OVER T
55 | for t_iter, time in enumerate(times):
56 | # Simple gradient controller u_i = -grad_i, assuming Ni = V (every agent is a neighbour)
57 | state = env.state
58 | z_states = env.z_states
59 | Ni = env.Ni
60 |
61 | # calculate actions based on current state
62 | actions = drone_env.gradient_control(state, env)
63 | # actions = drone_env.proportional_control(state, env)
64 | # actions = agents.forward(z_states)
65 |
66 | # Update environment one time step with the actions
67 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions)
68 | # EXPERIENCE: [z_state, action, reward, next_z, Ni, finished]
69 | buffers.append(z_states, actions, rewards,new_z, Ni, finished)
70 |
71 | total_episode_reward += np.mean(rewards)
72 | total_episode_collisions += n_collisions
73 |
74 | # reward_history[t_iter,:] = reward
75 | trajectory.append(new_state.copy())
76 |
77 | # END OF EPISODE
78 | # Append episode reward
79 | total_reward_list.append(total_episode_reward)
80 | total_collisions_list.append(total_episode_collisions)
81 |
82 | # Train of critic with the data of the episode
83 | agents.train(buffers)
84 | Q_simulated, Q_approx = agents.benchmark_cirtic(buffers, only_one_NN=False)
85 |
86 | # print(f"Episode collisions = {total_episode_collisions}")
87 | # env.animate(trajectory,frame_time=0.1)
88 |
89 | # RESET ENVIRONMENT
90 | env.reset(renew_obstacles=False)
91 |
92 | # Set progress bar description with information
93 | average_reward = running_average(total_reward_list, 50)[-1]
94 | average_collisions = running_average(total_collisions_list, 50)[-1]
95 | EPISODES.set_description(
96 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter+1} - Average: {average_reward:.1f}/{average_collisions:.2f}/{t_iter+1}")
97 |
98 | # Plot current trajectory
99 |
100 | if episode >= N_Episodes-2:
101 | env.plot(trajectory)
102 |
103 | plt.figure()
104 | for i in range(env.n_agents):
105 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1)
106 | plt.plot(times,Q_approx[i], label=f"i={i}, approx Q")
107 | plt.plot(times,Q_simulated[i], "--", label=f"i={i}, simulated Q")
108 | plt.legend()
109 | plt.show()
110 |
111 | agents.save(filename="Q_test")
112 |
113 | plot_rewards(total_reward_list, total_reward_list, total_collisions_list, n_ep_running_average=5) # no separate true-reward list is tracked in this test script
--------------------------------------------------------------------------------
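SAC_agents.benchmark_cirtic is not included in this dump; assuming it compares the critic output Q_i(z, a) along the episode against the empirically simulated discounted return, a minimal sketch of that reference quantity is:

import numpy as np

def discounted_returns(rewards, discount=0.99):
    """Backward pass computing G_t = r_t + discount * G_{t+1} for one agent's episode rewards."""
    G = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        G[t] = running
    return G

# e.g. G = discounted_returns(np.array(rewards_of_agent_i), discount_factor)
# can then be plotted against the critic's prediction at each visited (z, a).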
/matlab/cost_field.m:
--------------------------------------------------------------------------------
1 | clear all;
2 | close all;
3 |
4 | global ri rj a xB
5 |
6 | xB = [5 5]';
7 | % xi = [1 1]';
8 | xj = [0.5 0.5 3;
9 | 3 1 1];
10 | ri = 0.1;
11 | rj = 0.1;
12 | a = 5;
13 |
14 | [X Y] = meshgrid(linspace(0,5,100));
15 | cost_fun = 0*X;
16 | gradU_fun = cost_fun;
17 | gradV_fun = cost_fun;
18 |
19 | for i=1:size(X,2)
20 | for j=1:size(X,1)
21 | xi = [X(i,j) Y(i,j)]';
22 | cost_fun(i,j) = cost(xi,xj);
23 | g = -grad(xi,xj);
24 | gradU_fun(i,j) = g(1);
25 | gradV_fun(i,j) = g(2);
26 | end
27 | end
28 |
29 |
30 | %% plots
31 | figure;
32 | surf(X,Y,cost_fun,'EdgeAlpha',0)
33 | hold on
34 | plot(xB(1),xB(2),'or','LineWidth',2)
35 | for j=1:size(xj,2)
36 | plot(xj(1,j),xj(2,j),'ob','LineWidth',2)
37 | end
38 | % plot(xi(1),xi(2),'ok','LineWidth',2)
39 | % xlim([0 6]); ylim([0 6]);
40 | % zlim([-inf,cost_fun(1,1)*1.1])
41 | grid on;
42 |
43 | figure;
44 | % quiver(X,Y,gradU_fun,gradV_fun)
45 | streamslice(X,Y,gradU_fun,gradV_fun)
46 |
47 | %% Functions
48 |
49 | function g = grad(xi,Xj)
50 | global ri rj a xB
51 |
52 | term1 = 2*(xi-xB);
53 | term2 = Xj*0;
54 |
55 | for j=1:size(Xj,2)
56 | xj = Xj(:,j);
57 |
58 | term2(:,j) = 1/(norm(xi-xj)-ri-rj) * (xi-xj)/norm(xi-xj);
59 | end
60 | term2 = -a*sum(term2,2);
61 | g = term1+term2;
62 | g = term2;
63 |
64 | % g = g/norm(g);
65 | end
66 |
67 | function c = cost(xi,Xj)
68 | global ri rj a xB
69 | dij = vecnorm(xi-Xj)-ri-rj;
70 | dij(dij<0)=0;
71 | c = norm(xi-xB)^2 - a*sum(log(dij));
72 |
73 | end
74 |
--------------------------------------------------------------------------------
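For reference, the quantities evaluated numerically by cost() and grad() above are

    c(x_i) = \|x_i - x_B\|^2 - a \sum_j \log(d_{ij}), \qquad d_{ij} = \|x_i - x_j\| - r_i - r_j,

    \nabla_{x_i} c = 2\,(x_i - x_B) - a \sum_j \frac{x_i - x_j}{d_{ij}\,\|x_i - x_j\|}.

Note that grad() overwrites g with term2 before returning, so the streamslice figure shows only the collision (barrier) component of the negative gradient.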
/matlab/derivations_22ndPol.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | rng(1)
4 |
5 | z_dim = 6;
6 | dim = 2;
7 | z = sym('z%d%d', [z_dim 1],'real');
8 | z1 = z(1:2); z2 = z(3:4); z3 = z(5:6);
9 | zValues = rand(size(z))*5;
10 | a = sym('a%d%d', [2 1],'real');
11 | aValues = rand(size(a));
12 | theta = sym('theta%d%d', [1 z_dim/dim],'real');
13 | thetaValues = rand(size(theta));
14 |
15 | R = sym('R%d%d', [dim z_dim],'real');
16 | % R1div = Rderiv(theta(1)); R2div = Rderiv(theta(2)); R3div = Rderiv(theta(3));
17 | % Rdiv = [R1div R2div R3div];
18 | Sigma = sym('Sigma%d%d', [2 2],'real');
19 | Sigma(2,1) = Sigma(1,2);
20 | % Sigma = inv(Sigma);
21 | SigmaValues = [0.1, 0.01;0.01,0.15];
--------------------------------------------------------------------------------
/matlab/derivations_2ndPol.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | rng(1)
4 |
5 | z_dim = 6;
6 | dim = 2;
7 | z = sym('z%d%d', [z_dim 1],'real');
8 | z1 = z(1:2); z2 = z(3:4); z3 = z(5:6);
9 | zValues = rand(size(z))*5;
10 | a = sym('a%d%d', [2 1],'real');
11 | aValues = rand(size(a));
12 | theta = sym('theta%d%d', [1 z_dim/dim],'real');
13 | thetaValues = rand(size(theta));
14 | R2d = @(angle) [cos(angle) -sin(angle); sin(angle) cos(angle)];
15 | Rderiv = @(angle) [-sin(angle) -cos(angle); cos(angle) -sin(angle)];
16 | R1 = R2d(theta(1)); R2 = R2d(theta(2)); R3 = R2d(theta(3));
17 | R = [R1 R2 R3];
18 | R1div = Rderiv(theta(1)); R2div = Rderiv(theta(2)); R3div = Rderiv(theta(3));
19 | Rdiv = [R1div R2div R3div];
20 | Sigma = sym('Sigma1%d%d', [2 2],'real');
21 | Sigma(2,1) = Sigma(1,2);
22 | Sigma1 = inv(Sigma);
23 | SigmaValues = [0.1, 0.01;0.01,0.15];
24 |
25 | val = @(eqq) double(subs(eqq,[z(:)',theta(:)',a(:)',Sigma(:)'],[zValues(:)',thetaValues(:)',aValues(:)',SigmaValues(:)']));
26 |
27 | d = 2;
28 | original = log(1/((2*pi)^(d/2)*sqrt(det(Sigma1)))*exp(-1/2*(a-R*z)'*(Sigma1)*(a-R*z)));
29 |
30 | eqq1 = -1/2 * (a-R*z)' * (Sigma1) * (a-R*z);
31 | eqq2 =-1/2*z'*R'*(Sigma1)*R*z + a'*(Sigma1)*R*z - 1/2*a'*(Sigma1)*a;
32 | I = -1/2*z'*R'*(Sigma1)*R*z;
33 | II = a'*(Sigma1)*R*z;
34 |
35 | % for m=1:z_dim
36 | % dum = theta(:,m);
37 | % eval(['theta' num2str(m) ' = dum;']);
38 | % end
39 |
40 | Vgra = @(scalar) arrayfun(@(s,v) val(diff(s,v)),scalar*ones(size(theta)),theta);
41 | gra = @(scalar) arrayfun(@(s,v) diff(s,v),scalar*ones(size(theta)),theta);
42 |
43 |
44 | %
45 | grad1 = Vgra(eqq1);
46 | grad2 = Vgra(eqq2);
47 |
48 | % IIgrad = [val(a'*Sigma1*R1div*z1), val(a'*Sigma1*R2div*z2), val(a'*Sigma1*R3div*z3); Vgra(II)]
49 | % Igrad = val(-(Sigma1)*theta*z*z');
50 | % total_grad = val((Sigma1)*(a-theta*z)*z')
51 | grad1
52 |
53 |
54 | %% loops
55 | % tot = 0;
56 | % l = 2;
57 | % zz = rand(l,1);
58 | % RR = rand(l,l);
59 | % SS = rand(l,l);
60 | %
61 | % for i=1:l
62 | % for j =1:l
63 | % tot = tot + z1(i)*R1(i,j)*Sigma1(i,j)*R1(j,i)*z1(j);
64 | % end
65 | % end
66 | % tot
67 | %
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/matlab/derivations_check.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | rng(1)
4 |
5 | z_dim = 6;
6 | z = sym('z%d%d', [z_dim 1],'real');
7 | % zValues = rand(size(z))*5;
8 | zValues = [1,2,3,4,5,6]';
9 | a = sym('a%d%d', [2 1],'real');
10 | % aValues = rand(size(a));
11 | aValues = [0.5,1]';
12 | theta = sym('theta%d%d', [2 z_dim],'real');
13 | % thetaValues = rand(size(theta));
14 | % thetaValues(:,5:6)=0;
15 | thetaValues = [1,0.5,1,0.5,1,0.5;2,0.6,2,0.6,2,0.6];
16 | Sigma = sym('Sigma%d%d', [2 2],'real');
17 | SigmaValues = [1,0.01;0.01,1.5];
18 | val = @(eqq) double(subs(eqq,[z(:)',theta(:)',a(:)',Sigma(:)'],[zValues(:)',thetaValues(:)',aValues(:)',SigmaValues(:)']));
19 |
20 | d = 2;
21 | original = log(1/((2*pi)^(d/2)*sqrt(det(Sigma)))*exp(-1/2*(a-theta*z)'*inv(Sigma)*(a-theta*z)));
22 |
23 | eqq1 = -1/2 * (a-theta*z)' * inv(Sigma) * (a-theta*z);
24 | eqq2 =-1/2*z'*theta'*inv(Sigma)*theta*z + a'*inv(Sigma)*theta*z - 1/2*a'*inv(Sigma)*a;
25 | I = -1/2*z'*theta'*inv(Sigma)*theta*z;
26 | II = a'*inv(Sigma)*theta*z;
27 |
28 | % for m=1:z_dim
29 | % dum = theta(:,m);
30 | % eval(['theta' num2str(m) ' = dum;']);
31 | % end
32 |
33 | gra = @(scalar) arrayfun(@(s,v) val(diff(s,v)),scalar*ones(size(theta)),theta);
34 |
35 | %
36 | grad1 = gra(eqq1);
37 | grad2 = gra(eqq2);
38 |
39 | IIgrad = val(inv(Sigma)*a*z');
40 | Igrad = val(-inv(Sigma)*theta*z*z');
41 | total_grad = val(inv(Sigma)*(a-theta*z)*z')
42 | grad1
43 |
44 |
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/matlab/distance_def.m:
--------------------------------------------------------------------------------
1 | %% Visualization of the distance between agents (variables)
2 |
3 | clear all;
4 |
5 | di_hat = 5;
6 | max_dist = 7;
7 | x_dist = linspace(0,max_dist,100);
8 | % capping to d_hat, i.e. dij = min(di_hat, ||xi-xj||-li-lj)
9 | dij = x_dist;
10 | dij(x_dist>di_hat)=di_hat;
11 | dij_norm = di_hat./dij;
12 | log_d = log(dij_norm);
13 |
14 | %% plots
15 | figure;
16 | grid on;
17 | subplot(3,1,1)
18 | plot(x_dist,dij)
19 | title(['$\hat{d}_i =' num2str(di_hat) '$'])
20 | xlabel('$||x_i-x_j||-l_i-l_j$')
21 | ylabel('$d_{ij} = min(\hat{d}_i, ||x_i-x_j||-l_i-l_j)$')
22 | grid on;
23 |
24 | subplot(3,1,2)
25 | plot(x_dist,dij_norm)
26 | xlabel('$||x_i-x_j||-l_i-l_j$')
27 | ylabel('$\frac{\hat{d}_i}{d_{ij}}$')
28 | grid on;
29 |
30 | subplot(3,1,3)
31 | plot(x_dist,log_d)
32 | xlabel('$||x_i-x_j||-l_i-l_j$')
33 | ylabel('$log(\frac{\hat{d}_i}{d_{ij}})$')
34 | grid on;
35 |
36 |
--------------------------------------------------------------------------------
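In summary, the quantities plotted above are

    d_{ij} = \min\big(\hat{d}_i,\ \|x_i - x_j\| - l_i - l_j\big), \qquad \log\frac{\hat{d}_i}{d_{ij}} \ge 0,

so the barrier term is zero once the clipped distance saturates at \hat{d}_i (far-apart agents contribute no collision cost) and grows without bound as d_{ij} \to 0^+.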
/matlab/images/dij.fig:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/matlab/images/dij.fig
--------------------------------------------------------------------------------
/matlab/normal_multivariate_pdf.m:
--------------------------------------------------------------------------------
1 | clear all;
2 |
3 | mu = [0 0];
4 | sigma = 0.1;
5 | Sigma = [sigma 0; 0 sigma];
6 |
7 |
8 | delta_x = 0.05;
9 |
10 | x1 = -3:delta_x:3;
11 | x2 = -3:delta_x:3;
12 | [X1,X2] = meshgrid(x1,x2);
13 | X = [X1(:) X2(:)];
14 | y = mvnpdf(X,mu,Sigma);
15 | y = reshape(y,length(x2),length(x1));
16 | surf(x1,x2,y,'EdgeAlpha',0.1)
17 | caxis([min(y(:))-0.5*range(y(:)),max(y(:))])
18 | % axis([-3 3 -3 3 0 0.4])
19 | xlabel('x1')
20 | ylabel('x2')
21 | zlabel('Probability Density')
22 |
23 | volumen = sum(sum(y*delta_x^2))
24 |
--------------------------------------------------------------------------------
/matlab/optimal_traj.m:
--------------------------------------------------------------------------------
1 | clear all;
2 | close all;
3 |
4 | T = 30;
5 | i_agents = 3;
6 | ri =[1 1 1]*0; % drone radii (set to 0 here)
7 | x_size = 2*i_agents; % dim* n_agents
8 | X_size = T*x_size;
9 | xF = [1 0 4 0 6 0]';
10 | x0 = [-1 0 -2 0 -3 0]';
11 |
12 | normalizing_r = ((xF(1:2:end)-xF(1:2:end)').^2+(xF(2:2:end)-xF(2:2:end)').^2).^0.5;
13 | normalizing_r = mink(normalizing_r,2,2);
14 | normalizing_r = normalizing_r(:,2);
15 |
16 | XB=kron(ones(T,1),xF);
17 | ineqA = eye(T) - (triu(ones(T),-1)-triu(ones(T)));
18 | ineqA = kron(ineqA,eye(x_size));
19 | ineqA = [ineqA;-ineqA];
20 |
21 | u_max = 0.4;
22 | ineqB = ones(X_size*2,1)*u_max;
23 | ineqB(1:x_size) = u_max + x0;
24 | ineqB(X_size+1:X_size+x_size) = u_max - x0;
25 |
26 | X0 = rand(X_size,1);
27 | % load sol
28 | % X0 = sol;
29 |
30 | fun = @(x) f(x,XB,x_size,T,normalizing_r,ri);
31 |
32 | [sol,fval] = fmincon(fun,X0,ineqA,ineqB)
33 |
34 | %% separate solution:
35 | sol_re = reshape(sol,x_size,T);
36 | x_sol = sol_re(1:2:end,:);
37 | y_sol = sol_re(2:2:end,:);
38 |
39 | for t=2:size(x_sol,2)
40 | figure(1)
41 | plot([x0(1:2:end)';x_sol(:,1:t-1)'],[x0(2:2:end)';y_sol(:,1:t-1)'],'-^')
42 | hold on
43 | plot(xF(1:2:end)',xF(2:2:end)','x')
44 | grid on;
45 | plot(x0(1:2:end),x0(2:2:end),'o')
46 | hold off;
47 |
48 | pause(0.2)
49 | drawnow
50 | end
51 |
52 |
53 |
54 | %% cost function
55 | function obj = f(X,XF,x_size,T,normalazing_r,ri)
56 |
57 | b = 0.4;
58 |
59 | % goal cost
60 | obj_end = (X-XF)'*(X-XF);
61 |
62 | % collision cost
63 | xt = reshape(X,x_size,T);
64 | x_t = xt(1:2:end,:);
65 | y_t = xt(2:2:end,:);
66 |
67 | obj_colision = 0;
68 | for t = 1:size(xt,2)
69 | x_diff = x_t(:,t)-x_t(:,t)';
70 | y_diff = y_t(:,t)-y_t(:,t)';
71 |
72 | D = (x_diff.^2 + y_diff.^2).^0.5 + (-ri-ri').*(1-eye(3));
73 | D_clip = min(1,D./normalazing_r)+eye(size(D,1));
74 |
75 | obj_colision = obj_colision -b*sum(sum(log(D_clip)));
76 |
77 | end
78 |
79 | obj=obj_end + obj_colision;
80 | end
--------------------------------------------------------------------------------
/matlab/sol.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/matlab/sol.mat
--------------------------------------------------------------------------------
/models/E1000_n10_DiscretePolicy4_b02-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E1000_n10_DiscretePolicy4_b02-A2Cactors.pth
--------------------------------------------------------------------------------
/models/E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E1000_n10_DiscretePolicy4_b02-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/E500_M30_LR1e4_badInitialState-actors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E500_M30_LR1e4_badInitialState-actors.pth
--------------------------------------------------------------------------------
/models/E500_M30_LR1e4_badInitialState-critics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/E500_M30_LR1e4_badInitialState-critics.pth
--------------------------------------------------------------------------------
/models/cont_preloaded-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/cont_preloaded-A2Cactors.pth
--------------------------------------------------------------------------------
/models/cont_preloaded-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/cont_preloaded-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.01_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.01_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.01_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.01_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.1_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.1_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.1_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.1_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.2_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.2_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.2_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.2_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.5_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.5_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.5_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.5_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.8_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.8_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas0.8_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas0.8_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas1.5_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1.5_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas1.5_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1.5_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas1_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas1_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas1_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas2.43_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.43_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas2.43_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.43_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas/deltas2.5_softmax16-A2Cactors-old (1).pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.5_softmax16-A2Cactors-old (1).pth
--------------------------------------------------------------------------------
/models/deltas/deltas2.5_softmax16-A2Ccritics-old (2).pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2.5_softmax16-A2Ccritics-old (2).pth
--------------------------------------------------------------------------------
/models/deltas/deltas2_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas/deltas2_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas/deltas2_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/deltas2.5_softmax16-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas2.5_softmax16-A2Cactors.pth
--------------------------------------------------------------------------------
/models/deltas2.5_softmax16-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/deltas2.5_softmax16-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/discrete-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/discrete-A2Cactors.pth
--------------------------------------------------------------------------------
/models/discrete-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/discrete-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/final/cont_n5-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/cont_n5-A2Cactors.pth
--------------------------------------------------------------------------------
/models/final/cont_n5-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/cont_n5-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/final/simple-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/simple-A2Cactors.pth
--------------------------------------------------------------------------------
/models/final/simple-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/simple-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/final/softmax8_n4-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n4-A2Cactors.pth
--------------------------------------------------------------------------------
/models/final/softmax8_n4-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n4-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/final/softmax8_n5-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n5-A2Cactors.pth
--------------------------------------------------------------------------------
/models/final/softmax8_n5-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n5-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/final/softmax8_n8-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n8-A2Cactors.pth
--------------------------------------------------------------------------------
/models/final/softmax8_n8-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/final/softmax8_n8-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/n5_E1500_Advantage-actors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/n5_E1500_Advantage-actors.pth
--------------------------------------------------------------------------------
/models/n5_E1500_Advantage-critics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/n5_E1500_Advantage-critics.pth
--------------------------------------------------------------------------------
/models/softmax8_n5-A2Cactors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/softmax8_n5-A2Cactors.pth
--------------------------------------------------------------------------------
/models/softmax8_n5-A2Ccritics.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/softmax8_n5-A2Ccritics.pth
--------------------------------------------------------------------------------
/models/trained_E1000_M50_LR001-actors.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/models/trained_E1000_M50_LR001-actors.pth
--------------------------------------------------------------------------------
/policy_performance_variables_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/policy_performance_variables_1
--------------------------------------------------------------------------------
/policy_performance_variables_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/policy_performance_variables_2
--------------------------------------------------------------------------------
/profile.bat:
--------------------------------------------------------------------------------
1 | @REM Simple bat to profile scripts: profile file.py
2 |
3 | @echo off
4 | python -m cProfile -o dump.prof %1
5 | snakeviz dump.prof
6 | del dump.prof
--------------------------------------------------------------------------------
/spec-file.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: win-64
4 | @EXPLICIT
5 | https://conda.anaconda.org/conda-forge/win-64/nodejs-16.12.0-h57928b3_0.tar.bz2
6 | https://conda.anaconda.org/conda-forge/win-64/ca-certificates-2022.6.15-h5b45459_0.tar.bz2
7 | https://conda.anaconda.org/conda-forge/win-64/intel-openmp-2021.4.0-h57928b3_3556.tar.bz2
8 | https://conda.anaconda.org/conda-forge/win-64/mkl-include-2021.4.0-h0e2418a_729.tar.bz2
9 | https://conda.anaconda.org/conda-forge/win-64/msys2-conda-epoch-20160418-1.tar.bz2
10 | https://conda.anaconda.org/conda-forge/win-64/pandoc-2.16.1-h8ffe710_0.tar.bz2
11 | https://conda.anaconda.org/pytorch/noarch/pytorch-mutex-1.0-cpu.tar.bz2
12 | https://conda.anaconda.org/conda-forge/win-64/ucrt-10.0.20348.0-h57928b3_0.tar.bz2
13 | https://conda.anaconda.org/conda-forge/win-64/winpty-0.4.3-4.tar.bz2
14 | https://conda.anaconda.org/pytorch/noarch/cpuonly-2.0-0.tar.bz2
15 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gmp-6.1.0-2.tar.bz2
16 | https://conda.anaconda.org/conda-forge/win-64/m2w64-libwinpthread-git-5.0.0.4634.697f757-2.tar.bz2
17 | https://conda.anaconda.org/conda-forge/win-64/vs2015_runtime-14.29.30037-h902a5da_5.tar.bz2
18 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-core-5.3.0-7.tar.bz2
19 | https://conda.anaconda.org/conda-forge/win-64/vc-14.2-hb210afc_5.tar.bz2
20 | https://conda.anaconda.org/conda-forge/win-64/ffmpeg-4.3.1-ha925a31_0.tar.bz2
21 | https://conda.anaconda.org/conda-forge/win-64/icu-68.2-h0e60522_0.tar.bz2
22 | https://conda.anaconda.org/conda-forge/win-64/jbig-2.1-h8d14728_2003.tar.bz2
23 | https://conda.anaconda.org/conda-forge/win-64/jpeg-9d-h8ffe710_0.tar.bz2
24 | https://conda.anaconda.org/conda-forge/win-64/lerc-3.0-h0e60522_0.tar.bz2
25 | https://conda.anaconda.org/conda-forge/win-64/libclang-11.1.0-default_h5c34c98_1.tar.bz2
26 | https://conda.anaconda.org/conda-forge/win-64/libdeflate-1.8-h8ffe710_0.tar.bz2
27 | https://conda.anaconda.org/conda-forge/win-64/libsodium-1.0.18-h8d14728_1.tar.bz2
28 | https://conda.anaconda.org/conda-forge/win-64/libuv-1.42.0-h8ffe710_0.tar.bz2
29 | https://conda.anaconda.org/conda-forge/win-64/libzlib-1.2.11-h8ffe710_1013.tar.bz2
30 | https://conda.anaconda.org/conda-forge/win-64/lz4-c-1.9.3-h8ffe710_1.tar.bz2
31 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libgfortran-5.3.0-6.tar.bz2
32 | https://conda.anaconda.org/conda-forge/win-64/openssl-1.1.1q-h8ffe710_0.tar.bz2
33 | https://conda.anaconda.org/conda-forge/win-64/sqlite-3.36.0-h8ffe710_2.tar.bz2
34 | https://conda.anaconda.org/conda-forge/win-64/tbb-2021.4.0-h2d74725_1.tar.bz2
35 | https://conda.anaconda.org/conda-forge/win-64/tk-8.6.11-h8ffe710_1.tar.bz2
36 | https://conda.anaconda.org/conda-forge/win-64/xz-5.2.5-h62dcd97_1.tar.bz2
37 | https://conda.anaconda.org/conda-forge/win-64/m2w64-gcc-libs-5.3.0-7.tar.bz2
38 | https://conda.anaconda.org/conda-forge/win-64/mkl-2021.4.0-h0e2418a_729.tar.bz2
39 | https://conda.anaconda.org/conda-forge/win-64/python-3.8.12-h7840368_2_cpython.tar.bz2
40 | https://conda.anaconda.org/conda-forge/win-64/zeromq-4.3.4-h0e60522_1.tar.bz2
41 | https://conda.anaconda.org/conda-forge/win-64/zlib-1.2.11-h8ffe710_1013.tar.bz2
42 | https://conda.anaconda.org/conda-forge/noarch/async_generator-1.10-py_0.tar.bz2
43 | https://conda.anaconda.org/conda-forge/noarch/attrs-21.2.0-pyhd8ed1ab_0.tar.bz2
44 | https://conda.anaconda.org/conda-forge/noarch/backcall-0.2.0-pyh9f0ad1d_0.tar.bz2
45 | https://conda.anaconda.org/conda-forge/noarch/backports-1.0-py_2.tar.bz2
46 | https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-2.0.0-pyhd8ed1ab_0.tar.bz2
47 | https://conda.anaconda.org/conda-forge/noarch/cloudpickle-2.0.0-pyhd8ed1ab_0.tar.bz2
48 | https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.4-pyh9f0ad1d_0.tar.bz2
49 | https://conda.anaconda.org/conda-forge/noarch/cycler-0.11.0-pyhd8ed1ab_0.tar.bz2
50 | https://conda.anaconda.org/conda-forge/noarch/decorator-5.1.0-pyhd8ed1ab_0.tar.bz2
51 | https://conda.anaconda.org/conda-forge/noarch/defusedxml-0.7.1-pyhd8ed1ab_0.tar.bz2
52 | https://conda.anaconda.org/conda-forge/noarch/entrypoints-0.3-pyhd8ed1ab_1003.tar.bz2
53 | https://conda.anaconda.org/conda-forge/noarch/idna-3.1-pyhd3deb0d_0.tar.bz2
54 | https://conda.anaconda.org/conda-forge/noarch/ipython_genutils-0.2.0-py_1.tar.bz2
55 | https://conda.anaconda.org/conda-forge/noarch/json5-0.9.5-pyh9f0ad1d_0.tar.bz2
56 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_widgets-1.0.2-pyhd8ed1ab_0.tar.bz2
57 | https://conda.anaconda.org/conda-forge/win-64/libblas-3.9.0-12_win64_mkl.tar.bz2
58 | https://conda.anaconda.org/conda-forge/win-64/libpng-1.6.37-h1d00b33_2.tar.bz2
59 | https://conda.anaconda.org/conda-forge/win-64/mkl-devel-2021.4.0-h57928b3_730.tar.bz2
60 | https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.5.1-pyhd8ed1ab_0.tar.bz2
61 | https://conda.anaconda.org/conda-forge/noarch/olefile-0.46-pyh9f0ad1d_1.tar.bz2
62 | https://conda.anaconda.org/conda-forge/noarch/pandocfilters-1.5.0-pyhd8ed1ab_0.tar.bz2
63 | https://conda.anaconda.org/conda-forge/noarch/parso-0.8.2-pyhd8ed1ab_0.tar.bz2
64 | https://conda.anaconda.org/conda-forge/noarch/pickleshare-0.7.5-py_1003.tar.bz2
65 | https://conda.anaconda.org/conda-forge/noarch/prometheus_client-0.12.0-pyhd8ed1ab_0.tar.bz2
66 | https://conda.anaconda.org/conda-forge/noarch/pycodestyle-2.8.0-pyhd8ed1ab_0.tar.bz2
67 | https://conda.anaconda.org/conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2
68 | https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.0.5-pyhd8ed1ab_0.tar.bz2
69 | https://conda.anaconda.org/conda-forge/win-64/python_abi-3.8-2_cp38.tar.bz2
70 | https://conda.anaconda.org/conda-forge/noarch/pytz-2021.3-pyhd8ed1ab_0.tar.bz2
71 | https://conda.anaconda.org/conda-forge/noarch/qtpy-1.11.2-pyhd8ed1ab_0.tar.bz2
72 | https://conda.anaconda.org/conda-forge/noarch/send2trash-1.8.0-pyhd8ed1ab_0.tar.bz2
73 | https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2
74 | https://conda.anaconda.org/conda-forge/noarch/testpath-0.5.0-pyhd8ed1ab_0.tar.bz2
75 | https://conda.anaconda.org/conda-forge/noarch/textwrap3-0.9.2-py_0.tar.bz2
76 | https://conda.anaconda.org/conda-forge/noarch/toml-0.10.2-pyhd8ed1ab_0.tar.bz2
77 | https://conda.anaconda.org/conda-forge/noarch/traitlets-5.1.1-pyhd8ed1ab_0.tar.bz2
78 | https://conda.anaconda.org/conda-forge/noarch/typing_extensions-3.10.0.2-pyha770c72_0.tar.bz2
79 | https://conda.anaconda.org/conda-forge/noarch/webencodings-0.5.1-py_1.tar.bz2
80 | https://conda.anaconda.org/conda-forge/noarch/wheel-0.37.0-pyhd8ed1ab_1.tar.bz2
81 | https://conda.anaconda.org/conda-forge/noarch/zipp-3.6.0-pyhd8ed1ab_0.tar.bz2
82 | https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.0-h6255e5f_0.tar.bz2
83 | https://conda.anaconda.org/conda-forge/noarch/autopep8-1.6.0-pyhd8ed1ab_1.tar.bz2
84 | https://conda.anaconda.org/conda-forge/noarch/babel-2.9.1-pyh44b312d_0.tar.bz2
85 | https://conda.anaconda.org/conda-forge/win-64/certifi-2022.6.15-py38haa244fe_0.tar.bz2
86 | https://conda.anaconda.org/conda-forge/win-64/cffi-1.15.0-py38hd8c33c5_0.tar.bz2
87 | https://conda.anaconda.org/conda-forge/win-64/chardet-4.0.0-py38haa244fe_2.tar.bz2
88 | https://conda.anaconda.org/conda-forge/win-64/debugpy-1.5.1-py38h885f38d_0.tar.bz2
89 | https://conda.anaconda.org/conda-forge/win-64/freetype-2.10.4-h546665d_1.tar.bz2
90 | https://conda.anaconda.org/conda-forge/win-64/future-0.18.2-py38haa244fe_4.tar.bz2
91 | https://conda.anaconda.org/conda-forge/win-64/importlib-metadata-4.8.2-py38haa244fe_0.tar.bz2
92 | https://conda.anaconda.org/conda-forge/noarch/importlib_resources-5.4.0-pyhd8ed1ab_0.tar.bz2
93 | https://conda.anaconda.org/conda-forge/win-64/jedi-0.18.0-py38haa244fe_3.tar.bz2
94 | https://conda.anaconda.org/conda-forge/win-64/kiwisolver-1.3.2-py38hbd9d945_1.tar.bz2
95 | https://conda.anaconda.org/conda-forge/win-64/libcblas-3.9.0-12_win64_mkl.tar.bz2
96 | https://conda.anaconda.org/conda-forge/win-64/liblapack-3.9.0-12_win64_mkl.tar.bz2
97 | https://conda.anaconda.org/conda-forge/win-64/libtiff-4.3.0-hd413186_2.tar.bz2
98 | https://conda.anaconda.org/conda-forge/win-64/markupsafe-2.0.1-py38h294d835_1.tar.bz2
99 | https://conda.anaconda.org/conda-forge/noarch/matplotlib-inline-0.1.3-pyhd8ed1ab_0.tar.bz2
100 | https://conda.anaconda.org/conda-forge/win-64/mistune-0.8.4-py38h294d835_1005.tar.bz2
101 | https://conda.anaconda.org/conda-forge/noarch/packaging-21.0-pyhd8ed1ab_0.tar.bz2
102 | https://conda.anaconda.org/conda-forge/win-64/pybox2d-2.3.10-py38h885f38d_1.tar.bz2
103 | https://conda.anaconda.org/conda-forge/win-64/pyqt5-sip-4.19.18-py38h885f38d_7.tar.bz2
104 | https://conda.anaconda.org/conda-forge/win-64/pyrsistent-0.18.0-py38h294d835_0.tar.bz2
105 | https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.8.2-pyhd8ed1ab_0.tar.bz2
106 | https://conda.anaconda.org/conda-forge/win-64/pywin32-302-py38h294d835_2.tar.bz2
107 | https://conda.anaconda.org/conda-forge/win-64/pywinpty-1.1.5-py38hd3f51b4_1.tar.bz2
108 | https://conda.anaconda.org/conda-forge/win-64/pyzmq-22.3.0-py38h09162b1_1.tar.bz2
109 | https://conda.anaconda.org/conda-forge/win-64/qt-5.12.9-h5909a2a_4.tar.bz2
110 | https://conda.anaconda.org/conda-forge/win-64/setuptools-58.5.3-py38haa244fe_0.tar.bz2
111 | https://conda.anaconda.org/conda-forge/win-64/sniffio-1.2.0-py38haa244fe_2.tar.bz2
112 | https://conda.anaconda.org/conda-forge/win-64/tornado-6.1-py38h294d835_2.tar.bz2
113 | https://conda.anaconda.org/conda-forge/noarch/tqdm-4.62.3-pyhd8ed1ab_0.tar.bz2
114 | https://conda.anaconda.org/conda-forge/win-64/websocket-client-0.57.0-py38haa244fe_6.tar.bz2
115 | https://conda.anaconda.org/conda-forge/win-64/win_inet_pton-1.1.0-py38haa244fe_3.tar.bz2
116 | https://conda.anaconda.org/conda-forge/win-64/anyio-3.3.4-py38haa244fe_0.tar.bz2
117 | https://conda.anaconda.org/conda-forge/win-64/argon2-cffi-21.1.0-py38h294d835_2.tar.bz2
118 | https://conda.anaconda.org/conda-forge/noarch/backports.functools_lru_cache-1.6.4-pyhd8ed1ab_0.tar.bz2
119 | https://conda.anaconda.org/conda-forge/noarch/bleach-4.1.0-pyhd8ed1ab_0.tar.bz2
120 | https://conda.anaconda.org/conda-forge/win-64/brotlipy-0.7.0-py38h294d835_1003.tar.bz2
121 | https://conda.anaconda.org/conda-forge/win-64/cryptography-35.0.0-py38hb7941b4_2.tar.bz2
122 | https://conda.anaconda.org/conda-forge/noarch/jinja2-3.0.2-pyhd8ed1ab_0.tar.bz2
123 | https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.2.1-pyhd8ed1ab_0.tar.bz2
124 | https://conda.anaconda.org/conda-forge/win-64/jupyter_core-4.9.1-py38haa244fe_0.tar.bz2
125 | https://conda.anaconda.org/conda-forge/win-64/lcms2-2.12-h2a16943_0.tar.bz2
126 | https://conda.anaconda.org/conda-forge/win-64/liblapacke-3.9.0-12_win64_mkl.tar.bz2
127 | https://conda.anaconda.org/conda-forge/win-64/numpy-1.21.4-py38h089cfbf_0.tar.bz2
128 | https://conda.anaconda.org/conda-forge/win-64/openjpeg-2.4.0-hb211442_1.tar.bz2
129 | https://conda.anaconda.org/conda-forge/noarch/pip-21.3.1-pyhd8ed1ab_0.tar.bz2
130 | https://conda.anaconda.org/conda-forge/win-64/pyglet-1.5.16-py38haa244fe_0.tar.bz2
131 | https://conda.anaconda.org/conda-forge/noarch/pygments-2.10.0-pyhd8ed1ab_0.tar.bz2
132 | https://conda.anaconda.org/conda-forge/win-64/pyqt-impl-5.12.3-py38h885f38d_7.tar.bz2
133 | https://conda.anaconda.org/conda-forge/win-64/pysocks-1.7.1-py38haa244fe_4.tar.bz2
134 | https://conda.anaconda.org/conda-forge/noarch/snakeviz-2.1.1-pyhd8ed1ab_0.tar.bz2
135 | https://conda.anaconda.org/conda-forge/win-64/terminado-0.12.1-py38haa244fe_1.tar.bz2
136 | https://conda.anaconda.org/conda-forge/win-64/blas-devel-3.9.0-12_win64_mkl.tar.bz2
137 | https://conda.anaconda.org/conda-forge/win-64/gym-0.21.0-py38h595d716_0.tar.bz2
138 | https://conda.anaconda.org/conda-forge/noarch/jupyter_client-7.0.6-pyhd8ed1ab_0.tar.bz2
139 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_pygments-0.1.2-pyh9f0ad1d_0.tar.bz2
140 | https://conda.anaconda.org/conda-forge/noarch/nbformat-5.1.3-pyhd8ed1ab_0.tar.bz2
141 | https://conda.anaconda.org/conda-forge/win-64/pandas-1.3.4-py38h5d928e2_1.tar.bz2
142 | https://conda.anaconda.org/conda-forge/win-64/pillow-8.3.2-py38h794f750_0.tar.bz2
143 | https://conda.anaconda.org/conda-forge/noarch/pyopenssl-21.0.0-pyhd8ed1ab_0.tar.bz2
144 | https://conda.anaconda.org/conda-forge/win-64/pyqtchart-5.12-py38h885f38d_7.tar.bz2
145 | https://conda.anaconda.org/conda-forge/win-64/pyqtwebengine-5.12.1-py38h885f38d_7.tar.bz2
146 | https://conda.anaconda.org/conda-forge/win-64/scipy-1.7.2-py38ha1292f7_0.tar.bz2
147 | https://conda.anaconda.org/conda-forge/noarch/wcwidth-0.2.5-pyh9f0ad1d_2.tar.bz2
148 | https://conda.anaconda.org/conda-forge/noarch/autograd-1.4-pyhd8ed1ab_0.tar.bz2
149 | https://conda.anaconda.org/conda-forge/win-64/blas-2.112-mkl.tar.bz2
150 | https://conda.anaconda.org/conda-forge/win-64/matplotlib-base-3.4.3-py38h1f000d6_1.tar.bz2
151 | https://conda.anaconda.org/conda-forge/noarch/nbclient-0.5.5-pyhd8ed1ab_0.tar.bz2
152 | https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.22-pyha770c72_0.tar.bz2
153 | https://conda.anaconda.org/conda-forge/win-64/pyqt-5.12.3-py38haa244fe_7.tar.bz2
154 | https://conda.anaconda.org/conda-forge/noarch/urllib3-1.26.7-pyhd8ed1ab_0.tar.bz2
155 | https://conda.anaconda.org/conda-forge/win-64/ipython-7.29.0-py38h595d716_1.tar.bz2
156 | https://conda.anaconda.org/conda-forge/win-64/matplotlib-3.4.3-py38haa244fe_1.tar.bz2
157 | https://conda.anaconda.org/conda-forge/win-64/nbconvert-6.2.0-py38haa244fe_0.tar.bz2
158 | https://conda.anaconda.org/conda-forge/noarch/prompt_toolkit-3.0.22-hd8ed1ab_0.tar.bz2
159 | https://conda.anaconda.org/pytorch/win-64/pytorch-1.10.0-py3.8_cpu_0.tar.bz2
160 | https://conda.anaconda.org/conda-forge/noarch/requests-2.26.0-pyhd8ed1ab_0.tar.bz2
161 | https://conda.anaconda.org/conda-forge/win-64/ipykernel-6.4.2-py38h595d716_0.tar.bz2
162 | https://conda.anaconda.org/conda-forge/noarch/jupyter_server-1.11.2-pyhd8ed1ab_0.tar.bz2
163 | https://conda.anaconda.org/pytorch/win-64/torchvision-0.11.1-py38_cpu.tar.bz2
164 | https://conda.anaconda.org/conda-forge/noarch/jupyter_console-6.4.0-pyhd8ed1ab_0.tar.bz2
165 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab_server-2.8.2-pyhd8ed1ab_0.tar.bz2
166 | https://conda.anaconda.org/conda-forge/noarch/notebook-6.4.5-pyha770c72_0.tar.bz2
167 | https://conda.anaconda.org/conda-forge/noarch/qtconsole-5.2.0-pyhd8ed1ab_0.tar.bz2
168 | https://conda.anaconda.org/conda-forge/noarch/nbclassic-0.3.4-pyhd8ed1ab_0.tar.bz2
169 | https://conda.anaconda.org/conda-forge/win-64/widgetsnbextension-3.5.2-py38haa244fe_0.tar.bz2
170 | https://conda.anaconda.org/conda-forge/noarch/ipywidgets-7.6.5-pyhd8ed1ab_0.tar.bz2
171 | https://conda.anaconda.org/conda-forge/noarch/jupyterlab-3.2.2-pyhd8ed1ab_0.tar.bz2
172 | https://conda.anaconda.org/conda-forge/win-64/jupyter-1.0.0-py38haa244fe_6.tar.bz2
173 |
--------------------------------------------------------------------------------
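spec-file.txt above is a conda explicit spec (the format written by "conda list --explicit"): every line pins one noarch or win-64 package build. Assuming a working conda installation on Windows, an environment with these exact packages can usually be recreated with the standard command below; the environment name "drone-rl" is only a placeholder:

    conda create --name drone-rl --file spec-file.txt
    conda activate drone-rl
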
/train_problem.py:
--------------------------------------------------------------------------------
1 | from collections import deque, namedtuple
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import drone_env
5 | from drone_env import running_average, plot_rewards, plot_grads
6 | from tqdm import tqdm, trange
7 | from SAC_agents import SA2CAgents, RandomAgent, TrainedAgent, SPPOAgents
8 | from utils import ExperienceBuffers, DiscreteSoftmaxNN, NormalPolicy
9 |
10 | plt.style.use('seaborn-dark-palette')
11 | tex_fonts = {
12 | # Use LaTeX to write all text
13 | # "text.usetex": True,
14 | "font.family": "sans-serif",
15 | # Use 10pt font in plots, to match 10pt font in document
16 | "axes.labelsize": 10,
17 | "font.size": 10,
18 | # Make the legend/label fonts a little smaller
19 | "legend.fontsize": 10,
20 | "xtick.labelsize": 10,
21 | "ytick.labelsize": 10
22 | }
23 | plt.rcParams.update(tex_fonts)
24 |
25 |
26 | ### Set up parameters ###
27 | n_agents = 5
28 | deltas = np.ones(n_agents)*2.43
29 | # deltas = None
30 | env = drone_env.drones(n_agents=n_agents, n_obstacles=0, grid=[5, 5], end_formation="O", deltas=deltas, simplify_zstate=True)
31 | env.collision_weight = 0.2 # old 0.2
32 | print(env)
33 | # env.show()
34 |
35 | N_Episodes = 3000
36 | episodes_to_plot = [3000]
37 | # episodes_to_plot = [1500]
38 | save_name = "deltas2.5_softmax16"
39 |
40 | discount_factor = 0.99
41 | alpha_critic = 10**-3
42 | alpha_actor = 10**-3
43 | M = 10 # Epochs, i.e. steps of SGD for the actor-critic NN in the PPO variant
44 | dim_z = env.local_state_space # Dimension of the localized z_state space
45 | dim_a = env.local_action_space # Dimension of the local action space
46 |
47 |
48 | # Initialize variables
49 | total_collisions_per_episode = []
50 | total_reward_per_episode = []
51 | total_true_reward_per_episode =[]
52 | total_t = []
53 | grad_per_episode = np.zeros([N_Episodes, n_agents])
54 | gi_per_episode = np.zeros_like(grad_per_episode)
55 |
56 | # times = np.arange(0, T, step=drone_env.dt) + drone_env.dt
57 |
58 |
59 | agents = SA2CAgents(n_agents=env.n_agents, dim_local_state = dim_z, dim_local_action=dim_a, discount=discount_factor, epochs=M, learning_rate_critic=alpha_critic, learning_rate_actor=alpha_actor)
60 | print(f"### Running {type(agents)}, actor: {type(agents.actors[0])} with params: ###")
61 | print(f"Episodes = {N_Episodes}, max Time iterations = {drone_env.max_time_steps} (T = {drone_env.max_time_steps * drone_env.dt}s, dt = {drone_env.dt}s)")
62 | print(f"N of agents = {env.n_agents}, structure of critic NN = {agents.criticsNN[0].input_size}x{agents.criticsNN[0].L1}x{agents.criticsNN[0].L2}x{agents.criticsNN[0].output_size}")
63 | print(f"Discount = {discount_factor}, lr for NN critical = {alpha_critic}, lr for actor = {alpha_actor}, collision weight b = {env.collision_weight}")
64 |
65 | EPISODES = trange(N_Episodes, desc='Episode: ', leave=True)
66 | for episode in EPISODES:
67 |
68 | if episode+1 in episodes_to_plot:
69 | # reward_history = np.zeros([len(times), env.n_agents])
70 | trajectory = [env.state.copy()]
71 | z_trajectory = [env.z_states]
72 | total_episode_reward = 0
73 | total_true_episode_reward = 0
74 | total_episode_collisions = 0
75 | # env.show()
76 |
77 | buffers = ExperienceBuffers(env.n_agents)
78 |
79 | # SIMULATION OVER T
80 | t_iter = 0
81 | finished = False
82 | while not finished:
83 |
84 | state = env.state
85 | z_states = env.z_states
86 | Ni = env.Ni
87 |
88 | # calculate actions based on current state
89 | # actions = drone_env.gradient_control(state, env)
90 | # actions = drone_env.proportional_control(state, env)
91 | actions = agents.forward(z_states, Ni)
92 |
93 | # Update environment one time step with the actions
94 | new_state, new_z, rewards, n_collisions, finished, true_rewards = env.step(actions)
95 | # EXPERIENCE: [z_state, action, reward, next_z, Ni, finished]
96 | buffers.append(z_states, actions, rewards, new_z, Ni, finished)
97 |
98 | total_episode_reward += np.mean(rewards)
99 | total_true_episode_reward += np.mean(true_rewards)
100 | total_episode_collisions += n_collisions
101 |
102 | if episode+1 in episodes_to_plot:
103 | # reward_history[t_iter,:] = reward
104 | trajectory.append(new_state.copy())
105 | z_trajectory.append(new_z)
106 |
107 | t_iter +=1
108 |
109 | ### END OF EPISODE
110 | # Train the critic (and actor) with the data collected during the episode
111 | # current_grad_norms, current_gi_norms = agents.train(buffers, actor_lr = alpha_actor, return_grads=True)
112 | if type(agents.actors[0]) is NormalPolicy:
113 | agents.train_designed_policy(buffers, actor_lr = alpha_actor, return_grads=False)
114 | else:
115 | agents.train_NN(buffers, actor_lr = alpha_actor)
116 |
117 | # Append episodic variables/logs
118 | total_reward_per_episode.append(total_episode_reward)
119 | total_true_reward_per_episode.append(total_true_episode_reward)
120 | total_collisions_per_episode.append(total_episode_collisions)
121 | total_t.append(t_iter)
122 | # grad_per_episode[episode,:] = np.array(current_grad_norms)
123 | # gi_per_episode[episode,:] = np.array(current_gi_norms)
124 |
125 | if episode+1 in episodes_to_plot:
126 | Q_simulated, V_approx = agents.benchmark_cirtic(buffers, only_one_NN=False)
127 |
128 | # print(f"Episode collisions = {total_episode_collisions}")
129 | # env.animate(trajectory,frame_time=0.1)
130 |
131 | # RESET ENVIRONMENT
132 | env.reset(renew_obstacles=False)
133 |
134 | # Set progress bar description with information
135 | average_reward = running_average(total_reward_per_episode, 50)[-1]
136 | average_true_reward = running_average(total_true_reward_per_episode, 50)[-1]
137 | average_collisions = running_average(total_collisions_per_episode, 50)[-1]
138 | average_t = running_average(total_t, 50)[-1]
139 | EPISODES.set_description(
140 | f"Episode {episode} - Reward/Collisions/Steps: {total_episode_reward:.1f}/{total_episode_collisions}/{t_iter} - Average: {average_reward:.1f}/{average_collisions:.2f}/{average_t}. True r={average_true_reward:.1f}.")
141 |
142 | # Plot current trajectory
143 |
144 | if episode+1 in episodes_to_plot:
145 | env.plot(trajectory, episode)
146 | env.animate(trajectory, z_trajectory, deltas, episode, name=f"training-E{episode+1}", format="mp4")
147 | times = np.arange(0, t_iter)*drone_env.dt
148 | plt.figure()
149 | for i in range(env.n_agents):
150 | agent_color = drone_env.num_to_rgb(i,env.n_agents-1)
151 | plt.plot(times,Q_simulated[i], label=f"i={i}, simulated Q (Gt)", color = agent_color)
152 | plt.plot(times,V_approx[i],"--" , label=f"i={i}, approx V", color = tuple(0.9*x for x in agent_color))
153 | if type(agents.actors[0]) is NormalPolicy:
154 | print(f"Agent {i} params = {agents.actors[i].parameters}")
155 | plt.legend()
156 | plt.show()
157 |
158 | agents.save(filename=save_name)
159 |
160 | plot_rewards(total_reward_per_episode, total_true_reward_per_episode, total_collisions_per_episode, n_ep_running_average=50)
161 | # plt.savefig("images/reward_training.pdf",format='pdf', bbox_inches='tight')
162 | # plot_grads(grad_per_episode,gi_per_episode)
--------------------------------------------------------------------------------
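In the benchmark plot at the end of training, "simulated Q (Gt)" presumably denotes the return actually collected from each time step onward, which agents.benchmark_cirtic compares against the critic's value estimate V; its exact implementation lives in SAC_agents.py and is not shown in this section. As a point of reference only, a minimal standalone sketch of a discounted return with the same discount factor used above (0.99) could look like this:

    import numpy as np

    def discounted_returns(rewards, discount=0.99):
        """G_t = r_t + discount * G_{t+1}, accumulated backwards over one episode."""
        G = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + discount * running
            G[t] = running
        return G

    # Example: a constant reward of -1 over 5 time steps
    print(discounted_returns([-1.0, -1.0, -1.0, -1.0, -1.0]))
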
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from autograd import numpy as anp
5 | from autograd import grad
6 | import torch
7 | # import torch
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | from collections import deque, namedtuple
11 |
12 | """ Classes used for agents and other useful functions """
13 |
14 | class CriticNN(nn.Module):
15 | """ Create local critic network
16 | """
17 | # NN sizes: define size of hidden layer
18 | L1 = 200
19 | L2 = 200
20 |
21 | def __init__(self, input_size, output_size = 1):
22 | super().__init__()
23 |
24 | self.input_size = input_size
25 | self.output_size = output_size
26 |
27 | # Create input layer with ReLU activation
28 | self.input_layer = nn.Linear(input_size, self.L1)
29 | self.input_layer_activation = nn.ReLU()
30 |
31 | # Create hidden layers. 1
32 | self.hidden_layer1 = nn.Linear(self.L1, self.L2)
33 | self.hidden_layer1_activation = nn.ReLU()
34 |
35 | # Create output layer. NO ACTIVATION
36 | self.output_layer = nn.Linear(self.L2, output_size)
37 |
38 | def forward(self, z):
39 | '''z must be a properly formatted z vector (torch tensor)'''
40 | # Function used to compute the forward pass
41 |
42 | # Compute first layer
43 | l1 = self.input_layer(z)
44 | l1 = self.input_layer_activation(l1)
45 |
46 | # Compute hidden layers
47 | l2 = self.hidden_layer1(l1)
48 | l2 = self.hidden_layer1_activation(l2)
49 |
50 | # Compute output layer
51 | out = self.output_layer(l2)
52 |
53 | return out
54 |
55 | class NormalActorNN(nn.Module):
56 | """ NN for a policy that as input takes the z state and outputs 2D means and sigma of a independent normal distributions
57 | In this case: z[1x6] -> mu[1x2], sigma^2[1x2]
58 | """
59 | def __init__(self, input_size, lr ,dim_action):
60 | super().__init__()
61 |
62 | self.dim_action = dim_action
63 | # NN sizes: define size of layers
64 | Ls = 400
65 | hidden_1= 200
66 | hidden_2= 200
67 |
68 | # Ls, Create input layer with ReLU activation
69 | self.input_layer = nn.Linear(input_size, Ls)
70 | self.input_layer_activation = nn.ReLU()
71 |
72 | # Create hidden layer, Ls -> head1
73 | self.hidden_layer1 = nn.Linear(Ls, hidden_1)
74 | self.hidden_layer1_activation = nn.ReLU()
75 | # Create hidden layer, Ls -> head2
76 | self.hidden_layer2 = nn.Linear(Ls, hidden_2)
77 | self.hidden_layer2_activation = nn.ReLU()
78 |
79 | # Create output layers for each head: out_1: means (tanh in [-1,1]). out_2: sigma^2 (sigmoid in [0,1])
80 | self.out_1 = nn.Linear(hidden_1,dim_action)
81 | self.out_1_activation = nn.Tanh()
82 | self.out_2 = nn.Linear(hidden_2,dim_action)
83 | self.out_2_activation = nn.Sigmoid()
84 |
85 | # Use adam optimizer
86 | self.optimizer = optim.Adam(self.parameters(), lr = lr)
87 |
88 | def forward(self, z):
89 | # Compute first layer
90 | l1 = self.input_layer(z)
91 | l1 = self.input_layer_activation(l1)
92 |
93 | # Compute hidden layers
94 | l2_head1 = self.hidden_layer1(l1)
95 | l2_head1 = self.hidden_layer1_activation(l2_head1)
96 |
97 | l2_head2 = self.hidden_layer2(l1)
98 | l2_head2 = self.hidden_layer2_activation(l2_head2)
99 |
100 | # Compute output layers
101 | out_1 = self.out_1(l2_head1)
102 | out_1 = self.out_1_activation(out_1)
103 |
104 | out_2 = self.out_2(l2_head2)
105 | out_2 = self.out_2_activation(out_2)
106 |
107 | # out_1 = mu, out_2 = sigma^2
108 | return out_1,out_2
109 |
110 | def sample_action(self,z:np.ndarray, N=None):
111 | state_tensor = torch.tensor(z, dtype=torch.float32)
112 | mu_tensor,sigma_tensor = self.forward(state_tensor)
113 | # Sample a normally distributed action using the mu and sigma^2 (variance) from the actor NN
114 | std = np.sqrt(sigma_tensor.detach().numpy())
115 | action = np.random.normal(mu_tensor.detach().numpy(),std)
116 | # action = np.clip(action,-1,1)
117 | return action
118 |
119 | def log_p_of_a(self,z:np.ndarray, a:np.ndarray):
120 | state_tensor = torch.tensor(z, dtype=torch.float32)
121 | mu , sigma = self.forward(state_tensor)
122 | if mu.dim() == 1:
123 | p1 = torch.pow(2 * np.pi * sigma[0], -0.5) * torch.exp(-(a[0] - mu[0])**2 / (2 * sigma[0]))
124 | p2 = torch.pow(2 * np.pi * sigma[1], -0.5) * torch.exp(-(a[1] - mu[1])**2 / (2 * sigma[1]))
125 | else:
126 | p1 = torch.pow(2 * np.pi * sigma[:,0], -0.5) * torch.exp(-(a[:,0] - mu[:,0])**2 / (2 * sigma[:,0]))
127 | p2 = torch.pow(2 * np.pi * sigma[:,1], -0.5) * torch.exp(-(a[:,1] - mu[:,1])**2 / (2 * sigma[:,1]))
128 |
129 | p = p1*p2
130 | return torch.log(p)
131 |
132 | class NormalPolicy:
133 | """Policy that uses a multivatriable normal distribution.
134 | The parameters are theta, which the angles of the Rot mat for each vector:
135 | parameters: mu = theta * z
136 | gradient: w.r.t theta
137 | covariance matrix: Constant for now, not a parameter
138 |
139 | CAREFUL: Changes the shape of z,a inputs to columns
140 |
141 | NOTICE: individual values of p(a|z) can be greater than 1, as this is a density funct. (pdf, continous)
142 | the pdf of a singular point makes no sense, neeeds to be over a differential of a (i.e pdf is per unit lenght)
143 | """
144 | def __init__(self, input_size, output_size = 2, Sigma = None) -> None:
145 |
146 | self.take_all_states = False
147 | self.dim = output_size
148 | self.z_dim = input_size
149 |
150 | # param =anp.array([-1.6,-1.6,-1.6])
151 | param =-anp.ones(int(self.z_dim/self.dim))*0
152 |
153 | self.parameters = param
154 | if Sigma is None:
155 | self.Sigma = anp.eye(self.dim)*0.3
156 | else:
157 | self.Sigma = Sigma
158 |
159 | def p_of_a(self, z:np.ndarray, a:np.ndarray) -> np.ndarray:
160 | ''' a needs to be a row vector (1D flat)
161 | z needs to be a row vector (1D flat)
162 | '''
163 | pass
164 |
165 | def compute_grad(self, z:np.ndarray, a:np.ndarray, Ni):
166 | ''' a needs to be a row vector (1D flat)
167 | z needs to be a row vector (1D flat)
168 | Ni indicates the states that are neighbors
169 | '''
170 | # Make vectors proper shape (column, for math)
171 | z.shape = (np.size(z),1)
172 | a.shape = (np.size(a),1)
173 |
174 | # Used to only calculate the gradient of the states that actually count
175 | if self.take_all_states:
176 | idx = anp.ones(int(self.z_dim/self.dim))
177 | else:
178 | idx = np.arange(1,int(self.z_dim/self.dim+1))<=len(Ni)
179 |
180 | # Define the scalar function (Gaussian log-likelihood up to a constant) and let autograd differentiate it w.r.t. theta: https://github.com/HIPS/autograd/blob/master/docs/tutorial.md
181 | def my_fun(variable):
182 | R0 = anp.array([[anp.cos(variable[0]), -anp.sin(variable[0])],[anp.sin(variable[0]),anp.cos(variable[0])]])*idx[0]
183 | R1 = anp.array([[anp.cos(variable[1]), -anp.sin(variable[1])],[anp.sin(variable[1]),anp.cos(variable[1])]])*idx[1]
184 | R2 = anp.array([[anp.cos(variable[2]), -anp.sin(variable[2])],[anp.sin(variable[2]),anp.cos(variable[2])]])*idx[2]
185 | R = anp.concatenate((R0,R1,R2),1)
186 |
187 | return (-1/2*(a- R @ z).T @ np.linalg.inv(self.Sigma) @ (a- R @ z))[0,0]
188 |
189 | grad_fun = grad(my_fun)
190 | self.grad = grad_fun(self.parameters)
191 | z.shape = (np.size(z),)
192 | a.shape = (np.size(a),)
193 |
194 | return self.grad
195 |
196 | def clip_grad_norm(self, grad:np.ndarray, clip_norm:float):
197 | # If the gradient norm is to be clipped to a value:
198 | grad_norm = np.linalg.norm(grad.flatten())
199 | # If the current norm is less than the clipping value, do nothing. If it is larger, rescale the gradient so its norm equals clip_norm
200 | if grad_norm <= clip_norm:
201 | return grad
202 | else:
203 | return grad * clip_norm/grad_norm
204 |
205 |
206 | def sample_action(self, z:np.ndarray, Ni):
207 | # Maybe add a mask so that null states are not accounted for
208 | z.shape = (np.size(z),1)
209 |
210 | # Used to only calculate the gradient of the states that actually count
211 | if self.take_all_states:
212 | idx = anp.ones(int(self.z_dim/self.dim))
213 | else:
214 | idx = np.arange(1,int(self.z_dim/self.dim+1))<=len(Ni)
215 |
216 | variable = self.parameters
217 | R0 = anp.array([[anp.cos(variable[0]), -anp.sin(variable[0])],[anp.sin(variable[0]),anp.cos(variable[0])]])*idx[0]
218 | R1 = anp.array([[anp.cos(variable[1]), -anp.sin(variable[1])],[anp.sin(variable[1]),anp.cos(variable[1])]])*idx[1]
219 | R2 = anp.array([[anp.cos(variable[2]), -anp.sin(variable[2])],[anp.sin(variable[2]),anp.cos(variable[2])]])*idx[2]
220 | # R3 = anp.array([[anp.cos(variable[3]), -anp.sin(variable[3])],[anp.sin(variable[3]),anp.cos(variable[3])]])*idx[3]
221 | R = anp.concatenate((R0,R1,R2),1)
222 |
223 | mu = (R @ z).flatten()
224 |
225 | z.shape = (np.size(z),)
226 | a = np.random.multivariate_normal(mu, self.Sigma)
227 |
228 | # Clip the action to not have infinite action
229 | return np.clip(a,-2,+2)
230 |
231 |
232 | class ExperienceBuffers:
233 | """ List of buffers for each agent.
234 | Each agent has its own buffer i of experiences: ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished']
235 | To get data, for example: buffers.buffers[i][t].action
236 | """
237 | def __init__(self, n_agents):
238 | # Create buffer for each agent
239 | self.buffers = [deque() for i in range(n_agents)]
240 | self.n_agents = n_agents
241 | self.experience = namedtuple('experience',
242 | ['z_state', 'action', 'reward', 'next_z', 'Ni', 'finished'])
243 |
244 | def append(self,z_states, actions, rewards, new_z, Ni, finished):
245 | # Append experience to the buffer
246 | for i in range(self.n_agents):
247 | # Create localized experience tuple. Also, flatten state and action vectors
248 | exp = self.experience(z_states[i].flatten(), actions[i].flatten(), rewards[i], new_z[i].flatten(), Ni[i], finished)
249 | self.buffers[i].append(exp)
250 |
251 | def __len__(self):
252 | # overload len operator
253 | return len(self.buffers[0])
254 |
255 | class DiscreteSoftmaxNN(nn.Module):
256 | """
257 | NN for the policy with a finite set of actions: the input is the state and the outputs are softmax probabilities for each action
258 | """
259 | def __init__(self, input_size, lr , n_actions = 8):
260 | super().__init__()
261 |
262 | # Create action dictionary action_arg -> 2D continuous action array (unit vectors spread around the circle)
263 | action_norm = 1
264 | action_list = []
265 | for n_action in range(n_actions):
266 | ax = np.cos(n_action/n_actions * 2*np.pi)*action_norm
267 | ay = np.sin(n_action/n_actions * 2*np.pi)*action_norm
268 | action_list.append(np.array([ax,ay]))
269 | self.action_list = np.array(action_list)
270 |
271 | # Structure z -> Ls[ReLu] -> hidden1[ReLu] -> out[softmax]
272 | Ls = 300
273 | hidden_1 = 300
274 | self.n_actions = n_actions
275 |
276 | # Ls, Create input layer with ReLU activation
277 | self.input_layer = nn.Linear(input_size, Ls)
278 | self.input_layer_activation = nn.ReLU()
279 | # Create hidden layer, Ls -> hidden_1
280 | self.hidden_layer1 = nn.Linear(Ls, hidden_1)
281 | self.hidden_layer1_activation = nn.ReLU()
282 | # Create output layer with softmax -> probabilities
283 | self.out_1 = nn.Linear(hidden_1,n_actions)
284 | self.out_1_activation = nn.Softmax(dim=0)
285 |
286 | # Use adam optimizer
287 | self.optimizer = optim.Adam(self.parameters(), lr = lr)
288 |
289 | def forward(self,z):
290 | # Compute first layer
291 | l1 = self.input_layer(z)
292 | l1 = self.input_layer_activation(l1)
293 |
294 | # Compute hidden layers
295 | l2 = self.hidden_layer1(l1)
296 | l2 = self.hidden_layer1_activation(l2)
297 |
298 | # Compute output layers
299 | out_1 = self.out_1(l2)
300 | out_1 = self.out_1_activation(out_1)
301 |
302 | return out_1
303 |
304 | def sample_action(self,z:np.ndarray, N=None):
305 | state_tensor = torch.tensor(z, dtype=torch.float32)
306 | probs = self.forward(state_tensor)
307 | chosen_action_arg = np.random.choice(self.n_actions, p = probs.squeeze().detach().numpy())
308 |
309 | return self.action_list[chosen_action_arg]
310 |
311 | def log_p_of_a(self,z:np.ndarray, a:np.ndarray):
312 | # Find argument corresponding to a array
313 | action_arg = np.where((self.action_list[:,0] == a[0]) & (self.action_list[:,1]==a[1]))[0][0]
314 |
315 | state_tensor = torch.tensor(z, dtype=torch.float32)
316 | probs = self.forward(state_tensor)
317 | log_prob = torch.log(probs.squeeze()[action_arg])
318 | return log_prob
319 |
--------------------------------------------------------------------------------
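To make the two helpers the training script leans on most a bit more concrete, here is a small self-contained usage sketch based only on the code in utils.py above; the dimensions, rewards and neighbor lists are illustrative values, not the project's actual configuration:

    import numpy as np
    from utils import ExperienceBuffers, DiscreteSoftmaxNN

    n_agents, dim_z = 2, 6

    # Discrete policy: maps a z state to softmax probabilities over
    # n_actions unit vectors spread evenly around the circle.
    policy = DiscreteSoftmaxNN(input_size=dim_z, lr=1e-3, n_actions=8)
    z = np.zeros(dim_z, dtype=np.float32)
    a = policy.sample_action(z)        # one of the 8 unit-norm 2D actions
    log_p = policy.log_p_of_a(z, a)    # differentiable log-probability of that action

    # One experience deque per agent, storing namedtuples
    # ('z_state', 'action', 'reward', 'next_z', 'Ni', 'finished').
    buffers = ExperienceBuffers(n_agents)
    buffers.append(z_states=[z, z], actions=[a, a], rewards=[-1.0, -1.0],
                   new_z=[z, z], Ni=[[0], [1]], finished=False)
    first = buffers.buffers[0][0]      # access pattern from the docstring
    print(first.action, first.reward, len(buffers), log_p.item())
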
/variables_with_delta_change_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_1
--------------------------------------------------------------------------------
/variables_with_delta_change_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_2
--------------------------------------------------------------------------------
/variables_with_delta_change_3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_3
--------------------------------------------------------------------------------
/variables_with_delta_change_4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreuMatoses/scalable-collision-avoidance-RL/2803c104ba14b3782ed3a3f9beb3287e45959c8e/variables_with_delta_change_4
--------------------------------------------------------------------------------