├── LICENSE ├── NAV_ACL_hpyerparameters.json ├── Networks.py ├── README.md ├── RobotTask.py ├── SAC_Trainers.py ├── SAC_hyperparameters.json ├── buffers.py ├── nav_acl.py ├── pretrain_CAE.py ├── process_utils.py └── sac_v2.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ai-lab-science 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAV_ACL_hpyerparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_task_unity" : { 3 | "robot_pose" : { 4 | "translation" :[-2.8,0.0,0.01], 5 | "rotation_yaw" : [0] 6 | }, 7 | "dolly_pose" : { 8 | "translation" : [2.5,0.15,0.01], 9 | "rotation_yaw" : [0] 10 | }, 11 | "obstacle_pose": { 12 | "translation" : [0.5,1.50,0.1], 13 | "rotation_yaw" : [0] 14 | }, 15 | "obstacle_1_pose": { 16 | "translation" : [0.5,-1.7,0.1], 17 | "rotation_yaw" : [0] 18 | }, 19 | "obstacle_2_pose": { 20 | "translation" : [2,-1.7,0.1], 21 | "rotation_yaw" : [0] 22 | }, 23 | "obstacle_3_pose": { 24 | "translation" : [2.5,2,0.1], 25 | "rotation_yaw" : [0] 26 | } 27 | }, 28 | "default_test_unity" : { 29 | "robot_pose" : { 30 | "translation" :[0.0,2.5,0.01], 31 | "rotation_yaw" : [0] 32 | }, 33 | "dolly_pose" : { 34 | "translation" : [2.0,0,0.01], 35 | "rotation_yaw" : [0] 36 | }, 37 | "obstacle_pose": { 38 | "translation" : [10,0.5,0.01], 39 | "rotation_yaw" : [0] 40 | }, 41 | "obstacle_1_pose": { 42 | "translation" : [10,4.1,0.01], 43 | "rotation_yaw" : [0] 44 | }, 45 | "obstacle_2_pose": { 46 | "translation" : [10,-1.7,0.1], 47 | "rotation_yaw" : [0] 48 | }, 49 | "obstacle_3_pose": { 50 | "translation" : [10,2,0.1], 51 | "rotation_yaw" : [0] 52 | } 53 | }, 54 | "crazy_test_unity" : { 55 | "robot_pose" : { 56 | "translation" :[2.0,109.0,0.01], 57 | "rotation_yaw" : [180] 58 | }, 59 | "dolly_pose" : { 60 | "translation" : [-7,111.0,0.01], 61 | "rotation_yaw" : [10] 62 | }, 63 | "obstacle_pose": { 64 | "translation" : [4.0,111.0,0.1], 65 | "rotation_yaw" : [0] 66 | }, 67 | "obstacle_1_pose": { 68 | "translation" : [8.0,111.0,0.1], 69 | "rotation_yaw" : [0] 70 | }, 71 | "obstacle_2_pose": { 72 | "translation" : [9.0,111.0,0.1], 73 | "rotation_yaw" : [0] 74 | }, 75 | "obstacle_3_pose": { 76 | "translation" : [2.0,111.5,0.1], 77 | "rotation_yaw" : [0] 78 | } 79 | }, 80 | "default_task_omniverse" : { 81 | "robot_pose" : { 82 
| "translation" :[1.1,0.0,0.01], 83 | "rotation_yaw" : [0] 84 | }, 85 | "dolly_pose" : { 86 | "translation" : [2.5,0.15,0.01], 87 | "rotation_yaw" : [0] 88 | }, 89 | "obstacle_pose": { 90 | "translation" : [0.5,1.50,0.1], 91 | "rotation_yaw" : [0] 92 | }, 93 | "obstacle_1_pose": { 94 | "translation" : [0.5,-1.7,0.1], 95 | "rotation_yaw" : [0] 96 | }, 97 | "obstacle_2_pose": { 98 | "translation" : [2,-1.7,0.1], 99 | "rotation_yaw" : [0] 100 | }, 101 | "obstacle_3_pose": { 102 | "translation" : [2.5,2,0.1], 103 | "rotation_yaw" : [0] 104 | } 105 | }, 106 | 107 | "randomization_params" : { 108 | "robot_randomization": {"translation_rnd_xyz":[1,1,0],"rotation_rnd":[180]}, 109 | "dolly_randomization": {"translation_rnd_xyz":[0.7,0.4,0],"rotation_rnd":[30]}, 110 | "num_obstacles" : 4, 111 | "dolly_pos_spline_angle": 30, 112 | "obstacle_pos_spline_mean" : 0.5, 113 | "min_dist_dolly_robot": 1.5, 114 | "max_dist_dolly_robot": 5.0, 115 | "min_dist_dolly_obs": 2.0, 116 | "max_dist_dolly_obs": 5.1, 117 | "min_dist_robot_obs": 2.0, 118 | "max_dist_robot_obs": 4.1, 119 | "del_obstacle_translation": [1000,1000,1000], 120 | "obstacle_randomization": {"translation_rnd_xyz":[0.8,0.8,0],"rotation_rnd":[12]} 121 | }, 122 | "runtimetasks_from_database": true , 123 | "update_nav_nets_every_N_episodes": 10, 124 | "nav_acl_max_AF_task_samples" : 100, 125 | "nav_acl_AF_boundaries_task_samples" : 100, 126 | "nav_acl_hidden_dim": 16, 127 | "nav_acl_batch_mode": true, 128 | "nav_acl_batch_size": 16, 129 | "nav_acl_lr": 0.004, 130 | "only_random_tasks": false, 131 | "normalize_tasks": true, 132 | "q_ricculum_learning" : true, 133 | "task_generation_method" : "AF", 134 | "create_new_AF_database" : false, 135 | "new_AF_database_size" : 6000, 136 | "use_AF_database" : false, 137 | "AF_database_path" : "/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/", 138 | "GOID_limits": {"lower_limit" : 0.4, 139 | "upper_limit" : 0.6}, 140 | 141 | "adaptive_filtering_params" : {"nav_beta": 1.0, 142 | "nav_gamma_low": 1.0, 143 | "nav_gamma_hi": 0.1, 144 | "p_random_max" : 0.15, 145 | "nav_P_omega": 0.97} 146 | } 147 | -------------------------------------------------------------------------------- /Networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import numpy as np 6 | import torch 7 | from initialize import * 8 | from ResnetNetworks import * 9 | from tcn_hx import * 10 | from torch.distributions import Categorical 11 | from tcn_ import * 12 | import copy 13 | 14 | from pretrain_CAE import ConvAutoencoder_NAV4 15 | 16 | 17 | class NavACLNetwork(nn.Module): 18 | def __init__(self, task_param_dim, hidden_dim, init_w=5e-4): 19 | super(NavACLNetwork, self).__init__() 20 | self.layer_1 = nn.Linear(task_param_dim, hidden_dim) 21 | self.layer_2 = nn.Linear(hidden_dim, hidden_dim) 22 | self.layer_3 = nn.Linear(hidden_dim, hidden_dim) 23 | self.layer_out = nn.Linear(hidden_dim, 1) 24 | 25 | nn.init.kaiming_uniform_(self.layer_1.weight, mode='fan_in', nonlinearity='relu') 26 | nn.init.kaiming_uniform_(self.layer_2.weight, mode='fan_in', nonlinearity='relu') 27 | nn.init.kaiming_uniform_(self.layer_3.weight, mode='fan_in', nonlinearity='relu') 28 | 29 | self.m1 = torch.nn.LeakyReLU(0.1) #torch.nn.PReLU(num_parameters=1, init=0.25) 30 | self.m2 = torch.nn.LeakyReLU(0.1) #torch.nn.PReLU(num_parameters=1, init=0.25) 31 | self.m3 = 
torch.nn.LeakyReLU(0.1) #torch.nn.PReLU(num_parameters=1, init=0.25) 32 | 33 | def forward(self, inputs): 34 | x = self.m1(self.layer_1(inputs)) 35 | x = self.m2(self.layer_2(x)) 36 | x = self.m3(self.layer_3(x)) 37 | x = torch.sigmoid(self.layer_out(x)) 38 | 39 | # x = x.clamp(0,1) 40 | 41 | return (x) 42 | 43 | def get_task_success_prob(self, task_param_array): 44 | torch_task_param_array = torch.FloatTensor(task_param_array).to(device) 45 | 46 | difficulty_estimate = self.forward(torch_task_param_array) 47 | 48 | return difficulty_estimate.detach().cpu().numpy() 49 | 50 | class ValueNetwork(nn.Module): 51 | def __init__(self, state_dim, hidden_dim, init_w=3e-3): 52 | super(ValueNetwork, self).__init__() 53 | 54 | self.linear1 = nn.Linear(state_dim, hidden_dim) 55 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 56 | self.linear3 = nn.Linear(hidden_dim, hidden_dim) 57 | self.linear4 = nn.Linear(hidden_dim, 1) 58 | # weights initialization 59 | self.linear4.weight.data.uniform_(-init_w, init_w) 60 | self.linear4.bias.data.uniform_(-init_w, init_w) 61 | 62 | def forward(self, state): 63 | x = F.relu(self.linear1(state)) 64 | x = F.relu(self.linear2(x)) 65 | x = F.relu(self.linear3(x)) 66 | x = self.linear4(x) 67 | return x 68 | 69 | class SoftQNetwork(nn.Module): 70 | def __init__(self, config,): 71 | super(SoftQNetwork, self).__init__() 72 | self.config = config 73 | resolutions = config['output_resolution'] 74 | self.num_stacked_frames = config['num_stacked_frames'] 75 | 76 | num_lidar_beams = config['num_lidar_beams'] 77 | hidden_dim = config['hidden_dim'] 78 | num_actions = config['num_actions'] 79 | self._num_actions = num_actions 80 | init_w = 3e-3 81 | 82 | shapes = (self.num_stacked_frames*resolutions[0],resolutions[1],resolutions[2],num_lidar_beams) 83 | self.inputs_shape = (*shapes,) 84 | 85 | self.kernel_size = 2 86 | self.channel_s = [256,128,64] # To think : 87 | self.channel_a = [16, 16, 16] 88 | self.channel_r = [16, 16, 16] 89 | 90 | 91 | if (config['use_pretrained_vae']): 92 | self.config['freeze_convolution'] = True 93 | ae_model = ConvAutoencoder_NAV4(imgChannels=self.num_stacked_frames*resolutions[0]) 94 | ae_model.load_state_dict(torch.load(config['pretrained_vae_path'])) 95 | self.features=copy.deepcopy(ae_model.encode) 96 | del ae_model 97 | else: 98 | self.features = nn.Sequential( 99 | ResNetBlock(self.num_stacked_frames*resolutions[0],64,3), 100 | ResNetBlock(64,128,3), 101 | ResNetBlock(128,256,3), 102 | ResNetBlock(256,128,3), 103 | ) 104 | 105 | if(self.config['freeze_convolution']): 106 | # print("using fixed convolution layers for Soft Q and Policy Network !") 107 | for param in self.features.parameters(): 108 | param.requires_grad = False 109 | 110 | self.fc = nn.Sequential( 111 | nn.Linear(self.features_size(), 512), 112 | nn.LeakyReLU(), 113 | nn.Linear(512, 256) # -> 64 114 | ) 115 | self.linear_Lidar_1 = nn.Linear(self.inputs_shape[3],hidden_dim) #256 -> 64 116 | self.linear_Lidar_2 = nn.Linear(hidden_dim,32) # 64-> 32 117 | 118 | self.linear_sar = nn.Linear(12,8) 119 | self.linear3 = nn.Linear(256 + 8 +32, hidden_dim) 120 | self.linear4 = nn.Linear(hidden_dim, 1) 121 | self.linear4.weight.data.uniform_(-init_w, init_w) 122 | self.linear4.bias.data.uniform_(-init_w, init_w) 123 | 124 | 125 | 126 | 127 | 128 | 129 | def forward(self, state_cam, state_lidar, prev_action, prev_reward): 130 | 131 | state_cam = state_cam/ 2**8 132 | l = self.linear_Lidar_1(state_lidar) 133 | l = self.linear_Lidar_2(l) 134 | # print(prev_reward.shape) 135 | # 
print(prev_action.shape) 136 | sar = torch.cat([prev_action.view(prev_action.size(0),-1), prev_reward.view(prev_reward.size(0),-1)], 1) 137 | # print(prev_action.view(prev_action.size(0),-1).shape) 138 | # print(prev_reward.view(prev_reward.size(0),-1).shape) 139 | # print(sar.shape) 140 | # print(sar.shape) 141 | sar = F.leaky_relu(self.linear_sar(sar)) 142 | 143 | x = self.features(state_cam) 144 | x = x.reshape(x.size(0), -1) 145 | x = self.fc(x) 146 | # print("a, pa, rew: ", action.shape, prev_action.shape, reward.shape) 147 | # print("shapes x, a, pa, r, l: ", x.shape, prev_action.shape, prev_action.shape, prev_reward.shape, l.shape) 148 | 149 | x = torch.cat([x, sar], 1) # the dim 0 is number of samples 150 | # x = F.relu(self.linear1(x)) 151 | # x = F.relu(self.linear2(x)) 152 | #concatenate with preprocessed lidar state 153 | x = torch.cat([x,l],1) 154 | x = F.relu(self.linear3(x)) 155 | x = self.linear4(x) 156 | return x 157 | 158 | def features_size(self): 159 | return self.features(torch.zeros(1, *self.inputs_shape[0:3])).view(1, -1).size(1) 160 | 161 | 162 | class PolicyNetwork(nn.Module): 163 | def __init__(self, config): 164 | super(PolicyNetwork, self).__init__() 165 | self.log_std_min = -20 166 | self.log_std_max = 2 167 | self.num_actions = config['num_actions'] 168 | self.action_range = config['action_range'] 169 | self.config = config 170 | 171 | self.version = 0 172 | init_w=3e-3 173 | self.num_stacked_frames = config['num_stacked_frames'] 174 | resolutions = config['output_resolution'] 175 | shapes = (self.num_stacked_frames*resolutions[0],resolutions[1],resolutions[2]) 176 | num_lidar_beams = config['num_lidar_beams'] 177 | hidden_dim = config['hidden_dim'] 178 | self.inputs_shape = (*shapes, num_lidar_beams ) 179 | 180 | self.kernel_size = 2 181 | self.channel_s = [256,128,64] # To think : 182 | self.channel_a = [16, 16 ,16] 183 | self.channel_r = [16, 16 ,16] 184 | 185 | 186 | if (config['use_resnet_archi']): 187 | self.features = nn.Sequential( 188 | ResNetBlock(self.inputs_shape[0],16,3),#40*40 189 | ResNetBlock(16,32,3), # 18*18 190 | ResNetBlock(32,64,3), 191 | Flatten(), 192 | ).apply(initialize_weights_he) #7*7 193 | if (config['use_pretrained_vae']): 194 | self.config['freeze_convolution'] = True 195 | ae_model = ConvAutoencoder_NAV4(imgChannels=self.num_stacked_frames*resolutions[0]) 196 | ae_model.load_state_dict(torch.load(config['pretrained_vae_path'])) 197 | self.features=ae_model.encode 198 | del ae_model 199 | else: 200 | # RGB*4 x 80 x 80 201 | self.features = nn.Sequential( 202 | ResNetBlock(self.num_stacked_frames*resolutions[0],64,3), 203 | ResNetBlock(64,128,3), 204 | ResNetBlock(128,256,3), 205 | ResNetBlock(256,128,3), 206 | ) 207 | 208 | if (self.config['freeze_convolution']): 209 | # print("using fixed convolution layers for Policy Network !") 210 | for param in self.features.parameters(): 211 | param.requires_grad = False 212 | 213 | self.fc = nn.Sequential( 214 | nn.Linear(self.features_size(), 512), 215 | nn.LeakyReLU(), 216 | nn.Linear(512, 256) 217 | ) 218 | self.linear_Lidar_1 = nn.Linear(self.inputs_shape[3],hidden_dim) 219 | self.linear_Lidar_2 = nn.Linear(hidden_dim,32) 220 | # L (4xSRA, 8) (8,8) 221 | # L (256+32+x, 2) 222 | # self.linear1 = nn.Linear(256 + self.num_actions +1, hidden_dim) 223 | # self.linear2 = nn.Linear(hidden_dim, hidden_dim) 224 | self.linear_sar = nn.Linear(12,8) 225 | self.linear3 = nn.Linear(256 + 8 +32, hidden_dim) 226 | # self.linear4 = nn.Linear(hidden_dim, hidden_dim) 227 | 228 | 229 | self.mean_linear = 
nn.Linear(hidden_dim, self.num_actions) 230 | self.mean_linear.weight.data.uniform_(-init_w, init_w) 231 | self.mean_linear.bias.data.uniform_(-init_w, init_w) 232 | 233 | self.log_std_linear = nn.Linear(hidden_dim, self.num_actions) 234 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 235 | self.log_std_linear.bias.data.uniform_(-init_w, init_w) 236 | 237 | 238 | 239 | def forward(self, state_cam, state_lidar,prev_action, prev_reward): 240 | state_cam = state_cam / 2**8 241 | state_lidar = state_lidar 242 | l = self.linear_Lidar_1(state_lidar) 243 | l = self.linear_Lidar_2(l) 244 | 245 | sar = torch.cat([prev_action.view(prev_action.size(0),-1), prev_reward.view(prev_reward.size(0),-1)], 1) 246 | # print(prev_action.view(prev_action.size(0),-1).shape) 247 | # print(prev_reward.view(prev_reward.size(0),-1).shape) 248 | # print(sar.shape) 249 | sar = F.leaky_relu(self.linear_sar(sar)) 250 | 251 | x = self.features(state_cam) 252 | x = x.reshape(x.size(0), -1) 253 | x = self.fc(x) 254 | 255 | # print("shapes x, a, pa, r: ", x.shape, prev_action.shape, reward.shape) 256 | x = torch.cat([x, sar], 1) # the dim 0 is number of samples 257 | # x = F.relu(self.linear1(x)) 258 | # x = F.relu(self.linear2(x)) 259 | 260 | # concatenate with preprocessed lidar state 261 | x = torch.cat([x,l],1) 262 | x = F.relu(self.linear3(x)) 263 | # x = F.relu(self.linear4(x)) 264 | 265 | mean = (self.mean_linear(x)) 266 | # mean = F.leaky_relu(self.mean_linear(x)) 267 | log_std = self.log_std_linear(x) 268 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 269 | 270 | return mean, log_std 271 | 272 | def evaluate(self, state_cam, state_lidar,prev_action, prev_reward, device, epsilon=1e-6): 273 | ''' 274 | generate sampled action with state as input wrt the policy network; 275 | ''' 276 | mean, log_std = self.forward(state_cam, state_lidar,prev_action, prev_reward) 277 | std = log_std.exp() # no clip in evaluation, clip affects gradients flow 278 | 279 | normal = Normal(0, 1) 280 | z = normal.sample(mean.shape) 281 | action_0 = torch.tanh(mean + std*z.to(device)) # TanhNormal distribution as actions; reparameterization trick 282 | action = self.action_range*action_0 283 | # The log-likelihood here is for the TanhNorm distribution instead of only Gaussian distribution. \ 284 | # The TanhNorm forces the Gaussian with infinite action range to be finite. \ 285 | # For the three terms in this log-likelihood estimation: \ 286 | # (1). the first term is the log probability of action as in common \ 287 | # stochastic Gaussian action policy (without Tanh); \ 288 | # (2). the second term is the caused by the Tanh(), \ 289 | # as shown in appendix C. Enforcing Action Bounds of https://arxiv.org/pdf/1801.01290.pdf, \ 290 | # the epsilon is for preventing the negative cases in log; \ 291 | # (3). the third term is caused by the action range I used in this code is not (-1, 1) but with \ 292 | # an arbitrary action range, which is slightly different from original paper. 293 | log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range) 294 | # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); 295 | # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, 296 | # needs sum up across the features dim to get 1 dim prob; or else use Multivariate Normal. 
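# Written out, the per-dimension term computed above is
#     log pi(a|s) = log N(u; mean, std) - log(1 - tanh(u)^2 + epsilon) - log(action_range),  with u = mean + std*z,
# and the sum over the action dimensions below reduces it to a single log-probability per sample.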
297 | log_prob = log_prob.sum(dim=1, keepdim=True) 298 | return action, log_prob, z, mean, log_std 299 | 300 | 301 | def get_action(self, state_cam, state_lidar,prev_action, prev_reward, deterministic, device): 302 | 303 | state_cam = torch.FloatTensor(state_cam).unsqueeze(0).to(device) 304 | state_lidar = torch.FloatTensor(state_lidar).unsqueeze(0).to(device) 305 | prev_action = torch.FloatTensor(prev_action).unsqueeze(0).to(device) # needs to fit state dimensions so we add 1 dim 306 | prev_reward = torch.FloatTensor(prev_reward).unsqueeze(0).to(device) # needs to fit state dimensions so we add 1 dim 307 | 308 | 309 | mean, log_std = self.forward(state_cam, state_lidar,prev_action, prev_reward) 310 | std = log_std.exp() 311 | 312 | 313 | normal = Normal(0, 1) 314 | z = normal.sample(mean.shape).to(device) 315 | action = self.action_range* torch.tanh(mean + std*z) 316 | 317 | 318 | action = self.action_range* torch.tanh(mean).detach().cpu().numpy()[0] if deterministic else action.detach().cpu().numpy()[0] 319 | # print(action) 320 | return action 321 | 322 | 323 | def sample_action(self,): 324 | a=torch.FloatTensor(self.num_actions).uniform_(-1, 1) 325 | return self.action_range*a.numpy() 326 | def features_size(self): 327 | return self.features(torch.zeros(1, *self.inputs_shape[0:3])).view(1, -1).size(1) 328 | 329 | 330 | # -*- coding: utf-8 -*- 331 | """ 332 | Created on Tue May 25 17:41:51 2021 333 | 334 | @author: Honghu Xue 335 | """ 336 | import numpy as np 337 | import torch 338 | from torch import nn 339 | import torch.nn.functional as F 340 | from utils import initialize_weights_he 341 | from utils import initialize_weights_xavier 342 | 343 | class Flatten(nn.Module): 344 | def forward(self, x): 345 | return x.view(x.size(0), -1) 346 | 347 | 348 | class ResNetBlock(nn.Module): 349 | '''https://www.reddit.com/r/MachineLearning/comments/671455/d_batch_normalization_in_reinforcement_learning/ 350 | Batch normalziation causes instability in training in DQN 351 | Weight normalziation can increase a little bit on the performance in DQN. https://proceedings.neurips.cc/paper/2016/file/ed265bc903a5a097f61d3ec064d96d2e-Paper.pdf 352 | ''' 353 | 354 | 355 | def __init__(self, in_channels, out_channels, kernel_size): 356 | super(ResNetBlock, self).__init__() 357 | # !!! Don't fully understand what is padding = same in the slide!!! 
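# Note on 'same' padding: for the 3x3 convolutions used below, stride=1 together with padding=1
# keeps the spatial resolution unchanged (which is what 'same' padding means for odd kernel sizes);
# only self.down_dimension (kernel 2, stride 2) halves the feature-map size at the end of the block.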
358 | self.residual_pass = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 359 | stride = 1, padding = 1, padding_mode='replicate'), 360 | nn.ReLU(), 361 | nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, 362 | stride = 1, padding = 1, padding_mode='replicate'), 363 | ) 364 | self.identity_mapping = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, 365 | stride = 1), # no padding here 366 | ) 367 | # The output size after passing through identity_mapping is W_2 = (signal_length - kernel_size + 2*Padding)/Stride + 1 368 | self.down_dimension = nn.Sequential(nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=2, 369 | stride = 2, padding = 0), 370 | ) 371 | self.RelU = nn.ReLU() 372 | 373 | def forward(self, x): 374 | #TODO 375 | print("orig: ", x.shape) 376 | residual = self.residual_pass(x) 377 | print("residual: ", residual.shape) 378 | x = self.identity_mapping(x) 379 | print("identity: ", x.shape) 380 | x += residual 381 | print("union: ", x.shape) 382 | x = self.RelU(x) 383 | x = self.down_dimension(x) 384 | print("downdim: ", x.shape) 385 | 386 | return x -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-Reinforcement-Learning-for-mapless-navigation-in-intralogistics 2 | This repository is for the paper "Using Deep Reinforcement Learning with Automatic Curriculum Learning for Mapless Navigation in Intralogistics". 3 | 4 | The code includes: 5 | * Soft Actor Critic Algorithm, adapted from [1] 6 | * Our NavACL_Q algorithm 7 | * Hyper-parameter Settings (see the usage sketch below) 8 | 9 | ## Illustration of our results 10 | Here we show some videos of our trained agent successfully reaching the goal state, i.e., the blue dolly. 11 | 12 | 13 | [![watch the video](https://user-images.githubusercontent.com/27278454/151380252-510bd73c-03b7-4790-be10-562155494a9c.jpg)](https://user-images.githubusercontent.com/27278454/151382348-c3d8a984-01eb-4e69-9093-114d611eddb5.mp4) 14 | 15 | [![watch the video](https://user-images.githubusercontent.com/27278454/151383580-3d2fb9d0-3b29-47f8-a472-0fba2513fe06.jpg)](https://user-images.githubusercontent.com/27278454/151382581-3993931e-7ba4-4738-88e3-7d312f4e5ff8.mp4) 16 | 17 | 18 | [![watch the video](https://user-images.githubusercontent.com/27278454/151383536-ae62d5ac-576b-4d3b-8815-7e43996d39e5.jpg)](https://user-images.githubusercontent.com/27278454/151382612-467f42c3-31cd-4366-ba11-92fc8ee24340.mp4) 19 | 20 | 21 | [![watch the video](https://user-images.githubusercontent.com/27278454/151383516-2b0dc284-e545-4063-b6c8-06b607251bde.jpg)](https://user-images.githubusercontent.com/27278454/151382630-68874454-36d6-4e61-82b7-20ec3d209ae1.mp4) 22 | 23 | 24 | Even with a moving target, the trained agent reacts responsively and navigates to the goal. 25 | ![Media3](https://user-images.githubusercontent.com/27278454/151374369-5899a3bd-338f-4450-9f7d-f4bb5071c400.gif)
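
## Minimal usage sketch

A minimal sketch (not part of the repository) of how the hyper-parameter file and `SAC_Trainer` are assumed to fit together; the `output_resolution` entry and the `./results/` path are illustrative placeholders.

```python
import json

from SAC_Trainers import SAC_Trainer

# Load the SAC hyper-parameters shipped with the repository.
with open("SAC_hyperparameters.json") as f:
    config = json.load(f)

# The trainer also expects the camera resolution used by the simulator;
# the value here (channels, height, width per frame) is an assumption for illustration.
config["output_resolution"] = [3, 80, 80]

# Builds the policy/Q networks and the (prioritized) replay buffer.
trainer = SAC_Trainer(save_location="./results/", config=config)

# Training then alternates between collecting transitions with
# trainer.policy_net.get_action(...) into trainer.replay_buffer and calling
# trainer.update(batch_size=config["batch_size"]) once enough data has been gathered.
```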
26 | 27 | 28 | 29 | ## Reference 30 | [1] https://github.com/quantumiracle/Popular-RL-Algorithms 31 | -------------------------------------------------------------------------------- /RobotTask.py: -------------------------------------------------------------------------------- 1 | from utils import rot_vec 2 | from numpy import savetxt 3 | import numpy as np 4 | from scipy.spatial import distance 5 | from scipy.spatial.transform import Rotation as R 6 | import random 7 | from enum import Enum 8 | 9 | class Tasktype(Enum): 10 | RANDOM = 0 11 | EASY = 1 12 | FRONTIER = 2 13 | 14 | class RobotTask(): 15 | def __init__(self,default_task,): 16 | super(RobotTask, self).__init__() 17 | 18 | self.robot_translation = np.array(default_task['robot_pose']['translation']).astype(float) 19 | self.robot_rotation = np.array(default_task['robot_pose']['rotation_yaw']).astype(float) 20 | 21 | self.dolly_translation = np.array(default_task['dolly_pose']['translation']).astype(float) 22 | self.dolly_rotation = np.array(default_task['dolly_pose']['rotation_yaw']).astype(float) 23 | 24 | self.obstacle_translation = np.array(default_task['obstacle_pose']['translation']).astype(float) 25 | self.obstacle_rotation = np.array(default_task['obstacle_pose']['rotation_yaw']).astype(float) 26 | 27 | self.obstacle_1_translation = np.array(default_task['obstacle_1_pose']['translation']).astype(float) 28 | self.obstacle_1_rotation = np.array(default_task['obstacle_1_pose']['rotation_yaw']).astype(float) 29 | 30 | self.obstacle_2_translation = np.array(default_task['obstacle_2_pose']['translation']).astype(float) 31 | self.obstacle_2_rotation = np.array(default_task['obstacle_2_pose']['rotation_yaw']).astype(float) 32 | 33 | self.obstacle_3_translation = np.array(default_task['obstacle_3_pose']['translation']).astype(float) 34 | self.obstacle_3_rotation = np.array(default_task['obstacle_3_pose']['rotation_yaw']).astype(float) 35 | 36 | self.q_value = 0 37 | self.task_type = Tasktype.RANDOM 38 | 39 | 40 | def randomize_task(self, randomization_params): 41 | """ 42 | randomizes the current task settings according to the randomization parameters (as a json) 43 | """ 44 | num_obstacles = random.randint(0,randomization_params['num_obstacles']) 45 | del_obstacle_translation = np.asarray(randomization_params['del_obstacle_translation']) 46 | self.robot_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['robot_randomization']['rotation_rnd']) ) 47 | self.dolly_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['dolly_randomization']['rotation_rnd']) ) 48 | self.obstacle_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 49 | self.obstacle_1_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 50 | self.obstacle_2_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 51 | self.obstacle_3_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 52 | 53 | 54 | 55 | self.robot_translation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['robot_randomization']['translation_rnd_xyz']) ) 56 | #self.dolly_translation += ( (-0.5 +
np.random.uniform(0,1)) *np.array(randomization_params['dolly_randomization']['translation_rnd_xyz']) ) 57 | 58 | min_dist_d_r = randomization_params['min_dist_dolly_robot'] 59 | max_dist_d_r = randomization_params['max_dist_dolly_robot'] 60 | dolly_pos_spline = randomization_params['dolly_pos_spline_angle'] 61 | 62 | dolly_distance = min_dist_d_r + np.random.uniform(0,1)*(max_dist_d_r-min_dist_d_r) #create a distance vector with min, max distance according to the config file 63 | dolly_vector = np.array([1,0])*dolly_distance #create the final dolly position as the robot position+distance vector 64 | dolly_pos_rot = ( (-0.5 + np.random.uniform(0,1))) * dolly_pos_spline #rotate it around the origin 65 | dolly_vector = self.robot_translation + np.hstack((rot_vec(dolly_vector,dolly_pos_rot),np.zeros(1))) 66 | 67 | self.dolly_translation = dolly_vector 68 | 69 | 70 | 71 | min_dist_r_o = randomization_params['min_dist_robot_obs'] 72 | max_dist_r_o = randomization_params['max_dist_robot_obs'] 73 | 74 | min_dist_d_o = randomization_params['min_dist_dolly_obs'] 75 | max_dist_d_o = randomization_params['max_dist_dolly_obs'] 76 | obs_mean = randomization_params['obstacle_pos_spline_mean'] 77 | obstacle_vectors = [] 78 | 79 | for i in range(2): 80 | obstacle_distance = min_dist_r_o + np.random.uniform(0,1)*(max_dist_r_o-min_dist_r_o) #create a distance vector with min, max distance according to the config file 81 | obstacle_pos_rot = 90 + np.random.uniform(-obs_mean/2,obs_mean/2)*90+(i*180) #rotate it around the origin 82 | obstacle_vector = np.array([1,0])*obstacle_distance #create the final object position as the robot position+distance vector 83 | 84 | 85 | obstacle_vector = self.robot_translation + np.hstack((rot_vec(obstacle_vector,obstacle_pos_rot),np.zeros(1))) 86 | obstacle_vectors.append(obstacle_vector) 87 | 88 | for i in range(2): 89 | obstacle_distance = min_dist_d_o + np.random.uniform(0,1)*(max_dist_d_o-min_dist_d_o) #create a distance vector with min, max distance according to the config file 90 | obstacle_pos_rot = 90 + np.random.uniform(-obs_mean/2,obs_mean/2)*90+(i*180) #rotate it around the origin 91 | obstacle_vector = np.array([1,0])*obstacle_distance #create the final object position as the dolly position+distance vector 92 | 93 | 94 | obstacle_vector = self.dolly_translation + np.hstack((rot_vec(obstacle_vector,obstacle_pos_rot),np.zeros(1))) 95 | obstacle_vectors.append(obstacle_vector) 96 | 97 | for remove_obj_idx in range(4-1, num_obstacles-1, -1): 98 | obstacle_vectors[remove_obj_idx] = del_obstacle_translation 99 | 100 | # now randomly assign the created obstacle vectors to the target translations 101 | rnd_obstcl_indexes = np.random.permutation(np.array([0,1,2,3])) 102 | self.obstacle_translation =obstacle_vectors[rnd_obstcl_indexes[0]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 103 | self.obstacle_1_translation =obstacle_vectors[rnd_obstcl_indexes[1]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 104 | self.obstacle_2_translation =obstacle_vectors[rnd_obstcl_indexes[2]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 105 | self.obstacle_3_translation =obstacle_vectors[rnd_obstcl_indexes[3]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 106 | 107 | 108 | def set_q_value(self, 
q_value): 109 | self.q_value = q_value 110 | 111 | def get_obstacle_translations_array(self): 112 | return np.array([self.obstacle_translation, self.obstacle_1_translation, self.obstacle_2_translation, self.obstacle_3_translation]) 113 | 114 | def get_task_array(self): 115 | """ 116 | returns the task as a numpy array in the form: [[robo_trans_x,robo_trans_y,robo_trans_z,robo_yaw_angle], 117 | [dolly_trans_x,dolly_trans_y,dolly_trans_z,dolly_yaw_angle], 118 | [obstacle_trans_x,obstacletrans_y,obstacletrans_z,obobstacleyaw_angle] 119 | [obstacle_1_trans_x,obstacle_1_trans_y,obstacle_1_trans_z,obobstacle_1_yaw_angle] 120 | [obstacle_2_trans_x,obstacle_2_trans_y,obstacle_2_trans_z,obobstacle_2_yaw_angle] 121 | [obstacle_3_trans_x,obstacle_3_trans_y,obstacle_3_trans_z,obobstacle_3_yaw_angle]] 122 | """ 123 | robot_part = np.hstack((self.robot_translation, self.robot_rotation)) 124 | dolly_part = np.hstack((self.dolly_translation, self.dolly_rotation)) 125 | obstacle_part = np.hstack((self.obstacle_translation, self.obstacle_rotation)) 126 | obstacle_1_part = np.hstack((self.obstacle_1_translation, self.obstacle_1_rotation)) 127 | obstacle_2_part = np.hstack((self.obstacle_2_translation, self.obstacle_2_rotation)) 128 | obstacle_3_part = np.hstack((self.obstacle_3_translation, self.obstacle_3_rotation)) 129 | 130 | task_array = np.array([robot_part,dolly_part,obstacle_part,obstacle_1_part,obstacle_2_part,obstacle_3_part]) 131 | return task_array 132 | 133 | def from_task_array(self,task_array): 134 | self.robot_translation = task_array[0,0:3] 135 | self.robot_rotation = np.array([task_array[0][3]]) 136 | self.dolly_translation = task_array[1,0:3] 137 | self.dolly_rotation = np.array([task_array[1][3]]) 138 | self.obstacle_translation = task_array[2,0:3] 139 | self.obstacle_rotation = np.array([task_array[2][3]]) 140 | self.obstacle_1_translation = task_array[3,0:3] 141 | self.obstacle_1_rotation = np.array([task_array[3][3]]) 142 | self.obstacle_2_translation = task_array[4,0:3] 143 | self.obstacle_2_rotation = np.array([task_array[4][3]]) 144 | self.obstacle_3_translation = task_array[5,0:3] 145 | self.obstacle_3_rotation = np.array([task_array[5][3]]) 146 | self.q_value = 0 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /SAC_Trainers.py: -------------------------------------------------------------------------------- 1 | from hashlib import new 2 | from torch import ne 3 | from Networks import * 4 | from per_buffer import * 5 | from utils import * 6 | from ResnetNetworks import * 7 | 8 | 9 | 10 | class SAC_Trainer(): 11 | def __init__(self,save_location, config): 12 | self.save_config = config 13 | self.load_config(self.save_config) 14 | save_location = save_location 15 | self.q1_losses = [] 16 | self.q2_losses = [] 17 | self.policy_losses = [] 18 | self.alpha_losses = [] 19 | self.alpha_list = [] 20 | self.update_step = 0 21 | self.steps = 0 22 | action_range = 1. 
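# The observation is split into a frame-stacked camera image and a flat lidar scan:
# state_dim = (num_stacked_frames * C, H, W, num_lidar_beams), i.e. the camera frames are stacked
# along the channel axis and the last entry is the length of the 1-D lidar vector.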
23 | 24 | 25 | input_shape = (self.input_shape[0],self.input_shape[1],self.input_shape[2]) 26 | action_dim = 2 27 | state_dim = (self.num_stacked_frames*input_shape[0],input_shape[1],input_shape[2],self.num_lidar_beams) 28 | 29 | self.soft_q_net1 = SoftQNetwork(config).to(train_device) 30 | self.soft_q_net2 = SoftQNetwork(config).to(train_device) 31 | self.target_soft_q_net1 = SoftQNetwork(config).to(train_device) 32 | self.target_soft_q_net2 = SoftQNetwork(config).to(train_device) 33 | self.policy_net = PolicyNetwork(config).to(train_device) 34 | self.log_alpha = torch.ones(1, device=train_device) * np.log(self.alpha) 35 | self.log_alpha.requires_grad= True 36 | 37 | 38 | 39 | if(self.use_exp_prio_buffer): 40 | self.replay_buffer = PrioritizedReplayBuffer(self.buffer_maxlen,config,mem_efficient_mode=True,state_dim=state_dim,action_dim=2,num_stacked_frames=self.num_stacked_frames,alpha=config['prioritized_replay_alpha']) 41 | self.soft_q_criterion1 = torch.nn.SmoothL1Loss(reduction = 'none') 42 | self.soft_q_criterion2 = torch.nn.SmoothL1Loss(reduction = 'none') 43 | self.beta_schedule = LinearSchedule(config['buffer_maxlen'], 44 | initial_p=config['prioritized_replay_beta0'], 45 | final_p=config['prioritized_replay_beta1']) 46 | 47 | else: 48 | self.replay_buffer = ReplayBuffer(self.buffer_maxlen,config,mem_efficient_mode=True,state_dim=state_dim,action_dim=2,num_stacked_frames=self.num_stacked_frames) #(self.buffer_maxlen,self.num_stacked_frames,state_dim) # 49 | self.soft_q_criterion1 = torch.nn.SmoothL1Loss(reduction = 'none') 50 | self.soft_q_criterion2 = torch.nn.SmoothL1Loss(reduction = 'none') 51 | 52 | 53 | 54 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 55 | target_param.data.copy_(param.data) 56 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 57 | target_param.data.copy_(param.data) 58 | 59 | 60 | # for freezing feature extractor, dont put in to optimizer 61 | self.soft_q_optimizer1 = optim.Adam(filter(lambda p: p.requires_grad, self.soft_q_net1.parameters()), lr=self.q_lr) 62 | self.soft_q_optimizer2 = optim.Adam(filter(lambda p: p.requires_grad, self.soft_q_net2.parameters()), lr=self.q_lr) 63 | self.policy_optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.policy_net.parameters()), lr=self.p_lr) 64 | 65 | self.alpha_optimizer = optim.Adam([self.log_alpha], lr=self.a_lr) 66 | 67 | 68 | def all_elements_to(self, target_device): 69 | self.soft_q_net1 = self.soft_q_net1.to(target_device) 70 | self.soft_q_net2 = self.soft_q_net2.to(target_device) 71 | self.target_soft_q_net1 = self.target_soft_q_net1.to(target_device) 72 | self.target_soft_q_net2 = self.target_soft_q_net2.to(target_device) 73 | self.policy_net = self.policy_net.to(target_device) 74 | self.log_alpha = self.log_alpha.to(target_device) 75 | 76 | def load_config(self,config): 77 | self.tau = config['tau'] 78 | self.alpha = config['alpha'] 79 | self.q_lr = config['q_lr'] 80 | self.p_lr = config['p_lr'] 81 | self.a_lr = config['a_lr'] 82 | self.buffer_maxlen = config['buffer_maxlen'] 83 | self.num_stacked_frames = config['num_stacked_frames'] 84 | self.hidden_dim = config['hidden_dim'] 85 | self.freeze_convolution = config['freeze_convolution'] 86 | self.use_grad_clip = config['use_grad_clip'] 87 | self.max_grad = config['max_grad'] 88 | self.use_exp_prio_buffer = config['use_exp_prio_buffer'] 89 | self.use_hard_update = config['use_hard_update'] 90 | self.use_pretrained_resnet18 = 
config['use_pretrained_resnet18'] 91 | self.num_lidar_beams = config['num_lidar_beams'] 92 | self.input_shape = config['output_resolution'] 93 | self.hard_update_n_step = config['hard_update_n_step'] 94 | 95 | def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-1, gamma=0.99,soft_tau=1e-3, use_reward_scaling=False): 96 | start = current_milli_time() 97 | if self.use_exp_prio_buffer == False: 98 | c_obs,l_obs, action, prev_action, reward, prev_reward, c_next_osb,l_next_obs, done = self.replay_buffer.sample(batch_size) 99 | else: 100 | c_obs,l_obs, action, prev_action, reward, prev_reward, c_next_osb,l_next_obs, done, weights, idxs = self.replay_buffer.sample(batch_size,self.beta_schedule.value(self.update_step)) 101 | weights = torch.FloatTensor(weights).to(train_device) 102 | 103 | # print("sample time: ", current_milli_time()- start) 104 | 105 | self.update_step +=1 106 | cam_tensor = torch.FloatTensor(c_obs).to(train_device) 107 | lidar_tensor = torch.FloatTensor(l_obs).to(train_device) 108 | next_cam_tensor = torch.FloatTensor(c_next_osb).to(train_device) 109 | next_lidar_tensor = torch.FloatTensor(l_next_obs).to(train_device) 110 | action = torch.FloatTensor(action).to(train_device) 111 | prev_action = torch.FloatTensor(prev_action).to(train_device) 112 | reward = torch.FloatTensor(reward).to(train_device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; - no now it is a stack of 4 so we dont need unsqzeeze anymore! 113 | prev_reward = torch.FloatTensor(prev_reward).to(train_device) # prev_reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; - no now it is a stack of 4 so we dont need unsqzeeze anymore! 114 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(train_device) 115 | 116 | predicted_q_value1 = self.soft_q_net1(cam_tensor,lidar_tensor, prev_action, prev_reward) 117 | predicted_q_value2 = self.soft_q_net2(cam_tensor,lidar_tensor, prev_action, prev_reward) 118 | 119 | with torch.no_grad(): 120 | new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_cam_tensor,next_lidar_tensor,action, reward, device=train_device) 121 | new_next_stacked_action = action.view(action.size(0),-1)[:,2:] 122 | new_next_stacked_action = torch.cat((new_next_stacked_action,new_next_action),dim=1) 123 | target_q_min = torch.min(self.target_soft_q_net1(next_cam_tensor,next_lidar_tensor, new_next_stacked_action, reward),self.target_soft_q_net2(next_cam_tensor, next_lidar_tensor, new_next_stacked_action,reward)) - self.alpha * next_log_prob 124 | target_q_value = reward[:,-1] + (1 - done) * gamma * target_q_min # if done==1, only reward 125 | 126 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(cam_tensor,lidar_tensor,prev_action, prev_reward, device=train_device) 127 | new_stacked_action = prev_action.view(prev_action.size(0),-1)[:,2:] 128 | new_stacked_action = torch.cat((new_stacked_action,new_action),dim=1) 129 | new_action = new_stacked_action 130 | if(use_reward_scaling): 131 | reward = reward[:,-1] 132 | prev_reward = prev_reward[:,-1] 133 | 134 | prev_reward = reward_scale * (prev_reward - prev_reward.mean(dim=0)) / (prev_reward.std(dim=0) + 1e-6) 135 | reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem 136 | 137 | # Updating alpha wrt entropy 138 | # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) 139 | if 
auto_entropy is True: 140 | alpha_loss = -(self.log_alpha.exp() * (log_prob + target_entropy).detach()).mean() 141 | # print('alpha loss: ',alpha_loss) 142 | self.alpha_optimizer.zero_grad() 143 | alpha_loss.backward() 144 | self.alpha_optimizer.step() 145 | self.alpha = self.log_alpha.exp() 146 | self.alpha_losses.append(alpha_loss.item()) 147 | self.alpha_list.append(self.alpha.item()) 148 | 149 | else: 150 | self.alpha = self.alpha 151 | alpha_loss = 0 152 | 153 | if self.use_exp_prio_buffer == False: 154 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze() # detach: no gradients for the variable 155 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()).squeeze() 156 | q_value_loss1 = (q_value_loss1).mean() 157 | q_value_loss2 = (q_value_loss2).mean() 158 | else: 159 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze() # detach: no gradients for the variable 160 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()).squeeze() 161 | td_error_abs = (q_value_loss1 + q_value_loss2)/2 162 | q_value_loss1 = (q_value_loss1 * weights).mean() 163 | q_value_loss2 = (q_value_loss2 * weights).mean() 164 | # update priority for trained samples 165 | 166 | self.replay_buffer.update_priorities(idxs, np.nan_to_num(td_error_abs.cpu().detach()) + 1e-6) 167 | 168 | 169 | self.soft_q_optimizer1.zero_grad() 170 | q_value_loss1.backward() 171 | if(self.use_grad_clip): 172 | nn.utils.clip_grad_norm_(self.soft_q_net1.parameters(),self.max_grad) 173 | self.soft_q_optimizer1.step() 174 | self.soft_q_optimizer2.zero_grad() 175 | q_value_loss2.backward() 176 | if(self.use_grad_clip): 177 | nn.utils.clip_grad_norm_(self.soft_q_net2.parameters(),self.max_grad) 178 | self.soft_q_optimizer2.step() 179 | 180 | predicted_new_q_value = torch.min(self.soft_q_net1(cam_tensor,lidar_tensor, new_action, reward),self.soft_q_net2(cam_tensor,lidar_tensor, new_action, reward)) 181 | policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() 182 | 183 | self.policy_optimizer.zero_grad() 184 | policy_loss.backward() 185 | self.policy_optimizer.step() 186 | self.policy_net.version += 1 187 | 188 | self.q1_losses.append(q_value_loss1.item()) 189 | self.q2_losses.append(q_value_loss2.item()) 190 | self.policy_losses.append(policy_loss.item()) 191 | 192 | 193 | 194 | if(self.use_hard_update): 195 | if (self.update_step % self.hard_update_n_step == 0) : 196 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 197 | target_param.data.copy_(param.data) 198 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 199 | target_param.data.copy_(param.data) 200 | 201 | 202 | else: # Soft update the target value net 203 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 204 | target_param.data.copy_( # copy data value into target parameters 205 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 206 | ) 207 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 208 | target_param.data.copy_( # copy data value into target parameters 209 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 210 | ) 211 | 212 | del cam_tensor, lidar_tensor, next_cam_tensor, next_lidar_tensor, action, prev_action, reward, prev_reward, done 213 | return predicted_new_q_value.mean() 214 | 215 | def 
save_checkpoint(self, path, suffix): 216 | torch.save({ 217 | 'config' : self.save_config, 218 | 'update_step' : self.update_step, 219 | 'soft_q_net1' : self.soft_q_net1.state_dict(), 220 | 'soft_q_net2' : self.soft_q_net2.state_dict(), 221 | 'policy_net' : self.policy_net.state_dict(), 222 | 'log_alpha' : self.log_alpha, 223 | 'soft_q_optimizer1' : self.soft_q_optimizer1.state_dict(), 224 | 'soft_q_optimizer2' : self.soft_q_optimizer2.state_dict(), 225 | 'policy_optimizer' : self.policy_optimizer.state_dict(), 226 | 'alpha_optimizer' : self.alpha_optimizer.state_dict() 227 | }, path+"_CHECKPOINT_"+str(suffix)) 228 | 229 | def load_checkpoint(self,path,suffix, load_config=False): 230 | checkpoint=torch.load(path+"_CHECKPOINT_"+str(suffix)) 231 | 232 | self.soft_q_net1.load_state_dict(checkpoint['soft_q_net1']) 233 | self.soft_q_net2.load_state_dict(checkpoint['soft_q_net2']) 234 | self.policy_net.load_state_dict(checkpoint['policy_net']) 235 | self.soft_q_optimizer1.load_state_dict(checkpoint['soft_q_optimizer1']) 236 | self.soft_q_optimizer2.load_state_dict(checkpoint['soft_q_optimizer2']) 237 | self.policy_optimizer.load_state_dict(checkpoint['policy_optimizer']) 238 | self.alpha_optimizer.load_state_dict(checkpoint['alpha_optimizer']) 239 | self.log_alpha = checkpoint['log_alpha'] 240 | print("ALHPA DEVICE!!!!!111!!!: ", self.log_alpha.get_device()) 241 | self.update_step = checkpoint['update_step'] 242 | if(load_config): 243 | self.save_config = checkpoint['config'] 244 | self.load_config(checkpoint['config']) 245 | 246 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 247 | target_param.data.copy_(param.data) 248 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 249 | target_param.data.copy_(param.data) 250 | print("LOADED PRETRAINED MODEL FROM: \n", path) 251 | print("NOW START TRAINING FROM STEP: ", self.update_step) 252 | 253 | self.all_elements_to(train_device) 254 | 255 | def save_model(self, path): 256 | torch.save(self.soft_q_net1.state_dict(), path+'_q1_'+str(self.update_step)) 257 | torch.save(self.soft_q_net2.state_dict(), path+'_q2_'+str(self.update_step)) 258 | torch.save(self.policy_net.state_dict(), path+'_policy_'+str(self.update_step)) 259 | 260 | def load_model(self, path, step): 261 | self.soft_q_net1.load_state_dict(torch.load(path+'_q1_'+str(step))) 262 | self.soft_q_net2.load_state_dict(torch.load(path+'_q2_'+str(step))) 263 | self.policy_net.load_state_dict(torch.load(path+'_policy_'+str(step))) 264 | 265 | self.soft_q_net1.eval() 266 | self.soft_q_net2.eval() 267 | self.policy_net.eval() 268 | 269 | 270 | 271 | """ 272 | 273 | class SAC_Trainer_Resnet(): 274 | def __init__(self,save_location, config): 275 | save_location = save_location 276 | self.q1_losses = [] 277 | self.q2_losses = [] 278 | self.policy_losses = [] 279 | self.alpha_losses = [] 280 | self.alpha_list = [] 281 | self.update_step = 0 282 | action_range = 1. 
283 | tau = config['tau'] 284 | self.alpha = config['alpha'] 285 | q_lr = config['q_lr'] 286 | p_lr = config['p_lr'] 287 | a_lr = config['a_lr'] 288 | buffer_maxlen = config['buffer_maxlen'] 289 | num_stacked_frames = config['num_stacked_frames'] 290 | hidden_dim = config['hidden_dim'] 291 | freeze_convolution = config['freeze_convolution'] 292 | self.use_grad_clip = config['use_grad_clip'] 293 | self.max_grad = config['max_grad'] 294 | self.use_exp_prio_buffer= config['use_exp_prio_buffer'] 295 | use_pretrained_resnet18 = config['use_pretrained_resnet18'] 296 | num_lidar_beams = config['num_lidar_beams'] 297 | action_dim = 2 298 | input_shape = config['output_resolution'] 299 | input_shape = (input_shape[0],input_shape[1],input_shape[2]) 300 | state_dim = (num_stacked_frames*input_shape[0],input_shape[1],input_shape[2],num_lidar_beams) 301 | 302 | 303 | 304 | self.soft_q_net1 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 305 | self.soft_q_net2 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 306 | self.target_soft_q_net1 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 307 | self.target_soft_q_net2 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 308 | self.policy_net = PolicyNetwork_Resnet(state_dim, action_dim, hidden_dim, action_range,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 309 | self.log_alpha = torch.ones(1, device=device) * np.log(self.alpha) 310 | self.log_alpha.requires_grad=True 311 | 312 | 313 | if(self.use_exp_prio_buffer): 314 | self.replay_buffer = PrioritizedBuffer(buffer_maxlen,env.observtion_shape,memory_efficient_mode = False) 315 | self.soft_q_criterion1 = torch.nn.SmoothL1Loss(reduction = 'none') 316 | self.soft_q_criterion2 = torch.nn.SmoothL1Loss(reduction = 'none') 317 | else: 318 | self.replay_buffer = ReplayBuffer_MemEfficient_Resnet(buffer_maxlen,num_stacked_frames,state_dim) 319 | self.soft_q_criterion1 = nn.MSELoss() 320 | self.soft_q_criterion2 = nn.MSELoss() 321 | 322 | 323 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 324 | target_param.data.copy_(param.data) 325 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 326 | target_param.data.copy_(param.data) 327 | 328 | self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=q_lr) 329 | self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=q_lr) 330 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=p_lr) 331 | self.alpha_optimizer = optim.Adam([self.log_alpha], lr=a_lr) 332 | 333 | 334 | def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2, use_reward_scaling=False): 335 | start = current_milli_time() 336 | if self.use_exp_prio_buffer == False: 337 | state, action,prev_action, reward, prev_reward, next_state, done = self.replay_buffer.sample(batch_size) 338 | else: 339 | transitions, idxs, weights = self.replay_buffer.sample(batch_size) 340 | state, action, prev_action, reward, next_state, done = transitions 341 | weights = 
torch.FloatTensor(weights).to(device) 342 | 343 | state_shape = state.shape 344 | cam_state = [] 345 | lidar_state = [] 346 | next_cam_state = [] 347 | next_lidar_state = [] 348 | 349 | for x in range(batch_size): 350 | cam_state.append(state[x][0]) 351 | lidar_state.append(state[x][1]) 352 | next_cam_state.append(state[x][0]) 353 | next_lidar_state.append(state[x][1]) 354 | 355 | 356 | cam_state = np.asarray(cam_state).reshape((batch_size,3,224,224)) 357 | lidar_state = np.asarray(lidar_state) 358 | 359 | cam_tensor = torch.FloatTensor(cam_state).to(device) 360 | lidar_tensor = torch.FloatTensor(lidar_state).to(device) 361 | 362 | next_cam_state = np.asarray(next_cam_state).reshape((batch_size,3,224,224)) 363 | next_lidar_state = np.asarray(next_lidar_state) 364 | next_cam_tensor = torch.FloatTensor(next_cam_state).to(device) 365 | next_lidar_tensor = torch.FloatTensor(next_lidar_state).to(device) 366 | 367 | 368 | self.update_step +=1 369 | 370 | state = [cam_tensor,lidar_tensor] 371 | 372 | next_state = [next_cam_tensor, next_lidar_tensor] 373 | 374 | action = torch.FloatTensor(action).to(device) 375 | prev_action = torch.FloatTensor(prev_action).to(device) 376 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 377 | prev_reward = torch.FloatTensor(prev_reward).to(device) # prev_reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 378 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) 379 | 380 | 381 | 382 | predicted_q_value1 = self.soft_q_net1(state, action,prev_action, prev_reward) 383 | predicted_q_value2 = self.soft_q_net2(state, action,prev_action, prev_reward) 384 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state,prev_action, prev_reward,device=device) 385 | new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state,action, reward,device=device) 386 | 387 | if(use_reward_scaling): 388 | reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem 389 | 390 | # Updating alpha wrt entropy 391 | # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) 392 | if auto_entropy is True: 393 | alpha_loss = -(self.log_alpha * (log_prob + target_entropy).detach()).mean() 394 | # print('alpha loss: ',alpha_loss) 395 | self.alpha_optimizer.zero_grad() 396 | alpha_loss.backward() 397 | self.alpha_optimizer.step() 398 | self.alpha = self.log_alpha.exp() 399 | 400 | self.alpha_losses.append(alpha_loss.item()) 401 | self.alpha_list.append(self.alpha.item()) 402 | 403 | else: 404 | self.alpha = 1. 
405 | alpha_loss = 0 406 | 407 | # Training Q Function 408 | target_q_min = torch.min(self.target_soft_q_net1(next_state, new_next_action, action, reward),self.target_soft_q_net2(next_state, new_next_action,action,reward)) - self.alpha * next_log_prob 409 | target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward 410 | # print("pred Q", predicted_q_value1) 411 | # print("target Q", target_q_value) 412 | 413 | 414 | if self.use_exp_prio_buffer == False: 415 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()) # detach: no gradients for the variable 416 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()) 417 | else: 418 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze() # detach: no gradients for the variable 419 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()).squeeze() 420 | # print("q1 loss", q_value_loss1) 421 | # print(predicted_q_value1.shape, target_q_value.shape) 422 | # print(weights.shape, self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze().shape) 423 | td_error_abs = (q_value_loss1 + q_value_loss2)/2 424 | q_value_loss1 = (q_value_loss1 * weights).mean() 425 | q_value_loss2 = (q_value_loss2 * weights).mean() 426 | # print("IDXS: ", idxs) 427 | # update priority for trained samples 428 | for idx, td_error in zip(idxs, td_error_abs.cpu().detach() + 1e-8):#in zip(idxs, td_error_abs.cpu().detach().numpy()[0]): 429 | self.replay_buffer.update_priority(idx, td_error) 430 | 431 | self.soft_q_optimizer1.zero_grad() 432 | q_value_loss1.backward() 433 | if(self.use_grad_clip): 434 | nn.utils.clip_grad_norm_(self.soft_q_net1.parameters(),self.max_grad) 435 | self.soft_q_optimizer1.step() 436 | self.soft_q_optimizer2.zero_grad() 437 | q_value_loss2.backward() 438 | if(self.use_grad_clip): 439 | nn.utils.clip_grad_norm_(self.soft_q_net2.parameters(),self.max_grad) 440 | self.soft_q_optimizer2.step() 441 | 442 | 443 | predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action,action, reward),self.soft_q_net2(state, new_action,action, reward)) 444 | policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() 445 | 446 | self.policy_optimizer.zero_grad() 447 | policy_loss.backward() 448 | self.policy_optimizer.step() 449 | 450 | self.q1_losses.append(q_value_loss1.item()) 451 | self.q2_losses.append(q_value_loss2.item()) 452 | self.policy_losses.append(policy_loss.item()) 453 | 454 | # Soft update the target value net 455 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 456 | target_param.data.copy_( # copy data value into target parameters 457 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 458 | ) 459 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 460 | target_param.data.copy_( # copy data value into target parameters 461 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 462 | ) 463 | 464 | return predicted_new_q_value.mean() 465 | 466 | def save_model(self, path): 467 | torch.save(self.soft_q_net1.state_dict(), path+'_q1_'+str(self.update_step)) 468 | torch.save(self.soft_q_net2.state_dict(), path+'_q2_'+str(self.update_step)) 469 | torch.save(self.policy_net.state_dict(), path+'_policy_'+str(self.update_step)) 470 | 471 | def load_model(self, path, step): 472 | self.soft_q_net1.load_state_dict(torch.load(path+'_q1_'+str(step))) 473 | 
self.soft_q_net2.load_state_dict(torch.load(path+'_q2_'+str(step))) 474 | self.policy_net.load_state_dict(torch.load(path+'_policy_'+str(step))) 475 | 476 | self.soft_q_net1.eval() 477 | self.soft_q_net2.eval() 478 | self.policy_net.eval() 479 | 480 | """ -------------------------------------------------------------------------------- /SAC_hyperparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma" : 0.999, 3 | "tau" : 0.001, 4 | "alpha" : 0.2, 5 | "a_lr" : 2e-4, 6 | "q_lr" : 2e-4, 7 | "p_lr" : 2e-4, 8 | "num_stacked_frames" : 4, 9 | "train_starts" : 0, 10 | "fill_buffer_with_transitions" : 15000, 11 | "batch_size" : 64, 12 | "hidden_dim" : 64, 13 | "max_grad" : 10000, 14 | "max_episodes" : 1e+6, 15 | "max_steps" : 200, 16 | "buffer_maxlen" : 1e+6, 17 | "use_grad_clip" : false, 18 | "use_exp_prio_buffer" : true, 19 | "use_cer_exp" : false, 20 | "prioritized_replay_beta0" : 0.4, 21 | "prioritized_replay_beta1" : 0.6, 22 | "prioritized_replay_alpha" : 0.6, 23 | "use_reward_scaling" : false, 24 | "use_hard_update" : true, 25 | "hard_update_n_step" : 5000, 26 | "max_angular_vel" : 0.05, 27 | "min_linear_vel" : 0.5, 28 | "max_action_distance" : 0.5, 29 | "min_action_distance" : 0.1, 30 | "step_penalty" : -0.1, 31 | "dolly_collision_penalty" : -0.1, 32 | "different_action_penalty" : -0.05, 33 | "slow_penalty" : -0.000, 34 | "collision_penalty" : -3, 35 | "goal_reward" : 10.0, 36 | "heuristic_multiplier" : 0.0, 37 | "num_lidar_beams" : 256, 38 | "num_actions" : 2, 39 | "action_range" : 1.0, 40 | "freeze_convolution" : false, 41 | "terminate_on_collision" : true, 42 | "use_resnet_archi" : false, 43 | "use_pretrained_resnet18" : false, 44 | "num_Agents" : 3, 45 | "start_port" : 64000, 46 | "use_snail_mode" : false, 47 | "snail_kernel_size" : 2, 48 | "channel_s" : [256,128,64], 49 | "channel_a" : [16,16,16], 50 | "channel_r" : [16,16,16], 51 | "use_pretrained_vae" : false, 52 | "pretrained_vae_path" : "/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/VAE_80803_615", 53 | "load_checkpoint" : true, 54 | "checkpoint_path" : "/media/developer/c214e611-7375-4843-a5ac-e51eed61602e/home/developer/Training_results/from_Hydra/Visual/10/04/2021/09:28:20/", 55 | "checkpoint_index" : "22_158322" 56 | } -------------------------------------------------------------------------------- /buffers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Dec 19 23:24:08 2020 4 | 5 | @author: hongh 6 | """ 7 | import numpy as np 8 | from collections import deque 9 | import random 10 | import torch 11 | import matplotlib.pyplot as plt 12 | from utils import * 13 | from segment_tree import * 14 | 15 | class ReplayBuffer: 16 | def __init__(self, capacity): 17 | self.capacity = capacity 18 | self.buffer = [] 19 | self.position = 0 20 | 21 | def push(self, state, action, prev_action, reward, prev_reward, next_state, done): 22 | if len(self.buffer) < self.capacity: 23 | self.buffer.append(None) 24 | self.buffer[self.position] = (state, action,prev_action, reward, prev_reward, next_state, done) 25 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 26 | 27 | def sample(self, batch_size): 28 | batch = random.sample(self.buffer, batch_size) 29 | state, action,prev_action, reward, prev_reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element 30 | ''' 31 | the * serves as unpack: sum(a,b) <=> batch=(a,b), 
sum(*batch) ; 32 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 33 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 34 | np.stack((1,2)) => array([1, 2]) 35 | ''' 36 | return state, action, prev_action, reward, prev_reward, next_state, done 37 | 38 | def __len__(self): 39 | return len(self.buffer) 40 | 41 | 42 | class ReplayBuffer_MemEfficient: 43 | def __init__(self, capacity,num_stacked_frames,observation_shape): 44 | self.capacity = capacity 45 | self.buffer = [] 46 | self.position = 0 47 | self.not_allowed_idx = set(np.arange(num_stacked_frames)) 48 | self.num_stacked_frames = num_stacked_frames 49 | self.stacked_images_shape = (observation_shape[0],observation_shape[1],observation_shape[2]) 50 | 51 | def add(self, obs_cam, obs_lidar, action, prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done): 52 | if len(self.buffer) < self.capacity: 53 | self.buffer.append(None) 54 | self.buffer[self.position] = (obs_cam, obs_lidar, action,prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done) 55 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 56 | 57 | def sample(self, batch_size): 58 | 59 | # sample without replacement and avoid the first n=num_stacked_frames entries in the buffer to avoid: idx < frame_stack 60 | idx_list = np.random.choice(len(self.buffer)-self.num_stacked_frames, batch_size)+self.num_stacked_frames 61 | 62 | 63 | # find terminating samples inside the batch 64 | for idx in range(len(idx_list)): 65 | for frame in range(self.num_stacked_frames-1): 66 | obs_cam, obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam, next_obs_lidar , done = self.buffer[idx_list[idx]-frame] 67 | #if terminating frame is found in one of the 4 previous frames then set the index of the terminating frame to be the last one in that state 68 | #this avoids training on overlapping episodes, consider: the sampled idx is 13 -> then we concatenate the frames like so: [10,11,12,13] such that 13 is the last one 69 | # but now if frame 11 corresponds to a terminating state then frames 10 and 11 are in a different episode than 12 and 13 therefore a state containing the frames 70 | # [10,11,12,13] is impossible to reach (it would contain a teleport), therefore instead we find the done flag at frame 11 and reassign the target index from 13 to eleven 71 | # the resulting sample is now [8,9,10,11] instead of [10,11,12,13] 72 | if(done): 73 | idx_list[idx] = idx_list[idx]-frame 74 | break 75 | 76 | state_cam_batch = [] 77 | next_cam_state_batch = [] 78 | # stack the corresponding frames according to the idx_list and save them as batches 79 | ##### TODO: THIS TAKES TO LONG 80 | for idx in idx_list: 81 | obs_cam , obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam , next_obs_lidar , done = self.buffer[idx] 82 | obs_cam_1, obs_lidar_1, action_1, prev_action_1, reward_1, prev_reward_1, next_obs_cam_1, next_obs_lidar_1, done_1 = self.buffer[idx-1] 83 | obs_cam_2, obs_lidar_2, action_2, prev_action_2, reward_2, prev_reward_2, next_obs_cam_2, next_obs_lidar_2, done_2 = self.buffer[idx-2] 84 | obs_cam_3, obs_lidar_3, action_3, prev_action_3, reward_3, prev_reward_3, next_obs_cam_3, next_obs_lidar_3, done_3 = self.buffer[idx-3] 85 | obs_shape = obs_cam.shape 86 | 87 | state_cam = np.asarray([obs_cam_3, obs_cam_2, obs_cam_1, obs_cam]).reshape(self.stacked_images_shape) 88 | next_state_cam = np.asarray([obs_cam_2, obs_cam_1, obs_cam, 
next_obs_cam]).reshape(self.stacked_images_shape) 89 | 90 | 91 | 92 | state_cam_batch.append(state_cam) 93 | next_cam_state_batch.append(next_state_cam) 94 | 95 | del obs_cam , obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam , next_obs_lidar , done 96 | del obs_cam_1, obs_lidar_1, action_1, prev_action_1, reward_1, prev_reward_1, next_obs_cam_1, next_obs_lidar_1, done_1 97 | del obs_cam_2, obs_lidar_2, action_2, prev_action_2, reward_2, prev_reward_2, next_obs_cam_2, next_obs_lidar_2, done_2 98 | del obs_cam_3, obs_lidar_3, action_3, prev_action_3, reward_3, prev_reward_3, next_obs_cam_3, next_obs_lidar_3, done_3 99 | 100 | 101 | 102 | batch = [self.buffer[i] for i in idx_list] 103 | 104 | 105 | ''' 106 | the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; 107 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 108 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 109 | np.stack((1,2)) => array([1, 2]) 110 | ''' 111 | obs_cam , obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam , next_obs_lidar , done = map(np.stack, zip(*batch)) # stack for each element 112 | #replace the observations with the stacked states 113 | state_cam = np.asarray(state_cam_batch) 114 | next_state_cam = np.asarray(next_cam_state_batch) 115 | 116 | 117 | 118 | return state_cam,obs_lidar, action, prev_action, reward, prev_reward, next_state_cam, next_obs_lidar, done 119 | 120 | def __len__(self): 121 | return len(self.buffer) 122 | 123 | def show_state(self,state_tensor, save=False): 124 | state_shape= (84,84,3) 125 | pic0 = np.array(state_tensor.reshape(state_shape)) 126 | 127 | fig, (ax1) = plt.subplots(1, 1) 128 | ax1.imshow(pic0) 129 | plt.show() 130 | 131 | class ReplayBuffer_MemEfficient_Resnet: 132 | def __init__(self, capacity,num_stacked_frames,observation_shape): 133 | self.capacity = capacity 134 | self.buffer = [] 135 | self.position = 0 136 | self.not_allowed_idx = set(np.arange(num_stacked_frames)) 137 | self.num_stacked_frames = num_stacked_frames 138 | self.stacked_images_shape = (observation_shape[0],244,244) 139 | 140 | def push(self, observation, action, prev_action, reward, prev_reward, next_observation, done): 141 | if len(self.buffer) < self.capacity: 142 | self.buffer.append(None) 143 | self.buffer[self.position] = (observation, action,prev_action, reward, prev_reward, next_observation, done) 144 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 145 | 146 | def sample(self, batch_size): 147 | 148 | # sample without replacement and avoid the first n=num_stacked_frames entries in the buffer to avoid: idx < frame_stack 149 | idx_list = np.random.choice(len(self.buffer)-self.num_stacked_frames, batch_size)+self.num_stacked_frames 150 | 151 | 152 | # find terminating samples inside the batch 153 | for idx in range(len(idx_list)): 154 | for frame in range(self.num_stacked_frames-1): 155 | observation , action , prev_action , reward , prev_reward , next_observation , done = self.buffer[idx_list[idx]-frame] 156 | #if terminating frame is found in one of the 4 previous frames then set the index of the terminating frame to be the last one in that state 157 | #this avoids training on overlapping episodes, consider: the sampled idx is 13 -> then we concatenate the frames like so: [10,11,12,13] such that 13 is the last one 158 | # but now if frame 11 corresponds to a terminating state then frames 10 and 11 are in a different episode than 12 and 13 therefore a state containing the 
frames 159 | # [10,11,12,13] is impossible to reach (it would contain a teleport), therefore instead we find the done flag at frame 11 and reassign the target index from 13 to eleven 160 | # the resulting sample is now [8,9,10,11] instead of [10,11,12,13] 161 | if(done): 162 | idx_list[idx] = idx_list[idx]-frame 163 | break 164 | 165 | state_batch = [] 166 | next_state_batch = [] 167 | # stack the corresponding frames according to the idx_list and save them as batches 168 | for idx in idx_list: 169 | observation , action , prev_action , reward , prev_reward , next_observation , done = self.buffer[idx] 170 | # observation_1, action_1, prev_action_1, reward_1, prev_reward_1, next_observation_1, done_1 = self.buffer[idx-1] 171 | # observation_2, action_2, prev_action_2, reward_2, prev_reward_2, next_observation_2, done_2 = self.buffer[idx-2] 172 | # observation_3, action_3, prev_action_3, reward_3, prev_reward_3, next_observation_3, done_3 = self.buffer[idx-3] 173 | obs_shape = observation[0].shape 174 | 175 | state_lidar = observation[1] 176 | #preprocess(observation_3[0]).numpy(),preprocess(observation_2[0]).numpy(),preprocess(observation_1[0]).numpy(), 177 | state_cam = [preprocess(observation[0]).numpy()] 178 | state_cam = np.asarray(state_cam).reshape((3,224,224)) 179 | state = np.asarray([state_cam, state_lidar]) 180 | 181 | next_state_lidar = next_observation[1] 182 | #preprocess(observation_2[0]).numpy(),preprocess(observation_1[0]).numpy(),preprocess(observation[0]).numpy(), 183 | next_state_cam = [preprocess(next_observation[0]).numpy()] 184 | next_state_cam = np.asarray(next_state_cam).reshape((3,224,224)) 185 | next_state = np.asarray([next_state_cam,next_state_lidar]) 186 | 187 | state_batch.append(state) 188 | next_state_batch.append(next_state) 189 | 190 | 191 | batch = [self.buffer[i] for i in idx_list] 192 | 193 | 194 | ''' 195 | the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; 196 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 197 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 198 | np.stack((1,2)) => array([1, 2]) 199 | ''' 200 | observation, action,prev_action, reward, prev_reward, next_observation, done = map(np.stack, zip(*batch)) # stack for each element 201 | #replace the observations with the stacked states 202 | state = np.asarray(state_batch) 203 | next_state = np.asarray(next_state_batch) 204 | 205 | 206 | return state, action, prev_action, reward, prev_reward, next_state, done 207 | 208 | def __len__(self): 209 | return len(self.buffer) 210 | 211 | def show_state(self,state_tensor, save=False): 212 | state_shape= (84,84,1) 213 | pic0 = np.array(state_tensor.reshape(state_shape)) 214 | 215 | fig, (ax1) = plt.subplots(1, 1) 216 | ax1.imshow(pic0) 217 | plt.show() 218 | 219 | class BasicBuffer: 220 | def __init__(self, max_size, shape): 221 | self.max_size = max_size 222 | self.buffer = deque(maxlen=max_size) 223 | self.frame_stack = shape[0] 224 | self.not_allowed_idx = set(np.arange(self.frame_stack)) 225 | self.img_shape = shape 226 | 227 | def push(self, state, action, reward, next_state, done): 228 | ''' Changed to store the last frame''' 229 | # state = np.expand_dims(state[-1], axis=0) 230 | # next_state = np.expand_dims(next_state[-1], axis=0) 231 | # state.astype(np.uint8) 232 | # next_state.astype(np.uint8) 233 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done) 234 | 
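# --- editor's note (added for clarity; sizes are approximate) ---
# Only the newest frame of the stacked state (and of next_state) is kept, cast to uint8;
# the full frame stack is rebuilt on the fly in sample().  For an 84x84 single-channel
# frame that is roughly 7 KB per stored frame instead of ~113 KB for a 4-frame float32
# stack (4*84*84*4 bytes), which is what makes a buffer_maxlen on the order of 1e6
# transitions feasible in memory.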
self.buffer.append(experience) 235 | 236 | 237 | def sample(self, batch_size): 238 | '''Changed to memory-efficient one by storing one frame, but when extracting, frame stacking is automatically done.''' 239 | # state_batch = [] 240 | action_batch = [] 241 | reward_batch = [] 242 | # next_state_batch = [] 243 | done_batch = [] 244 | continue_sample = True 245 | state_batch = np.zeros((batch_size,*self.img_shape)) 246 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 247 | 248 | # ----------------recursive sampling to avoid: idx < frame_stack---------------- 249 | while continue_sample: 250 | idx = np.random.choice(len(self.buffer), batch_size) # sample without replacement 251 | idx_set = set(idx) 252 | if len(idx_set.intersection(self.not_allowed_idx)) == 0: 253 | continue_sample = False 254 | 255 | 256 | # --------------- frame stacking ----------------- 257 | for i in range(len(idx)): 258 | for j in range(1, self.frame_stack+1):# 259 | if self.buffer[idx[i]-j+1][4] == True and j > 1: # done = True 260 | state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 261 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 262 | else: 263 | state_batch[i, self.frame_stack-j] = self.buffer[idx[i]-j+1][0][0] # the last [0] makes --> 1*84*84 --> 84*84 264 | next_state_batch[i, self.frame_stack-j] = self.buffer[idx[i]-j+1][3][0] 265 | # state_batch.append(self.buffer[idx[i]][0]) 266 | 267 | # next_state_batch.append(self.buffer[idx[i]][3]) 268 | 269 | for i in range(len(idx)): 270 | reward_batch.append(self.buffer[idx[i]][2]) 271 | action_batch.append(self.buffer[idx[i]][1]) 272 | done_batch.append(self.buffer[idx[i]][4]) 273 | 274 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch) 275 | 276 | def __len__(self): 277 | return len(self.buffer) 278 | 279 | 280 | 281 | class SumTree: 282 | '''Even if I can't apply the trick of deque here, it is still possible to store one frame by ruling out the index in the current write position.''' 283 | write = 0 284 | 285 | def __init__(self, capacity): 286 | self.capacity = capacity 287 | self.data = np.zeros(capacity, dtype=object) # storing experience 288 | self.tree = np.zeros(2 * capacity - 1) # storing pripority + parental nodes. If you have n elements in the bottom, then you need n + (n-1) nodes to construct a tree. 
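# --- editor's sketch (illustrative example, not part of the original source) ---
# For capacity = 4 the tree array holds 2*4 - 1 = 7 entries:
#   tree[0]                 <- root, equal to the total priority (see total())
#   tree[1], tree[2]        <- internal partial sums
#   tree[3] .. tree[6]      <- leaf priorities belonging to data[0] .. data[3]
# so a leaf index maps back to its transition via data_idx = idx - capacity + 1,
# the conversion used in get() and in the samplers further below.  Minimal usage:
#   tree = SumTree(4)
#   for p, payload in [(1.0, 'a'), (2.0, 'b'), (3.0, 'c')]:
#       tree.add(p, payload)
#   idx, priority, data = tree.get(random.uniform(0, tree.total()))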
289 | self.n_entries = 0 290 | self.overwrite_start_flag = False # record whether N_entry > capacity 291 | 292 | # update to the root node 293 | def _propagate(self, idx, change): 294 | parent = (idx - 1) // 2 295 | self.tree[parent] += change 296 | if parent != 0: 297 | self._propagate(parent, change) 298 | 299 | # find sample on leaf node 300 | def _retrieve(self, idx, s): 301 | left = 2 * idx + 1 302 | right = left + 1 303 | if left >= len(self.tree): 304 | return idx 305 | if s <= self.tree[left]: 306 | return self._retrieve(left, s) 307 | else: 308 | return self._retrieve(right, s - self.tree[left]) 309 | 310 | 311 | def total(self): 312 | return self.tree[0] 313 | 314 | # store priority and sample 315 | def add(self, p, data): 316 | idx = self.write + self.capacity - 1 # starting write from the first element of the bottom layer, 317 | self.data[self.write] = data 318 | self.update(idx, p) 319 | self.write += 1 320 | if self.write >= self.capacity: 321 | self.write = 0 322 | self.overwrite_start_flag = True 323 | if self.n_entries < self.capacity: 324 | self.n_entries += 1 325 | 326 | 327 | # update priority 328 | def update(self, idx, p): 329 | change = p - self.tree[idx] 330 | self.tree[idx] = p 331 | self._propagate(idx, change) 332 | 333 | # get priority and sample 334 | def get(self, s): 335 | #--------original---------- 336 | idx = self._retrieve(0, s) 337 | dataIdx = idx - self.capacity + 1 338 | return (idx, self.tree[idx], self.data[dataIdx]) 339 | #--------The below is Morvan's implementation---------- 340 | #https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Dueling%20Double%20DQN%20with%20PER%20and%20fixed-q%20targets/.ipynb_checkpoints/Dueling%20Deep%20Q%20Learning%20with%20Doom%20(%2B%20double%20DQNs%20and%20Prioritized%20Experience%20Replay)-checkpoint.ipynb 341 | #https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5.2_Prioritized_Replay_DQN/RL_brain.py 342 | # parent_idx = 0 343 | # while True: # the while loop is faster than the method in the reference code 344 | # cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 345 | # cr_idx = cl_idx + 1 346 | # if cl_idx >= len(self.tree): # reach bottom, end search 347 | # leaf_idx = parent_idx 348 | # break 349 | # else: # downward search, always search for a higher priority node 350 | # if s <= self.tree[cl_idx]: 351 | # parent_idx = cl_idx 352 | # else: 353 | # s -= self.tree[cl_idx] 354 | # parent_idx = cr_idx 355 | # data_idx = leaf_idx - self.capacity + 1 356 | # return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 357 | 358 | 359 | def max_priority(self): 360 | return np.max(self.tree[self.capacity-1:]) 361 | 362 | # ---- modified : later to avoid min_prob being overwritten ---- 363 | def min_prob(self): 364 | if self.overwrite_start_flag == False: 365 | p_min = float(np.min(self.tree[self.capacity-1:self.n_entries+self.capacity-1])) / self.total() 366 | self.p_min_history = p_min 367 | elif self.overwrite_start_flag == True: 368 | p_min = min( float(np.min(self.tree[self.capacity-1:self.n_entries+self.capacity-1])) / self.total() ,self.p_min_history) 369 | self.p_min_history = min(p_min, self.p_min_history) 370 | return p_min 371 | 372 | 373 | 374 | class PrioritizedBuffer: 375 | '''Currently, there is a problem when memory is full''' 376 | def __init__(self, max_size, shape, alpha=0.6, beta=0.4, memory_efficient_mode = True): 377 | self.sum_tree = SumTree(max_size) 378 | self.alpha = alpha 379 | self.beta = beta 380 | 
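# --- editor's note (added for clarity) ---
# beta sets the strength of the importance-sampling correction
#   w_i = (N * P(i))**(-beta) / max_j w_j
# and is annealed from beta0 (0.4 in SAC_hyperparameters.json) towards 1.0 inside
# sample() via beta_increment, i.e. over roughly 1e5 sampled batches; beta = 1 fully
# compensates the bias introduced by prioritized sampling.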
self.beta_increment = (1.0 - beta)/100000. 381 | self.current_length = 0 382 | self.frame_stack = shape[0] 383 | self.efficient_memory_mode = memory_efficient_mode 384 | self.img_shape = shape 385 | 386 | 387 | def push(self, state, action,prev_action, reward, next_state, done): 388 | priority = 1.0 if self.current_length == 0 else self.sum_tree.max_priority() 389 | self.current_length = self.current_length + 1 390 | # priority = priority ** self.alpha 391 | # -------------modified for efficient storage-------------- 392 | if self.efficient_memory_mode: 393 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done) 394 | else: 395 | experience = (state, action,prev_action, reward, next_state, done) 396 | self.sum_tree.add(priority, experience) 397 | 398 | 399 | def sample(self, batch_size): 400 | # ----------------To modify: sampling for efficient memory storage------------------ 401 | batch_idx, batch, priorities = [], [], [] 402 | segment = self.sum_tree.total() / batch_size 403 | p_sum = self.sum_tree.tree[0] 404 | 405 | for i in range(batch_size): 406 | a = segment * i 407 | b = segment * (i + 1) 408 | 409 | # ---------self modified to allow storing for efficient memory storage of only one frame---------- 410 | if self.efficient_memory_mode: 411 | while True: 412 | s = random.uniform(a, b) 413 | idx, p, data = self.sum_tree.get(s) 414 | data_idx = idx-self.sum_tree.capacity+1 # Note: idx(node idx) is different from dataidx (data idx)! 415 | # print(data_idx, self.sum_tree.write, self.frame_stack) 416 | if (data_idx - self.sum_tree.write > self.frame_stack or data_idx<= self.sum_tree.write) and (self.sum_tree.overwrite_start_flag == True): 417 | break 418 | # elif (self.sum_tree.overwrite_start_flag == False) and (data_idx >= self.frame_stack - 1): 419 | # break 420 | elif (self.sum_tree.overwrite_start_flag == False): # allow samples from <= frame_stack 421 | break 422 | else: 423 | s = random.uniform(a, b) 424 | idx, p, data = self.sum_tree.get(s) 425 | batch_idx.append(idx) 426 | batch.append(data) 427 | priorities.append(p) 428 | prob = np.array(priorities) /p_sum 429 | IS_weights = np.power(self.sum_tree.n_entries * prob, -self.beta) 430 | max_weight = np.power(self.sum_tree.n_entries * self.sum_tree.min_prob(), -self.beta) 431 | IS_weights /= max_weight 432 | 433 | if self.efficient_memory_mode: 434 | action_batch = [] 435 | reward_batch = [] 436 | done_batch = [] 437 | state_batch = np.zeros((batch_size,*self.img_shape)) 438 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 439 | # --------------- frame stacking ----------------- 440 | for i in range(len(batch_idx)): 441 | for j in range(1, self.frame_stack+1): 442 | data_idx = batch_idx[i]-self.sum_tree.capacity+1-j+1 # notice the offset to substract to get right data[data_idx] 443 | if data_idx < 0 and self.sum_tree.overwrite_start_flag == True: # for the case : sample the 1st element and stack the last 3 elements from the end 444 | data_idx += self.sum_tree.capacity 445 | elif data_idx < 0 and self.sum_tree.overwrite_start_flag == False: # for the case : sample the 2nd element and stack the 1st elements 3 times from the beginning 446 | data_idx = 0 447 | # print(data_idx ,self.sum_tree.data[data_idx]) 448 | if (self.sum_tree.data[data_idx][4] == True and j > 1) or (data_idx < 0): # for the case of done = True or sampling from the beginning 449 | state_batch[i, self.frame_stack-j] = state_batch[i, 
self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 450 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 451 | else: 452 | state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][0][0] # the last [0] makes --> 1*84*84 --> 84*84 453 | next_state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][3][0] 454 | 455 | for i in range(len(batch_idx)): 456 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 457 | reward_batch.append(self.sum_tree.data[data_idx][2]) 458 | action_batch.append(self.sum_tree.data[data_idx][1]) 459 | done_batch.append(self.sum_tree.data[data_idx][4]) 460 | else: 461 | state_batch = [] 462 | action_batch = [] 463 | prev_action_batch = [] 464 | reward_batch = [] 465 | next_state_batch = [] 466 | done_batch = [] 467 | for transition in batch: 468 | state, action, prev_action,reward, next_state, done = transition 469 | state_batch.append(state) 470 | action_batch.append(action) 471 | prev_action_batch.append(prev_action) 472 | reward_batch.append(reward) 473 | next_state_batch.append(next_state) 474 | done_batch.append(done) 475 | self.beta = min(1., self.beta + self.beta_increment) 476 | return (state_batch, action_batch,prev_action_batch, reward_batch, next_state_batch, done_batch), batch_idx, IS_weights 477 | 478 | def update_priority(self, idx, td_error): 479 | priority = np.power(td_error, self.alpha) 480 | self.sum_tree.update(idx, priority) 481 | 482 | def __len__(self): 483 | return self.current_length 484 | 485 | 486 | 487 | class PrioritizedBuffer_SNAIL: 488 | '''Only designed for TCN mode''' 489 | def __init__(self, max_size, obs_shape, action_n, alpha=0.6, beta=0.4): 490 | self.sum_tree = SumTree(max_size) 491 | self.alpha = alpha 492 | self.beta = beta 493 | self.beta_increment = (1.0 - beta)/100000. 494 | self.current_length = 0 495 | self.frame_stack = obs_shape[0] 496 | self.efficient_memory_mode = True 497 | self.img_shape = obs_shape 498 | self.action_n = action_n 499 | 500 | def push(self, state, action, reward, next_state, done): 501 | priority = 1.0 if self.current_length == 0 else self.sum_tree.max_priority() 502 | self.current_length = self.current_length + 1 503 | #priority = td_error ** self.alpha 504 | # -------------modified for efficient storage-------------- 505 | if self.efficient_memory_mode: 506 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done) 507 | else: 508 | experience = (state, action, reward, next_state, done) 509 | self.sum_tree.add(priority, experience) 510 | 511 | 512 | def sample(self, batch_size): 513 | # ----------------To modify: sampling for efficient memory storage------------------ 514 | batch_idx, batch, priorities = [], [], [] 515 | segment = self.sum_tree.total() / batch_size 516 | p_sum = self.sum_tree.tree[0] 517 | 518 | for i in range(batch_size): 519 | a = segment * i 520 | b = segment * (i + 1) 521 | 522 | # ---------self modified to allow storing for efficient memory storage of only one frame---------- 523 | if self.efficient_memory_mode: 524 | while True: 525 | s = random.uniform(a, b) 526 | idx, p, data = self.sum_tree.get(s) 527 | data_idx = idx-self.sum_tree.capacity+1 # Note: idx(node idx) is different from dataidx (data idx)! 
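# --- editor's note (added for clarity) ---
# The surrounding while-loop re-draws s until the sampled transition is safe to
# frame-stack: once the ring buffer has wrapped (overwrite_start_flag == True),
# data indices within frame_stack positions ahead of the write pointer are rejected,
# because the frames just before them in the ring have already been overwritten by
# the newest transitions and no longer belong to the same stretch of experience.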
528 | # print(data_idx, self.sum_tree.write, self.frame_stack) 529 | if (data_idx - self.sum_tree.write > self.frame_stack or data_idx<= self.sum_tree.write) and (self.sum_tree.overwrite_start_flag == True): 530 | break 531 | # elif (self.sum_tree.overwrite_start_flag == False) and (data_idx >= self.frame_stack - 1): 532 | # break 533 | elif (self.sum_tree.overwrite_start_flag == False): # allow samples from <= frame_stack 534 | break 535 | else: 536 | s = random.uniform(a, b) 537 | idx, p, data = self.sum_tree.get(s) 538 | batch_idx.append(idx) 539 | batch.append(data) 540 | priorities.append(p) 541 | prob = np.array(priorities) /p_sum 542 | IS_weights = np.power(self.sum_tree.n_entries * prob, -self.beta) 543 | max_weight = np.power(self.sum_tree.n_entries * self.sum_tree.min_prob(), -self.beta) 544 | IS_weights /= max_weight 545 | 546 | if self.efficient_memory_mode: 547 | # action: (B,L*feat), 548 | # reward: (B,L*1) 549 | action_batch_s = np.zeros((batch_size, self.action_n, self.img_shape[0])) 550 | reward_batch_s = np.zeros((batch_size, 1, self.img_shape[0])) 551 | reward_batch = [] 552 | action_batch = [] 553 | done_batch = [] 554 | state_batch = np.zeros((batch_size,*self.img_shape)) 555 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 556 | # --------------- frame stacking ----------------- 557 | for i in range(len(batch_idx)): 558 | for j in range(1, self.frame_stack+1): 559 | data_idx = batch_idx[i]-self.sum_tree.capacity+1-j+1 # notice the offset to substract to get right data[data_idx] 560 | if data_idx < 0 and self.sum_tree.overwrite_start_flag == True: # for the case : sample the 1st element and stack the last 3 elements from the end 561 | data_idx += self.sum_tree.capacity 562 | elif data_idx < 0 and self.sum_tree.overwrite_start_flag == False: # for the case : sample the 2nd element and stack the 1st elements 3 times from the beginning 563 | data_idx = 0 564 | # print(data_idx ,self.sum_tree.data[data_idx]) 565 | if (self.sum_tree.data[data_idx][4] == True and j > 1) or (data_idx < 0): # for the case of done = True or sampling from the beginning 566 | state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 567 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 568 | action_batch_s[i,:,self.frame_stack-j] = np.array([0]*self.action_n)# action_batch_s[i, self.frame_stack-j+1] 569 | reward_batch_s[i,:,self.frame_stack-j] = np.array([0])# reward_batch_s[i, self.frame_stack-j+1] 570 | else: 571 | state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][0][0] # the last [0] makes --> 1*84*84 --> 84*84 572 | next_state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][3][0] 573 | action_batch_s[i,:,self.frame_stack-j] = self.sum_tree.data[data_idx][1][0] 574 | reward_batch_s[i,:,self.frame_stack-j] = self.sum_tree.data[data_idx][2][0] 575 | 576 | for i in range(len(batch_idx)): 577 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 578 | reward_batch.append(self.sum_tree.data[data_idx][2]) 579 | action_batch.append(self.sum_tree.data[data_idx][1]) 580 | done_batch.append(self.sum_tree.data[data_idx][4]) 581 | else: 582 | state_batch = [] 583 | action_batch = [] 584 | reward_batch = [] 585 | next_state_batch = [] 586 | done_batch = [] 587 | for transition in batch: 588 | state, action, reward, next_state, done = transition 589 | state_batch.append(state) 590 | action_batch.append(action) 591 | reward_batch.append(reward) 592 | 
next_state_batch.append(next_state) 593 | done_batch.append(done) 594 | self.beta = min(1., self.beta + self.beta_increment) 595 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch, action_batch_s, reward_batch_s), batch_idx, IS_weights 596 | 597 | def update_priority(self, idx, td_error): 598 | priority = np.power(td_error, self.alpha) 599 | self.sum_tree.update(idx, priority) 600 | 601 | def __len__(self): 602 | return self.current_length 603 | 604 | 605 | 606 | class PrioritizedBuffer_N_step: 607 | '''The difference to is gamma_exp is another element saved 608 | ''' 609 | def __init__(self, max_size, shape, alpha=0.6, beta=0.4): 610 | self.sum_tree = SumTree(max_size) 611 | self.alpha = alpha 612 | self.beta = beta 613 | self.beta_increment = (1.0 - beta)/100000. 614 | self.current_length = 0 615 | self.frame_stack = shape[0] 616 | self.efficient_memory_mode = True 617 | self.img_shape = shape 618 | 619 | 620 | def push(self, state, action, reward, next_state, done, gamma_exp): 621 | priority = 1.0 if self.current_length == 0 else self.sum_tree.max_priority() 622 | self.current_length = self.current_length + 1 623 | #priority = td_error ** self.alpha 624 | # -------------modified for efficient storage-------------- 625 | if self.efficient_memory_mode: 626 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done, np.array([gamma_exp])) 627 | else: 628 | experience = (state, np.array([action]), np.array([reward]), next_state, done, np.array([gamma_exp])) 629 | self.sum_tree.add(priority, experience) 630 | 631 | 632 | def sample(self, batch_size): 633 | # ----------------To modify: sampling for efficient memory storage------------------ 634 | batch_idx, batch, priorities = [], [], [] 635 | segment = self.sum_tree.total() / batch_size 636 | p_sum = self.sum_tree.tree[0] 637 | 638 | for i in range(batch_size): 639 | a = segment * i 640 | b = segment * (i + 1) 641 | 642 | # ---------self modified to allow storing for efficient memory storage of only one frame---------- 643 | if self.efficient_memory_mode: 644 | while True: 645 | 646 | s = random.uniform(a, b) 647 | idx, p, data = self.sum_tree.get(s) 648 | data_idx = idx-self.sum_tree.capacity+1 # Note: idx(node idx) is different from dataidx (data idx)! 
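# --- editor's note (illustrative; assumes the trainer uses gamma_exp as the n-step
#     discount exponent, which is not shown in this file) ---
# With n-step returns the stored reward is the accumulated
#   R = r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1}
# so the bootstrapped target would be formed roughly as
#   target = R + (1 - done) * gamma**gamma_exp * min(Q1_target, Q2_target)(s_{t+n}, a')
# which is why gamma_exp is returned by sample() alongside every transition.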
649 | # print(data_idx, self.sum_tree.write, self.frame_stack) 650 | if (data_idx - self.sum_tree.write > self.frame_stack or data_idx<= self.sum_tree.write) and (self.sum_tree.overwrite_start_flag == True): 651 | break 652 | # elif (self.sum_tree.overwrite_start_flag == False) and (data_idx >= self.frame_stack - 1): 653 | # break 654 | elif (self.sum_tree.overwrite_start_flag == False): # allow samples from <= frame_stack 655 | break 656 | else: 657 | s = random.uniform(a, b) 658 | idx, p, data = self.sum_tree.get(s) 659 | batch_idx.append(idx) 660 | batch.append(data) 661 | priorities.append(p) 662 | prob = np.array(priorities) /p_sum 663 | IS_weights = np.power(self.sum_tree.n_entries * prob, -self.beta) 664 | max_weight = np.power(self.sum_tree.n_entries * self.sum_tree.min_prob(), -self.beta) 665 | IS_weights /= max_weight 666 | 667 | if self.efficient_memory_mode: 668 | action_batch = [] 669 | reward_batch = [] 670 | done_batch = [] 671 | state_batch = np.zeros((batch_size,*self.img_shape)) 672 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 673 | gamma_exp_batch = [] 674 | # --------------- frame stacking ----------------- 675 | for i in range(len(batch_idx)): 676 | for j in range(1, self.frame_stack+1): 677 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 -j+1 # notice the offset to substract to get right data[data_idx] 678 | if data_idx < 0 and self.sum_tree.overwrite_start_flag == True: # for the case : sample the 1st element and stack the last 3 elements from the end 679 | data_idx += self.sum_tree.capacity 680 | elif data_idx < 0 and self.sum_tree.overwrite_start_flag == False: # for the case : sample the 2nd element and stack the 1st elements 3 times from the beginning 681 | data_idx = 0 682 | # print(data_idx ,self.sum_tree.data[data_idx]) 683 | if (self.sum_tree.data[data_idx][4] == True and j > 1) or (data_idx < 0): # for the case of done = True or sampling from the beginning 684 | state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 685 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 686 | else: 687 | state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][0][0] # the last [0] makes --> 1*84*84 --> 84*84 688 | next_state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][3][0] 689 | 690 | for i in range(len(batch_idx)): 691 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 692 | reward_batch.append(self.sum_tree.data[data_idx][2]) 693 | action_batch.append(self.sum_tree.data[data_idx][1]) 694 | done_batch.append(self.sum_tree.data[data_idx][4]) 695 | gamma_exp_batch.append(self.sum_tree.data[data_idx][5]) 696 | else: 697 | state_batch = [] 698 | action_batch = [] 699 | reward_batch = [] 700 | next_state_batch = [] 701 | done_batch = [] 702 | gamma_exp_batch = [] 703 | for transition in batch: 704 | state, action, reward, next_state, done, gamma_exp = transition 705 | state_batch.append(state) 706 | action_batch.append(action) 707 | reward_batch.append(reward) 708 | next_state_batch.append(next_state) 709 | done_batch.append(done) 710 | gamma_exp_batch.append(gamma_exp) 711 | self.beta = min(1., self.beta + self.beta_increment) 712 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch, gamma_exp_batch), batch_idx, IS_weights 713 | 714 | def update_priority(self, idx, td_error): 715 | priority = np.power(td_error, self.alpha) 716 | self.sum_tree.update(idx, priority) 717 | 718 | def __len__(self): 719 | return 
self.current_length 720 | 721 | 722 | -------------------------------------------------------------------------------- /nav_acl.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import matplotlib.pyplot as plt 3 | from utils import rel_rot, rot_vec 4 | from numpy import savetxt 5 | import numpy as np 6 | import scipy 7 | from scipy.spatial import distance 8 | from scipy.spatial.transform import Rotation as R 9 | from scipy import stats, optimize, interpolate 10 | import random 11 | from Networks import NavACLNetwork, SoftQNetwork 12 | from isaac_q_helper_functions import * 13 | from RobotTask import RobotTask, Tasktype 14 | import json 15 | import torch 16 | import torch.nn as nn 17 | import torch.optim as optim 18 | from collections import deque 19 | from per_buffer import ReplayBuffer 20 | import random 21 | import time 22 | 23 | 24 | class Nav_ACL(): 25 | """ 26 | My Interpretation of the Nav_ACL Algorithm for automatic Curriculum generation 27 | https://arxiv.org/abs/2009.05429 28 | """ 29 | def __init__(self,nav_acl_network,config,worker_instance=True, shared_q_net=None, env=None, task_offset=np.array([0,0,0]), agent_index = 0, shared_elements=None): 30 | super(Nav_ACL, self).__init__() 31 | """ 32 | my Implementaion keeps track of all previously generated tasks and their outcomes 33 | """ 34 | self.config = config 35 | self.tasks = [] 36 | self.steps = [] 37 | self.rewards = [] 38 | self.dones = [] 39 | self.params = [] 40 | self.default_task = self.config['default_task_omniverse'] 41 | self.GOID_limits = self.config['GOID_limits'] 42 | self.task_gen_method = self.config['task_generation_method'] 43 | self.initial_orientation = np.array([1,0]) # it is assumed that the initial 2D orientation of the robot and the dolly is [1,0] and that the applied is relative to this initial orientation 44 | self.NavACL_loss_func = torch.nn.BCELoss() 45 | self.adaptive_filtering_task_types = [Tasktype.RANDOM, Tasktype.EASY, Tasktype.FRONTIER] 46 | self.adaptive_filtering_task_probs = [0.15 , 0.425 , 0.425 ] 47 | self.device = train_device 48 | self.shared_elements = shared_elements 49 | self.agent_index = agent_index 50 | self.q_net = SoftQNetwork(self.config) 51 | self.q_net.to(Qricculum_device) 52 | 53 | if(not worker_instance): # this instance of NavACL is the one used to train the network i.e. it does not receive copies -> it produces the copies 54 | self.NavACLNet = nav_acl_network 55 | self.NavACL_optimizer = optim.Adam(self.NavACLNet.parameters(), lr=self.config['nav_acl_lr']) 56 | self.q_net = None 57 | self.env = None 58 | else: 59 | if(self.config['q_ricculum_learning']): 60 | if(self.config['use_AF_database']): ## use a predefined database for adaptive filtering method of nav_acl (only relevant for qricculum learning, bc. 
it prevents generating tasks wich are only used for creating the adaptive boundaries) 61 | self.load_AF_database(self.config['AF_database_path']+str(agent_index)+".npy") 62 | print("using qricculum learning for agent: ", agent_index, " with task AF_database: ", self.config['AF_database_path']+str(agent_index)+".npy") 63 | assert (shared_q_net is not None) 64 | assert (env is not None) 65 | self.env = env 66 | self.update_networks(nav_acl_network,shared_q_net,task_offset) 67 | else: 68 | self.update_networks(nav_acl_network,q_net=None) 69 | self.NavACLNet = nav_acl_network 70 | 71 | if(self.config['using_unity']): 72 | self.default_task = self.config['default_task_unity'] 73 | else: 74 | self.default_task = self.config['default_task_omniverse'] 75 | 76 | def update_networks(self,nav_acl_net, q_net, translation_offset=np.array([0,0,0])): 77 | if(self.config['q_ricculum_learning']): 78 | assert (q_net is not None) 79 | self.q_net.load_state_dict(q_net.state_dict()) 80 | for param in self.q_net.parameters(): 81 | param.requires_grad = False 82 | 83 | if(self.config['using_unity']): 84 | self.default_task = self.config['default_task_unity'] 85 | 86 | self.NavACLNet = NavACLNetwork(5,self.config['nav_acl_hidden_dim']) 87 | self.NavACLNet.load_state_dict(nav_acl_net.state_dict()) 88 | self.NavACLNet.to(self.device) 89 | 90 | def get_task_difficulty_measure(self, robot_task): 91 | success_probability = self.get_task_success_probability(robot_task) 92 | return (1-success_probability) 93 | 94 | def get_task_success_probability(self, robot_task): 95 | task_params_array = self.get_task_params_array(robot_task,normalize=self.config['normalize_tasks']) 96 | task_params_array = torch.FloatTensor(task_params_array).to(self.device) 97 | return self.NavACLNet(task_params_array) 98 | 99 | def normalize_task_params_array(self, task_params_array): 100 | # task_params_array = (task_params_array - self.task_mean) / (self.deviser+1e-3) 101 | r_rot = self.config['randomization_params']['robot_randomization']['rotation_rnd'][0] / 2 102 | d_rot = self.config['randomization_params']['dolly_randomization']['rotation_rnd'][0] / 2 103 | rot = r_rot + d_rot 104 | task_params_array[0] = np.interp(task_params_array[0], (self.config['randomization_params']['min_dist_dolly_robot'], self.config['randomization_params']['max_dist_dolly_robot']), (-1,1)) 105 | task_params_array[1] = np.interp(task_params_array[1], (self.config['randomization_params']['min_dist_dolly_obs'], self.config['randomization_params']['max_dist_dolly_obs']), (-1,1)) 106 | task_params_array[2] = np.interp(task_params_array[2], (self.config['randomization_params']['min_dist_robot_obs'], self.config['randomization_params']['max_dist_robot_obs']), (-1,1)) 107 | task_params_array[3] = np.interp(task_params_array[3], (-(rot*np.pi)/180, (rot*np.pi)/180), (-1,1)) 108 | task_params_array[4] = np.interp(task_params_array[4], (self.config['collision_penalty']-self.config['goal_reward'],self.config['goal_reward']), (-1,1)) 109 | return task_params_array 110 | 111 | def get_task_params_array(self, task,normalize=True): 112 | if(self.config['q_ricculum_learning']): 113 | task_params_array = np.array([self.compute_dolly_robot_distance(task), 114 | self.compute_dolly_min_distance(task), 115 | self.compute_robot_min_distance(task), 116 | self.compute_relative_rotation_robot_dolly(task), 117 | np.clip(task.q_value,self.config['collision_penalty']-self.config['goal_reward'],self.config['goal_reward'])]) 118 | else: 119 | task_params_array = 
np.array([self.compute_dolly_robot_distance(task), 120 | self.compute_dolly_min_distance(task), 121 | self.compute_robot_min_distance(task), 122 | self.compute_relative_rotation_robot_dolly(task), 123 | 0.1]) 124 | if normalize: 125 | task_params_array = self.normalize_task_params_array(task_params_array) 126 | return task_params_array 127 | 128 | def apply_offset_to_robot(self, robot_task, translation_offset): 129 | robot_task.robot_translation =robot_task.robot_translation + translation_offset 130 | return robot_task 131 | 132 | 133 | def apply_offset_to_dolly(self, robot_task, translation_offset): 134 | robot_task.dolly_translation += translation_offset 135 | return robot_task 136 | 137 | 138 | def create_randomized_task(self,translation_offset= np.array([0,0,0])): 139 | robot_task = RobotTask(self.default_task) 140 | robot_task.robot_translation =robot_task.robot_translation + translation_offset 141 | robot_task.dolly_translation += translation_offset 142 | robot_task.obstacle_translation += translation_offset 143 | robot_task.obstacle_1_translation += translation_offset 144 | robot_task.obstacle_2_translation += translation_offset 145 | robot_task.obstacle_3_translation += translation_offset 146 | robot_task.randomize_task(self.config['randomization_params']) 147 | return robot_task 148 | 149 | 150 | 151 | def sample_random_task(self,translation_offset= np.array([0,0,0])): 152 | robot_task = self.create_randomized_task(translation_offset=translation_offset) 153 | 154 | while self.check_if_task_is_valid(robot_task) is not True: 155 | robot_task = self.create_randomized_task(translation_offset=translation_offset) 156 | 157 | if(self.config['q_ricculum_learning']): 158 | robot_task.q_value = self.get_q_value_for_task(robot_task,translation_offset) 159 | # print("q value of task: ", self.get_task_params_array(robot_task,False), " is : ", robot_task.q_value) 160 | return robot_task 161 | 162 | def save_tasks_for_later(self,translation_offset, num_tasks=1000, save_location="/home/developer/Training_results/Qricculum_Learning"): 163 | task_list = [] 164 | for task in range(num_tasks): 165 | robot_task = self.create_randomized_task(translation_offset=translation_offset) 166 | obs_cam, obs_lidar = self.env.reset(robot_task) 167 | prev_action = np.zeros((2)) 168 | prev_reward = [self.config['step_penalty']] 169 | task_array = robot_task.get_task_array() 170 | task_list.append((obs_cam, obs_lidar,prev_action,prev_reward,task_array)) 171 | if (task % 1000 == 0 ): 172 | print("saved so far: ", task) 173 | task_array = np.array(task_list) 174 | np.save(save_location,task_array) 175 | 176 | 177 | def load_AF_database(self, AF_database_path): 178 | """ 179 | loads a task database from an .npy file and stores it as a list 180 | """ 181 | self.AF_database = np.load(AF_database_path, allow_pickle=True) 182 | self.AF_database = self.AF_database.tolist() # this way we can use random.sample(list, num_elements) 183 | print("using qricculum learning with database: ", AF_database_path) 184 | self.dtb_stack = self.create_dtb_stack() 185 | 186 | 187 | def observation_to_q_value(self, obs_cam, obs_lidar, prev_action, prev_reward): 188 | n_s_f = self.config['num_stacked_frames'] 189 | stacked_camera_obsrv = deque(maxlen=n_s_f) 190 | transitions = deque(maxlen=n_s_f) 191 | # start with a fresh state 192 | for i in range(self.config['num_stacked_frames']): 193 | stacked_camera_obsrv.append(obs_cam) 194 | if(self.config['use_snail_mode']): 195 | transitions.append((obs_cam, obs_lidar,prev_action,prev_reward)) 196 | 
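# --- editor's note (added for clarity) ---
# On a fresh reset there is no frame history yet, so the single camera observation is
# repeated num_stacked_frames times to build the initial stacked state; with the
# default config (4 stacked 3-channel 80x80 frames) the reshape below yields a
# (12, 80, 80) tensor, matching the input shape the Q-network was trained on.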
197 | stacked_camera_state = np.asarray(stacked_camera_obsrv).reshape((self.env.observtion_shape[0]*n_s_f,self.env.observtion_shape[1],self.env.observtion_shape[2])) 198 | cam_tensor = torch.FloatTensor(stacked_camera_state).unsqueeze(0).to(Qricculum_device) 199 | lidar_tensor = torch.FloatTensor(obs_lidar).unsqueeze(0).to(Qricculum_device) 200 | prev_action = torch.FloatTensor(prev_action).unsqueeze(0).to(Qricculum_device) 201 | reward = torch.FloatTensor(prev_reward).unsqueeze(0).to(Qricculum_device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 202 | with torch.no_grad(): 203 | q_value = self.q_net(cam_tensor,lidar_tensor, prev_action, reward).detach().cpu().numpy().flatten() 204 | del stacked_camera_state, cam_tensor, lidar_tensor, prev_action, reward, obs_cam, obs_lidar, stacked_camera_obsrv 205 | return q_value[0] 206 | 207 | 208 | 209 | 210 | 211 | 212 | def get_q_value_for_task(self,robot_task,translation_offset): 213 | obs_cam, obs_lidar = self.env.reset(robot_task) 214 | prev_action = np.zeros(8) 215 | prev_reward = np.ones(4)*self.config['step_penalty'] 216 | # self.show_state(obs_cam,True,string_extra=str(translation_offset)) 217 | q = self.observation_to_q_value(obs_cam, obs_lidar,prev_action, prev_reward) 218 | return q 219 | 220 | def show_state(self,state_tensor, save=False, string_extra=""): 221 | state_shape= (84,84,3) 222 | state_tensor = state_tensor / 2**8 223 | pic0 = np.array(state_tensor.reshape(state_shape).astype(np.float32)) 224 | 225 | fig, (ax1) = plt.subplots(1, 1) 226 | ax1.imshow(pic0) 227 | if(save): 228 | path = "/home/developer/Pictures"+string_extra+str(current_milli_time())+".png" 229 | print("saving image as: ", path) 230 | plt.imsave(path,pic0) 231 | else: 232 | plt.show() 233 | 234 | def generate_random_task(self, translation_offset=np.array([0,0,0])): 235 | """ 236 | returns a randomized RobotTask: 237 | 238 | based on a default task given in NAV_ACL_hyperparameters.json 239 | """ 240 | if self.task_gen_method == "GOID": 241 | return self.generate_random_task_GOID(translation_offset) 242 | 243 | if self.task_gen_method == "AF": 244 | return self.generate_random_task_AF(translation_offset) 245 | 246 | 247 | 248 | def generate_random_task_GOID(self, translation_offset): 249 | while True: 250 | robot_task = self.sample_random_task(translation_offset) 251 | difficulty = self.get_task_difficulty_measure(robot_task).cpu() 252 | if ( (difficulty > self.GOID_limits['lower_limit'] ) and (difficulty < self.GOID_limits['upper_limit'])) : 253 | return robot_task 254 | 255 | def generate_random_task_AF(self, translation_offset): 256 | if(self.config['only_random_tasks']): 257 | task_type = Tasktype.RANDOM 258 | else: 259 | task_type = np.random.choice(self.adaptive_filtering_task_types, 1, p=self.adaptive_filtering_task_probs)[0] # adaptive sampling of the task type 260 | return self.get_dynamic_task(task_type,translation_offset) 261 | 262 | def get_dynamic_task(self, task_type, translation_offset): 263 | """ 264 | This is the main algorithm for nav-acl-q 265 | """ 266 | 267 | 268 | if(self.config['use_AF_database']): 269 | mu, sigma = self.fast_create_adaptive_boundaries_mu_sig() 270 | else: 271 | mu, sigma = self.create_adaptive_boundaries() 272 | 273 | beta = self.config['adaptive_filtering_params']['nav_beta'] 274 | gamma_low = self.config['adaptive_filtering_params']['nav_gamma_low'] 275 | gamma_hi = self.config['adaptive_filtering_params']['nav_gamma_hi'] 276 | omega = 
self.config['adaptive_filtering_params']['nav_P_omega'] 277 | 278 | 279 | for trial in range(self.config['nav_acl_max_AF_task_samples']): 280 | if(self.config['runtimetasks_from_database'] and self.config['use_AF_database']): 281 | robot_task = self.sample_task_from_database(self.AF_database) 282 | else: 283 | robot_task = self.sample_random_task(translation_offset) 284 | succ_prob = self.get_task_success_probability(robot_task).cpu() 285 | time.sleep(0.01) 286 | 287 | if task_type == Tasktype.EASY: 288 | if (mu + beta*sigma) < 1: 289 | if (succ_prob > (mu + beta*sigma)) or (succ_prob > omega): 290 | robot_task.task_type = Tasktype.EASY 291 | return robot_task 292 | else: 293 | if (succ_prob > mu): 294 | robot_task.task_type = Tasktype.EASY 295 | return robot_task 296 | if task_type == Tasktype.FRONTIER: 297 | if (mu - gamma_low*sigma) < succ_prob < (mu + gamma_hi*sigma): 298 | robot_task.task_type = Tasktype.FRONTIER 299 | return robot_task 300 | if task_type == Tasktype.RANDOM: 301 | robot_task.task_type = Tasktype.RANDOM 302 | return robot_task 303 | print("TOOK too many trials to find task of type: ", task_type, " with boundaries mu: ", mu, " sigma: ", sigma) 304 | robot_task.task_type = Tasktype.RANDOM 305 | return robot_task 306 | 307 | 308 | def create_adaptive_boundaries(self, translation_offset=np.array([0,0,0])): 309 | tasks_parameters = [] 310 | if(self.config['use_AF_database']): 311 | tasks_parameters = self.get_task_parameters_for_AF_database() 312 | 313 | else: 314 | for i in range(self.config['nav_acl_AF_boundaries_task_samples']): 315 | tasks_parameters.append(self.get_task_params_array(self.sample_random_task(translation_offset=translation_offset),self.config['normalize_tasks'])) 316 | 317 | params = torch.FloatTensor(tasks_parameters).to(self.device) 318 | predictions = self.NavACLNet(params).cpu().detach().numpy() 319 | mu, sigma = scipy.stats.distributions.norm.fit(predictions) 320 | return mu, sigma 321 | 322 | 323 | def get_task_parameters_for_AF_database(self): 324 | tasks_parameters = [] 325 | for i in range(self.config['nav_acl_AF_boundaries_task_samples']): 326 | task = self.sample_task_from_database(self.AF_database) 327 | tasks_parameters.append(self.get_task_params_array(task,self.config['normalize_tasks'])) # and append it to the task array which will be used for creating the adaptive boundaries 328 | return tasks_parameters 329 | 330 | def sample_task_from_database(self,database): 331 | task_sample = random.sample(database, 1)[0] 332 | obs_cam, obs_lidar,_,_,task_array = task_sample 333 | prev_action = np.zeros(8) 334 | prev_reward = np.ones(4)*self.config['step_penalty'] 335 | q = self.observation_to_q_value(obs_cam,obs_lidar,prev_action,prev_reward) # pass observation through the q network to obtain currently estimated q value 336 | task = RobotTask(self.default_task) # create a task instance 337 | task.from_task_array(task_array) # initialize according to the task that was sampled from the database 338 | task.q_value = q # replace the q 339 | return task 340 | 341 | def compute_dolly_robot_distance(self, task): 342 | return distance.euclidean(task.robot_translation, task.dolly_translation) 343 | 344 | def compute_obstacle_distance_dolly(self, task): 345 | distances = np.array([distance.euclidean(task.obstacle_2_translation, task.dolly_translation), 346 | distance.euclidean(task.obstacle_3_translation, task.dolly_translation)]) 347 | return np.min(distances) 348 | 349 | def compute_obstacle_distance_robot(self, task): 350 | distances = 
np.array([distance.euclidean(task.obstacle_translation, task.robot_translation), 351 | distance.euclidean(task.obstacle_1_translation, task.robot_translation)]) 352 | return np.min(distances) 353 | 354 | def compute_path_complexity(self, task): 355 | return 0.1 356 | 357 | def check_if_task_is_valid(self,task): 358 | # first check is no object is too close to the dolly: 359 | if(self.compute_dolly_min_distance(task) < self.config['randomization_params']['min_dist_dolly_obs']): 360 | return False 361 | elif(self.compute_robot_min_distance(task) < self.config['randomization_params']['min_dist_robot_obs']): 362 | return False 363 | else: 364 | return True 365 | 366 | def compute_robot_min_distance(self,task): 367 | return self.compute_min_distance(task, task.robot_translation) 368 | def compute_dolly_min_distance(self,task): 369 | return self.compute_min_distance(task, task.dolly_translation) 370 | 371 | def compute_min_distance(self,task, target_translation): 372 | distances = np.zeros((4)) 373 | obstacles = task.get_obstacle_translations_array() 374 | for i in range(4): 375 | distances[i] = distance.euclidean(obstacles[i], target_translation) 376 | min_dist = np.min(distances) 377 | 378 | if min_dist > 50: 379 | return 0 380 | else : 381 | return np.min(distances) 382 | 383 | def compute_relative_rotation_robot_dolly(self, task, as_degrees=False): 384 | Robot_orientation = rot_vec(self.initial_orientation,task.robot_rotation[0]) 385 | Dolly_orientation = rot_vec(self.initial_orientation,task.dolly_rotation[0]) 386 | relative_rotation_rad = rel_rot(Robot_orientation,Dolly_orientation) 387 | if(as_degrees): 388 | return relative_rotation_rad * (180/np.pi) 389 | else: 390 | return relative_rotation_rad 391 | 392 | 393 | 394 | def train(self,robot_task,label,batch_mode = False): 395 | self.NavACL_optimizer.zero_grad() 396 | 397 | if(self.config['nav_acl_batch_mode']): 398 | params = [] 399 | labels = [] 400 | for t in range(self.config['nav_acl_batch_size']): 401 | if(type(robot_task[t])== RobotTask): 402 | params.append(self.get_task_params_array(robot_task[t],self.config['normalize_tasks'])) 403 | else: 404 | params.append(self.normalize_task_params_array(robot_task[t])) 405 | labels.append(label[t]) 406 | 407 | task_params_array = torch.FloatTensor(params).to(self.device) 408 | label = torch.FloatTensor(labels).flatten().to(self.device) 409 | 410 | 411 | else: 412 | if(type(robot_task)== RobotTask): 413 | task_params_array = self.get_task_params_array(robot_task,self.config['normalize_tasks']) 414 | else: 415 | task_params_array = self.normalize_task_params_array(robot_task) 416 | 417 | task_params_array = torch.FloatTensor(task_params_array).flatten().to(self.device) 418 | label = torch.FloatTensor(label).flatten().to(self.device) 419 | 420 | 421 | prediction = self.NavACLNet(task_params_array) 422 | loss = self.NavACL_loss_func(prediction, label) 423 | 424 | loss.backward() 425 | self.NavACL_optimizer.step() 426 | return prediction.detach().cpu().numpy() 427 | 428 | def batch_train(self,robot_tasks, labels): 429 | self.NavACL_optimizer.zero_grad() 430 | 431 | params = [] 432 | _labels = [] 433 | for t in range(self.config['nav_acl_batch_size']): 434 | if(type(robot_tasks[t])== RobotTask): 435 | params.append(self.get_task_params_array(robot_tasks[t],self.config['normalize_tasks'])) 436 | else: 437 | params.append(robot_tasks[t]) 438 | 439 | _labels.append(labels[t]) 440 | 441 | 442 | task_params_array = torch.FloatTensor(params).to(self.device) 443 | label = 
torch.FloatTensor(_labels).flatten().to(self.device) 444 | prediction = self.NavACLNet(task_params_array) 445 | # print("robot_tasks:" , robot_tasks) 446 | # print("predictions: ", prediction) 447 | loss = self.NavACL_loss_func(prediction, label.view((prediction.shape))) # labels are [batchsize] abd predictions are [batchsize,1] thus labels have to be viewed as [8,1]! This is very important, otherwise it will only use one of the parameters for esitmation 448 | 449 | loss.backward() 450 | self.NavACL_optimizer.step() 451 | return prediction.detach().cpu().numpy() 452 | 453 | def create_dtb_stack(self): 454 | af_database = np.load(self.config['AF_database_path']+str(self.agent_index)+".npy", allow_pickle=True) 455 | af_database = np.array(af_database).reshape(af_database.shape[0],af_database.shape[1]) 456 | dtb_stack = [] 457 | for t_i in range(af_database.shape[0]): 458 | image_stack = np.array([af_database[t_i][0],af_database[t_i][0],af_database[t_i][0],af_database[t_i][0]]).reshape(12,80,80) 459 | lidar = af_database[t_i][1] 460 | action = np.zeros((8)) 461 | reward = np.ones(4)*self.config['step_penalty'] 462 | task_array = af_database[t_i][4] 463 | robot_task = RobotTask(self.config['default_task_unity']) 464 | robot_task.from_task_array(task_array) 465 | task_params_array = self.get_task_params_array(robot_task,self.config['normalize_tasks']) 466 | task_tuple = (image_stack, lidar, action, reward, task_params_array) 467 | dtb_stack.append(task_tuple) 468 | return dtb_stack 469 | 470 | def fast_create_adaptive_boundaries_mu_sig(self): 471 | c, l, a, r, tsk = map(np.stack, zip(*random.sample(self.dtb_stack, self.config['nav_acl_AF_boundaries_task_samples']))) 472 | 473 | cam_tensor = torch.FloatTensor(c).to(Qricculum_device) 474 | lidar_tensor = torch.FloatTensor(l).to(Qricculum_device) 475 | prev_action = torch.FloatTensor(a).to(Qricculum_device) 476 | prev_reward = torch.FloatTensor(r).to(Qricculum_device) # prev_reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 477 | with torch.no_grad(): 478 | predicted_q_values = self.q_net(cam_tensor,lidar_tensor, prev_action, prev_reward).detach().cpu().numpy() 479 | predicted_q_values = np.interp(predicted_q_values, (self.config['collision_penalty']-self.config['goal_reward'],self.config['goal_reward']), (-1,1)) 480 | 481 | tsk[:,4] = predicted_q_values[:,0] 482 | # print(tsk) 483 | tsk_tensor = torch.FloatTensor(tsk).to(self.device) 484 | predictions = self.NavACLNet(tsk_tensor).detach().cpu().numpy() 485 | mu, sigma = scipy.stats.distributions.norm.fit(predictions) 486 | return mu, sigma -------------------------------------------------------------------------------- /pretrain_CAE.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from torch.nn.modules.activation import LeakyReLU 4 | from utils import initialize_weights_he 5 | 6 | # The MNIST datasets are hosted on yann.lecun.com that has moved under CloudFlare protection 7 | # Run this script to enable the datasets download 8 | # Reference: https://github.com/pytorch/vision/issues/1938 9 | from six.moves import urllib 10 | opener = urllib.request.build_opener() 11 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 12 | urllib.request.install_opener(opener) 13 | from Networks import ResNetBlock 14 | import torch 15 | import numpy as np 16 | from torchvision import datasets 17 | import torchvision.transforms as transforms 18 | import cv2 19 | from zu_resnet 
import ResNetEncoder 20 | 21 | 22 | # define the NN architecture 23 | class ConvAutoencoder_NAV2(nn.Module): 24 | def __init__(self, imgChannels=1, zDim=512,featureDim=12*10*10, fix_params=False): 25 | super(ConvAutoencoder_NAV2, self).__init__() 26 | self.featureDim = featureDim 27 | ## encoder layers ## 28 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 29 | self.encode = nn.Sequential( 30 | nn.Conv2d(imgChannels, 32, 5, padding=2) , 31 | nn.BatchNorm2d(32), 32 | nn.ReLU(), 33 | ResNetBlock(32,64,3), 34 | ResNetBlock(64,128,3), 35 | ResNetBlock(128,256,3), 36 | ResNetBlock(256,128,3), # 64x5x5 = 3200 feature vector 37 | ).apply(initialize_weights_he) 38 | 39 | ## decoder layers ## 40 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 41 | self.decode = nn.Sequential( 42 | nn.ConvTranspose2d(128, 256, 2, stride=2), 43 | nn.ReLU(), 44 | nn.ConvTranspose2d(256, 128, 2, stride=2), 45 | nn.ReLU(), 46 | nn.ConvTranspose2d(128, 64, 2, stride=2), 47 | nn.ReLU(), 48 | nn.ConvTranspose2d(64, imgChannels, 2, stride=2), 49 | ).apply(initialize_weights_he) 50 | 51 | 52 | def fix_params(self): 53 | for param in self.encode.parameters(): 54 | param.requires_grad = False 55 | for param in self.decode.parameters(): 56 | param.requires_grad = False 57 | 58 | def encode_(self, x): 59 | return self.encode(x) 60 | 61 | def forward(self, x): 62 | x = self.encode(x) 63 | # print(x.shape) 64 | # x = x.reshape(64,5,5) 65 | x = self.decode(x) 66 | 67 | x = torch.sigmoid(x) 68 | return x 69 | 70 | 71 | # define the NN architecture 72 | class ConvAutoencoder_NAV3(nn.Module): 73 | def __init__(self, imgChannels=1, zDim=512,featureDim=12*10*10, fix_params=False): 74 | super(ConvAutoencoder_NAV3, self).__init__() 75 | self.featureDim = featureDim 76 | ## encoder layers ## 77 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 78 | self.encode = ResNetEncoder(12,blocks_sizes=[64,128,256,384],deepths=[2,2,2,2]) 79 | 80 | ## decoder layers ## 81 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 82 | self.decode = nn.Sequential( 83 | nn.ConvTranspose2d(384, 512, 2, stride=2), 84 | nn.ReLU(), 85 | nn.ConvTranspose2d(512, 256, 2, stride=2), 86 | nn.ReLU(), 87 | nn.ConvTranspose2d(256, 128, 2, stride=2), 88 | nn.ReLU(), 89 | nn.ConvTranspose2d(128, 64, 2, stride=2), 90 | nn.ReLU(), 91 | nn.ConvTranspose2d(64, imgChannels, 2, stride=2) 92 | ).apply(initialize_weights_he) 93 | 94 | 95 | def fix_params(self): 96 | for param in self.encode.parameters(): 97 | param.requires_grad = False 98 | for param in self.decode.parameters(): 99 | param.requires_grad = False 100 | 101 | def encode_(self, x): 102 | return self.encode(x) 103 | 104 | def forward(self, x): 105 | x = self.encode(x) 106 | # print(x.shape) 107 | # x = x.reshape(64,5,5) 108 | x = self.decode(x) 109 | 110 | x = torch.sigmoid(x) 111 | return x 112 | 113 | 114 | # define the NN architecture 115 | class ConvAutoencoder_NAV4(nn.Module): 116 | def __init__(self, imgChannels=1, zDim=512,featureDim=12*10*10, fix_params=False): 117 | super(ConvAutoencoder_NAV4, self).__init__() 118 | self.featureDim = featureDim 119 | ## encoder layers ## 120 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 121 | self.encode = nn.Sequential( 122 | ResNetBlock(imgChannels,64,3), 123 | ResNetBlock(64,128,3), 124 | ResNetBlock(128,256,3), 125 | ResNetBlock(256,128,3), # 64x5x5 = 3200 feature vector 126 | 
).apply(initialize_weights_he) 127 | 128 | ## decoder layers ## 129 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 130 | self.decode = nn.Sequential( 131 | nn.ConvTranspose2d(128, 256, 2, stride=2), 132 | nn.ReLU(), 133 | nn.ConvTranspose2d(256, 128, 2, stride=2), 134 | nn.ReLU(), 135 | nn.ConvTranspose2d(128, 64, 2, stride=2), 136 | nn.ReLU(), 137 | nn.ConvTranspose2d(64, imgChannels, 2, stride=2), 138 | ).apply(initialize_weights_he) 139 | 140 | 141 | def fix_params(self): 142 | for param in self.encode.parameters(): 143 | param.requires_grad = False 144 | for param in self.decode.parameters(): 145 | param.requires_grad = False 146 | 147 | def encode_(self, x): 148 | return self.encode(x) 149 | 150 | def forward(self, x): 151 | x = self.encode(x) 152 | # print(x.shape) 153 | # x = x.reshape(64,5,5) 154 | x = self.decode(x) 155 | 156 | x = torch.sigmoid(x) 157 | return x 158 | 159 | 160 | 161 | # define the NN architecture 162 | class ConvAutoencoder_NAV6(nn.Module): 163 | def __init__(self, imgChannels=1, zDim=1024,featureDim=64*5*5, fix_params=False): 164 | super(ConvAutoencoder_NAV6, self).__init__() 165 | self.featureDim = featureDim 166 | ## encoder layers ## 167 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 168 | self.encode = nn.Sequential( 169 | ResNetBlock(imgChannels,64,3), 170 | ResNetBlock(64,128,3), 171 | ResNetBlock(128,256,3), 172 | ResNetBlock(256,64,3), # 64x5x5 = 3200 feature vector, 173 | nn.Flatten(), 174 | nn.Linear(featureDim,zDim) 175 | ).apply(initialize_weights_he) 176 | 177 | self. FC_1 = nn.Linear(zDim,featureDim) 178 | ## decoder layers ## 179 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 180 | self.decode = nn.Sequential( 181 | nn.ConvTranspose2d(64, 128, 2, stride=2), 182 | nn.ReLU(), 183 | nn.ConvTranspose2d(128, 256, 2, stride=2), 184 | nn.ReLU(), 185 | nn.ConvTranspose2d(256, 128, 2, stride=2), 186 | nn.ReLU(), 187 | nn.ConvTranspose2d(128, 64, 2, stride=2), 188 | 189 | ).apply(initialize_weights_he) 190 | 191 | 192 | def fix_params(self): 193 | for param in self.encode.parameters(): 194 | param.requires_grad = False 195 | for param in self.decode.parameters(): 196 | param.requires_grad = False 197 | 198 | def encode_(self, x): 199 | return self.encode(x) 200 | 201 | def forward(self, x): 202 | x = self.encode(x) 203 | 204 | x = x.view(-1, self.fedim) 205 | x = self.decode(x) 206 | 207 | x = torch.sigmoid(x) 208 | return x 209 | 210 | 211 | if __name__ == '__main__': 212 | 213 | 214 | GPU = True 215 | device_idx = 0 216 | if GPU: 217 | device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") 218 | else: 219 | device = torch.device("cpu") 220 | # convert data to torch.FloatTensor 221 | transform = transforms.ToTensor() 222 | 223 | 224 | channels = 3 225 | n_s_f = 4 226 | inputshape = (80,80,channels) 227 | cv2_resz = (80,80) 228 | imshape = (channels,*cv2_resz) 229 | show_shape = (*cv2_resz,channels) 230 | 231 | 232 | model = ConvAutoencoder_NAV4(imgChannels=channels*n_s_f) 233 | # model.load_state_dict(torch.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/final/Models/1/VAE_20")) 234 | model.load_state_dict(torch.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/VAE_80803_615")) 235 | model.eval() 236 | model.to(device) 237 | 238 | train_images = [] 239 | test_images = [] 240 | 241 | moving_database = 
np.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/VAE_dtb_12_8080_final_hoffentlich.npy") 242 | # moving_database = np.load("/home/developer/VAE_dtb_12_128128_final.npy") 243 | # moving_database = np.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/3/VAE_dtb_3_8080.npy") 244 | 245 | print(moving_database.shape) 246 | print(moving_database[0]) 247 | 248 | stacked_images = [] 249 | 250 | train_data = (moving_database[0:45000]/ 2**8).astype(np.float32) 251 | test_data = (moving_database[45000:] / 2**8).astype(np.float32) 252 | 253 | print(train_data.shape) 254 | print(test_data.shape) 255 | 256 | # Create training and test dataloaders 257 | 258 | num_workers = 10 259 | # how many samples per batch to load 260 | batch_size = 32 261 | 262 | # prepare data loaders 263 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, num_workers=num_workers,shuffle=True) 264 | test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=num_workers,shuffle=True) 265 | 266 | import matplotlib.pyplot as plt 267 | 268 | 269 | infostring = "net: \n" + str(model) + " \n \n \n" 270 | 271 | print(infostring) 272 | filename = "/home/developer/Training_results/VA/"+"Infofile.txt" 273 | text_file = open(filename, "w") 274 | n = text_file.write(infostring) 275 | text_file.close() 276 | learning_rate = 0.01 277 | 278 | # specify loss function 279 | criterion = nn.MSELoss() 280 | 281 | # specify loss function 282 | # torch.optim.Adam 283 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 284 | 285 | # from torch.optim.lr_scheduler import ExponentialLR 286 | from torch.optim.lr_scheduler import MultiStepLR 287 | 288 | # scheduler1 = ExponentialLR(optimizer, gamma=0.90) 289 | scheduler2 = MultiStepLR(optimizer, milestones=[30,50,70,90], gamma=0.25) 290 | 291 | # number of epochs to train the model 292 | n_epochs = 100 293 | 294 | # for epoch in range(1, n_epochs+1): 295 | # # monitor training loss 296 | # train_loss = 0.0 297 | # test_loss = 0.0 298 | 299 | # ################## 300 | # # train the model # 301 | # ################## 302 | # for data in train_loader: 303 | # # _ stands in for labels, here 304 | # # no need to flatten images 305 | # images = data 306 | # images = images.to(device) 307 | # # clear the gradients of all optimized variables 308 | # optimizer.zero_grad() 309 | # # forward pass: compute predicted outputs by passing inputs to the model 310 | # outputs = model(images).to(device) 311 | 312 | # # output_decoder = decoder(images) 313 | # # print(output_decoder) 314 | # # print(output_decoder.shape) 315 | # # calculate the loss 316 | # loss = criterion(outputs, images) 317 | # # backward pass: compute gradient of the loss with respect to model parameters 318 | # loss.backward() 319 | # # perform a single optimization step (parameter update) 320 | # optimizer.step() 321 | # # update running training loss 322 | # train_loss += loss.item()*images.size(0) 323 | 324 | # # print avg training statistics 325 | # train_loss = train_loss/len(train_loader) 326 | # print('Epoch: {} \tTraining Loss: {:.6f}'.format( 327 | # epoch, 328 | # train_loss 329 | # )) 330 | # for test_i_data in test_loader: 331 | # # _ stands in for labels, here 332 | # # no need to flatten images 333 | # test_images = test_i_data 334 | # test_images = test_images.to(device) 335 | # # clear the gradients of all optimized variables 336 | # with torch.no_grad(): 337 | # # forward pass: compute predicted outputs by 
passing inputs to the model 338 | # outputs = model(test_images).to(device) 339 | 340 | # loss = criterion(outputs, test_images) 341 | 342 | # test_loss += loss.item()*test_images.size(0) 343 | # print('Epoch: {} \tTesting Loss: {:.6f}'.format( 344 | # epoch, 345 | # test_loss 346 | # )) 347 | # torch.save(model.state_dict(), "/home/developer/Training_results/VA/VAE_RESNET18"+str(epoch)) 348 | # # scheduler1.step() 349 | # scheduler2.step() 350 | 351 | 352 | # obtain one batch of test images 353 | dataiter = iter(test_loader) 354 | while True: 355 | show_images = dataiter.next() 356 | show_images = show_images.to(device) 357 | # get sample outputs 358 | output = model(show_images) 359 | # prep images for display 360 | show_images = show_images.detach().cpu().numpy() 361 | 362 | # output is resized into a batch of iages 363 | output = output.view(batch_size,n_s_f*channels,*cv2_resz) 364 | # use detach when it's an output that requires_grad 365 | output = output.detach().cpu().numpy() 366 | 367 | print(output.shape) 368 | print(show_images.shape) 369 | 370 | # torch.save(model.state_dict(), "/home/developer/Training_results/VAE") 371 | 372 | # plot the first ten input images and then reconstructed images 373 | fig, axes = plt.subplots(nrows=2, ncols=4, sharex=True, sharey=True, figsize=(20,20)) 374 | 375 | axes[0][0].imshow(show_images[0][0:3].reshape(show_shape)) 376 | axes[0][0].get_xaxis().set_visible(False) 377 | axes[0][0].get_yaxis().set_visible(False) 378 | axes[0][1].imshow(show_images[0][3:6].reshape(show_shape)) 379 | axes[0][1].get_xaxis().set_visible(False) 380 | axes[0][1].get_yaxis().set_visible(False) 381 | axes[0][2].imshow(show_images[0][6:9].reshape(show_shape)) 382 | axes[0][2].get_xaxis().set_visible(False) 383 | axes[0][2].get_yaxis().set_visible(False) 384 | axes[0][3].imshow(show_images[0][9:12].reshape(show_shape)) 385 | axes[0][3].get_xaxis().set_visible(False) 386 | axes[0][3].get_yaxis().set_visible(False) 387 | 388 | 389 | axes[1][0].imshow(output[0][0:3].reshape(show_shape)) 390 | axes[1][0].get_xaxis().set_visible(False) 391 | axes[1][0].get_yaxis().set_visible(False) 392 | axes[1][1].imshow(output[0][3:6].reshape(show_shape)) 393 | axes[1][1].get_xaxis().set_visible(False) 394 | axes[1][1].get_yaxis().set_visible(False) 395 | axes[1][2].imshow(output[0][6:9].reshape(show_shape)) 396 | axes[1][2].get_xaxis().set_visible(False) 397 | axes[1][2].get_yaxis().set_visible(False) 398 | axes[1][3].imshow(output[0][9:12].reshape(show_shape)) 399 | axes[1][3].get_xaxis().set_visible(False) 400 | axes[1][3].get_yaxis().set_visible(False) 401 | 402 | # input images on top row, reconstructions on bottom 403 | # for show_images, row in zip([show_images, output], axes): 404 | # for img, ax in zip(show_images, row): 405 | # ax.imshow(img[0:3].reshape(show_shape)) 406 | # ax.get_xaxis().set_visible(False) 407 | # ax.get_yaxis().set_visible(False) 408 | 409 | plt.show() -------------------------------------------------------------------------------- /process_utils.py: -------------------------------------------------------------------------------- 1 | from numpy.core.shape_base import stack 2 | from RobotTask import RobotTask 3 | import json 4 | 5 | from scipy.signal.ltisys import StateSpace 6 | from VisualTaskEnv import VisualTaskEnv 7 | from isaac import * 8 | import time 9 | import numpy as np 10 | from threading import Thread 11 | from utils import * 12 | import torch 13 | from PyController import PyController 14 | from nav_acl import Nav_ACL 15 | from collections import 
deque 16 | import copy 17 | from Networks import * 18 | 19 | 20 | def start_gather_experience(agent_index, shared_elements,config): 21 | ########## start isaac app ####################################### 22 | app = Application("apps/py_sac_nav_acl/py_sac_nav_acl.app.json") 23 | isaac_thread = Thread(target=start_isaac_application,args=(app,agent_index,config,)) 24 | isaac_thread.start() 25 | ###########environment and alrogithm setup######################## 26 | time.sleep(1) 27 | env = VisualTaskEnv(app,agent_index, "py_controller_"+str(agent_index),config) 28 | torch.manual_seed((23345*agent_index)) 29 | np.random.seed((23345*agent_index)) 30 | env.seed((23345*agent_index)) 31 | ##########run episodes'############################################ 32 | # run_test_episodes(env, agent_index,shared_elements,config,) 33 | run_episodes(env, agent_index,shared_elements,config,) 34 | app.stop() 35 | 36 | def run_test_episodes(env, agent_index,shared_elements, config): 37 | Full_Path = "/home/developer/Training_results/Visual/10/08/2021/16:05:00/"+"_CHECKPOINT_" + "4_19681" 38 | Full_navn = "/home/developer/Training_results/Visual/10/08/2021/16:05:00/"+"_nav_acl_network_" + "4_19681" 39 | 40 | No_AE_Path= "/home/developer/Training_results/Visual/10/05/2021/11:19:13/"+"_CHECKPOINT_" + "62_541207" 41 | No_AE_navn= "/home/developer/Training_results/Visual/10/05/2021/11:19:13/"+"_nav_acl_network_" + "62_541207" 42 | 43 | RND_Path = "/home/developer/Training_results/Visual/09/30/2021/09:44:53/"+"_CHECKPOINT_" + "22_162675" 44 | RND_navn = "/home/developer/Training_results/Visual/09/30/2021/09:44:53/"+"_nav_acl_network_" + "22_162675" 45 | nav_list = [Full_navn,No_AE_navn,RND_navn] 46 | check_list = [Full_Path,No_AE_Path,RND_Path] 47 | 48 | names_list = ["FULL", "No_AE", "RND"] 49 | nav_acl_network = NavACLNetwork(5,config['nav_acl_hidden_dim']).to(train_device) 50 | for model_index in range(len(names_list)): 51 | print("load model: ", check_list[model_index]) 52 | checkpoint=torch.load(check_list[model_index]) 53 | print("load navacl model: ", nav_list[model_index]) 54 | nav_check =torch.load(nav_list[model_index]) 55 | nav_acl_network.load_state_dict(nav_check) 56 | print("hat geklappt") 57 | 58 | for model_index in range(len(names_list)): 59 | checkpoint=torch.load(check_list[model_index]) 60 | nav_check =torch.load(nav_list[model_index]) 61 | nav_acl_network.load_state_dict(nav_check) 62 | soft_q_net1 = SoftQNetwork(config).to(train_device) 63 | soft_q_net1.load_state_dict(checkpoint['soft_q_net1']) 64 | 65 | model_clone = PolicyNetwork(config) 66 | model_clone.load_state_dict(checkpoint['policy_net']) 67 | model_clone.to(inference_device) 68 | 69 | if( (config['q_ricculum_learning']) or config['create_new_AF_database']): 70 | nav_acl = Nav_ACL(nav_acl_network,config,shared_q_net=soft_q_net1,env=env,task_offset=np.array([0,0,0]),agent_index=agent_index) 71 | else: 72 | nav_acl = Nav_ACL(nav_acl_network,config,agent_index=agent_index) 73 | for param in model_clone.parameters(): 74 | param.requires_grad = False 75 | test_grid = generate_test_grid() 76 | test_angles = [0,45,-45,90,-90] 77 | # print(test_grid) 78 | 79 | for angle in test_angles: 80 | for trial in range(0,4): 81 | print("agent: ", agent_index, " test case: ", angle, trial, "model: ", names_list[model_index]) 82 | results = [] 83 | for task_offset in test_grid: 84 | orig_offset = task_offset 85 | task_config = config['default_test_unity'] 86 | task_config['robot_pose']['rotation_yaw'][0] = angle 87 | task = RobotTask(task_config) # create 
a task instance from a config dict 88 | task_offset[1] += (agent_index*40) 89 | task = nav_acl.apply_offset_to_robot(task, task_offset) 90 | task = nav_acl.apply_offset_to_dolly(task, [0,(agent_index*40),0]) 91 | 92 | task.set_q_value(nav_acl.get_q_value_for_task(task,task_offset)) 93 | num_steps, task_result, pred_prob = run_episode(env, agent_index, model_clone, nav_acl,shared_elements, 1, 0, config, test_task=task) 94 | results.append([orig_offset,angle,nav_acl.get_task_params_array(task,normalize=False),agent_index,num_steps,task_result, pred_prob]) 95 | results_array = np.asarray(results) 96 | np.save("/home/developer/Testing_results/"+names_list[model_index]+"_gt_agent_"+str(agent_index)+"_trial_" + str(trial)+"_yaw_"+str(angle), results_array) 97 | 98 | 99 | 100 | def run_episodes(env, agent_index, shared_elements, config): 101 | max_episodes = config['max_episodes'] 102 | num_Agents = config['num_Agents'] 103 | num_episodes = int(max_episodes / num_Agents) 104 | steps = 0 105 | robot_grid = generate_robot_grid() 106 | task_offset = robot_grid[agent_index] 107 | model_clone = PolicyNetwork(config) 108 | 109 | model_clone.load_state_dict(shared_elements['policy_network'].state_dict()) 110 | model_clone.to(inference_device) 111 | 112 | for param in model_clone.parameters(): 113 | param.requires_grad = False 114 | 115 | if( (config['q_ricculum_learning']) or config['create_new_AF_database']): 116 | nav_acl = Nav_ACL(shared_elements['nav_acl_network'],config,shared_q_net=shared_elements['q_network'],env=env,task_offset=task_offset,agent_index=agent_index) 117 | else: 118 | nav_acl = Nav_ACL(shared_elements['nav_acl_network'],config,agent_index=agent_index) 119 | 120 | if( config['create_new_AF_database']): 121 | nav_acl.save_tasks_for_later(task_offset,config['new_AF_database_size'],"/home/developer/Training_results/Qricculum_Learning/"+str(agent_index)) 122 | print("FINISHED SAVING TASK DATABASE FOR QRICCULUM LEARNING, saved at: ","/home/developer/Training_results/Qricculum_Learning/"+str(agent_index)) 123 | nav_acl.load_AF_database("/home/developer/Training_results/Qricculum_Learning/"+str(agent_index)+".npy") 124 | 125 | 126 | for episode_index in range(num_episodes): 127 | if (episode_index % config['update_nav_nets_every_N_episodes']) == 0: 128 | # Create a new local copy of the latest Policy net before starting the episode 129 | model_clone.load_state_dict(shared_elements['policy_network'].state_dict()) 130 | for param in model_clone.parameters(): 131 | param.requires_grad = False 132 | if( config['q_ricculum_learning']): 133 | nav_acl.update_networks(shared_elements['nav_acl_network'],shared_elements['q_network'],task_offset) 134 | else: 135 | nav_acl.update_networks(shared_elements['nav_acl_network'],None) 136 | 137 | num_steps, result = run_episode(env, agent_index, model_clone, nav_acl,shared_elements, episode_index, steps, config) 138 | steps += num_steps 139 | 140 | 141 | 142 | def run_episode(env, agent_index, policy_net, nav_acl,shared_elements, num_of_episode, steps, config, test_task=None): 143 | num_stacked_frames = config['num_stacked_frames'] 144 | train_start = config['train_starts'] 145 | max_steps = config['max_steps'] 146 | num_Agents = config['num_Agents'] 147 | t_max = config['buffer_maxlen']*num_Agents 148 | robot_grid = generate_robot_grid() 149 | task_offset = robot_grid[agent_index] 150 | t = steps*num_Agents # estimate total number of steps in the replay buffer 151 | P_Random = min(nav_acl.config['adaptive_filtering_params']['p_random_max'], t/t_max) 152 | 
P_Easy = ((1-P_Random)/2) 153 | P_Frontier = P_Easy 154 | action_dim = (2) 155 | nav_acl.adaptive_filtering_task_probs = [P_Random,P_Easy,P_Frontier] 156 | 157 | if(test_task is None): 158 | task = nav_acl.generate_random_task(translation_offset=task_offset) 159 | else: 160 | print("TEST TASK") 161 | task = test_task 162 | 163 | pred_prob_worker = nav_acl.get_task_success_probability(task).detach().cpu().numpy() 164 | 165 | step_durations = shared_elements['step_durations'] 166 | episode_rewards = shared_elements['episode_rewards'] 167 | episode_lengths = shared_elements['episode_lengths'] 168 | intermediate_buffer = shared_elements['experience_queue'] 169 | intermediate_buffer_nav_acl = shared_elements['nav_acl_exp_queue'] 170 | 171 | 172 | obs_cam, obs_lidar = env.reset(task) 173 | env.step([0,0]) 174 | obs_cam, obs_lidar = env.reset(task) 175 | stacked_camera_obsrv = deque(maxlen=num_stacked_frames) 176 | stacked_actions = deque(maxlen=num_stacked_frames) 177 | stacked_rewards = deque(maxlen=num_stacked_frames) 178 | transitions = deque(maxlen=num_stacked_frames) 179 | episode_reward = 0 180 | ep_steps = 0 181 | prev_action = np.zeros(action_dim) 182 | prev_reward = config['step_penalty'] 183 | task_result = [0] 184 | episode_exp = [] 185 | # start with a fresh state 186 | for i in range(num_stacked_frames): 187 | stacked_camera_obsrv.append(obs_cam) 188 | stacked_actions.append(prev_action) 189 | stacked_rewards.append(prev_reward) 190 | if(config['use_snail_mode']): 191 | transitions.append((obs_cam, obs_lidar,prev_action,prev_reward)) 192 | 193 | 194 | 195 | start_episode = current_milli_time() 196 | 197 | 198 | for step in range(max_steps): 199 | start = current_milli_time() 200 | 201 | stacked_camera_state = np.asarray(stacked_camera_obsrv).reshape((env.observtion_shape[0]*num_stacked_frames,env.observtion_shape[1],env.observtion_shape[2])) 202 | stacked_prev_action = np.asarray(stacked_actions) # [a-4,a-3, a-2, a-1] 203 | stacked_prev_reward = np.asarray(stacked_rewards).reshape(num_stacked_frames,1) 204 | 205 | with torch.no_grad(): 206 | if steps >= train_start: 207 | action = policy_net.get_action(stacked_camera_state,obs_lidar,stacked_prev_action,stacked_prev_reward, deterministic = False, device=inference_device) 208 | else: 209 | action = policy_net.sample_action() 210 | 211 | ### do action and get observation 212 | 213 | 214 | step_start = current_milli_time() 215 | next_obs_cam, next_obs_lidar, reward, done, info = env.step(action) 216 | 217 | ### append new action and reward (has to happen before appending to the replay buffer!) 
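### note: stacked_prev_action / stacked_prev_reward were snapshotted at the top of the loop, so they still end with the
### previous action a_{t-1} and reward r_{t-1}; appending the fresh action and reward below shifts the deques so that
### stacked_action / stacked_reward end with a_t and r_t. Both versions go into the transition tuple, and
### stacked_camera_obsrv is advanced only after that tuple has been built, which is why this ordering matters.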
218 | stacked_actions.append(action) # [a-3,a-2,a-1, a] 219 | stacked_rewards.append(reward) 220 | 221 | stacked_action = np.asarray(stacked_actions) 222 | stacked_reward = np.asarray(stacked_rewards).reshape(num_stacked_frames,1) 223 | 224 | ### append to episode exp 225 | episode_exp.append((obs_cam, obs_lidar, stacked_action, stacked_prev_action, stacked_reward, stacked_prev_reward, next_obs_cam, next_obs_lidar, done)) 226 | 227 | 228 | stacked_camera_obsrv.append(next_obs_cam) # has to happen after appending to the replay buffer 229 | 230 | obs_cam, obs_lidar = next_obs_cam, next_obs_lidar 231 | episode_reward += reward 232 | 233 | end = current_milli_time() 234 | step_durations.append(end-start) 235 | prev_action = action 236 | prev_reward = reward 237 | ep_steps += 1 238 | 239 | ### some manual cleanup to prevent numpy from polluting my memory 240 | del stacked_camera_state, next_obs_cam, next_obs_lidar 241 | 242 | if done: 243 | if info['collision']: 244 | task_result = [0] 245 | else: 246 | task_result = [1] 247 | 248 | break 249 | time_for_episode = current_milli_time()-start_episode 250 | avg_time_per_step = int(time_for_episode / ep_steps) 251 | print('Agent: ', agent_index, 'Finished Episode: ', num_of_episode, ' | Pred prob.: ', round(pred_prob_worker[0],2), ' | was considered: ', task.task_type.name, '| Episode Reward: ', round(episode_reward,2), ' | Num steps: ', ep_steps, '| avg time per step: ',avg_time_per_step, ' | done: ', task_result[0]) 252 | episode_rewards.append(episode_reward) 253 | episode_lengths.append(ep_steps) 254 | 255 | nav_acl.params.append(nav_acl.get_task_params_array(task,False)) # append the non normalized version of the task since we need that information for our statistics 256 | 257 | # append results to the shared queue 258 | intermediate_buffer_nav_acl.put((task,task_result,agent_index)) 259 | intermediate_buffer.put(episode_exp) 260 | return ep_steps, task_result[0], pred_prob_worker 261 | 262 | def start_isaac_sim_connector(num_Agents=1, start_port=64000): 263 | """ creates PyCodelets for receiving and sending messages to/from the isaac simulation environment (either Unity or Omniverse) via specified tcp subscriber and publisher nodes to allow for parallel access to the simulation """ 264 | app = Application("apps/py_sac_nav_acl/py_sim_connector.app.json") 265 | PUB_PORT_NUM = start_port 266 | SUB_PORT_NUM = start_port+1000 267 | for index in range(num_Agents): 268 | #create application and create node 269 | agent_suffix = '_'+str(index) 270 | app_name = "connector" +agent_suffix 271 | 272 | connector = app.add(app_name) 273 | 274 | tcp_sub = connector.add(app.registry.isaac.alice.TcpSubscriber) 275 | tcp_pub = connector.add(app.registry.isaac.alice.TcpPublisher) 276 | connector.add(app.registry.isaac.alice.TimeSynchronizer) 277 | 278 | tcp_sub.config['port'] = SUB_PORT_NUM + index 279 | tcp_sub.config['host'] = 'localhost' 280 | tcp_pub.config['port'] = PUB_PORT_NUM + index 281 | 282 | #receive messages from sim and publish via TCP 283 | app.connect('simulation.interface/output', 'collision'+agent_suffix, app_name+'/TcpPublisher', 'collision'+agent_suffix) 284 | app.connect('simulation.interface/output', 'bodies'+agent_suffix, app_name+'/TcpPublisher', 'bodies'+agent_suffix) 285 | app.connect('simulation.interface/output', 'rangescan_front'+agent_suffix, app_name+'/TcpPublisher', 'rangescan_front'+agent_suffix) 286 | app.connect('simulation.interface/output', 'rangescan_back'+agent_suffix, app_name+'/TcpPublisher', 'rangescan_back'+agent_suffix) 
287 | app.connect('simulation.interface/output', 'color'+agent_suffix, app_name+'/TcpPublisher', 'color'+agent_suffix) 288 | 289 | #receive messages from TCP and publish them to the simulation 290 | print("messages coming from: ", SUB_PORT_NUM + index, "go to :", 'simulation.interface/input/####'+agent_suffix) 291 | #send control messages 292 | app.connect(app_name+'/TcpSubscriber', 'diff_command'+agent_suffix, 'simulation.interface/input', 'base_command'+agent_suffix) 293 | app.connect(app_name+'/TcpSubscriber', 'teleport_robot'+agent_suffix, 'simulation.interface/input', 'teleport_robot' +agent_suffix) 294 | app.connect(app_name+'/TcpSubscriber', 'teleport_dolly'+agent_suffix, 'simulation.interface/input', 'teleport_dolly' +agent_suffix) 295 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix, 'simulation.interface/input', 'teleport_obstacle' +agent_suffix) 296 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix+"_1", 'simulation.interface/input', 'teleport_obstacle' +agent_suffix+"_1") 297 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix+"_2", 'simulation.interface/input', 'teleport_obstacle' +agent_suffix+"_2") 298 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix+"_3", 'simulation.interface/input', 'teleport_obstacle' +agent_suffix+"_3") 299 | 300 | app.start() 301 | while True: 302 | time.sleep(5) 303 | app.stop() 304 | 305 | 306 | def start_isaac_application(app, agent_index, config): 307 | """ creates PyCodelets for receiving and sending messages to/from the isaac simulation environment (either Unity or Omniverse) """ 308 | Port = config['start_port'] 309 | PUB_PORT_NUM = Port + 1000 310 | SUB_PORT_NUM = Port 311 | 312 | #create application and create node 313 | agent_suffix = '_'+str(agent_index) 314 | app_name = "py_controller" +agent_suffix 315 | py_controller_node = app.add(app_name) 316 | 317 | 318 | component_node = py_controller_node.add(PyController) 319 | component_node.config['app_name'] = app_name 320 | component_node.config['agent_index'] = agent_index 321 | component_node.config['tick_period_seconds'] = config['tick_period_seconds'] 322 | component_node.config['action_duration_ticks'] = config['action_duration_ticks'] 323 | component_node.config['depth_cam_max_distance'] = config['depth_cam_max_distance'] 324 | component_node.config['lidar_max_distance'] = config['lidar_max_distance'] 325 | component_node.config['goal_threshold'] = config['goal_threshold'] 326 | component_node.config['output_resolution'] = config['output_resolution'] 327 | component_node.config['using_depth'] = config['using_depth'] 328 | component_node.config['using_unity'] = config['using_unity'] 329 | component_node.config['goal_description'] = config['goal_description'] 330 | component_node.config['omniverse_teleport_dict'] = config['omniverse_teleport_dict'] 331 | component_node.config['scale_dolly'] = config['scale_dolly'] 332 | contact_monitor_node = py_controller_node.add(app.registry.isaac.navigation.CollisionMonitor, 'CollisionMonitor') 333 | 334 | tcp_sub = py_controller_node.add(app.registry.isaac.alice.TcpSubscriber) 335 | tcp_pub = py_controller_node.add(app.registry.isaac.alice.TcpPublisher) 336 | 337 | py_controller_node.add(app.registry.isaac.alice.TimeSynchronizer) 338 | 339 | tcp_sub.config['port'] = SUB_PORT_NUM + agent_index 340 | tcp_sub.config['host'] = 'localhost' 341 | tcp_pub.config['port'] = PUB_PORT_NUM + agent_index 342 | print("start isaac app: ", agent_index, " \n SUB_PORT: ", 
SUB_PORT_NUM, "\n PUB_PORT: ", PUB_PORT_NUM, "\n") 343 | #receive messages 344 | app.connect(app_name+'/TcpSubscriber', 'collision'+agent_suffix, app_name+'/CollisionMonitor', 'collision') 345 | app.connect(app_name+'/CollisionMonitor', 'report', app_name+'/PyCodelet', 'collision'+agent_suffix) 346 | app.connect(app_name+'/TcpSubscriber', 'collision'+agent_suffix, app_name+'/PyCodelet', 'collision'+agent_suffix) 347 | app.connect(app_name+'/TcpSubscriber', 'bodies'+agent_suffix, app_name+'/PyCodelet', 'bodies'+agent_suffix) 348 | app.connect(app_name+'/TcpSubscriber', 'rangescan_front'+agent_suffix, app_name+'/PyCodelet', 'lidar_front'+agent_suffix) 349 | app.connect(app_name+'/TcpSubscriber', 'rangescan_back'+agent_suffix, app_name+'/PyCodelet', 'lidar_back'+agent_suffix) 350 | app.connect(app_name+'/TcpSubscriber', 'color'+agent_suffix, app_name+'/PyCodelet', 'color'+agent_suffix) 351 | 352 | 353 | print("STARTED CONTROLLER WITH NAME: ", app_name, "RECEIVING TCP ON ", SUB_PORT_NUM +agent_index, "SENDING TCP ON: ", PUB_PORT_NUM+agent_index) 354 | #send control messages 355 | app.connect(app_name+'/PyCodelet', 'diff_command'+agent_suffix, app_name+'/TcpPublisher', 'diff_command'+agent_suffix) 356 | app.connect(app_name+'/PyCodelet', 'teleport_robot'+agent_suffix, app_name+'/TcpPublisher', 'teleport_robot'+agent_suffix) 357 | app.connect(app_name+'/PyCodelet', 'teleport_dolly'+agent_suffix, app_name+'/TcpPublisher', 'teleport_dolly'+agent_suffix) 358 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix, app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix) 359 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix + "_1", app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix+ "_1") 360 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix + "_2", app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix+ "_2") 361 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix + "_3", app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix+ "_3") 362 | app.start() 363 | -------------------------------------------------------------------------------- /sac_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Soft Actor-Critic version 2 3 | using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net 4 | add alpha loss compared with version 1 5 | paper: https://arxiv.org/pdf/1812.05905.pdf 6 | ''' 7 | import numpy as np 8 | import time 9 | from utils import * 10 | import torch.multiprocessing as mp 11 | from process_utils import * 12 | import copy 13 | from threading import Thread 14 | import scipy 15 | 16 | 17 | 18 | def mini_batch_train(agent,save_location, config): 19 | # training loop 20 | 21 | ## These shared lists and managers are for the multi-processing 22 | manager = mp.Manager() 23 | episode_rewards = manager.list() 24 | episode_lengths = manager.list() 25 | step_durations = manager.list() 26 | nav_acl_exp_list = manager.list() 27 | experience_queue = manager.Queue() 28 | nav_acl_exp_queue = manager.Queue() 29 | num_Agents = config['num_Agents'] 30 | nav_acl_network = NavACLNetwork(5,config['nav_acl_hidden_dim']).to(train_device) 31 | 32 | if config['load_checkpoint']: 33 | checkpoint = torch.load(config['checkpoint_path']+"_nav_acl_network_"+str(config['checkpoint_index'])) 34 | nav_acl_network.load_state_dict(checkpoint) 35 | 36 | # observation is only the current front camera image, but we want to concatenate the four most recent frames to form the state 
therefore we keep track of the 37 | # three previous frames in a deque with a maximum length 38 | 39 | agent.policy_net.share_memory() 40 | agent.target_soft_q_net1.share_memory() 41 | num_Agents = config['num_Agents'] 42 | nav_acl_mu_sig = manager.Array('d', range(2)) 43 | nav_acl_network.share_memory() 44 | 45 | shared_elements = {"episode_rewards" : episode_rewards, 46 | "episode_lengths" : episode_lengths, 47 | "step_durations" : step_durations, 48 | "policy_network" : agent.policy_net, 49 | "q_network" : agent.target_soft_q_net1, 50 | "nav_acl_network" : nav_acl_network, 51 | "experience_queue" : experience_queue, 52 | "nav_acl_exp_queue" : nav_acl_exp_queue, 53 | "nav_acl_exp_list" : nav_acl_exp_list, 54 | "nav_acl_mu_sig" : nav_acl_mu_sig} 55 | 56 | 57 | update_shared_policy_net(agent, shared_elements) 58 | 59 | 60 | dtb_stack = [] 61 | # TODO: checkpoint loading for nav acl network 62 | 63 | # start workers: 64 | print("START ", num_Agents, " WORKER TRHEADS ") 65 | processes = [] 66 | nav_acl_local_exp_list = [] # this list is used to store the latest n=nav_acl_batch_size task, task_result pairs to train the nav_acl network with 67 | 68 | # ## for each agent run a fraction of the number of episodes in parallel 69 | for agent_index in range(num_Agents): 70 | 71 | p1 = mp.Process(target=start_gather_experience,args=(agent_index,shared_elements,config,)) 72 | p1.start() 73 | 74 | processes.append(p1) 75 | time.sleep(0.2) 76 | 77 | # start an save cycle for saving the statistics hourly 78 | t2 = Thread(target=save_cycle,args=(agent,nav_acl_local_exp_list,shared_elements,save_location,)) 79 | t2.start() 80 | 81 | # start the update cycle that will gather the information from the worker cycles to update both the nav-acl network and the policy and q-value networks 82 | update_cycle(agent,nav_acl_local_exp_list,shared_elements,config,dtb_stack) 83 | 84 | 85 | def save_cycle(agent,nav_acl_local_exp_list,shared_elements,save_location): 86 | """ 87 | saves training statistics and networks every hour 88 | """ 89 | h_of_train = 0 90 | while(True): 91 | time.sleep(180) 92 | save_training_statiscics_to_location(shared_elements['episode_lengths'], 93 | shared_elements['step_durations'], 94 | shared_elements['episode_rewards'], 95 | nav_acl_local_exp_list, 96 | agent, 97 | shared_elements['nav_acl_network'], 98 | save_location, 99 | h_of_train) 100 | time.sleep(3420) 101 | h_of_train += 1 102 | 103 | 104 | #TODO: make this easier to read 105 | def update_cycle(agent,nav_acl_local_exp_list,shared_elements,config, dtb_stack): 106 | """ 107 | this cycle is an infinite loop that checks if new exp for training the q and policy networks can be found in the shared_elements['experience_queue'] 108 | if so it will train the q nets and the policy nets (policy net and target_q_net are shared accross all processes) on this new exp. 
109 | (remember that the workers replace their copy of the qnet and the policy net on every episode) 110 | Furthermore, this cycle looks out for new nav_acl exp inside the shared_elements['nav_acl_exp_queue'], 111 | if so, the shared nav_acl network will be trained - either batch-wise or for each incoming exp 112 | """ 113 | train_start = config['train_starts'] 114 | batch_size = config['batch_size'] 115 | use_reward_scaling = config['use_reward_scaling'] 116 | gamma = config['gamma'] 117 | num_Agents = config['num_Agents'] 118 | soft_tau = config['tau'] 119 | updates = 0 120 | steps = 0 121 | nav_acl = Nav_ACL(shared_elements['nav_acl_network'], config,worker_instance=False) 122 | nav_acl_batch_tasks = [] 123 | nav_acl_batch_results = [] 124 | saved = False 125 | while(True): 126 | start = current_milli_time() 127 | idle = True 128 | 129 | ### This part gathers the experience (for the SAC part and the nav acl part) 130 | for q in range(num_Agents): 131 | if (not shared_elements['experience_queue'].empty()): 132 | episode = shared_elements['experience_queue'].get() 133 | for exp in episode: # take exp out of the shared queue and put it into the replay buffer 134 | obs_cam, obs_lidar, action, prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done = exp 135 | 136 | # put it into the replay buffer 137 | agent.replay_buffer.add(obs_cam, obs_lidar, action, prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done) 138 | steps += 1 139 | agent.steps += 1 140 | 141 | idle=False 142 | if (not shared_elements['nav_acl_exp_queue'].empty()): # take nav acl exp out of the shared queue and use it to train the shared nav-acl network 143 | task, result, agent_index = shared_elements['nav_acl_exp_queue'].get() 144 | task_params_array = nav_acl.get_task_params_array(task,normalize=False) # not normalizing this because it is
only used for plotting 145 | prediction = nav_acl.get_task_success_probability(task).detach().cpu() 146 | 147 | if(config['q_ricculum_learning']): 148 | mu, sig = 0.5, 0.5 # q_ricculum learning needs a q-value estimate to create adaptive boundaries, so with q_ricculum learning enabled we cannot compute this statistic here and substitute 0.5, 0.5 149 | else: 150 | mu, sig = nav_acl.create_adaptive_boundaries(100) 151 | 152 | nav_acl_local_exp_list.append(((task_params_array,result, prediction, mu, sig, agent_index,task.task_type))) 153 | 154 | if(nav_acl.config['nav_acl_batch_mode']): 155 | nav_acl_batch_tasks.append(nav_acl.get_task_params_array(task,normalize=nav_acl.config['normalize_tasks'])) 156 | nav_acl_batch_results.append(result) 157 | if(len(nav_acl_batch_tasks) == nav_acl.config['nav_acl_batch_size']): 158 | nav_acl_batch_tasks_array = np.asarray(nav_acl_batch_tasks) 159 | nav_acl_batch_results_array = np.asarray(nav_acl_batch_results) 160 | nav_acl.batch_train(nav_acl_batch_tasks_array, nav_acl_batch_results_array) 161 | nav_acl_batch_tasks.clear() 162 | nav_acl_batch_results.clear() 163 | 164 | else: 165 | task_params_array = nav_acl.get_task_params_array(task,normalize=nav_acl.config['normalize_tasks']) 166 | nav_acl.train(task, result) 167 | # # update_start = current_milli_time() 168 | 169 | ## after some experience has been collected and put into the replay buffer, perform 4 consecutive training steps 170 | for x in range(4): 171 | if((steps > config['train_starts']) and (steps > config['batch_size']) and (steps > config['fill_buffer_with_transitions'])): 172 | if(updates < int(steps/4)): 173 | agent.update(batch_size, reward_scale=2., auto_entropy=True, target_entropy=-1.,use_reward_scaling=use_reward_scaling,gamma=gamma,soft_tau=soft_tau) 174 | updates+=1 175 | idle=False 176 | # print("update process cycle time: ", current_milli_time()-start, "of which update time: ", current_milli_time()-update_start) 177 | if(updates % 500 <= 10): 178 | print("steps: ", steps, " updates so far: ", updates) 179 | 180 | 181 | if(idle): 182 | time.sleep(0.1) 183 | 184 | 185 | 186 | 187 | ## These three hard-copy helpers below are not really needed, since the networks in the shared_elements are the ones that are updated in the update cycle! (A soft-update counterpart is sketched next.)
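## For comparison with the hard copies below: the SAC target critics themselves are moved towards the online critics
## by polyak averaging inside agent.update(), as the soft_tau argument above suggests. A minimal sketch of such a soft
## update follows; the helper name soft_update_sketch is illustrative and not part of this repository.
def soft_update_sketch(target_net, source_net, soft_tau):
    """Parameter-wise blend: target <- (1 - soft_tau) * target + soft_tau * source."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - soft_tau) + param.data * soft_tau)
## Example usage after a gradient step: soft_update_sketch(agent.target_soft_q_net1, agent.soft_q_net1, config['tau'])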
188 | def update_shared_q_net(agent, shared_elements): 189 | for target_param, param in zip(shared_elements['q_network'].parameters(), agent.soft_q_net1.parameters()): 190 | target_param.data.copy_(param.data) 191 | 192 | def update_shared_policy_net(agent,shared_elements): 193 | for target_param, param in zip(shared_elements['policy_network'].parameters(), agent.policy_net.parameters()): 194 | target_param.data.copy_(param.data) 195 | 196 | def update_shared_nav_net(nav_acl, shared_elements): 197 | for target_param, param in zip(shared_elements['nav_acl_network'].parameters(), nav_acl.NavACLNet.parameters()): 198 | target_param.data.copy_(param.data) 199 | 200 | def save_training_statiscics_to_location(episode_lengths,step_durations,episode_rewards,nav_acl_exp_list, agent, nav_acl_network, save_location,h_of_train): 201 | with mp.Lock(): 202 | nav_acl_params = [] 203 | nav_acl_dones = [] 204 | nav_acl_predictions = [] 205 | nav_acl_mu = [] 206 | nav_acl_sig = [] 207 | nav_acl_agent_index = [] 208 | nav_acl_task_type = [] 209 | copy_episode_lengths = [] 210 | copy_step_durations = [] 211 | copy_episode_rewards = [] 212 | 213 | for (task_params,result, prediction, mu, sig, agent_index, task_type) in nav_acl_exp_list: 214 | nav_acl_params.append(task_params) 215 | nav_acl_dones.append(result) 216 | nav_acl_predictions.append(prediction) 217 | nav_acl_mu.append(mu) 218 | nav_acl_sig.append(sig) 219 | nav_acl_agent_index.append(agent_index) 220 | nav_acl_task_type.append(task_type.value) 221 | 222 | for step in step_durations: 223 | copy_step_durations.append(step) 224 | 225 | for episode_reward in episode_rewards: 226 | copy_episode_rewards.append(episode_reward) 227 | 228 | for episode_length in episode_lengths: 229 | copy_episode_lengths.append(episode_length) 230 | 231 | np.save(save_location+"episode_lengts", copy_episode_lengths) 232 | np.save(save_location+"episode_rewards", copy_episode_rewards) 233 | np.save(save_location+"step_durations", copy_step_durations) 234 | np.save(save_location+"q1_losses", agent.q1_losses) 235 | np.save(save_location+"q2_losses", agent.q2_losses) 236 | np.save(save_location+"policy_losses", agent.policy_losses) 237 | np.save(save_location+"alpha_losses", agent.alpha_losses) 238 | np.save(save_location+"alpha", agent.alpha_list) 239 | 240 | 241 | np.save(save_location+"nav_acl_params", nav_acl_params) 242 | np.save(save_location+"nav_acl_dones", nav_acl_dones) 243 | np.save(save_location+"nav_acl_predictions",nav_acl_predictions) 244 | np.save(save_location+"nav_acl_mu", nav_acl_mu) 245 | np.save(save_location+"nav_acl_sig", nav_acl_sig) 246 | np.save(save_location+"nav_acl_agent_index",nav_acl_agent_index) 247 | np.save(save_location+"nav_acl_task_type", nav_acl_task_type) 248 | torch.save(nav_acl_network.state_dict(), save_location+'_nav_acl_network_'+str(h_of_train)+"_"+str(len(episode_rewards))) 249 | agent.save_checkpoint(save_location,str(h_of_train)+"_"+str(len(episode_rewards))) 250 | agent.save_model(save_location) 251 | print("SAVED SUCCESSFULLY!") 252 | 253 | def test_batch(env,agent,max_steps,num_tests): 254 | for eps in range(num_tests): 255 | state = env.reset() 256 | episode_reward = 0 257 | prev_action = agent.policy_net.sample_action() 258 | 259 | for step in range(max_steps): 260 | action = agent.policy_net.get_action(state,prev_action, deterministic = False) 261 | next_state, reward, done, _ = env.step(action) 262 | episode_reward += reward 263 | state=next_state 264 | prev_action = action 265 | 266 | if done: 267 | break 268 | 269 | 
print('Episode: ', eps, '| Episode Reward: ', episode_reward) --------------------------------------------------------------------------------
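Taken together, run_episode() in process_utils.py schedules task types with probabilities [P_Random, P_Easy, P_Frontier], while fast_create_adaptive_boundaries_mu_sig() in nav_acl.py fits a Gaussian (mu, sigma) to the Nav-ACL network's success predictions over sampled tasks. The sketch below illustrates how these pieces can combine into adaptive-filtering task selection; the helper name pick_task_adaptive_filtering, the beta threshold, and the exact easy/frontier boundary rules are assumptions for illustration, not code taken from this repository.

import numpy as np

def pick_task_adaptive_filtering(nav_acl, task_offset, task_type_probs, beta=0.5, max_tries=100):
    """Illustrative sketch: draw a task type according to [P_Random, P_Easy, P_Frontier], then
    rejection-sample random tasks until one matches that type. The boundaries used here
    (easy: p > mu + beta*sigma, frontier: |p - mu| <= beta*sigma) are assumed, not from the repo."""
    task_type = np.random.choice(["random", "easy", "frontier"], p=task_type_probs)
    mu, sigma = nav_acl.fast_create_adaptive_boundaries_mu_sig()
    task = nav_acl.generate_random_task(translation_offset=task_offset)
    for _ in range(max_tries):
        if task_type == "random":
            return task, task_type
        p = nav_acl.get_task_success_probability(task).detach().cpu().numpy().flatten()[0]
        if task_type == "easy" and p > mu + beta * sigma:
            return task, task_type
        if task_type == "frontier" and abs(p - mu) <= beta * sigma:
            return task, task_type
        task = nav_acl.generate_random_task(translation_offset=task_offset)
    return task, "random"  # no qualifying task found within max_tries; fall back to the last random sample

In the training code itself, run_episode() sets nav_acl.adaptive_filtering_task_probs before calling nav_acl.generate_random_task(), so the equivalent filtering presumably happens inside the Nav_ACL class.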