├── LICENSE ├── NAV_ACL_hpyerparameters.json ├── Networks.py ├── README.md ├── RobotTask.py ├── SAC_Trainers.py ├── SAC_hyperparameters.json ├── buffers.py ├── nav_acl.py ├── pretrain_CAE.py ├── process_utils.py └── sac_v2.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ai-lab-science 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAV_ACL_hpyerparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "default_task_unity" : { 3 | "robot_pose" : { 4 | "translation" :[-2.8,0.0,0.01], 5 | "rotation_yaw" : [0] 6 | }, 7 | "dolly_pose" : { 8 | "translation" : [2.5,0.15,0.01], 9 | "rotation_yaw" : [0] 10 | }, 11 | "obstacle_pose": { 12 | "translation" : [0.5,1.50,0.1], 13 | "rotation_yaw" : [0] 14 | }, 15 | "obstacle_1_pose": { 16 | "translation" : [0.5,-1.7,0.1], 17 | "rotation_yaw" : [0] 18 | }, 19 | "obstacle_2_pose": { 20 | "translation" : [2,-1.7,0.1], 21 | "rotation_yaw" : [0] 22 | }, 23 | "obstacle_3_pose": { 24 | "translation" : [2.5,2,0.1], 25 | "rotation_yaw" : [0] 26 | } 27 | }, 28 | "default_test_unity" : { 29 | "robot_pose" : { 30 | "translation" :[0.0,2.5,0.01], 31 | "rotation_yaw" : [0] 32 | }, 33 | "dolly_pose" : { 34 | "translation" : [2.0,0,0.01], 35 | "rotation_yaw" : [0] 36 | }, 37 | "obstacle_pose": { 38 | "translation" : [10,0.5,0.01], 39 | "rotation_yaw" : [0] 40 | }, 41 | "obstacle_1_pose": { 42 | "translation" : [10,4.1,0.01], 43 | "rotation_yaw" : [0] 44 | }, 45 | "obstacle_2_pose": { 46 | "translation" : [10,-1.7,0.1], 47 | "rotation_yaw" : [0] 48 | }, 49 | "obstacle_3_pose": { 50 | "translation" : [10,2,0.1], 51 | "rotation_yaw" : [0] 52 | } 53 | }, 54 | "crazy_test_unity" : { 55 | "robot_pose" : { 56 | "translation" :[2.0,109.0,0.01], 57 | "rotation_yaw" : [180] 58 | }, 59 | "dolly_pose" : { 60 | "translation" : [-7,111.0,0.01], 61 | "rotation_yaw" : [10] 62 | }, 63 | "obstacle_pose": { 64 | "translation" : [4.0,111.0,0.1], 65 | "rotation_yaw" : [0] 66 | }, 67 | "obstacle_1_pose": { 68 | "translation" : [8.0,111.0,0.1], 69 | "rotation_yaw" : [0] 70 | }, 71 | "obstacle_2_pose": { 72 | "translation" : [9.0,111.0,0.1], 73 | "rotation_yaw" : [0] 74 | }, 75 | "obstacle_3_pose": { 76 | "translation" : [2.0,111.5,0.1], 77 | "rotation_yaw" : [0] 78 | } 79 | }, 80 | "default_task_omniverse" : { 81 | "robot_pose" : { 82 
| "translation" :[1.1,0.0,0.01], 83 | "rotation_yaw" : [0] 84 | }, 85 | "dolly_pose" : { 86 | "translation" : [2.5,0.15,0.01], 87 | "rotation_yaw" : [0] 88 | }, 89 | "obstacle_pose": { 90 | "translation" : [0.5,1.50,0.1], 91 | "rotation_yaw" : [0] 92 | }, 93 | "obstacle_1_pose": { 94 | "translation" : [0.5,-1.7,0.1], 95 | "rotation_yaw" : [0] 96 | }, 97 | "obstacle_2_pose": { 98 | "translation" : [2,-1.7,0.1], 99 | "rotation_yaw" : [0] 100 | }, 101 | "obstacle_3_pose": { 102 | "translation" : [2.5,2,0.1], 103 | "rotation_yaw" : [0] 104 | } 105 | }, 106 | 107 | "randomization_params" : { 108 | "robot_randomization": {"translation_rnd_xyz":[1,1,0],"rotation_rnd":[180]}, 109 | "dolly_randomization": {"translation_rnd_xyz":[0.7,0.4,0],"rotation_rnd":[30]}, 110 | "num_obstacles" : 4, 111 | "dolly_pos_spline_angle": 30, 112 | "obstacle_pos_spline_mean" : 0.5, 113 | "min_dist_dolly_robot": 1.5, 114 | "max_dist_dolly_robot": 5.0, 115 | "min_dist_dolly_obs": 2.0, 116 | "max_dist_dolly_obs": 5.1, 117 | "min_dist_robot_obs": 2.0, 118 | "max_dist_robot_obs": 4.1, 119 | "del_obstacle_translation": [1000,1000,1000], 120 | "obstacle_randomization": {"translation_rnd_xyz":[0.8,0.8,0],"rotation_rnd":[12]} 121 | }, 122 | "runtimetasks_from_database": true , 123 | "update_nav_nets_every_N_episodes": 10, 124 | "nav_acl_max_AF_task_samples" : 100, 125 | "nav_acl_AF_boundaries_task_samples" : 100, 126 | "nav_acl_hidden_dim": 16, 127 | "nav_acl_batch_mode": true, 128 | "nav_acl_batch_size": 16, 129 | "nav_acl_lr": 0.004, 130 | "only_random_tasks": false, 131 | "normalize_tasks": true, 132 | "q_ricculum_learning" : true, 133 | "task_generation_method" : "AF", 134 | "create_new_AF_database" : false, 135 | "new_AF_database_size" : 6000, 136 | "use_AF_database" : false, 137 | "AF_database_path" : "/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/", 138 | "GOID_limits": {"lower_limit" : 0.4, 139 | "upper_limit" : 0.6}, 140 | 141 | "adaptive_filtering_params" : {"nav_beta": 1.0, 142 | "nav_gamma_low": 1.0, 143 | "nav_gamma_hi": 0.1, 144 | "p_random_max" : 0.15, 145 | "nav_P_omega": 0.97} 146 | } 147 | -------------------------------------------------------------------------------- /Networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.distributions import Normal 5 | import numpy as np 6 | import torch 7 | from initialize import * 8 | from ResnetNetworks import * 9 | from tcn_hx import * 10 | from torch.distributions import Categorical 11 | from tcn_ import * 12 | import copy 13 | 14 | from pretrain_CAE import ConvAutoencoder_NAV4 15 | 16 | 17 | class NavACLNetwork(nn.Module): 18 | def __init__(self, task_param_dim, hidden_dim, init_w=5e-4): 19 | super(NavACLNetwork, self).__init__() 20 | self.layer_1 = nn.Linear(task_param_dim, hidden_dim) 21 | self.layer_2 = nn.Linear(hidden_dim, hidden_dim) 22 | self.layer_3 = nn.Linear(hidden_dim, hidden_dim) 23 | self.layer_out = nn.Linear(hidden_dim, 1) 24 | 25 | nn.init.kaiming_uniform_(self.layer_1.weight, mode='fan_in', nonlinearity='relu') 26 | nn.init.kaiming_uniform_(self.layer_2.weight, mode='fan_in', nonlinearity='relu') 27 | nn.init.kaiming_uniform_(self.layer_3.weight, mode='fan_in', nonlinearity='relu') 28 | 29 | self.m1 = torch.nn.LeakyReLU(0.1) #torch.nn.PReLU(num_parameters=1, init=0.25) 30 | self.m2 = torch.nn.LeakyReLU(0.1) #torch.nn.PReLU(num_parameters=1, init=0.25) 31 | self.m3 = 
torch.nn.LeakyReLU(0.1) #torch.nn.PReLU(num_parameters=1, init=0.25) 32 | 33 | def forward(self, inputs): 34 | x = self.m1(self.layer_1(inputs)) 35 | x = self.m2(self.layer_2(x)) 36 | x = self.m3(self.layer_3(x)) 37 | x = torch.sigmoid(self.layer_out(x)) 38 | 39 | # x = x.clamp(0,1) 40 | 41 | return (x) 42 | 43 | def get_task_success_prob(self, task_param_array): 44 | torch_task_param_array = torch.FloatTensor(task_param_array).to(device) 45 | 46 | difficulty_estimate = self.forward(torch_task_param_array) 47 | 48 | return difficulty_estimate.detach().cpu().numpy() 49 | 50 | class ValueNetwork(nn.Module): 51 | def __init__(self, state_dim, hidden_dim, init_w=3e-3): 52 | super(ValueNetwork, self).__init__() 53 | 54 | self.linear1 = nn.Linear(state_dim, hidden_dim) 55 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 56 | self.linear3 = nn.Linear(hidden_dim, hidden_dim) 57 | self.linear4 = nn.Linear(hidden_dim, 1) 58 | # weights initialization 59 | self.linear4.weight.data.uniform_(-init_w, init_w) 60 | self.linear4.bias.data.uniform_(-init_w, init_w) 61 | 62 | def forward(self, state): 63 | x = F.relu(self.linear1(state)) 64 | x = F.relu(self.linear2(x)) 65 | x = F.relu(self.linear3(x)) 66 | x = self.linear4(x) 67 | return x 68 | 69 | class SoftQNetwork(nn.Module): 70 | def __init__(self, config,): 71 | super(SoftQNetwork, self).__init__() 72 | self.config = config 73 | resolutions = config['output_resolution'] 74 | self.num_stacked_frames = config['num_stacked_frames'] 75 | 76 | num_lidar_beams = config['num_lidar_beams'] 77 | hidden_dim = config['hidden_dim'] 78 | num_actions = config['num_actions'] 79 | self._num_actions = num_actions 80 | init_w = 3e-3 81 | 82 | shapes = (self.num_stacked_frames*resolutions[0],resolutions[1],resolutions[2],num_lidar_beams) 83 | self.inputs_shape = (*shapes,) 84 | 85 | self.kernel_size = 2 86 | self.channel_s = [256,128,64] # To think : 87 | self.channel_a = [16, 16, 16] 88 | self.channel_r = [16, 16, 16] 89 | 90 | 91 | if (config['use_pretrained_vae']): 92 | self.config['freeze_convolution'] = True 93 | ae_model = ConvAutoencoder_NAV4(imgChannels=self.num_stacked_frames*resolutions[0]) 94 | ae_model.load_state_dict(torch.load(config['pretrained_vae_path'])) 95 | self.features=copy.deepcopy(ae_model.encode) 96 | del ae_model 97 | else: 98 | self.features = nn.Sequential( 99 | ResNetBlock(self.num_stacked_frames*resolutions[0],64,3), 100 | ResNetBlock(64,128,3), 101 | ResNetBlock(128,256,3), 102 | ResNetBlock(256,128,3), 103 | ) 104 | 105 | if(self.config['freeze_convolution']): 106 | # print("using fixed convolution layers for Soft Q and Policy Network !") 107 | for param in self.features.parameters(): 108 | param.requires_grad = False 109 | 110 | self.fc = nn.Sequential( 111 | nn.Linear(self.features_size(), 512), 112 | nn.LeakyReLU(), 113 | nn.Linear(512, 256) # -> 64 114 | ) 115 | self.linear_Lidar_1 = nn.Linear(self.inputs_shape[3],hidden_dim) #256 -> 64 116 | self.linear_Lidar_2 = nn.Linear(hidden_dim,32) # 64-> 32 117 | 118 | self.linear_sar = nn.Linear(12,8) 119 | self.linear3 = nn.Linear(256 + 8 +32, hidden_dim) 120 | self.linear4 = nn.Linear(hidden_dim, 1) 121 | self.linear4.weight.data.uniform_(-init_w, init_w) 122 | self.linear4.bias.data.uniform_(-init_w, init_w) 123 | 124 | 125 | 126 | 127 | 128 | 129 | def forward(self, state_cam, state_lidar, prev_action, prev_reward): 130 | 131 | state_cam = state_cam/ 2**8 132 | l = self.linear_Lidar_1(state_lidar) 133 | l = self.linear_Lidar_2(l) 134 | # print(prev_reward.shape) 135 | # 
print(prev_action.shape) 136 | sar = torch.cat([prev_action.view(prev_action.size(0),-1), prev_reward.view(prev_reward.size(0),-1)], 1) 137 | # print(prev_action.view(prev_action.size(0),-1).shape) 138 | # print(prev_reward.view(prev_reward.size(0),-1).shape) 139 | # print(sar.shape) 140 | # print(sar.shape) 141 | sar = F.leaky_relu(self.linear_sar(sar)) 142 | 143 | x = self.features(state_cam) 144 | x = x.reshape(x.size(0), -1) 145 | x = self.fc(x) 146 | # print("a, pa, rew: ", action.shape, prev_action.shape, reward.shape) 147 | # print("shapes x, a, pa, r, l: ", x.shape, prev_action.shape, prev_action.shape, prev_reward.shape, l.shape) 148 | 149 | x = torch.cat([x, sar], 1) # the dim 0 is number of samples 150 | # x = F.relu(self.linear1(x)) 151 | # x = F.relu(self.linear2(x)) 152 | #concatenate with preprocessed lidar state 153 | x = torch.cat([x,l],1) 154 | x = F.relu(self.linear3(x)) 155 | x = self.linear4(x) 156 | return x 157 | 158 | def features_size(self): 159 | return self.features(torch.zeros(1, *self.inputs_shape[0:3])).view(1, -1).size(1) 160 | 161 | 162 | class PolicyNetwork(nn.Module): 163 | def __init__(self, config): 164 | super(PolicyNetwork, self).__init__() 165 | self.log_std_min = -20 166 | self.log_std_max = 2 167 | self.num_actions = config['num_actions'] 168 | self.action_range = config['action_range'] 169 | self.config = config 170 | 171 | self.version = 0 172 | init_w=3e-3 173 | self.num_stacked_frames = config['num_stacked_frames'] 174 | resolutions = config['output_resolution'] 175 | shapes = (self.num_stacked_frames*resolutions[0],resolutions[1],resolutions[2]) 176 | num_lidar_beams = config['num_lidar_beams'] 177 | hidden_dim = config['hidden_dim'] 178 | self.inputs_shape = (*shapes, num_lidar_beams ) 179 | 180 | self.kernel_size = 2 181 | self.channel_s = [256,128,64] # To think : 182 | self.channel_a = [16, 16 ,16] 183 | self.channel_r = [16, 16 ,16] 184 | 185 | 186 | if (config['use_resnet_archi']): 187 | self.features = nn.Sequential( 188 | ResNetBlock(self.inputs_shape[0],16,3),#40*40 189 | ResNetBlock(16,32,3), # 18*18 190 | ResNetBlock(32,64,3), 191 | Flatten(), 192 | ).apply(initialize_weights_he) #7*7 193 | if (config['use_pretrained_vae']): 194 | self.config['freeze_convolution'] = True 195 | ae_model = ConvAutoencoder_NAV4(imgChannels=self.num_stacked_frames*resolutions[0]) 196 | ae_model.load_state_dict(torch.load(config['pretrained_vae_path'])) 197 | self.features=ae_model.encode 198 | del ae_model 199 | else: 200 | # RGB*4 x 80 x 80 201 | self.features = nn.Sequential( 202 | ResNetBlock(self.num_stacked_frames*resolutions[0],64,3), 203 | ResNetBlock(64,128,3), 204 | ResNetBlock(128,256,3), 205 | ResNetBlock(256,128,3), 206 | ) 207 | 208 | if (self.config['freeze_convolution']): 209 | # print("using fixed convolution layers for Policy Network !") 210 | for param in self.features.parameters(): 211 | param.requires_grad = False 212 | 213 | self.fc = nn.Sequential( 214 | nn.Linear(self.features_size(), 512), 215 | nn.LeakyReLU(), 216 | nn.Linear(512, 256) 217 | ) 218 | self.linear_Lidar_1 = nn.Linear(self.inputs_shape[3],hidden_dim) 219 | self.linear_Lidar_2 = nn.Linear(hidden_dim,32) 220 | # L (4xSRA, 8) (8,8) 221 | # L (256+32+x, 2) 222 | # self.linear1 = nn.Linear(256 + self.num_actions +1, hidden_dim) 223 | # self.linear2 = nn.Linear(hidden_dim, hidden_dim) 224 | self.linear_sar = nn.Linear(12,8) 225 | self.linear3 = nn.Linear(256 + 8 +32, hidden_dim) 226 | # self.linear4 = nn.Linear(hidden_dim, hidden_dim) 227 | 228 | 229 | self.mean_linear = 
nn.Linear(hidden_dim, self.num_actions) 230 | self.mean_linear.weight.data.uniform_(-init_w, init_w) 231 | self.mean_linear.bias.data.uniform_(-init_w, init_w) 232 | 233 | self.log_std_linear = nn.Linear(hidden_dim, self.num_actions) 234 | self.log_std_linear.weight.data.uniform_(-init_w, init_w) 235 | self.log_std_linear.bias.data.uniform_(-init_w, init_w) 236 | 237 | 238 | 239 | def forward(self, state_cam, state_lidar,prev_action, prev_reward): 240 | state_cam = state_cam / 2**8 241 | state_lidar = state_lidar 242 | l = self.linear_Lidar_1(state_lidar) 243 | l = self.linear_Lidar_2(l) 244 | 245 | sar = torch.cat([prev_action.view(prev_action.size(0),-1), prev_reward.view(prev_reward.size(0),-1)], 1) 246 | # print(prev_action.view(prev_action.size(0),-1).shape) 247 | # print(prev_reward.view(prev_reward.size(0),-1).shape) 248 | # print(sar.shape) 249 | sar = F.leaky_relu(self.linear_sar(sar)) 250 | 251 | x = self.features(state_cam) 252 | x = x.reshape(x.size(0), -1) 253 | x = self.fc(x) 254 | 255 | # print("shapes x, a, pa, r: ", x.shape, prev_action.shape, reward.shape) 256 | x = torch.cat([x, sar], 1) # the dim 0 is number of samples 257 | # x = F.relu(self.linear1(x)) 258 | # x = F.relu(self.linear2(x)) 259 | 260 | # concatenate with preprocessed lidar state 261 | x = torch.cat([x,l],1) 262 | x = F.relu(self.linear3(x)) 263 | # x = F.relu(self.linear4(x)) 264 | 265 | mean = (self.mean_linear(x)) 266 | # mean = F.leaky_relu(self.mean_linear(x)) 267 | log_std = self.log_std_linear(x) 268 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 269 | 270 | return mean, log_std 271 | 272 | def evaluate(self, state_cam, state_lidar,prev_action, prev_reward, device, epsilon=1e-6): 273 | ''' 274 | generate sampled action with state as input wrt the policy network; 275 | ''' 276 | mean, log_std = self.forward(state_cam, state_lidar,prev_action, prev_reward) 277 | std = log_std.exp() # no clip in evaluation, clip affects gradients flow 278 | 279 | normal = Normal(0, 1) 280 | z = normal.sample(mean.shape) 281 | action_0 = torch.tanh(mean + std*z.to(device)) # TanhNormal distribution as actions; reparameterization trick 282 | action = self.action_range*action_0 283 | # The log-likelihood here is for the TanhNorm distribution instead of only Gaussian distribution. \ 284 | # The TanhNorm forces the Gaussian with infinite action range to be finite. \ 285 | # For the three terms in this log-likelihood estimation: \ 286 | # (1). the first term is the log probability of action as in common \ 287 | # stochastic Gaussian action policy (without Tanh); \ 288 | # (2). the second term is the caused by the Tanh(), \ 289 | # as shown in appendix C. Enforcing Action Bounds of https://arxiv.org/pdf/1801.01290.pdf, \ 290 | # the epsilon is for preventing the negative cases in log; \ 291 | # (3). the third term is caused by the action range I used in this code is not (-1, 1) but with \ 292 | # an arbitrary action range, which is slightly different from original paper. 293 | log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range) 294 | # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); 295 | # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, 296 | # needs sum up across the features dim to get 1 dim prob; or else use Multivariate Normal. 
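# Written out, the per-dimension term computed above is
#     log pi(a|s) = log N(u; mean, std) - log(1 - tanh(u)^2 + epsilon) - log(action_range),  with u = mean + std*z,
# and the sum over the action dimensions below reduces it to a single log-probability per sample.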
297 | log_prob = log_prob.sum(dim=1, keepdim=True) 298 | return action, log_prob, z, mean, log_std 299 | 300 | 301 | def get_action(self, state_cam, state_lidar,prev_action, prev_reward, deterministic, device): 302 | 303 | state_cam = torch.FloatTensor(state_cam).unsqueeze(0).to(device) 304 | state_lidar = torch.FloatTensor(state_lidar).unsqueeze(0).to(device) 305 | prev_action = torch.FloatTensor(prev_action).unsqueeze(0).to(device) # needs to fit state dimensions so we add 1 dim 306 | prev_reward = torch.FloatTensor(prev_reward).unsqueeze(0).to(device) # needs to fit state dimensions so we add 1 dim 307 | 308 | 309 | mean, log_std = self.forward(state_cam, state_lidar,prev_action, prev_reward) 310 | std = log_std.exp() 311 | 312 | 313 | normal = Normal(0, 1) 314 | z = normal.sample(mean.shape).to(device) 315 | action = self.action_range* torch.tanh(mean + std*z) 316 | 317 | 318 | action = self.action_range* torch.tanh(mean).detach().cpu().numpy()[0] if deterministic else action.detach().cpu().numpy()[0] 319 | # print(action) 320 | return action 321 | 322 | 323 | def sample_action(self,): 324 | a=torch.FloatTensor(self.num_actions).uniform_(-1, 1) 325 | return self.action_range*a.numpy() 326 | def features_size(self): 327 | return self.features(torch.zeros(1, *self.inputs_shape[0:3])).view(1, -1).size(1) 328 | 329 | 330 | # -*- coding: utf-8 -*- 331 | """ 332 | Created on Tue May 25 17:41:51 2021 333 | 334 | @author: Honghu Xue 335 | """ 336 | import numpy as np 337 | import torch 338 | from torch import nn 339 | import torch.nn.functional as F 340 | from utils import initialize_weights_he 341 | from utils import initialize_weights_xavier 342 | 343 | class Flatten(nn.Module): 344 | def forward(self, x): 345 | return x.view(x.size(0), -1) 346 | 347 | 348 | class ResNetBlock(nn.Module): 349 | '''https://www.reddit.com/r/MachineLearning/comments/671455/d_batch_normalization_in_reinforcement_learning/ 350 | Batch normalziation causes instability in training in DQN 351 | Weight normalziation can increase a little bit on the performance in DQN. https://proceedings.neurips.cc/paper/2016/file/ed265bc903a5a097f61d3ec064d96d2e-Paper.pdf 352 | ''' 353 | 354 | 355 | def __init__(self, in_channels, out_channels, kernel_size): 356 | super(ResNetBlock, self).__init__() 357 | # !!! Don't fully understand what is padding = same in the slide!!! 
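# Note on 'same' padding: for the 3x3 convolutions used below, stride=1 together with padding=1
# keeps the spatial resolution unchanged (which is what 'same' padding means for odd kernel sizes);
# only self.down_dimension (kernel 2, stride 2) halves the feature-map size at the end of the block.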
358 | self.residual_pass = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 359 | stride = 1, padding = 1, padding_mode='replicate'), 360 | nn.ReLU(), 361 | nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, 362 | stride = 1, padding = 1, padding_mode='replicate'), 363 | ) 364 | self.identity_mapping = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, 365 | stride = 1), # no padding here 366 | ) 367 | # The output size after passing through identity_mapping is W_2 = (signal_length - kernel_size + 2*Padding)/Stride + 1 368 | self.down_dimension = nn.Sequential(nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=2, 369 | stride = 2, padding = 0), 370 | ) 371 | self.RelU = nn.ReLU() 372 | 373 | def forward(self, x): 374 | #TODO 375 | print("orig: ", x.shape) 376 | residual = self.residual_pass(x) 377 | print("residual: ", residual.shape) 378 | x = self.identity_mapping(x) 379 | print("identity: ", x.shape) 380 | x += residual 381 | print("union: ", x.shape) 382 | x = self.RelU(x) 383 | x = self.down_dimension(x) 384 | print("downdim: ", x.shape) 385 | 386 | return x -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-Reinforcement-Learning-for-mapless-navigation-in-intralogistics 2 | This repository is for the paper "Using Deep Reinforcement Learning with Automatic Curriculum Learning for Mapless Navigation in Intralogistics". 3 | 4 | The code includes: 5 | * Soft Actor Critic Algorithm, adapted from [1] 6 | * Our NavACL_Q algorithm 7 | * Hyper-parameter Settings (see the usage sketch below) 8 | 9 | ## Illustration of our results 10 | Here we show some videos of our trained agent successfully reaching the goal state, i.e., the blue dolly. 11 | 12 | 13 | [![watch the video](https://user-images.githubusercontent.com/27278454/151380252-510bd73c-03b7-4790-be10-562155494a9c.jpg)](https://user-images.githubusercontent.com/27278454/151382348-c3d8a984-01eb-4e69-9093-114d611eddb5.mp4) 14 | 15 | [![watch the video](https://user-images.githubusercontent.com/27278454/151383580-3d2fb9d0-3b29-47f8-a472-0fba2513fe06.jpg)](https://user-images.githubusercontent.com/27278454/151382581-3993931e-7ba4-4738-88e3-7d312f4e5ff8.mp4) 16 | 17 | 18 | [![watch the video](https://user-images.githubusercontent.com/27278454/151383536-ae62d5ac-576b-4d3b-8815-7e43996d39e5.jpg)](https://user-images.githubusercontent.com/27278454/151382612-467f42c3-31cd-4366-ba11-92fc8ee24340.mp4) 19 | 20 | 21 | [![watch the video](https://user-images.githubusercontent.com/27278454/151383516-2b0dc284-e545-4063-b6c8-06b607251bde.jpg)](https://user-images.githubusercontent.com/27278454/151382630-68874454-36d6-4e61-82b7-20ec3d209ae1.mp4) 22 | 23 | 24 | Even with a moving target, the trained agent reacts responsively and navigates to the goal. 25 | ![Media3](https://user-images.githubusercontent.com/27278454/151374369-5899a3bd-338f-4450-9f7d-f4bb5071c400.gif)
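
## Minimal usage sketch

A minimal sketch (not part of the repository) of how the hyper-parameter file and `SAC_Trainer` are assumed to fit together; the `output_resolution` entry and the `./results/` path are illustrative placeholders.

```python
import json

from SAC_Trainers import SAC_Trainer

# Load the SAC hyper-parameters shipped with the repository.
with open("SAC_hyperparameters.json") as f:
    config = json.load(f)

# The trainer also expects the camera resolution used by the simulator;
# the value here (channels, height, width per frame) is an assumption for illustration.
config["output_resolution"] = [3, 80, 80]

# Builds the policy/Q networks and the (prioritized) replay buffer.
trainer = SAC_Trainer(save_location="./results/", config=config)

# Training then alternates between collecting transitions with
# trainer.policy_net.get_action(...) into trainer.replay_buffer and calling
# trainer.update(batch_size=config["batch_size"]) once enough data has been gathered.
```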
26 | 27 | 28 | 29 | ## Reference 30 | [1] https://github.com/quantumiracle/Popular-RL-Algorithms 31 | -------------------------------------------------------------------------------- /RobotTask.py: -------------------------------------------------------------------------------- 1 | from utils import rot_vec 2 | from numpy import savetxt 3 | import numpy as np 4 | from scipy.spatial import distance 5 | from scipy.spatial.transform import Rotation as R 6 | import random 7 | from enum import Enum 8 | 9 | class Tasktype(Enum): 10 | RANDOM = 0 11 | EASY = 1 12 | FRONTIER = 2 13 | 14 | class RobotTask(): 15 | def __init__(self,default_task,): 16 | super(RobotTask, self).__init__() 17 | 18 | self.robot_translation = np.array(default_task['robot_pose']['translation']).astype(float) 19 | self.robot_rotation = np.array(default_task['robot_pose']['rotation_yaw']).astype(float) 20 | 21 | self.dolly_translation = np.array(default_task['dolly_pose']['translation']).astype(float) 22 | self.dolly_rotation = np.array(default_task['dolly_pose']['rotation_yaw']).astype(float) 23 | 24 | self.obstacle_translation = np.array(default_task['obstacle_pose']['translation']).astype(float) 25 | self.obstacle_rotation = np.array(default_task['obstacle_pose']['rotation_yaw']).astype(float) 26 | 27 | self.obstacle_1_translation = np.array(default_task['obstacle_1_pose']['translation']).astype(float) 28 | self.obstacle_1_rotation = np.array(default_task['obstacle_1_pose']['rotation_yaw']).astype(float) 29 | 30 | self.obstacle_2_translation = np.array(default_task['obstacle_2_pose']['translation']).astype(float) 31 | self.obstacle_2_rotation = np.array(default_task['obstacle_2_pose']['rotation_yaw']).astype(float) 32 | 33 | self.obstacle_3_translation = np.array(default_task['obstacle_3_pose']['translation']).astype(float) 34 | self.obstacle_3_rotation = np.array(default_task['obstacle_3_pose']['rotation_yaw']).astype(float) 35 | 36 | self.q_value = 0 37 | self.task_type = Tasktype.RANDOM 38 | 39 | 40 | def randomize_task(self, randomization_params): 41 | """ 42 | randomizes the current task settings according to the randomization parameters (as a json) 43 | """ 44 | num_obstacles = random.randint(0,randomization_params['num_obstacles']) 45 | del_obstacle_translation = np.asarray(randomization_params['del_obstacle_translation']) 46 | self.robot_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['robot_randomization']['rotation_rnd']) ) 47 | self.dolly_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['dolly_randomization']['rotation_rnd']) ) 48 | self.obstacle_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 49 | self.obstacle_1_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 50 | self.obstacle_2_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 51 | self.obstacle_3_rotation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['rotation_rnd']) ) 52 | 53 | 54 | 55 | self.robot_translation += ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['robot_randomization']['translation_rnd_xyz']) ) 56 | #self.dolly_translation += ( (-0.5 +
np.random.uniform(0,1)) *np.array(randomization_params['dolly_randomization']['translation_rnd_xyz']) ) 57 | 58 | min_dist_d_r = randomization_params['min_dist_dolly_robot'] 59 | max_dist_d_r = randomization_params['max_dist_dolly_robot'] 60 | dolly_pos_spline = randomization_params['dolly_pos_spline_angle'] 61 | 62 | dolly_distance = min_dist_d_r + np.random.uniform(0,1)*(max_dist_d_r-min_dist_d_r) #create a distance vector with min, max distance according to the config file 63 | dolly_vector = np.array([1,0])*dolly_distance #create the final dolly position as the robot position+distance vector 64 | dolly_pos_rot = ( (-0.5 + np.random.uniform(0,1))) * dolly_pos_spline #rotate it around the origin 65 | dolly_vector = self.robot_translation + np.hstack((rot_vec(dolly_vector,dolly_pos_rot),np.zeros(1))) 66 | 67 | self.dolly_translation = dolly_vector 68 | 69 | 70 | 71 | min_dist_r_o = randomization_params['min_dist_robot_obs'] 72 | max_dist_r_o = randomization_params['max_dist_robot_obs'] 73 | 74 | min_dist_d_o = randomization_params['min_dist_dolly_obs'] 75 | max_dist_d_o = randomization_params['max_dist_dolly_obs'] 76 | obs_mean = randomization_params['obstacle_pos_spline_mean'] 77 | obstacle_vectors = [] 78 | 79 | for i in range(2): 80 | obstacle_distance = min_dist_r_o + np.random.uniform(0,1)*(max_dist_r_o-min_dist_r_o) #create a distance vector with min, max distance according to the config file 81 | obstacle_pos_rot = 90 + np.random.uniform(-obs_mean/2,obs_mean/2)*90+(i*180) #rotate it around the origin 82 | obstacle_vector = np.array([1,0])*obstacle_distance #create the final object position as the robot position+distance vector 83 | 84 | 85 | obstacle_vector = self.robot_translation + np.hstack((rot_vec(obstacle_vector,obstacle_pos_rot),np.zeros(1))) 86 | obstacle_vectors.append(obstacle_vector) 87 | 88 | for i in range(2): 89 | obstacle_distance = min_dist_d_o + np.random.uniform(0,1)*(max_dist_d_o-min_dist_d_o) #create a distance vector with min, max distance according to the config file 90 | obstacle_pos_rot = 90 + np.random.uniform(-obs_mean/2,obs_mean/2)*90+(i*180) #rotate it around the origin 91 | obstacle_vector = np.array([1,0])*obstacle_distance #create the final object position as the dolly position+distance vector 92 | 93 | 94 | obstacle_vector = self.dolly_translation + np.hstack((rot_vec(obstacle_vector,obstacle_pos_rot),np.zeros(1))) 95 | obstacle_vectors.append(obstacle_vector) 96 | 97 | for remove_obj_idx in range(4-1, num_obstacles-1, -1): 98 | obstacle_vectors[remove_obj_idx] = del_obstacle_translation 99 | 100 | # now randomly assign the created obstacle vectors to the target translations 101 | rnd_obstcl_indexes = np.random.permutation(np.array([0,1,2,3])) 102 | self.obstacle_translation =obstacle_vectors[rnd_obstcl_indexes[0]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 103 | self.obstacle_1_translation =obstacle_vectors[rnd_obstcl_indexes[1]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 104 | self.obstacle_2_translation =obstacle_vectors[rnd_obstcl_indexes[2]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 105 | self.obstacle_3_translation =obstacle_vectors[rnd_obstcl_indexes[3]] #+= ( (-0.5 + np.random.uniform(0,1)) *np.array(randomization_params['obstacle_randomization']['translation_rnd_xyz']) ) 106 | 107 | 108 | def set_q_value(self, 
q_value): 109 | self.q_value = q_value 110 | 111 | def get_obstacle_translations_array(self): 112 | return np.array([self.obstacle_translation, self.obstacle_1_translation, self.obstacle_2_translation, self.obstacle_3_translation]) 113 | 114 | def get_task_array(self): 115 | """ 116 | returns the task as a numpy array in the form: [[robo_trans_x,robo_trans_y,robo_trans_z,robo_yaw_angle], 117 | [dolly_trans_x,dolly_trans_y,dolly_trans_z,dolly_yaw_angle], 118 | [obstacle_trans_x,obstacletrans_y,obstacletrans_z,obobstacleyaw_angle] 119 | [obstacle_1_trans_x,obstacle_1_trans_y,obstacle_1_trans_z,obobstacle_1_yaw_angle] 120 | [obstacle_2_trans_x,obstacle_2_trans_y,obstacle_2_trans_z,obobstacle_2_yaw_angle] 121 | [obstacle_3_trans_x,obstacle_3_trans_y,obstacle_3_trans_z,obobstacle_3_yaw_angle]] 122 | """ 123 | robot_part = np.hstack((self.robot_translation, self.robot_rotation)) 124 | dolly_part = np.hstack((self.dolly_translation, self.dolly_rotation)) 125 | obstacle_part = np.hstack((self.obstacle_translation, self.obstacle_rotation)) 126 | obstacle_1_part = np.hstack((self.obstacle_1_translation, self.obstacle_1_rotation)) 127 | obstacle_2_part = np.hstack((self.obstacle_2_translation, self.obstacle_2_rotation)) 128 | obstacle_3_part = np.hstack((self.obstacle_3_translation, self.obstacle_3_rotation)) 129 | 130 | task_array = np.array([robot_part,dolly_part,obstacle_part,obstacle_1_part,obstacle_2_part,obstacle_3_part]) 131 | return task_array 132 | 133 | def from_task_array(self,task_array): 134 | self.robot_translation = task_array[0,0:3] 135 | self.robot_rotation = np.array([task_array[0][3]]) 136 | self.dolly_translation = task_array[1,0:3] 137 | self.dolly_rotation = np.array([task_array[1][3]]) 138 | self.obstacle_translation = task_array[2,0:3] 139 | self.obstacle_rotation = np.array([task_array[2][3]]) 140 | self.obstacle_1_translation = task_array[3,0:3] 141 | self.obstacle_1_rotation = np.array([task_array[3][3]]) 142 | self.obstacle_2_translation = task_array[4,0:3] 143 | self.obstacle_2_rotation = np.array([task_array[4][3]]) 144 | self.obstacle_3_translation = task_array[5,0:3] 145 | self.obstacle_3_rotation = np.array([task_array[5][3]]) 146 | self.q_value = 0 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /SAC_Trainers.py: -------------------------------------------------------------------------------- 1 | from hashlib import new 2 | from torch import ne 3 | from Networks import * 4 | from per_buffer import * 5 | from utils import * 6 | from ResnetNetworks import * 7 | 8 | 9 | 10 | class SAC_Trainer(): 11 | def __init__(self,save_location, config): 12 | self.save_config = config 13 | self.load_config(self.save_config) 14 | save_location = save_location 15 | self.q1_losses = [] 16 | self.q2_losses = [] 17 | self.policy_losses = [] 18 | self.alpha_losses = [] 19 | self.alpha_list = [] 20 | self.update_step = 0 21 | self.steps = 0 22 | action_range = 1. 
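# The observation is split into a frame-stacked camera image and a flat lidar scan:
# state_dim = (num_stacked_frames * C, H, W, num_lidar_beams), i.e. the camera frames are stacked
# along the channel axis and the last entry is the length of the 1-D lidar vector.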
23 | 24 | 25 | input_shape = (self.input_shape[0],self.input_shape[1],self.input_shape[2]) 26 | action_dim = 2 27 | state_dim = (self.num_stacked_frames*input_shape[0],input_shape[1],input_shape[2],self.num_lidar_beams) 28 | 29 | self.soft_q_net1 = SoftQNetwork(config).to(train_device) 30 | self.soft_q_net2 = SoftQNetwork(config).to(train_device) 31 | self.target_soft_q_net1 = SoftQNetwork(config).to(train_device) 32 | self.target_soft_q_net2 = SoftQNetwork(config).to(train_device) 33 | self.policy_net = PolicyNetwork(config).to(train_device) 34 | self.log_alpha = torch.ones(1, device=train_device) * np.log(self.alpha) 35 | self.log_alpha.requires_grad= True 36 | 37 | 38 | 39 | if(self.use_exp_prio_buffer): 40 | self.replay_buffer = PrioritizedReplayBuffer(self.buffer_maxlen,config,mem_efficient_mode=True,state_dim=state_dim,action_dim=2,num_stacked_frames=self.num_stacked_frames,alpha=config['prioritized_replay_alpha']) 41 | self.soft_q_criterion1 = torch.nn.SmoothL1Loss(reduction = 'none') 42 | self.soft_q_criterion2 = torch.nn.SmoothL1Loss(reduction = 'none') 43 | self.beta_schedule = LinearSchedule(config['buffer_maxlen'], 44 | initial_p=config['prioritized_replay_beta0'], 45 | final_p=config['prioritized_replay_beta1']) 46 | 47 | else: 48 | self.replay_buffer = ReplayBuffer(self.buffer_maxlen,config,mem_efficient_mode=True,state_dim=state_dim,action_dim=2,num_stacked_frames=self.num_stacked_frames) #(self.buffer_maxlen,self.num_stacked_frames,state_dim) # 49 | self.soft_q_criterion1 = torch.nn.SmoothL1Loss(reduction = 'none') 50 | self.soft_q_criterion2 = torch.nn.SmoothL1Loss(reduction = 'none') 51 | 52 | 53 | 54 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 55 | target_param.data.copy_(param.data) 56 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 57 | target_param.data.copy_(param.data) 58 | 59 | 60 | # for freezing feature extractor, dont put in to optimizer 61 | self.soft_q_optimizer1 = optim.Adam(filter(lambda p: p.requires_grad, self.soft_q_net1.parameters()), lr=self.q_lr) 62 | self.soft_q_optimizer2 = optim.Adam(filter(lambda p: p.requires_grad, self.soft_q_net2.parameters()), lr=self.q_lr) 63 | self.policy_optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.policy_net.parameters()), lr=self.p_lr) 64 | 65 | self.alpha_optimizer = optim.Adam([self.log_alpha], lr=self.a_lr) 66 | 67 | 68 | def all_elements_to(self, target_device): 69 | self.soft_q_net1 = self.soft_q_net1.to(target_device) 70 | self.soft_q_net2 = self.soft_q_net2.to(target_device) 71 | self.target_soft_q_net1 = self.target_soft_q_net1.to(target_device) 72 | self.target_soft_q_net2 = self.target_soft_q_net2.to(target_device) 73 | self.policy_net = self.policy_net.to(target_device) 74 | self.log_alpha = self.log_alpha.to(target_device) 75 | 76 | def load_config(self,config): 77 | self.tau = config['tau'] 78 | self.alpha = config['alpha'] 79 | self.q_lr = config['q_lr'] 80 | self.p_lr = config['p_lr'] 81 | self.a_lr = config['a_lr'] 82 | self.buffer_maxlen = config['buffer_maxlen'] 83 | self.num_stacked_frames = config['num_stacked_frames'] 84 | self.hidden_dim = config['hidden_dim'] 85 | self.freeze_convolution = config['freeze_convolution'] 86 | self.use_grad_clip = config['use_grad_clip'] 87 | self.max_grad = config['max_grad'] 88 | self.use_exp_prio_buffer = config['use_exp_prio_buffer'] 89 | self.use_hard_update = config['use_hard_update'] 90 | self.use_pretrained_resnet18 = 
config['use_pretrained_resnet18'] 91 | self.num_lidar_beams = config['num_lidar_beams'] 92 | self.input_shape = config['output_resolution'] 93 | self.hard_update_n_step = config['hard_update_n_step'] 94 | 95 | def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-1, gamma=0.99,soft_tau=1e-3, use_reward_scaling=False): 96 | start = current_milli_time() 97 | if self.use_exp_prio_buffer == False: 98 | c_obs,l_obs, action, prev_action, reward, prev_reward, c_next_osb,l_next_obs, done = self.replay_buffer.sample(batch_size) 99 | else: 100 | c_obs,l_obs, action, prev_action, reward, prev_reward, c_next_osb,l_next_obs, done, weights, idxs = self.replay_buffer.sample(batch_size,self.beta_schedule.value(self.update_step)) 101 | weights = torch.FloatTensor(weights).to(train_device) 102 | 103 | # print("sample time: ", current_milli_time()- start) 104 | 105 | self.update_step +=1 106 | cam_tensor = torch.FloatTensor(c_obs).to(train_device) 107 | lidar_tensor = torch.FloatTensor(l_obs).to(train_device) 108 | next_cam_tensor = torch.FloatTensor(c_next_osb).to(train_device) 109 | next_lidar_tensor = torch.FloatTensor(l_next_obs).to(train_device) 110 | action = torch.FloatTensor(action).to(train_device) 111 | prev_action = torch.FloatTensor(prev_action).to(train_device) 112 | reward = torch.FloatTensor(reward).to(train_device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; - no now it is a stack of 4 so we dont need unsqzeeze anymore! 113 | prev_reward = torch.FloatTensor(prev_reward).to(train_device) # prev_reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; - no now it is a stack of 4 so we dont need unsqzeeze anymore! 114 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(train_device) 115 | 116 | predicted_q_value1 = self.soft_q_net1(cam_tensor,lidar_tensor, prev_action, prev_reward) 117 | predicted_q_value2 = self.soft_q_net2(cam_tensor,lidar_tensor, prev_action, prev_reward) 118 | 119 | with torch.no_grad(): 120 | new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_cam_tensor,next_lidar_tensor,action, reward, device=train_device) 121 | new_next_stacked_action = action.view(action.size(0),-1)[:,2:] 122 | new_next_stacked_action = torch.cat((new_next_stacked_action,new_next_action),dim=1) 123 | target_q_min = torch.min(self.target_soft_q_net1(next_cam_tensor,next_lidar_tensor, new_next_stacked_action, reward),self.target_soft_q_net2(next_cam_tensor, next_lidar_tensor, new_next_stacked_action,reward)) - self.alpha * next_log_prob 124 | target_q_value = reward[:,-1] + (1 - done) * gamma * target_q_min # if done==1, only reward 125 | 126 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(cam_tensor,lidar_tensor,prev_action, prev_reward, device=train_device) 127 | new_stacked_action = prev_action.view(prev_action.size(0),-1)[:,2:] 128 | new_stacked_action = torch.cat((new_stacked_action,new_action),dim=1) 129 | new_action = new_stacked_action 130 | if(use_reward_scaling): 131 | reward = reward[:,-1] 132 | prev_reward = prev_reward[:,-1] 133 | 134 | prev_reward = reward_scale * (prev_reward - prev_reward.mean(dim=0)) / (prev_reward.std(dim=0) + 1e-6) 135 | reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem 136 | 137 | # Updating alpha wrt entropy 138 | # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) 139 | if 
auto_entropy is True: 140 | alpha_loss = -(self.log_alpha.exp() * (log_prob + target_entropy).detach()).mean() 141 | # print('alpha loss: ',alpha_loss) 142 | self.alpha_optimizer.zero_grad() 143 | alpha_loss.backward() 144 | self.alpha_optimizer.step() 145 | self.alpha = self.log_alpha.exp() 146 | self.alpha_losses.append(alpha_loss.item()) 147 | self.alpha_list.append(self.alpha.item()) 148 | 149 | else: 150 | self.alpha = self.alpha 151 | alpha_loss = 0 152 | 153 | if self.use_exp_prio_buffer == False: 154 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze() # detach: no gradients for the variable 155 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()).squeeze() 156 | q_value_loss1 = (q_value_loss1).mean() 157 | q_value_loss2 = (q_value_loss2).mean() 158 | else: 159 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze() # detach: no gradients for the variable 160 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()).squeeze() 161 | td_error_abs = (q_value_loss1 + q_value_loss2)/2 162 | q_value_loss1 = (q_value_loss1 * weights).mean() 163 | q_value_loss2 = (q_value_loss2 * weights).mean() 164 | # update priority for trained samples 165 | 166 | self.replay_buffer.update_priorities(idxs, np.nan_to_num(td_error_abs.cpu().detach()) + 1e-6) 167 | 168 | 169 | self.soft_q_optimizer1.zero_grad() 170 | q_value_loss1.backward() 171 | if(self.use_grad_clip): 172 | nn.utils.clip_grad_norm_(self.soft_q_net1.parameters(),self.max_grad) 173 | self.soft_q_optimizer1.step() 174 | self.soft_q_optimizer2.zero_grad() 175 | q_value_loss2.backward() 176 | if(self.use_grad_clip): 177 | nn.utils.clip_grad_norm_(self.soft_q_net2.parameters(),self.max_grad) 178 | self.soft_q_optimizer2.step() 179 | 180 | predicted_new_q_value = torch.min(self.soft_q_net1(cam_tensor,lidar_tensor, new_action, reward),self.soft_q_net2(cam_tensor,lidar_tensor, new_action, reward)) 181 | policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() 182 | 183 | self.policy_optimizer.zero_grad() 184 | policy_loss.backward() 185 | self.policy_optimizer.step() 186 | self.policy_net.version += 1 187 | 188 | self.q1_losses.append(q_value_loss1.item()) 189 | self.q2_losses.append(q_value_loss2.item()) 190 | self.policy_losses.append(policy_loss.item()) 191 | 192 | 193 | 194 | if(self.use_hard_update): 195 | if (self.update_step % self.hard_update_n_step == 0) : 196 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 197 | target_param.data.copy_(param.data) 198 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 199 | target_param.data.copy_(param.data) 200 | 201 | 202 | else: # Soft update the target value net 203 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 204 | target_param.data.copy_( # copy data value into target parameters 205 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 206 | ) 207 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 208 | target_param.data.copy_( # copy data value into target parameters 209 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 210 | ) 211 | 212 | del cam_tensor, lidar_tensor, next_cam_tensor, next_lidar_tensor, action, prev_action, reward, prev_reward, done 213 | return predicted_new_q_value.mean() 214 | 215 | def 
save_checkpoint(self, path, suffix): 216 | torch.save({ 217 | 'config' : self.save_config, 218 | 'update_step' : self.update_step, 219 | 'soft_q_net1' : self.soft_q_net1.state_dict(), 220 | 'soft_q_net2' : self.soft_q_net2.state_dict(), 221 | 'policy_net' : self.policy_net.state_dict(), 222 | 'log_alpha' : self.log_alpha, 223 | 'soft_q_optimizer1' : self.soft_q_optimizer1.state_dict(), 224 | 'soft_q_optimizer2' : self.soft_q_optimizer2.state_dict(), 225 | 'policy_optimizer' : self.policy_optimizer.state_dict(), 226 | 'alpha_optimizer' : self.alpha_optimizer.state_dict() 227 | }, path+"_CHECKPOINT_"+str(suffix)) 228 | 229 | def load_checkpoint(self,path,suffix, load_config=False): 230 | checkpoint=torch.load(path+"_CHECKPOINT_"+str(suffix)) 231 | 232 | self.soft_q_net1.load_state_dict(checkpoint['soft_q_net1']) 233 | self.soft_q_net2.load_state_dict(checkpoint['soft_q_net2']) 234 | self.policy_net.load_state_dict(checkpoint['policy_net']) 235 | self.soft_q_optimizer1.load_state_dict(checkpoint['soft_q_optimizer1']) 236 | self.soft_q_optimizer2.load_state_dict(checkpoint['soft_q_optimizer2']) 237 | self.policy_optimizer.load_state_dict(checkpoint['policy_optimizer']) 238 | self.alpha_optimizer.load_state_dict(checkpoint['alpha_optimizer']) 239 | self.log_alpha = checkpoint['log_alpha'] 240 | print("ALHPA DEVICE!!!!!111!!!: ", self.log_alpha.get_device()) 241 | self.update_step = checkpoint['update_step'] 242 | if(load_config): 243 | self.save_config = checkpoint['config'] 244 | self.load_config(checkpoint['config']) 245 | 246 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 247 | target_param.data.copy_(param.data) 248 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 249 | target_param.data.copy_(param.data) 250 | print("LOADED PRETRAINED MODEL FROM: \n", path) 251 | print("NOW START TRAINING FROM STEP: ", self.update_step) 252 | 253 | self.all_elements_to(train_device) 254 | 255 | def save_model(self, path): 256 | torch.save(self.soft_q_net1.state_dict(), path+'_q1_'+str(self.update_step)) 257 | torch.save(self.soft_q_net2.state_dict(), path+'_q2_'+str(self.update_step)) 258 | torch.save(self.policy_net.state_dict(), path+'_policy_'+str(self.update_step)) 259 | 260 | def load_model(self, path, step): 261 | self.soft_q_net1.load_state_dict(torch.load(path+'_q1_'+str(step))) 262 | self.soft_q_net2.load_state_dict(torch.load(path+'_q2_'+str(step))) 263 | self.policy_net.load_state_dict(torch.load(path+'_policy_'+str(step))) 264 | 265 | self.soft_q_net1.eval() 266 | self.soft_q_net2.eval() 267 | self.policy_net.eval() 268 | 269 | 270 | 271 | """ 272 | 273 | class SAC_Trainer_Resnet(): 274 | def __init__(self,save_location, config): 275 | save_location = save_location 276 | self.q1_losses = [] 277 | self.q2_losses = [] 278 | self.policy_losses = [] 279 | self.alpha_losses = [] 280 | self.alpha_list = [] 281 | self.update_step = 0 282 | action_range = 1. 
283 | tau = config['tau'] 284 | self.alpha = config['alpha'] 285 | q_lr = config['q_lr'] 286 | p_lr = config['p_lr'] 287 | a_lr = config['a_lr'] 288 | buffer_maxlen = config['buffer_maxlen'] 289 | num_stacked_frames = config['num_stacked_frames'] 290 | hidden_dim = config['hidden_dim'] 291 | freeze_convolution = config['freeze_convolution'] 292 | self.use_grad_clip = config['use_grad_clip'] 293 | self.max_grad = config['max_grad'] 294 | self.use_exp_prio_buffer= config['use_exp_prio_buffer'] 295 | use_pretrained_resnet18 = config['use_pretrained_resnet18'] 296 | num_lidar_beams = config['num_lidar_beams'] 297 | action_dim = 2 298 | input_shape = config['output_resolution'] 299 | input_shape = (input_shape[0],input_shape[1],input_shape[2]) 300 | state_dim = (num_stacked_frames*input_shape[0],input_shape[1],input_shape[2],num_lidar_beams) 301 | 302 | 303 | 304 | self.soft_q_net1 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 305 | self.soft_q_net2 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 306 | self.target_soft_q_net1 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 307 | self.target_soft_q_net2 = SoftQNetwork_Resnet(state_dim, action_dim, hidden_dim,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 308 | self.policy_net = PolicyNetwork_Resnet(state_dim, action_dim, hidden_dim, action_range,freeze_convolution=freeze_convolution,use_pretrained_resnet18=use_pretrained_resnet18).to(device) 309 | self.log_alpha = torch.ones(1, device=device) * np.log(self.alpha) 310 | self.log_alpha.requires_grad=True 311 | 312 | 313 | if(self.use_exp_prio_buffer): 314 | self.replay_buffer = PrioritizedBuffer(buffer_maxlen,env.observtion_shape,memory_efficient_mode = False) 315 | self.soft_q_criterion1 = torch.nn.SmoothL1Loss(reduction = 'none') 316 | self.soft_q_criterion2 = torch.nn.SmoothL1Loss(reduction = 'none') 317 | else: 318 | self.replay_buffer = ReplayBuffer_MemEfficient_Resnet(buffer_maxlen,num_stacked_frames,state_dim) 319 | self.soft_q_criterion1 = nn.MSELoss() 320 | self.soft_q_criterion2 = nn.MSELoss() 321 | 322 | 323 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 324 | target_param.data.copy_(param.data) 325 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 326 | target_param.data.copy_(param.data) 327 | 328 | self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=q_lr) 329 | self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=q_lr) 330 | self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=p_lr) 331 | self.alpha_optimizer = optim.Adam([self.log_alpha], lr=a_lr) 332 | 333 | 334 | def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2, use_reward_scaling=False): 335 | start = current_milli_time() 336 | if self.use_exp_prio_buffer == False: 337 | state, action,prev_action, reward, prev_reward, next_state, done = self.replay_buffer.sample(batch_size) 338 | else: 339 | transitions, idxs, weights = self.replay_buffer.sample(batch_size) 340 | state, action, prev_action, reward, next_state, done = transitions 341 | weights = 
torch.FloatTensor(weights).to(device) 342 | 343 | state_shape = state.shape 344 | cam_state = [] 345 | lidar_state = [] 346 | next_cam_state = [] 347 | next_lidar_state = [] 348 | 349 | for x in range(batch_size): 350 | cam_state.append(state[x][0]) 351 | lidar_state.append(state[x][1]) 352 | next_cam_state.append(state[x][0]) 353 | next_lidar_state.append(state[x][1]) 354 | 355 | 356 | cam_state = np.asarray(cam_state).reshape((batch_size,3,224,224)) 357 | lidar_state = np.asarray(lidar_state) 358 | 359 | cam_tensor = torch.FloatTensor(cam_state).to(device) 360 | lidar_tensor = torch.FloatTensor(lidar_state).to(device) 361 | 362 | next_cam_state = np.asarray(next_cam_state).reshape((batch_size,3,224,224)) 363 | next_lidar_state = np.asarray(next_lidar_state) 364 | next_cam_tensor = torch.FloatTensor(next_cam_state).to(device) 365 | next_lidar_tensor = torch.FloatTensor(next_lidar_state).to(device) 366 | 367 | 368 | self.update_step +=1 369 | 370 | state = [cam_tensor,lidar_tensor] 371 | 372 | next_state = [next_cam_tensor, next_lidar_tensor] 373 | 374 | action = torch.FloatTensor(action).to(device) 375 | prev_action = torch.FloatTensor(prev_action).to(device) 376 | reward = torch.FloatTensor(reward).unsqueeze(1).to(device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 377 | prev_reward = torch.FloatTensor(prev_reward).to(device) # prev_reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 378 | done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) 379 | 380 | 381 | 382 | predicted_q_value1 = self.soft_q_net1(state, action,prev_action, prev_reward) 383 | predicted_q_value2 = self.soft_q_net2(state, action,prev_action, prev_reward) 384 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state,prev_action, prev_reward,device=device) 385 | new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state,action, reward,device=device) 386 | 387 | if(use_reward_scaling): 388 | reward = reward_scale * (reward - reward.mean(dim=0)) / (reward.std(dim=0) + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem 389 | 390 | # Updating alpha wrt entropy 391 | # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) 392 | if auto_entropy is True: 393 | alpha_loss = -(self.log_alpha * (log_prob + target_entropy).detach()).mean() 394 | # print('alpha loss: ',alpha_loss) 395 | self.alpha_optimizer.zero_grad() 396 | alpha_loss.backward() 397 | self.alpha_optimizer.step() 398 | self.alpha = self.log_alpha.exp() 399 | 400 | self.alpha_losses.append(alpha_loss.item()) 401 | self.alpha_list.append(self.alpha.item()) 402 | 403 | else: 404 | self.alpha = 1. 
405 | alpha_loss = 0 406 | 407 | # Training Q Function 408 | target_q_min = torch.min(self.target_soft_q_net1(next_state, new_next_action, action, reward),self.target_soft_q_net2(next_state, new_next_action,action,reward)) - self.alpha * next_log_prob 409 | target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward 410 | # print("pred Q", predicted_q_value1) 411 | # print("target Q", target_q_value) 412 | 413 | 414 | if self.use_exp_prio_buffer == False: 415 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()) # detach: no gradients for the variable 416 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()) 417 | else: 418 | q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze() # detach: no gradients for the variable 419 | q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()).squeeze() 420 | # print("q1 loss", q_value_loss1) 421 | # print(predicted_q_value1.shape, target_q_value.shape) 422 | # print(weights.shape, self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()).squeeze().shape) 423 | td_error_abs = (q_value_loss1 + q_value_loss2)/2 424 | q_value_loss1 = (q_value_loss1 * weights).mean() 425 | q_value_loss2 = (q_value_loss2 * weights).mean() 426 | # print("IDXS: ", idxs) 427 | # update priority for trained samples 428 | for idx, td_error in zip(idxs, td_error_abs.cpu().detach() + 1e-8):#in zip(idxs, td_error_abs.cpu().detach().numpy()[0]): 429 | self.replay_buffer.update_priority(idx, td_error) 430 | 431 | self.soft_q_optimizer1.zero_grad() 432 | q_value_loss1.backward() 433 | if(self.use_grad_clip): 434 | nn.utils.clip_grad_norm_(self.soft_q_net1.parameters(),self.max_grad) 435 | self.soft_q_optimizer1.step() 436 | self.soft_q_optimizer2.zero_grad() 437 | q_value_loss2.backward() 438 | if(self.use_grad_clip): 439 | nn.utils.clip_grad_norm_(self.soft_q_net2.parameters(),self.max_grad) 440 | self.soft_q_optimizer2.step() 441 | 442 | 443 | predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action,action, reward),self.soft_q_net2(state, new_action,action, reward)) 444 | policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() 445 | 446 | self.policy_optimizer.zero_grad() 447 | policy_loss.backward() 448 | self.policy_optimizer.step() 449 | 450 | self.q1_losses.append(q_value_loss1.item()) 451 | self.q2_losses.append(q_value_loss2.item()) 452 | self.policy_losses.append(policy_loss.item()) 453 | 454 | # Soft update the target value net 455 | for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): 456 | target_param.data.copy_( # copy data value into target parameters 457 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 458 | ) 459 | for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): 460 | target_param.data.copy_( # copy data value into target parameters 461 | target_param.data * (1.0 - soft_tau) + param.data * soft_tau 462 | ) 463 | 464 | return predicted_new_q_value.mean() 465 | 466 | def save_model(self, path): 467 | torch.save(self.soft_q_net1.state_dict(), path+'_q1_'+str(self.update_step)) 468 | torch.save(self.soft_q_net2.state_dict(), path+'_q2_'+str(self.update_step)) 469 | torch.save(self.policy_net.state_dict(), path+'_policy_'+str(self.update_step)) 470 | 471 | def load_model(self, path, step): 472 | self.soft_q_net1.load_state_dict(torch.load(path+'_q1_'+str(step))) 473 | 
self.soft_q_net2.load_state_dict(torch.load(path+'_q2_'+str(step))) 474 | self.policy_net.load_state_dict(torch.load(path+'_policy_'+str(step))) 475 | 476 | self.soft_q_net1.eval() 477 | self.soft_q_net2.eval() 478 | self.policy_net.eval() 479 | 480 | """ -------------------------------------------------------------------------------- /SAC_hyperparameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "gamma" : 0.999, 3 | "tau" : 0.001, 4 | "alpha" : 0.2, 5 | "a_lr" : 2e-4, 6 | "q_lr" : 2e-4, 7 | "p_lr" : 2e-4, 8 | "num_stacked_frames" : 4, 9 | "train_starts" : 0, 10 | "fill_buffer_with_transitions" : 15000, 11 | "batch_size" : 64, 12 | "hidden_dim" : 64, 13 | "max_grad" : 10000, 14 | "max_episodes" : 1e+6, 15 | "max_steps" : 200, 16 | "buffer_maxlen" : 1e+6, 17 | "use_grad_clip" : false, 18 | "use_exp_prio_buffer" : true, 19 | "use_cer_exp" : false, 20 | "prioritized_replay_beta0" : 0.4, 21 | "prioritized_replay_beta1" : 0.6, 22 | "prioritized_replay_alpha" : 0.6, 23 | "use_reward_scaling" : false, 24 | "use_hard_update" : true, 25 | "hard_update_n_step" : 5000, 26 | "max_angular_vel" : 0.05, 27 | "min_linear_vel" : 0.5, 28 | "max_action_distance" : 0.5, 29 | "min_action_distance" : 0.1, 30 | "step_penalty" : -0.1, 31 | "dolly_collision_penalty" : -0.1, 32 | "different_action_penalty" : -0.05, 33 | "slow_penalty" : -0.000, 34 | "collision_penalty" : -3, 35 | "goal_reward" : 10.0, 36 | "heuristic_multiplier" : 0.0, 37 | "num_lidar_beams" : 256, 38 | "num_actions" : 2, 39 | "action_range" : 1.0, 40 | "freeze_convolution" : false, 41 | "terminate_on_collision" : true, 42 | "use_resnet_archi" : false, 43 | "use_pretrained_resnet18" : false, 44 | "num_Agents" : 3, 45 | "start_port" : 64000, 46 | "use_snail_mode" : false, 47 | "snail_kernel_size" : 2, 48 | "channel_s" : [256,128,64], 49 | "channel_a" : [16,16,16], 50 | "channel_r" : [16,16,16], 51 | "use_pretrained_vae" : false, 52 | "pretrained_vae_path" : "/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/VAE_80803_615", 53 | "load_checkpoint" : true, 54 | "checkpoint_path" : "/media/developer/c214e611-7375-4843-a5ac-e51eed61602e/home/developer/Training_results/from_Hydra/Visual/10/04/2021/09:28:20/", 55 | "checkpoint_index" : "22_158322" 56 | } -------------------------------------------------------------------------------- /buffers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Dec 19 23:24:08 2020 4 | 5 | @author: hongh 6 | """ 7 | import numpy as np 8 | from collections import deque 9 | import random 10 | import torch 11 | import matplotlib.pyplot as plt 12 | from utils import * 13 | from segment_tree import * 14 | 15 | class ReplayBuffer: 16 | def __init__(self, capacity): 17 | self.capacity = capacity 18 | self.buffer = [] 19 | self.position = 0 20 | 21 | def push(self, state, action, prev_action, reward, prev_reward, next_state, done): 22 | if len(self.buffer) < self.capacity: 23 | self.buffer.append(None) 24 | self.buffer[self.position] = (state, action,prev_action, reward, prev_reward, next_state, done) 25 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 26 | 27 | def sample(self, batch_size): 28 | batch = random.sample(self.buffer, batch_size) 29 | state, action,prev_action, reward, prev_reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element 30 | ''' 31 | the * serves as unpack: sum(a,b) <=> batch=(a,b), 
sum(*batch) ; 32 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 33 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 34 | np.stack((1,2)) => array([1, 2]) 35 | ''' 36 | return state, action, prev_action, reward, prev_reward, next_state, done 37 | 38 | def __len__(self): 39 | return len(self.buffer) 40 | 41 | 42 | class ReplayBuffer_MemEfficient: 43 | def __init__(self, capacity,num_stacked_frames,observation_shape): 44 | self.capacity = capacity 45 | self.buffer = [] 46 | self.position = 0 47 | self.not_allowed_idx = set(np.arange(num_stacked_frames)) 48 | self.num_stacked_frames = num_stacked_frames 49 | self.stacked_images_shape = (observation_shape[0],observation_shape[1],observation_shape[2]) 50 | 51 | def add(self, obs_cam, obs_lidar, action, prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done): 52 | if len(self.buffer) < self.capacity: 53 | self.buffer.append(None) 54 | self.buffer[self.position] = (obs_cam, obs_lidar, action,prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done) 55 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 56 | 57 | def sample(self, batch_size): 58 | 59 | # sample without replacement and avoid the first n=num_stacked_frames entries in the buffer to avoid: idx < frame_stack 60 | idx_list = np.random.choice(len(self.buffer)-self.num_stacked_frames, batch_size)+self.num_stacked_frames 61 | 62 | 63 | # find terminating samples inside the batch 64 | for idx in range(len(idx_list)): 65 | for frame in range(self.num_stacked_frames-1): 66 | obs_cam, obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam, next_obs_lidar , done = self.buffer[idx_list[idx]-frame] 67 | #if terminating frame is found in one of the 4 previous frames then set the index of the terminating frame to be the last one in that state 68 | #this avoids training on overlapping episodes, consider: the sampled idx is 13 -> then we concatenate the frames like so: [10,11,12,13] such that 13 is the last one 69 | # but now if frame 11 corresponds to a terminating state then frames 10 and 11 are in a different episode than 12 and 13 therefore a state containing the frames 70 | # [10,11,12,13] is impossible to reach (it would contain a teleport), therefore instead we find the done flag at frame 11 and reassign the target index from 13 to eleven 71 | # the resulting sample is now [8,9,10,11] instead of [10,11,12,13] 72 | if(done): 73 | idx_list[idx] = idx_list[idx]-frame 74 | break 75 | 76 | state_cam_batch = [] 77 | next_cam_state_batch = [] 78 | # stack the corresponding frames according to the idx_list and save them as batches 79 | ##### TODO: THIS TAKES TO LONG 80 | for idx in idx_list: 81 | obs_cam , obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam , next_obs_lidar , done = self.buffer[idx] 82 | obs_cam_1, obs_lidar_1, action_1, prev_action_1, reward_1, prev_reward_1, next_obs_cam_1, next_obs_lidar_1, done_1 = self.buffer[idx-1] 83 | obs_cam_2, obs_lidar_2, action_2, prev_action_2, reward_2, prev_reward_2, next_obs_cam_2, next_obs_lidar_2, done_2 = self.buffer[idx-2] 84 | obs_cam_3, obs_lidar_3, action_3, prev_action_3, reward_3, prev_reward_3, next_obs_cam_3, next_obs_lidar_3, done_3 = self.buffer[idx-3] 85 | obs_shape = obs_cam.shape 86 | 87 | state_cam = np.asarray([obs_cam_3, obs_cam_2, obs_cam_1, obs_cam]).reshape(self.stacked_images_shape) 88 | next_state_cam = np.asarray([obs_cam_2, obs_cam_1, obs_cam, 
next_obs_cam]).reshape(self.stacked_images_shape) 89 | 90 | 91 | 92 | state_cam_batch.append(state_cam) 93 | next_cam_state_batch.append(next_state_cam) 94 | 95 | del obs_cam , obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam , next_obs_lidar , done 96 | del obs_cam_1, obs_lidar_1, action_1, prev_action_1, reward_1, prev_reward_1, next_obs_cam_1, next_obs_lidar_1, done_1 97 | del obs_cam_2, obs_lidar_2, action_2, prev_action_2, reward_2, prev_reward_2, next_obs_cam_2, next_obs_lidar_2, done_2 98 | del obs_cam_3, obs_lidar_3, action_3, prev_action_3, reward_3, prev_reward_3, next_obs_cam_3, next_obs_lidar_3, done_3 99 | 100 | 101 | 102 | batch = [self.buffer[i] for i in idx_list] 103 | 104 | 105 | ''' 106 | the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; 107 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 108 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 109 | np.stack((1,2)) => array([1, 2]) 110 | ''' 111 | obs_cam , obs_lidar , action , prev_action , reward , prev_reward , next_obs_cam , next_obs_lidar , done = map(np.stack, zip(*batch)) # stack for each element 112 | #replace the observations with the stacked states 113 | state_cam = np.asarray(state_cam_batch) 114 | next_state_cam = np.asarray(next_cam_state_batch) 115 | 116 | 117 | 118 | return state_cam,obs_lidar, action, prev_action, reward, prev_reward, next_state_cam, next_obs_lidar, done 119 | 120 | def __len__(self): 121 | return len(self.buffer) 122 | 123 | def show_state(self,state_tensor, save=False): 124 | state_shape= (84,84,3) 125 | pic0 = np.array(state_tensor.reshape(state_shape)) 126 | 127 | fig, (ax1) = plt.subplots(1, 1) 128 | ax1.imshow(pic0) 129 | plt.show() 130 | 131 | class ReplayBuffer_MemEfficient_Resnet: 132 | def __init__(self, capacity,num_stacked_frames,observation_shape): 133 | self.capacity = capacity 134 | self.buffer = [] 135 | self.position = 0 136 | self.not_allowed_idx = set(np.arange(num_stacked_frames)) 137 | self.num_stacked_frames = num_stacked_frames 138 | self.stacked_images_shape = (observation_shape[0],244,244) 139 | 140 | def push(self, observation, action, prev_action, reward, prev_reward, next_observation, done): 141 | if len(self.buffer) < self.capacity: 142 | self.buffer.append(None) 143 | self.buffer[self.position] = (observation, action,prev_action, reward, prev_reward, next_observation, done) 144 | self.position = int((self.position + 1) % self.capacity) # as a ring buffer 145 | 146 | def sample(self, batch_size): 147 | 148 | # sample without replacement and avoid the first n=num_stacked_frames entries in the buffer to avoid: idx < frame_stack 149 | idx_list = np.random.choice(len(self.buffer)-self.num_stacked_frames, batch_size)+self.num_stacked_frames 150 | 151 | 152 | # find terminating samples inside the batch 153 | for idx in range(len(idx_list)): 154 | for frame in range(self.num_stacked_frames-1): 155 | observation , action , prev_action , reward , prev_reward , next_observation , done = self.buffer[idx_list[idx]-frame] 156 | #if terminating frame is found in one of the 4 previous frames then set the index of the terminating frame to be the last one in that state 157 | #this avoids training on overlapping episodes, consider: the sampled idx is 13 -> then we concatenate the frames like so: [10,11,12,13] such that 13 is the last one 158 | # but now if frame 11 corresponds to a terminating state then frames 10 and 11 are in a different episode than 12 and 13 therefore a state containing the 
frames 159 | # [10,11,12,13] is impossible to reach (it would contain a teleport), therefore instead we find the done flag at frame 11 and reassign the target index from 13 to eleven 160 | # the resulting sample is now [8,9,10,11] instead of [10,11,12,13] 161 | if(done): 162 | idx_list[idx] = idx_list[idx]-frame 163 | break 164 | 165 | state_batch = [] 166 | next_state_batch = [] 167 | # stack the corresponding frames according to the idx_list and save them as batches 168 | for idx in idx_list: 169 | observation , action , prev_action , reward , prev_reward , next_observation , done = self.buffer[idx] 170 | # observation_1, action_1, prev_action_1, reward_1, prev_reward_1, next_observation_1, done_1 = self.buffer[idx-1] 171 | # observation_2, action_2, prev_action_2, reward_2, prev_reward_2, next_observation_2, done_2 = self.buffer[idx-2] 172 | # observation_3, action_3, prev_action_3, reward_3, prev_reward_3, next_observation_3, done_3 = self.buffer[idx-3] 173 | obs_shape = observation[0].shape 174 | 175 | state_lidar = observation[1] 176 | #preprocess(observation_3[0]).numpy(),preprocess(observation_2[0]).numpy(),preprocess(observation_1[0]).numpy(), 177 | state_cam = [preprocess(observation[0]).numpy()] 178 | state_cam = np.asarray(state_cam).reshape((3,224,224)) 179 | state = np.asarray([state_cam, state_lidar]) 180 | 181 | next_state_lidar = next_observation[1] 182 | #preprocess(observation_2[0]).numpy(),preprocess(observation_1[0]).numpy(),preprocess(observation[0]).numpy(), 183 | next_state_cam = [preprocess(next_observation[0]).numpy()] 184 | next_state_cam = np.asarray(next_state_cam).reshape((3,224,224)) 185 | next_state = np.asarray([next_state_cam,next_state_lidar]) 186 | 187 | state_batch.append(state) 188 | next_state_batch.append(next_state) 189 | 190 | 191 | batch = [self.buffer[i] for i in idx_list] 192 | 193 | 194 | ''' 195 | the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; 196 | zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; 197 | the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; 198 | np.stack((1,2)) => array([1, 2]) 199 | ''' 200 | observation, action,prev_action, reward, prev_reward, next_observation, done = map(np.stack, zip(*batch)) # stack for each element 201 | #replace the observations with the stacked states 202 | state = np.asarray(state_batch) 203 | next_state = np.asarray(next_state_batch) 204 | 205 | 206 | return state, action, prev_action, reward, prev_reward, next_state, done 207 | 208 | def __len__(self): 209 | return len(self.buffer) 210 | 211 | def show_state(self,state_tensor, save=False): 212 | state_shape= (84,84,1) 213 | pic0 = np.array(state_tensor.reshape(state_shape)) 214 | 215 | fig, (ax1) = plt.subplots(1, 1) 216 | ax1.imshow(pic0) 217 | plt.show() 218 | 219 | class BasicBuffer: 220 | def __init__(self, max_size, shape): 221 | self.max_size = max_size 222 | self.buffer = deque(maxlen=max_size) 223 | self.frame_stack = shape[0] 224 | self.not_allowed_idx = set(np.arange(self.frame_stack)) 225 | self.img_shape = shape 226 | 227 | def push(self, state, action, reward, next_state, done): 228 | ''' Changed to store the last frame''' 229 | # state = np.expand_dims(state[-1], axis=0) 230 | # next_state = np.expand_dims(next_state[-1], axis=0) 231 | # state.astype(np.uint8) 232 | # next_state.astype(np.uint8) 233 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done) 234 | 
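# --- editor's note (added for clarity; sizes are approximate) ---
# Only the newest frame of the stacked state (and of next_state) is kept, cast to uint8;
# the full frame stack is rebuilt on the fly in sample().  For an 84x84 single-channel
# frame that is roughly 7 KB per stored frame instead of ~113 KB for a 4-frame float32
# stack (4*84*84*4 bytes), which is what makes a buffer_maxlen on the order of 1e6
# transitions feasible in memory.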
self.buffer.append(experience) 235 | 236 | 237 | def sample(self, batch_size): 238 | '''Changed to memory-efficient one by storing one frame, but when extracting, frame stacking is automatically done.''' 239 | # state_batch = [] 240 | action_batch = [] 241 | reward_batch = [] 242 | # next_state_batch = [] 243 | done_batch = [] 244 | continue_sample = True 245 | state_batch = np.zeros((batch_size,*self.img_shape)) 246 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 247 | 248 | # ----------------recursive sampling to avoid: idx < frame_stack---------------- 249 | while continue_sample: 250 | idx = np.random.choice(len(self.buffer), batch_size) # sample without replacement 251 | idx_set = set(idx) 252 | if len(idx_set.intersection(self.not_allowed_idx)) == 0: 253 | continue_sample = False 254 | 255 | 256 | # --------------- frame stacking ----------------- 257 | for i in range(len(idx)): 258 | for j in range(1, self.frame_stack+1):# 259 | if self.buffer[idx[i]-j+1][4] == True and j > 1: # done = True 260 | state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 261 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 262 | else: 263 | state_batch[i, self.frame_stack-j] = self.buffer[idx[i]-j+1][0][0] # the last [0] makes --> 1*84*84 --> 84*84 264 | next_state_batch[i, self.frame_stack-j] = self.buffer[idx[i]-j+1][3][0] 265 | # state_batch.append(self.buffer[idx[i]][0]) 266 | 267 | # next_state_batch.append(self.buffer[idx[i]][3]) 268 | 269 | for i in range(len(idx)): 270 | reward_batch.append(self.buffer[idx[i]][2]) 271 | action_batch.append(self.buffer[idx[i]][1]) 272 | done_batch.append(self.buffer[idx[i]][4]) 273 | 274 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch) 275 | 276 | def __len__(self): 277 | return len(self.buffer) 278 | 279 | 280 | 281 | class SumTree: 282 | '''Even if I can't apply the trick of deque here, it is still possible to store one frame by ruling out the index in the current write position.''' 283 | write = 0 284 | 285 | def __init__(self, capacity): 286 | self.capacity = capacity 287 | self.data = np.zeros(capacity, dtype=object) # storing experience 288 | self.tree = np.zeros(2 * capacity - 1) # storing pripority + parental nodes. If you have n elements in the bottom, then you need n + (n-1) nodes to construct a tree. 
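# --- editor's sketch (illustrative example, not part of the original source) ---
# For capacity = 4 the tree array holds 2*4 - 1 = 7 entries:
#   tree[0]                 <- root, equal to the total priority (see total())
#   tree[1], tree[2]        <- internal partial sums
#   tree[3] .. tree[6]      <- leaf priorities belonging to data[0] .. data[3]
# so a leaf index maps back to its transition via data_idx = idx - capacity + 1,
# the conversion used in get() and in the samplers further below.  Minimal usage:
#   tree = SumTree(4)
#   for p, payload in [(1.0, 'a'), (2.0, 'b'), (3.0, 'c')]:
#       tree.add(p, payload)
#   idx, priority, data = tree.get(random.uniform(0, tree.total()))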
289 | self.n_entries = 0 290 | self.overwrite_start_flag = False # record whether N_entry > capacity 291 | 292 | # update to the root node 293 | def _propagate(self, idx, change): 294 | parent = (idx - 1) // 2 295 | self.tree[parent] += change 296 | if parent != 0: 297 | self._propagate(parent, change) 298 | 299 | # find sample on leaf node 300 | def _retrieve(self, idx, s): 301 | left = 2 * idx + 1 302 | right = left + 1 303 | if left >= len(self.tree): 304 | return idx 305 | if s <= self.tree[left]: 306 | return self._retrieve(left, s) 307 | else: 308 | return self._retrieve(right, s - self.tree[left]) 309 | 310 | 311 | def total(self): 312 | return self.tree[0] 313 | 314 | # store priority and sample 315 | def add(self, p, data): 316 | idx = self.write + self.capacity - 1 # starting write from the first element of the bottom layer, 317 | self.data[self.write] = data 318 | self.update(idx, p) 319 | self.write += 1 320 | if self.write >= self.capacity: 321 | self.write = 0 322 | self.overwrite_start_flag = True 323 | if self.n_entries < self.capacity: 324 | self.n_entries += 1 325 | 326 | 327 | # update priority 328 | def update(self, idx, p): 329 | change = p - self.tree[idx] 330 | self.tree[idx] = p 331 | self._propagate(idx, change) 332 | 333 | # get priority and sample 334 | def get(self, s): 335 | #--------original---------- 336 | idx = self._retrieve(0, s) 337 | dataIdx = idx - self.capacity + 1 338 | return (idx, self.tree[idx], self.data[dataIdx]) 339 | #--------The below is Morvan's implementation---------- 340 | #https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Dueling%20Double%20DQN%20with%20PER%20and%20fixed-q%20targets/.ipynb_checkpoints/Dueling%20Deep%20Q%20Learning%20with%20Doom%20(%2B%20double%20DQNs%20and%20Prioritized%20Experience%20Replay)-checkpoint.ipynb 341 | #https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5.2_Prioritized_Replay_DQN/RL_brain.py 342 | # parent_idx = 0 343 | # while True: # the while loop is faster than the method in the reference code 344 | # cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 345 | # cr_idx = cl_idx + 1 346 | # if cl_idx >= len(self.tree): # reach bottom, end search 347 | # leaf_idx = parent_idx 348 | # break 349 | # else: # downward search, always search for a higher priority node 350 | # if s <= self.tree[cl_idx]: 351 | # parent_idx = cl_idx 352 | # else: 353 | # s -= self.tree[cl_idx] 354 | # parent_idx = cr_idx 355 | # data_idx = leaf_idx - self.capacity + 1 356 | # return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 357 | 358 | 359 | def max_priority(self): 360 | return np.max(self.tree[self.capacity-1:]) 361 | 362 | # ---- modified : later to avoid min_prob being overwritten ---- 363 | def min_prob(self): 364 | if self.overwrite_start_flag == False: 365 | p_min = float(np.min(self.tree[self.capacity-1:self.n_entries+self.capacity-1])) / self.total() 366 | self.p_min_history = p_min 367 | elif self.overwrite_start_flag == True: 368 | p_min = min( float(np.min(self.tree[self.capacity-1:self.n_entries+self.capacity-1])) / self.total() ,self.p_min_history) 369 | self.p_min_history = min(p_min, self.p_min_history) 370 | return p_min 371 | 372 | 373 | 374 | class PrioritizedBuffer: 375 | '''Currently, there is a problem when memory is full''' 376 | def __init__(self, max_size, shape, alpha=0.6, beta=0.4, memory_efficient_mode = True): 377 | self.sum_tree = SumTree(max_size) 378 | self.alpha = alpha 379 | self.beta = beta 380 | 
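# --- editor's note (added for clarity) ---
# beta sets the strength of the importance-sampling correction
#   w_i = (N * P(i))**(-beta) / max_j w_j
# and is annealed from beta0 (0.4 in SAC_hyperparameters.json) towards 1.0 inside
# sample() via beta_increment, i.e. over roughly 1e5 sampled batches; beta = 1 fully
# compensates the bias introduced by prioritized sampling.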
self.beta_increment = (1.0 - beta)/100000. 381 | self.current_length = 0 382 | self.frame_stack = shape[0] 383 | self.efficient_memory_mode = memory_efficient_mode 384 | self.img_shape = shape 385 | 386 | 387 | def push(self, state, action,prev_action, reward, next_state, done): 388 | priority = 1.0 if self.current_length == 0 else self.sum_tree.max_priority() 389 | self.current_length = self.current_length + 1 390 | # priority = priority ** self.alpha 391 | # -------------modified for efficient storage-------------- 392 | if self.efficient_memory_mode: 393 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done) 394 | else: 395 | experience = (state, action,prev_action, reward, next_state, done) 396 | self.sum_tree.add(priority, experience) 397 | 398 | 399 | def sample(self, batch_size): 400 | # ----------------To modify: sampling for efficient memory storage------------------ 401 | batch_idx, batch, priorities = [], [], [] 402 | segment = self.sum_tree.total() / batch_size 403 | p_sum = self.sum_tree.tree[0] 404 | 405 | for i in range(batch_size): 406 | a = segment * i 407 | b = segment * (i + 1) 408 | 409 | # ---------self modified to allow storing for efficient memory storage of only one frame---------- 410 | if self.efficient_memory_mode: 411 | while True: 412 | s = random.uniform(a, b) 413 | idx, p, data = self.sum_tree.get(s) 414 | data_idx = idx-self.sum_tree.capacity+1 # Note: idx(node idx) is different from dataidx (data idx)! 415 | # print(data_idx, self.sum_tree.write, self.frame_stack) 416 | if (data_idx - self.sum_tree.write > self.frame_stack or data_idx<= self.sum_tree.write) and (self.sum_tree.overwrite_start_flag == True): 417 | break 418 | # elif (self.sum_tree.overwrite_start_flag == False) and (data_idx >= self.frame_stack - 1): 419 | # break 420 | elif (self.sum_tree.overwrite_start_flag == False): # allow samples from <= frame_stack 421 | break 422 | else: 423 | s = random.uniform(a, b) 424 | idx, p, data = self.sum_tree.get(s) 425 | batch_idx.append(idx) 426 | batch.append(data) 427 | priorities.append(p) 428 | prob = np.array(priorities) /p_sum 429 | IS_weights = np.power(self.sum_tree.n_entries * prob, -self.beta) 430 | max_weight = np.power(self.sum_tree.n_entries * self.sum_tree.min_prob(), -self.beta) 431 | IS_weights /= max_weight 432 | 433 | if self.efficient_memory_mode: 434 | action_batch = [] 435 | reward_batch = [] 436 | done_batch = [] 437 | state_batch = np.zeros((batch_size,*self.img_shape)) 438 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 439 | # --------------- frame stacking ----------------- 440 | for i in range(len(batch_idx)): 441 | for j in range(1, self.frame_stack+1): 442 | data_idx = batch_idx[i]-self.sum_tree.capacity+1-j+1 # notice the offset to substract to get right data[data_idx] 443 | if data_idx < 0 and self.sum_tree.overwrite_start_flag == True: # for the case : sample the 1st element and stack the last 3 elements from the end 444 | data_idx += self.sum_tree.capacity 445 | elif data_idx < 0 and self.sum_tree.overwrite_start_flag == False: # for the case : sample the 2nd element and stack the 1st elements 3 times from the beginning 446 | data_idx = 0 447 | # print(data_idx ,self.sum_tree.data[data_idx]) 448 | if (self.sum_tree.data[data_idx][4] == True and j > 1) or (data_idx < 0): # for the case of done = True or sampling from the beginning 449 | state_batch[i, self.frame_stack-j] = state_batch[i, 
self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 450 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 451 | else: 452 | state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][0][0] # the last [0] makes --> 1*84*84 --> 84*84 453 | next_state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][3][0] 454 | 455 | for i in range(len(batch_idx)): 456 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 457 | reward_batch.append(self.sum_tree.data[data_idx][2]) 458 | action_batch.append(self.sum_tree.data[data_idx][1]) 459 | done_batch.append(self.sum_tree.data[data_idx][4]) 460 | else: 461 | state_batch = [] 462 | action_batch = [] 463 | prev_action_batch = [] 464 | reward_batch = [] 465 | next_state_batch = [] 466 | done_batch = [] 467 | for transition in batch: 468 | state, action, prev_action,reward, next_state, done = transition 469 | state_batch.append(state) 470 | action_batch.append(action) 471 | prev_action_batch.append(prev_action) 472 | reward_batch.append(reward) 473 | next_state_batch.append(next_state) 474 | done_batch.append(done) 475 | self.beta = min(1., self.beta + self.beta_increment) 476 | return (state_batch, action_batch,prev_action_batch, reward_batch, next_state_batch, done_batch), batch_idx, IS_weights 477 | 478 | def update_priority(self, idx, td_error): 479 | priority = np.power(td_error, self.alpha) 480 | self.sum_tree.update(idx, priority) 481 | 482 | def __len__(self): 483 | return self.current_length 484 | 485 | 486 | 487 | class PrioritizedBuffer_SNAIL: 488 | '''Only designed for TCN mode''' 489 | def __init__(self, max_size, obs_shape, action_n, alpha=0.6, beta=0.4): 490 | self.sum_tree = SumTree(max_size) 491 | self.alpha = alpha 492 | self.beta = beta 493 | self.beta_increment = (1.0 - beta)/100000. 494 | self.current_length = 0 495 | self.frame_stack = obs_shape[0] 496 | self.efficient_memory_mode = True 497 | self.img_shape = obs_shape 498 | self.action_n = action_n 499 | 500 | def push(self, state, action, reward, next_state, done): 501 | priority = 1.0 if self.current_length == 0 else self.sum_tree.max_priority() 502 | self.current_length = self.current_length + 1 503 | #priority = td_error ** self.alpha 504 | # -------------modified for efficient storage-------------- 505 | if self.efficient_memory_mode: 506 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done) 507 | else: 508 | experience = (state, action, reward, next_state, done) 509 | self.sum_tree.add(priority, experience) 510 | 511 | 512 | def sample(self, batch_size): 513 | # ----------------To modify: sampling for efficient memory storage------------------ 514 | batch_idx, batch, priorities = [], [], [] 515 | segment = self.sum_tree.total() / batch_size 516 | p_sum = self.sum_tree.tree[0] 517 | 518 | for i in range(batch_size): 519 | a = segment * i 520 | b = segment * (i + 1) 521 | 522 | # ---------self modified to allow storing for efficient memory storage of only one frame---------- 523 | if self.efficient_memory_mode: 524 | while True: 525 | s = random.uniform(a, b) 526 | idx, p, data = self.sum_tree.get(s) 527 | data_idx = idx-self.sum_tree.capacity+1 # Note: idx(node idx) is different from dataidx (data idx)! 
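# --- editor's note (added for clarity) ---
# The surrounding while-loop re-draws s until the sampled transition is safe to
# frame-stack: once the ring buffer has wrapped (overwrite_start_flag == True),
# data indices within frame_stack positions ahead of the write pointer are rejected,
# because the frames just before them in the ring have already been overwritten by
# the newest transitions and no longer belong to the same stretch of experience.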
528 | # print(data_idx, self.sum_tree.write, self.frame_stack) 529 | if (data_idx - self.sum_tree.write > self.frame_stack or data_idx<= self.sum_tree.write) and (self.sum_tree.overwrite_start_flag == True): 530 | break 531 | # elif (self.sum_tree.overwrite_start_flag == False) and (data_idx >= self.frame_stack - 1): 532 | # break 533 | elif (self.sum_tree.overwrite_start_flag == False): # allow samples from <= frame_stack 534 | break 535 | else: 536 | s = random.uniform(a, b) 537 | idx, p, data = self.sum_tree.get(s) 538 | batch_idx.append(idx) 539 | batch.append(data) 540 | priorities.append(p) 541 | prob = np.array(priorities) /p_sum 542 | IS_weights = np.power(self.sum_tree.n_entries * prob, -self.beta) 543 | max_weight = np.power(self.sum_tree.n_entries * self.sum_tree.min_prob(), -self.beta) 544 | IS_weights /= max_weight 545 | 546 | if self.efficient_memory_mode: 547 | # action: (B,L*feat), 548 | # reward: (B,L*1) 549 | action_batch_s = np.zeros((batch_size, self.action_n, self.img_shape[0])) 550 | reward_batch_s = np.zeros((batch_size, 1, self.img_shape[0])) 551 | reward_batch = [] 552 | action_batch = [] 553 | done_batch = [] 554 | state_batch = np.zeros((batch_size,*self.img_shape)) 555 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 556 | # --------------- frame stacking ----------------- 557 | for i in range(len(batch_idx)): 558 | for j in range(1, self.frame_stack+1): 559 | data_idx = batch_idx[i]-self.sum_tree.capacity+1-j+1 # notice the offset to substract to get right data[data_idx] 560 | if data_idx < 0 and self.sum_tree.overwrite_start_flag == True: # for the case : sample the 1st element and stack the last 3 elements from the end 561 | data_idx += self.sum_tree.capacity 562 | elif data_idx < 0 and self.sum_tree.overwrite_start_flag == False: # for the case : sample the 2nd element and stack the 1st elements 3 times from the beginning 563 | data_idx = 0 564 | # print(data_idx ,self.sum_tree.data[data_idx]) 565 | if (self.sum_tree.data[data_idx][4] == True and j > 1) or (data_idx < 0): # for the case of done = True or sampling from the beginning 566 | state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 567 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 568 | action_batch_s[i,:,self.frame_stack-j] = np.array([0]*self.action_n)# action_batch_s[i, self.frame_stack-j+1] 569 | reward_batch_s[i,:,self.frame_stack-j] = np.array([0])# reward_batch_s[i, self.frame_stack-j+1] 570 | else: 571 | state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][0][0] # the last [0] makes --> 1*84*84 --> 84*84 572 | next_state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][3][0] 573 | action_batch_s[i,:,self.frame_stack-j] = self.sum_tree.data[data_idx][1][0] 574 | reward_batch_s[i,:,self.frame_stack-j] = self.sum_tree.data[data_idx][2][0] 575 | 576 | for i in range(len(batch_idx)): 577 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 578 | reward_batch.append(self.sum_tree.data[data_idx][2]) 579 | action_batch.append(self.sum_tree.data[data_idx][1]) 580 | done_batch.append(self.sum_tree.data[data_idx][4]) 581 | else: 582 | state_batch = [] 583 | action_batch = [] 584 | reward_batch = [] 585 | next_state_batch = [] 586 | done_batch = [] 587 | for transition in batch: 588 | state, action, reward, next_state, done = transition 589 | state_batch.append(state) 590 | action_batch.append(action) 591 | reward_batch.append(reward) 592 | 
next_state_batch.append(next_state) 593 | done_batch.append(done) 594 | self.beta = min(1., self.beta + self.beta_increment) 595 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch, action_batch_s, reward_batch_s), batch_idx, IS_weights 596 | 597 | def update_priority(self, idx, td_error): 598 | priority = np.power(td_error, self.alpha) 599 | self.sum_tree.update(idx, priority) 600 | 601 | def __len__(self): 602 | return self.current_length 603 | 604 | 605 | 606 | class PrioritizedBuffer_N_step: 607 | '''The difference to is gamma_exp is another element saved 608 | ''' 609 | def __init__(self, max_size, shape, alpha=0.6, beta=0.4): 610 | self.sum_tree = SumTree(max_size) 611 | self.alpha = alpha 612 | self.beta = beta 613 | self.beta_increment = (1.0 - beta)/100000. 614 | self.current_length = 0 615 | self.frame_stack = shape[0] 616 | self.efficient_memory_mode = True 617 | self.img_shape = shape 618 | 619 | 620 | def push(self, state, action, reward, next_state, done, gamma_exp): 621 | priority = 1.0 if self.current_length == 0 else self.sum_tree.max_priority() 622 | self.current_length = self.current_length + 1 623 | #priority = td_error ** self.alpha 624 | # -------------modified for efficient storage-------------- 625 | if self.efficient_memory_mode: 626 | experience = (np.expand_dims(state[-1], axis=0).astype(np.uint8), np.array([action]), np.array([reward]), np.expand_dims(next_state[-1], axis=0).astype(np.uint8), done, np.array([gamma_exp])) 627 | else: 628 | experience = (state, np.array([action]), np.array([reward]), next_state, done, np.array([gamma_exp])) 629 | self.sum_tree.add(priority, experience) 630 | 631 | 632 | def sample(self, batch_size): 633 | # ----------------To modify: sampling for efficient memory storage------------------ 634 | batch_idx, batch, priorities = [], [], [] 635 | segment = self.sum_tree.total() / batch_size 636 | p_sum = self.sum_tree.tree[0] 637 | 638 | for i in range(batch_size): 639 | a = segment * i 640 | b = segment * (i + 1) 641 | 642 | # ---------self modified to allow storing for efficient memory storage of only one frame---------- 643 | if self.efficient_memory_mode: 644 | while True: 645 | 646 | s = random.uniform(a, b) 647 | idx, p, data = self.sum_tree.get(s) 648 | data_idx = idx-self.sum_tree.capacity+1 # Note: idx(node idx) is different from dataidx (data idx)! 
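# --- editor's note (illustrative; assumes the trainer uses gamma_exp as the n-step
#     discount exponent, which is not shown in this file) ---
# With n-step returns the stored reward is the accumulated
#   R = r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1}
# so the bootstrapped target would be formed roughly as
#   target = R + (1 - done) * gamma**gamma_exp * min(Q1_target, Q2_target)(s_{t+n}, a')
# which is why gamma_exp is returned by sample() alongside every transition.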
649 | # print(data_idx, self.sum_tree.write, self.frame_stack) 650 | if (data_idx - self.sum_tree.write > self.frame_stack or data_idx<= self.sum_tree.write) and (self.sum_tree.overwrite_start_flag == True): 651 | break 652 | # elif (self.sum_tree.overwrite_start_flag == False) and (data_idx >= self.frame_stack - 1): 653 | # break 654 | elif (self.sum_tree.overwrite_start_flag == False): # allow samples from <= frame_stack 655 | break 656 | else: 657 | s = random.uniform(a, b) 658 | idx, p, data = self.sum_tree.get(s) 659 | batch_idx.append(idx) 660 | batch.append(data) 661 | priorities.append(p) 662 | prob = np.array(priorities) /p_sum 663 | IS_weights = np.power(self.sum_tree.n_entries * prob, -self.beta) 664 | max_weight = np.power(self.sum_tree.n_entries * self.sum_tree.min_prob(), -self.beta) 665 | IS_weights /= max_weight 666 | 667 | if self.efficient_memory_mode: 668 | action_batch = [] 669 | reward_batch = [] 670 | done_batch = [] 671 | state_batch = np.zeros((batch_size,*self.img_shape)) 672 | next_state_batch = np.zeros((batch_size,*self.img_shape)) 673 | gamma_exp_batch = [] 674 | # --------------- frame stacking ----------------- 675 | for i in range(len(batch_idx)): 676 | for j in range(1, self.frame_stack+1): 677 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 -j+1 # notice the offset to substract to get right data[data_idx] 678 | if data_idx < 0 and self.sum_tree.overwrite_start_flag == True: # for the case : sample the 1st element and stack the last 3 elements from the end 679 | data_idx += self.sum_tree.capacity 680 | elif data_idx < 0 and self.sum_tree.overwrite_start_flag == False: # for the case : sample the 2nd element and stack the 1st elements 3 times from the beginning 681 | data_idx = 0 682 | # print(data_idx ,self.sum_tree.data[data_idx]) 683 | if (self.sum_tree.data[data_idx][4] == True and j > 1) or (data_idx < 0): # for the case of done = True or sampling from the beginning 684 | state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] # the last [0] makes --> 1*84*84 --> 84*84 685 | next_state_batch[i, self.frame_stack-j] = state_batch[i, self.frame_stack-j+1] 686 | else: 687 | state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][0][0] # the last [0] makes --> 1*84*84 --> 84*84 688 | next_state_batch[i, self.frame_stack-j] = self.sum_tree.data[data_idx][3][0] 689 | 690 | for i in range(len(batch_idx)): 691 | data_idx = batch_idx[i]-self.sum_tree.capacity+1 692 | reward_batch.append(self.sum_tree.data[data_idx][2]) 693 | action_batch.append(self.sum_tree.data[data_idx][1]) 694 | done_batch.append(self.sum_tree.data[data_idx][4]) 695 | gamma_exp_batch.append(self.sum_tree.data[data_idx][5]) 696 | else: 697 | state_batch = [] 698 | action_batch = [] 699 | reward_batch = [] 700 | next_state_batch = [] 701 | done_batch = [] 702 | gamma_exp_batch = [] 703 | for transition in batch: 704 | state, action, reward, next_state, done, gamma_exp = transition 705 | state_batch.append(state) 706 | action_batch.append(action) 707 | reward_batch.append(reward) 708 | next_state_batch.append(next_state) 709 | done_batch.append(done) 710 | gamma_exp_batch.append(gamma_exp) 711 | self.beta = min(1., self.beta + self.beta_increment) 712 | return (state_batch, action_batch, reward_batch, next_state_batch, done_batch, gamma_exp_batch), batch_idx, IS_weights 713 | 714 | def update_priority(self, idx, td_error): 715 | priority = np.power(td_error, self.alpha) 716 | self.sum_tree.update(idx, priority) 717 | 718 | def __len__(self): 719 | return 
self.current_length 720 | 721 | 722 | -------------------------------------------------------------------------------- /nav_acl.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import matplotlib.pyplot as plt 3 | from utils import rel_rot, rot_vec 4 | from numpy import savetxt 5 | import numpy as np 6 | import scipy 7 | from scipy.spatial import distance 8 | from scipy.spatial.transform import Rotation as R 9 | from scipy import stats, optimize, interpolate 10 | import random 11 | from Networks import NavACLNetwork, SoftQNetwork 12 | from isaac_q_helper_functions import * 13 | from RobotTask import RobotTask, Tasktype 14 | import json 15 | import torch 16 | import torch.nn as nn 17 | import torch.optim as optim 18 | from collections import deque 19 | from per_buffer import ReplayBuffer 20 | import random 21 | import time 22 | 23 | 24 | class Nav_ACL(): 25 | """ 26 | My Interpretation of the Nav_ACL Algorithm for automatic Curriculum generation 27 | https://arxiv.org/abs/2009.05429 28 | """ 29 | def __init__(self,nav_acl_network,config,worker_instance=True, shared_q_net=None, env=None, task_offset=np.array([0,0,0]), agent_index = 0, shared_elements=None): 30 | super(Nav_ACL, self).__init__() 31 | """ 32 | my Implementaion keeps track of all previously generated tasks and their outcomes 33 | """ 34 | self.config = config 35 | self.tasks = [] 36 | self.steps = [] 37 | self.rewards = [] 38 | self.dones = [] 39 | self.params = [] 40 | self.default_task = self.config['default_task_omniverse'] 41 | self.GOID_limits = self.config['GOID_limits'] 42 | self.task_gen_method = self.config['task_generation_method'] 43 | self.initial_orientation = np.array([1,0]) # it is assumed that the initial 2D orientation of the robot and the dolly is [1,0] and that the applied is relative to this initial orientation 44 | self.NavACL_loss_func = torch.nn.BCELoss() 45 | self.adaptive_filtering_task_types = [Tasktype.RANDOM, Tasktype.EASY, Tasktype.FRONTIER] 46 | self.adaptive_filtering_task_probs = [0.15 , 0.425 , 0.425 ] 47 | self.device = train_device 48 | self.shared_elements = shared_elements 49 | self.agent_index = agent_index 50 | self.q_net = SoftQNetwork(self.config) 51 | self.q_net.to(Qricculum_device) 52 | 53 | if(not worker_instance): # this instance of NavACL is the one used to train the network i.e. it does not receive copies -> it produces the copies 54 | self.NavACLNet = nav_acl_network 55 | self.NavACL_optimizer = optim.Adam(self.NavACLNet.parameters(), lr=self.config['nav_acl_lr']) 56 | self.q_net = None 57 | self.env = None 58 | else: 59 | if(self.config['q_ricculum_learning']): 60 | if(self.config['use_AF_database']): ## use a predefined database for adaptive filtering method of nav_acl (only relevant for qricculum learning, bc. 
it prevents generating tasks wich are only used for creating the adaptive boundaries) 61 | self.load_AF_database(self.config['AF_database_path']+str(agent_index)+".npy") 62 | print("using qricculum learning for agent: ", agent_index, " with task AF_database: ", self.config['AF_database_path']+str(agent_index)+".npy") 63 | assert (shared_q_net is not None) 64 | assert (env is not None) 65 | self.env = env 66 | self.update_networks(nav_acl_network,shared_q_net,task_offset) 67 | else: 68 | self.update_networks(nav_acl_network,q_net=None) 69 | self.NavACLNet = nav_acl_network 70 | 71 | if(self.config['using_unity']): 72 | self.default_task = self.config['default_task_unity'] 73 | else: 74 | self.default_task = self.config['default_task_omniverse'] 75 | 76 | def update_networks(self,nav_acl_net, q_net, translation_offset=np.array([0,0,0])): 77 | if(self.config['q_ricculum_learning']): 78 | assert (q_net is not None) 79 | self.q_net.load_state_dict(q_net.state_dict()) 80 | for param in self.q_net.parameters(): 81 | param.requires_grad = False 82 | 83 | if(self.config['using_unity']): 84 | self.default_task = self.config['default_task_unity'] 85 | 86 | self.NavACLNet = NavACLNetwork(5,self.config['nav_acl_hidden_dim']) 87 | self.NavACLNet.load_state_dict(nav_acl_net.state_dict()) 88 | self.NavACLNet.to(self.device) 89 | 90 | def get_task_difficulty_measure(self, robot_task): 91 | success_probability = self.get_task_success_probability(robot_task) 92 | return (1-success_probability) 93 | 94 | def get_task_success_probability(self, robot_task): 95 | task_params_array = self.get_task_params_array(robot_task,normalize=self.config['normalize_tasks']) 96 | task_params_array = torch.FloatTensor(task_params_array).to(self.device) 97 | return self.NavACLNet(task_params_array) 98 | 99 | def normalize_task_params_array(self, task_params_array): 100 | # task_params_array = (task_params_array - self.task_mean) / (self.deviser+1e-3) 101 | r_rot = self.config['randomization_params']['robot_randomization']['rotation_rnd'][0] / 2 102 | d_rot = self.config['randomization_params']['dolly_randomization']['rotation_rnd'][0] / 2 103 | rot = r_rot + d_rot 104 | task_params_array[0] = np.interp(task_params_array[0], (self.config['randomization_params']['min_dist_dolly_robot'], self.config['randomization_params']['max_dist_dolly_robot']), (-1,1)) 105 | task_params_array[1] = np.interp(task_params_array[1], (self.config['randomization_params']['min_dist_dolly_obs'], self.config['randomization_params']['max_dist_dolly_obs']), (-1,1)) 106 | task_params_array[2] = np.interp(task_params_array[2], (self.config['randomization_params']['min_dist_robot_obs'], self.config['randomization_params']['max_dist_robot_obs']), (-1,1)) 107 | task_params_array[3] = np.interp(task_params_array[3], (-(rot*np.pi)/180, (rot*np.pi)/180), (-1,1)) 108 | task_params_array[4] = np.interp(task_params_array[4], (self.config['collision_penalty']-self.config['goal_reward'],self.config['goal_reward']), (-1,1)) 109 | return task_params_array 110 | 111 | def get_task_params_array(self, task,normalize=True): 112 | if(self.config['q_ricculum_learning']): 113 | task_params_array = np.array([self.compute_dolly_robot_distance(task), 114 | self.compute_dolly_min_distance(task), 115 | self.compute_robot_min_distance(task), 116 | self.compute_relative_rotation_robot_dolly(task), 117 | np.clip(task.q_value,self.config['collision_penalty']-self.config['goal_reward'],self.config['goal_reward'])]) 118 | else: 119 | task_params_array = 
np.array([self.compute_dolly_robot_distance(task), 120 | self.compute_dolly_min_distance(task), 121 | self.compute_robot_min_distance(task), 122 | self.compute_relative_rotation_robot_dolly(task), 123 | 0.1]) 124 | if normalize: 125 | task_params_array = self.normalize_task_params_array(task_params_array) 126 | return task_params_array 127 | 128 | def apply_offset_to_robot(self, robot_task, translation_offset): 129 | robot_task.robot_translation =robot_task.robot_translation + translation_offset 130 | return robot_task 131 | 132 | 133 | def apply_offset_to_dolly(self, robot_task, translation_offset): 134 | robot_task.dolly_translation += translation_offset 135 | return robot_task 136 | 137 | 138 | def create_randomized_task(self,translation_offset= np.array([0,0,0])): 139 | robot_task = RobotTask(self.default_task) 140 | robot_task.robot_translation =robot_task.robot_translation + translation_offset 141 | robot_task.dolly_translation += translation_offset 142 | robot_task.obstacle_translation += translation_offset 143 | robot_task.obstacle_1_translation += translation_offset 144 | robot_task.obstacle_2_translation += translation_offset 145 | robot_task.obstacle_3_translation += translation_offset 146 | robot_task.randomize_task(self.config['randomization_params']) 147 | return robot_task 148 | 149 | 150 | 151 | def sample_random_task(self,translation_offset= np.array([0,0,0])): 152 | robot_task = self.create_randomized_task(translation_offset=translation_offset) 153 | 154 | while self.check_if_task_is_valid(robot_task) is not True: 155 | robot_task = self.create_randomized_task(translation_offset=translation_offset) 156 | 157 | if(self.config['q_ricculum_learning']): 158 | robot_task.q_value = self.get_q_value_for_task(robot_task,translation_offset) 159 | # print("q value of task: ", self.get_task_params_array(robot_task,False), " is : ", robot_task.q_value) 160 | return robot_task 161 | 162 | def save_tasks_for_later(self,translation_offset, num_tasks=1000, save_location="/home/developer/Training_results/Qricculum_Learning"): 163 | task_list = [] 164 | for task in range(num_tasks): 165 | robot_task = self.create_randomized_task(translation_offset=translation_offset) 166 | obs_cam, obs_lidar = self.env.reset(robot_task) 167 | prev_action = np.zeros((2)) 168 | prev_reward = [self.config['step_penalty']] 169 | task_array = robot_task.get_task_array() 170 | task_list.append((obs_cam, obs_lidar,prev_action,prev_reward,task_array)) 171 | if (task % 1000 == 0 ): 172 | print("saved so far: ", task) 173 | task_array = np.array(task_list) 174 | np.save(save_location,task_array) 175 | 176 | 177 | def load_AF_database(self, AF_database_path): 178 | """ 179 | loads a task database from an .npy file and stores it as a list 180 | """ 181 | self.AF_database = np.load(AF_database_path, allow_pickle=True) 182 | self.AF_database = self.AF_database.tolist() # this way we can use random.sample(list, num_elements) 183 | print("using qricculum learning with database: ", AF_database_path) 184 | self.dtb_stack = self.create_dtb_stack() 185 | 186 | 187 | def observation_to_q_value(self, obs_cam, obs_lidar, prev_action, prev_reward): 188 | n_s_f = self.config['num_stacked_frames'] 189 | stacked_camera_obsrv = deque(maxlen=n_s_f) 190 | transitions = deque(maxlen=n_s_f) 191 | # start with a fresh state 192 | for i in range(self.config['num_stacked_frames']): 193 | stacked_camera_obsrv.append(obs_cam) 194 | if(self.config['use_snail_mode']): 195 | transitions.append((obs_cam, obs_lidar,prev_action,prev_reward)) 196 | 
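# --- editor's note (added for clarity) ---
# On a fresh reset there is no frame history yet, so the single camera observation is
# repeated num_stacked_frames times to build the initial stacked state; with the
# default config (4 stacked 3-channel 80x80 frames) the reshape below yields a
# (12, 80, 80) tensor, matching the input shape the Q-network was trained on.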
197 | stacked_camera_state = np.asarray(stacked_camera_obsrv).reshape((self.env.observtion_shape[0]*n_s_f,self.env.observtion_shape[1],self.env.observtion_shape[2])) 198 | cam_tensor = torch.FloatTensor(stacked_camera_state).unsqueeze(0).to(Qricculum_device) 199 | lidar_tensor = torch.FloatTensor(obs_lidar).unsqueeze(0).to(Qricculum_device) 200 | prev_action = torch.FloatTensor(prev_action).unsqueeze(0).to(Qricculum_device) 201 | reward = torch.FloatTensor(prev_reward).unsqueeze(0).to(Qricculum_device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 202 | with torch.no_grad(): 203 | q_value = self.q_net(cam_tensor,lidar_tensor, prev_action, reward).detach().cpu().numpy().flatten() 204 | del stacked_camera_state, cam_tensor, lidar_tensor, prev_action, reward, obs_cam, obs_lidar, stacked_camera_obsrv 205 | return q_value[0] 206 | 207 | 208 | 209 | 210 | 211 | 212 | def get_q_value_for_task(self,robot_task,translation_offset): 213 | obs_cam, obs_lidar = self.env.reset(robot_task) 214 | prev_action = np.zeros(8) 215 | prev_reward = np.ones(4)*self.config['step_penalty'] 216 | # self.show_state(obs_cam,True,string_extra=str(translation_offset)) 217 | q = self.observation_to_q_value(obs_cam, obs_lidar,prev_action, prev_reward) 218 | return q 219 | 220 | def show_state(self,state_tensor, save=False, string_extra=""): 221 | state_shape= (84,84,3) 222 | state_tensor = state_tensor / 2**8 223 | pic0 = np.array(state_tensor.reshape(state_shape).astype(np.float32)) 224 | 225 | fig, (ax1) = plt.subplots(1, 1) 226 | ax1.imshow(pic0) 227 | if(save): 228 | path = "/home/developer/Pictures"+string_extra+str(current_milli_time())+".png" 229 | print("saving image as: ", path) 230 | plt.imsave(path,pic0) 231 | else: 232 | plt.show() 233 | 234 | def generate_random_task(self, translation_offset=np.array([0,0,0])): 235 | """ 236 | returns a randomized RobotTask: 237 | 238 | based on a default task given in NAV_ACL_hyperparameters.json 239 | """ 240 | if self.task_gen_method == "GOID": 241 | return self.generate_random_task_GOID(translation_offset) 242 | 243 | if self.task_gen_method == "AF": 244 | return self.generate_random_task_AF(translation_offset) 245 | 246 | 247 | 248 | def generate_random_task_GOID(self, translation_offset): 249 | while True: 250 | robot_task = self.sample_random_task(translation_offset) 251 | difficulty = self.get_task_difficulty_measure(robot_task).cpu() 252 | if ( (difficulty > self.GOID_limits['lower_limit'] ) and (difficulty < self.GOID_limits['upper_limit'])) : 253 | return robot_task 254 | 255 | def generate_random_task_AF(self, translation_offset): 256 | if(self.config['only_random_tasks']): 257 | task_type = Tasktype.RANDOM 258 | else: 259 | task_type = np.random.choice(self.adaptive_filtering_task_types, 1, p=self.adaptive_filtering_task_probs)[0] # adaptive sampling of the task type 260 | return self.get_dynamic_task(task_type,translation_offset) 261 | 262 | def get_dynamic_task(self, task_type, translation_offset): 263 | """ 264 | This is the main algorithm for nav-acl-q 265 | """ 266 | 267 | 268 | if(self.config['use_AF_database']): 269 | mu, sigma = self.fast_create_adaptive_boundaries_mu_sig() 270 | else: 271 | mu, sigma = self.create_adaptive_boundaries() 272 | 273 | beta = self.config['adaptive_filtering_params']['nav_beta'] 274 | gamma_low = self.config['adaptive_filtering_params']['nav_gamma_low'] 275 | gamma_hi = self.config['adaptive_filtering_params']['nav_gamma_hi'] 276 | omega = 
self.config['adaptive_filtering_params']['nav_P_omega'] 277 | 278 | 279 | for trial in range(self.config['nav_acl_max_AF_task_samples']): 280 | if(self.config['runtimetasks_from_database'] and self.config['use_AF_database']): 281 | robot_task = self.sample_task_from_database(self.AF_database) 282 | else: 283 | robot_task = self.sample_random_task(translation_offset) 284 | succ_prob = self.get_task_success_probability(robot_task).cpu() 285 | time.sleep(0.01) 286 | 287 | if task_type == Tasktype.EASY: 288 | if (mu + beta*sigma) < 1: 289 | if (succ_prob > (mu + beta*sigma)) or (succ_prob > omega): 290 | robot_task.task_type = Tasktype.EASY 291 | return robot_task 292 | else: 293 | if (succ_prob > mu): 294 | robot_task.task_type = Tasktype.EASY 295 | return robot_task 296 | if task_type == Tasktype.FRONTIER: 297 | if (mu - gamma_low*sigma) < succ_prob < (mu + gamma_hi*sigma): 298 | robot_task.task_type = Tasktype.FRONTIER 299 | return robot_task 300 | if task_type == Tasktype.RANDOM: 301 | robot_task.task_type = Tasktype.RANDOM 302 | return robot_task 303 | print("TOOK too many trials to find task of type: ", task_type, " with boundaries mu: ", mu, " sigma: ", sigma) 304 | robot_task.task_type = Tasktype.RANDOM 305 | return robot_task 306 | 307 | 308 | def create_adaptive_boundaries(self, translation_offset=np.array([0,0,0])): 309 | tasks_parameters = [] 310 | if(self.config['use_AF_database']): 311 | tasks_parameters = self.get_task_parameters_for_AF_database() 312 | 313 | else: 314 | for i in range(self.config['nav_acl_AF_boundaries_task_samples']): 315 | tasks_parameters.append(self.get_task_params_array(self.sample_random_task(translation_offset=translation_offset),self.config['normalize_tasks'])) 316 | 317 | params = torch.FloatTensor(tasks_parameters).to(self.device) 318 | predictions = self.NavACLNet(params).cpu().detach().numpy() 319 | mu, sigma = scipy.stats.distributions.norm.fit(predictions) 320 | return mu, sigma 321 | 322 | 323 | def get_task_parameters_for_AF_database(self): 324 | tasks_parameters = [] 325 | for i in range(self.config['nav_acl_AF_boundaries_task_samples']): 326 | task = self.sample_task_from_database(self.AF_database) 327 | tasks_parameters.append(self.get_task_params_array(task,self.config['normalize_tasks'])) # and append it to the task array which will be used for creating the adaptive boundaries 328 | return tasks_parameters 329 | 330 | def sample_task_from_database(self,database): 331 | task_sample = random.sample(database, 1)[0] 332 | obs_cam, obs_lidar,_,_,task_array = task_sample 333 | prev_action = np.zeros(8) 334 | prev_reward = np.ones(4)*self.config['step_penalty'] 335 | q = self.observation_to_q_value(obs_cam,obs_lidar,prev_action,prev_reward) # pass observation through the q network to obtain currently estimated q value 336 | task = RobotTask(self.default_task) # create a task instance 337 | task.from_task_array(task_array) # initialize according to the task that was sampled from the database 338 | task.q_value = q # replace the q 339 | return task 340 | 341 | def compute_dolly_robot_distance(self, task): 342 | return distance.euclidean(task.robot_translation, task.dolly_translation) 343 | 344 | def compute_obstacle_distance_dolly(self, task): 345 | distances = np.array([distance.euclidean(task.obstacle_2_translation, task.dolly_translation), 346 | distance.euclidean(task.obstacle_3_translation, task.dolly_translation)]) 347 | return np.min(distances) 348 | 349 | def compute_obstacle_distance_robot(self, task): 350 | distances = 
np.array([distance.euclidean(task.obstacle_translation, task.robot_translation), 351 | distance.euclidean(task.obstacle_1_translation, task.robot_translation)]) 352 | return np.min(distances) 353 | 354 | def compute_path_complexity(self, task): 355 | return 0.1 356 | 357 | def check_if_task_is_valid(self,task): 358 | # first check is no object is too close to the dolly: 359 | if(self.compute_dolly_min_distance(task) < self.config['randomization_params']['min_dist_dolly_obs']): 360 | return False 361 | elif(self.compute_robot_min_distance(task) < self.config['randomization_params']['min_dist_robot_obs']): 362 | return False 363 | else: 364 | return True 365 | 366 | def compute_robot_min_distance(self,task): 367 | return self.compute_min_distance(task, task.robot_translation) 368 | def compute_dolly_min_distance(self,task): 369 | return self.compute_min_distance(task, task.dolly_translation) 370 | 371 | def compute_min_distance(self,task, target_translation): 372 | distances = np.zeros((4)) 373 | obstacles = task.get_obstacle_translations_array() 374 | for i in range(4): 375 | distances[i] = distance.euclidean(obstacles[i], target_translation) 376 | min_dist = np.min(distances) 377 | 378 | if min_dist > 50: 379 | return 0 380 | else : 381 | return np.min(distances) 382 | 383 | def compute_relative_rotation_robot_dolly(self, task, as_degrees=False): 384 | Robot_orientation = rot_vec(self.initial_orientation,task.robot_rotation[0]) 385 | Dolly_orientation = rot_vec(self.initial_orientation,task.dolly_rotation[0]) 386 | relative_rotation_rad = rel_rot(Robot_orientation,Dolly_orientation) 387 | if(as_degrees): 388 | return relative_rotation_rad * (180/np.pi) 389 | else: 390 | return relative_rotation_rad 391 | 392 | 393 | 394 | def train(self,robot_task,label,batch_mode = False): 395 | self.NavACL_optimizer.zero_grad() 396 | 397 | if(self.config['nav_acl_batch_mode']): 398 | params = [] 399 | labels = [] 400 | for t in range(self.config['nav_acl_batch_size']): 401 | if(type(robot_task[t])== RobotTask): 402 | params.append(self.get_task_params_array(robot_task[t],self.config['normalize_tasks'])) 403 | else: 404 | params.append(self.normalize_task_params_array(robot_task[t])) 405 | labels.append(label[t]) 406 | 407 | task_params_array = torch.FloatTensor(params).to(self.device) 408 | label = torch.FloatTensor(labels).flatten().to(self.device) 409 | 410 | 411 | else: 412 | if(type(robot_task)== RobotTask): 413 | task_params_array = self.get_task_params_array(robot_task,self.config['normalize_tasks']) 414 | else: 415 | task_params_array = self.normalize_task_params_array(robot_task) 416 | 417 | task_params_array = torch.FloatTensor(task_params_array).flatten().to(self.device) 418 | label = torch.FloatTensor(label).flatten().to(self.device) 419 | 420 | 421 | prediction = self.NavACLNet(task_params_array) 422 | loss = self.NavACL_loss_func(prediction, label) 423 | 424 | loss.backward() 425 | self.NavACL_optimizer.step() 426 | return prediction.detach().cpu().numpy() 427 | 428 | def batch_train(self,robot_tasks, labels): 429 | self.NavACL_optimizer.zero_grad() 430 | 431 | params = [] 432 | _labels = [] 433 | for t in range(self.config['nav_acl_batch_size']): 434 | if(type(robot_tasks[t])== RobotTask): 435 | params.append(self.get_task_params_array(robot_tasks[t],self.config['normalize_tasks'])) 436 | else: 437 | params.append(robot_tasks[t]) 438 | 439 | _labels.append(labels[t]) 440 | 441 | 442 | task_params_array = torch.FloatTensor(params).to(self.device) 443 | label = 
torch.FloatTensor(_labels).flatten().to(self.device) 444 | prediction = self.NavACLNet(task_params_array) 445 | # print("robot_tasks:" , robot_tasks) 446 | # print("predictions: ", prediction) 447 | loss = self.NavACL_loss_func(prediction, label.view((prediction.shape))) # labels are [batchsize] abd predictions are [batchsize,1] thus labels have to be viewed as [8,1]! This is very important, otherwise it will only use one of the parameters for esitmation 448 | 449 | loss.backward() 450 | self.NavACL_optimizer.step() 451 | return prediction.detach().cpu().numpy() 452 | 453 | def create_dtb_stack(self): 454 | af_database = np.load(self.config['AF_database_path']+str(self.agent_index)+".npy", allow_pickle=True) 455 | af_database = np.array(af_database).reshape(af_database.shape[0],af_database.shape[1]) 456 | dtb_stack = [] 457 | for t_i in range(af_database.shape[0]): 458 | image_stack = np.array([af_database[t_i][0],af_database[t_i][0],af_database[t_i][0],af_database[t_i][0]]).reshape(12,80,80) 459 | lidar = af_database[t_i][1] 460 | action = np.zeros((8)) 461 | reward = np.ones(4)*self.config['step_penalty'] 462 | task_array = af_database[t_i][4] 463 | robot_task = RobotTask(self.config['default_task_unity']) 464 | robot_task.from_task_array(task_array) 465 | task_params_array = self.get_task_params_array(robot_task,self.config['normalize_tasks']) 466 | task_tuple = (image_stack, lidar, action, reward, task_params_array) 467 | dtb_stack.append(task_tuple) 468 | return dtb_stack 469 | 470 | def fast_create_adaptive_boundaries_mu_sig(self): 471 | c, l, a, r, tsk = map(np.stack, zip(*random.sample(self.dtb_stack, self.config['nav_acl_AF_boundaries_task_samples']))) 472 | 473 | cam_tensor = torch.FloatTensor(c).to(Qricculum_device) 474 | lidar_tensor = torch.FloatTensor(l).to(Qricculum_device) 475 | prev_action = torch.FloatTensor(a).to(Qricculum_device) 476 | prev_reward = torch.FloatTensor(r).to(Qricculum_device) # prev_reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; 477 | with torch.no_grad(): 478 | predicted_q_values = self.q_net(cam_tensor,lidar_tensor, prev_action, prev_reward).detach().cpu().numpy() 479 | predicted_q_values = np.interp(predicted_q_values, (self.config['collision_penalty']-self.config['goal_reward'],self.config['goal_reward']), (-1,1)) 480 | 481 | tsk[:,4] = predicted_q_values[:,0] 482 | # print(tsk) 483 | tsk_tensor = torch.FloatTensor(tsk).to(self.device) 484 | predictions = self.NavACLNet(tsk_tensor).detach().cpu().numpy() 485 | mu, sigma = scipy.stats.distributions.norm.fit(predictions) 486 | return mu, sigma -------------------------------------------------------------------------------- /pretrain_CAE.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from torch.nn.modules.activation import LeakyReLU 4 | from utils import initialize_weights_he 5 | 6 | # The MNIST datasets are hosted on yann.lecun.com that has moved under CloudFlare protection 7 | # Run this script to enable the datasets download 8 | # Reference: https://github.com/pytorch/vision/issues/1938 9 | from six.moves import urllib 10 | opener = urllib.request.build_opener() 11 | opener.addheaders = [('User-agent', 'Mozilla/5.0')] 12 | urllib.request.install_opener(opener) 13 | from Networks import ResNetBlock 14 | import torch 15 | import numpy as np 16 | from torchvision import datasets 17 | import torchvision.transforms as transforms 18 | import cv2 19 | from zu_resnet 
import ResNetEncoder 20 | 21 | 22 | # define the NN architecture 23 | class ConvAutoencoder_NAV2(nn.Module): 24 | def __init__(self, imgChannels=1, zDim=512,featureDim=12*10*10, fix_params=False): 25 | super(ConvAutoencoder_NAV2, self).__init__() 26 | self.featureDim = featureDim 27 | ## encoder layers ## 28 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 29 | self.encode = nn.Sequential( 30 | nn.Conv2d(imgChannels, 32, 5, padding=2) , 31 | nn.BatchNorm2d(32), 32 | nn.ReLU(), 33 | ResNetBlock(32,64,3), 34 | ResNetBlock(64,128,3), 35 | ResNetBlock(128,256,3), 36 | ResNetBlock(256,128,3), # 64x5x5 = 3200 feature vector 37 | ).apply(initialize_weights_he) 38 | 39 | ## decoder layers ## 40 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 41 | self.decode = nn.Sequential( 42 | nn.ConvTranspose2d(128, 256, 2, stride=2), 43 | nn.ReLU(), 44 | nn.ConvTranspose2d(256, 128, 2, stride=2), 45 | nn.ReLU(), 46 | nn.ConvTranspose2d(128, 64, 2, stride=2), 47 | nn.ReLU(), 48 | nn.ConvTranspose2d(64, imgChannels, 2, stride=2), 49 | ).apply(initialize_weights_he) 50 | 51 | 52 | def fix_params(self): 53 | for param in self.encode.parameters(): 54 | param.requires_grad = False 55 | for param in self.decode.parameters(): 56 | param.requires_grad = False 57 | 58 | def encode_(self, x): 59 | return self.encode(x) 60 | 61 | def forward(self, x): 62 | x = self.encode(x) 63 | # print(x.shape) 64 | # x = x.reshape(64,5,5) 65 | x = self.decode(x) 66 | 67 | x = torch.sigmoid(x) 68 | return x 69 | 70 | 71 | # define the NN architecture 72 | class ConvAutoencoder_NAV3(nn.Module): 73 | def __init__(self, imgChannels=1, zDim=512,featureDim=12*10*10, fix_params=False): 74 | super(ConvAutoencoder_NAV3, self).__init__() 75 | self.featureDim = featureDim 76 | ## encoder layers ## 77 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 78 | self.encode = ResNetEncoder(12,blocks_sizes=[64,128,256,384],deepths=[2,2,2,2]) 79 | 80 | ## decoder layers ## 81 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 82 | self.decode = nn.Sequential( 83 | nn.ConvTranspose2d(384, 512, 2, stride=2), 84 | nn.ReLU(), 85 | nn.ConvTranspose2d(512, 256, 2, stride=2), 86 | nn.ReLU(), 87 | nn.ConvTranspose2d(256, 128, 2, stride=2), 88 | nn.ReLU(), 89 | nn.ConvTranspose2d(128, 64, 2, stride=2), 90 | nn.ReLU(), 91 | nn.ConvTranspose2d(64, imgChannels, 2, stride=2) 92 | ).apply(initialize_weights_he) 93 | 94 | 95 | def fix_params(self): 96 | for param in self.encode.parameters(): 97 | param.requires_grad = False 98 | for param in self.decode.parameters(): 99 | param.requires_grad = False 100 | 101 | def encode_(self, x): 102 | return self.encode(x) 103 | 104 | def forward(self, x): 105 | x = self.encode(x) 106 | # print(x.shape) 107 | # x = x.reshape(64,5,5) 108 | x = self.decode(x) 109 | 110 | x = torch.sigmoid(x) 111 | return x 112 | 113 | 114 | # define the NN architecture 115 | class ConvAutoencoder_NAV4(nn.Module): 116 | def __init__(self, imgChannels=1, zDim=512,featureDim=12*10*10, fix_params=False): 117 | super(ConvAutoencoder_NAV4, self).__init__() 118 | self.featureDim = featureDim 119 | ## encoder layers ## 120 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 121 | self.encode = nn.Sequential( 122 | ResNetBlock(imgChannels,64,3), 123 | ResNetBlock(64,128,3), 124 | ResNetBlock(128,256,3), 125 | ResNetBlock(256,128,3), # 64x5x5 = 3200 feature vector 126 | 
).apply(initialize_weights_he) 127 | 128 | ## decoder layers ## 129 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 130 | self.decode = nn.Sequential( 131 | nn.ConvTranspose2d(128, 256, 2, stride=2), 132 | nn.ReLU(), 133 | nn.ConvTranspose2d(256, 128, 2, stride=2), 134 | nn.ReLU(), 135 | nn.ConvTranspose2d(128, 64, 2, stride=2), 136 | nn.ReLU(), 137 | nn.ConvTranspose2d(64, imgChannels, 2, stride=2), 138 | ).apply(initialize_weights_he) 139 | 140 | 141 | def fix_params(self): 142 | for param in self.encode.parameters(): 143 | param.requires_grad = False 144 | for param in self.decode.parameters(): 145 | param.requires_grad = False 146 | 147 | def encode_(self, x): 148 | return self.encode(x) 149 | 150 | def forward(self, x): 151 | x = self.encode(x) 152 | # print(x.shape) 153 | # x = x.reshape(64,5,5) 154 | x = self.decode(x) 155 | 156 | x = torch.sigmoid(x) 157 | return x 158 | 159 | 160 | 161 | # define the NN architecture 162 | class ConvAutoencoder_NAV6(nn.Module): 163 | def __init__(self, imgChannels=1, zDim=1024,featureDim=64*5*5, fix_params=False): 164 | super(ConvAutoencoder_NAV6, self).__init__() 165 | self.featureDim = featureDim 166 | ## encoder layers ## 167 | # https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout 168 | self.encode = nn.Sequential( 169 | ResNetBlock(imgChannels,64,3), 170 | ResNetBlock(64,128,3), 171 | ResNetBlock(128,256,3), 172 | ResNetBlock(256,64,3), # 64x5x5 = 3200 feature vector, 173 | nn.Flatten(), 174 | nn.Linear(featureDim,zDim) 175 | ).apply(initialize_weights_he) 176 | 177 | self. FC_1 = nn.Linear(zDim,featureDim) 178 | ## decoder layers ## 179 | ## a kernel of 2 and a stride of 2 will increase the spatial dims by 2 180 | self.decode = nn.Sequential( 181 | nn.ConvTranspose2d(64, 128, 2, stride=2), 182 | nn.ReLU(), 183 | nn.ConvTranspose2d(128, 256, 2, stride=2), 184 | nn.ReLU(), 185 | nn.ConvTranspose2d(256, 128, 2, stride=2), 186 | nn.ReLU(), 187 | nn.ConvTranspose2d(128, 64, 2, stride=2), 188 | 189 | ).apply(initialize_weights_he) 190 | 191 | 192 | def fix_params(self): 193 | for param in self.encode.parameters(): 194 | param.requires_grad = False 195 | for param in self.decode.parameters(): 196 | param.requires_grad = False 197 | 198 | def encode_(self, x): 199 | return self.encode(x) 200 | 201 | def forward(self, x): 202 | x = self.encode(x) 203 | 204 | x = x.view(-1, self.fedim) 205 | x = self.decode(x) 206 | 207 | x = torch.sigmoid(x) 208 | return x 209 | 210 | 211 | if __name__ == '__main__': 212 | 213 | 214 | GPU = True 215 | device_idx = 0 216 | if GPU: 217 | device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") 218 | else: 219 | device = torch.device("cpu") 220 | # convert data to torch.FloatTensor 221 | transform = transforms.ToTensor() 222 | 223 | 224 | channels = 3 225 | n_s_f = 4 226 | inputshape = (80,80,channels) 227 | cv2_resz = (80,80) 228 | imshape = (channels,*cv2_resz) 229 | show_shape = (*cv2_resz,channels) 230 | 231 | 232 | model = ConvAutoencoder_NAV4(imgChannels=channels*n_s_f) 233 | # model.load_state_dict(torch.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/final/Models/1/VAE_20")) 234 | model.load_state_dict(torch.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/VAE_80803_615")) 235 | model.eval() 236 | model.to(device) 237 | 238 | train_images = [] 239 | test_images = [] 240 | 241 | moving_database = 
np.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/hoffentlich/VAE_dtb_12_8080_final_hoffentlich.npy") 242 | # moving_database = np.load("/home/developer/VAE_dtb_12_128128_final.npy") 243 | # moving_database = np.load("/home/developer/Training_results/Qricculum_Learning/big_and_small/3/VAE_dtb_3_8080.npy") 244 | 245 | print(moving_database.shape) 246 | print(moving_database[0]) 247 | 248 | stacked_images = [] 249 | 250 | train_data = (moving_database[0:45000]/ 2**8).astype(np.float32) 251 | test_data = (moving_database[45000:] / 2**8).astype(np.float32) 252 | 253 | print(train_data.shape) 254 | print(test_data.shape) 255 | 256 | # Create training and test dataloaders 257 | 258 | num_workers = 10 259 | # how many samples per batch to load 260 | batch_size = 32 261 | 262 | # prepare data loaders 263 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, num_workers=num_workers,shuffle=True) 264 | test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=num_workers,shuffle=True) 265 | 266 | import matplotlib.pyplot as plt 267 | 268 | 269 | infostring = "net: \n" + str(model) + " \n \n \n" 270 | 271 | print(infostring) 272 | filename = "/home/developer/Training_results/VA/"+"Infofile.txt" 273 | text_file = open(filename, "w") 274 | n = text_file.write(infostring) 275 | text_file.close() 276 | learning_rate = 0.01 277 | 278 | # specify loss function 279 | criterion = nn.MSELoss() 280 | 281 | # specify loss function 282 | # torch.optim.Adam 283 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 284 | 285 | # from torch.optim.lr_scheduler import ExponentialLR 286 | from torch.optim.lr_scheduler import MultiStepLR 287 | 288 | # scheduler1 = ExponentialLR(optimizer, gamma=0.90) 289 | scheduler2 = MultiStepLR(optimizer, milestones=[30,50,70,90], gamma=0.25) 290 | 291 | # number of epochs to train the model 292 | n_epochs = 100 293 | 294 | # for epoch in range(1, n_epochs+1): 295 | # # monitor training loss 296 | # train_loss = 0.0 297 | # test_loss = 0.0 298 | 299 | # ################## 300 | # # train the model # 301 | # ################## 302 | # for data in train_loader: 303 | # # _ stands in for labels, here 304 | # # no need to flatten images 305 | # images = data 306 | # images = images.to(device) 307 | # # clear the gradients of all optimized variables 308 | # optimizer.zero_grad() 309 | # # forward pass: compute predicted outputs by passing inputs to the model 310 | # outputs = model(images).to(device) 311 | 312 | # # output_decoder = decoder(images) 313 | # # print(output_decoder) 314 | # # print(output_decoder.shape) 315 | # # calculate the loss 316 | # loss = criterion(outputs, images) 317 | # # backward pass: compute gradient of the loss with respect to model parameters 318 | # loss.backward() 319 | # # perform a single optimization step (parameter update) 320 | # optimizer.step() 321 | # # update running training loss 322 | # train_loss += loss.item()*images.size(0) 323 | 324 | # # print avg training statistics 325 | # train_loss = train_loss/len(train_loader) 326 | # print('Epoch: {} \tTraining Loss: {:.6f}'.format( 327 | # epoch, 328 | # train_loss 329 | # )) 330 | # for test_i_data in test_loader: 331 | # # _ stands in for labels, here 332 | # # no need to flatten images 333 | # test_images = test_i_data 334 | # test_images = test_images.to(device) 335 | # # clear the gradients of all optimized variables 336 | # with torch.no_grad(): 337 | # # forward pass: compute predicted outputs by 
passing inputs to the model 338 | # outputs = model(test_images).to(device) 339 | 340 | # loss = criterion(outputs, test_images) 341 | 342 | # test_loss += loss.item()*test_images.size(0) 343 | # print('Epoch: {} \tTesting Loss: {:.6f}'.format( 344 | # epoch, 345 | # test_loss 346 | # )) 347 | # torch.save(model.state_dict(), "/home/developer/Training_results/VA/VAE_RESNET18"+str(epoch)) 348 | # # scheduler1.step() 349 | # scheduler2.step() 350 | 351 | 352 | # obtain one batch of test images 353 | dataiter = iter(test_loader) 354 | while True: 355 | show_images = dataiter.next() 356 | show_images = show_images.to(device) 357 | # get sample outputs 358 | output = model(show_images) 359 | # prep images for display 360 | show_images = show_images.detach().cpu().numpy() 361 | 362 | # output is resized into a batch of iages 363 | output = output.view(batch_size,n_s_f*channels,*cv2_resz) 364 | # use detach when it's an output that requires_grad 365 | output = output.detach().cpu().numpy() 366 | 367 | print(output.shape) 368 | print(show_images.shape) 369 | 370 | # torch.save(model.state_dict(), "/home/developer/Training_results/VAE") 371 | 372 | # plot the first ten input images and then reconstructed images 373 | fig, axes = plt.subplots(nrows=2, ncols=4, sharex=True, sharey=True, figsize=(20,20)) 374 | 375 | axes[0][0].imshow(show_images[0][0:3].reshape(show_shape)) 376 | axes[0][0].get_xaxis().set_visible(False) 377 | axes[0][0].get_yaxis().set_visible(False) 378 | axes[0][1].imshow(show_images[0][3:6].reshape(show_shape)) 379 | axes[0][1].get_xaxis().set_visible(False) 380 | axes[0][1].get_yaxis().set_visible(False) 381 | axes[0][2].imshow(show_images[0][6:9].reshape(show_shape)) 382 | axes[0][2].get_xaxis().set_visible(False) 383 | axes[0][2].get_yaxis().set_visible(False) 384 | axes[0][3].imshow(show_images[0][9:12].reshape(show_shape)) 385 | axes[0][3].get_xaxis().set_visible(False) 386 | axes[0][3].get_yaxis().set_visible(False) 387 | 388 | 389 | axes[1][0].imshow(output[0][0:3].reshape(show_shape)) 390 | axes[1][0].get_xaxis().set_visible(False) 391 | axes[1][0].get_yaxis().set_visible(False) 392 | axes[1][1].imshow(output[0][3:6].reshape(show_shape)) 393 | axes[1][1].get_xaxis().set_visible(False) 394 | axes[1][1].get_yaxis().set_visible(False) 395 | axes[1][2].imshow(output[0][6:9].reshape(show_shape)) 396 | axes[1][2].get_xaxis().set_visible(False) 397 | axes[1][2].get_yaxis().set_visible(False) 398 | axes[1][3].imshow(output[0][9:12].reshape(show_shape)) 399 | axes[1][3].get_xaxis().set_visible(False) 400 | axes[1][3].get_yaxis().set_visible(False) 401 | 402 | # input images on top row, reconstructions on bottom 403 | # for show_images, row in zip([show_images, output], axes): 404 | # for img, ax in zip(show_images, row): 405 | # ax.imshow(img[0:3].reshape(show_shape)) 406 | # ax.get_xaxis().set_visible(False) 407 | # ax.get_yaxis().set_visible(False) 408 | 409 | plt.show() -------------------------------------------------------------------------------- /process_utils.py: -------------------------------------------------------------------------------- 1 | from numpy.core.shape_base import stack 2 | from RobotTask import RobotTask 3 | import json 4 | 5 | from scipy.signal.ltisys import StateSpace 6 | from VisualTaskEnv import VisualTaskEnv 7 | from isaac import * 8 | import time 9 | import numpy as np 10 | from threading import Thread 11 | from utils import * 12 | import torch 13 | from PyController import PyController 14 | from nav_acl import Nav_ACL 15 | from collections import 
deque 16 | import copy 17 | from Networks import * 18 | 19 | 20 | def start_gather_experience(agent_index, shared_elements,config): 21 | ########## start isaac app ####################################### 22 | app = Application("apps/py_sac_nav_acl/py_sac_nav_acl.app.json") 23 | isaac_thread = Thread(target=start_isaac_application,args=(app,agent_index,config,)) 24 | isaac_thread.start() 25 | ###########environment and alrogithm setup######################## 26 | time.sleep(1) 27 | env = VisualTaskEnv(app,agent_index, "py_controller_"+str(agent_index),config) 28 | torch.manual_seed((23345*agent_index)) 29 | np.random.seed((23345*agent_index)) 30 | env.seed((23345*agent_index)) 31 | ##########run episodes'############################################ 32 | # run_test_episodes(env, agent_index,shared_elements,config,) 33 | run_episodes(env, agent_index,shared_elements,config,) 34 | app.stop() 35 | 36 | def run_test_episodes(env, agent_index,shared_elements, config): 37 | Full_Path = "/home/developer/Training_results/Visual/10/08/2021/16:05:00/"+"_CHECKPOINT_" + "4_19681" 38 | Full_navn = "/home/developer/Training_results/Visual/10/08/2021/16:05:00/"+"_nav_acl_network_" + "4_19681" 39 | 40 | No_AE_Path= "/home/developer/Training_results/Visual/10/05/2021/11:19:13/"+"_CHECKPOINT_" + "62_541207" 41 | No_AE_navn= "/home/developer/Training_results/Visual/10/05/2021/11:19:13/"+"_nav_acl_network_" + "62_541207" 42 | 43 | RND_Path = "/home/developer/Training_results/Visual/09/30/2021/09:44:53/"+"_CHECKPOINT_" + "22_162675" 44 | RND_navn = "/home/developer/Training_results/Visual/09/30/2021/09:44:53/"+"_nav_acl_network_" + "22_162675" 45 | nav_list = [Full_navn,No_AE_navn,RND_navn] 46 | check_list = [Full_Path,No_AE_Path,RND_Path] 47 | 48 | names_list = ["FULL", "No_AE", "RND"] 49 | nav_acl_network = NavACLNetwork(5,config['nav_acl_hidden_dim']).to(train_device) 50 | for model_index in range(len(names_list)): 51 | print("load model: ", check_list[model_index]) 52 | checkpoint=torch.load(check_list[model_index]) 53 | print("load navacl model: ", nav_list[model_index]) 54 | nav_check =torch.load(nav_list[model_index]) 55 | nav_acl_network.load_state_dict(nav_check) 56 | print("hat geklappt") 57 | 58 | for model_index in range(len(names_list)): 59 | checkpoint=torch.load(check_list[model_index]) 60 | nav_check =torch.load(nav_list[model_index]) 61 | nav_acl_network.load_state_dict(nav_check) 62 | soft_q_net1 = SoftQNetwork(config).to(train_device) 63 | soft_q_net1.load_state_dict(checkpoint['soft_q_net1']) 64 | 65 | model_clone = PolicyNetwork(config) 66 | model_clone.load_state_dict(checkpoint['policy_net']) 67 | model_clone.to(inference_device) 68 | 69 | if( (config['q_ricculum_learning']) or config['create_new_AF_database']): 70 | nav_acl = Nav_ACL(nav_acl_network,config,shared_q_net=soft_q_net1,env=env,task_offset=np.array([0,0,0]),agent_index=agent_index) 71 | else: 72 | nav_acl = Nav_ACL(nav_acl_network,config,agent_index=agent_index) 73 | for param in model_clone.parameters(): 74 | param.requires_grad = False 75 | test_grid = generate_test_grid() 76 | test_angles = [0,45,-45,90,-90] 77 | # print(test_grid) 78 | 79 | for angle in test_angles: 80 | for trial in range(0,4): 81 | print("agent: ", agent_index, " test case: ", angle, trial, "model: ", names_list[model_index]) 82 | results = [] 83 | for task_offset in test_grid: 84 | orig_offset = task_offset 85 | task_config = config['default_test_unity'] 86 | task_config['robot_pose']['rotation_yaw'][0] = angle 87 | task = RobotTask(task_config) # create 
a task instance from a config dict 88 | task_offset[1] += (agent_index*40) 89 | task = nav_acl.apply_offset_to_robot(task, task_offset) 90 | task = nav_acl.apply_offset_to_dolly(task, [0,(agent_index*40),0]) 91 | 92 | task.set_q_value(nav_acl.get_q_value_for_task(task,task_offset)) 93 | num_steps, task_result, pred_prob = run_episode(env, agent_index, model_clone, nav_acl,shared_elements, 1, 0, config, test_task=task) 94 | results.append([orig_offset,angle,nav_acl.get_task_params_array(task,normalize=False),agent_index,num_steps,task_result, pred_prob]) 95 | results_array = np.asarray(results) 96 | np.save("/home/developer/Testing_results/"+names_list[model_index]+"_gt_agent_"+str(agent_index)+"_trial_" + str(trial)+"_yaw_"+str(angle), results_array) 97 | 98 | 99 | 100 | def run_episodes(env, agent_index, shared_elements, config): 101 | max_episodes = config['max_episodes'] 102 | num_Agents = config['num_Agents'] 103 | num_episodes = int(max_episodes / num_Agents) 104 | steps = 0 105 | robot_grid = generate_robot_grid() 106 | task_offset = robot_grid[agent_index] 107 | model_clone = PolicyNetwork(config) 108 | 109 | model_clone.load_state_dict(shared_elements['policy_network'].state_dict()) 110 | model_clone.to(inference_device) 111 | 112 | for param in model_clone.parameters(): 113 | param.requires_grad = False 114 | 115 | if( (config['q_ricculum_learning']) or config['create_new_AF_database']): 116 | nav_acl = Nav_ACL(shared_elements['nav_acl_network'],config,shared_q_net=shared_elements['q_network'],env=env,task_offset=task_offset,agent_index=agent_index) 117 | else: 118 | nav_acl = Nav_ACL(shared_elements['nav_acl_network'],config,agent_index=agent_index) 119 | 120 | if( config['create_new_AF_database']): 121 | nav_acl.save_tasks_for_later(task_offset,config['new_AF_database_size'],"/home/developer/Training_results/Qricculum_Learning/"+str(agent_index)) 122 | print("FINISHED SAVING TASK DATABASE FOR QRICCULUM LEARNING, saved at: ","/home/developer/Training_results/Qricculum_Learning/"+str(agent_index)) 123 | nav_acl.load_AF_database("/home/developer/Training_results/Qricculum_Learning/"+str(agent_index)+".npy") 124 | 125 | 126 | for episode_index in range(num_episodes): 127 | if (episode_index % config['update_nav_nets_every_N_episodes']) == 0: 128 | # Create a new local copy of the latest Policy net before starting the episode 129 | model_clone.load_state_dict(shared_elements['policy_network'].state_dict()) 130 | for param in model_clone.parameters(): 131 | param.requires_grad = False 132 | if( config['q_ricculum_learning']): 133 | nav_acl.update_networks(shared_elements['nav_acl_network'],shared_elements['q_network'],task_offset) 134 | else: 135 | nav_acl.update_networks(shared_elements['nav_acl_network'],None) 136 | 137 | num_steps, result = run_episode(env, agent_index, model_clone, nav_acl,shared_elements, episode_index, steps, config) 138 | steps += num_steps 139 | 140 | 141 | 142 | def run_episode(env, agent_index, policy_net, nav_acl,shared_elements, num_of_episode, steps, config, test_task=None): 143 | num_stacked_frames = config['num_stacked_frames'] 144 | train_start = config['train_starts'] 145 | max_steps = config['max_steps'] 146 | num_Agents = config['num_Agents'] 147 | t_max = config['buffer_maxlen']*num_Agents 148 | robot_grid = generate_robot_grid() 149 | task_offset = robot_grid[agent_index] 150 | t = steps*num_Agents # estimate total number of steps in the replay buffer 151 | P_Random = min(nav_acl.config['adaptive_filtering_params']['p_random_max'], t/t_max) 152 | 
P_Easy = ((1-P_Random)/2) 153 | P_Frontier = P_Easy 154 | action_dim = (2) 155 | nav_acl.adaptive_filtering_task_probs = [P_Random,P_Easy,P_Frontier] 156 | 157 | if(test_task is None): 158 | task = nav_acl.generate_random_task(translation_offset=task_offset) 159 | else: 160 | print("TEST TASK") 161 | task = test_task 162 | 163 | pred_prob_worker = nav_acl.get_task_success_probability(task).detach().cpu().numpy() 164 | 165 | step_durations = shared_elements['step_durations'] 166 | episode_rewards = shared_elements['episode_rewards'] 167 | episode_lengths = shared_elements['episode_lengths'] 168 | intermediate_buffer = shared_elements['experience_queue'] 169 | intermediate_buffer_nav_acl = shared_elements['nav_acl_exp_queue'] 170 | 171 | 172 | obs_cam, obs_lidar = env.reset(task) 173 | env.step([0,0]) 174 | obs_cam, obs_lidar = env.reset(task) 175 | stacked_camera_obsrv = deque(maxlen=num_stacked_frames) 176 | stacked_actions = deque(maxlen=num_stacked_frames) 177 | stacked_rewards = deque(maxlen=num_stacked_frames) 178 | transitions = deque(maxlen=num_stacked_frames) 179 | episode_reward = 0 180 | ep_steps = 0 181 | prev_action = np.zeros(action_dim) 182 | prev_reward = config['step_penalty'] 183 | task_result = [0] 184 | episode_exp = [] 185 | # start with a fresh state 186 | for i in range(num_stacked_frames): 187 | stacked_camera_obsrv.append(obs_cam) 188 | stacked_actions.append(prev_action) 189 | stacked_rewards.append(prev_reward) 190 | if(config['use_snail_mode']): 191 | transitions.append((obs_cam, obs_lidar,prev_action,prev_reward)) 192 | 193 | 194 | 195 | start_episode = current_milli_time() 196 | 197 | 198 | for step in range(max_steps): 199 | start = current_milli_time() 200 | 201 | stacked_camera_state = np.asarray(stacked_camera_obsrv).reshape((env.observtion_shape[0]*num_stacked_frames,env.observtion_shape[1],env.observtion_shape[2])) 202 | stacked_prev_action = np.asarray(stacked_actions) # [a-4,a-3, a-2, a-1] 203 | stacked_prev_reward = np.asarray(stacked_rewards).reshape(num_stacked_frames,1) 204 | 205 | with torch.no_grad(): 206 | if steps >= train_start: 207 | action = policy_net.get_action(stacked_camera_state,obs_lidar,stacked_prev_action,stacked_prev_reward, deterministic = False, device=inference_device) 208 | else: 209 | action = policy_net.sample_action() 210 | 211 | ### do action and get observation 212 | 213 | 214 | step_start = current_milli_time() 215 | next_obs_cam, next_obs_lidar, reward, done, info = env.step(action) 216 | 217 | ### append new action and reward (has to happen before appending to the replay buffer!) 
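### note: stacked_prev_action / stacked_prev_reward were snapshotted at the top of the loop, so they still end with the
### previous action a_{t-1} and reward r_{t-1}; appending the fresh action and reward below shifts the deques so that
### stacked_action / stacked_reward end with a_t and r_t. Both versions go into the transition tuple, and
### stacked_camera_obsrv is advanced only after that tuple has been built, which is why this ordering matters.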
218 | stacked_actions.append(action) # [a-3,a-2,a-1, a] 219 | stacked_rewards.append(reward) 220 | 221 | stacked_action = np.asarray(stacked_actions) 222 | stacked_reward = np.asarray(stacked_rewards).reshape(num_stacked_frames,1) 223 | 224 | ### append to episode exp 225 | episode_exp.append((obs_cam, obs_lidar, stacked_action, stacked_prev_action, stacked_reward, stacked_prev_reward, next_obs_cam, next_obs_lidar, done)) 226 | 227 | 228 | stacked_camera_obsrv.append(next_obs_cam) # has to happen after appending to the replay buffer 229 | 230 | obs_cam, obs_lidar = next_obs_cam, next_obs_lidar 231 | episode_reward += reward 232 | 233 | end = current_milli_time() 234 | step_durations.append(end-start) 235 | prev_action = action 236 | prev_reward = reward 237 | ep_steps += 1 238 | 239 | ### some manual cleanup to prevent numpy from polluting my memory 240 | del stacked_camera_state, next_obs_cam, next_obs_lidar 241 | 242 | if done: 243 | if info['collision']: 244 | task_result = [0] 245 | else: 246 | task_result = [1] 247 | 248 | break 249 | time_for_episode = current_milli_time()-start_episode 250 | avg_time_per_step = int(time_for_episode / ep_steps) 251 | print('Agent: ', agent_index, 'Finished Episode: ', num_of_episode, ' | Pred prob.: ', round(pred_prob_worker[0],2), ' | was considered: ', task.task_type.name, '| Episode Reward: ', round(episode_reward,2), ' | Num steps: ', ep_steps, '| avg time per step: ',avg_time_per_step, ' | done: ', task_result[0]) 252 | episode_rewards.append(episode_reward) 253 | episode_lengths.append(ep_steps) 254 | 255 | nav_acl.params.append(nav_acl.get_task_params_array(task,False)) # append the non normalized version of the task since we need that information for our statistics 256 | 257 | # append results to the shared queue 258 | intermediate_buffer_nav_acl.put((task,task_result,agent_index)) 259 | intermediate_buffer.put(episode_exp) 260 | return ep_steps, task_result[0], pred_prob_worker 261 | 262 | def start_isaac_sim_connector(num_Agents=1, start_port=64000): 263 | """ creates PyCodelets for receiving and sending messages to/from the isaac simulation environment (either Unity or Omniverse) via specified tcp subscriber and publisher nodes to allow for parallel access to the simulation """ 264 | app = Application("apps/py_sac_nav_acl/py_sim_connector.app.json") 265 | PUB_PORT_NUM = start_port 266 | SUB_PORT_NUM = start_port+1000 267 | for index in range(num_Agents): 268 | #create application and create node 269 | agent_suffix = '_'+str(index) 270 | app_name = "connector" +agent_suffix 271 | 272 | connector = app.add(app_name) 273 | 274 | tcp_sub = connector.add(app.registry.isaac.alice.TcpSubscriber) 275 | tcp_pub = connector.add(app.registry.isaac.alice.TcpPublisher) 276 | connector.add(app.registry.isaac.alice.TimeSynchronizer) 277 | 278 | tcp_sub.config['port'] = SUB_PORT_NUM + index 279 | tcp_sub.config['host'] = 'localhost' 280 | tcp_pub.config['port'] = PUB_PORT_NUM + index 281 | 282 | #receive messages from sim and publish via TCP 283 | app.connect('simulation.interface/output', 'collision'+agent_suffix, app_name+'/TcpPublisher', 'collision'+agent_suffix) 284 | app.connect('simulation.interface/output', 'bodies'+agent_suffix, app_name+'/TcpPublisher', 'bodies'+agent_suffix) 285 | app.connect('simulation.interface/output', 'rangescan_front'+agent_suffix, app_name+'/TcpPublisher', 'rangescan_front'+agent_suffix) 286 | app.connect('simulation.interface/output', 'rangescan_back'+agent_suffix, app_name+'/TcpPublisher', 'rangescan_back'+agent_suffix) 
287 | app.connect('simulation.interface/output', 'color'+agent_suffix, app_name+'/TcpPublisher', 'color'+agent_suffix) 288 | 289 | #receive messages from TCP and publish them to the simulation 290 | print("messages coming from: ", SUB_PORT_NUM + index, "go to :", 'simulation.interface/input/####'+agent_suffix) 291 | #send control messages 292 | app.connect(app_name+'/TcpSubscriber', 'diff_command'+agent_suffix, 'simulation.interface/input', 'base_command'+agent_suffix) 293 | app.connect(app_name+'/TcpSubscriber', 'teleport_robot'+agent_suffix, 'simulation.interface/input', 'teleport_robot' +agent_suffix) 294 | app.connect(app_name+'/TcpSubscriber', 'teleport_dolly'+agent_suffix, 'simulation.interface/input', 'teleport_dolly' +agent_suffix) 295 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix, 'simulation.interface/input', 'teleport_obstacle' +agent_suffix) 296 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix+"_1", 'simulation.interface/input', 'teleport_obstacle' +agent_suffix+"_1") 297 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix+"_2", 'simulation.interface/input', 'teleport_obstacle' +agent_suffix+"_2") 298 | app.connect(app_name+'/TcpSubscriber', 'teleport_obstacle'+agent_suffix+"_3", 'simulation.interface/input', 'teleport_obstacle' +agent_suffix+"_3") 299 | 300 | app.start() 301 | while True: 302 | time.sleep(5) 303 | app.stop() 304 | 305 | 306 | def start_isaac_application(app, agent_index, config): 307 | """ creates PyCodelets for receiving and sending messages to/from the isaac simulation environment (either Unity or Omniverse) """ 308 | Port = config['start_port'] 309 | PUB_PORT_NUM = Port + 1000 310 | SUB_PORT_NUM = Port 311 | 312 | #create application and create node 313 | agent_suffix = '_'+str(agent_index) 314 | app_name = "py_controller" +agent_suffix 315 | py_controller_node = app.add(app_name) 316 | 317 | 318 | component_node = py_controller_node.add(PyController) 319 | component_node.config['app_name'] = app_name 320 | component_node.config['agent_index'] = agent_index 321 | component_node.config['tick_period_seconds'] = config['tick_period_seconds'] 322 | component_node.config['action_duration_ticks'] = config['action_duration_ticks'] 323 | component_node.config['depth_cam_max_distance'] = config['depth_cam_max_distance'] 324 | component_node.config['lidar_max_distance'] = config['lidar_max_distance'] 325 | component_node.config['goal_threshold'] = config['goal_threshold'] 326 | component_node.config['output_resolution'] = config['output_resolution'] 327 | component_node.config['using_depth'] = config['using_depth'] 328 | component_node.config['using_unity'] = config['using_unity'] 329 | component_node.config['goal_description'] = config['goal_description'] 330 | component_node.config['omniverse_teleport_dict'] = config['omniverse_teleport_dict'] 331 | component_node.config['scale_dolly'] = config['scale_dolly'] 332 | contact_monitor_node = py_controller_node.add(app.registry.isaac.navigation.CollisionMonitor, 'CollisionMonitor') 333 | 334 | tcp_sub = py_controller_node.add(app.registry.isaac.alice.TcpSubscriber) 335 | tcp_pub = py_controller_node.add(app.registry.isaac.alice.TcpPublisher) 336 | 337 | py_controller_node.add(app.registry.isaac.alice.TimeSynchronizer) 338 | 339 | tcp_sub.config['port'] = SUB_PORT_NUM + agent_index 340 | tcp_sub.config['host'] = 'localhost' 341 | tcp_pub.config['port'] = PUB_PORT_NUM + agent_index 342 | print("start isaac app: ", agent_index, " \n SUB_PORT: ", 
SUB_PORT_NUM, "\n PUB_PORT: ", PUB_PORT_NUM, "\n") 343 | #receive messages 344 | app.connect(app_name+'/TcpSubscriber', 'collision'+agent_suffix, app_name+'/CollisionMonitor', 'collision') 345 | app.connect(app_name+'/CollisionMonitor', 'report', app_name+'/PyCodelet', 'collision'+agent_suffix) 346 | app.connect(app_name+'/TcpSubscriber', 'collision'+agent_suffix, app_name+'/PyCodelet', 'collision'+agent_suffix) 347 | app.connect(app_name+'/TcpSubscriber', 'bodies'+agent_suffix, app_name+'/PyCodelet', 'bodies'+agent_suffix) 348 | app.connect(app_name+'/TcpSubscriber', 'rangescan_front'+agent_suffix, app_name+'/PyCodelet', 'lidar_front'+agent_suffix) 349 | app.connect(app_name+'/TcpSubscriber', 'rangescan_back'+agent_suffix, app_name+'/PyCodelet', 'lidar_back'+agent_suffix) 350 | app.connect(app_name+'/TcpSubscriber', 'color'+agent_suffix, app_name+'/PyCodelet', 'color'+agent_suffix) 351 | 352 | 353 | print("STARTED CONTROLLER WITH NAME: ", app_name, "RECEIVING TCP ON ", SUB_PORT_NUM +agent_index, "SENDING TCP ON: ", PUB_PORT_NUM+agent_index) 354 | #send control messages 355 | app.connect(app_name+'/PyCodelet', 'diff_command'+agent_suffix, app_name+'/TcpPublisher', 'diff_command'+agent_suffix) 356 | app.connect(app_name+'/PyCodelet', 'teleport_robot'+agent_suffix, app_name+'/TcpPublisher', 'teleport_robot'+agent_suffix) 357 | app.connect(app_name+'/PyCodelet', 'teleport_dolly'+agent_suffix, app_name+'/TcpPublisher', 'teleport_dolly'+agent_suffix) 358 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix, app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix) 359 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix + "_1", app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix+ "_1") 360 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix + "_2", app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix+ "_2") 361 | app.connect(app_name+'/PyCodelet', 'teleport_obstacle'+agent_suffix + "_3", app_name+'/TcpPublisher', 'teleport_obstacle'+agent_suffix+ "_3") 362 | app.start() 363 | -------------------------------------------------------------------------------- /sac_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Soft Actor-Critic version 2 3 | using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net 4 | add alpha loss compared with version 1 5 | paper: https://arxiv.org/pdf/1812.05905.pdf 6 | ''' 7 | import numpy as np 8 | import time 9 | from utils import * 10 | import torch.multiprocessing as mp 11 | from process_utils import * 12 | import copy 13 | from threading import Thread 14 | import scipy 15 | 16 | 17 | 18 | def mini_batch_train(agent,save_location, config): 19 | # training loop 20 | 21 | ## These shared lists and managers are for the multi-processing 22 | manager = mp.Manager() 23 | episode_rewards = manager.list() 24 | episode_lengths = manager.list() 25 | step_durations = manager.list() 26 | nav_acl_exp_list = manager.list() 27 | experience_queue = manager.Queue() 28 | nav_acl_exp_queue = manager.Queue() 29 | num_Agents = config['num_Agents'] 30 | nav_acl_network = NavACLNetwork(5,config['nav_acl_hidden_dim']).to(train_device) 31 | 32 | if config['load_checkpoint']: 33 | checkpoint = torch.load(config['checkpoint_path']+"_nav_acl_network_"+str(config['checkpoint_index'])) 34 | nav_acl_network.load_state_dict(checkpoint) 35 | 36 | # observation is only the current front camera image, but we want to concatenate the four most recent frames to form the state 
therefore we keep track of the 37 | # three previous frames in a deque with a maximum length 38 | 39 | agent.policy_net.share_memory() 40 | agent.target_soft_q_net1.share_memory() 41 | num_Agents = config['num_Agents'] 42 | nav_acl_mu_sig = manager.Array('d', range(2)) 43 | nav_acl_network.share_memory() 44 | 45 | shared_elements = {"episode_rewards" : episode_rewards, 46 | "episode_lengths" : episode_lengths, 47 | "step_durations" : step_durations, 48 | "policy_network" : agent.policy_net, 49 | "q_network" : agent.target_soft_q_net1, 50 | "nav_acl_network" : nav_acl_network, 51 | "experience_queue" : experience_queue, 52 | "nav_acl_exp_queue" : nav_acl_exp_queue, 53 | "nav_acl_exp_list" : nav_acl_exp_list, 54 | "nav_acl_mu_sig" : nav_acl_mu_sig} 55 | 56 | 57 | update_shared_policy_net(agent, shared_elements) 58 | 59 | 60 | dtb_stack = [] 61 | # TODO: checkpoint loading for nav acl network 62 | 63 | # start workers: 64 | print("START ", num_Agents, " WORKER TRHEADS ") 65 | processes = [] 66 | nav_acl_local_exp_list = [] # this list is used to store the latest n=nav_acl_batch_size task, task_result pairs to train the nav_acl network with 67 | 68 | # ## for each agent run a fraction of the number of episodes in parallel 69 | for agent_index in range(num_Agents): 70 | 71 | p1 = mp.Process(target=start_gather_experience,args=(agent_index,shared_elements,config,)) 72 | p1.start() 73 | 74 | processes.append(p1) 75 | time.sleep(0.2) 76 | 77 | # start an save cycle for saving the statistics hourly 78 | t2 = Thread(target=save_cycle,args=(agent,nav_acl_local_exp_list,shared_elements,save_location,)) 79 | t2.start() 80 | 81 | # start the update cycle that will gather the information from the worker cycles to update both the nav-acl network and the policy and q-value networks 82 | update_cycle(agent,nav_acl_local_exp_list,shared_elements,config,dtb_stack) 83 | 84 | 85 | def save_cycle(agent,nav_acl_local_exp_list,shared_elements,save_location): 86 | """ 87 | saves training statistics and networks every hour 88 | """ 89 | h_of_train = 0 90 | while(True): 91 | time.sleep(180) 92 | save_training_statiscics_to_location(shared_elements['episode_lengths'], 93 | shared_elements['step_durations'], 94 | shared_elements['episode_rewards'], 95 | nav_acl_local_exp_list, 96 | agent, 97 | shared_elements['nav_acl_network'], 98 | save_location, 99 | h_of_train) 100 | time.sleep(3420) 101 | h_of_train += 1 102 | 103 | 104 | #TODO: make this easier to read 105 | def update_cycle(agent,nav_acl_local_exp_list,shared_elements,config, dtb_stack): 106 | """ 107 | this cycle is an infinite loop that checks if new exp for training the q and policy networks can be found in the shared_elements['experience_queue'] 108 | if so it will train the q nets and the policy nets (policy net and target_q_net are shared accross all processes) on this new exp. 
109 | (remember that the workers replace their copy of the qnet and the policy net on every episode) 110 | Furthermore, this cycle looks out for new nav_acl exp inside the shared_elements['nav_acl_exp_queue'], 111 | if so, the shared nav_acl network will be trained - either batch-wise or for each incoming exp 112 | """ 113 | train_start = config['train_starts'] 114 | batch_size = config['batch_size'] 115 | use_reward_scaling = config['use_reward_scaling'] 116 | gamma = config['gamma'] 117 | num_Agents = config['num_Agents'] 118 | soft_tau = config['tau'] 119 | updates = 0 120 | steps = 0 121 | nav_acl = Nav_ACL(shared_elements['nav_acl_network'], config,worker_instance=False) 122 | nav_acl_batch_tasks = [] 123 | nav_acl_batch_results = [] 124 | saved = False 125 | while(True): 126 | start = current_milli_time() 127 | idle = True 128 | 129 | ### This part gathers the experience (for the SAC part and the nav acl part) 130 | for q in range(num_Agents): 131 | if (not shared_elements['experience_queue'].empty()): 132 | episode = shared_elements['experience_queue'].get() 133 | for exp in episode: # take exp out of the shared queue and put it into the replay buffer 134 | obs_cam, obs_lidar, action, prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done = exp 135 | 136 | # put it into the replay buffer 137 | agent.replay_buffer.add(obs_cam, obs_lidar, action, prev_action, reward, prev_reward, next_obs_cam, next_obs_lidar, done) 138 | steps += 1 139 | agent.steps += 1 140 | 141 | idle=False 142 | if (not shared_elements['nav_acl_exp_queue'].empty()): # take nav acl exp out of the shared queue and use it to train the shared nav-acl network 143 | task, result, agent_index = shared_elements['nav_acl_exp_queue'].get() 144 | task_params_array = nav_acl.get_task_params_array(task,normalize=False) # not normalizing this because it is
only used for plotting 145 | prediction = nav_acl.get_task_success_probability(task).detach().cpu() 146 | 147 | if(config['q_ricculum_learning']): 148 | mu, sig = 0.5, 0.5 # q_ricculum learning needs a q-value estimate to create adaptive boundaries, so with q_ricculum learning enabled we cannot compute this statistic here and substitute 0.5, 0.5 149 | else: 150 | mu, sig = nav_acl.create_adaptive_boundaries(100) 151 | 152 | nav_acl_local_exp_list.append(((task_params_array,result, prediction, mu, sig, agent_index,task.task_type))) 153 | 154 | if(nav_acl.config['nav_acl_batch_mode']): 155 | nav_acl_batch_tasks.append(nav_acl.get_task_params_array(task,normalize=nav_acl.config['normalize_tasks'])) 156 | nav_acl_batch_results.append(result) 157 | if(len(nav_acl_batch_tasks) == nav_acl.config['nav_acl_batch_size']): 158 | nav_acl_batch_tasks_array = np.asarray(nav_acl_batch_tasks) 159 | nav_acl_batch_results_array = np.asarray(nav_acl_batch_results) 160 | nav_acl.batch_train(nav_acl_batch_tasks_array, nav_acl_batch_results_array) 161 | nav_acl_batch_tasks.clear() 162 | nav_acl_batch_results.clear() 163 | 164 | else: 165 | task_params_array = nav_acl.get_task_params_array(task,normalize=nav_acl.config['normalize_tasks']) 166 | nav_acl.train(task, result) 167 | # # update_start = current_milli_time() 168 | 169 | ## after some experience has been collected and put into the replay buffer, perform 4 consecutive training steps 170 | for x in range(4): 171 | if((steps > config['train_starts']) and (steps > config['batch_size']) and (steps > config['fill_buffer_with_transitions'])): 172 | if(updates < int(steps/4)): 173 | agent.update(batch_size, reward_scale=2., auto_entropy=True, target_entropy=-1.,use_reward_scaling=use_reward_scaling,gamma=gamma,soft_tau=soft_tau) 174 | updates+=1 175 | idle=False 176 | # print("update process cycle time: ", current_milli_time()-start, "of which update time: ", current_milli_time()-update_start) 177 | if(updates % 500 <= 10): 178 | print("steps: ", steps, " updates so far: ", updates) 179 | 180 | 181 | if(idle): 182 | time.sleep(0.1) 183 | 184 | 185 | 186 | 187 | ## These three hard-copy helpers below are not really needed, since the networks in the shared_elements are the ones that are updated in the update cycle! (A soft-update counterpart is sketched next.)
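## For comparison with the hard copies below: the SAC target critics themselves are moved towards the online critics
## by polyak averaging inside agent.update(), as the soft_tau argument above suggests. A minimal sketch of such a soft
## update follows; the helper name soft_update_sketch is illustrative and not part of this repository.
def soft_update_sketch(target_net, source_net, soft_tau):
    """Parameter-wise blend: target <- (1 - soft_tau) * target + soft_tau * source."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - soft_tau) + param.data * soft_tau)
## Example usage after a gradient step: soft_update_sketch(agent.target_soft_q_net1, agent.soft_q_net1, config['tau'])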
188 | def update_shared_q_net(agent, shared_elements): 189 | for target_param, param in zip(shared_elements['q_network'].parameters(), agent.soft_q_net1.parameters()): 190 | target_param.data.copy_(param.data) 191 | 192 | def update_shared_policy_net(agent,shared_elements): 193 | for target_param, param in zip(shared_elements['policy_network'].parameters(), agent.policy_net.parameters()): 194 | target_param.data.copy_(param.data) 195 | 196 | def update_shared_nav_net(nav_acl, shared_elements): 197 | for target_param, param in zip(shared_elements['nav_acl_network'].parameters(), nav_acl.NavACLNet.parameters()): 198 | target_param.data.copy_(param.data) 199 | 200 | def save_training_statiscics_to_location(episode_lengths,step_durations,episode_rewards,nav_acl_exp_list, agent, nav_acl_network, save_location,h_of_train): 201 | with mp.Lock(): 202 | nav_acl_params = [] 203 | nav_acl_dones = [] 204 | nav_acl_predictions = [] 205 | nav_acl_mu = [] 206 | nav_acl_sig = [] 207 | nav_acl_agent_index = [] 208 | nav_acl_task_type = [] 209 | copy_episode_lengths = [] 210 | copy_step_durations = [] 211 | copy_episode_rewards = [] 212 | 213 | for (task_params,result, prediction, mu, sig, agent_index, task_type) in nav_acl_exp_list: 214 | nav_acl_params.append(task_params) 215 | nav_acl_dones.append(result) 216 | nav_acl_predictions.append(prediction) 217 | nav_acl_mu.append(mu) 218 | nav_acl_sig.append(sig) 219 | nav_acl_agent_index.append(agent_index) 220 | nav_acl_task_type.append(task_type.value) 221 | 222 | for step in step_durations: 223 | copy_step_durations.append(step) 224 | 225 | for episode_reward in episode_rewards: 226 | copy_episode_rewards.append(episode_reward) 227 | 228 | for episode_length in episode_lengths: 229 | copy_episode_lengths.append(episode_length) 230 | 231 | np.save(save_location+"episode_lengts", copy_episode_lengths) 232 | np.save(save_location+"episode_rewards", copy_episode_rewards) 233 | np.save(save_location+"step_durations", copy_step_durations) 234 | np.save(save_location+"q1_losses", agent.q1_losses) 235 | np.save(save_location+"q2_losses", agent.q2_losses) 236 | np.save(save_location+"policy_losses", agent.policy_losses) 237 | np.save(save_location+"alpha_losses", agent.alpha_losses) 238 | np.save(save_location+"alpha", agent.alpha_list) 239 | 240 | 241 | np.save(save_location+"nav_acl_params", nav_acl_params) 242 | np.save(save_location+"nav_acl_dones", nav_acl_dones) 243 | np.save(save_location+"nav_acl_predictions",nav_acl_predictions) 244 | np.save(save_location+"nav_acl_mu", nav_acl_mu) 245 | np.save(save_location+"nav_acl_sig", nav_acl_sig) 246 | np.save(save_location+"nav_acl_agent_index",nav_acl_agent_index) 247 | np.save(save_location+"nav_acl_task_type", nav_acl_task_type) 248 | torch.save(nav_acl_network.state_dict(), save_location+'_nav_acl_network_'+str(h_of_train)+"_"+str(len(episode_rewards))) 249 | agent.save_checkpoint(save_location,str(h_of_train)+"_"+str(len(episode_rewards))) 250 | agent.save_model(save_location) 251 | print("SAVED SUCCESSFULLY!") 252 | 253 | def test_batch(env,agent,max_steps,num_tests): 254 | for eps in range(num_tests): 255 | state = env.reset() 256 | episode_reward = 0 257 | prev_action = agent.policy_net.sample_action() 258 | 259 | for step in range(max_steps): 260 | action = agent.policy_net.get_action(state,prev_action, deterministic = False) 261 | next_state, reward, done, _ = env.step(action) 262 | episode_reward += reward 263 | state=next_state 264 | prev_action = action 265 | 266 | if done: 267 | break 268 | 269 | 
print('Episode: ', eps, '| Episode Reward: ', episode_reward) --------------------------------------------------------------------------------
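Taken together, run_episode() in process_utils.py schedules task types with probabilities [P_Random, P_Easy, P_Frontier], while fast_create_adaptive_boundaries_mu_sig() in nav_acl.py fits a Gaussian (mu, sigma) to the Nav-ACL network's success predictions over sampled tasks. The sketch below illustrates how these pieces can combine into adaptive-filtering task selection; the helper name pick_task_adaptive_filtering, the beta threshold, and the exact easy/frontier boundary rules are assumptions for illustration, not code taken from this repository.

import numpy as np

def pick_task_adaptive_filtering(nav_acl, task_offset, task_type_probs, beta=0.5, max_tries=100):
    """Illustrative sketch: draw a task type according to [P_Random, P_Easy, P_Frontier], then
    rejection-sample random tasks until one matches that type. The boundaries used here
    (easy: p > mu + beta*sigma, frontier: |p - mu| <= beta*sigma) are assumed, not from the repo."""
    task_type = np.random.choice(["random", "easy", "frontier"], p=task_type_probs)
    mu, sigma = nav_acl.fast_create_adaptive_boundaries_mu_sig()
    task = nav_acl.generate_random_task(translation_offset=task_offset)
    for _ in range(max_tries):
        if task_type == "random":
            return task, task_type
        p = nav_acl.get_task_success_probability(task).detach().cpu().numpy().flatten()[0]
        if task_type == "easy" and p > mu + beta * sigma:
            return task, task_type
        if task_type == "frontier" and abs(p - mu) <= beta * sigma:
            return task, task_type
        task = nav_acl.generate_random_task(translation_offset=task_offset)
    return task, "random"  # no qualifying task found within max_tries; fall back to the last random sample

In the training code itself, run_episode() sets nav_acl.adaptive_filtering_task_probs before calling nav_acl.generate_random_task(), so the equivalent filtering presumably happens inside the Nav_ACL class.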