├── datasets ├── __init__.py ├── scared_dataset.py └── mono_dataset.py ├── imgs ├── pose.png ├── depth.png ├── overview.png └── reconstruction.png ├── overview.png ├── splits └── endovis │ ├── gt_poses_sq1.npz │ ├── gt_poses_sq2.npz │ ├── 3d_reconstruction.txt │ ├── test_files_sequence1.txt │ ├── test_files.txt │ └── test_files_sequence2.txt ├── networks ├── __init__.py ├── pose_cnn.py ├── pose_decoder.py ├── appearance_flow_decoder.py ├── depth_decoder.py ├── optical_flow_decoder.py ├── resnet_encoder.py ├── depth_decoder_ICRA.py └── dares.py ├── train_end_to_end.py ├── export_gt_depth.py ├── visualize_pose.py ├── utils.py ├── test_simple.py ├── README.md ├── evaluate_pose.py ├── evaluate_depth.py ├── evaluate_3d_reconstruction.py ├── options.py ├── layers.py └── trainer_end_to_end.py /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .scared_dataset import SCAREDRAWDataset 2 | -------------------------------------------------------------------------------- /imgs/pose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/imgs/pose.png -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/overview.png -------------------------------------------------------------------------------- /imgs/depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/imgs/depth.png -------------------------------------------------------------------------------- /imgs/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/imgs/overview.png -------------------------------------------------------------------------------- /imgs/reconstruction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/imgs/reconstruction.png -------------------------------------------------------------------------------- /splits/endovis/gt_poses_sq1.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/splits/endovis/gt_poses_sq1.npz -------------------------------------------------------------------------------- /splits/endovis/gt_poses_sq2.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mobarakol/DARES/HEAD/splits/endovis/gt_poses_sq2.npz -------------------------------------------------------------------------------- /splits/endovis/3d_reconstruction.txt: -------------------------------------------------------------------------------- 1 | dataset1/keyframe1 1 l 2 | dataset1/keyframe1 5 l 3 | dataset1/keyframe1 10 l 4 | dataset1/keyframe1 15 l 5 | dataset2/keyframe4 1352 l 6 | dataset2/keyframe4 1353 l 7 | -------------------------------------------------------------------------------- /networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_encoder import ResnetEncoder 2 | from .depth_decoder import DepthDecoder 3 | from .pose_decoder import PoseDecoder 4 | from .pose_cnn import PoseCNN 5 | from .appearance_flow_decoder import 
TransformDecoder 6 | from .optical_flow_decoder import PositionDecoder 7 | from .dares import DARES 8 | -------------------------------------------------------------------------------- /train_end_to_end.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | from trainer_end_to_end import Trainer 4 | from options import MonodepthOptions 5 | 6 | options = MonodepthOptions() 7 | opts = options.parse() 8 | 9 | 10 | if __name__ == "__main__": 11 | trainer = Trainer(opts) 12 | trainer.train() 13 | -------------------------------------------------------------------------------- /networks/pose_cnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class PoseCNN(nn.Module): 8 | def __init__(self, num_input_frames): 9 | super(PoseCNN, self).__init__() 10 | 11 | self.num_input_frames = num_input_frames 12 | 13 | self.convs = {} 14 | self.convs[0] = nn.Conv2d(3 * num_input_frames, 16, 7, 2, 3) 15 | self.convs[1] = nn.Conv2d(16, 32, 5, 2, 2) 16 | self.convs[2] = nn.Conv2d(32, 64, 3, 2, 1) 17 | self.convs[3] = nn.Conv2d(64, 128, 3, 2, 1) 18 | self.convs[4] = nn.Conv2d(128, 256, 3, 2, 1) 19 | self.convs[5] = nn.Conv2d(256, 256, 3, 2, 1) 20 | self.convs[6] = nn.Conv2d(256, 256, 3, 2, 1) 21 | 22 | self.pose_conv = nn.Conv2d(256, 6 * (num_input_frames - 1), 1) 23 | 24 | self.num_convs = len(self.convs) 25 | 26 | self.relu = nn.ReLU(True) 27 | 28 | self.net = nn.ModuleList(list(self.convs.values())) 29 | 30 | def forward(self, out): 31 | 32 | for i in range(self.num_convs): 33 | out = self.convs[i](out) 34 | out = self.relu(out) 35 | 36 | out = self.pose_conv(out) 37 | out = out.mean(3).mean(2) 38 | 39 | out = 0.01 * out.view(-1, self.num_input_frames - 1, 1, 6) 40 | 41 | axisangle = out[..., :3] 42 | translation = out[..., 3:] 43 | 44 | return axisangle, translation 45 | -------------------------------------------------------------------------------- /networks/pose_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import torch 4 | import torch.nn as nn 5 | from collections import OrderedDict 6 | 7 | 8 | class PoseDecoder(nn.Module): 9 | def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1): 10 | super(PoseDecoder, self).__init__() 11 | 12 | self.num_ch_enc = num_ch_enc 13 | self.num_input_features = num_input_features 14 | 15 | if num_frames_to_predict_for is None: 16 | num_frames_to_predict_for = num_input_features - 1 17 | self.num_frames_to_predict_for = num_frames_to_predict_for 18 | 19 | self.convs = OrderedDict() 20 | self.convs[("squeeze")] = nn.Conv2d(self.num_ch_enc[-1], 256, 1) 21 | self.convs[("pose", 0)] = nn.Conv2d(num_input_features * 256, 256, 3, stride, 1) 22 | self.convs[("pose", 1)] = nn.Conv2d(256, 256, 3, stride, 1) 23 | self.convs[("pose", 2)] = nn.Conv2d(256, 6 * num_frames_to_predict_for, 1) 24 | 25 | self.relu = nn.ReLU() 26 | 27 | self.net = nn.ModuleList(list(self.convs.values())) 28 | 29 | def forward(self, input_features): 30 | last_features = [f[-1] for f in input_features] 31 | 32 | cat_features = [self.relu(self.convs["squeeze"](f)) for f in last_features] 33 | cat_features = torch.cat(cat_features, 1) 34 | 35 | out = cat_features 36 | for i in range(3): 37 
| out = self.convs[("pose", i)](out) 38 | if i != 2: 39 | out = self.relu(out) 40 | 41 | out = out.mean(3).mean(2) 42 | 43 | out = 0.001*out.view(-1, self.num_frames_to_predict_for, 1, 6) 44 | 45 | axisangle = out[..., :3] 46 | translation = out[..., 3:] 47 | 48 | return axisangle, translation 49 | -------------------------------------------------------------------------------- /export_gt_depth.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | 5 | import argparse 6 | import numpy as np 7 | import PIL.Image as pil 8 | import cv2 9 | 10 | from utils import readlines 11 | 12 | def export_gt_depths_SCARED(): 13 | 14 | parser = argparse.ArgumentParser(description='export_gt_depth') 15 | 16 | parser.add_argument('--data_path', 17 | type=str, 18 | help='path to the root of the data', 19 | required=True) 20 | parser.add_argument('--split', 21 | type=str, 22 | help='which split to export gt from', 23 | required=True, 24 | choices=["endovis"]) 25 | opt = parser.parse_args() 26 | 27 | split_folder = os.path.join(os.path.dirname(__file__), "splits", opt.split) 28 | lines = readlines(os.path.join(split_folder, "test_files.txt")) 29 | print("Exporting ground truth depths for {}".format(opt.split)) 30 | i=0 31 | gt_depths = [] 32 | for line in lines: 33 | i = i+1 34 | folder, frame_id, _ = line.split() 35 | frame_id = int(frame_id) 36 | print(i) 37 | print(folder) 38 | 39 | if opt.split == "endovis": 40 | f_str = "scene_points{:06d}.tiff".format(frame_id - 1) 41 | gt_depth_path = os.path.join( 42 | opt.data_path, 43 | folder, 44 | "image_02/data/groundtruth", 45 | f_str) 46 | depth_gt = cv2.imread(gt_depth_path, 3) 47 | depth_gt = depth_gt[:, :, 0] 48 | gt_depth = depth_gt[0:1024, :] 49 | 50 | gt_depths.append(gt_depth.astype(np.float32)) 51 | 52 | output_path = os.path.join(split_folder, "gt_depths.npz") 53 | 54 | print("Saving to {}".format(opt.split)) 55 | 56 | np.savez_compressed(output_path, data=np.array(gt_depths)) 57 | 58 | 59 | if __name__ == "__main__": 60 | export_gt_depths_SCARED() 61 | -------------------------------------------------------------------------------- /datasets/scared_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import skimage.transform 5 | import numpy as np 6 | import PIL.Image as pil 7 | import cv2 8 | 9 | from .mono_dataset import MonoDataset 10 | 11 | 12 | class SCAREDDataset(MonoDataset): 13 | def __init__(self, *args, **kwargs): 14 | super(SCAREDDataset, self).__init__(*args, **kwargs) 15 | 16 | self.K = np.array([[0.82, 0, 0.5, 0], 17 | [0, 1.02, 0.5, 0], 18 | [0, 0, 1, 0], 19 | [0, 0, 0, 1]], dtype=np.float32) 20 | 21 | # self.full_res_shape = (1280, 1024) 22 | self.side_map = {"2": 2, "3": 3, "l": 2, "r": 3} 23 | 24 | def check_depth(self): 25 | 26 | return False 27 | 28 | def get_color(self, folder, frame_index, side, do_flip): 29 | color = self.loader(self.get_image_path(folder, frame_index, side)) 30 | 31 | if do_flip: 32 | color = color.transpose(pil.FLIP_LEFT_RIGHT) 33 | 34 | return color 35 | 36 | 37 | class SCAREDRAWDataset(SCAREDDataset): 38 | def __init__(self, *args, **kwargs): 39 | super(SCAREDRAWDataset, self).__init__(*args, **kwargs) 40 | 41 | def get_image_path(self, folder, frame_index, side): 42 | f_str = "{:010d}{}".format(frame_index, self.img_ext) 43 | image_path = os.path.join( 44 | 
self.data_path, folder, "image_0{}/data".format(self.side_map[side]), f_str) 45 | 46 | return image_path 47 | 48 | def get_depth(self, folder, frame_index, side, do_flip): 49 | f_str = "scene_points{:06d}.tiff".format(frame_index-1) 50 | 51 | depth_path = os.path.join( 52 | self.data_path, 53 | folder, 54 | "image_0{}/data/groundtruth".format(self.side_map[side]), 55 | f_str) 56 | 57 | depth_gt = cv2.imread(depth_path, 3) 58 | depth_gt = depth_gt[:, :, 0] 59 | depth_gt = depth_gt[0:1024, :] 60 | if do_flip: 61 | depth_gt = np.fliplr(depth_gt) 62 | 63 | return depth_gt 64 | 65 | 66 | -------------------------------------------------------------------------------- /networks/appearance_flow_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from collections import OrderedDict 8 | from layers import * 9 | 10 | 11 | class TransformDecoder(nn.Module): 12 | def __init__(self, num_ch_enc, scales = range(4) , num_output_channels=3, use_skips=True): 13 | super(TransformDecoder, self).__init__() 14 | 15 | self.num_output_channels = num_output_channels 16 | self.use_skips = use_skips 17 | self.upsample_mode = 'nearest' 18 | self.scales = scales 19 | 20 | self.num_ch_enc = num_ch_enc 21 | self.num_ch_dec = np.array([16, 32, 64, 128, 256]) 22 | 23 | # decoder 24 | self.convs = OrderedDict() # 有序字典 25 | for i in range(4, -1, -1): 26 | # upconv_0 27 | num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] 28 | num_ch_out = self.num_ch_dec[i] 29 | self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) 30 | 31 | # upconv_1 32 | num_ch_in = self.num_ch_dec[i] 33 | if self.use_skips and i > 0: 34 | num_ch_in += self.num_ch_enc[i - 1] 35 | num_ch_out = self.num_ch_dec[i] 36 | self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) 37 | 38 | for s in self.scales: 39 | self.convs[("transform_conv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels) 40 | 41 | self.decoder = nn.ModuleList(list(self.convs.values())) 42 | self.Tanh = nn.Tanh() 43 | 44 | def forward(self, input_features): 45 | self.outputs = {} 46 | # decoder 47 | x = input_features[-1] 48 | for i in range(4, -1, -1): 49 | x = self.convs[("upconv", i, 0)](x) 50 | x = [upsample(x)] 51 | if self.use_skips and i > 0: 52 | x += [input_features[i - 1]] 53 | x = torch.cat(x, 1) 54 | x = self.convs[("upconv", i, 1)](x) 55 | if i in self.scales: 56 | self.outputs[("transform", i)] = self.Tanh(self.convs[("transform_conv", i)](x)) 57 | 58 | return self.outputs 59 | -------------------------------------------------------------------------------- /networks/depth_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | 13 | from collections import OrderedDict 14 | from layers import * 15 | 16 | 17 | class DepthDecoder(nn.Module): 18 | def __init__(self, num_ch_enc, scales= range(4), num_output_channels=1, use_skips=True): 19 | super(DepthDecoder, self).__init__() 20 | 21 | self.num_output_channels = num_output_channels 22 | self.use_skips = use_skips 23 | self.upsample_mode = 'nearest' 24 | self.scales = scales 25 | 26 | self.num_ch_enc = num_ch_enc 27 | self.num_ch_dec = np.array([16, 32, 64, 128, 256]) 28 | 29 | # decoder 30 | self.convs = OrderedDict() # 有序字典 31 | for i in range(4, -1, -1): 32 | # upconv_0 33 | num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] 34 | num_ch_out = self.num_ch_dec[i] 35 | self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) 36 | 37 | # upconv_1 38 | num_ch_in = self.num_ch_dec[i] 39 | if self.use_skips and i > 0: 40 | num_ch_in += self.num_ch_enc[i - 1] 41 | num_ch_out = self.num_ch_dec[i] 42 | self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) 43 | 44 | for s in self.scales: 45 | self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels) 46 | 47 | self.decoder = nn.ModuleList(list(self.convs.values())) 48 | self.sigmoid = nn.Sigmoid() 49 | 50 | def forward(self, input_features): 51 | self.outputs = {} 52 | # decoder 53 | x = input_features[-1] 54 | for i in range(4, -1, -1): 55 | x = self.convs[("upconv", i, 0)](x) 56 | x = [upsample(x)] 57 | if self.use_skips and i > 0: 58 | x += [input_features[i - 1]] 59 | x = torch.cat(x, 1) 60 | x = self.convs[("upconv", i, 1)](x) 61 | if i in self.scales: 62 | self.outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x)) 63 | 64 | return self.outputs 65 | -------------------------------------------------------------------------------- /visualize_pose.py: -------------------------------------------------------------------------------- 1 | # import necessary module 2 | from mpl_toolkits.mplot3d import axes3d 3 | import os 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from options import MonodepthOptions 7 | opt = MonodepthOptions().parse() 8 | # load data from file 9 | # you replace this using with open 10 | gt_path = os.path.join(os.path.dirname(__file__), "splits", "endovis", "gt_poses_sq{}.npz".format(opt.scared_pose_seq)) 11 | gt_local_poses = np.load(gt_path, fix_imports=True, encoding='latin1')["data"] 12 | 13 | our_path = os.path.join(os.path.dirname(__file__), "splits", "endovis", "pred_pose_sq{}.npz".format(opt.scared_pose_seq)) 14 | our_local_poses = np.load(our_path, fix_imports=True, encoding='latin1')["data"] 15 | 16 | 17 | def dump(source_to_target_transformations): 18 | Ms = [] 19 | cam_to_world = np.eye(4) 20 | Ms.append(cam_to_world) 21 | for source_to_target_transformation in source_to_target_transformations: 22 | cam_to_world = np.dot(source_to_target_transformation, cam_to_world) 23 | Ms.append(cam_to_world) 24 | return Ms 25 | 26 | 27 | def compute_scale(gtruth, pred): 28 | 29 | # Optimize the scaling factor 30 | scale = np.sum(gtruth[:, :3, 3] * pred[:, :3, 3]) / np.sum(pred[:, :3, 3] ** 2) 31 | 32 | return scale 33 | 34 | dump_gt = np.array(dump(gt_local_poses)) 35 | dump_our = np.array(dump(our_local_poses)) 36 | 37 | scale_our = dump_our * compute_scale(dump_gt, dump_our) 38 | 39 | num = gt_local_poses.shape[0] 40 | points_our = [] 41 | points_gt = [] 42 | origin = 
np.array([[0], [0], [0], [1]]) 43 | 44 | for i in range(0, num): 45 | point_our = np.dot(scale_our[i], origin) 46 | point_gt = np.dot(dump_gt[i], origin) 47 | 48 | points_our.append(point_our) 49 | points_gt.append(point_gt) 50 | 51 | points_our = np.array(points_our) 52 | points_gt = np.array(points_gt) 53 | 54 | # new a figure and set it into 3d 55 | fig = plt.figure() 56 | ax = fig.add_subplot(projection='3d') 57 | 58 | # set figure information 59 | # ax.set_title("3D_Curve") 60 | ax.set_xlabel("x [mm]") 61 | ax.set_ylabel("y [mm]") 62 | ax.set_zlabel("z [mm]") 63 | 64 | # draw the figure, the color is r = read 65 | figure1, = ax.plot(points_gt[:, 0, 0], points_gt[:, 1, 0], points_gt[:, 2, 0], label = 'GT', linestyle = '-', c='b', linewidth=1.6) 66 | figure2, = ax.plot(points_our[:, 0, 0], points_our[:, 1, 0], points_our[:, 2, 0], label = 'Prediction', linestyle = '-', c='g', linewidth=1.6) 67 | 68 | plt.legend() 69 | plt.savefig('trajectory_pose_seq{}.png'.format(opt.scared_pose_seq),dpi=600) 70 | plt.show() 71 | -------------------------------------------------------------------------------- /networks/optical_flow_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from torch.distributions.normal import Normal 8 | from collections import OrderedDict 9 | from layers import * 10 | 11 | 12 | class PositionDecoder(nn.Module): 13 | def __init__(self, num_ch_enc, scales = range(4) , num_output_channels=2, use_skips=True): 14 | super(PositionDecoder, self).__init__() 15 | 16 | self.num_output_channels = num_output_channels 17 | self.use_skips = use_skips 18 | self.upsample_mode = 'nearest' 19 | self.scales = scales 20 | 21 | self.num_ch_enc = num_ch_enc 22 | self.num_ch_dec = np.array([16, 32, 64, 128, 256]) 23 | self.conv = getattr(nn, 'Conv2d') 24 | 25 | # decoder 26 | self.convs = OrderedDict() # 有序字典 27 | for i in range(4, -1, -1): 28 | # upconv_0 29 | num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] 30 | num_ch_out = self.num_ch_dec[i] 31 | self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) 32 | 33 | # upconv_1 34 | num_ch_in = self.num_ch_dec[i] 35 | if self.use_skips and i > 0: 36 | num_ch_in += self.num_ch_enc[i - 1] 37 | num_ch_out = self.num_ch_dec[i] 38 | self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) 39 | 40 | for s in self.scales: 41 | 42 | self.convs[("position_conv", s)] = self.conv (self.num_ch_dec[s], self.num_output_channels, kernel_size = 3, padding = 1) 43 | # init flow layer with small weights and bias 44 | self.convs[("position_conv", s)].weight = nn.Parameter(Normal(0, 1e-5).sample(self.convs[("position_conv", s)].weight.shape)) 45 | self.convs[("position_conv", s)].bias = nn.Parameter(torch.zeros(self.convs[("position_conv", s)].bias.shape)) 46 | 47 | self.decoder = nn.ModuleList(list(self.convs.values())) 48 | 49 | def forward(self, input_features): 50 | self.outputs = {} 51 | # decoder 52 | x = input_features[-1] 53 | for i in range(4, -1, -1): 54 | x = self.convs[("upconv", i, 0)](x) 55 | x = [upsample(x)] 56 | if self.use_skips and i > 0: 57 | x += [input_features[i - 1]] 58 | x = torch.cat(x, 1) 59 | x = self.convs[("upconv", i, 1)](x) 60 | if i in self.scales: 61 | self.outputs[("position", i)] = self.convs[("position_conv", i)](x) 62 | 63 | return self.outputs 64 | 
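As a hypothetical usage sketch of the flow branch above: `ResnetEncoder` (defined in the next file) feeds `PositionDecoder`, which returns a 2-channel offset field per scale. The 256x320 resolution and two-frame stacking are illustrative assumptions, and the sketch assumes it runs from the repository root so that `layers.py` (providing `ConvBlock`, `Conv3x3`, `upsample`) is importable.

```python
import torch

from networks.resnet_encoder import ResnetEncoder
from networks.optical_flow_decoder import PositionDecoder

# Two-frame input for the optical-flow branch; 256x320 is only an illustrative
# resolution (any height/width divisible by 32 works with the ResNet encoder).
encoder = ResnetEncoder(num_layers=18, pretrained=False, num_input_images=2)
decoder = PositionDecoder(encoder.num_ch_enc, scales=range(4))

frames = torch.randn(1, 6, 256, 320)   # two RGB frames stacked on the channel axis
features = encoder(frames)             # list of 5 feature maps, strides 2 to 32
outputs = decoder(features)            # dict keyed by ("position", scale)

for s in range(4):
    # 2-channel flow field per scale; scale 0 is at full input resolution
    print(s, outputs[("position", s)].shape)
```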
-------------------------------------------------------------------------------- /networks/resnet_encoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torchvision.models as models 8 | import torch.utils.model_zoo as model_zoo 9 | 10 | 11 | class ResNetMultiImageInput(models.ResNet): 12 | """Constructs a resnet model with varying number of input images. 13 | Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 14 | """ 15 | def __init__(self, block, layers, num_classes=1000, num_input_images=1): 16 | super(ResNetMultiImageInput, self).__init__(block, layers) 17 | self.inplanes = 64 18 | self.conv1 = nn.Conv2d( 19 | num_input_images * 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 20 | self.bn1 = nn.BatchNorm2d(64) 21 | self.relu = nn.ReLU(inplace=True) 22 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 23 | self.layer1 = self._make_layer(block, 64, layers[0]) 24 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 25 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 26 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 27 | 28 | for m in self.modules(): 29 | if isinstance(m, nn.Conv2d): 30 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 31 | elif isinstance(m, nn.BatchNorm2d): 32 | nn.init.constant_(m.weight, 1) 33 | nn.init.constant_(m.bias, 0) 34 | 35 | 36 | def resnet_multiimage_input(num_layers, pretrained=False, num_input_images=1): 37 | """Constructs a ResNet model. 38 | Args: 39 | num_layers (int): Number of resnet layers. Must be 18 or 50 40 | pretrained (bool): If True, returns a model pre-trained on ImageNet 41 | num_input_images (int): Number of frames stacked as input 42 | """ 43 | assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" 44 | blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] 45 | block_type = {18: models.resnet.BasicBlock, 50: models.resnet.Bottleneck}[num_layers] 46 | model = ResNetMultiImageInput(block_type, blocks, num_input_images=num_input_images) 47 | 48 | if pretrained: 49 | # loaded = model_zoo.load_url(models.resnet.model_urls['resnet{}'.format(num_layers)]) 50 | loaded = torch.hub.load_state_dict_from_url(models.ResNet18_Weights.IMAGENET1K_V1.url) 51 | loaded['conv1.weight'] = torch.cat( 52 | [loaded['conv1.weight']] * num_input_images, 1) / num_input_images 53 | model.load_state_dict(loaded) 54 | return model 55 | 56 | 57 | class ResnetEncoder(nn.Module): 58 | """Pytorch module for a resnet encoder 59 | """ 60 | def __init__(self, num_layers, pretrained, num_input_images=1): 61 | super(ResnetEncoder, self).__init__() 62 | 63 | self.num_ch_enc = np.array([64, 64, 128, 256, 512]) 64 | 65 | resnets = {18: models.resnet18, 66 | 34: models.resnet34, 67 | 50: models.resnet50, 68 | 101: models.resnet101, 69 | 152: models.resnet152} 70 | 71 | if num_layers not in resnets: 72 | raise ValueError("{} is not a valid number of resnet layers".format(num_layers)) 73 | 74 | if num_input_images > 1: 75 | self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images) 76 | else: 77 | self.encoder = resnets[num_layers](pretrained) 78 | 79 | if num_layers > 34: 80 | self.num_ch_enc[1:] *= 4 81 | 82 | def forward(self, input_image): 83 | 84 | self.features = [] 85 | # x = (input_image - 0.45) / 0.225 86 | x = 
input_image 87 | x = self.encoder.conv1(x) 88 | x = self.encoder.bn1(x) 89 | self.features.append(self.encoder.relu(x)) 90 | self.features.append(self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) 91 | self.features.append(self.encoder.layer2(self.features[-1])) 92 | self.features.append(self.encoder.layer3(self.features[-1])) 93 | self.features.append(self.encoder.layer4(self.features[-1])) 94 | 95 | return self.features 96 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import os 3 | import hashlib 4 | import zipfile 5 | from six.moves import urllib 6 | 7 | 8 | def readlines(filename): 9 | """Read all the lines in a text file and return as a list 10 | """ 11 | with open(filename, 'r') as f: 12 | lines = f.read().splitlines() 13 | return lines 14 | 15 | 16 | def normalize_image(x): 17 | """Rescale image pixels to span range [0, 1] 18 | """ 19 | ma = float(x.max().cpu().data) 20 | mi = float(x.min().cpu().data) 21 | d = ma - mi if ma != mi else 1e5 22 | return (x - mi) / d 23 | 24 | 25 | def sec_to_hm(t): 26 | """Convert time in seconds to time in hours, minutes and seconds 27 | e.g. 10239 -> (2, 50, 39) 28 | """ 29 | t = int(t) 30 | s = t % 60 31 | t //= 60 32 | m = t % 60 33 | t //= 60 34 | return t, m, s 35 | 36 | 37 | def sec_to_hm_str(t): 38 | """Convert time in seconds to a nice string 39 | e.g. 10239 -> '02h50m39s' 40 | """ 41 | h, m, s = sec_to_hm(t) 42 | return "{:02d}h{:02d}m{:02d}s".format(h, m, s) 43 | 44 | 45 | def download_model_if_doesnt_exist(model_name): 46 | """If pretrained kitti model doesn't exist, download and unzip it 47 | """ 48 | # values are tuples of (, ) 49 | download_paths = { 50 | "mono_640x192": 51 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/mono_640x192.zip", 52 | "a964b8356e08a02d009609d9e3928f7c"), 53 | "stereo_640x192": 54 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/stereo_640x192.zip", 55 | "3dfb76bcff0786e4ec07ac00f658dd07"), 56 | "mono+stereo_640x192": 57 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/mono%2Bstereo_640x192.zip", 58 | "c024d69012485ed05d7eaa9617a96b81"), 59 | "mono_no_pt_640x192": 60 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/mono_no_pt_640x192.zip", 61 | "9c2f071e35027c895a4728358ffc913a"), 62 | "stereo_no_pt_640x192": 63 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/stereo_no_pt_640x192.zip", 64 | "41ec2de112905f85541ac33a854742d1"), 65 | "mono+stereo_no_pt_640x192": 66 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/mono%2Bstereo_no_pt_640x192.zip", 67 | "46c3b824f541d143a45c37df65fbab0a"), 68 | "mono_1024x320": 69 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/mono_1024x320.zip", 70 | "0ab0766efdfeea89a0d9ea8ba90e1e63"), 71 | "stereo_1024x320": 72 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/stereo_1024x320.zip", 73 | "afc2f2126d70cf3fdf26b550898b501a"), 74 | "mono+stereo_1024x320": 75 | ("https://storage.googleapis.com/niantic-lon-static/research/monodepth2/mono%2Bstereo_1024x320.zip", 76 | "cdc5fc9b23513c07d5b19235d9ef08f7"), 77 | } 78 | 79 | if not os.path.exists("models"): 80 | os.makedirs("models") 81 | 82 | model_path = os.path.join("models", model_name) 83 | 84 | def 
check_file_matches_md5(checksum, fpath):
85 |         if not os.path.exists(fpath):
86 |             return False
87 |         with open(fpath, 'rb') as f:
88 |             current_md5checksum = hashlib.md5(f.read()).hexdigest()
89 |         return current_md5checksum == checksum
90 | 
91 |     # see if we have the model already downloaded...
92 |     if not os.path.exists(os.path.join(model_path, "encoder.pth")):
93 | 
94 |         model_url, required_md5checksum = download_paths[model_name]
95 | 
96 |         if not check_file_matches_md5(required_md5checksum, model_path + ".zip"):
97 |             print("-> Downloading pretrained model to {}".format(model_path + ".zip"))
98 |             urllib.request.urlretrieve(model_url, model_path + ".zip")
99 | 
100 |         if not check_file_matches_md5(required_md5checksum, model_path + ".zip"):
101 |             print(" Failed to download a file which matches the checksum - quitting")
102 |             quit()
103 | 
104 |         print(" Unzipping model...")
105 |         with zipfile.ZipFile(model_path + ".zip", 'r') as f:
106 |             f.extractall(model_path)
107 | 
108 |         print(" Model unzipped to {}".format(model_path))
109 | 
--------------------------------------------------------------------------------
/test_simple.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 | 
3 | import os
4 | import sys
5 | import glob
6 | import argparse
7 | import numpy as np
8 | import PIL.Image as pil
9 | import matplotlib as mpl
10 | import matplotlib.cm as cm
11 | import cv2
12 | import torch
13 | from torchvision import transforms, datasets
14 | 
15 | import networks
16 | from layers import disp_to_depth
17 | 
18 | 
19 | def parse_args():
20 |     parser = argparse.ArgumentParser(
21 |         description='Simple testing function for Monodepthv2 models.')
22 | 
23 |     parser.add_argument('--image_path', type=str,
24 |                         help='path to a test image or folder of images', required=True)
25 |     parser.add_argument('--model_path', type=str,
26 |                         help='path to the test model', required=True)
27 |     parser.add_argument('--ext', type=str,
28 |                         help='image extension to search for in folder', default="png")
29 |     parser.add_argument("--no_cuda",
30 |                         help='if set, disables CUDA',
31 |                         action='store_true')
32 | 
33 |     return parser.parse_args()
34 | 
35 | 
36 | def test_simple(args):
37 |     """Function to predict for a single image or folder of images
38 |     """
39 |     if torch.cuda.is_available() and not args.no_cuda:
40 |         device = torch.device("cuda")
41 |     else:
42 |         device = torch.device("cpu")
43 | 
44 |     model_path = args.model_path
45 | 
46 |     print("-> Loading model from ", model_path)
47 |     depth_model_path = os.path.join(model_path, "depth_model.pth")
48 | 
49 |     depth_model_dict = torch.load(depth_model_path, map_location=device)
50 |     depth_model = networks.DARES()
51 | 
52 |     model_dict = depth_model.state_dict()
53 | 
54 |     depth_model.load_state_dict({k: v for k, v in depth_model_dict.items() if k in model_dict})
55 |     depth_model.to(device)
56 |     depth_model.eval()
57 |     feed_height, feed_width = 256, 320  # assumed training resolution; not defined in the original script
58 |     # FINDING INPUT IMAGES
59 |     if os.path.isfile(args.image_path):
60 |         # Only testing on a single image
61 |         paths = [args.image_path]
62 |         output_directory = os.path.dirname(args.image_path)
63 |     elif os.path.isdir(args.image_path):
64 |         # Searching folder for images
65 |         paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
66 |         output_directory = args.image_path
67 |     else:
68 |         raise Exception("Can not find args.image_path: {}".format(args.image_path))
69 | 
70 |     print("-> Predicting on {:d} test images".format(len(paths)))
71 | 
72 |     # PREDICTING ON EACH IMAGE IN TURN
73 |     with torch.no_grad():
74 |         for idx, image_path in enumerate(paths):
75 | 
76 |             if image_path.endswith("_disp.jpg"):
77 |                 # don't try to predict disparity for a disparity image!
78 |                 continue
79 | 
80 |             # Load image and preprocess
81 |             input_image = pil.open(image_path).convert('RGB')
82 | 
83 |             original_width, original_height = input_image.size
84 |             input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
85 |             input_image = transforms.ToTensor()(input_image).unsqueeze(0)
86 | 
87 |             # PREDICTION
88 |             input_image = input_image.to(device)
89 |             output = depth_model(input_image)
90 |             pred_disp, _ = disp_to_depth(output[("disp", 0)], 0.1, 150.0)  # assumed depth range matching options.py defaults
91 |             pred_disp = pred_disp.cpu()[:, 0].numpy()
92 | 
93 |             disp = output[("disp", 0)]
94 |             disp_resized = torch.nn.functional.interpolate(
95 |                 disp, (original_height * 2, original_width * 2), mode="bilinear", align_corners=False)
96 | 
97 |             # Saving numpy file
98 |             output_name = os.path.splitext(os.path.basename(image_path))[0]
99 |             name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
100 |             np.save(name_dest_npy, pred_disp)
101 | 
102 |             # Saving colormapped depth image
103 |             disp_resized_np = disp_resized.squeeze().cpu().numpy()
104 |             vmax = np.percentile(disp_resized_np, 95)
105 | 
106 |             normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)  # normalize to 0-1
107 |             mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')  # colormap
108 |             colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
109 | 
110 |             im = pil.fromarray(colormapped_im)
111 | 
112 |             name_dest_im = os.path.join(output_directory, "{}.jpeg".format(output_name))
113 |             im.save(name_dest_im, quality=95)
114 | 
115 |             print(" Processed {:d} of {:d} images - saved prediction to {}".format(
116 |                 idx + 1, len(paths), name_dest_im))
117 | 
118 |     print('-> Done!')
119 | 
120 | 
121 | if __name__ == '__main__':
122 |     args = parse_args()
123 |     test_simple(args)
124 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DARES
2 | 
3 | This is the official PyTorch implementation for training and testing depth estimation models using the method described in
4 | 
5 | > **DARES: Depth Anything in Robotic Endoscopic Surgery with Self-supervised Vector-LoRA of the Foundation Model**
6 | >
7 | > Mona Sheikh Zeinoddin, Chiara Lena, Jiongqi Qu, Luca Carlini, Mattia
8 | Magro, Seunghoi Kim, Elena De Momi, Sophia Bano, Matthew
9 | Grech-Sollars, Evangelos Mazomenos, Daniel C. Alexander, Danail
10 | Stoyanov, Matthew J. Clarkson and Mobarakol Islam
11 | >
12 | > [accepted by the European Conference on Computer Vision 2024 Efficient Deep Learning for Foundation Models Workshop (arXiv pdf)](https://arxiv.org/pdf/2408.17433)
13 | 
14 | 
15 | #### Overview
16 | 
17 | 

18 | ![Overview of DARES](imgs/overview.png)
19 | 
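The overview figure summarises the idea named in the title: Vector-LoRA, a per-layer vector of LoRA ranks applied to the frozen DINOv2 attention projections of Depth Anything (see `networks/dares.py` further below). As a minimal, standalone illustration of that low-rank update (not the repository class itself), assuming a ViT-S embedding width of 384:

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """W(x) + B(A(x)): the update that _LoRA_qkv in networks/dares.py applies
    to the frozen query/value projections of the backbone."""
    def __init__(self, base: nn.Linear, rank: int):
        super().__init__()
        self.base = base                                   # frozen pretrained projection
        self.lora_a = nn.Linear(base.in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)                 # start as a zero update

    def forward(self, x):
        return self.base(x) + self.lora_b(self.lora_a(x))

# Per-layer rank vector, decreasing with depth, as in the DARES default
ranks = [14, 14, 12, 12, 10, 10, 8, 8, 8, 8, 8, 8]
layer0_query = LoRALinear(nn.Linear(384, 384), ranks[0])   # 384 = assumed ViT-S width
print(layer0_query(torch.randn(2, 10, 384)).shape)         # torch.Size([2, 10, 384])
```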

20 | 
21 | ## ✏️ 📄 Citation
22 | 
23 | If you find our work useful in your research, please consider citing our paper:
24 | 
25 | ```
26 | @article{zeinoddin2024dares,
27 |   title={DARES: Depth Anything in Robotic Endoscopic Surgery with Self-supervised Vector-LoRA of the Foundation Model},
28 |   author={Zeinoddin, Mona Sheikh and Lena, Chiara and Qu, Jiongqi and Carlini, Luca and Magro, Mattia and Kim, Seunghoi and De Momi, Elena and Bano, Sophia and Grech-Sollars, Matthew and Mazomenos, Evangelos and others},
29 |   journal={arXiv preprint arXiv:2408.17433},
30 |   year={2024}
31 | }
32 | ```
33 | 
34 | 
35 | 
36 | 
37 | ## ⚙️ Setup
38 | 
39 | We ran our experiments with PyTorch 1.2.0, torchvision 0.4.0, CUDA 10.2, Python 3.7.3 and Ubuntu 18.04.
40 | 
41 | 
42 | 
43 | ## 🖼️ Prediction for a single image or a folder of images
44 | 
45 | You can predict scaled disparity for a single image or a folder of images with:
46 | 
47 | ```shell
48 | CUDA_VISIBLE_DEVICES=0 python test_simple.py --model_path <your_model_path> --image_path <your_image_or_folder_path>
49 | ```
50 | 
51 | ## Initializing with AF-Sfm Learner weights
52 | 
53 | You can download the AF-Sfm Learner weights that we use for initialization with:
54 | 
55 | ```shell
56 | gdown 1kf7LjQ6a2ACKr6nX5Uyee3of3bXn1xWB
57 | unzip -q Model_trained_end_to_end.zip
58 | mv Model_trained_end_to_end af_sfmlearner_weights
59 | ```
60 | 
61 | 
62 | ## 💾 Datasets
63 | 
64 | You can download the [Endovis or SCARED dataset](https://endovissub2019-scared.grand-challenge.org) by signing the challenge rules and emailing them to max.allan@intusurg.com.
65 | 
66 | **Endovis split**
67 | 
68 | The train/test/validation split for the Endovis dataset used in our work is defined in the `splits/endovis` folder.
69 | 
70 | **Endovis data preprocessing**
71 | 
72 | We use ffmpeg to convert each RGB.mp4 video into PNG frames:
73 | 
74 | ```shell
75 | find . -name "*.mp4" -print0 | xargs -0 -I {} sh -c 'output_dir=$(dirname "$1"); ffmpeg -i "$1" "$output_dir/%10d.png"' _ {}
76 | ```
77 | We only use the left frames in our experiments; please refer to [extract_left_frames.py](https://github.com/ShuweiShao/AF-SfMLearner/blob/main/extract_left_frames.py). For datasets 8 and 9, we renumber keyframes 0-4 as keyframes 1-5.
78 | 
79 | **Data structure**
80 | 
81 | The dataset directory structure is shown below:
82 | 
83 | ```
84 | /path/to/endovis_data/
85 |   dataset1/
86 |     keyframe1/
87 |       image_02/
88 |         data/
89 |           0000000001.png
90 | ```
91 | 
92 | 
93 | 
94 | ## ⏳ Endovis training
95 | 
96 | 
97 | ```shell
98 | CUDA_VISIBLE_DEVICES=0 python train_end_to_end.py --data_path <your_data_path> --log_dir <your_log_dir>
99 | ```
100 | 
101 | ## 📊 Endovis evaluation
102 | 
103 | To prepare the ground truth depth maps run:
104 | ```shell
105 | CUDA_VISIBLE_DEVICES=0 python export_gt_depth.py --data_path endovis_data --split endovis
106 | ```
107 | ...assuming that you have placed the endovis dataset in the default location of `./endovis_data/`.
108 | 
109 | The following example command evaluates the epoch 19 weights of a model named `mono_model`:
110 | ```shell
111 | CUDA_VISIBLE_DEVICES=0 python evaluate_depth.py --data_path <your_data_path> --load_weights_folder ~/mono_model/mdp/models/weights_19 --eval_mono
112 | ```
113 | 
114 | #### Depth Estimation
115 | 
116 | 

117 | ![Depth estimation results](imgs/depth.png)
118 | 
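The evaluation command above reports the standard error metrics listed in the Model zoo table (Abs Rel, Sq Rel, RMSE, RMSE log). A rough sketch of how these are conventionally computed in Monodepth2-style pipelines with per-image median scaling (`--eval_mono`); this is an illustration rather than a copy of `evaluate_depth.py`, and the clipping range is an assumed default:

```python
import numpy as np

def compute_depth_errors(gt, pred, min_depth=1e-3, max_depth=150.0):
    """Monodepth2-style metrics; the depth clipping range is an assumed default."""
    mask = gt > 0
    gt, pred = gt[mask], pred[mask]

    pred *= np.median(gt) / np.median(pred)    # per-image median scaling (--eval_mono)
    pred = np.clip(pred, min_depth, max_depth)

    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean((gt - pred) ** 2 / gt)
    rmse = np.sqrt(np.mean((gt - pred) ** 2))
    rmse_log = np.sqrt(np.mean((np.log(gt) - np.log(pred)) ** 2))
    return abs_rel, sq_rel, rmse, rmse_log
```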

119 | 120 | #### Visual Odometry 121 | 122 |

123 | ![Visual odometry results](imgs/pose.png)
124 | 
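`evaluate_pose.py` (shown later in this listing) writes the predicted relative poses to `splits/endovis/pred_pose_sq{N}.npz`. A small sketch of how such relative transforms can be chained into an absolute trajectory, mirroring the `dump`/`dump_xyz` helpers of `visualize_pose.py` and `evaluate_pose.py`; the file path assumes the evaluation script has already been run:

```python
import numpy as np

def accumulate_poses(relative_poses):
    """Chain 4x4 relative poses into camera-to-world matrices,
    mirroring dump_xyz() in evaluate_pose.py."""
    cam_to_world = np.eye(4)
    trajectory = [cam_to_world]
    for rel in relative_poses:
        cam_to_world = np.dot(cam_to_world, rel)
        trajectory.append(cam_to_world)
    return np.array(trajectory)

# Hypothetical output of evaluate_pose.py for sequence 1
pred = np.load("splits/endovis/pred_pose_sq1.npz")["data"]
xyz = accumulate_poses(pred)[:, :3, 3]   # camera centres along the trajectory (up to scale)
```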

125 | 126 | #### 3D Reconstruction 127 | 128 |

129 | ![3D reconstruction results](imgs/reconstruction.png)
130 | 
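`evaluate_3d_reconstruction.py` is not reproduced in this listing, so as a purely illustrative sketch (not the repository's evaluation code), here is how a predicted depth map can be back-projected into a point cloud using the normalised SCARED intrinsics defined in `datasets/scared_dataset.py`; the 320x256 resolution is an assumption:

```python
import numpy as np

def depth_to_point_cloud(depth, width=320, height=256):
    """Back-project a (height, width) depth map with the normalised SCARED
    intrinsics from datasets/scared_dataset.py (fx=0.82, fy=1.02, cx=cy=0.5)."""
    K = np.array([[0.82 * width, 0.0, 0.5 * width],
                  [0.0, 1.02 * height, 0.5 * height],
                  [0.0, 0.0, 1.0]])
    inv_K = np.linalg.inv(K)

    u, v = np.meshgrid(np.arange(width), np.arange(height))
    pixels = np.stack([u, v, np.ones_like(u)]).reshape(3, -1)  # homogeneous pixel grid
    rays = inv_K @ pixels                                      # unit-depth camera rays
    points = rays * depth.reshape(1, -1)                       # scale each ray by its depth
    return points.T                                            # (height * width, 3) points
```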

131 | 132 | 133 | ## Model zoo 134 | 135 | | Model | Abs Rel | Sq Rel | RMSE | RMSE log | Link | 136 | | ------------ | ---------- | ------ | --------- | ---- | ---- | 137 | | End-to-end best model weights | 0.052 | 0.356 | 4.483 | 0.073 |[google](https://drive.google.com/file/d/11C0sw396TcH2hMM7u6uMr-uBsCP4l2Kd/view?usp=sharing)| 138 | 139 | 140 | 141 | ## Contact 142 | 143 | If you have any questions, please feel free to contact mona.zeinoddin.22@ucl.ac.uk or mobarakol.islam@ucl.ac.uk 144 | 145 | 146 | 147 | ## Acknowledgement 148 | 149 | Our code is based on the implementation of [AF-Sfm Learner](https://github.com/ShuweiShao/AF-SfMLearner). We thank AF-Sfm Learner's authors for their excellent work and repository. 150 | -------------------------------------------------------------------------------- /evaluate_pose.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import torch 5 | import networks 6 | import numpy as np 7 | 8 | from torch.utils.data import DataLoader 9 | from layers import transformation_from_parameters 10 | from utils import readlines 11 | from options import MonodepthOptions 12 | from datasets import SCAREDRAWDataset 13 | import warnings 14 | warnings.filterwarnings('ignore') 15 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 16 | 17 | 18 | # from https://github.com/tinghuiz/SfMLearner 19 | def dump_xyz(source_to_target_transformations): 20 | xyzs = [] 21 | cam_to_world = np.eye(4) 22 | xyzs.append(cam_to_world[:3, 3]) 23 | for source_to_target_transformation in source_to_target_transformations: 24 | cam_to_world = np.dot(cam_to_world, source_to_target_transformation) 25 | # cam_to_world = np.dot(source_to_target_transformation, cam_to_world) 26 | xyzs.append(cam_to_world[:3, 3]) 27 | return xyzs 28 | 29 | 30 | def dump_r(source_to_target_transformations): 31 | rs = [] 32 | cam_to_world = np.eye(4) 33 | rs.append(cam_to_world[:3, :3]) 34 | for source_to_target_transformation in source_to_target_transformations: 35 | cam_to_world = np.dot(cam_to_world, source_to_target_transformation) 36 | # cam_to_world = np.dot(source_to_target_transformation, cam_to_world) 37 | rs.append(cam_to_world[:3, :3]) 38 | return rs 39 | 40 | 41 | # from https://github.com/tinghuiz/SfMLearner 42 | def compute_ate(gtruth_xyz, pred_xyz_o): 43 | 44 | # Make sure that the first matched frames align (no need for rotational alignment as 45 | # all the predicted/ground-truth snippets have been converted to use the same coordinate 46 | # system with the first frame of the snippet being the origin). 
47 | offset = gtruth_xyz[0] - pred_xyz_o[0] 48 | pred_xyz = pred_xyz_o + offset[None, :] 49 | 50 | # Optimize the scaling factor 51 | scale = np.sum(gtruth_xyz * pred_xyz) / np.sum(pred_xyz ** 2) 52 | alignment_error = pred_xyz * scale - gtruth_xyz 53 | rmse = np.sqrt(np.sum(alignment_error ** 2)) / gtruth_xyz.shape[0] 54 | return rmse 55 | 56 | 57 | def compute_re(gtruth_r, pred_r): 58 | RE = 0 59 | gt = gtruth_r 60 | pred = pred_r 61 | for gt_pose, pred_pose in zip(gt, pred): 62 | # Residual matrix to which we compute angle's sin and cos 63 | R = gt_pose @ np.linalg.inv(pred_pose) 64 | s = np.linalg.norm([R[0, 1] - R[1, 0], 65 | R[1, 2] - R[2, 1], 66 | R[0, 2] - R[2, 0]]) 67 | c = np.trace(R) - 1 68 | # Note: we actually compute double of cos and sin, but arctan2 is invariant to scale 69 | RE += np.arctan2(s, c) 70 | 71 | return RE / gtruth_r.shape[0] 72 | 73 | 74 | def evaluate(opt): 75 | """Evaluate odometry on the SCARED dataset 76 | """ 77 | assert os.path.isdir(opt.load_weights_folder), \ 78 | "Cannot find a folder at {}".format(opt.load_weights_folder) 79 | 80 | filenames = readlines( 81 | os.path.join(os.path.dirname(__file__), "splits", "endovis", 82 | "test_files_sequence{}.txt".format(opt.scared_pose_seq))) 83 | 84 | dataset = SCAREDRAWDataset(opt.data_path, filenames, opt.height, opt.width, 85 | [0, 1], 4, is_train=False) 86 | dataloader = DataLoader(dataset, opt.batch_size, shuffle=False, 87 | num_workers=opt.num_workers, pin_memory=True, drop_last=False) 88 | 89 | pose_encoder_path = os.path.join(opt.load_weights_folder, "pose_encoder.pth") 90 | pose_decoder_path = os.path.join(opt.load_weights_folder, "pose.pth") 91 | 92 | pose_encoder = networks.ResnetEncoder(opt.num_layers, False, 2) 93 | pose_encoder.load_state_dict(torch.load(pose_encoder_path, map_location=device.type)) 94 | 95 | pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2) 96 | pose_decoder.load_state_dict(torch.load(pose_decoder_path, map_location=device.type)) 97 | 98 | pose_encoder.to(device) 99 | pose_encoder.eval() 100 | pose_decoder.to(device) 101 | pose_decoder.eval() 102 | 103 | pred_poses = [] 104 | 105 | print("-> Computing pose predictions") 106 | 107 | opt.frame_ids = [0, 1] # pose network only takes two frames as input 108 | 109 | with torch.no_grad(): 110 | for inputs in dataloader: 111 | for key, ipt in inputs.items(): 112 | inputs[key] = ipt.to(device) 113 | 114 | all_color_aug = torch.cat([inputs[("color", 1, 0)], inputs[("color", 0, 0)]], 1) 115 | 116 | features = [pose_encoder(all_color_aug)] 117 | axisangle, translation = pose_decoder(features) 118 | 119 | pred_poses.append( 120 | transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy()) 121 | 122 | pred_poses = np.concatenate(pred_poses) 123 | # np.savez_compressed(os.path.join(os.path.dirname(__file__), "splits", "endovis", "curve", "pose_our.npz"), data=np.array(pred_poses)) 124 | np.savez_compressed(os.path.join(os.path.dirname(__file__), "splits", "endovis", "pred_pose_sq{}.npz".format(opt.scared_pose_seq)), data=np.array(pred_poses)) 125 | 126 | gt_path = os.path.join(os.path.dirname(__file__), "splits", "endovis", "gt_poses_sq{}.npz".format(opt.scared_pose_seq)) 127 | gt_local_poses = np.load(gt_path, fix_imports=True, encoding='latin1')["data"] 128 | 129 | ates = [] 130 | res = [] 131 | num_frames = gt_local_poses.shape[0] 132 | track_length = 5 133 | for i in range(0, num_frames - 1): 134 | local_xyzs = np.array(dump_xyz(pred_poses[i:i + track_length - 1])) 135 | gt_local_xyzs = 
np.array(dump_xyz(gt_local_poses[i:i + track_length - 1])) 136 | local_rs = np.array(dump_r(pred_poses[i:i + track_length - 1])) 137 | gt_rs = np.array(dump_r(gt_local_poses[i:i + track_length - 1])) 138 | 139 | ates.append(compute_ate(gt_local_xyzs, local_xyzs)) 140 | res.append(compute_re(local_rs, gt_rs)) 141 | 142 | print("\n Trajectory error: {:0.4f}, std: {:0.4f}\n".format(np.mean(ates), np.std(ates))) 143 | print("\n Rotation error: {:0.4f}, std: {:0.4f}\n".format(np.mean(res), np.std(res))) 144 | 145 | 146 | if __name__ == "__main__": 147 | options = MonodepthOptions() 148 | evaluate(options.parse()) 149 | -------------------------------------------------------------------------------- /datasets/mono_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import random 5 | import numpy as np 6 | import copy 7 | from PIL import Image # using pillow-simd for increased speed 8 | from PIL import ImageFile 9 | 10 | import torch 11 | import torch.utils.data as data 12 | from torchvision import transforms 13 | 14 | ImageFile.LOAD_TRUNCATED_IMAGES=True 15 | 16 | def pil_loader(path): 17 | # open path as file to avoid ResourceWarning 18 | # (https://github.com/python-pillow/Pillow/issues/835) 19 | with open(path, 'rb') as f: 20 | with Image.open(f) as img: 21 | return img.convert('RGB') 22 | 23 | 24 | class MonoDataset(data.Dataset): 25 | """Superclass for monocular dataloaders 26 | 27 | Args: 28 | data_path 29 | filenames 30 | height 31 | width 32 | frame_idxs 33 | num_scales 34 | is_train 35 | img_ext 36 | """ 37 | def __init__(self, 38 | data_path, 39 | filenames, 40 | height, 41 | width, 42 | frame_idxs, 43 | num_scales, 44 | is_train=False, 45 | img_ext='.png'): 46 | super(MonoDataset, self).__init__() 47 | 48 | self.data_path = data_path 49 | self.filenames = filenames 50 | self.height = height 51 | self.width = width 52 | self.num_scales = num_scales 53 | # self.interp = Image.ANTIALIAS 54 | self.interp = Image.LANCZOS 55 | 56 | self.frame_idxs = frame_idxs 57 | 58 | self.is_train = is_train 59 | self.img_ext = img_ext 60 | 61 | self.loader = pil_loader 62 | self.to_tensor = transforms.ToTensor() 63 | 64 | # We need to specify augmentations differently in newer versions of torchvision. 65 | # We first try the newer tuple version; if this fails we fall back to scalars 66 | try: 67 | self.brightness = (0.8, 1.2) 68 | self.contrast = (0.8, 1.2) 69 | self.saturation = (0.8, 1.2) 70 | self.hue = (-0.1, 0.1) 71 | transforms.transforms.ColorJitter(self.brightness,self.contrast,self.saturation,self.hue) 72 | except TypeError: 73 | self.brightness = 0.2 74 | self.contrast = 0.2 75 | self.saturation = 0.2 76 | self.hue = 0.1 77 | 78 | self.resize = {} 79 | for i in range(self.num_scales): 80 | s = 2 ** i 81 | self.resize[i] = transforms.Resize((self.height // s, self.width // s), 82 | interpolation=self.interp) 83 | self.load_depth = self.check_depth() 84 | 85 | def preprocess(self, inputs, color_aug): 86 | """Resize colour images to the required scales and augment if required 87 | 88 | We create the color_aug object in advance and apply the same augmentation to all 89 | images in this item. This ensures that all images input to the pose network receive the 90 | same augmentation. 
91 | """ 92 | for k in list(inputs): 93 | frame = inputs[k] 94 | if "color" in k: 95 | n, im, i = k 96 | for i in range(self.num_scales): 97 | inputs[(n, im, i)] = self.resize[i](inputs[(n, im, i - 1)]) 98 | 99 | for k in list(inputs): 100 | f = inputs[k] 101 | if "color" in k: 102 | n, im, i = k 103 | inputs[(n, im, i)] = self.to_tensor(f) 104 | inputs[(n + "_aug", im, i)] = self.to_tensor(color_aug(f)) 105 | 106 | def __len__(self): 107 | return len(self.filenames) 108 | 109 | def __getitem__(self, index): 110 | """Returns a single training item from the dataset as a dictionary. 111 | 112 | Values correspond to torch tensors. 113 | Keys in the dictionary are either strings or tuples: 114 | 115 | ("color", , ) for raw colour images, 116 | ("color_aug", , ) for augmented colour images, 117 | ("K", scale) or ("inv_K", scale) for camera intrinsics, 118 | "stereo_T" for camera extrinsics, and 119 | "depth_gt" for ground truth depth maps. 120 | 121 | is either: 122 | an integer (e.g. 0, -1, or 1) representing the temporal step relative to 'index', 123 | or 124 | "s" for the opposite image in the stereo pair. 125 | 126 | is an integer representing the scale of the image relative to the fullsize image: 127 | -1 images at native resolution as loaded from disk 128 | 0 images resized to (self.width, self.height ) 129 | 1 images resized to (self.width // 2, self.height // 2) 130 | 2 images resized to (self.width // 4, self.height // 4) 131 | 3 images resized to (self.width // 8, self.height // 8) 132 | """ 133 | inputs = {} 134 | 135 | do_color_aug = self.is_train and random.random() > 0.5 136 | do_flip = self.is_train and random.random() > 0.5 137 | 138 | line = self.filenames[index].split() 139 | folder = line[0] 140 | sequence = folder[7] 141 | keyframe = folder[-1] 142 | inputs["sequence"] = torch.from_numpy(np.array(int(sequence))) 143 | inputs["keyframe"] = torch.from_numpy(np.array(int(keyframe))) 144 | 145 | if len(line) == 3: 146 | frame_index = int(line[1]) 147 | else: 148 | frame_index = 0 149 | 150 | if len(line) == 3: 151 | side = line[2] 152 | else: 153 | side = None 154 | 155 | inputs["frame_id"] = torch.from_numpy(np.array(frame_index)) 156 | for i in self.frame_idxs: 157 | if i == "s": 158 | other_side = {"r": "l", "l": "r"}[side] 159 | inputs[("color", i, -1)] = self.get_color(folder, frame_index, other_side, do_flip) 160 | else: 161 | inputs[("color", i, -1)] = self.get_color(folder, frame_index + i, side, do_flip) 162 | 163 | # adjusting intrinsics to match each scale in the pyramid 164 | for scale in range(self.num_scales): 165 | K = self.K.copy() 166 | K[0, :] *= self.width // (2 ** scale) 167 | K[1, :] *= self.height // (2 ** scale) 168 | 169 | 170 | inv_K = np.linalg.pinv(K) 171 | 172 | inputs[("K", scale)] = torch.from_numpy(K) 173 | inputs[("inv_K", scale)] = torch.from_numpy(inv_K) 174 | 175 | if do_color_aug: 176 | color_aug = transforms.ColorJitter(self.brightness, self.contrast, self.saturation, self.hue) 177 | else: 178 | color_aug = (lambda x: x) 179 | 180 | self.preprocess(inputs, color_aug) 181 | for i in self.frame_idxs: 182 | del inputs[("color", i, -1)] 183 | del inputs[("color_aug", i, -1)] 184 | 185 | if self.load_depth: 186 | depth_gt = self.get_depth(folder, frame_index, side, do_flip) 187 | inputs["depth_gt"] = np.expand_dims(depth_gt, 0) 188 | inputs["depth_gt"] = torch.from_numpy(inputs["depth_gt"].astype(np.float32)) 189 | 190 | if "s" in self.frame_idxs: 191 | stereo_T = np.eye(4, dtype=np.float32) 192 | baseline_sign = -1 if do_flip else 1 193 | 
side_sign = -1 if side == "l" else 1 194 | stereo_T[0, 3] = side_sign * baseline_sign * 0.1 195 | 196 | inputs["stereo_T"] = torch.from_numpy(stereo_T) 197 | return inputs 198 | 199 | def get_color(self, folder, frame_index, side, do_flip): 200 | raise NotImplementedError 201 | 202 | def check_depth(self): 203 | raise NotImplementedError 204 | 205 | def get_depth(self, folder, frame_index, side, do_flip): 206 | raise NotImplementedError 207 | -------------------------------------------------------------------------------- /networks/depth_decoder_ICRA.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | from collections import OrderedDict 8 | from layers import * 9 | from torch.nn.parameter import Parameter 10 | 11 | 12 | class DepthDecoder(nn.Module): 13 | def __init__(self, num_ch_enc, scales = range(4) , num_output_channels=1, use_skips=True): 14 | super(DepthDecoder, self).__init__() 15 | 16 | self.num_output_channels = num_output_channels 17 | self.use_skips = use_skips 18 | self.upsample_mode = 'nearest' 19 | self.scales = scales 20 | 21 | self.num_ch_enc = num_ch_enc 22 | self.num_ch_dec = np.array([16, 32, 64, 128, 256]) 23 | 24 | # decoder 25 | self.convs = OrderedDict() # 有序字典 26 | for i in range(4, -1, -1): 27 | 28 | # upconv_0 29 | num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1] 30 | num_ch_out = self.num_ch_dec[i] 31 | self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out) 32 | 33 | # upconv_1 34 | num_ch_in = self.num_ch_dec[i] 35 | if self.use_skips and i > 0: 36 | num_ch_in += self.num_ch_enc[i - 1] 37 | num_ch_out = self.num_ch_dec[i] 38 | self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out) 39 | 40 | for s in self.scales: 41 | self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels) 42 | 43 | self.convs[("attention", 0)] = ChannelSpatialSELayer(self.num_ch_enc[-1], reduction_ratio=16) 44 | # self.convs[("attention", 0)] = ChannelSELayer(self.num_ch_enc[-1], reduction_ratio=16) 45 | # self.convs[("attention", 0)] = SPPSELayer(self.num_ch_enc[-1], reduction=16) 46 | # self.convs[("attention", 0)] = SpatialSELayer(self.num_ch_enc[-1]) 47 | 48 | self.decoder = nn.ModuleList(list(self.convs.values())) 49 | self.sigmoid = nn.Sigmoid() 50 | 51 | def forward(self, input_features): 52 | self.outputs = {} 53 | # decoder 54 | x = input_features[-1] 55 | x = self.convs[("attention", 0)](x) 56 | for i in range(4, -1, -1): 57 | x = self.convs[("upconv", i, 0)](x) 58 | x = [upsample(x)] 59 | if self.use_skips and i > 0: 60 | x += [input_features[i - 1]] 61 | x = torch.cat(x, 1) 62 | x = self.convs[("upconv", i, 1)](x) 63 | if i in self.scales: 64 | self.outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x)) 65 | 66 | return self.outputs 67 | 68 | 69 | class ChannelSELayer(nn.Module): 70 | 71 | def __init__(self, num_channels, reduction_ratio=16): 72 | 73 | super(ChannelSELayer, self).__init__() 74 | num_channels_reduced = num_channels // reduction_ratio 75 | self.reduction_ratio = reduction_ratio 76 | self.fc1 = nn.Linear(num_channels, num_channels_reduced, bias=True) 77 | self.fc2 = nn.Linear(num_channels_reduced, num_channels, bias=True) 78 | self.elu = nn.ELU(inplace=True) 79 | self.sigmoid = nn.Sigmoid() 80 | 81 | def forward(self, input_tensor): 82 | 83 | batch_size, num_channels, H, W = input_tensor.size() 84 | # Average 
along each channel 85 | squeeze_tensor = input_tensor.view(batch_size, num_channels, -1).mean(dim=2) 86 | 87 | # channel excitation 88 | fc_out_1 = self.elu(self.fc1(squeeze_tensor)) 89 | fc_out_2 = self.sigmoid(self.fc2(fc_out_1)) 90 | 91 | a, b = squeeze_tensor.size() 92 | output_tensor = torch.mul(input_tensor, fc_out_2.view(a, b, 1, 1)) 93 | 94 | return output_tensor 95 | 96 | 97 | class SPPSELayer(nn.Module): 98 | def __init__(self, channel, reduction=16): 99 | super(SPPSELayer, self).__init__() 100 | self.avg_pool1 = nn.AdaptiveAvgPool2d(1) 101 | self.avg_pool2 = nn.AdaptiveAvgPool2d(2) 102 | self.avg_pool4 = nn.AdaptiveAvgPool2d(4) 103 | self.fc = nn.Sequential( 104 | nn.Linear(channel*21, channel*21 // reduction, bias=True), 105 | nn.ELU(inplace=True), 106 | nn.Linear(channel*21 // reduction, channel, bias=True), 107 | nn.Sigmoid() 108 | ) 109 | 110 | def forward(self, x): 111 | 112 | b, c, _, _ = x.size() 113 | y1 = self.avg_pool1(x).view(b, c) 114 | y2 = self.avg_pool2(x).view(b, 4 * c) 115 | y3 = self.avg_pool4(x).view(b, 16 * c) 116 | y = torch.cat((y1, y2, y3), 1) 117 | y = self.fc(y).view(b, c, 1, 1) 118 | 119 | return x * y.expand_as(x) 120 | 121 | 122 | class SpatialSELayer(nn.Module): 123 | 124 | def __init__(self, num_channels): 125 | 126 | super(SpatialSELayer, self).__init__() 127 | 128 | self.conv0 = nn.Conv2d(num_channels, num_channels // 16, 1) 129 | self.conv1 = nn.Conv2d(num_channels // 16, num_channels // 16, 3, dilation=1) 130 | self.conv2 = nn.Conv2d(num_channels // 16, num_channels // 16, 3, dilation=2) 131 | self.conv3 = nn.Conv2d(num_channels // 16, num_channels // 16, 3, dilation=3) 132 | self.conv4 = nn.Conv2d(num_channels // 16, num_channels // 16, 3, dilation=4) 133 | self.conv5 = nn.Conv2d((num_channels // 16), 1, 1) 134 | self.elu = nn.ELU(inplace=True) 135 | self.pad1 = nn.ReflectionPad2d(1) 136 | self.pad2 = nn.ReflectionPad2d(2) 137 | self.pad3 = nn.ReflectionPad2d(3) 138 | self.pad4 = nn.ReflectionPad2d(4) 139 | self.sigmoid = nn.Sigmoid() 140 | self.p1 = Parameter(torch.ones(1)) 141 | self.p2 = Parameter(torch.zeros(1)) 142 | self.p3 = Parameter(torch.zeros(1)) 143 | self.p4 = Parameter(torch.zeros(1)) 144 | 145 | 146 | def forward(self, input_tensor): 147 | 148 | batch_size, channel, a, b = input_tensor.size() 149 | 150 | out0 = self.conv0(input_tensor) 151 | 152 | out1 = self.pad1(out0) 153 | out1 = self.conv1(out1) 154 | out1 = self.elu(out1) 155 | att1 = self.conv5(out1) 156 | att1 = self.sigmoid(att1) 157 | 158 | out2 = torch.add(out0, out1) 159 | out2 = self.pad2(out2) 160 | out2 = self.conv2(out2) 161 | out2 = self.elu(out2) 162 | att2 = self.conv5(out2) 163 | att2 = self.sigmoid(att2) 164 | 165 | out3 = torch.add(out0, out2) 166 | out3 = self.pad3(out3) 167 | out3 = self.conv3(out3) 168 | out3 = self.elu(out3) 169 | att3 = self.conv5(out3) 170 | att3 = self.sigmoid(att3) 171 | 172 | out4 = torch.add(out0, out3) 173 | out4 = self.pad3(out4) 174 | out4 = self.conv3(out4) 175 | out4 = self.elu(out4) 176 | att4 = self.conv5(out4) 177 | att4 = self.sigmoid(att4) 178 | 179 | att1 = att1.view(batch_size, 1, a, b) 180 | att2 = att2.view(batch_size, 1, a, b) 181 | att3 = att3.view(batch_size, 1, a, b) 182 | att4 = att4.view(batch_size, 1, a, b) 183 | 184 | out1 = torch.mul(input_tensor, att1) 185 | out2 = torch.mul(input_tensor, att2) 186 | out3 = torch.mul(input_tensor, att3) 187 | out4 = torch.mul(input_tensor, att4) 188 | 189 | output_tensor = self.elu((self.p1 * out1 + self.p2 * out2 + self.p3 * out3 + self.p4 * out4)) 190 | # output_tensor = 
self.elu(out1 + out2 + out3 + out4) 191 | return output_tensor 192 | 193 | 194 | class ChannelSpatialSELayer(nn.Module): 195 | 196 | def __init__(self, num_channels, reduction_ratio=16): 197 | 198 | super(ChannelSpatialSELayer, self).__init__() 199 | self.cSE = SPPSELayer(num_channels, reduction_ratio) 200 | self.sSE = SpatialSELayer(num_channels) 201 | self.conv = nn.Conv2d(2 * num_channels, num_channels, 1) 202 | self.elu = nn.ELU() 203 | 204 | def forward(self, input_tensor): 205 | 206 | output_tensor = torch.cat((self.cSE(input_tensor), self.sSE(input_tensor)), dim=1) 207 | output_tensor = self.conv(output_tensor) 208 | output_tensor = torch.add(input_tensor, output_tensor) 209 | output_tensor = self.elu(output_tensor) 210 | 211 | return output_tensor 212 | 213 | 214 | -------------------------------------------------------------------------------- /networks/dares.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoImageProcessor, AutoModelForDepthEstimation, DepthAnythingForDepthEstimation 2 | import torch 3 | from torchvision import transforms 4 | import numpy as np 5 | from PIL import Image 6 | import requests 7 | import matplotlib.pyplot as plt 8 | import os 9 | import torch.nn as nn 10 | import math 11 | from torch.nn.parameter import Parameter 12 | 13 | class _LoRA_qkv(nn.Module): 14 | """In Dinov2 it is implemented as 15 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 16 | B, N, C = x.shape 17 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) 18 | q, k, v = qkv.unbind(0) 19 | """ 20 | 21 | def __init__( 22 | self, 23 | w: nn.Module, 24 | linear_a: nn.Module, 25 | linear_b: nn.Module 26 | ): 27 | super().__init__() 28 | self.w = w 29 | self.linear_a = linear_a 30 | self.linear_b = linear_b 31 | self.dim = w.in_features 32 | 33 | def forward(self, x): 34 | W = self.w(x) 35 | residual = W.clone() 36 | deltaW = self.linear_b(self.linear_a(x)) 37 | 38 | W += deltaW 39 | return W 40 | 41 | class DepthAnythingDepthEstimationHead(nn.Module): 42 | 43 | def __init__(self, model_head): 44 | super().__init__() 45 | self.conv1 = model_head.conv1 46 | self.conv2 = model_head.conv2 47 | self.activation1 = nn.ReLU() 48 | self.conv3 = model_head.conv3 49 | self.activation2 = nn.Sigmoid() 50 | 51 | def forward(self, hidden_states, height, width): 52 | predicted_depth = self.conv1(hidden_states) 53 | predicted_depth = nn.functional.interpolate( 54 | predicted_depth, 55 | (int(height), int(width)), 56 | mode="bilinear", 57 | align_corners=True, 58 | ) 59 | predicted_depth = self.conv2(predicted_depth) 60 | predicted_depth = self.activation1(predicted_depth) 61 | predicted_depth = self.conv3(predicted_depth) 62 | predicted_depth = self.activation2(predicted_depth) 63 | return predicted_depth 64 | 65 | class LoRAInitializer: 66 | def __init__(self, model, r=[14,14,12,12,10,10,8,8,8,8,8,8], lora=['q', 'v']): 67 | self.model = model 68 | self.r = r 69 | self.lora = lora 70 | self.w_As = [] 71 | self.w_Bs = [] 72 | self.initialize_lora() 73 | 74 | def initialize_lora(self): 75 | for param in self.model.backbone.parameters(): 76 | param.requires_grad = False 77 | 78 | for t_layer_i, blk in enumerate(self.model.backbone.encoder.layer): 79 | dim = blk.attention.attention.query.in_features 80 | 81 | if 'q' in self.lora: 82 | w_q = blk.attention.attention.query 83 | w_a_linear_q = nn.Linear(dim, self.r[t_layer_i], bias=False) 84 | w_b_linear_q = nn.Linear(self.r[t_layer_i], dim, bias=False) 85 | 
self.w_As.append(w_a_linear_q) 86 | self.w_Bs.append(w_b_linear_q) 87 | blk.attention.attention.query = _LoRA_qkv(w_q, w_a_linear_q, w_b_linear_q) 88 | 89 | if 'v' in self.lora: 90 | w_v = blk.attention.attention.value 91 | w_a_linear_v = nn.Linear(dim, self.r[t_layer_i], bias=False) 92 | w_b_linear_v = nn.Linear(self.r[t_layer_i], dim, bias=False) 93 | self.w_As.append(w_a_linear_v) 94 | self.w_Bs.append(w_b_linear_v) 95 | blk.attention.attention.value = _LoRA_qkv(w_v, w_a_linear_v, w_b_linear_v) 96 | 97 | if 'k' in self.lora: 98 | w_k = blk.attention.attention.key 99 | w_a_linear_k = nn.Linear(dim, self.r[t_layer_i], bias=False) 100 | w_b_linear_k = nn.Linear(self.r[t_layer_i], dim, bias=False) 101 | self.w_As.append(w_a_linear_k) 102 | self.w_Bs.append(w_b_linear_k) 103 | blk.attention.attention.key = _LoRA_qkv(w_k, w_a_linear_k, w_b_linear_k) 104 | 105 | self.reset_parameters() 106 | print("LoRA params initialized!") 107 | 108 | def reset_parameters(self): 109 | for w_A in self.w_As: 110 | nn.init.kaiming_uniform_(w_A.weight, a=math.sqrt(5)) 111 | for w_B in self.w_Bs: 112 | nn.init.zeros_(w_B.weight) 113 | 114 | 115 | class DARES(nn.Module): 116 | def __init__(self, r = [14,14,12,12,10,10,8,8,8,8,8,8], lora = ['q', 'v']): 117 | super(DARES, self).__init__() 118 | model = DepthAnythingForDepthEstimation.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf") 119 | self.r = r 120 | self.lora = lora 121 | self.config = model.config 122 | self.backbone = model.backbone 123 | 124 | # Initialize LoRA parameters 125 | self.lora_initializer = LoRAInitializer(model, r, lora) 126 | 127 | self.neck = model.neck 128 | model_head = model.head 129 | self.head = DepthAnythingDepthEstimationHead(model_head) 130 | model.post_init() 131 | 132 | def save_parameters(self, filename: str) -> None: 133 | r"""Only safetensors is supported now. 134 | 135 | pip install safetensor if you do not have one installed yet. 136 | 137 | save both lora and fc parameters. 138 | """ 139 | 140 | assert filename.endswith(".pt") or filename.endswith('.pth') 141 | 142 | num_layer = len(self.w_As) # actually, it is half 143 | a_tensors = {f"w_a_{i:03d}": self.w_As[i].weight for i in range(num_layer)} 144 | b_tensors = {f"w_b_{i:03d}": self.w_Bs[i].weight for i in range(num_layer)} 145 | decode_head_tensors = {} 146 | 147 | # save prompt encoder, only `state_dict`, the `named_parameter` is not permitted 148 | if isinstance(self.decode_head, torch.nn.DataParallel) or isinstance(self.decode_head, torch.nn.parallel.DistributedDataParallel): 149 | state_dict = self.decode_head.module.state_dict() 150 | else: 151 | state_dict = self.decode_head.state_dict() 152 | for key, value in state_dict.items(): 153 | decode_head_tensors[key] = value 154 | 155 | merged_dict = {**a_tensors, **b_tensors, **decode_head_tensors} 156 | torch.save(merged_dict, filename) 157 | 158 | print('saved lora parameters to %s.' % filename) 159 | 160 | def load_parameters(self, filename: str, device: str) -> None: 161 | r"""Only safetensors is supported now. 162 | 163 | pip install safetensor if you do not have one installed yet.\ 164 | 165 | load both lora and fc parameters. 
166 | """ 167 | 168 | assert filename.endswith(".pt") or filename.endswith('.pth') 169 | 170 | state_dict = torch.load(filename, map_location=device) 171 | 172 | for i, w_A_linear in enumerate(self.w_As): 173 | saved_key = f"w_a_{i:03d}" 174 | saved_tensor = state_dict[saved_key] 175 | w_A_linear.weight = Parameter(saved_tensor) 176 | 177 | for i, w_B_linear in enumerate(self.w_Bs): 178 | saved_key = f"w_b_{i:03d}" 179 | saved_tensor = state_dict[saved_key] 180 | w_B_linear.weight = Parameter(saved_tensor) 181 | 182 | decode_head_dict = self.decode_head.state_dict() 183 | decode_head_keys = decode_head_dict.keys() 184 | 185 | # load decode head 186 | decode_head_keys = [k for k in decode_head_keys] 187 | decode_head_values = [state_dict[k] for k in decode_head_keys] 188 | decode_head_new_state_dict = {k: v for k, v in zip(decode_head_keys, decode_head_values)} 189 | decode_head_dict.update(decode_head_new_state_dict) 190 | 191 | self.decode_head.load_state_dict(decode_head_dict) 192 | 193 | print('loaded lora parameters from %s.' % filename) 194 | 195 | def forward(self, pixel_values): 196 | outputs = self.backbone.forward_with_filtered_kwargs( 197 | pixel_values, output_hidden_states=None, output_attentions=None 198 | ) 199 | hidden_states = outputs.feature_maps 200 | _, _, height, width = pixel_values.shape 201 | patch_size = self.config.patch_size 202 | patch_height = height // patch_size 203 | patch_width = width // patch_size 204 | hidden_states = self.neck(hidden_states, patch_height, patch_width) 205 | outputs = {} 206 | outputs[("disp", 0)] = self.head(hidden_states[3], height, width) 207 | outputs[("disp", 1)] = self.head(hidden_states[2], height/2, width/2) 208 | outputs[("disp", 2)] = self.head(hidden_states[1], height/4, width/4) 209 | outputs[("disp", 3)] = self.head(hidden_states[0], height/8, width/8) 210 | return outputs 211 | -------------------------------------------------------------------------------- /evaluate_depth.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import cv2 5 | import numpy as np 6 | 7 | import torch 8 | from torch.utils.data import DataLoader 9 | 10 | from layers import disp_to_depth 11 | from utils import readlines 12 | from options import MonodepthOptions 13 | import datasets 14 | import networks 15 | 16 | cv2.setNumThreads(0) # This speeds up evaluation 5x on our unix systems (OpenCV 3.3.1) 17 | 18 | 19 | splits_dir = os.path.join(os.path.dirname(__file__), "splits") 20 | 21 | # Models which were trained with stereo supervision were trained with a nominal 22 | # baseline of 0.1 units. The KITTI rig has a baseline of 54cm. Therefore, 23 | # to convert our stereo predictions to real-world scale we multiply our depths by 5.4. 
24 | STEREO_SCALE_FACTOR = 5.4 25 | 26 | 27 | def compute_errors(gt, pred): 28 | """Computation of error metrics between predicted and ground truth depths 29 | """ 30 | thresh = np.maximum((gt / pred), (pred / gt)) 31 | a1 = (thresh < 1.25 ).mean() 32 | a2 = (thresh < 1.25 ** 2).mean() 33 | a3 = (thresh < 1.25 ** 3).mean() 34 | 35 | rmse = (gt - pred) ** 2 36 | rmse = np.sqrt(rmse.mean()) 37 | 38 | rmse_log = (np.log(gt) - np.log(pred)) ** 2 39 | rmse_log = np.sqrt(rmse_log.mean()) 40 | 41 | abs_rel = np.mean(np.abs(gt - pred) / gt) 42 | 43 | sq_rel = np.mean(((gt - pred) ** 2) / gt) 44 | 45 | return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 46 | 47 | 48 | def batch_post_process_disparity(l_disp, r_disp): 49 | """Apply the disparity post-processing method as introduced in Monodepthv1 50 | """ 51 | _, h, w = l_disp.shape 52 | m_disp = 0.5 * (l_disp + r_disp) 53 | l, _ = np.meshgrid(np.linspace(0, 1, w), np.linspace(0, 1, h)) 54 | l_mask = (1.0 - np.clip(20 * (l - 0.05), 0, 1))[None, ...] 55 | r_mask = l_mask[:, :, ::-1] 56 | return r_mask * l_disp + l_mask * r_disp + (1.0 - l_mask - r_mask) * m_disp 57 | 58 | 59 | def evaluate(opt): 60 | """Evaluates a pretrained model using a specified test set 61 | """ 62 | MIN_DEPTH = 1e-3 63 | MAX_DEPTH = 150 64 | 65 | assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \ 66 | "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo" 67 | 68 | if opt.ext_disp_to_eval is None: 69 | 70 | opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) 71 | 72 | assert os.path.isdir(opt.load_weights_folder), \ 73 | "Cannot find a folder at {}".format(opt.load_weights_folder) 74 | 75 | print("-> Loading weights from {}".format(opt.load_weights_folder)) 76 | 77 | filenames = readlines(os.path.join(splits_dir, opt.eval_split, "test_files.txt")) 78 | 79 | depth_model_path = os.path.join(opt.load_weights_folder, "depth_model.pth") 80 | 81 | depth_model_dict = torch.load(depth_model_path) 82 | 83 | dataset = datasets.SCAREDRAWDataset(opt.data_path, filenames, 84 | 256,320, 85 | [0], 4, is_train=False) 86 | dataloader = DataLoader(dataset, 16, shuffle=False, num_workers=opt.num_workers, 87 | pin_memory=True, drop_last=False) 88 | depth_model = networks.DARES() 89 | 90 | model_dict = depth_model.state_dict() 91 | 92 | depth_model.load_state_dict({k: v for k, v in depth_model_dict.items() if k in model_dict}) 93 | depth_model.cuda() 94 | depth_model.eval() 95 | 96 | pred_disps = [] 97 | 98 | print("-> Computing predictions with size {}x{}".format( 99 | 320, 256)) 100 | 101 | with torch.no_grad(): 102 | for data in dataloader: 103 | input_color = data[("color", 0, 0)].cuda() 104 | 105 | if opt.post_process: 106 | # Post-processed results require each image to have two forward passes 107 | input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0) 108 | 109 | output = depth_model(input_color) 110 | pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) 111 | pred_disp = pred_disp.cpu()[:, 0].numpy() 112 | 113 | if opt.post_process: 114 | N = pred_disp.shape[0] // 2 115 | pred_disp = batch_post_process_disparity(pred_disp[:N], pred_disp[N:, :, ::-1]) 116 | 117 | pred_disps.append(pred_disp) 118 | 119 | pred_disps = np.concatenate(pred_disps) 120 | 121 | else: 122 | # Load predictions from file 123 | print("-> Loading predictions from {}".format(opt.ext_disp_to_eval)) 124 | pred_disps = np.load(opt.ext_disp_to_eval) 125 | 126 | if opt.eval_eigen_to_benchmark: 127 | eigen_to_benchmark_ids = 
np.load( 128 | os.path.join(splits_dir, "benchmark", "eigen_to_benchmark_ids.npy")) 129 | 130 | pred_disps = pred_disps[eigen_to_benchmark_ids] 131 | 132 | if opt.save_pred_disps: 133 | output_path = os.path.join( 134 | opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split)) 135 | print("-> Saving predicted disparities to ", output_path) 136 | np.save(output_path, pred_disps) 137 | 138 | if opt.no_eval: 139 | print("-> Evaluation disabled. Done.") 140 | quit() 141 | 142 | elif opt.eval_split == 'benchmark': 143 | save_dir = os.path.join(opt.load_weights_folder, "benchmark_predictions") 144 | print("-> Saving out benchmark predictions to {}".format(save_dir)) 145 | if not os.path.exists(save_dir): 146 | os.makedirs(save_dir) 147 | 148 | for idx in range(len(pred_disps)): 149 | disp_resized = cv2.resize(pred_disps[idx], (1216, 352)) 150 | depth = STEREO_SCALE_FACTOR / disp_resized 151 | depth = np.clip(depth, 0, 80) 152 | depth = np.uint16(depth * 256) 153 | save_path = os.path.join(save_dir, "{:010d}.png".format(idx)) 154 | cv2.imwrite(save_path, depth) 155 | 156 | print("-> No ground truth is available for the KITTI benchmark, so not evaluating. Done.") 157 | quit() 158 | 159 | gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz") 160 | 161 | gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1')["data"] 162 | 163 | print("-> Evaluating") 164 | 165 | if opt.eval_stereo: 166 | print(" Stereo evaluation - " 167 | "disabling median scaling, scaling by {}".format(STEREO_SCALE_FACTOR)) 168 | opt.disable_median_scaling = True 169 | opt.pred_depth_scale_factor = STEREO_SCALE_FACTOR 170 | else: 171 | print(" Mono evaluation - using median scaling") 172 | 173 | errors = [] 174 | ratios = [] 175 | 176 | for i in range(pred_disps.shape[0]): 177 | 178 | gt_depth = gt_depths[i] 179 | gt_height, gt_width = gt_depth.shape[:2] 180 | 181 | pred_disp = pred_disps[i] 182 | pred_disp = cv2.resize(pred_disp, (gt_width, gt_height)) 183 | pred_depth = 1/pred_disp 184 | 185 | if opt.eval_split == "eigen": 186 | mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH) 187 | 188 | crop = np.array([0.40810811 * gt_height, 0.99189189 * gt_height, 189 | 0.03594771 * gt_width, 0.96405229 * gt_width]).astype(np.int32) 190 | crop_mask = np.zeros(mask.shape) 191 | crop_mask[crop[0]:crop[1], crop[2]:crop[3]] = 1 192 | mask = np.logical_and(mask, crop_mask) 193 | 194 | else: 195 | mask = np.logical_and(gt_depth > MIN_DEPTH, gt_depth < MAX_DEPTH) 196 | 197 | pred_depth = pred_depth[mask] 198 | gt_depth = gt_depth[mask] 199 | 200 | pred_depth *= opt.pred_depth_scale_factor 201 | if not opt.disable_median_scaling: 202 | ratio = np.median(gt_depth) / np.median(pred_depth) 203 | ratios.append(ratio) 204 | pred_depth *= ratio 205 | 206 | pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH 207 | pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH 208 | 209 | errors.append(compute_errors(gt_depth, pred_depth)) 210 | 211 | if not opt.disable_median_scaling: 212 | ratios = np.array(ratios) 213 | med = np.median(ratios) 214 | print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med))) 215 | 216 | mean_errors = np.array(errors).mean(0) 217 | 218 | print("\n " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3")) 219 | print(("&{: 8.3f} " * 7).format(*mean_errors.tolist()) + "\\\\") 220 | print("\n-> Done!") 221 | 222 | 223 | if __name__ == "__main__": 224 | options = MonodepthOptions() 225 | evaluate(options.parse()) 226 | 
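
(Illustrative aside, not part of the repository code.) evaluate() above masks each ground-truth depth map to the [MIN_DEPTH, MAX_DEPTH] range, optionally aligns the scale-ambiguous monocular prediction to the ground truth by the ratio of medians, clips the result, and scores it with compute_errors. The snippet below walks through that sequence on randomly generated stand-in arrays (no SCARED data is used); the metric definitions are copied from compute_errors above, and all array values are hypothetical.

# Toy walk-through of the median-scaling + metric computation used in evaluate().
import numpy as np

def compute_errors(gt, pred):
    # same definitions as compute_errors in evaluate_depth.py
    thresh = np.maximum(gt / pred, pred / gt)
    a1, a2, a3 = [(thresh < 1.25 ** k).mean() for k in (1, 2, 3)]
    rmse = np.sqrt(((gt - pred) ** 2).mean())
    rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())
    abs_rel = np.mean(np.abs(gt - pred) / gt)
    sq_rel = np.mean(((gt - pred) ** 2) / gt)
    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3

MIN_DEPTH, MAX_DEPTH = 1e-3, 150
rng = np.random.default_rng(0)
gt = rng.uniform(20.0, 120.0, size=(256, 320))            # stand-in ground-truth depth map
pred = 0.01 * gt * rng.uniform(0.9, 1.1, size=gt.shape)   # prediction with unknown global scale

mask = np.logical_and(gt > MIN_DEPTH, gt < MAX_DEPTH)     # keep only valid ground-truth pixels
gt_m, pred_m = gt[mask], pred[mask]

ratio = np.median(gt_m) / np.median(pred_m)               # monocular depth is scale-ambiguous,
pred_m = np.clip(pred_m * ratio, MIN_DEPTH, MAX_DEPTH)    # so align medians, then clip

print("median scaling ratio:", round(float(ratio), 2))
print(compute_errors(gt_m, pred_m))                       # abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3
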
-------------------------------------------------------------------------------- /evaluate_3d_reconstruction.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import cv2 5 | import numpy as np 6 | from tqdm import tqdm 7 | import time 8 | 9 | import torch 10 | from torch.utils.data import DataLoader 11 | from PIL import Image 12 | import matplotlib 13 | import matplotlib.pyplot as plt 14 | import open3d as o3d 15 | 16 | from layers import disp_to_depth 17 | from utils import readlines 18 | from options import MonodepthOptions 19 | import datasets 20 | import networks 21 | 22 | cv2.setNumThreads(0) # This speeds up evaluation 5x on our unix systems (OpenCV 3.3.1) 23 | 24 | 25 | splits_dir = os.path.join(os.path.dirname(__file__), "splits") 26 | 27 | # Models which were trained with stereo supervision were trained with a nominal 28 | # baseline of 0.1 units. The KITTI rig has a baseline of 54cm. Therefore, 29 | # to convert our stereo predictions to real-world scale we multiply our depths by 5.4. 30 | STEREO_SCALE_FACTOR = 5.4 31 | 32 | def render_depth(values, colormap_name="magma_r") -> Image: 33 | min_value, max_value = values.min(), values.max() 34 | normalized_values = (values - min_value) / (max_value - min_value) 35 | 36 | colormap = matplotlib.colormaps[colormap_name] 37 | colors = colormap(normalized_values, bytes=True) # ((1)xhxwx4) 38 | colors = colors[:, :, :3] # Discard alpha component 39 | return Image.fromarray(colors) 40 | 41 | def compute_scale(gt, pred,min,max): 42 | mask = np.logical_and(gt > min, gt < max) 43 | pred = pred[mask] 44 | gt = gt[mask] 45 | scale = np.median(gt) / np.median(pred) 46 | return scale 47 | 48 | def reconstruct_pointcloud(rgb, depth, cam_K, vis_rgbd=False): 49 | 50 | rgb = np.asarray(rgb, order="C") 51 | rgb_im = o3d.geometry.Image(rgb.astype(np.uint8)) 52 | depth_im = o3d.geometry.Image(depth) 53 | 54 | rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth(rgb_im, depth_im, convert_rgb_to_intensity=False) 55 | if vis_rgbd: 56 | plt.subplot(1, 2, 1) 57 | plt.title('RGB image') 58 | plt.imshow(rgbd_image.color) 59 | plt.subplot(1, 2, 2) 60 | plt.title('Depth image') 61 | plt.imshow(rgbd_image.depth) 62 | plt.colorbar() 63 | plt.show() 64 | 65 | cam = o3d.camera.PinholeCameraIntrinsic() 66 | cam.intrinsic_matrix = cam_K 67 | 68 | pcd = o3d.geometry.PointCloud.create_from_rgbd_image( 69 | rgbd_image, 70 | cam 71 | ) 72 | 73 | return pcd 74 | def evaluate(opt): 75 | """Evaluates a pretrained model using a specified test set 76 | """ 77 | MIN_DEPTH = 1e-3 78 | MAX_DEPTH = 150 79 | 80 | assert sum((opt.eval_mono, opt.eval_stereo)) == 1, \ 81 | "Please choose mono or stereo evaluation by setting either --eval_mono or --eval_stereo" 82 | 83 | opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) 84 | 85 | assert os.path.isdir(opt.load_weights_folder), \ 86 | "Cannot find a folder at {}".format(opt.load_weights_folder) 87 | 88 | print("-> Loading weights from {}".format(opt.load_weights_folder)) 89 | 90 | model_path = os.path.join(opt.load_weights_folder, "depth_model.pth") 91 | 92 | weight_dict = torch.load(model_path) 93 | 94 | if opt.eval_split == 'endovis': 95 | filenames = readlines(os.path.join(splits_dir, opt.eval_split, "3d_reconstruction.txt")) 96 | dataset = datasets.SCAREDRAWDataset(opt.data_path, filenames, 97 | 256, 320, 98 | [0], 4, is_train=False) 99 | 100 | save_dir = os.path.join(splits_dir, 
opt.eval_split, "reconstruction") 101 | os.makedirs(save_dir, exist_ok=True) 102 | 103 | dataloader = DataLoader(dataset, 1, shuffle=False, num_workers=opt.num_workers, 104 | pin_memory=True, drop_last=False) 105 | model = networks.DARES() 106 | 107 | model_dict = model.state_dict() 108 | model.load_state_dict({k: v for k, v in weight_dict.items() if k in model_dict}) 109 | 110 | model.cuda() 111 | model.eval() 112 | 113 | 114 | rgbs = [] 115 | pred_disps = [] 116 | cam_Ks = [] 117 | inference_times = [] 118 | sequences = [] 119 | keyframes = [] 120 | frame_ids = [] 121 | 122 | print("-> Computing predictions with size 256x320") 123 | 124 | with torch.no_grad(): 125 | for data in tqdm(dataloader): 126 | input_color = data[("color", 0, 0)].cuda() 127 | if opt.post_process: 128 | # Post-processed results require each image to have two forward passes 129 | input_color = torch.cat((input_color, torch.flip(input_color, [3])), 0) 130 | 131 | time_start = time.time() 132 | output = model(input_color) 133 | inference_time = time.time() - time_start 134 | pred_disp, _ = disp_to_depth(output[("disp", 0)], opt.min_depth, opt.max_depth) 135 | pred_disp = pred_disp.cpu()[:, 0].numpy() 136 | 137 | rgbs.append(input_color) 138 | pred_disps.append(pred_disp) 139 | cam_Ks.append(data[("K", 0)]) 140 | inference_times.append(inference_time) 141 | sequences.append(data['sequence']) 142 | keyframes.append(data['keyframe']) 143 | frame_ids.append(data['frame_id']) 144 | 145 | 146 | pred_disps = np.concatenate(pred_disps) 147 | 148 | if opt.save_pred_disps: 149 | output_path = os.path.join( 150 | opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split)) 151 | print("-> Saving predicted disparities to ", output_path) 152 | np.save(output_path, pred_disps) 153 | 154 | elif opt.eval_split == 'benchmark': 155 | save_dir = os.path.join(opt.load_weights_folder, "benchmark_predictions") 156 | print("-> Saving out benchmark predictions to {}".format(save_dir)) 157 | if not os.path.exists(save_dir): 158 | os.makedirs(save_dir) 159 | 160 | for idx in range(len(pred_disps)): 161 | disp_resized = cv2.resize(pred_disps[idx], (1216, 352)) 162 | depth = STEREO_SCALE_FACTOR / disp_resized 163 | depth = np.clip(depth, 0, 80) 164 | depth = np.uint16(depth * 256) 165 | save_path = os.path.join(save_dir, "{:010d}.png".format(idx)) 166 | cv2.imwrite(save_path, depth) 167 | 168 | print("-> No ground truth is available for the KITTI benchmark, so not evaluating. 
Done.") 169 | quit() 170 | elif opt.eval_split == 'endovis': 171 | gt_path = os.path.join(splits_dir, opt.eval_split, "gt_depths.npz") 172 | gt_depths = np.load(gt_path, fix_imports=True, encoding='latin1')["data"] 173 | 174 | 175 | if opt.visualize_depth: 176 | vis_dir = os.path.join(opt.load_weights_folder, "vis_depth") 177 | os.makedirs(vis_dir, exist_ok=True) 178 | 179 | print("-> Reconstructing") 180 | 181 | pcds = [] 182 | for i in tqdm(range(pred_disps.shape[0])): 183 | 184 | sequence = str(np.array(sequences[i][0])) 185 | keyframe = str(np.array(keyframes[i][0])) 186 | frame_id = "{:06d}".format(frame_ids[i][0]) 187 | 188 | pred_disp = pred_disps[i] 189 | pred_depth = 1/pred_disp 190 | pred_height, pred_width = pred_depth.shape[:2] 191 | 192 | gt_depth = gt_depths[i] 193 | gt_depth = cv2.resize(gt_depth, (pred_width, pred_height), interpolation=cv2.INTER_NEAREST) 194 | 195 | rgb = rgbs[i].squeeze().permute(1,2,0).cpu().numpy() * 255 196 | cam_K = cam_Ks[i][0,:3,:3].numpy() 197 | if opt.visualize_depth: 198 | vis_pred_depth = render_depth(pred_depth) 199 | vis_file_name = os.path.join(vis_dir, sequence + "_" + keyframe + "_" + frame_id + ".png") 200 | print(vis_file_name) 201 | vis_pred_depth.save(vis_file_name) 202 | 203 | scale = compute_scale(gt_depth, pred_depth, MIN_DEPTH ,MAX_DEPTH) 204 | pred_depth *= scale 205 | print(rgb.max(), rgb.min(), pred_depth.max(), pred_depth.min(),scale) 206 | pred_depth[pred_depth < MIN_DEPTH] = MIN_DEPTH 207 | pred_depth[pred_depth > MAX_DEPTH] = MAX_DEPTH 208 | pcd = reconstruct_pointcloud(rgb, pred_depth, cam_K, vis_rgbd=False) 209 | # o3d.visualization.draw_geometries([pcd]) 210 | pcds.append(pcd) 211 | print('Saving point clouds...') 212 | for i, pcd in enumerate(pcds): 213 | sequence = str(np.array(sequences[i][0])) 214 | keyframe = str(np.array(keyframes[i][0])) 215 | frame_id = "{:06d}".format(frame_ids[i][0]) 216 | fn = os.path.join(save_dir, sequence + "_" + keyframe + "_" + frame_id + ".ply") 217 | o3d.io.write_point_cloud(fn, pcd) 218 | 219 | print('Point clouds saved to', save_dir) 220 | 221 | 222 | if __name__ == "__main__": 223 | options = MonodepthOptions() 224 | evaluate(options.parse()) 225 | -------------------------------------------------------------------------------- /splits/endovis/test_files_sequence1.txt: -------------------------------------------------------------------------------- 1 | dataset5/keyframe4 1 l 2 | dataset5/keyframe4 2 l 3 | dataset5/keyframe4 3 l 4 | dataset5/keyframe4 4 l 5 | dataset5/keyframe4 5 l 6 | dataset5/keyframe4 6 l 7 | dataset5/keyframe4 7 l 8 | dataset5/keyframe4 8 l 9 | dataset5/keyframe4 9 l 10 | dataset5/keyframe4 10 l 11 | dataset5/keyframe4 11 l 12 | dataset5/keyframe4 12 l 13 | dataset5/keyframe4 13 l 14 | dataset5/keyframe4 14 l 15 | dataset5/keyframe4 15 l 16 | dataset5/keyframe4 16 l 17 | dataset5/keyframe4 17 l 18 | dataset5/keyframe4 18 l 19 | dataset5/keyframe4 19 l 20 | dataset5/keyframe4 20 l 21 | dataset5/keyframe4 21 l 22 | dataset5/keyframe4 22 l 23 | dataset5/keyframe4 23 l 24 | dataset5/keyframe4 24 l 25 | dataset5/keyframe4 25 l 26 | dataset5/keyframe4 26 l 27 | dataset5/keyframe4 27 l 28 | dataset5/keyframe4 28 l 29 | dataset5/keyframe4 29 l 30 | dataset5/keyframe4 30 l 31 | dataset5/keyframe4 31 l 32 | dataset5/keyframe4 32 l 33 | dataset5/keyframe4 33 l 34 | dataset5/keyframe4 34 l 35 | dataset5/keyframe4 35 l 36 | dataset5/keyframe4 36 l 37 | dataset5/keyframe4 37 l 38 | dataset5/keyframe4 38 l 39 | dataset5/keyframe4 39 l 40 | dataset5/keyframe4 40 l 41 | 
dataset5/keyframe4 41 l 42 | dataset5/keyframe4 42 l 43 | dataset5/keyframe4 43 l 44 | dataset5/keyframe4 44 l 45 | dataset5/keyframe4 45 l 46 | dataset5/keyframe4 46 l 47 | dataset5/keyframe4 47 l 48 | dataset5/keyframe4 48 l 49 | dataset5/keyframe4 49 l 50 | dataset5/keyframe4 50 l 51 | dataset5/keyframe4 51 l 52 | dataset5/keyframe4 52 l 53 | dataset5/keyframe4 53 l 54 | dataset5/keyframe4 54 l 55 | dataset5/keyframe4 55 l 56 | dataset5/keyframe4 56 l 57 | dataset5/keyframe4 57 l 58 | dataset5/keyframe4 58 l 59 | dataset5/keyframe4 59 l 60 | dataset5/keyframe4 60 l 61 | dataset5/keyframe4 61 l 62 | dataset5/keyframe4 62 l 63 | dataset5/keyframe4 63 l 64 | dataset5/keyframe4 64 l 65 | dataset5/keyframe4 65 l 66 | dataset5/keyframe4 66 l 67 | dataset5/keyframe4 67 l 68 | dataset5/keyframe4 68 l 69 | dataset5/keyframe4 69 l 70 | dataset5/keyframe4 70 l 71 | dataset5/keyframe4 71 l 72 | dataset5/keyframe4 72 l 73 | dataset5/keyframe4 73 l 74 | dataset5/keyframe4 74 l 75 | dataset5/keyframe4 75 l 76 | dataset5/keyframe4 76 l 77 | dataset5/keyframe4 77 l 78 | dataset5/keyframe4 78 l 79 | dataset5/keyframe4 79 l 80 | dataset5/keyframe4 80 l 81 | dataset5/keyframe4 81 l 82 | dataset5/keyframe4 82 l 83 | dataset5/keyframe4 83 l 84 | dataset5/keyframe4 84 l 85 | dataset5/keyframe4 85 l 86 | dataset5/keyframe4 86 l 87 | dataset5/keyframe4 87 l 88 | dataset5/keyframe4 88 l 89 | dataset5/keyframe4 89 l 90 | dataset5/keyframe4 90 l 91 | dataset5/keyframe4 91 l 92 | dataset5/keyframe4 92 l 93 | dataset5/keyframe4 93 l 94 | dataset5/keyframe4 94 l 95 | dataset5/keyframe4 95 l 96 | dataset5/keyframe4 96 l 97 | dataset5/keyframe4 97 l 98 | dataset5/keyframe4 98 l 99 | dataset5/keyframe4 99 l 100 | dataset5/keyframe4 100 l 101 | dataset5/keyframe4 101 l 102 | dataset5/keyframe4 102 l 103 | dataset5/keyframe4 103 l 104 | dataset5/keyframe4 104 l 105 | dataset5/keyframe4 105 l 106 | dataset5/keyframe4 106 l 107 | dataset5/keyframe4 107 l 108 | dataset5/keyframe4 108 l 109 | dataset5/keyframe4 109 l 110 | dataset5/keyframe4 110 l 111 | dataset5/keyframe4 111 l 112 | dataset5/keyframe4 112 l 113 | dataset5/keyframe4 113 l 114 | dataset5/keyframe4 114 l 115 | dataset5/keyframe4 115 l 116 | dataset5/keyframe4 116 l 117 | dataset5/keyframe4 117 l 118 | dataset5/keyframe4 118 l 119 | dataset5/keyframe4 119 l 120 | dataset5/keyframe4 120 l 121 | dataset5/keyframe4 121 l 122 | dataset5/keyframe4 122 l 123 | dataset5/keyframe4 123 l 124 | dataset5/keyframe4 124 l 125 | dataset5/keyframe4 125 l 126 | dataset5/keyframe4 126 l 127 | dataset5/keyframe4 127 l 128 | dataset5/keyframe4 128 l 129 | dataset5/keyframe4 129 l 130 | dataset5/keyframe4 130 l 131 | dataset5/keyframe4 131 l 132 | dataset5/keyframe4 132 l 133 | dataset5/keyframe4 133 l 134 | dataset5/keyframe4 134 l 135 | dataset5/keyframe4 135 l 136 | dataset5/keyframe4 136 l 137 | dataset5/keyframe4 137 l 138 | dataset5/keyframe4 138 l 139 | dataset5/keyframe4 139 l 140 | dataset5/keyframe4 140 l 141 | dataset5/keyframe4 141 l 142 | dataset5/keyframe4 142 l 143 | dataset5/keyframe4 143 l 144 | dataset5/keyframe4 144 l 145 | dataset5/keyframe4 145 l 146 | dataset5/keyframe4 146 l 147 | dataset5/keyframe4 147 l 148 | dataset5/keyframe4 148 l 149 | dataset5/keyframe4 149 l 150 | dataset5/keyframe4 150 l 151 | dataset5/keyframe4 151 l 152 | dataset5/keyframe4 152 l 153 | dataset5/keyframe4 153 l 154 | dataset5/keyframe4 154 l 155 | dataset5/keyframe4 155 l 156 | dataset5/keyframe4 156 l 157 | dataset5/keyframe4 157 l 158 | dataset5/keyframe4 158 l 159 | 
dataset5/keyframe4 159 l 160 | dataset5/keyframe4 160 l 161 | dataset5/keyframe4 161 l 162 | dataset5/keyframe4 162 l 163 | dataset5/keyframe4 163 l 164 | dataset5/keyframe4 164 l 165 | dataset5/keyframe4 165 l 166 | dataset5/keyframe4 166 l 167 | dataset5/keyframe4 167 l 168 | dataset5/keyframe4 168 l 169 | dataset5/keyframe4 169 l 170 | dataset5/keyframe4 170 l 171 | dataset5/keyframe4 171 l 172 | dataset5/keyframe4 172 l 173 | dataset5/keyframe4 173 l 174 | dataset5/keyframe4 174 l 175 | dataset5/keyframe4 175 l 176 | dataset5/keyframe4 176 l 177 | dataset5/keyframe4 177 l 178 | dataset5/keyframe4 178 l 179 | dataset5/keyframe4 179 l 180 | dataset5/keyframe4 180 l 181 | dataset5/keyframe4 181 l 182 | dataset5/keyframe4 182 l 183 | dataset5/keyframe4 183 l 184 | dataset5/keyframe4 184 l 185 | dataset5/keyframe4 185 l 186 | dataset5/keyframe4 186 l 187 | dataset5/keyframe4 187 l 188 | dataset5/keyframe4 188 l 189 | dataset5/keyframe4 189 l 190 | dataset5/keyframe4 190 l 191 | dataset5/keyframe4 191 l 192 | dataset5/keyframe4 192 l 193 | dataset5/keyframe4 193 l 194 | dataset5/keyframe4 194 l 195 | dataset5/keyframe4 195 l 196 | dataset5/keyframe4 196 l 197 | dataset5/keyframe4 197 l 198 | dataset5/keyframe4 198 l 199 | dataset5/keyframe4 199 l 200 | dataset5/keyframe4 200 l 201 | dataset5/keyframe4 201 l 202 | dataset5/keyframe4 202 l 203 | dataset5/keyframe4 203 l 204 | dataset5/keyframe4 204 l 205 | dataset5/keyframe4 205 l 206 | dataset5/keyframe4 206 l 207 | dataset5/keyframe4 207 l 208 | dataset5/keyframe4 208 l 209 | dataset5/keyframe4 209 l 210 | dataset5/keyframe4 210 l 211 | dataset5/keyframe4 211 l 212 | dataset5/keyframe4 212 l 213 | dataset5/keyframe4 213 l 214 | dataset5/keyframe4 214 l 215 | dataset5/keyframe4 215 l 216 | dataset5/keyframe4 216 l 217 | dataset5/keyframe4 217 l 218 | dataset5/keyframe4 218 l 219 | dataset5/keyframe4 219 l 220 | dataset5/keyframe4 220 l 221 | dataset5/keyframe4 221 l 222 | dataset5/keyframe4 222 l 223 | dataset5/keyframe4 223 l 224 | dataset5/keyframe4 224 l 225 | dataset5/keyframe4 225 l 226 | dataset5/keyframe4 226 l 227 | dataset5/keyframe4 227 l 228 | dataset5/keyframe4 228 l 229 | dataset5/keyframe4 229 l 230 | dataset5/keyframe4 230 l 231 | dataset5/keyframe4 231 l 232 | dataset5/keyframe4 232 l 233 | dataset5/keyframe4 233 l 234 | dataset5/keyframe4 234 l 235 | dataset5/keyframe4 235 l 236 | dataset5/keyframe4 236 l 237 | dataset5/keyframe4 237 l 238 | dataset5/keyframe4 238 l 239 | dataset5/keyframe4 239 l 240 | dataset5/keyframe4 240 l 241 | dataset5/keyframe4 241 l 242 | dataset5/keyframe4 242 l 243 | dataset5/keyframe4 243 l 244 | dataset5/keyframe4 244 l 245 | dataset5/keyframe4 245 l 246 | dataset5/keyframe4 246 l 247 | dataset5/keyframe4 247 l 248 | dataset5/keyframe4 248 l 249 | dataset5/keyframe4 249 l 250 | dataset5/keyframe4 250 l 251 | dataset5/keyframe4 251 l 252 | dataset5/keyframe4 252 l 253 | dataset5/keyframe4 253 l 254 | dataset5/keyframe4 254 l 255 | dataset5/keyframe4 255 l 256 | dataset5/keyframe4 256 l 257 | dataset5/keyframe4 257 l 258 | dataset5/keyframe4 258 l 259 | dataset5/keyframe4 259 l 260 | dataset5/keyframe4 260 l 261 | dataset5/keyframe4 261 l 262 | dataset5/keyframe4 262 l 263 | dataset5/keyframe4 263 l 264 | dataset5/keyframe4 264 l 265 | dataset5/keyframe4 265 l 266 | dataset5/keyframe4 266 l 267 | dataset5/keyframe4 267 l 268 | dataset5/keyframe4 268 l 269 | dataset5/keyframe4 269 l 270 | dataset5/keyframe4 270 l 271 | dataset5/keyframe4 271 l 272 | dataset5/keyframe4 272 l 273 | dataset5/keyframe4 
273 l 274 | dataset5/keyframe4 274 l 275 | dataset5/keyframe4 275 l 276 | dataset5/keyframe4 276 l 277 | dataset5/keyframe4 277 l 278 | dataset5/keyframe4 278 l 279 | dataset5/keyframe4 279 l 280 | dataset5/keyframe4 280 l 281 | dataset5/keyframe4 281 l 282 | dataset5/keyframe4 282 l 283 | dataset5/keyframe4 283 l 284 | dataset5/keyframe4 284 l 285 | dataset5/keyframe4 285 l 286 | dataset5/keyframe4 286 l 287 | dataset5/keyframe4 287 l 288 | dataset5/keyframe4 288 l 289 | dataset5/keyframe4 289 l 290 | dataset5/keyframe4 290 l 291 | dataset5/keyframe4 291 l 292 | dataset5/keyframe4 292 l 293 | dataset5/keyframe4 293 l 294 | dataset5/keyframe4 294 l 295 | dataset5/keyframe4 295 l 296 | dataset5/keyframe4 296 l 297 | dataset5/keyframe4 297 l 298 | dataset5/keyframe4 298 l 299 | dataset5/keyframe4 299 l 300 | dataset5/keyframe4 300 l 301 | dataset5/keyframe4 301 l 302 | dataset5/keyframe4 302 l 303 | dataset5/keyframe4 303 l 304 | dataset5/keyframe4 304 l 305 | dataset5/keyframe4 305 l 306 | dataset5/keyframe4 306 l 307 | dataset5/keyframe4 307 l 308 | dataset5/keyframe4 308 l 309 | dataset5/keyframe4 309 l 310 | dataset5/keyframe4 310 l 311 | dataset5/keyframe4 311 l 312 | dataset5/keyframe4 312 l 313 | dataset5/keyframe4 313 l 314 | dataset5/keyframe4 314 l 315 | dataset5/keyframe4 315 l 316 | dataset5/keyframe4 316 l 317 | dataset5/keyframe4 317 l 318 | dataset5/keyframe4 318 l 319 | dataset5/keyframe4 319 l 320 | dataset5/keyframe4 320 l 321 | dataset5/keyframe4 321 l 322 | dataset5/keyframe4 322 l 323 | dataset5/keyframe4 323 l 324 | dataset5/keyframe4 324 l 325 | dataset5/keyframe4 325 l 326 | dataset5/keyframe4 326 l 327 | dataset5/keyframe4 327 l 328 | dataset5/keyframe4 328 l 329 | dataset5/keyframe4 329 l 330 | dataset5/keyframe4 330 l 331 | dataset5/keyframe4 331 l 332 | dataset5/keyframe4 332 l 333 | dataset5/keyframe4 333 l 334 | dataset5/keyframe4 334 l 335 | dataset5/keyframe4 335 l 336 | dataset5/keyframe4 336 l 337 | dataset5/keyframe4 337 l 338 | dataset5/keyframe4 338 l 339 | dataset5/keyframe4 339 l 340 | dataset5/keyframe4 340 l 341 | dataset5/keyframe4 341 l 342 | dataset5/keyframe4 342 l 343 | dataset5/keyframe4 343 l 344 | dataset5/keyframe4 344 l 345 | dataset5/keyframe4 345 l 346 | dataset5/keyframe4 346 l 347 | dataset5/keyframe4 347 l 348 | dataset5/keyframe4 348 l 349 | dataset5/keyframe4 349 l 350 | dataset5/keyframe4 350 l 351 | dataset5/keyframe4 351 l 352 | dataset5/keyframe4 352 l 353 | dataset5/keyframe4 353 l 354 | dataset5/keyframe4 354 l 355 | dataset5/keyframe4 355 l 356 | dataset5/keyframe4 356 l 357 | dataset5/keyframe4 357 l 358 | dataset5/keyframe4 358 l 359 | dataset5/keyframe4 359 l 360 | dataset5/keyframe4 360 l 361 | dataset5/keyframe4 361 l 362 | dataset5/keyframe4 362 l 363 | dataset5/keyframe4 363 l 364 | dataset5/keyframe4 364 l 365 | dataset5/keyframe4 365 l 366 | dataset5/keyframe4 366 l 367 | dataset5/keyframe4 367 l 368 | dataset5/keyframe4 368 l 369 | dataset5/keyframe4 369 l 370 | dataset5/keyframe4 370 l 371 | dataset5/keyframe4 371 l 372 | dataset5/keyframe4 372 l 373 | dataset5/keyframe4 373 l 374 | dataset5/keyframe4 374 l 375 | dataset5/keyframe4 375 l 376 | dataset5/keyframe4 376 l 377 | dataset5/keyframe4 377 l 378 | dataset5/keyframe4 378 l 379 | dataset5/keyframe4 379 l 380 | dataset5/keyframe4 380 l 381 | dataset5/keyframe4 381 l 382 | dataset5/keyframe4 382 l 383 | dataset5/keyframe4 383 l 384 | dataset5/keyframe4 384 l 385 | dataset5/keyframe4 385 l 386 | dataset5/keyframe4 386 l 387 | dataset5/keyframe4 387 l 388 | 
dataset5/keyframe4 388 l 389 | dataset5/keyframe4 389 l 390 | dataset5/keyframe4 390 l 391 | dataset5/keyframe4 391 l 392 | dataset5/keyframe4 392 l 393 | dataset5/keyframe4 393 l 394 | dataset5/keyframe4 394 l 395 | dataset5/keyframe4 395 l 396 | dataset5/keyframe4 396 l 397 | dataset5/keyframe4 397 l 398 | dataset5/keyframe4 398 l 399 | dataset5/keyframe4 399 l 400 | dataset5/keyframe4 400 l 401 | dataset5/keyframe4 401 l 402 | dataset5/keyframe4 402 l 403 | dataset5/keyframe4 403 l 404 | dataset5/keyframe4 404 l 405 | dataset5/keyframe4 405 l 406 | dataset5/keyframe4 406 l 407 | dataset5/keyframe4 407 l 408 | dataset5/keyframe4 408 l 409 | dataset5/keyframe4 409 l 410 | dataset5/keyframe4 410 l 411 | -------------------------------------------------------------------------------- /options.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import argparse 5 | 6 | file_dir = os.path.dirname(__file__) # the directory that options.py resides in 7 | 8 | 9 | class MonodepthOptions: 10 | def __init__(self): 11 | self.parser = argparse.ArgumentParser(description="Monodepthv2 options") 12 | 13 | # PATHS 14 | self.parser.add_argument("--data_path", 15 | type=str, 16 | help="path to the training data", 17 | default=os.path.join(file_dir, "endovis_data")) 18 | self.parser.add_argument("--log_dir", 19 | type=str, 20 | help="log directory", 21 | default=os.path.join(os.path.expanduser("~"), "tmp")) 22 | 23 | # TRAINING options 24 | self.parser.add_argument("--model_name", 25 | type=str, 26 | help="the name of the folder to save the model in", 27 | default="mdp") 28 | self.parser.add_argument("--split", 29 | type=str, 30 | help="which training split to use", 31 | choices=["endovis", "eigen_zhou", "eigen_full", "odom", "benchmark"], 32 | default="endovis") 33 | self.parser.add_argument("--num_layers", 34 | type=int, 35 | help="number of resnet layers", 36 | default=18, 37 | choices=[18, 34, 50, 101, 152]) 38 | self.parser.add_argument("--dataset", 39 | type=str, 40 | help="dataset to train on", 41 | default="endovis", 42 | choices=["endovis", "kitti", "kitti_odom", "kitti_depth", "kitti_test"]) 43 | self.parser.add_argument("--png", 44 | help="if set, trains from raw KITTI png files (instead of jpgs)", 45 | action="store_true") 46 | self.parser.add_argument("--height", 47 | type=int, 48 | help="input image height", 49 | default=256) 50 | self.parser.add_argument("--width", 51 | type=int, 52 | help="input image width", 53 | default=320) 54 | self.parser.add_argument("--disparity_smoothness", 55 | type=float, 56 | help="disparity smoothness weight", 57 | default=1e-4) 58 | self.parser.add_argument("--position_smoothness", 59 | type=float, 60 | help="registration smoothness weight", 61 | default=1e-3) 62 | self.parser.add_argument("--consistency_constraint", 63 | type=float, 64 | help="consistency constraint weight", 65 | default=0.01) 66 | self.parser.add_argument("--epipolar_constraint", 67 | type=float, 68 | help="epipolar constraint weight", 69 | default=0.01) 70 | self.parser.add_argument("--geometry_constraint", 71 | type=float, 72 | help="geometry constraint weight", 73 | default=0.01) 74 | self.parser.add_argument("--transform_constraint", 75 | type=float, 76 | help="transform constraint weight", 77 | default=0.01) 78 | self.parser.add_argument("--transform_smoothness", 79 | type=float, 80 | help="transform smoothness weight", 81 | default=0.01) 82 | 
self.parser.add_argument("--scales", 83 | nargs="+", 84 | type=int, 85 | help="scales used in the loss", 86 | default=[0, 1, 2, 3]) 87 | self.parser.add_argument("--min_depth", 88 | type=float, 89 | help="minimum depth", 90 | default=0.1) 91 | self.parser.add_argument("--max_depth", 92 | type=float, 93 | help="maximum depth", 94 | default=150.0) 95 | self.parser.add_argument("--use_stereo", 96 | help="if set, uses stereo pair for training", 97 | action="store_true") 98 | self.parser.add_argument("--frame_ids", 99 | nargs="+", 100 | type=int, 101 | help="frames to load", 102 | default=[0, -1, 1]) 103 | 104 | # OPTIMIZATION options 105 | self.parser.add_argument("--batch_size", 106 | type=int, 107 | help="batch size", 108 | default=12) 109 | self.parser.add_argument("--learning_rate", 110 | type=float, 111 | help="learning rate", 112 | default=1e-4) 113 | self.parser.add_argument("--num_epochs", 114 | type=int, 115 | help="number of epochs", 116 | default=20) 117 | self.parser.add_argument("--scheduler_step_size", 118 | type=int, 119 | help="step size of the scheduler", 120 | default=10) 121 | 122 | # ABLATION options 123 | self.parser.add_argument("--v1_multiscale", 124 | help="if set, uses monodepth v1 multiscale", 125 | action="store_true") 126 | self.parser.add_argument("--avg_reprojection", 127 | help="if set, uses average reprojection loss", 128 | action="store_true") 129 | self.parser.add_argument("--disable_automasking", 130 | help="if set, doesn't do auto-masking", 131 | action="store_true") 132 | self.parser.add_argument("--predictive_mask", 133 | help="if set, uses a predictive masking scheme as in Zhou et al", 134 | action="store_true") 135 | self.parser.add_argument("--no_ssim", 136 | help="if set, disables ssim in the loss", 137 | action="store_true") 138 | self.parser.add_argument("--weights_init", 139 | type=str, 140 | help="pretrained or scratch", 141 | default="pretrained", 142 | choices=["pretrained", "scratch"]) 143 | self.parser.add_argument("--pose_model_input", 144 | type=str, 145 | help="how many images the pose network gets", 146 | default="pairs", 147 | choices=["pairs", "all"]) 148 | self.parser.add_argument("--pose_model_type", 149 | type=str, 150 | help="normal or shared", 151 | default="separate_resnet", 152 | choices=["posecnn", "separate_resnet", "shared"]) 153 | 154 | # SYSTEM options 155 | self.parser.add_argument("--no_cuda", 156 | help="if set disables CUDA", 157 | action="store_true") 158 | self.parser.add_argument("--num_workers", 159 | type=int, 160 | help="number of dataloader workers", 161 | default=12) 162 | 163 | # LOADING options 164 | self.parser.add_argument("--load_weights_folder", 165 | type=str, 166 | help="name of model to load") 167 | self.parser.add_argument("--models_to_load", 168 | nargs="+", 169 | type=str, 170 | help="models to load", 171 | default=["position_encoder", "position"]) 172 | 173 | # LOGGING options 174 | self.parser.add_argument("--log_frequency", 175 | type=int, 176 | help="number of batches between each tensorboard log", 177 | default=200) 178 | self.parser.add_argument("--save_frequency", 179 | type=int, 180 | help="number of epochs between each save", 181 | default=1) 182 | 183 | # EVALUATION options 184 | self.parser.add_argument("--eval_stereo", 185 | help="if set evaluates in stereo mode", 186 | action="store_true") 187 | self.parser.add_argument("--eval_mono", 188 | help="if set evaluates in mono mode", 189 | action="store_true") 190 | self.parser.add_argument("--disable_median_scaling", 191 | help="if set disables 
median scaling in evaluation", 192 | action="store_true") 193 | self.parser.add_argument("--pred_depth_scale_factor", 194 | help="if set multiplies predictions by this number", 195 | type=float, 196 | default=1) 197 | self.parser.add_argument("--ext_disp_to_eval", 198 | type=str, 199 | help="optional path to a .npy disparities file to evaluate") 200 | self.parser.add_argument("--eval_split", 201 | type=str, 202 | default="endovis", 203 | choices=[ 204 | "eigen", "eigen_benchmark", "benchmark", "odom_9", "odom_10", "endovis"], 205 | help="which split to run eval on") 206 | self.parser.add_argument("--save_pred_disps", 207 | help="if set saves predicted disparities", 208 | action="store_true") 209 | self.parser.add_argument("--no_eval", 210 | help="if set disables evaluation", 211 | action="store_true") 212 | self.parser.add_argument("--eval_eigen_to_benchmark", 213 | help="if set assume we are loading eigen results from npy but " 214 | "we want to evaluate using the new benchmark.", 215 | action="store_true") 216 | self.parser.add_argument("--eval_out_dir", 217 | help="if set will output the disparities to this folder", 218 | type=str) 219 | self.parser.add_argument("--post_process", 220 | help="if set will perform the flipping post processing " 221 | "from the original monodepth paper", 222 | action="store_true") 223 | self.parser.add_argument("--visualize_depth", 224 | help="if set saves visualized depth map", 225 | action="store_true") 226 | self.parser.add_argument("--save_recon", 227 | help="if set saves reconstruction files", 228 | action="store_true") 229 | self.parser.add_argument("--scared_pose_seq", 230 | type=str, 231 | help="pose sequence in scared", 232 | default=1) 233 | self.parser.add_argument("--zero_shot", 234 | help="if set saves reconstruction files", 235 | action="store_true") 236 | self.parser.add_argument("--dam_hf_weights", 237 | help="DAM huggingface weights name", 238 | type=str) 239 | 240 | 241 | def parse(self): 242 | self.options = self.parser.parse_args() 243 | return self.options 244 | -------------------------------------------------------------------------------- /splits/endovis/test_files.txt: -------------------------------------------------------------------------------- 1 | dataset3/keyframe4 390 l 2 | dataset6/keyframe4 770 l 3 | dataset7/keyframe4 250 l 4 | dataset5/keyframe4 25 l 5 | dataset3/keyframe4 180 l 6 | dataset1/keyframe3 242 l 7 | dataset3/keyframe4 70 l 8 | dataset4/keyframe4 14 l 9 | dataset3/keyframe4 20 l 10 | dataset4/keyframe4 118 l 11 | dataset6/keyframe4 1034 l 12 | dataset2/keyframe4 2 l 13 | dataset6/keyframe4 602 l 14 | dataset3/keyframe4 25 l 15 | dataset4/keyframe4 95 l 16 | dataset5/keyframe4 58 l 17 | dataset6/keyframe4 1058 l 18 | dataset7/keyframe4 952 l 19 | dataset1/keyframe3 205 l 20 | dataset5/keyframe4 373 l 21 | dataset3/keyframe4 35 l 22 | dataset5/keyframe4 150 l 23 | dataset6/keyframe4 26 l 24 | dataset4/keyframe4 155 l 25 | dataset3/keyframe4 40 l 26 | dataset6/keyframe4 15 l 27 | dataset3/keyframe4 670 l 28 | dataset1/keyframe3 74 l 29 | dataset3/keyframe4 152 l 30 | dataset1/keyframe3 350 l 31 | dataset6/keyframe4 698 l 32 | dataset7/keyframe4 268 l 33 | dataset2/keyframe4 224 l 34 | dataset1/keyframe3 214 l 35 | dataset2/keyframe4 1223 l 36 | dataset4/keyframe4 8 l 37 | dataset4/keyframe4 10 l 38 | dataset5/keyframe4 219 l 39 | dataset4/keyframe4 150 l 40 | dataset5/keyframe4 55 l 41 | dataset4/keyframe4 182 l 42 | dataset5/keyframe4 114 l 43 | dataset3/keyframe4 306 l 44 | dataset1/keyframe3 365 l 45 | 
dataset5/keyframe4 135 l 46 | dataset7/keyframe4 534 l 47 | dataset3/keyframe4 712 l 48 | dataset1/keyframe3 58 l 49 | dataset6/keyframe4 1274 l 50 | dataset5/keyframe4 86 l 51 | dataset6/keyframe4 1250 l 52 | dataset5/keyframe4 28 l 53 | dataset2/keyframe4 113 l 54 | dataset7/keyframe4 40 l 55 | dataset6/keyframe4 1226 l 56 | dataset1/keyframe3 250 l 57 | dataset7/keyframe4 600 l 58 | dataset2/keyframe4 1704 l 59 | dataset3/keyframe4 138 l 60 | dataset4/keyframe4 54 l 61 | dataset5/keyframe4 45 l 62 | dataset1/keyframe3 458 l 63 | dataset5/keyframe4 121 l 64 | dataset4/keyframe4 42 l 65 | dataset5/keyframe4 67 l 66 | dataset7/keyframe4 65 l 67 | dataset4/keyframe4 115 l 68 | dataset5/keyframe4 394 l 69 | dataset6/keyframe4 146 l 70 | dataset6/keyframe4 70 l 71 | dataset4/keyframe4 162 l 72 | dataset7/keyframe4 382 l 73 | dataset4/keyframe4 126 l 74 | dataset3/keyframe4 365 l 75 | dataset1/keyframe3 306 l 76 | dataset5/keyframe4 40 l 77 | dataset4/keyframe4 154 l 78 | dataset6/keyframe4 20 l 79 | dataset2/keyframe4 25 l 80 | dataset3/keyframe4 516 l 81 | dataset5/keyframe4 198 l 82 | dataset6/keyframe4 650 l 83 | dataset4/keyframe4 134 l 84 | dataset7/keyframe4 1180 l 85 | dataset7/keyframe4 45 l 86 | dataset4/keyframe4 6 l 87 | dataset7/keyframe4 50 l 88 | dataset1/keyframe3 222 l 89 | dataset2/keyframe4 350 l 90 | dataset3/keyframe4 768 l 91 | dataset6/keyframe4 72 l 92 | dataset3/keyframe4 796 l 93 | dataset6/keyframe4 2 l 94 | dataset3/keyframe4 824 l 95 | dataset7/keyframe4 800 l 96 | dataset5/keyframe4 331 l 97 | dataset7/keyframe4 1750 l 98 | dataset2/keyframe4 1260 l 99 | dataset7/keyframe4 344 l 100 | dataset3/keyframe4 208 l 101 | dataset3/keyframe4 432 l 102 | dataset5/keyframe4 387 l 103 | dataset6/keyframe4 10 l 104 | dataset3/keyframe4 96 l 105 | dataset4/keyframe4 79 l 106 | dataset6/keyframe4 450 l 107 | dataset1/keyframe3 370 l 108 | dataset7/keyframe4 1978 l 109 | dataset4/keyframe4 198 l 110 | dataset3/keyframe4 348 l 111 | dataset6/keyframe4 30 l 112 | dataset2/keyframe4 1630 l 113 | dataset4/keyframe4 60 l 114 | dataset2/keyframe4 1149 l 115 | dataset4/keyframe4 22 l 116 | dataset7/keyframe4 686 l 117 | dataset3/keyframe4 698 l 118 | dataset5/keyframe4 23 l 119 | dataset5/keyframe4 191 l 120 | dataset3/keyframe4 325 l 121 | dataset6/keyframe4 866 l 122 | dataset6/keyframe4 1154 l 123 | dataset3/keyframe4 236 l 124 | dataset2/keyframe4 631 l 125 | dataset3/keyframe4 395 l 126 | dataset3/keyframe4 502 l 127 | dataset6/keyframe4 1130 l 128 | dataset6/keyframe4 530 l 129 | dataset5/keyframe4 324 l 130 | dataset5/keyframe4 184 l 131 | dataset3/keyframe4 60 l 132 | dataset7/keyframe4 500 l 133 | dataset5/keyframe4 160 l 134 | dataset7/keyframe4 1066 l 135 | dataset1/keyframe3 410 l 136 | dataset6/keyframe4 500 l 137 | dataset4/keyframe4 50 l 138 | dataset4/keyframe4 158 l 139 | dataset2/keyframe4 927 l 140 | dataset2/keyframe4 39 l 141 | dataset7/keyframe4 1522 l 142 | dataset4/keyframe4 4 l 143 | dataset1/keyframe3 50 l 144 | dataset5/keyframe4 289 l 145 | dataset1/keyframe3 234 l 146 | dataset7/keyframe4 1256 l 147 | dataset5/keyframe4 170 l 148 | dataset4/keyframe4 175 l 149 | dataset3/keyframe4 656 l 150 | dataset3/keyframe4 614 l 151 | dataset4/keyframe4 26 l 152 | dataset6/keyframe4 300 l 153 | dataset4/keyframe4 35 l 154 | dataset6/keyframe4 962 l 155 | dataset4/keyframe4 166 l 156 | dataset7/keyframe4 550 l 157 | dataset1/keyframe3 375 l 158 | dataset3/keyframe4 222 l 159 | dataset2/keyframe4 340 l 160 | dataset7/keyframe4 55 l 161 | dataset1/keyframe3 190 l 162 | 
dataset5/keyframe4 69 l 163 | dataset2/keyframe4 1334 l 164 | dataset6/keyframe4 25 l 165 | dataset1/keyframe3 122 l 166 | dataset5/keyframe4 93 l 167 | dataset2/keyframe4 35 l 168 | dataset7/keyframe4 990 l 169 | dataset3/keyframe4 292 l 170 | dataset2/keyframe4 20 l 171 | dataset5/keyframe4 44 l 172 | dataset2/keyframe4 150 l 173 | dataset7/keyframe4 400 l 174 | dataset1/keyframe3 195 l 175 | dataset6/keyframe4 122 l 176 | dataset3/keyframe4 264 l 177 | dataset2/keyframe4 853 l 178 | dataset7/keyframe4 25 l 179 | dataset6/keyframe4 674 l 180 | dataset4/keyframe4 46 l 181 | dataset2/keyframe4 372 l 182 | dataset5/keyframe4 177 l 183 | dataset5/keyframe4 128 l 184 | dataset5/keyframe4 120 l 185 | dataset7/keyframe4 150 l 186 | dataset2/keyframe4 60 l 187 | dataset1/keyframe3 130 l 188 | dataset1/keyframe3 362 l 189 | dataset1/keyframe3 394 l 190 | dataset4/keyframe4 178 l 191 | dataset1/keyframe3 98 l 192 | dataset7/keyframe4 30 l 193 | dataset1/keyframe3 466 l 194 | dataset4/keyframe4 45 l 195 | dataset1/keyframe3 66 l 196 | dataset7/keyframe4 1902 l 197 | dataset1/keyframe3 146 l 198 | dataset6/keyframe4 17 l 199 | dataset1/keyframe3 322 l 200 | dataset6/keyframe4 75 l 201 | dataset5/keyframe4 72 l 202 | dataset2/keyframe4 1519 l 203 | dataset6/keyframe4 434 l 204 | dataset4/keyframe4 202 l 205 | dataset5/keyframe4 9 l 206 | dataset7/keyframe4 1446 l 207 | dataset6/keyframe4 250 l 208 | dataset4/keyframe4 82 l 209 | dataset3/keyframe4 404 l 210 | dataset6/keyframe4 890 l 211 | dataset7/keyframe4 70 l 212 | dataset1/keyframe3 418 l 213 | dataset3/keyframe4 75 l 214 | dataset5/keyframe4 33 l 215 | dataset4/keyframe4 110 l 216 | dataset3/keyframe4 55 l 217 | dataset6/keyframe4 722 l 218 | dataset2/keyframe4 446 l 219 | dataset6/keyframe4 482 l 220 | dataset5/keyframe4 37 l 221 | dataset1/keyframe3 162 l 222 | dataset5/keyframe4 212 l 223 | dataset2/keyframe4 483 l 224 | dataset1/keyframe3 238 l 225 | dataset1/keyframe3 26 l 226 | dataset4/keyframe4 78 l 227 | dataset5/keyframe4 190 l 228 | dataset7/keyframe4 762 l 229 | dataset2/keyframe4 705 l 230 | dataset4/keyframe4 70 l 231 | dataset3/keyframe4 558 l 232 | dataset6/keyframe4 914 l 233 | dataset6/keyframe4 506 l 234 | dataset2/keyframe4 779 l 235 | dataset6/keyframe4 938 l 236 | dataset2/keyframe4 890 l 237 | dataset6/keyframe4 200 l 238 | dataset7/keyframe4 1484 l 239 | dataset2/keyframe4 816 l 240 | dataset3/keyframe4 362 l 241 | dataset6/keyframe4 1082 l 242 | dataset2/keyframe4 420 l 243 | dataset4/keyframe4 2 l 244 | dataset4/keyframe4 102 l 245 | dataset1/keyframe3 442 l 246 | dataset1/keyframe3 210 l 247 | dataset5/keyframe4 16 l 248 | dataset3/keyframe4 474 l 249 | dataset3/keyframe4 544 l 250 | dataset4/keyframe4 174 l 251 | dataset6/keyframe4 150 l 252 | dataset4/keyframe4 105 l 253 | dataset7/keyframe4 300 l 254 | dataset3/keyframe4 385 l 255 | dataset1/keyframe3 330 l 256 | dataset4/keyframe4 58 l 257 | dataset4/keyframe4 142 l 258 | dataset7/keyframe4 1028 l 259 | dataset2/keyframe4 1482 l 260 | dataset2/keyframe4 1075 l 261 | dataset5/keyframe4 233 l 262 | dataset4/keyframe4 62 l 263 | dataset7/keyframe4 496 l 264 | dataset1/keyframe3 345 l 265 | dataset3/keyframe4 812 l 266 | dataset2/keyframe4 390 l 267 | dataset2/keyframe4 76 l 268 | dataset3/keyframe4 375 l 269 | dataset1/keyframe3 426 l 270 | dataset6/keyframe4 79 l 271 | dataset4/keyframe4 66 l 272 | dataset6/keyframe4 35 l 273 | dataset7/keyframe4 230 l 274 | dataset4/keyframe4 138 l 275 | dataset3/keyframe4 58 l 276 | dataset5/keyframe4 380 l 277 | 
dataset1/keyframe3 266 l 278 | dataset2/keyframe4 2037 l 279 | dataset3/keyframe4 405 l 280 | dataset3/keyframe4 530 l 281 | dataset3/keyframe4 30 l 282 | dataset1/keyframe3 138 l 283 | dataset1/keyframe3 230 l 284 | dataset5/keyframe4 254 l 285 | dataset1/keyframe3 360 l 286 | dataset6/keyframe4 45 l 287 | dataset7/keyframe4 1104 l 288 | dataset5/keyframe4 352 l 289 | dataset7/keyframe4 838 l 290 | dataset2/keyframe4 50 l 291 | dataset3/keyframe4 16 l 292 | dataset6/keyframe4 1106 l 293 | dataset6/keyframe4 746 l 294 | dataset5/keyframe4 30 l 295 | dataset5/keyframe4 163 l 296 | dataset2/keyframe4 520 l 297 | dataset7/keyframe4 60 l 298 | dataset5/keyframe4 296 l 299 | dataset7/keyframe4 458 l 300 | dataset7/keyframe4 1788 l 301 | dataset7/keyframe4 20 l 302 | dataset3/keyframe4 110 l 303 | dataset6/keyframe4 578 l 304 | dataset4/keyframe4 74 l 305 | dataset1/keyframe3 274 l 306 | dataset7/keyframe4 1636 l 307 | dataset5/keyframe4 60 l 308 | dataset3/keyframe4 642 l 309 | dataset2/keyframe4 79 l 310 | dataset7/keyframe4 1864 l 311 | dataset3/keyframe4 124 l 312 | dataset6/keyframe4 362 l 313 | dataset5/keyframe4 140 l 314 | dataset7/keyframe4 420 l 315 | dataset5/keyframe4 53 l 316 | dataset3/keyframe4 355 l 317 | dataset4/keyframe4 186 l 318 | dataset2/keyframe4 55 l 319 | dataset6/keyframe4 626 l 320 | dataset7/keyframe4 610 l 321 | dataset1/keyframe3 335 l 322 | dataset6/keyframe4 218 l 323 | dataset5/keyframe4 210 l 324 | dataset7/keyframe4 1142 l 325 | dataset6/keyframe4 40 l 326 | dataset2/keyframe4 261 l 327 | dataset5/keyframe4 366 l 328 | dataset5/keyframe4 18 l 329 | dataset2/keyframe4 1001 l 330 | dataset6/keyframe4 986 l 331 | dataset7/keyframe4 78 l 332 | dataset1/keyframe3 450 l 333 | dataset7/keyframe4 1826 l 334 | dataset5/keyframe4 240 l 335 | dataset4/keyframe4 122 l 336 | dataset3/keyframe4 740 l 337 | dataset5/keyframe4 130 l 338 | dataset5/keyframe4 268 l 339 | dataset2/keyframe4 380 l 340 | dataset2/keyframe4 45 l 341 | dataset6/keyframe4 77 l 342 | dataset1/keyframe3 170 l 343 | dataset6/keyframe4 818 l 344 | dataset4/keyframe4 16 l 345 | dataset5/keyframe4 10 l 346 | dataset1/keyframe3 226 l 347 | dataset3/keyframe4 684 l 348 | dataset3/keyframe4 572 l 349 | dataset6/keyframe4 350 l 350 | dataset5/keyframe4 35 l 351 | dataset3/keyframe4 415 l 352 | dataset6/keyframe4 1202 l 353 | dataset4/keyframe4 25 l 354 | dataset7/keyframe4 1370 l 355 | dataset1/keyframe3 385 l 356 | dataset3/keyframe4 418 l 357 | dataset5/keyframe4 110 l 358 | dataset1/keyframe3 386 l 359 | dataset5/keyframe4 282 l 360 | dataset6/keyframe4 55 l 361 | dataset3/keyframe4 726 l 362 | dataset2/keyframe4 2000 l 363 | dataset1/keyframe3 434 l 364 | dataset4/keyframe4 75 l 365 | dataset2/keyframe4 298 l 366 | dataset5/keyframe4 20 l 367 | dataset7/keyframe4 154 l 368 | dataset1/keyframe3 10 l 369 | dataset2/keyframe4 410 l 370 | dataset1/keyframe3 82 l 371 | dataset1/keyframe3 18 l 372 | dataset4/keyframe4 114 l 373 | dataset2/keyframe4 1038 l 374 | dataset1/keyframe3 106 l 375 | dataset6/keyframe4 23 l 376 | dataset6/keyframe4 842 l 377 | dataset5/keyframe4 200 l 378 | dataset3/keyframe4 345 l 379 | dataset7/keyframe4 2130 l 380 | dataset2/keyframe4 400 l 381 | dataset6/keyframe4 1298 l 382 | dataset5/keyframe4 261 l 383 | dataset2/keyframe4 1297 l 384 | dataset1/keyframe3 355 l 385 | dataset6/keyframe4 65 l 386 | dataset7/keyframe4 1408 l 387 | dataset2/keyframe4 1186 l 388 | dataset7/keyframe4 724 l 389 | dataset3/keyframe4 65 l 390 | dataset2/keyframe4 1556 l 391 | dataset5/keyframe4 205 l 392 | 
dataset3/keyframe4 782 l 393 | dataset4/keyframe4 170 l 394 | dataset4/keyframe4 206 l 395 | dataset6/keyframe4 338 l 396 | dataset1/keyframe3 2 l 397 | dataset6/keyframe4 100 l 398 | dataset7/keyframe4 1218 l 399 | dataset1/keyframe3 34 l 400 | dataset1/keyframe3 282 l 401 | dataset4/keyframe4 98 l 402 | dataset5/keyframe4 317 l 403 | dataset4/keyframe4 38 l 404 | dataset7/keyframe4 1940 l 405 | dataset6/keyframe4 170 l 406 | dataset5/keyframe4 76 l 407 | dataset6/keyframe4 266 l 408 | dataset6/keyframe4 554 l 409 | dataset2/keyframe4 335 l 410 | dataset2/keyframe4 742 l 411 | dataset2/keyframe4 1408 l 412 | dataset7/keyframe4 1712 l 413 | dataset1/keyframe3 258 l 414 | dataset7/keyframe4 306 l 415 | dataset2/keyframe4 1445 l 416 | dataset1/keyframe3 346 l 417 | dataset2/keyframe4 2074 l 418 | dataset5/keyframe4 51 l 419 | dataset1/keyframe3 114 l 420 | dataset1/keyframe3 290 l 421 | dataset6/keyframe4 410 l 422 | dataset7/keyframe4 35 l 423 | dataset3/keyframe4 376 l 424 | dataset1/keyframe3 90 l 425 | dataset4/keyframe4 130 l 426 | dataset2/keyframe4 1593 l 427 | dataset1/keyframe3 340 l 428 | dataset2/keyframe4 1371 l 429 | dataset6/keyframe4 794 l 430 | dataset3/keyframe4 334 l 431 | dataset2/keyframe4 360 l 432 | dataset4/keyframe4 30 l 433 | dataset4/keyframe4 125 l 434 | dataset4/keyframe4 40 l 435 | dataset7/keyframe4 914 l 436 | dataset3/keyframe4 250 l 437 | dataset3/keyframe4 460 l 438 | dataset6/keyframe4 1010 l 439 | dataset2/keyframe4 1667 l 440 | dataset3/keyframe4 600 l 441 | dataset3/keyframe4 446 l 442 | dataset5/keyframe4 107 l 443 | dataset5/keyframe4 5 l 444 | dataset4/keyframe4 135 l 445 | dataset4/keyframe4 94 l 446 | dataset2/keyframe4 187 l 447 | dataset2/keyframe4 668 l 448 | dataset4/keyframe4 146 l 449 | dataset4/keyframe4 34 l 450 | dataset5/keyframe4 156 l 451 | dataset6/keyframe4 1178 l 452 | dataset7/keyframe4 2054 l 453 | dataset5/keyframe4 180 l 454 | dataset5/keyframe4 47 l 455 | dataset7/keyframe4 200 l 456 | dataset6/keyframe4 1322 l 457 | dataset3/keyframe4 320 l 458 | dataset4/keyframe4 194 l 459 | dataset7/keyframe4 1598 l 460 | dataset3/keyframe4 50 l 461 | dataset7/keyframe4 15 l 462 | dataset5/keyframe4 79 l 463 | dataset4/keyframe4 145 l 464 | dataset3/keyframe4 2 l 465 | dataset4/keyframe4 190 l 466 | dataset5/keyframe4 149 l 467 | dataset4/keyframe4 165 l 468 | dataset6/keyframe4 290 l 469 | dataset7/keyframe4 116 l 470 | dataset6/keyframe4 50 l 471 | dataset2/keyframe4 2111 l 472 | dataset7/keyframe4 2092 l 473 | dataset1/keyframe3 298 l 474 | dataset7/keyframe4 350 l 475 | dataset4/keyframe4 185 l 476 | dataset1/keyframe3 338 l 477 | dataset5/keyframe4 100 l 478 | dataset7/keyframe4 2 l 479 | dataset3/keyframe4 194 l 480 | dataset5/keyframe4 63 l 481 | dataset4/keyframe4 55 l 482 | dataset6/keyframe4 60 l 483 | dataset1/keyframe3 186 l 484 | dataset7/keyframe4 876 l 485 | dataset1/keyframe3 154 l 486 | dataset7/keyframe4 572 l 487 | dataset5/keyframe4 142 l 488 | dataset5/keyframe4 2 l 489 | dataset1/keyframe3 246 l 490 | dataset5/keyframe4 275 l 491 | dataset5/keyframe4 65 l 492 | dataset4/keyframe4 65 l 493 | dataset7/keyframe4 2016 l 494 | dataset3/keyframe4 335 l 495 | dataset3/keyframe4 628 l 496 | dataset2/keyframe4 557 l 497 | dataset5/keyframe4 345 l 498 | dataset6/keyframe4 194 l 499 | dataset6/keyframe4 400 l 500 | dataset5/keyframe4 359 l 501 | dataset1/keyframe3 203 l 502 | dataset1/keyframe3 218 l 503 | dataset2/keyframe4 1112 l 504 | dataset1/keyframe3 193 l 505 | dataset1/keyframe3 378 l 506 | dataset5/keyframe4 247 l 507 | 
dataset7/keyframe4 2168 l 508 | dataset2/keyframe4 75 l 509 | dataset4/keyframe4 18 l 510 | dataset1/keyframe3 380 l 511 | dataset2/keyframe4 370 l 512 | dataset5/keyframe4 338 l 513 | dataset3/keyframe4 166 l 514 | dataset2/keyframe4 594 l 515 | dataset5/keyframe4 13 l 516 | dataset6/keyframe4 386 l 517 | dataset3/keyframe4 72 l 518 | dataset1/keyframe3 42 l 519 | dataset7/keyframe4 648 l 520 | dataset2/keyframe4 409 l 521 | dataset4/keyframe4 20 l 522 | dataset1/keyframe3 402 l 523 | dataset6/keyframe4 1346 l 524 | dataset1/keyframe3 200 l 525 | dataset7/keyframe4 1294 l 526 | dataset1/keyframe3 178 l 527 | dataset2/keyframe4 964 l 528 | dataset5/keyframe4 310 l 529 | dataset3/keyframe4 754 l 530 | dataset3/keyframe4 488 l 531 | dataset7/keyframe4 192 l 532 | dataset6/keyframe4 242 l 533 | dataset6/keyframe4 314 l 534 | dataset7/keyframe4 450 l 535 | dataset4/keyframe4 90 l 536 | dataset1/keyframe3 354 l 537 | dataset2/keyframe4 430 l 538 | dataset7/keyframe4 1332 l 539 | dataset2/keyframe4 30 l 540 | dataset5/keyframe4 303 l 541 | dataset1/keyframe3 314 l 542 | dataset5/keyframe4 401 l 543 | dataset3/keyframe4 586 l 544 | dataset3/keyframe4 44 l 545 | dataset5/keyframe4 226 l 546 | dataset6/keyframe4 550 l 547 | dataset6/keyframe4 458 l 548 | dataset4/keyframe4 12 l 549 | dataset4/keyframe4 86 l 550 | dataset4/keyframe4 106 l 551 | dataset3/keyframe4 278 l -------------------------------------------------------------------------------- /splits/endovis/test_files_sequence2.txt: -------------------------------------------------------------------------------- 1 | dataset3/keyframe4 1 l 2 | dataset3/keyframe4 2 l 3 | dataset3/keyframe4 3 l 4 | dataset3/keyframe4 4 l 5 | dataset3/keyframe4 5 l 6 | dataset3/keyframe4 6 l 7 | dataset3/keyframe4 7 l 8 | dataset3/keyframe4 8 l 9 | dataset3/keyframe4 9 l 10 | dataset3/keyframe4 10 l 11 | dataset3/keyframe4 11 l 12 | dataset3/keyframe4 12 l 13 | dataset3/keyframe4 13 l 14 | dataset3/keyframe4 14 l 15 | dataset3/keyframe4 15 l 16 | dataset3/keyframe4 16 l 17 | dataset3/keyframe4 17 l 18 | dataset3/keyframe4 18 l 19 | dataset3/keyframe4 19 l 20 | dataset3/keyframe4 20 l 21 | dataset3/keyframe4 21 l 22 | dataset3/keyframe4 22 l 23 | dataset3/keyframe4 23 l 24 | dataset3/keyframe4 24 l 25 | dataset3/keyframe4 25 l 26 | dataset3/keyframe4 26 l 27 | dataset3/keyframe4 27 l 28 | dataset3/keyframe4 28 l 29 | dataset3/keyframe4 29 l 30 | dataset3/keyframe4 30 l 31 | dataset3/keyframe4 31 l 32 | dataset3/keyframe4 32 l 33 | dataset3/keyframe4 33 l 34 | dataset3/keyframe4 34 l 35 | dataset3/keyframe4 35 l 36 | dataset3/keyframe4 36 l 37 | dataset3/keyframe4 37 l 38 | dataset3/keyframe4 38 l 39 | dataset3/keyframe4 39 l 40 | dataset3/keyframe4 40 l 41 | dataset3/keyframe4 41 l 42 | dataset3/keyframe4 42 l 43 | dataset3/keyframe4 43 l 44 | dataset3/keyframe4 44 l 45 | dataset3/keyframe4 45 l 46 | dataset3/keyframe4 46 l 47 | dataset3/keyframe4 47 l 48 | dataset3/keyframe4 48 l 49 | dataset3/keyframe4 49 l 50 | dataset3/keyframe4 50 l 51 | dataset3/keyframe4 51 l 52 | dataset3/keyframe4 52 l 53 | dataset3/keyframe4 53 l 54 | dataset3/keyframe4 54 l 55 | dataset3/keyframe4 55 l 56 | dataset3/keyframe4 56 l 57 | dataset3/keyframe4 57 l 58 | dataset3/keyframe4 58 l 59 | dataset3/keyframe4 59 l 60 | dataset3/keyframe4 60 l 61 | dataset3/keyframe4 61 l 62 | dataset3/keyframe4 62 l 63 | dataset3/keyframe4 63 l 64 | dataset3/keyframe4 64 l 65 | dataset3/keyframe4 65 l 66 | dataset3/keyframe4 66 l 67 | dataset3/keyframe4 67 l 68 | dataset3/keyframe4 68 l 69 | 
dataset3/keyframe4 69 l 70 | dataset3/keyframe4 70 l 71 | dataset3/keyframe4 71 l 72 | dataset3/keyframe4 72 l 73 | dataset3/keyframe4 73 l 74 | dataset3/keyframe4 74 l 75 | dataset3/keyframe4 75 l 76 | dataset3/keyframe4 76 l 77 | dataset3/keyframe4 77 l 78 | dataset3/keyframe4 78 l 79 | dataset3/keyframe4 79 l 80 | dataset3/keyframe4 80 l 81 | dataset3/keyframe4 81 l 82 | dataset3/keyframe4 82 l 83 | dataset3/keyframe4 83 l 84 | dataset3/keyframe4 84 l 85 | dataset3/keyframe4 85 l 86 | dataset3/keyframe4 86 l 87 | dataset3/keyframe4 87 l 88 | dataset3/keyframe4 88 l 89 | dataset3/keyframe4 89 l 90 | dataset3/keyframe4 90 l 91 | dataset3/keyframe4 91 l 92 | dataset3/keyframe4 92 l 93 | dataset3/keyframe4 93 l 94 | dataset3/keyframe4 94 l 95 | dataset3/keyframe4 95 l 96 | dataset3/keyframe4 96 l 97 | dataset3/keyframe4 97 l 98 | dataset3/keyframe4 98 l 99 | dataset3/keyframe4 99 l 100 | dataset3/keyframe4 100 l 101 | dataset3/keyframe4 101 l 102 | dataset3/keyframe4 102 l 103 | dataset3/keyframe4 103 l 104 | dataset3/keyframe4 104 l 105 | dataset3/keyframe4 105 l 106 | dataset3/keyframe4 106 l 107 | dataset3/keyframe4 107 l 108 | dataset3/keyframe4 108 l 109 | dataset3/keyframe4 109 l 110 | dataset3/keyframe4 110 l 111 | dataset3/keyframe4 111 l 112 | dataset3/keyframe4 112 l 113 | dataset3/keyframe4 113 l 114 | dataset3/keyframe4 114 l 115 | dataset3/keyframe4 115 l 116 | dataset3/keyframe4 116 l 117 | dataset3/keyframe4 117 l 118 | dataset3/keyframe4 118 l 119 | dataset3/keyframe4 119 l 120 | dataset3/keyframe4 120 l 121 | dataset3/keyframe4 121 l 122 | dataset3/keyframe4 122 l 123 | dataset3/keyframe4 123 l 124 | dataset3/keyframe4 124 l 125 | dataset3/keyframe4 125 l 126 | dataset3/keyframe4 126 l 127 | dataset3/keyframe4 127 l 128 | dataset3/keyframe4 128 l 129 | dataset3/keyframe4 129 l 130 | dataset3/keyframe4 130 l 131 | dataset3/keyframe4 131 l 132 | dataset3/keyframe4 132 l 133 | dataset3/keyframe4 133 l 134 | dataset3/keyframe4 134 l 135 | dataset3/keyframe4 135 l 136 | dataset3/keyframe4 136 l 137 | dataset3/keyframe4 137 l 138 | dataset3/keyframe4 138 l 139 | dataset3/keyframe4 139 l 140 | dataset3/keyframe4 140 l 141 | dataset3/keyframe4 141 l 142 | dataset3/keyframe4 142 l 143 | dataset3/keyframe4 143 l 144 | dataset3/keyframe4 144 l 145 | dataset3/keyframe4 145 l 146 | dataset3/keyframe4 146 l 147 | dataset3/keyframe4 147 l 148 | dataset3/keyframe4 148 l 149 | dataset3/keyframe4 149 l 150 | dataset3/keyframe4 150 l 151 | dataset3/keyframe4 151 l 152 | dataset3/keyframe4 152 l 153 | dataset3/keyframe4 153 l 154 | dataset3/keyframe4 154 l 155 | dataset3/keyframe4 155 l 156 | dataset3/keyframe4 156 l 157 | dataset3/keyframe4 157 l 158 | dataset3/keyframe4 158 l 159 | dataset3/keyframe4 159 l 160 | dataset3/keyframe4 160 l 161 | dataset3/keyframe4 161 l 162 | dataset3/keyframe4 162 l 163 | dataset3/keyframe4 163 l 164 | dataset3/keyframe4 164 l 165 | dataset3/keyframe4 165 l 166 | dataset3/keyframe4 166 l 167 | dataset3/keyframe4 167 l 168 | dataset3/keyframe4 168 l 169 | dataset3/keyframe4 169 l 170 | dataset3/keyframe4 170 l 171 | dataset3/keyframe4 171 l 172 | dataset3/keyframe4 172 l 173 | dataset3/keyframe4 173 l 174 | dataset3/keyframe4 174 l 175 | dataset3/keyframe4 175 l 176 | dataset3/keyframe4 176 l 177 | dataset3/keyframe4 177 l 178 | dataset3/keyframe4 178 l 179 | dataset3/keyframe4 179 l 180 | dataset3/keyframe4 180 l 181 | dataset3/keyframe4 181 l 182 | dataset3/keyframe4 182 l 183 | dataset3/keyframe4 183 l 184 | dataset3/keyframe4 184 l 185 | dataset3/keyframe4 
185 l 186 | dataset3/keyframe4 186 l 187 | dataset3/keyframe4 187 l 188 | dataset3/keyframe4 188 l 189 | dataset3/keyframe4 189 l 190 | dataset3/keyframe4 190 l 191 | dataset3/keyframe4 191 l 192 | dataset3/keyframe4 192 l 193 | dataset3/keyframe4 193 l 194 | dataset3/keyframe4 194 l 195 | dataset3/keyframe4 195 l 196 | dataset3/keyframe4 196 l 197 | dataset3/keyframe4 197 l 198 | dataset3/keyframe4 198 l 199 | dataset3/keyframe4 199 l 200 | dataset3/keyframe4 200 l 201 | dataset3/keyframe4 201 l 202 | dataset3/keyframe4 202 l 203 | dataset3/keyframe4 203 l 204 | dataset3/keyframe4 204 l 205 | dataset3/keyframe4 205 l 206 | dataset3/keyframe4 206 l 207 | dataset3/keyframe4 207 l 208 | dataset3/keyframe4 208 l 209 | dataset3/keyframe4 209 l 210 | dataset3/keyframe4 210 l 211 | dataset3/keyframe4 211 l 212 | dataset3/keyframe4 212 l 213 | dataset3/keyframe4 213 l 214 | dataset3/keyframe4 214 l 215 | dataset3/keyframe4 215 l 216 | dataset3/keyframe4 216 l 217 | dataset3/keyframe4 217 l 218 | dataset3/keyframe4 218 l 219 | dataset3/keyframe4 219 l 220 | dataset3/keyframe4 220 l 221 | dataset3/keyframe4 221 l 222 | dataset3/keyframe4 222 l 223 | dataset3/keyframe4 223 l 224 | dataset3/keyframe4 224 l 225 | dataset3/keyframe4 225 l 226 | dataset3/keyframe4 226 l 227 | dataset3/keyframe4 227 l 228 | dataset3/keyframe4 228 l 229 | dataset3/keyframe4 229 l 230 | dataset3/keyframe4 230 l 231 | dataset3/keyframe4 231 l 232 | dataset3/keyframe4 232 l 233 | dataset3/keyframe4 233 l 234 | dataset3/keyframe4 234 l 235 | dataset3/keyframe4 235 l 236 | dataset3/keyframe4 236 l 237 | dataset3/keyframe4 237 l 238 | dataset3/keyframe4 238 l 239 | dataset3/keyframe4 239 l 240 | dataset3/keyframe4 240 l 241 | dataset3/keyframe4 241 l 242 | dataset3/keyframe4 242 l 243 | dataset3/keyframe4 243 l 244 | dataset3/keyframe4 244 l 245 | dataset3/keyframe4 245 l 246 | dataset3/keyframe4 246 l 247 | dataset3/keyframe4 247 l 248 | dataset3/keyframe4 248 l 249 | dataset3/keyframe4 249 l 250 | dataset3/keyframe4 250 l 251 | dataset3/keyframe4 251 l 252 | dataset3/keyframe4 252 l 253 | dataset3/keyframe4 253 l 254 | dataset3/keyframe4 254 l 255 | dataset3/keyframe4 255 l 256 | dataset3/keyframe4 256 l 257 | dataset3/keyframe4 257 l 258 | dataset3/keyframe4 258 l 259 | dataset3/keyframe4 259 l 260 | dataset3/keyframe4 260 l 261 | dataset3/keyframe4 261 l 262 | dataset3/keyframe4 262 l 263 | dataset3/keyframe4 263 l 264 | dataset3/keyframe4 264 l 265 | dataset3/keyframe4 265 l 266 | dataset3/keyframe4 266 l 267 | dataset3/keyframe4 267 l 268 | dataset3/keyframe4 268 l 269 | dataset3/keyframe4 269 l 270 | dataset3/keyframe4 270 l 271 | dataset3/keyframe4 271 l 272 | dataset3/keyframe4 272 l 273 | dataset3/keyframe4 273 l 274 | dataset3/keyframe4 274 l 275 | dataset3/keyframe4 275 l 276 | dataset3/keyframe4 276 l 277 | dataset3/keyframe4 277 l 278 | dataset3/keyframe4 278 l 279 | dataset3/keyframe4 279 l 280 | dataset3/keyframe4 280 l 281 | dataset3/keyframe4 281 l 282 | dataset3/keyframe4 282 l 283 | dataset3/keyframe4 283 l 284 | dataset3/keyframe4 284 l 285 | dataset3/keyframe4 285 l 286 | dataset3/keyframe4 286 l 287 | dataset3/keyframe4 287 l 288 | dataset3/keyframe4 288 l 289 | dataset3/keyframe4 289 l 290 | dataset3/keyframe4 290 l 291 | dataset3/keyframe4 291 l 292 | dataset3/keyframe4 292 l 293 | dataset3/keyframe4 293 l 294 | dataset3/keyframe4 294 l 295 | dataset3/keyframe4 295 l 296 | dataset3/keyframe4 296 l 297 | dataset3/keyframe4 297 l 298 | dataset3/keyframe4 298 l 299 | dataset3/keyframe4 299 l 300 | 
dataset3/keyframe4 300 l 301 | dataset3/keyframe4 301 l 302 | dataset3/keyframe4 302 l 303 | dataset3/keyframe4 303 l 304 | dataset3/keyframe4 304 l 305 | dataset3/keyframe4 305 l 306 | dataset3/keyframe4 306 l 307 | dataset3/keyframe4 307 l 308 | dataset3/keyframe4 308 l 309 | dataset3/keyframe4 309 l 310 | dataset3/keyframe4 310 l 311 | dataset3/keyframe4 311 l 312 | dataset3/keyframe4 312 l 313 | dataset3/keyframe4 313 l 314 | dataset3/keyframe4 314 l 315 | dataset3/keyframe4 315 l 316 | dataset3/keyframe4 316 l 317 | dataset3/keyframe4 317 l 318 | dataset3/keyframe4 318 l 319 | dataset3/keyframe4 319 l 320 | dataset3/keyframe4 320 l 321 | dataset3/keyframe4 321 l 322 | dataset3/keyframe4 322 l 323 | dataset3/keyframe4 323 l 324 | dataset3/keyframe4 324 l 325 | dataset3/keyframe4 325 l 326 | dataset3/keyframe4 326 l 327 | dataset3/keyframe4 327 l 328 | dataset3/keyframe4 328 l 329 | dataset3/keyframe4 329 l 330 | dataset3/keyframe4 330 l 331 | dataset3/keyframe4 331 l 332 | dataset3/keyframe4 332 l 333 | dataset3/keyframe4 333 l 334 | dataset3/keyframe4 334 l 335 | dataset3/keyframe4 335 l 336 | dataset3/keyframe4 336 l 337 | dataset3/keyframe4 337 l 338 | dataset3/keyframe4 338 l 339 | dataset3/keyframe4 339 l 340 | dataset3/keyframe4 340 l 341 | dataset3/keyframe4 341 l 342 | dataset3/keyframe4 342 l 343 | dataset3/keyframe4 343 l 344 | dataset3/keyframe4 344 l 345 | dataset3/keyframe4 345 l 346 | dataset3/keyframe4 346 l 347 | dataset3/keyframe4 347 l 348 | dataset3/keyframe4 348 l 349 | dataset3/keyframe4 349 l 350 | dataset3/keyframe4 350 l 351 | dataset3/keyframe4 351 l 352 | dataset3/keyframe4 352 l 353 | dataset3/keyframe4 353 l 354 | dataset3/keyframe4 354 l 355 | dataset3/keyframe4 355 l 356 | dataset3/keyframe4 356 l 357 | dataset3/keyframe4 357 l 358 | dataset3/keyframe4 358 l 359 | dataset3/keyframe4 359 l 360 | dataset3/keyframe4 360 l 361 | dataset3/keyframe4 361 l 362 | dataset3/keyframe4 362 l 363 | dataset3/keyframe4 363 l 364 | dataset3/keyframe4 364 l 365 | dataset3/keyframe4 365 l 366 | dataset3/keyframe4 366 l 367 | dataset3/keyframe4 367 l 368 | dataset3/keyframe4 368 l 369 | dataset3/keyframe4 369 l 370 | dataset3/keyframe4 370 l 371 | dataset3/keyframe4 371 l 372 | dataset3/keyframe4 372 l 373 | dataset3/keyframe4 373 l 374 | dataset3/keyframe4 374 l 375 | dataset3/keyframe4 375 l 376 | dataset3/keyframe4 376 l 377 | dataset3/keyframe4 377 l 378 | dataset3/keyframe4 378 l 379 | dataset3/keyframe4 379 l 380 | dataset3/keyframe4 380 l 381 | dataset3/keyframe4 381 l 382 | dataset3/keyframe4 382 l 383 | dataset3/keyframe4 383 l 384 | dataset3/keyframe4 384 l 385 | dataset3/keyframe4 385 l 386 | dataset3/keyframe4 386 l 387 | dataset3/keyframe4 387 l 388 | dataset3/keyframe4 388 l 389 | dataset3/keyframe4 389 l 390 | dataset3/keyframe4 390 l 391 | dataset3/keyframe4 391 l 392 | dataset3/keyframe4 392 l 393 | dataset3/keyframe4 393 l 394 | dataset3/keyframe4 394 l 395 | dataset3/keyframe4 395 l 396 | dataset3/keyframe4 396 l 397 | dataset3/keyframe4 397 l 398 | dataset3/keyframe4 398 l 399 | dataset3/keyframe4 399 l 400 | dataset3/keyframe4 400 l 401 | dataset3/keyframe4 401 l 402 | dataset3/keyframe4 402 l 403 | dataset3/keyframe4 403 l 404 | dataset3/keyframe4 404 l 405 | dataset3/keyframe4 405 l 406 | dataset3/keyframe4 406 l 407 | dataset3/keyframe4 407 l 408 | dataset3/keyframe4 408 l 409 | dataset3/keyframe4 409 l 410 | dataset3/keyframe4 410 l 411 | dataset3/keyframe4 411 l 412 | dataset3/keyframe4 412 l 413 | dataset3/keyframe4 413 l 414 | dataset3/keyframe4 
414 l 415 | dataset3/keyframe4 415 l 416 | dataset3/keyframe4 416 l 417 | dataset3/keyframe4 417 l 418 | dataset3/keyframe4 418 l 419 | dataset3/keyframe4 419 l 420 | dataset3/keyframe4 420 l 421 | dataset3/keyframe4 421 l 422 | dataset3/keyframe4 422 l 423 | dataset3/keyframe4 423 l 424 | dataset3/keyframe4 424 l 425 | dataset3/keyframe4 425 l 426 | dataset3/keyframe4 426 l 427 | dataset3/keyframe4 427 l 428 | dataset3/keyframe4 428 l 429 | dataset3/keyframe4 429 l 430 | dataset3/keyframe4 430 l 431 | dataset3/keyframe4 431 l 432 | dataset3/keyframe4 432 l 433 | dataset3/keyframe4 433 l 434 | dataset3/keyframe4 434 l 435 | dataset3/keyframe4 435 l 436 | dataset3/keyframe4 436 l 437 | dataset3/keyframe4 437 l 438 | dataset3/keyframe4 438 l 439 | dataset3/keyframe4 439 l 440 | dataset3/keyframe4 440 l 441 | dataset3/keyframe4 441 l 442 | dataset3/keyframe4 442 l 443 | dataset3/keyframe4 443 l 444 | dataset3/keyframe4 444 l 445 | dataset3/keyframe4 445 l 446 | dataset3/keyframe4 446 l 447 | dataset3/keyframe4 447 l 448 | dataset3/keyframe4 448 l 449 | dataset3/keyframe4 449 l 450 | dataset3/keyframe4 450 l 451 | dataset3/keyframe4 451 l 452 | dataset3/keyframe4 452 l 453 | dataset3/keyframe4 453 l 454 | dataset3/keyframe4 454 l 455 | dataset3/keyframe4 455 l 456 | dataset3/keyframe4 456 l 457 | dataset3/keyframe4 457 l 458 | dataset3/keyframe4 458 l 459 | dataset3/keyframe4 459 l 460 | dataset3/keyframe4 460 l 461 | dataset3/keyframe4 461 l 462 | dataset3/keyframe4 462 l 463 | dataset3/keyframe4 463 l 464 | dataset3/keyframe4 464 l 465 | dataset3/keyframe4 465 l 466 | dataset3/keyframe4 466 l 467 | dataset3/keyframe4 467 l 468 | dataset3/keyframe4 468 l 469 | dataset3/keyframe4 469 l 470 | dataset3/keyframe4 470 l 471 | dataset3/keyframe4 471 l 472 | dataset3/keyframe4 472 l 473 | dataset3/keyframe4 473 l 474 | dataset3/keyframe4 474 l 475 | dataset3/keyframe4 475 l 476 | dataset3/keyframe4 476 l 477 | dataset3/keyframe4 477 l 478 | dataset3/keyframe4 478 l 479 | dataset3/keyframe4 479 l 480 | dataset3/keyframe4 480 l 481 | dataset3/keyframe4 481 l 482 | dataset3/keyframe4 482 l 483 | dataset3/keyframe4 483 l 484 | dataset3/keyframe4 484 l 485 | dataset3/keyframe4 485 l 486 | dataset3/keyframe4 486 l 487 | dataset3/keyframe4 487 l 488 | dataset3/keyframe4 488 l 489 | dataset3/keyframe4 489 l 490 | dataset3/keyframe4 490 l 491 | dataset3/keyframe4 491 l 492 | dataset3/keyframe4 492 l 493 | dataset3/keyframe4 493 l 494 | dataset3/keyframe4 494 l 495 | dataset3/keyframe4 495 l 496 | dataset3/keyframe4 496 l 497 | dataset3/keyframe4 497 l 498 | dataset3/keyframe4 498 l 499 | dataset3/keyframe4 499 l 500 | dataset3/keyframe4 500 l 501 | dataset3/keyframe4 501 l 502 | dataset3/keyframe4 502 l 503 | dataset3/keyframe4 503 l 504 | dataset3/keyframe4 504 l 505 | dataset3/keyframe4 505 l 506 | dataset3/keyframe4 506 l 507 | dataset3/keyframe4 507 l 508 | dataset3/keyframe4 508 l 509 | dataset3/keyframe4 509 l 510 | dataset3/keyframe4 510 l 511 | dataset3/keyframe4 511 l 512 | dataset3/keyframe4 512 l 513 | dataset3/keyframe4 513 l 514 | dataset3/keyframe4 514 l 515 | dataset3/keyframe4 515 l 516 | dataset3/keyframe4 516 l 517 | dataset3/keyframe4 517 l 518 | dataset3/keyframe4 518 l 519 | dataset3/keyframe4 519 l 520 | dataset3/keyframe4 520 l 521 | dataset3/keyframe4 521 l 522 | dataset3/keyframe4 522 l 523 | dataset3/keyframe4 523 l 524 | dataset3/keyframe4 524 l 525 | dataset3/keyframe4 525 l 526 | dataset3/keyframe4 526 l 527 | dataset3/keyframe4 527 l 528 | dataset3/keyframe4 528 l 529 | 
dataset3/keyframe4 529 l 530 | dataset3/keyframe4 530 l 531 | dataset3/keyframe4 531 l 532 | dataset3/keyframe4 532 l 533 | dataset3/keyframe4 533 l 534 | dataset3/keyframe4 534 l 535 | dataset3/keyframe4 535 l 536 | dataset3/keyframe4 536 l 537 | dataset3/keyframe4 537 l 538 | dataset3/keyframe4 538 l 539 | dataset3/keyframe4 539 l 540 | dataset3/keyframe4 540 l 541 | dataset3/keyframe4 541 l 542 | dataset3/keyframe4 542 l 543 | dataset3/keyframe4 543 l 544 | dataset3/keyframe4 544 l 545 | dataset3/keyframe4 545 l 546 | dataset3/keyframe4 546 l 547 | dataset3/keyframe4 547 l 548 | dataset3/keyframe4 548 l 549 | dataset3/keyframe4 549 l 550 | dataset3/keyframe4 550 l 551 | dataset3/keyframe4 551 l 552 | dataset3/keyframe4 552 l 553 | dataset3/keyframe4 553 l 554 | dataset3/keyframe4 554 l 555 | dataset3/keyframe4 555 l 556 | dataset3/keyframe4 556 l 557 | dataset3/keyframe4 557 l 558 | dataset3/keyframe4 558 l 559 | dataset3/keyframe4 559 l 560 | dataset3/keyframe4 560 l 561 | dataset3/keyframe4 561 l 562 | dataset3/keyframe4 562 l 563 | dataset3/keyframe4 563 l 564 | dataset3/keyframe4 564 l 565 | dataset3/keyframe4 565 l 566 | dataset3/keyframe4 566 l 567 | dataset3/keyframe4 567 l 568 | dataset3/keyframe4 568 l 569 | dataset3/keyframe4 569 l 570 | dataset3/keyframe4 570 l 571 | dataset3/keyframe4 571 l 572 | dataset3/keyframe4 572 l 573 | dataset3/keyframe4 573 l 574 | dataset3/keyframe4 574 l 575 | dataset3/keyframe4 575 l 576 | dataset3/keyframe4 576 l 577 | dataset3/keyframe4 577 l 578 | dataset3/keyframe4 578 l 579 | dataset3/keyframe4 579 l 580 | dataset3/keyframe4 580 l 581 | dataset3/keyframe4 581 l 582 | dataset3/keyframe4 582 l 583 | dataset3/keyframe4 583 l 584 | dataset3/keyframe4 584 l 585 | dataset3/keyframe4 585 l 586 | dataset3/keyframe4 586 l 587 | dataset3/keyframe4 587 l 588 | dataset3/keyframe4 588 l 589 | dataset3/keyframe4 589 l 590 | dataset3/keyframe4 590 l 591 | dataset3/keyframe4 591 l 592 | dataset3/keyframe4 592 l 593 | dataset3/keyframe4 593 l 594 | dataset3/keyframe4 594 l 595 | dataset3/keyframe4 595 l 596 | dataset3/keyframe4 596 l 597 | dataset3/keyframe4 597 l 598 | dataset3/keyframe4 598 l 599 | dataset3/keyframe4 599 l 600 | dataset3/keyframe4 600 l 601 | dataset3/keyframe4 601 l 602 | dataset3/keyframe4 602 l 603 | dataset3/keyframe4 603 l 604 | dataset3/keyframe4 604 l 605 | dataset3/keyframe4 605 l 606 | dataset3/keyframe4 606 l 607 | dataset3/keyframe4 607 l 608 | dataset3/keyframe4 608 l 609 | dataset3/keyframe4 609 l 610 | dataset3/keyframe4 610 l 611 | dataset3/keyframe4 611 l 612 | dataset3/keyframe4 612 l 613 | dataset3/keyframe4 613 l 614 | dataset3/keyframe4 614 l 615 | dataset3/keyframe4 615 l 616 | dataset3/keyframe4 616 l 617 | dataset3/keyframe4 617 l 618 | dataset3/keyframe4 618 l 619 | dataset3/keyframe4 619 l 620 | dataset3/keyframe4 620 l 621 | dataset3/keyframe4 621 l 622 | dataset3/keyframe4 622 l 623 | dataset3/keyframe4 623 l 624 | dataset3/keyframe4 624 l 625 | dataset3/keyframe4 625 l 626 | dataset3/keyframe4 626 l 627 | dataset3/keyframe4 627 l 628 | dataset3/keyframe4 628 l 629 | dataset3/keyframe4 629 l 630 | dataset3/keyframe4 630 l 631 | dataset3/keyframe4 631 l 632 | dataset3/keyframe4 632 l 633 | dataset3/keyframe4 633 l 634 | dataset3/keyframe4 634 l 635 | dataset3/keyframe4 635 l 636 | dataset3/keyframe4 636 l 637 | dataset3/keyframe4 637 l 638 | dataset3/keyframe4 638 l 639 | dataset3/keyframe4 639 l 640 | dataset3/keyframe4 640 l 641 | dataset3/keyframe4 641 l 642 | dataset3/keyframe4 642 l 643 | dataset3/keyframe4 
643 l 644 | dataset3/keyframe4 644 l 645 | dataset3/keyframe4 645 l 646 | dataset3/keyframe4 646 l 647 | dataset3/keyframe4 647 l 648 | dataset3/keyframe4 648 l 649 | dataset3/keyframe4 649 l 650 | dataset3/keyframe4 650 l 651 | dataset3/keyframe4 651 l 652 | dataset3/keyframe4 652 l 653 | dataset3/keyframe4 653 l 654 | dataset3/keyframe4 654 l 655 | dataset3/keyframe4 655 l 656 | dataset3/keyframe4 656 l 657 | dataset3/keyframe4 657 l 658 | dataset3/keyframe4 658 l 659 | dataset3/keyframe4 659 l 660 | dataset3/keyframe4 660 l 661 | dataset3/keyframe4 661 l 662 | dataset3/keyframe4 662 l 663 | dataset3/keyframe4 663 l 664 | dataset3/keyframe4 664 l 665 | dataset3/keyframe4 665 l 666 | dataset3/keyframe4 666 l 667 | dataset3/keyframe4 667 l 668 | dataset3/keyframe4 668 l 669 | dataset3/keyframe4 669 l 670 | dataset3/keyframe4 670 l 671 | dataset3/keyframe4 671 l 672 | dataset3/keyframe4 672 l 673 | dataset3/keyframe4 673 l 674 | dataset3/keyframe4 674 l 675 | dataset3/keyframe4 675 l 676 | dataset3/keyframe4 676 l 677 | dataset3/keyframe4 677 l 678 | dataset3/keyframe4 678 l 679 | dataset3/keyframe4 679 l 680 | dataset3/keyframe4 680 l 681 | dataset3/keyframe4 681 l 682 | dataset3/keyframe4 682 l 683 | dataset3/keyframe4 683 l 684 | dataset3/keyframe4 684 l 685 | dataset3/keyframe4 685 l 686 | dataset3/keyframe4 686 l 687 | dataset3/keyframe4 687 l 688 | dataset3/keyframe4 688 l 689 | dataset3/keyframe4 689 l 690 | dataset3/keyframe4 690 l 691 | dataset3/keyframe4 691 l 692 | dataset3/keyframe4 692 l 693 | dataset3/keyframe4 693 l 694 | dataset3/keyframe4 694 l 695 | dataset3/keyframe4 695 l 696 | dataset3/keyframe4 696 l 697 | dataset3/keyframe4 697 l 698 | dataset3/keyframe4 698 l 699 | dataset3/keyframe4 699 l 700 | dataset3/keyframe4 700 l 701 | dataset3/keyframe4 701 l 702 | dataset3/keyframe4 702 l 703 | dataset3/keyframe4 703 l 704 | dataset3/keyframe4 704 l 705 | dataset3/keyframe4 705 l 706 | dataset3/keyframe4 706 l 707 | dataset3/keyframe4 707 l 708 | dataset3/keyframe4 708 l 709 | dataset3/keyframe4 709 l 710 | dataset3/keyframe4 710 l 711 | dataset3/keyframe4 711 l 712 | dataset3/keyframe4 712 l 713 | dataset3/keyframe4 713 l 714 | dataset3/keyframe4 714 l 715 | dataset3/keyframe4 715 l 716 | dataset3/keyframe4 716 l 717 | dataset3/keyframe4 717 l 718 | dataset3/keyframe4 718 l 719 | dataset3/keyframe4 719 l 720 | dataset3/keyframe4 720 l 721 | dataset3/keyframe4 721 l 722 | dataset3/keyframe4 722 l 723 | dataset3/keyframe4 723 l 724 | dataset3/keyframe4 724 l 725 | dataset3/keyframe4 725 l 726 | dataset3/keyframe4 726 l 727 | dataset3/keyframe4 727 l 728 | dataset3/keyframe4 728 l 729 | dataset3/keyframe4 729 l 730 | dataset3/keyframe4 730 l 731 | dataset3/keyframe4 731 l 732 | dataset3/keyframe4 732 l 733 | dataset3/keyframe4 733 l 734 | dataset3/keyframe4 734 l 735 | dataset3/keyframe4 735 l 736 | dataset3/keyframe4 736 l 737 | dataset3/keyframe4 737 l 738 | dataset3/keyframe4 738 l 739 | dataset3/keyframe4 739 l 740 | dataset3/keyframe4 740 l 741 | dataset3/keyframe4 741 l 742 | dataset3/keyframe4 742 l 743 | dataset3/keyframe4 743 l 744 | dataset3/keyframe4 744 l 745 | dataset3/keyframe4 745 l 746 | dataset3/keyframe4 746 l 747 | dataset3/keyframe4 747 l 748 | dataset3/keyframe4 748 l 749 | dataset3/keyframe4 749 l 750 | dataset3/keyframe4 750 l 751 | dataset3/keyframe4 751 l 752 | dataset3/keyframe4 752 l 753 | dataset3/keyframe4 753 l 754 | dataset3/keyframe4 754 l 755 | dataset3/keyframe4 755 l 756 | dataset3/keyframe4 756 l 757 | dataset3/keyframe4 757 l 758 | 
dataset3/keyframe4 758 l 759 | dataset3/keyframe4 759 l 760 | dataset3/keyframe4 760 l 761 | dataset3/keyframe4 761 l 762 | dataset3/keyframe4 762 l 763 | dataset3/keyframe4 763 l 764 | dataset3/keyframe4 764 l 765 | dataset3/keyframe4 765 l 766 | dataset3/keyframe4 766 l 767 | dataset3/keyframe4 767 l 768 | dataset3/keyframe4 768 l 769 | dataset3/keyframe4 769 l 770 | dataset3/keyframe4 770 l 771 | dataset3/keyframe4 771 l 772 | dataset3/keyframe4 772 l 773 | dataset3/keyframe4 773 l 774 | dataset3/keyframe4 774 l 775 | dataset3/keyframe4 775 l 776 | dataset3/keyframe4 776 l 777 | dataset3/keyframe4 777 l 778 | dataset3/keyframe4 778 l 779 | dataset3/keyframe4 779 l 780 | dataset3/keyframe4 780 l 781 | dataset3/keyframe4 781 l 782 | dataset3/keyframe4 782 l 783 | dataset3/keyframe4 783 l 784 | dataset3/keyframe4 784 l 785 | dataset3/keyframe4 785 l 786 | dataset3/keyframe4 786 l 787 | dataset3/keyframe4 787 l 788 | dataset3/keyframe4 788 l 789 | dataset3/keyframe4 789 l 790 | dataset3/keyframe4 790 l 791 | dataset3/keyframe4 791 l 792 | dataset3/keyframe4 792 l 793 | dataset3/keyframe4 793 l 794 | dataset3/keyframe4 794 l 795 | dataset3/keyframe4 795 l 796 | dataset3/keyframe4 796 l 797 | dataset3/keyframe4 797 l 798 | dataset3/keyframe4 798 l 799 | dataset3/keyframe4 799 l 800 | dataset3/keyframe4 800 l 801 | dataset3/keyframe4 801 l 802 | dataset3/keyframe4 802 l 803 | dataset3/keyframe4 803 l 804 | dataset3/keyframe4 804 l 805 | dataset3/keyframe4 805 l 806 | dataset3/keyframe4 806 l 807 | dataset3/keyframe4 807 l 808 | dataset3/keyframe4 808 l 809 | dataset3/keyframe4 809 l 810 | dataset3/keyframe4 810 l 811 | dataset3/keyframe4 811 l 812 | dataset3/keyframe4 812 l 813 | dataset3/keyframe4 813 l 814 | dataset3/keyframe4 814 l 815 | dataset3/keyframe4 815 l 816 | dataset3/keyframe4 816 l 817 | dataset3/keyframe4 817 l 818 | dataset3/keyframe4 818 l 819 | dataset3/keyframe4 819 l 820 | dataset3/keyframe4 820 l 821 | dataset3/keyframe4 821 l 822 | dataset3/keyframe4 822 l 823 | dataset3/keyframe4 823 l 824 | dataset3/keyframe4 824 l 825 | dataset3/keyframe4 825 l 826 | dataset3/keyframe4 826 l 827 | dataset3/keyframe4 827 l 828 | dataset3/keyframe4 828 l 829 | dataset3/keyframe4 829 l 830 | dataset3/keyframe4 830 l 831 | dataset3/keyframe4 831 l 832 | dataset3/keyframe4 832 l 833 | dataset3/keyframe4 833 l -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | 5 | import torch 6 | import math 7 | import cv2 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | from warnings import warn 12 | 13 | 14 | def disp_to_depth(disp, min_depth, max_depth): 15 | """Convert network's sigmoid output into depth prediction 16 | The formula for this conversion is given in the 'additional considerations' 17 | section of the paper. 
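Example (illustrative values only; the 0.1 / 100.0 depth bounds below are placeholders, not this repository's training settings):
    scaled_disp, depth = disp_to_depth(torch.tensor([0.5]), 0.1, 100.0)
    # min_disp = 1 / 100.0 = 0.01, max_disp = 1 / 0.1 = 10
    # scaled_disp = 0.01 + (10 - 0.01) * 0.5 = 5.005, so depth = 1 / 5.005 ≈ 0.20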
18 | """ 19 | min_disp = 1 / max_depth 20 | max_disp = 1 / min_depth 21 | scaled_disp = min_disp + (max_disp - min_disp) * disp 22 | depth = 1 / scaled_disp 23 | return scaled_disp, depth 24 | 25 | 26 | def transformation_from_parameters(axisangle, translation, invert=False): 27 | """Convert the network's (axisangle, translation) output into a 4x4 matrix 28 | """ 29 | R = rot_from_axisangle(axisangle) 30 | t = translation.clone() 31 | 32 | if invert: 33 | R = R.transpose(1, 2) 34 | t *= -1 35 | 36 | T = get_translation_matrix(t) 37 | 38 | if invert: 39 | M = torch.matmul(R, T) 40 | else: 41 | M = torch.matmul(T, R) 42 | 43 | return M 44 | 45 | 46 | def get_translation_matrix(translation_vector): 47 | """Convert a translation vector into a 4x4 transformation matrix 48 | """ 49 | T = torch.zeros(translation_vector.shape[0], 4, 4).to(device=translation_vector.device) 50 | 51 | t = translation_vector.contiguous().view(-1, 3, 1) 52 | 53 | T[:, 0, 0] = 1 54 | T[:, 1, 1] = 1 55 | T[:, 2, 2] = 1 56 | T[:, 3, 3] = 1 57 | T[:, :3, 3, None] = t 58 | 59 | return T 60 | 61 | 62 | def rot_from_axisangle(vec): 63 | """Convert an axisangle rotation into a 4x4 transformation matrix 64 | (adapted from https://github.com/Wallacoloo/printipi) 65 | Input 'vec' has to be Bx1x3 66 | """ 67 | angle = torch.norm(vec, 2, 2, True) 68 | axis = vec / (angle + 1e-7) 69 | 70 | ca = torch.cos(angle) 71 | sa = torch.sin(angle) 72 | C = 1 - ca 73 | 74 | x = axis[..., 0].unsqueeze(1) 75 | y = axis[..., 1].unsqueeze(1) 76 | z = axis[..., 2].unsqueeze(1) 77 | 78 | xs = x * sa 79 | ys = y * sa 80 | zs = z * sa 81 | xC = x * C 82 | yC = y * C 83 | zC = z * C 84 | xyC = x * yC 85 | yzC = y * zC 86 | zxC = z * xC 87 | 88 | rot = torch.zeros((vec.shape[0], 4, 4)).to(device=vec.device) 89 | 90 | rot[:, 0, 0] = torch.squeeze(x * xC + ca) 91 | rot[:, 0, 1] = torch.squeeze(xyC - zs) 92 | rot[:, 0, 2] = torch.squeeze(zxC + ys) 93 | rot[:, 1, 0] = torch.squeeze(xyC + zs) 94 | rot[:, 1, 1] = torch.squeeze(y * yC + ca) 95 | rot[:, 1, 2] = torch.squeeze(yzC - xs) 96 | rot[:, 2, 0] = torch.squeeze(zxC - ys) 97 | rot[:, 2, 1] = torch.squeeze(yzC + xs) 98 | rot[:, 2, 2] = torch.squeeze(z * zC + ca) 99 | rot[:, 3, 3] = 1 100 | 101 | return rot 102 | 103 | 104 | class ConvBlock(nn.Module): 105 | """Layer to perform a convolution followed by ELU 106 | """ 107 | def __init__(self, in_channels, out_channels): 108 | super(ConvBlock, self).__init__() 109 | 110 | self.conv = Conv3x3(in_channels, out_channels) 111 | self.nonlin = nn.ELU(inplace=True) 112 | 113 | def forward(self, x): 114 | out = self.conv(x) 115 | out = self.nonlin(out) 116 | return out 117 | 118 | 119 | class Conv3x3(nn.Module): 120 | """Layer to pad and convolve input 121 | """ 122 | def __init__(self, in_channels, out_channels, use_refl=True): 123 | super(Conv3x3, self).__init__() 124 | 125 | if use_refl: 126 | self.pad = nn.ReflectionPad2d(1) 127 | else: 128 | self.pad = nn.ZeroPad2d(1) 129 | self.conv = nn.Conv2d(int(in_channels), int(out_channels), 3) 130 | 131 | def forward(self, x): 132 | out = self.pad(x) 133 | out = self.conv(out) 134 | return out 135 | 136 | 137 | class BackprojectDepth(nn.Module): 138 | """Layer to transform a depth image into a point cloud 139 | """ 140 | def __init__(self, batch_size, height, width): 141 | super(BackprojectDepth, self).__init__() 142 | 143 | self.batch_size = batch_size 144 | self.height = height 145 | self.width = width 146 | 147 | meshgrid = np.meshgrid(range(self.width), range(self.height), indexing='xy') 148 | self.id_coords = 
np.stack(meshgrid, axis=0).astype(np.float32) 149 | self.id_coords = nn.Parameter(torch.from_numpy(self.id_coords), 150 | requires_grad=False) 151 | 152 | self.ones = nn.Parameter(torch.ones(self.batch_size, 1, self.height * self.width), 153 | requires_grad=False) 154 | 155 | self.pix_coords = torch.unsqueeze(torch.stack( 156 | [self.id_coords[0].view(-1), self.id_coords[1].view(-1)], 0), 0) 157 | self.pix_coords = self.pix_coords.repeat(batch_size, 1, 1) 158 | self.pix_coords = nn.Parameter(torch.cat([self.pix_coords, self.ones], 1), 159 | requires_grad=False) 160 | 161 | def forward(self, depth, inv_K): 162 | cam_points = torch.matmul(inv_K[:, :3, :3], self.pix_coords) 163 | cam_points = depth.view(self.batch_size, 1, -1) * cam_points 164 | cam_points = torch.cat([cam_points, self.ones], 1) 165 | 166 | return cam_points 167 | 168 | 169 | class Project3D(nn.Module): 170 | """Layer which projects 3D points into a camera with intrinsics K and at position T 171 | """ 172 | def __init__(self, batch_size, height, width, eps=1e-7): 173 | super(Project3D, self).__init__() 174 | 175 | self.batch_size = batch_size 176 | self.height = height 177 | self.width = width 178 | self.eps = eps 179 | 180 | def forward(self, points, K, T): 181 | P = torch.matmul(K, T)[:, :3, :] 182 | 183 | cam_points = torch.matmul(P, points) 184 | 185 | pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + self.eps) 186 | pix_coords = pix_coords.view(self.batch_size, 2, self.height, self.width) 187 | pix_coords = pix_coords.permute(0, 2, 3, 1) 188 | pix_coords[..., 0] /= self.width - 1 189 | pix_coords[..., 1] /= self.height - 1 190 | pix_coords = (pix_coords - 0.5) * 2 191 | 192 | return pix_coords 193 | 194 | 195 | class Project3D_Raw(nn.Module): 196 | """Layer which projects 3D points into a camera with intrinsics K and at position T 197 | """ 198 | def __init__(self, batch_size, height, width, eps=1e-7): 199 | super(Project3D_Raw, self).__init__() 200 | 201 | self.batch_size = batch_size 202 | self.height = height 203 | self.width = width 204 | self.eps = eps 205 | 206 | def forward(self, points, K, T): 207 | 208 | P = torch.matmul(K, T)[:, :3, :] 209 | 210 | cam_points = torch.matmul(P, points) 211 | 212 | raw_pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + self.eps) 213 | raw_pix_coords = raw_pix_coords.view(self.batch_size, 2, self.height, self.width) 214 | raw_pix_coords = raw_pix_coords.permute(0, 2, 3, 1) 215 | 216 | return raw_pix_coords 217 | 218 | 219 | def upsample(x): 220 | """Upsample input tensor by a factor of 2 221 | """ 222 | return F.interpolate(x, scale_factor=2, mode="nearest") 223 | 224 | 225 | def get_smooth_loss(disp, img): 226 | 227 | """Computes the smoothness loss for a disparity image 228 | The color image is used for edge-aware smoothness 229 | """ 230 | grad_disp_x = torch.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) 231 | grad_disp_y = torch.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) 232 | 233 | grad_img_x = torch.mean(torch.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), 1, keepdim=True) 234 | grad_img_y = torch.mean(torch.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), 1, keepdim=True) 235 | 236 | grad_disp_x *= torch.exp(-grad_img_x) 237 | grad_disp_y *= torch.exp(-grad_img_y) 238 | 239 | return grad_disp_x.mean() + grad_disp_y.mean() 240 | 241 | 242 | def get_smooth_bright(transform, target, pred, occu_mask): 243 | 244 | """Computes the smoothness loss for an appearance flow 245 | """ 246 | grad_transform_x = torch.mean(torch.abs(transform[:, :, :, :-1] -
transform[:, :, :, 1:]), 1, keepdim=True) 247 | grad_transform_y = torch.mean(torch.abs(transform[:, :, :-1, :] - transform[:, :, 1:, :]), 1, keepdim=True) 248 | 249 | residue = (target - pred) 250 | 251 | grad_residue_x = torch.mean(torch.abs(residue[:, :, :, :-1] - residue[:, :, :, 1:]), 1, keepdim=True) 252 | grad_residue_y = torch.mean(torch.abs(residue[:, :, :-1, :] - residue[:, :, 1:, :]), 1, keepdim=True) 253 | 254 | mask_x = occu_mask[:, :, :, :-1] 255 | mask_y = occu_mask[:, :, :-1, :] 256 | 257 | # grad_residue_x = grad_residue_x * mask_x / (mask_x.mean() + 1e-7) 258 | # grad_residue_y = grad_residue_y * mask_y / (mask_y.mean() + 1e-7) 259 | 260 | grad_transform_x *= torch.exp(-grad_residue_x) 261 | grad_transform_y *= torch.exp(-grad_residue_y) 262 | 263 | grad_transform_x *= mask_x 264 | grad_transform_y *= mask_y 265 | 266 | return (grad_transform_x.sum() / mask_x.sum() + grad_transform_y.sum() / mask_y.sum()) 267 | 268 | 269 | def get_smooth_registration(position): 270 | 271 | """Computes the smoothness loss for an optical flow 272 | """ 273 | grad_disp_x = torch.abs(position[:, :, :, :-1] - position[:, :, :, 1:]) 274 | grad_disp_y = torch.abs(position[:, :, :-1, :] - position[:, :, 1:, :]) 275 | 276 | return grad_disp_x.mean() + grad_disp_y.mean() 277 | 278 | 279 | class SSIM(nn.Module): 280 | """Layer to compute the SSIM loss between a pair of images 281 | """ 282 | def __init__(self): 283 | super(SSIM, self).__init__() 284 | self.mu_x_pool = nn.AvgPool2d(3, 1) 285 | self.mu_y_pool = nn.AvgPool2d(3, 1) 286 | self.sig_x_pool = nn.AvgPool2d(3, 1) 287 | self.sig_y_pool = nn.AvgPool2d(3, 1) 288 | self.sig_xy_pool = nn.AvgPool2d(3, 1) 289 | 290 | self.refl = nn.ReflectionPad2d(1) 291 | 292 | self.C1 = 0.01 ** 2 293 | self.C2 = 0.03 ** 2 294 | 295 | def forward(self, x, y): 296 | x = self.refl(x) 297 | y = self.refl(y) 298 | 299 | mu_x = self.mu_x_pool(x) 300 | mu_y = self.mu_y_pool(y) 301 | 302 | sigma_x = self.sig_x_pool(x ** 2) - mu_x ** 2 303 | sigma_y = self.sig_y_pool(y ** 2) - mu_y ** 2 304 | sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y 305 | 306 | SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) 307 | SSIM_d = (mu_x ** 2 + mu_y ** 2 + self.C1) * (sigma_x + sigma_y + self.C2) 308 | 309 | return torch.clamp((1 - SSIM_n / SSIM_d) / 2, 0, 1) 310 | 311 | 312 | def ncc_loss(I, J, win=None): 313 | """ 314 | Calculate the normalized local cross-correlation between I and J 315 | assumes I, J are sized [batch_size, *vol_shape, nb_feats] 316 | """ 317 | 318 | ndims = len(list(I.size())) - 2 319 | assert ndims in [1, 2, 3], "volumes should be 1 to 3 dimensions.
found: %d" % ndims 320 | 321 | if win is None: 322 | win = [5] * ndims 323 | 324 | sum_filt = torch.ones([1, 1, *win]).to(I.device)  # keep the box filter on the same device as the inputs rather than hard-coding "cuda" 325 | 326 | pad_no = math.floor(win[0] / 2) 327 | 328 | if ndims == 1: 329 | stride = (1) 330 | padding = (pad_no) 331 | elif ndims == 2: 332 | stride = (1, 1) 333 | padding = (pad_no, pad_no) 334 | else: 335 | stride = (1, 1, 1) 336 | padding = (pad_no, pad_no, pad_no) 337 | 338 | I_var, J_var, cross = compute_local_sums(I, J, sum_filt, stride, padding, win) 339 | 340 | cc = cross * cross / (I_var * J_var + 1e-5) 341 | 342 | # return -1 * torch.mean(cc) 343 | return -1 * cc 344 | 345 | 346 | def compute_local_sums(I, J, filt, stride, padding, win): 347 | 348 | I2 = I * I 349 | J2 = J * J 350 | IJ = I * J 351 | 352 | I_sum = F.conv2d(I, filt, stride=stride, padding=padding) 353 | J_sum = F.conv2d(J, filt, stride=stride, padding=padding) 354 | I2_sum = F.conv2d(I2, filt, stride=stride, padding=padding) 355 | J2_sum = F.conv2d(J2, filt, stride=stride, padding=padding) 356 | IJ_sum = F.conv2d(IJ, filt, stride=stride, padding=padding) 357 | 358 | win_size = np.prod(win) 359 | u_I = I_sum / win_size 360 | u_J = J_sum / win_size 361 | 362 | cross = IJ_sum - u_J * I_sum - u_I * J_sum + u_I * u_J * win_size 363 | I_var = I2_sum - 2 * u_I * I_sum + u_I * u_I * win_size 364 | J_var = J2_sum - 2 * u_J * J_sum + u_J * u_J * win_size 365 | 366 | return I_var, J_var, cross 367 | 368 | 369 | def compute_depth_errors(gt, pred): 370 | """Computation of error metrics between predicted and ground truth depths 371 | """ 372 | thresh = torch.max((gt / pred), (pred / gt)) 373 | a1 = (thresh < 1.25).float().mean() 374 | a2 = (thresh < 1.25 ** 2).float().mean() 375 | a3 = (thresh < 1.25 ** 3).float().mean() 376 | 377 | rmse = (gt - pred) ** 2 378 | rmse = torch.sqrt(rmse.mean()) 379 | 380 | rmse_log = (torch.log(gt) - torch.log(pred)) ** 2 381 | rmse_log = torch.sqrt(rmse_log.mean()) 382 | 383 | abs_rel = torch.mean(torch.abs(gt - pred) / gt) 384 | 385 | sq_rel = torch.mean((gt - pred) ** 2 / gt) 386 | 387 | return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 388 | 389 | 390 | class SpatialTransformer(nn.Module): 391 | 392 | def __init__(self, size, mode='bilinear'): 393 | """ 394 | Instantiate the block 395 | :param size: size of input to the spatial transformer block 396 | :param mode: method of interpolation for grid_sampler 397 | """ 398 | super(SpatialTransformer, self).__init__() 399 | 400 | # Create sampling grid 401 | vectors = [torch.arange(0, s) for s in size] 402 | grids = torch.meshgrid(vectors) 403 | grid = torch.stack(grids) # y, x, z 404 | grid = torch.unsqueeze(grid, 0) # add batch 405 | grid = grid.type(torch.FloatTensor) 406 | self.register_buffer('grid', grid) 407 | self.mode = mode 408 | 409 | def forward(self, src, flow): 410 | """ 411 | Push the src and flow through the spatial transform block 412 | :param src: the source image 413 | :param flow: the output from the U-Net 414 | """ 415 | new_locs = self.grid + flow 416 | shape = flow.shape[2:] 417 | 418 | # Need to normalize grid values to [-1, 1] for resampler 419 | for i in range(len(shape)): 420 | new_locs[:, i, ...]
= 2*(new_locs[:, i, ...]/(shape[i]-1) - 0.5) 421 | 422 | if len(shape) == 2: 423 | new_locs = new_locs.permute(0, 2, 3, 1) 424 | new_locs = new_locs[..., [1, 0]] 425 | elif len(shape) == 3: 426 | new_locs = new_locs.permute(0, 2, 3, 4, 1) 427 | new_locs = new_locs[..., [2, 1, 0]] 428 | 429 | return F.grid_sample(src, new_locs, mode=self.mode, padding_mode="border") 430 | 431 | 432 | class optical_flow(nn.Module): 433 | 434 | def __init__(self, size, batch_size, height, width, eps=1e-7): 435 | super(optical_flow, self).__init__() 436 | 437 | # Create sampling grid 438 | vectors = [torch.arange(0, s) for s in size] 439 | grids = torch.meshgrid(vectors) 440 | grid = torch.stack(grids) # y, x, z 441 | grid = torch.unsqueeze(grid, 0) # add batch 442 | grid = grid.type(torch.FloatTensor) 443 | self.register_buffer('grid', grid) 444 | 445 | self.batch_size = batch_size 446 | self.height = height 447 | self.width = width 448 | self.eps = eps 449 | 450 | def forward(self, points, K, T): 451 | 452 | P = torch.matmul(K, T)[:, :3, :] 453 | cam_points = torch.matmul(P, points) 454 | pix_coords = cam_points[:, :2, :] / (cam_points[:, 2, :].unsqueeze(1) + self.eps) 455 | pix_coords = pix_coords.view(self.batch_size, 2, self.height, self.width) 456 | optical_flow = pix_coords[:, [1,0], ...] - self.grid 457 | 458 | return optical_flow 459 | 460 | 461 | class get_occu_mask_backward(nn.Module): 462 | 463 | def __init__(self, size): 464 | super(get_occu_mask_backward, self).__init__() 465 | 466 | # Create sampling grid 467 | vectors = [torch.arange(0, s) for s in size] 468 | grids = torch.meshgrid(vectors) 469 | grid = torch.stack(grids) # y, x, z 470 | grid = torch.unsqueeze(grid, 0) # add batch 471 | grid = grid.type(torch.FloatTensor) 472 | self.register_buffer('grid', grid) 473 | 474 | def forward(self, flow, th=0.95): 475 | 476 | new_locs = self.grid + flow 477 | new_locs = new_locs[:, [1,0], ...] 478 | corr_map = get_corresponding_map(new_locs) 479 | occu_map = corr_map 480 | occu_mask = (occu_map > th).float() 481 | 482 | return occu_mask, occu_map 483 | 484 | 485 | class get_occu_mask_bidirection(nn.Module): 486 | 487 | def __init__(self, size, mode='bilinear'): 488 | super(get_occu_mask_bidirection, self).__init__() 489 | 490 | # Create sampling grid 491 | vectors = [torch.arange(0, s) for s in size] 492 | grids = torch.meshgrid(vectors) 493 | grid = torch.stack(grids) # y, x, z 494 | grid = torch.unsqueeze(grid, 0) # add batch 495 | grid = grid.type(torch.FloatTensor) 496 | self.register_buffer('grid', grid) 497 | self.mode = mode 498 | 499 | def forward(self, flow12, flow21, scale=0.01, bias=0.5): 500 | 501 | new_locs = self.grid + flow12 502 | shape = flow12.shape[2:] 503 | 504 | # Need to normalize grid values to [-1, 1] for resampler 505 | for i in range(len(shape)): 506 | new_locs[:, i, ...] 
= 2*(new_locs[:, i, ...]/(shape[i]-1) - 0.5) 507 | 508 | if len(shape) == 2: 509 | new_locs = new_locs.permute(0, 2, 3, 1) 510 | new_locs = new_locs[..., [1, 0]] 511 | elif len(shape) == 3: 512 | new_locs = new_locs.permute(0, 2, 3, 4, 1) 513 | new_locs = new_locs[..., [2, 1, 0]] 514 | 515 | flow21_warped = F.grid_sample(flow21, new_locs, mode=self.mode, padding_mode="border") 516 | flow12_diff = torch.abs(flow12 + flow21_warped) 517 | # mag = (flow12 * flow12).sum(1, keepdim=True) + \ 518 | # (flow21_warped * flow21_warped).sum(1, keepdim=True) 519 | # occ_thresh = scale * mag + bias 520 | # occ_mask = (flow12_diff * flow12_diff).sum(1, keepdim=True) < occ_thresh 521 | 522 | return flow12_diff 523 | 524 | 525 | class match(nn.Module): 526 | 527 | def __init__(self, size, batch_size): 528 | super(match, self).__init__() 529 | 530 | # Create sampling grid 531 | vectors = [torch.arange(0, s) for s in size] 532 | grids = torch.meshgrid(vectors) 533 | grid = torch.stack(grids) # y, x, z 534 | grid = torch.unsqueeze(grid, 0) # add batch 535 | grid = grid.type(torch.FloatTensor) 536 | self.register_buffer('grid', grid) 537 | self.batch_size = batch_size 538 | 539 | def forward(self, flow): 540 | new_locs = self.grid + flow 541 | mach = torch.cat((self.grid[:, [1,0], ...].repeat(self.batch_size,1,1,1), new_locs[:, [1,0], ...]), 1) 542 | 543 | return mach 544 | 545 | 546 | def get_texu_mask(non_rigid, rigid): 547 | 548 | diff_flow = (non_rigid - rigid).pow(2).mean(1, True) 549 | sum_flow = 0.01 * (non_rigid.pow(2).mean(1, True) + rigid.pow(2).mean(1, True)) + 0.5 550 | texu_mask = (diff_flow < sum_flow).float() 551 | 552 | return texu_mask 553 | 554 | 555 | def get_corresponding_map(data): 556 | """ 557 | :param data: unnormalized coordinates Bx2xHxW 558 | :return: Bx1xHxW 559 | """ 560 | B, _, H, W = data.size() 561 | 562 | # x = data[:, 0, :, :].view(B, -1).clamp(0, W - 1) # BxN (N=H*W) 563 | # y = data[:, 1, :, :].view(B, -1).clamp(0, H - 1) 564 | 565 | x = data[:, 0, :, :].view(B, -1) # BxN (N=H*W) 566 | y = data[:, 1, :, :].view(B, -1) 567 | 568 | # invalid = (x < 0) | (x > W - 1) | (y < 0) | (y > H - 1) # BxN 569 | # invalid = invalid.repeat([1, 4]) 570 | 571 | x1 = torch.floor(x) 572 | x_floor = x1.clamp(0, W - 1) 573 | y1 = torch.floor(y) 574 | y_floor = y1.clamp(0, H - 1) 575 | x0 = x1 + 1 576 | x_ceil = x0.clamp(0, W - 1) 577 | y0 = y1 + 1 578 | y_ceil = y0.clamp(0, H - 1) 579 | 580 | x_ceil_out = x0 != x_ceil 581 | y_ceil_out = y0 != y_ceil 582 | x_floor_out = x1 != x_floor 583 | y_floor_out = y1 != y_floor 584 | invalid = torch.cat([x_ceil_out | y_ceil_out, 585 | x_ceil_out | y_floor_out, 586 | x_floor_out | y_ceil_out, 587 | x_floor_out | y_floor_out], dim=1) 588 | 589 | # encode coordinates, since the scatter function can only index along one axis 590 | corresponding_map = torch.zeros(B, H * W).type_as(data) 591 | indices = torch.cat([x_ceil + y_ceil * W, 592 | x_ceil + y_floor * W, 593 | x_floor + y_ceil * W, 594 | x_floor + y_floor * W], 1).long() # BxN (N=4*H*W) 595 | values = torch.cat([(1 - torch.abs(x - x_ceil)) * (1 - torch.abs(y - y_ceil)), 596 | (1 - torch.abs(x - x_ceil)) * (1 - torch.abs(y - y_floor)), 597 | (1 - torch.abs(x - x_floor)) * (1 - torch.abs(y - y_ceil)), 598 | (1 - torch.abs(x - x_floor)) * (1 - torch.abs(y - y_floor))], 599 | 1) 600 | # values = torch.ones_like(values) 601 | 602 | values[invalid] = 0 603 | 604 | corresponding_map.scatter_add_(1, indices, values) 605 | # decode coordinates 606 | corresponding_map = corresponding_map.view(B, H, W) 607 | 608 | 
return corresponding_map.unsqueeze(1) 609 | 610 | 611 | class BerHuLoss(nn.Module): 612 | def __init__(self): 613 | super(BerHuLoss, self).__init__() 614 | 615 | def forward(self, pred, target): 616 | 617 | assert pred.dim() == target.dim(), "inconsistent dimensions" 618 | 619 | diff = pred - target 620 | abs_diff = diff.abs() 621 | c = 0.2 * abs_diff.max() 622 | mask = (abs_diff <= c).float() 623 | l2_loss = (diff ** 2 + c ** 2) / (2 * c) 624 | 625 | loss = (mask * abs_diff + (1 - mask) * l2_loss).mean() 626 | 627 | return loss 628 | 629 | 630 | class reduced_ransac(nn.Module): 631 | def __init__(self, check_num, dataset): 632 | super(reduced_ransac, self).__init__() 633 | self.check_num = check_num 634 | # self.thres = thres 635 | self.dataset = dataset 636 | 637 | def robust_rand_sample(self, match, mask, num, robust=True): 638 | # match: [b, 4, -1] mask: [b, 1, -1] 639 | b, n = match.shape[0], match.shape[2] 640 | nonzeros_num = torch.min(torch.sum(mask > 0, dim=-1)) # [] 641 | if nonzeros_num.detach().cpu().numpy() == n: 642 | rand_int = torch.randint(0, n, [num]) 643 | select_match = match[:,:,rand_int] 644 | else: 645 | # If there is zero score in match, sample the non-zero matches. 646 | select_idxs = [] 647 | if robust: 648 | num = np.minimum(nonzeros_num.detach().cpu().numpy(), num) 649 | for i in range(b): 650 | nonzero_idx = torch.nonzero(mask[i,0,:]) # [nonzero_num,1] 651 | rand_int = torch.randint(0, nonzero_idx.shape[0], [int(num)]) 652 | select_idx = nonzero_idx[rand_int, :] # [num, 1] 653 | select_idxs.append(select_idx) 654 | select_idxs = torch.stack(select_idxs, 0) # [b,num,1] 655 | select_match = torch.gather(match.transpose(1,2), index=select_idxs.repeat(1,1,4), dim=1).transpose(1,2) # [b, 4, num] 656 | return select_match, num 657 | 658 | def top_ratio_sample(self, match, mask, ratio): 659 | # match: [b, 4, -1] mask: [b, 1, -1] 660 | b, total_num = match.shape[0], match.shape[-1] 661 | scores, indices = torch.topk(mask, int(ratio*total_num), dim=-1) # [B, 1, ratio*tnum] 662 | select_match = torch.gather(match.transpose(1,2), index=indices.squeeze(1).unsqueeze(-1).repeat(1,1,4), dim=1).transpose(1,2) # [b, 4, ratio*tnum] 663 | return select_match, scores 664 | 665 | def forward(self, match, mask, visualizer=None): 666 | # match: [B, 4, H, W] mask: [B, 1, H, W] 667 | b, h, w = match.shape[0], match.shape[2], match.shape[3] 668 | match = match.view([b, 4, -1]).contiguous() 669 | mask = mask.view([b, 1, -1]).contiguous() 670 | 671 | # Sample matches for RANSAC 8-point and best F selection 672 | top_ratio_match, top_ratio_mask = self.top_ratio_sample(match, mask, ratio=0.20) # [b, 4, ratio*H*W] 673 | check_match, check_num = self.robust_rand_sample(top_ratio_match, top_ratio_mask, num=self.check_num) # [b, 4, check_num] 674 | check_match = check_match.contiguous() 675 | 676 | cv_f = [] 677 | for i in range(b): 678 | if self.dataset == 'nyuv2': 679 | f, m = cv2.findFundamentalMat(check_match[i,:2,:].transpose(0,1).detach().cpu().numpy(), check_match[i,2:,:].transpose(0,1).detach().cpu().numpy(), cv2.FM_LMEDS, 0.99) 680 | else: 681 | f, m = cv2.findFundamentalMat(check_match[i,:2,:].transpose(0,1).detach().cpu().numpy(), check_match[i,2:,:].transpose(0,1).detach().cpu().numpy(), cv2.FM_RANSAC, 0.1, 0.99) 682 | cv_f.append(f) 683 | cv_f = np.stack(cv_f, axis=0) 684 | cv_f = torch.from_numpy(cv_f).float().to(match.get_device()) 685 | 686 | return cv_f 687 | 688 | -------------------------------------------------------------------------------- /trainer_end_to_end.py: 
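The trainer listed next wires the layers.py primitives above into the usual self-supervised warping pipeline (per-scale BackprojectDepth and Project3D instances are created at the end of its __init__). The following is a minimal, self-contained sketch of that composition, not code from the repository: the batch size, image size, intrinsics, identity pose and depth bounds are illustrative placeholders, and only disp_to_depth, BackprojectDepth and Project3D are the real helpers defined in layers.py (the import assumes the script is run from the repository root).

import torch
import torch.nn.functional as F

from layers import disp_to_depth, BackprojectDepth, Project3D

B, H, W = 2, 192, 256                          # illustrative batch size and image size (multiples of 32)
disp = torch.rand(B, 1, H, W)                  # stand-in for the depth decoder's sigmoid output
_, depth = disp_to_depth(disp, 0.1, 150.0)     # map the sigmoid output to a bounded depth range (example bounds)

K = torch.eye(4).unsqueeze(0).repeat(B, 1, 1)  # toy 4x4 camera intrinsics
K[:, 0, 0] = float(W)
K[:, 1, 1] = float(H)
K[:, 0, 2] = W / 2.0
K[:, 1, 2] = H / 2.0
inv_K = torch.inverse(K)
T = torch.eye(4).unsqueeze(0).repeat(B, 1, 1)  # relative camera pose (identity here for simplicity)

backproject = BackprojectDepth(B, H, W)        # depth map -> homogeneous 3D points
project = Project3D(B, H, W)                   # 3D points -> sampling grid normalised to [-1, 1]

cam_points = backproject(depth, inv_K)
pix_coords = project(cam_points, K, T)

source = torch.rand(B, 3, H, W)                # stand-in source frame to warp into the target view
warped = F.grid_sample(source, pix_coords, padding_mode="border")
print(warped.shape)                            # torch.Size([2, 3, 192, 256])

In the trainer itself the disparity comes from the DARES depth model, the relative pose from the pose branch, and the warped frames are scored against the target with the multi-scale SSIM measure instantiated further down in __init__.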
-------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import time 4 | import json 5 | import datasets 6 | import networks 7 | from networks import DARES 8 | import numpy as np 9 | import torch.optim as optim 10 | import torch.nn as nn 11 | 12 | from utils import * 13 | from layers import * 14 | from torch.utils.data import DataLoader 15 | from tensorboardX import SummaryWriter 16 | from torchmetrics.image import MultiScaleStructuralSimilarityIndexMeasure 17 | import os 18 | 19 | script_path = os.path.abspath(__file__) 20 | 21 | root_dir = os.path.dirname(script_path) 22 | 23 | 24 | class Trainer: 25 | def __init__(self, options): 26 | self.opt = options 27 | self.log_path = os.path.join(self.opt.log_dir, self.opt.model_name) 28 | 29 | # checking height and width are multiples of 32 30 | assert self.opt.height % 32 == 0, "'height' must be a multiple of 32" 31 | assert self.opt.width % 32 == 0, "'width' must be a multiple of 32" 32 | 33 | self.models = {} 34 | self.parameters_to_train = [] 35 | self.parameters_to_train_0 = [] 36 | 37 | self.device = torch.device("cpu" if self.opt.no_cuda else "cuda") 38 | 39 | self.num_scales = len(self.opt.scales) # 4 40 | self.num_input_frames = len(self.opt.frame_ids) # 3 41 | self.num_pose_frames = 2 if self.opt.pose_model_input == "pairs" else self.num_input_frames # 2 42 | 43 | assert self.opt.frame_ids[0] == 0, "frame_ids must start with 0" 44 | 45 | self.use_pose_net = not (self.opt.use_stereo and self.opt.frame_ids == [0]) 46 | 47 | if self.opt.use_stereo: 48 | self.opt.frame_ids.append("s") 49 | 50 | self.models["depth_model"] = DARES() 51 | 52 | self.models["depth_model"].to(self.device) 53 | self.parameters_to_train += list(filter(lambda p: p.requires_grad, self.models["depth_model"].parameters())) 54 | 55 | self.models["position_encoder"] = networks.ResnetEncoder( 56 | self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=2) # 18 57 | 58 | self.models["position_encoder"].load_state_dict(torch.load(os.path.join(root_dir ,"af_sfmlearner_weights" ,"position_encoder.pth"))) 59 | self.models["position_encoder"].to(self.device) 60 | self.parameters_to_train_0 += list(self.models["position_encoder"].parameters()) 61 | 62 | self.models["position"] = networks.PositionDecoder( 63 | self.models["position_encoder"].num_ch_enc, self.opt.scales) 64 | self.models["position"].load_state_dict(torch.load(os.path.join(root_dir ,"af_sfmlearner_weights" ,"position.pth"))) 65 | 66 | self.models["position"].to(self.device) 67 | self.parameters_to_train_0 += list(self.models["position"].parameters()) 68 | 69 | self.models["transform_encoder"] = networks.ResnetEncoder( 70 | self.opt.num_layers, self.opt.weights_init == "pretrained", num_input_images=2) # 18 71 | self.models["transform_encoder"].load_state_dict(torch.load(os.path.join(root_dir ,"af_sfmlearner_weights" ,"transform_encoder.pth"))) 72 | self.models["transform_encoder"].to(self.device) 73 | self.parameters_to_train += list(self.models["transform_encoder"].parameters()) 74 | 75 | self.models["transform"] = networks.TransformDecoder( 76 | self.models["transform_encoder"].num_ch_enc, self.opt.scales) 77 | self.models["transform"].load_state_dict(torch.load(os.path.join(root_dir ,"af_sfmlearner_weights" ,"transform.pth"))) 78 | self.models["transform"].to(self.device) 79 | self.parameters_to_train += list(self.models["transform"].parameters()) 80 | 81 | if self.use_pose_net: 82 | 83 | if 
self.opt.pose_model_type == "separate_resnet": 84 | pose_encoder_path = os.path.join(root_dir, "af_sfmlearner_weights", "pose_encoder.pth") 85 | pose_decoder_path = os.path.join(root_dir, "af_sfmlearner_weights", "pose.pth") 86 | self.models["pose_encoder"] = networks.ResnetEncoder( 87 | self.opt.num_layers, 88 | self.opt.weights_init == "pretrained", 89 | num_input_images=self.num_pose_frames) 90 | self.models["pose_encoder"].load_state_dict(torch.load(pose_encoder_path)) 91 | self.models["pose_encoder"].to(self.device) 92 | self.parameters_to_train += list(self.models["pose_encoder"].parameters()) 93 | 94 | self.models["pose"] = networks.PoseDecoder( 95 | self.models["pose_encoder"].num_ch_enc, 96 | num_input_features=1, 97 | num_frames_to_predict_for=2) 98 | 99 | self.models["pose"].load_state_dict(torch.load(pose_decoder_path)) 100 | elif self.opt.pose_model_type == "shared": 101 | self.models["pose"] = networks.PoseDecoder( 102 | self.models["encoder"].num_ch_enc, self.num_pose_frames) 103 | self.models["pose"].load_state_dict(torch.load(os.path.join(root_dir, "af_sfmlearner_weights", "pose.pth"))) 104 | 105 | elif self.opt.pose_model_type == "posecnn": 106 | self.models["pose"] = networks.PoseCNN( 107 | self.num_input_frames if self.opt.pose_model_input == "all" else 2) 108 | self.models["pose"].load_state_dict(torch.load(os.path.join(root_dir, "af_sfmlearner_weights", "pose.pth"))) 109 | 110 | self.models["pose"].to(self.device) 111 | self.parameters_to_train += list(self.models["pose"].parameters()) 112 | 113 | if self.opt.predictive_mask: 114 | assert self.opt.disable_automasking, \ 115 | "When using predictive_mask, please disable automasking with --disable_automasking" 116 | 117 | # Our implementation of the predictive masking baseline has the same architecture 118 | # as our depth decoder. We predict a separate mask for each source frame.
119 | print('CHECK: self.opt.predictive_mask') 120 | self.models["predictive_mask"] = networks.DepthDecoder( 121 | self.models["encoder"].num_ch_enc, self.opt.scales, 122 | num_output_channels=(len(self.opt.frame_ids) - 1)) 123 | self.models["predictive_mask"].to(self.device) 124 | self.parameters_to_train += list(self.models["predictive_mask"].parameters()) 125 | else: 126 | print('CHECK: NO self.opt.predictive_mask') 127 | 128 | self.model_optimizer = optim.Adam(self.parameters_to_train, self.opt.learning_rate) 129 | self.model_lr_scheduler = optim.lr_scheduler.StepLR( 130 | self.model_optimizer, self.opt.scheduler_step_size, 0.1) 131 | self.model_optimizer_0 = optim.Adam(self.parameters_to_train_0, 1e-4) 132 | self.model_lr_scheduler_0 = optim.lr_scheduler.StepLR( 133 | self.model_optimizer_0, self.opt.scheduler_step_size, 0.1) 134 | 135 | if self.opt.load_weights_folder is not None: 136 | self.load_model() 137 | 138 | print("Training model named:\n ", self.opt.model_name) 139 | print("Models and tensorboard events files are saved to:\n ", self.opt.log_dir) 140 | print("Training is using:\n ", self.device) 141 | 142 | # data 143 | datasets_dict = {"endovis": datasets.SCAREDRAWDataset} 144 | self.dataset = datasets_dict[self.opt.dataset] 145 | 146 | fpath = os.path.join(os.path.dirname(__file__), "splits", self.opt.split, "{}_files.txt") 147 | train_filenames = readlines(fpath.format("train")) 148 | val_filenames = readlines(fpath.format("val")) 149 | img_ext = '.png' 150 | 151 | num_train_samples = len(train_filenames) 152 | self.num_total_steps = num_train_samples // self.opt.batch_size * self.opt.num_epochs 153 | 154 | train_dataset = self.dataset( 155 | self.opt.data_path, train_filenames, self.opt.height, self.opt.width, 156 | self.opt.frame_ids, 4, is_train=True, img_ext=img_ext) 157 | self.train_loader = DataLoader( 158 | train_dataset, self.opt.batch_size, True, 159 | num_workers=self.opt.num_workers, pin_memory=True, drop_last=True) 160 | val_dataset = self.dataset( 161 | self.opt.data_path, val_filenames, self.opt.height, self.opt.width, 162 | self.opt.frame_ids, 4, is_train=False, img_ext=img_ext) 163 | self.val_loader = DataLoader( 164 | val_dataset, self.opt.batch_size, False, 165 | num_workers=1, pin_memory=True, drop_last=True) 166 | self.val_iter = iter(self.val_loader) 167 | 168 | self.writers = {} 169 | for mode in ["train", "val"]: 170 | self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode)) 171 | 172 | if not self.opt.no_ssim: 173 | self.ms_ssim = MultiScaleStructuralSimilarityIndexMeasure(data_range=1.0) 174 | self.ms_ssim.to(self.device) 175 | 176 | self.spatial_transform = SpatialTransformer((self.opt.height, self.opt.width)) 177 | self.spatial_transform.to(self.device) 178 | 179 | self.get_occu_mask_backward = get_occu_mask_backward((self.opt.height, self.opt.width)) 180 | self.get_occu_mask_backward.to(self.device) 181 | 182 | self.get_occu_mask_bidirection = get_occu_mask_bidirection((self.opt.height, self.opt.width)) 183 | self.get_occu_mask_bidirection.to(self.device) 184 | 185 | self.backproject_depth = {} 186 | self.project_3d = {} 187 | self.position_depth = {} 188 | 189 | for scale in self.opt.scales: 190 | h = self.opt.height // (2 ** scale) 191 | w = self.opt.width // (2 ** scale) 192 | 193 | self.backproject_depth[scale] = BackprojectDepth(self.opt.batch_size, h, w) 194 | self.backproject_depth[scale].to(self.device) 195 | 196 | self.project_3d[scale] = Project3D(self.opt.batch_size, h, w) 197 | self.project_3d[scale].to(self.device) 198 
| 199 | self.position_depth[scale] = optical_flow((h, w), self.opt.batch_size, h, w) 200 | self.position_depth[scale].to(self.device) 201 | 202 | self.depth_metric_names = [ 203 | "de/abs_rel", "de/sq_rel", "de/rms", "de/log_rms", "da/a1", "da/a2", "da/a3"] 204 | 205 | print("Using split:\n ", self.opt.split) 206 | print("There are {:d} training items and {:d} validation items\n".format( 207 | len(train_dataset), len(val_dataset))) 208 | 209 | self.save_opts() 210 | 211 | def set_train_0(self): 212 | """Convert all models to training mode 213 | """ 214 | for param in self.models["position_encoder"].parameters(): 215 | param.requires_grad = True 216 | for param in self.models["position"].parameters(): 217 | param.requires_grad = True 218 | 219 | for param in self.models["depth_model"].parameters(): 220 | param.requires_grad = False 221 | for param in self.models["pose_encoder"].parameters(): 222 | param.requires_grad = False 223 | for param in self.models["pose"].parameters(): 224 | param.requires_grad = False 225 | for param in self.models["transform_encoder"].parameters(): 226 | param.requires_grad = False 227 | for param in self.models["transform"].parameters(): 228 | param.requires_grad = False 229 | 230 | self.models["position_encoder"].train() 231 | self.models["position"].train() 232 | 233 | self.models["depth_model"].eval() 234 | self.models["pose_encoder"].eval() 235 | self.models["pose"].eval() 236 | self.models["transform_encoder"].eval() 237 | self.models["transform"].eval() 238 | 239 | def set_train(self): 240 | """Convert all models to training mode 241 | """ 242 | for param in self.models["position_encoder"].parameters(): 243 | param.requires_grad = False 244 | for param in self.models["position"].parameters(): 245 | param.requires_grad = False 246 | 247 | # for param in self.models["encoder"].parameters(): 248 | # param.requires_grad = True 249 | for param in self.models["depth_model"].parameters(): 250 | param.requires_grad = True 251 | for param in self.models["pose_encoder"].parameters(): 252 | param.requires_grad = True 253 | for param in self.models["pose"].parameters(): 254 | param.requires_grad = True 255 | for param in self.models["transform_encoder"].parameters(): 256 | param.requires_grad = True 257 | for param in self.models["transform"].parameters(): 258 | param.requires_grad = True 259 | 260 | self.models["position_encoder"].eval() 261 | self.models["position"].eval() 262 | 263 | self.models["depth_model"].train() 264 | self.models["pose_encoder"].train() 265 | self.models["pose"].train() 266 | self.models["transform_encoder"].train() 267 | self.models["transform"].train() 268 | 269 | def set_eval(self): 270 | """Convert all models to testing/evaluation mode 271 | """ 272 | self.models["depth_model"].eval() 273 | self.models["transform_encoder"].eval() 274 | self.models["transform"].eval() 275 | self.models["pose_encoder"].eval() 276 | self.models["pose"].eval() 277 | 278 | def train(self): 279 | """Run the entire training pipeline 280 | """ 281 | self.epoch = 0 282 | self.step = 0 283 | self.start_time = time.time() 284 | for self.epoch in range(self.opt.num_epochs): 285 | self.run_epoch() 286 | if (self.epoch + 1) % self.opt.save_frequency == 0: 287 | self.save_model() 288 | 289 | def run_epoch(self): 290 | """Run a single epoch of training and validation 291 | """ 292 | 293 | print("Training") 294 | 295 | for batch_idx, inputs in enumerate(self.train_loader): 296 | 297 | before_op_time = time.time() 298 | 299 | # position 300 | self.set_train_0() 301 | _, 
losses_0 = self.process_batch_0(inputs) 302 | self.model_optimizer_0.zero_grad() 303 | losses_0["loss"].backward() 304 | self.model_optimizer_0.step() 305 | 306 | # depth, pose, transform 307 | self.set_train() 308 | outputs, losses = self.process_batch(inputs) 309 | self.model_optimizer.zero_grad() 310 | losses["loss"].backward() 311 | self.model_optimizer.step() 312 | 313 | duration = time.time() - before_op_time 314 | 315 | phase = batch_idx % self.opt.log_frequency == 0 316 | 317 | if phase: 318 | 319 | self.log_time(batch_idx, duration, losses["loss"].cpu().data) 320 | self.log("train", inputs, outputs, losses) 321 | # self.val() 322 | 323 | self.step += 1 324 | 325 | self.model_lr_scheduler.step() 326 | self.model_lr_scheduler_0.step() 327 | 328 | def process_batch_0(self, inputs): 329 | """Pass a minibatch through the network and generate images and losses 330 | """ 331 | for key, ipt in inputs.items(): 332 | inputs[key] = ipt.to(self.device) 333 | 334 | outputs = {} 335 | outputs.update(self.predict_poses_0(inputs)) 336 | losses = self.compute_losses_0(inputs, outputs) 337 | 338 | return outputs, losses 339 | 340 | def predict_poses_0(self, inputs): 341 | """Predict poses between input frames for monocular sequences. 342 | """ 343 | outputs = {} 344 | if self.num_pose_frames == 2: 345 | pose_feats = {f_i: inputs["color_aug", f_i, 0] for f_i in self.opt.frame_ids} 346 | 347 | for f_i in self.opt.frame_ids[1:]: 348 | 349 | if f_i != "s": 350 | 351 | inputs_all = [pose_feats[f_i], pose_feats[0]] 352 | inputs_all_reverse = [pose_feats[0], pose_feats[f_i]] 353 | 354 | # position 355 | position_inputs = self.models["position_encoder"](torch.cat(inputs_all, 1)) 356 | position_inputs_reverse = self.models["position_encoder"](torch.cat(inputs_all_reverse, 1)) 357 | outputs_0 = self.models["position"](position_inputs) 358 | outputs_1 = self.models["position"](position_inputs_reverse) 359 | 360 | for scale in self.opt.scales: 361 | outputs[("position", scale, f_i)] = outputs_0[("position", scale)] 362 | outputs[("position", "high", scale, f_i)] = F.interpolate( 363 | outputs[("position", scale, f_i)], [self.opt.height, self.opt.width], mode="bilinear", 364 | align_corners=True) 365 | outputs[("registration", scale, f_i)] = self.spatial_transform(inputs[("color", f_i, 0)], 366 | outputs[( 367 | "position", "high", scale, f_i)]) 368 | 369 | outputs[("position_reverse", scale, f_i)] = outputs_1[("position", scale)] 370 | outputs[("position_reverse", "high", scale, f_i)] = F.interpolate( 371 | outputs[("position_reverse", scale, f_i)], [self.opt.height, self.opt.width], 372 | mode="bilinear", align_corners=True) 373 | outputs[("occu_mask_backward", scale, f_i)], _ = self.get_occu_mask_backward( 374 | outputs[("position_reverse", "high", scale, f_i)]) 375 | outputs[("occu_map_bidirection", scale, f_i)] = self.get_occu_mask_bidirection( 376 | outputs[("position", "high", scale, f_i)], 377 | outputs[("position_reverse", "high", scale, f_i)]) 378 | 379 | # transform 380 | transform_input = [outputs[("registration", 0, f_i)], inputs[("color", 0, 0)]] 381 | transform_inputs = self.models["transform_encoder"](torch.cat(transform_input, 1)) 382 | outputs_2 = self.models["transform"](transform_inputs) 383 | 384 | for scale in self.opt.scales: 385 | outputs[("transform", scale, f_i)] = outputs_2[("transform", scale)] 386 | outputs[("transform", "high", scale, f_i)] = F.interpolate( 387 | outputs[("transform", scale, f_i)], [self.opt.height, self.opt.width], mode="bilinear", 388 | align_corners=True) 389 | 
outputs[("refined", scale, f_i)] = (outputs[("transform", "high", scale, f_i)] * outputs[ 390 | ("occu_mask_backward", 0, f_i)].detach() + inputs[("color", 0, 0)]) 391 | outputs[("refined", scale, f_i)] = torch.clamp(outputs[("refined", scale, f_i)], min=0.0, 392 | max=1.0) 393 | return outputs 394 | 395 | def compute_losses_0(self, inputs, outputs): 396 | 397 | losses = {} 398 | total_loss = 0 399 | 400 | for scale in self.opt.scales: 401 | 402 | loss = 0 403 | loss_smooth_registration = 0 404 | loss_registration = 0 405 | 406 | if self.opt.v1_multiscale: 407 | source_scale = scale 408 | else: 409 | source_scale = 0 410 | 411 | color = inputs[("color", 0, scale)] 412 | 413 | for frame_id in self.opt.frame_ids[1:]: 414 | occu_mask_backward = outputs[("occu_mask_backward", 0, frame_id)].detach() 415 | loss_smooth_registration += (get_smooth_loss(outputs[("position", scale, frame_id)], color)) 416 | loss_registration += ( 417 | self.compute_reprojection_loss(outputs[("registration", scale, frame_id)], outputs[("refined", scale, frame_id)].detach()) * occu_mask_backward).sum() / occu_mask_backward.sum() 418 | 419 | loss += loss_registration / 2.0 420 | loss += self.opt.position_smoothness * (loss_smooth_registration / 2.0) / (2 ** scale) 421 | 422 | total_loss += loss 423 | losses["loss/{}".format(scale)] = loss 424 | 425 | total_loss /= self.num_scales 426 | losses["loss"] = total_loss 427 | return losses 428 | 429 | def process_batch(self, inputs): 430 | """Pass a minibatch through the network and generate images and losses 431 | """ 432 | for key, ipt in inputs.items(): 433 | inputs[key] = ipt.to(self.device) 434 | outputs = self.models["depth_model"](inputs["color_aug", 0, 0]) 435 | 436 | if self.use_pose_net: 437 | outputs.update(self.predict_poses(inputs, outputs)) 438 | 439 | self.generate_images_pred(inputs, outputs) 440 | losses = self.compute_losses(inputs, outputs) 441 | 442 | return outputs, losses 443 | 444 | def predict_poses(self, inputs, disps): 445 | """Predict poses between input frames for monocular sequences. 
446 | """ 447 | outputs = {} 448 | if self.num_pose_frames == 2: 449 | pose_feats = {f_i: inputs["color_aug", f_i, 0] for f_i in self.opt.frame_ids} 450 | 451 | for f_i in self.opt.frame_ids[1:]: 452 | 453 | if f_i != "s": 454 | 455 | inputs_all = [pose_feats[f_i], pose_feats[0]] 456 | inputs_all_reverse = [pose_feats[0], pose_feats[f_i]] 457 | 458 | # position 459 | position_inputs = self.models["position_encoder"](torch.cat(inputs_all, 1)) 460 | position_inputs_reverse = self.models["position_encoder"](torch.cat(inputs_all_reverse, 1)) 461 | outputs_0 = self.models["position"](position_inputs) 462 | outputs_1 = self.models["position"](position_inputs_reverse) 463 | 464 | for scale in self.opt.scales: 465 | 466 | outputs[("position", scale, f_i)] = outputs_0[("position", scale)] 467 | outputs[("position", "high", scale, f_i)] = F.interpolate( 468 | outputs[("position", scale, f_i)], [self.opt.height, self.opt.width], mode="bilinear", align_corners=True) 469 | outputs[("registration", scale, f_i)] = self.spatial_transform(inputs[("color", f_i, 0)], outputs[("position", "high", scale, f_i)]) 470 | 471 | outputs[("position_reverse", scale, f_i)] = outputs_1[("position", scale)] 472 | outputs[("position_reverse", "high", scale, f_i)] = F.interpolate( 473 | outputs[("position_reverse", scale, f_i)], [self.opt.height, self.opt.width], mode="bilinear", align_corners=True) 474 | outputs[("occu_mask_backward", scale, f_i)], outputs[("occu_map_backward", scale, f_i)]= self.get_occu_mask_backward(outputs[("position_reverse", "high", scale, f_i)]) 475 | outputs[("occu_map_bidirection", scale, f_i)] = self.get_occu_mask_bidirection(outputs[("position", "high", scale, f_i)], 476 | outputs[("position_reverse", "high", scale, f_i)]) 477 | 478 | # transform 479 | transform_input = [outputs[("registration", 0, f_i)], inputs[("color", 0, 0)]] 480 | transform_inputs = self.models["transform_encoder"](torch.cat(transform_input, 1)) 481 | outputs_2 = self.models["transform"](transform_inputs) 482 | 483 | for scale in self.opt.scales: 484 | 485 | outputs[("transform", scale, f_i)] = outputs_2[("transform", scale)] 486 | outputs[("transform", "high", scale, f_i)] = F.interpolate( 487 | outputs[("transform", scale, f_i)], [self.opt.height, self.opt.width], mode="bilinear", align_corners=True) 488 | outputs[("refined", scale, f_i)] = (outputs[("transform", "high", scale, f_i)] * outputs[("occu_mask_backward", 0, f_i)].detach() + inputs[("color", 0, 0)]) 489 | outputs[("refined", scale, f_i)] = torch.clamp(outputs[("refined", scale, f_i)], min=0.0, max=1.0) 490 | # outputs[("grad_refined", scale, f_i)] = get_gradmap(outputs[("refined", scale, f_i)]) 491 | 492 | 493 | # pose 494 | pose_inputs = [self.models["pose_encoder"](torch.cat(inputs_all, 1))] 495 | axisangle, translation = self.models["pose"](pose_inputs) 496 | 497 | outputs[("axisangle", 0, f_i)] = axisangle 498 | outputs[("translation", 0, f_i)] = translation 499 | outputs[("cam_T_cam", 0, f_i)] = transformation_from_parameters( 500 | axisangle[:, 0], translation[:, 0]) 501 | 502 | return outputs 503 | 504 | def generate_images_pred(self, inputs, outputs): 505 | """Generate the warped (reprojected) color images for a minibatch. 506 | Generated images are saved into the `outputs` dictionary. 
507 | """ 508 | for scale in self.opt.scales: 509 | 510 | disp = outputs[("disp", scale)] 511 | if self.opt.v1_multiscale: 512 | source_scale = scale 513 | else: 514 | disp = F.interpolate( 515 | disp, [self.opt.height, self.opt.width], mode="bilinear", align_corners=True) 516 | 517 | _, depth = disp_to_depth(disp, self.opt.min_depth, self.opt.max_depth) 518 | 519 | outputs[("depth", 0, scale)] = depth 520 | 521 | source_scale = 0 522 | for i, frame_id in enumerate(self.opt.frame_ids[1:]): 523 | 524 | if frame_id == "s": 525 | T = inputs["stereo_T"] 526 | else: 527 | T = outputs[("cam_T_cam", 0, frame_id)] 528 | 529 | # from the authors of https://arxiv.org/abs/1712.00175 530 | if self.opt.pose_model_type == "posecnn": 531 | 532 | axisangle = outputs[("axisangle", 0, frame_id)] 533 | translation = outputs[("translation", 0, frame_id)] 534 | 535 | inv_depth = 1 / depth 536 | mean_inv_depth = inv_depth.mean(3, True).mean(2, True) 537 | 538 | T = transformation_from_parameters( 539 | axisangle[:, 0], translation[:, 0] * mean_inv_depth[:, 0], frame_id < 0) 540 | 541 | cam_points = self.backproject_depth[source_scale]( 542 | depth, inputs[("inv_K", source_scale)]) 543 | pix_coords = self.project_3d[source_scale]( 544 | cam_points, inputs[("K", source_scale)], T) 545 | 546 | outputs[("sample", frame_id, scale)] = pix_coords 547 | 548 | outputs[("color", frame_id, scale)] = F.grid_sample( 549 | inputs[("color", frame_id, source_scale)], 550 | outputs[("sample", frame_id, scale)], 551 | padding_mode="border", 552 | align_corners=True) 553 | 554 | outputs[("position_depth", scale, frame_id)] = self.position_depth[source_scale]( 555 | cam_points, inputs[("K", source_scale)], T) 556 | 557 | 558 | 559 | 560 | def compute_reprojection_loss(self, pred, target): 561 | 562 | abs_diff = torch.abs(target - pred) 563 | l1_loss = abs_diff.mean(1, True) 564 | 565 | if self.opt.no_ssim: 566 | reprojection_loss = l1_loss 567 | else: 568 | ms_ssim_loss = 1 - self.ms_ssim(pred, target) 569 | reprojection_loss = 0.9 * ms_ssim_loss + 0.1 * l1_loss 570 | 571 | return reprojection_loss 572 | 573 | def compute_losses(self, inputs, outputs): 574 | 575 | losses = {} 576 | total_loss = 0 577 | 578 | for scale in self.opt.scales: 579 | 580 | loss = 0 581 | loss_reprojection = 0 582 | loss_transform = 0 583 | loss_cvt = 0 584 | 585 | if self.opt.v1_multiscale: 586 | source_scale = scale 587 | else: 588 | source_scale = 0 589 | 590 | disp = outputs[("disp", scale)] 591 | color = inputs[("color", 0, scale)] 592 | 593 | for frame_id in self.opt.frame_ids[1:]: 594 | 595 | occu_mask_backward = outputs[("occu_mask_backward", 0, frame_id)].detach() 596 | 597 | loss_reprojection += ( 598 | self.compute_reprojection_loss(outputs[("color", frame_id, scale)], outputs[("refined", scale, frame_id)]) * occu_mask_backward).sum() / occu_mask_backward.sum() 599 | loss_transform += ( 600 | torch.abs(outputs[("refined", scale, frame_id)] - outputs[("registration", 0, frame_id)].detach()).mean(1, True) * occu_mask_backward).sum() / occu_mask_backward.sum() 601 | loss_cvt += get_smooth_bright( 602 | outputs[("transform", "high", scale, frame_id)], inputs[("color", 0, 0)], outputs[("registration", scale, frame_id)].detach(), occu_mask_backward) 603 | 604 | mean_disp = disp.mean(2, True).mean(3, True) 605 | norm_disp = disp / (mean_disp + 1e-7) 606 | smooth_loss = get_smooth_loss(norm_disp, color) 607 | 608 | loss += loss_reprojection / 2.0 609 | loss += self.opt.transform_constraint * (loss_transform / 2.0) 610 | loss += 
self.opt.transform_smoothness * (loss_cvt / 2.0) 611 | loss += self.opt.disparity_smoothness * smooth_loss / (2 ** scale) 612 | 613 | total_loss += loss 614 | losses["loss/{}".format(scale)] = loss 615 | 616 | total_loss /= self.num_scales 617 | losses["loss"] = total_loss 618 | return losses 619 | 620 | def val(self): 621 | """Validate the model on a single minibatch 622 | """ 623 | self.set_eval() 624 | try: 625 | inputs = next(self.val_iter) 626 | except StopIteration: 627 | self.val_iter = iter(self.val_loader) 628 | inputs = next(self.val_iter) 629 | 630 | with torch.no_grad(): 631 | outputs, losses = self.process_batch_val(inputs) 632 | self.log("val", inputs, outputs, losses) 633 | del inputs, outputs, losses 634 | 635 | self.set_train() 636 | 637 | def process_batch_val(self, inputs): 638 | """Pass a minibatch through the network and generate images and losses 639 | """ 640 | for key, ipt in inputs.items(): 641 | inputs[key] = ipt.to(self.device) 642 | 643 | if self.opt.pose_model_type == "shared": 644 | # If we are using a shared encoder for both depth and pose (as advocated 645 | # in monodepthv1), then all images are fed separately through the depth encoder. 646 | print('CHECK: self.opt.pose_model_type == "shared"') 647 | all_color_aug = torch.cat([inputs[("color_aug", i, 0)] for i in self.opt.frame_ids]) 648 | all_features = self.models["encoder"](all_color_aug) 649 | all_features = [torch.split(f, self.opt.batch_size) for f in all_features] 650 | 651 | features = {} 652 | for i, k in enumerate(self.opt.frame_ids): 653 | features[k] = [f[i] for f in all_features] 654 | 655 | outputs = self.models["depth"](features[0]) 656 | else: 657 | print('CHECK: self.opt.pose_model_type != "shared"') 658 | # Otherwise, we only feed the image with frame_id 0 through the depth encoder 659 | features = self.models["encoder"](inputs["color_aug", 0, 0]) 660 | outputs = self.models["depth"](features) 661 | 662 | if self.opt.predictive_mask: 663 | outputs["predictive_mask"] = self.models["predictive_mask"](features) 664 | 665 | if self.use_pose_net: 666 | outputs.update(self.predict_poses(inputs, outputs)) 667 | 668 | self.generate_images_pred(inputs, outputs) 669 | losses = self.compute_losses_val(inputs, outputs) 670 | 671 | return outputs, losses 672 | 673 | def compute_losses_val(self, inputs, outputs): 674 | """Compute the reprojection, perception_loss and smoothness losses for a minibatch 675 | """ 676 | losses = {} 677 | total_loss = 0 678 | 679 | for scale in self.opt.scales: 680 | 681 | loss = 0 682 | registration_losses = [] 683 | 684 | target = inputs[("color", 0, 0)] 685 | 686 | for frame_id in self.opt.frame_ids[1:]: 687 | registration_losses.append( 688 | ncc_loss(outputs[("registration", scale, frame_id)].mean(1, True), target.mean(1, True))) 689 | 690 | registration_losses = torch.cat(registration_losses, 1) 691 | registration_losses, idxs_registration = torch.min(registration_losses, dim=1) 692 | 693 | loss += registration_losses.mean() 694 | total_loss += loss 695 | losses["loss/{}".format(scale)] = loss 696 | 697 | total_loss /= self.num_scales 698 | losses["loss"] = -1 * total_loss 699 | 700 | return losses 701 | 702 | def log_time(self, batch_idx, duration, loss): 703 | """Print a logging statement to the terminal 704 | """ 705 | samples_per_sec = self.opt.batch_size / duration 706 | time_sofar = time.time() - self.start_time 707 | training_time_left = ( 708 | self.num_total_steps / self.step - 1.0) * time_sofar if self.step > 0 else 0 709 | print_string = "epoch 
{:>3} | batch {:>6} | examples/s: {:5.1f}" + \ 710 | " | loss: {:.5f} | time elapsed: {} | time left: {}" 711 | print(print_string.format(self.epoch, batch_idx, samples_per_sec, loss, 712 | sec_to_hm_str(time_sofar), sec_to_hm_str(training_time_left))) 713 | 714 | def log(self, mode, inputs, outputs, losses): 715 | """Write an event to the tensorboard events file 716 | """ 717 | writer = self.writers[mode] 718 | for l, v in losses.items(): 719 | writer.add_scalar("{}".format(l), v, self.step) 720 | 721 | for j in range(min(4, self.opt.batch_size)): # write a maximum of four images 722 | for s in self.opt.scales: 723 | for frame_id in self.opt.frame_ids[1:]: 724 | 725 | writer.add_image( 726 | "brightness_{}_{}/{}".format(frame_id, s, j), 727 | outputs[("transform", "high", s, frame_id)][j].data, self.step) 728 | writer.add_image( 729 | "registration_{}_{}/{}".format(frame_id, s, j), 730 | outputs[("registration", s, frame_id)][j].data, self.step) 731 | writer.add_image( 732 | "refined_{}_{}/{}".format(frame_id, s, j), 733 | outputs[("refined", s, frame_id)][j].data, self.step) 734 | if s == 0: 735 | writer.add_image( 736 | "occu_mask_backward_{}_{}/{}".format(frame_id, s, j), 737 | outputs[("occu_mask_backward", s, frame_id)][j].data, self.step) 738 | 739 | writer.add_image( 740 | "disp_{}/{}".format(s, j), 741 | normalize_image(outputs[("disp", s)][j]), self.step) 742 | 743 | def save_opts(self): 744 | """Save options to disk so we know what we ran this experiment with 745 | """ 746 | models_dir = os.path.join(self.log_path, "models") 747 | if not os.path.exists(models_dir): 748 | os.makedirs(models_dir) 749 | to_save = self.opt.__dict__.copy() 750 | 751 | with open(os.path.join(models_dir, 'opt.json'), 'w') as f: 752 | json.dump(to_save, f, indent=2) 753 | 754 | def save_model(self): 755 | """Save model weights to disk 756 | """ 757 | save_folder = os.path.join(self.log_path, "models", "weights_{}".format(self.epoch)) 758 | if not os.path.exists(save_folder): 759 | os.makedirs(save_folder) 760 | 761 | for model_name, model in self.models.items(): 762 | save_path = os.path.join(save_folder, "{}.pth".format(model_name)) 763 | to_save = model.state_dict() 764 | if model_name == 'encoder': 765 | # save the sizes - these are needed at prediction time 766 | to_save['height'] = self.opt.height 767 | to_save['width'] = self.opt.width 768 | to_save['use_stereo'] = self.opt.use_stereo 769 | torch.save(to_save, save_path) 770 | 771 | save_path = os.path.join(save_folder, "{}.pth".format("adam")) 772 | torch.save(self.model_optimizer.state_dict(), save_path) 773 | 774 | def load_model(self): 775 | """Load model(s) from disk 776 | """ 777 | self.opt.load_weights_folder = os.path.expanduser(self.opt.load_weights_folder) 778 | 779 | assert os.path.isdir(self.opt.load_weights_folder), \ 780 | "Cannot find folder {}".format(self.opt.load_weights_folder) 781 | print("loading model from folder {}".format(self.opt.load_weights_folder)) 782 | 783 | for n in self.opt.models_to_load: 784 | print("Loading {} weights...".format(n)) 785 | path = os.path.join(self.opt.load_weights_folder, "{}.pth".format(n)) 786 | model_dict = self.models[n].state_dict() 787 | pretrained_dict = torch.load(path) 788 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 789 | model_dict.update(pretrained_dict) 790 | self.models[n].load_state_dict(model_dict) 791 | 792 | 793 | print("Adam is randomly initialized") 794 | --------------------------------------------------------------------------------
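
Note on the training loop in trainer_end_to_end.py: every iteration of run_epoch takes two optimizer steps. First set_train_0/process_batch_0 update only the optical-flow ("position") encoder and decoder through model_optimizer_0; then set_train/process_batch update the DARES depth model together with the pose and appearance-transform networks through model_optimizer, with the flow branch frozen. The sketch below shows only this alternating freeze/step pattern with generic stand-in modules and losses (flow_net, main_net and the dummy losses are placeholders, not names from this repository).

    import torch
    import torch.nn as nn
    import torch.optim as optim

    # stand-ins for the position networks and the depth/pose/transform networks
    flow_net = nn.Linear(8, 8)
    main_net = nn.Linear(8, 8)

    optimizer_flow = optim.Adam(flow_net.parameters(), lr=1e-4)
    optimizer_main = optim.Adam(main_net.parameters(), lr=1e-4)

    def set_requires_grad(module, flag):
        for p in module.parameters():
            p.requires_grad = flag

    for batch in (torch.randn(4, 8) for _ in range(2)):
        # phase 1: train the flow branch, freeze the rest
        set_requires_grad(flow_net, True)
        set_requires_grad(main_net, False)
        loss_flow = flow_net(batch).pow(2).mean()
        optimizer_flow.zero_grad()
        loss_flow.backward()
        optimizer_flow.step()

        # phase 2: freeze the flow branch, train the main networks;
        # gradients stop at the frozen flow parameters
        set_requires_grad(flow_net, False)
        set_requires_grad(main_net, True)
        loss_main = main_net(flow_net(batch)).pow(2).mean()
        optimizer_main.zero_grad()
        loss_main.backward()
        optimizer_main.step()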
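
Note on compute_reprojection_loss above: unless the no_ssim option is set, the photometric error blends a multi-scale SSIM term with an L1 term as 0.9 * (1 - MS-SSIM(pred, target)) + 0.1 * |pred - target|, where the L1 part is averaged over channels and the MS-SSIM part is a scalar that broadcasts over the per-pixel map. A minimal standalone sketch of the same blend (weights and data_range copied from the trainer; the tensor sizes are illustrative only and must stay large enough for the default five MS-SSIM levels):

    import torch
    from torchmetrics.image import MultiScaleStructuralSimilarityIndexMeasure

    ms_ssim = MultiScaleStructuralSimilarityIndexMeasure(data_range=1.0)

    def reprojection_loss(pred, target):
        l1 = torch.abs(target - pred).mean(1, True)   # (B, 1, H, W)
        ssim_term = 1.0 - ms_ssim(pred, target)       # scalar
        return 0.9 * ssim_term + 0.1 * l1             # broadcasts to (B, 1, H, W)

    pred = torch.rand(2, 3, 256, 320)
    target = torch.rand(2, 3, 256, 320)
    print(reprojection_loss(pred, target).shape)      # torch.Size([2, 1, 256, 320])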
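
Note on compute_losses above: per scale, the objective is the occlusion-masked reprojection term averaged over the two source frames, plus the transform-constraint, transform-smoothness and disparity-smoothness penalties, with the disparity smoothness additionally divided by 2**scale; the per-scale losses are then averaged over the number of scales. A compact restatement of that weighting, assuming the individual terms have already been computed (the function and argument names below are illustrative, not part of the codebase):

    def total_loss_for_scale(loss_reprojection, loss_transform, loss_cvt, smooth_loss,
                             scale, transform_constraint, transform_smoothness,
                             disparity_smoothness):
        # the divisions by 2.0 average over the two source frames in frame_ids[1:]
        loss = loss_reprojection / 2.0
        loss += transform_constraint * (loss_transform / 2.0)
        loss += transform_smoothness * (loss_cvt / 2.0)
        # edge-aware disparity smoothness is down-weighted at coarser scales
        loss += disparity_smoothness * smooth_loss / (2 ** scale)
        return loss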
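
Note on checkpoints: save_model writes one state dict per entry of self.models to <log_dir>/<opt.model_name>/models/weights_<epoch>/<model key>.pth, plus adam.pth for the main optimizer, and load_model restores a module by filtering the saved dict down to the keys the module defines. A sketch of reloading just the depth network outside the trainer under those conventions (the weights folder below is a placeholder path):

    import os
    import torch
    from networks import DARES

    weights_folder = "logs/dares/models/weights_19"  # placeholder, adjust to your run

    depth_model = DARES()
    checkpoint = torch.load(os.path.join(weights_folder, "depth_model.pth"), map_location="cpu")
    model_dict = depth_model.state_dict()
    # keep only matching keys, mirroring load_model
    model_dict.update({k: v for k, v in checkpoint.items() if k in model_dict})
    depth_model.load_state_dict(model_dict)
    depth_model.eval()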