├── .gitignore ├── LICENSE ├── README.md ├── camera_model.py ├── dataloader └── kitti.py ├── depth_model.py ├── dpt ├── __init__.py ├── base_model.py ├── blocks.py ├── midas_net.py ├── models.py ├── transforms.py └── vit.py ├── main.py ├── requirements.txt ├── results ├── maps │ └── .placeholder └── plots │ └── .placeholder ├── scale_recovery.py ├── seq_00.gif ├── traj_utils.py ├── util ├── __init__.py ├── gric.py ├── io.py ├── misc.py ├── pallete.py └── png_to_jpg.py ├── visual_odometry.py └── weights └── .placeholder /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Translations 7 | *.mo 8 | *.pot 9 | 10 | # Jupyter Notebook 11 | .ipynb_checkpoints 12 | 13 | # IPython 14 | profile_default/ 15 | ipython_config.py 16 | 17 | # pyenv 18 | .python-version 19 | 20 | # Pyre type checker 21 | .pyre/ 22 | 23 | temp/ 24 | *.pt 25 | .pt 26 | dataset/ 27 | dataset 28 | .idea 29 | .gitattributes 30 | eval 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 aofrancani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DPT-VO: Dense Prediction Transformer for Scale Estimation in Monocular Visual Odometry 2 | 3 | [![arXiv](https://img.shields.io/badge/cs.CV-arXiv%3A2210.01723-B31B1B.svg)](https://arxiv.org/abs/2210.01723) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/aofrancani/DPT-VO/blob/main/LICENSE) 5 | 6 | Official repository of the paper "[Dense Prediction Transformer for Scale Estimation in Monocular Visual Odometry](https://arxiv.org/abs/2210.01723)" 7 | 8 | 9 | 10 | ## Abstract 11 | *Monocular visual odometry consists of the estimation of the position of an agent through images of a single camera, and it is applied in autonomous vehicles, medical robots, and augmented reality. However, monocular systems suffer from the scale ambiguity problem due to the lack of depth information in 2D frames. 
This paper contributes by showing an application of the dense prediction transformer model for scale estimation in monocular visual odometry systems. Experimental results show that the scale drift problem of monocular systems can be reduced through the accurate estimation of the depth map by this model, achieving competitive state-of-the-art performance on a visual odometry benchmark.*
12 | 
13 | 
14 | ## Contents
15 | 1. [Dataset](#1-dataset)
16 | 2. [Download the DPT Model](#2-download-the-dpt-model)
17 | 3. [Setup](#3-setup)
18 | 4. [Usage](#4-usage)
19 | 5. [Evaluation](#5-evaluation)
20 | 
21 | 
22 | ## 1. Dataset
23 | Download the [KITTI odometry dataset (grayscale).](https://www.cvlibs.net/datasets/kitti/eval_odometry.php)
24 | 
25 | In this work, we use the `.jpg` format. You can convert the dataset to `.jpg` format with [png_to_jpg.py.](https://github.com/aofrancani/DPT-VO/blob/main/util/png_to_jpg.py)
26 | 
27 | Create a symbolic link (Windows) or a soft link (Linux) to the dataset in the `dataset` folder:
28 | 
29 | - On Windows:
30 | ```mklink /D <path_to_project>\DPT-VO\dataset <path_to_kitti_dataset>```
31 | - On Linux:
32 | ```ln -s <path_to_kitti_dataset> <path_to_project>/DPT-VO/dataset```
33 | 
34 | Then, the data structure should be as follows:
35 | ```
36 | |---DPT-VO
37 |     |---dataset
38 |         |---sequences_jpg
39 |             |---00
40 |                 |---image_0
41 |                     |---000000.jpg
42 |                     |---000001.jpg
43 |                     |---...
44 |                 |---image_1
45 |                     |---...
46 |                 |---image_2
47 |                     |---...
48 |                 |---image_3
49 |                     |---...
50 |             |---01
51 |             |---...
52 | ```
53 | 
54 | ## 2. Download the DPT Model
55 | Download the [DPT trained weights](https://drive.google.com/file/d/1-oJpORoJEdxj4LTV-Pc17iB-smp-khcX/view) and save it in the `weights` folder.
56 | - [dpt_hybrid_kitti-cb926ef4.pt](https://drive.google.com/file/d/1-oJpORoJEdxj4LTV-Pc17iB-smp-khcX/view)
57 | 
58 | For more details please check the [original DPT repository](https://github.com/isl-org/DPT).
59 | 
60 | 
61 | ## 3. Setup
62 | - Create a virtual environment using Anaconda and activate it:
63 | ```
64 | conda create -n dpt-vo python==3.8.0
65 | conda activate dpt-vo
66 | ```
67 | - Install dependencies (with environment activated):
68 | ```
69 | pip install -r requirements.txt
70 | ```
71 | 
72 | ## 4. Usage
73 | Run the `main.py` code with the following command (a minimal programmatic sketch of the same pipeline is given at the end of this README):
74 | ```
75 | python main.py -s <sequence>
76 | ```
77 | You can also use a different path to the dataset by changing the arguments ``--data_path`` and ``--pose_path``:
78 | ```
79 | python main.py -d <path_to_data> -p <path_to_poses> -s <sequence>
80 | ```
81 | 
82 | ## 5. Evaluation
83 | The evaluation is done with the [KITTI odometry evaluation toolbox](https://github.com/Huangying-Zhan/kitti-odom-eval). Please go to the [evaluation repository](https://github.com/Huangying-Zhan/kitti-odom-eval) to see more details about the evaluation metrics and how to run the toolbox.
84 | 
85 | 
86 | ## Citation
87 | Please cite our paper if you find this research useful in your work:
88 | 
89 | ```bibtex
90 | @INPROCEEDINGS{Francani2022,
91 |   title={Dense Prediction Transformer for Scale Estimation in Monocular Visual Odometry},
92 |   author={André O. Françani and Marcos R. O. A. Maximo},
93 |   booktitle={2022 Latin American Robotics Symposium (LARS), 2022 Brazilian Symposium on Robotics (SBR), and 2022 Workshop on Robotics in Education (WRE)},
94 |   days={18-21},
95 |   month={oct},
96 |   year={2022},
97 | }
98 | ```
99 | 
100 | ## References
101 | Some of the functions were borrowed and adapted from three amazing works, which are: [DPT](https://github.com/isl-org/DPT), [DF-VO](https://github.com/Huangying-Zhan/DF-VO), and [monoVO](https://github.com/uoip/monoVO-python). 
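
## Programmatic usage (sketch)
For reference, the snippet below sketches how the components driven by `main.py` fit together: the KITTI dataloader, the camera model, the DPT-based depth model, and the visual odometry loop. Paths and the sequence id are placeholders for your local setup; `main.py` remains the supported entry point, and the DPT weights are expected under `weights/`.

```python
from dataloader.kitti import KITTI
from camera_model import CameraModel
from depth_model import DepthModel
from visual_odometry import VisualOdometry

# Placeholders: adjust the paths to your local dataset layout
dataloader = KITTI(data_path="dataset/sequences_jpg", pose_path="dataset/poses", sequence="00")
cam = CameraModel(params=dataloader.cam_params)          # intrinsics read from the sequence calib.txt
depth_model = DepthModel(model_type="dpt_hybrid_kitti")  # loads weights/dpt_hybrid_kitti-cb926ef4.pt
vo = VisualOdometry(cam, depth_model)

for _ in range(len(dataloader)):
    frame, gt_pose, frame_id = dataloader.get_next_data()
    vo.update(frame, frame_id)   # estimate the motion for the current frame
    x, y, z = vo.t.flatten()     # current estimated camera position
```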
102 | 
-------------------------------------------------------------------------------- /camera_model.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class CameraModel(object):
5 |     """
6 |     Class that represents a pin-hole camera model (or projective camera model).
7 |     In the pin-hole camera model, light rays pass through the camera center before being projected
8 |     onto the image plane; (cx, cy) is the principal point, where the optical axis meets the image plane.
9 |     """
10 |     def __init__(self, params):
11 |         """
12 |         Creates a camera model
13 | 
14 |         Arguments:
15 |             params {dict} -- Camera parameters
16 |         """
17 | 
18 |         # Image resolution
19 |         self.width = params['width']
20 |         self.height = params['height']
21 |         # Focal length of camera
22 |         self.fx = params['fx']
23 |         self.fy = params['fy']
24 |         # Optical center (principal point)
25 |         self.cx = params['cx']
26 |         self.cy = params['cy']
27 |         # Distortion coefficients.
28 |         # k1, k2, and k3 are the radial coefficients.
29 |         # p1 and p2 are the tangential distortion coefficients.
30 |         #self.distortion_coeff = [params['k1'], params['k2'], params['p1'], params['p2'], params['k3']]
31 |         self.mat = np.array([
32 |             [self.fx, 0, self.cx],
33 |             [0, self.fy, self.cy],
34 |             [0, 0, 1]])
-------------------------------------------------------------------------------- /dataloader/kitti.py: --------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import cv2
4 | import argparse
5 | 
6 | 
7 | class KITTI:
8 |     def __init__(self,
9 |                  data_path=r"dataset\sequences_jpg",
10 |                  pose_path=r"dataset\poses",
11 |                  sequence="00",
12 |                  camera_id="0",
13 |                  ):
14 |         """
15 |         Dataloader for KITTI Visual Odometry Dataset
16 |         http://www.cvlibs.net/datasets/kitti/eval_odometry.php
17 | 
18 |         Arguments:
19 |             data_path {str}: path to data sequences
20 |             pose_path {str}: path to poses
21 |             sequence {str}: sequence to be tested (default: "00")
22 |         """
23 |         self.data_path = data_path
24 |         self.sequence = sequence
25 |         self.camera_id = camera_id
26 |         self.frame_id = 0
27 | 
28 |         # Read ground truth poses
29 |         with open(os.path.join(pose_path, sequence+".txt")) as f:
30 |             self.poses = f.readlines()
31 | 
32 |         # Get frames list
33 |         frames_dir = os.path.join(data_path, sequence, "image_{}".format(camera_id), "*.jpg")
34 |         self.frames = sorted(glob.glob(frames_dir))
35 | 
36 |         # Camera parameters (note: frame.shape is (height, width) for a grayscale image)
37 |         self.cam_params = {}
38 |         frame = cv2.imread(self.frames[self.frame_id], 0)
39 |         self.cam_params["width"] = frame.shape[1]
40 |         self.cam_params["height"] = frame.shape[0]
41 |         self.read_intrinsics_param()
42 | 
43 |     def __len__(self):
44 |         return len(self.frames)
45 | 
46 |     def get_next_data(self):
47 |         """
48 |         Returns:
49 |             frame {ndarray}: image frame at index self.frame_id
50 |             pose {list}: list containing the ground truth pose [x, y, z]
51 |             frame_id {int}: integer representing the frame index
52 |         """
53 |         # Read frame as grayscale
54 |         frame = cv2.imread(self.frames[self.frame_id], 0)
55 |         self.cam_params["width"] = frame.shape[1]
56 |         self.cam_params["height"] = frame.shape[0]
57 | 
58 |         # Read poses
59 |         pose = self.poses[self.frame_id]
60 |         pose = pose.strip().split()
61 |         pose = [float(pose[3]), float(pose[7]), float(pose[11])]  # coordinates for the left camera
62 |         frame_id = self.frame_id
63 |         self.frame_id = self.frame_id + 1
64 |         return frame, pose, frame_id
65 | 
66 |     # def get_limits(self):
67 |     #     """
68 |     #     Returns the limits to draw max and min poses in trajectory image
69 |     #
70 |     #     Returns:
71 |     #         x_lim {list}: float representing max and min poses of x
72 |     #         y_lim {list}: float representing max and min poses of y
73 |     #         z_lim {list}: float representing max and min poses of z
74 |     #     """
75 |     #     poses = [pose.strip().split() for pose in self.poses]
76 |     #     x_lim = [max([pose[0] for pose in poses]), min([pose[0] for pose in poses])]
77 |     #     y_lim = [max([pose[1] for pose in poses]), min([pose[1] for pose in poses])]
78 |     #     z_lim = [max([pose[2] for pose in poses]), min([pose[2] for pose in poses])]
79 |     #     return x_lim, y_lim, z_lim
80 | 
81 |     def read_intrinsics_param(self):
82 |         """
83 |         Reads the camera intrinsic parameters from the sequence's calib.txt file
84 | 
85 |         Updates:
86 |             self.cam_params {dict}: dictionary with focal length and principal point
87 |         """
88 |         calib_file = os.path.join(self.data_path, self.sequence, "calib.txt")
89 |         with open(calib_file, 'r') as f:
90 |             lines = f.readlines()
91 |             line = lines[int(self.camera_id)].strip().split()
92 |             [fx, cx, fy, cy] = [float(line[1]), float(line[3]), float(line[6]), float(line[7])]  # projection matrix entries: fx = P[0,0], cx = P[0,2], fy = P[1,1], cy = P[1,2]
93 | 
94 |         # focal length of camera
95 |         self.cam_params["fx"] = fx
96 |         self.cam_params["fy"] = fy
97 |         # principal point (optical center)
98 |         self.cam_params["cx"] = cx
99 |         self.cam_params["cy"] = cy
100 | 
101 | 
102 | if __name__ == "__main__":
103 | 
104 |     parser = argparse.ArgumentParser()
105 | 
106 |     parser.add_argument(
107 |         "-d", "--data_path",
108 |         default=r"dataset\sequences_jpg",
109 |         help="path to dataset"
110 |     )
111 |     parser.add_argument(
112 |         "-s",
113 |         "--sequence",
114 |         default="03",
115 |         help="sequence to be evaluated",
116 |     )
117 |     parser.add_argument(
118 |         "-p",
119 |         "--pose_path",
120 |         default=r"dataset\poses",
121 |         help="path to ground truth poses",
122 |     )
123 | 
124 |     args = parser.parse_args()
125 | 
126 |     # Create dataloader
127 |     dataloader = KITTI(
128 |         data_path=args.data_path,
129 |         pose_path=args.pose_path,
130 |         sequence=args.sequence,
131 |     )
132 | 
133 |     dataloader.read_intrinsics_param()
134 | 
135 | 
136 | 
-------------------------------------------------------------------------------- /depth_model.py: --------------------------------------------------------------------------------
1 | """
2 | Build DPT depth model
3 | - modified from https://github.com/isl-org/DPT
4 | """
5 | 
6 | import os
7 | import torch
8 | import cv2
9 | import argparse
10 | import util.io
11 | 
12 | from torchvision.transforms import Compose
13 | 
14 | from dpt.models import DPTDepthModel
15 | from dpt.transforms import Resize, NormalizeImage, PrepareForNet
16 | 
17 | 
18 | class DepthModel(object):
19 |     """
20 |     Build DPT network and compute depth maps
21 |     """
22 | 
23 |     def __init__(self, model_type="dpt_hybrid", optimize=True):
24 |         """
25 |         Build the DPT network to compute depth maps. 
26 | 27 | Arguments: 28 | model_path (str): path to saved model 29 | """ 30 | default_models = { 31 | "dpt_large": "weights/dpt_large-midas-2f21e586.pt", 32 | "dpt_hybrid": "weights/dpt_hybrid-midas-501f0c75.pt", 33 | "dpt_hybrid_kitti": "weights/dpt_hybrid_kitti-cb926ef4.pt", 34 | } 35 | model_path = default_models[model_type] 36 | self.model_type = model_type 37 | self.optimize = optimize 38 | 39 | # select device 40 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 41 | print("device: ", self.device) 42 | 43 | # load network 44 | if model_type == "dpt_large": # DPT-Large 45 | net_w = net_h = 384 46 | model = DPTDepthModel( 47 | path=model_path, 48 | backbone="vitl16_384", 49 | non_negative=True, 50 | enable_attention_hooks=False, 51 | ) 52 | normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 53 | 54 | elif model_type == "dpt_hybrid": # DPT-Hybrid 55 | net_w = net_h = 384 56 | model = DPTDepthModel( 57 | path=model_path, 58 | backbone="vitb_rn50_384", 59 | non_negative=True, 60 | enable_attention_hooks=False, 61 | ) 62 | normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 63 | 64 | elif model_type == "dpt_hybrid_kitti": 65 | net_w = 1216 66 | net_h = 352 67 | 68 | model = DPTDepthModel( 69 | path=model_path, 70 | scale=0.00006016, 71 | shift=0.00579, 72 | invert=True, 73 | backbone="vitb_rn50_384", 74 | non_negative=True, 75 | enable_attention_hooks=False, 76 | ) 77 | 78 | normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 79 | 80 | else: 81 | assert ( 82 | False 83 | ), f"model_type '{model_type}' not implemented, use: --model_type [dpt_large|dpt_hybrid|dpt_hybrid_kitti|dpt_hybrid_nyu|midas_v21]" 84 | 85 | self.transform = Compose( 86 | [ 87 | Resize( 88 | net_w, 89 | net_h, 90 | resize_target=None, 91 | keep_aspect_ratio=True, 92 | ensure_multiple_of=32, 93 | resize_method="minimal", 94 | image_interpolation_method=cv2.INTER_CUBIC, 95 | ), 96 | normalization, 97 | PrepareForNet(), 98 | ] 99 | ) 100 | 101 | model.eval() 102 | 103 | if optimize and self.device == torch.device("cuda"): 104 | model = model.to(memory_format=torch.channels_last) 105 | model = model.half() 106 | 107 | self.model = model.to(self.device) 108 | 109 | @torch.no_grad() 110 | def compute_depth(self, img, kitti_crop=False): 111 | """ 112 | Computes depth map 113 | 114 | Arguments: 115 | img (array): image (0-255) 116 | """ 117 | 118 | if img.ndim == 2: 119 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 120 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 121 | 122 | if kitti_crop is True: 123 | height, width, _ = img.shape 124 | top = height - 352 125 | left = (width - 1216) // 2 126 | img = img[top : top + 352, left : left + 1216, :] 127 | 128 | img_input = self.transform({"image": img})["image"] 129 | 130 | # with torch.no_grad(): 131 | sample = torch.from_numpy(img_input).to(self.device).unsqueeze(0) 132 | 133 | if self.optimize and self.device == torch.device("cuda"): 134 | sample = sample.to(memory_format=torch.channels_last) 135 | sample = sample.half() 136 | 137 | prediction = self.model.forward(sample) 138 | prediction = ( 139 | torch.nn.functional.interpolate( 140 | prediction.unsqueeze(1), 141 | size=img.shape[:2], 142 | mode="bicubic", 143 | align_corners=False, 144 | ) 145 | .squeeze() 146 | .cpu() 147 | .numpy() 148 | ) 149 | 150 | # if self.model_type == "dpt_hybrid_kitti": 151 | # prediction *= 256 152 | 153 | return prediction 154 | 155 | 156 | if __name__ == "__main__": 157 | parser = 
argparse.ArgumentParser() 158 | 159 | parser.add_argument( 160 | "-m", "--model_weights", default=None, help="path to model weights" 161 | ) 162 | 163 | parser.add_argument( 164 | "-t", 165 | "--model_type", 166 | default="dpt_hybrid", 167 | help="model type [dpt_large|dpt_hybrid|midas_v21]", 168 | ) 169 | 170 | parser.add_argument("--optimize", dest="optimize", action="store_true") 171 | parser.set_defaults(optimize=True) 172 | 173 | args = parser.parse_args() 174 | 175 | # default_models = { 176 | # "midas_v21": "weights/midas_v21-f6b98070.pt", 177 | # "dpt_large": "weights/dpt_large-midas-2f21e586.pt", 178 | # "dpt_hybrid": "weights/dpt_hybrid-midas-501f0c75.pt", 179 | # "dpt_hybrid_kitti": "weights/dpt_hybrid_kitti-cb926ef4.pt", 180 | # "dpt_hybrid_nyu": "weights/dpt_hybrid_nyu-2ce69ec7.pt", 181 | # } 182 | # 183 | # if args.model_weights is None: 184 | # args.model_weights = default_models[args.model_type] 185 | 186 | # set torch options 187 | torch.backends.cudnn.enabled = True 188 | torch.backends.cudnn.benchmark = True 189 | 190 | # build model 191 | model = DepthModel( 192 | args.model_type, 193 | args.optimize, 194 | ) 195 | # print(model) 196 | 197 | # read img 198 | img_path = r"dataset\sequences_jpg\00\image_0\000000.jpg" 199 | img = cv2.imread(img_path) 200 | 201 | # compute depth 202 | depth = model.compute_depth(img, kitti_crop=False) 203 | 204 | filename = os.path.join( 205 | "temp", os.path.splitext(os.path.basename(img_path))[0] 206 | ) 207 | util.io.write_depth(filename.replace(".jpg", "_depth.jpg"), depth, bits=2, absolute_depth=True) 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /dpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aofrancani/DPT-VO/aef64207243fbd29ec17e1c108def8ac3e19b68d/dpt/__init__.py -------------------------------------------------------------------------------- /dpt/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 
7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device("cpu")) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /dpt/blocks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .vit import ( 5 | _make_pretrained_vitb_rn50_384, 6 | _make_pretrained_vitl16_384, 7 | _make_pretrained_vitb16_384, 8 | forward_vit, 9 | ) 10 | 11 | 12 | def _make_encoder( 13 | backbone, 14 | features, 15 | use_pretrained, 16 | groups=1, 17 | expand=False, 18 | exportable=True, 19 | hooks=None, 20 | use_vit_only=False, 21 | use_readout="ignore", 22 | enable_attention_hooks=False, 23 | ): 24 | if backbone == "vitl16_384": 25 | pretrained = _make_pretrained_vitl16_384( 26 | use_pretrained, 27 | hooks=hooks, 28 | use_readout=use_readout, 29 | enable_attention_hooks=enable_attention_hooks, 30 | ) 31 | scratch = _make_scratch( 32 | [256, 512, 1024, 1024], features, groups=groups, expand=expand 33 | ) # ViT-L/16 - 85.0% Top1 (backbone) 34 | elif backbone == "vitb_rn50_384": 35 | pretrained = _make_pretrained_vitb_rn50_384( 36 | use_pretrained, 37 | hooks=hooks, 38 | use_vit_only=use_vit_only, 39 | use_readout=use_readout, 40 | enable_attention_hooks=enable_attention_hooks, 41 | ) 42 | scratch = _make_scratch( 43 | [256, 512, 768, 768], features, groups=groups, expand=expand 44 | ) # ViT-H/16 - 85.0% Top1 (backbone) 45 | elif backbone == "vitb16_384": 46 | pretrained = _make_pretrained_vitb16_384( 47 | use_pretrained, 48 | hooks=hooks, 49 | use_readout=use_readout, 50 | enable_attention_hooks=enable_attention_hooks, 51 | ) 52 | scratch = _make_scratch( 53 | [96, 192, 384, 768], features, groups=groups, expand=expand 54 | ) # ViT-B/16 - 84.6% Top1 (backbone) 55 | elif backbone == "resnext101_wsl": 56 | pretrained = _make_pretrained_resnext101_wsl(use_pretrained) 57 | scratch = _make_scratch( 58 | [256, 512, 1024, 2048], features, groups=groups, expand=expand 59 | ) # efficientnet_lite3 60 | else: 61 | print(f"Backbone '{backbone}' not implemented") 62 | assert False 63 | 64 | return pretrained, scratch 65 | 66 | 67 | def _make_scratch(in_shape, out_shape, groups=1, expand=False): 68 | scratch = nn.Module() 69 | 70 | out_shape1 = out_shape 71 | out_shape2 = out_shape 72 | out_shape3 = out_shape 73 | out_shape4 = out_shape 74 | if expand == True: 75 | out_shape1 = out_shape 76 | out_shape2 = out_shape * 2 77 | out_shape3 = out_shape * 4 78 | out_shape4 = out_shape * 8 79 | 80 | scratch.layer1_rn = nn.Conv2d( 81 | in_shape[0], 82 | out_shape1, 83 | kernel_size=3, 84 | stride=1, 85 | padding=1, 86 | bias=False, 87 | groups=groups, 88 | ) 89 | scratch.layer2_rn = nn.Conv2d( 90 | in_shape[1], 91 | out_shape2, 92 | kernel_size=3, 93 | stride=1, 94 | padding=1, 95 | bias=False, 96 | groups=groups, 97 | ) 98 | scratch.layer3_rn = nn.Conv2d( 99 | in_shape[2], 100 | out_shape3, 101 | kernel_size=3, 102 | stride=1, 103 | padding=1, 104 | bias=False, 105 | groups=groups, 106 | ) 107 | scratch.layer4_rn = nn.Conv2d( 108 | in_shape[3], 109 | out_shape4, 110 | kernel_size=3, 111 | stride=1, 112 | padding=1, 113 | bias=False, 114 | groups=groups, 115 | ) 116 | 117 | return scratch 118 | 119 | 120 | def _make_resnet_backbone(resnet): 121 | pretrained = nn.Module() 122 | pretrained.layer1 = nn.Sequential( 123 | resnet.conv1, resnet.bn1, 
resnet.relu, resnet.maxpool, resnet.layer1 124 | ) 125 | 126 | pretrained.layer2 = resnet.layer2 127 | pretrained.layer3 = resnet.layer3 128 | pretrained.layer4 = resnet.layer4 129 | 130 | return pretrained 131 | 132 | 133 | def _make_pretrained_resnext101_wsl(use_pretrained): 134 | resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") 135 | return _make_resnet_backbone(resnet) 136 | 137 | 138 | class Interpolate(nn.Module): 139 | """Interpolation module.""" 140 | 141 | def __init__(self, scale_factor, mode, align_corners=False): 142 | """Init. 143 | 144 | Args: 145 | scale_factor (float): scaling 146 | mode (str): interpolation mode 147 | """ 148 | super(Interpolate, self).__init__() 149 | 150 | self.interp = nn.functional.interpolate 151 | self.scale_factor = scale_factor 152 | self.mode = mode 153 | self.align_corners = align_corners 154 | 155 | def forward(self, x): 156 | """Forward pass. 157 | 158 | Args: 159 | x (tensor): input 160 | 161 | Returns: 162 | tensor: interpolated data 163 | """ 164 | 165 | x = self.interp( 166 | x, 167 | scale_factor=self.scale_factor, 168 | mode=self.mode, 169 | align_corners=self.align_corners, 170 | ) 171 | 172 | return x 173 | 174 | 175 | class ResidualConvUnit(nn.Module): 176 | """Residual convolution module.""" 177 | 178 | def __init__(self, features): 179 | """Init. 180 | 181 | Args: 182 | features (int): number of features 183 | """ 184 | super().__init__() 185 | 186 | self.conv1 = nn.Conv2d( 187 | features, features, kernel_size=3, stride=1, padding=1, bias=True 188 | ) 189 | 190 | self.conv2 = nn.Conv2d( 191 | features, features, kernel_size=3, stride=1, padding=1, bias=True 192 | ) 193 | 194 | self.relu = nn.ReLU(inplace=True) 195 | 196 | def forward(self, x): 197 | """Forward pass. 198 | 199 | Args: 200 | x (tensor): input 201 | 202 | Returns: 203 | tensor: output 204 | """ 205 | out = self.relu(x) 206 | out = self.conv1(out) 207 | out = self.relu(out) 208 | out = self.conv2(out) 209 | 210 | return out + x 211 | 212 | 213 | class FeatureFusionBlock(nn.Module): 214 | """Feature fusion block.""" 215 | 216 | def __init__(self, features): 217 | """Init. 218 | 219 | Args: 220 | features (int): number of features 221 | """ 222 | super(FeatureFusionBlock, self).__init__() 223 | 224 | self.resConfUnit1 = ResidualConvUnit(features) 225 | self.resConfUnit2 = ResidualConvUnit(features) 226 | 227 | def forward(self, *xs): 228 | """Forward pass. 229 | 230 | Returns: 231 | tensor: output 232 | """ 233 | output = xs[0] 234 | 235 | if len(xs) == 2: 236 | output += self.resConfUnit1(xs[1]) 237 | 238 | output = self.resConfUnit2(output) 239 | 240 | output = nn.functional.interpolate( 241 | output, scale_factor=2, mode="bilinear", align_corners=True 242 | ) 243 | 244 | return output 245 | 246 | 247 | class ResidualConvUnit_custom(nn.Module): 248 | """Residual convolution module.""" 249 | 250 | def __init__(self, features, activation, bn): 251 | """Init. 
252 | 253 | Args: 254 | features (int): number of features 255 | """ 256 | super().__init__() 257 | 258 | self.bn = bn 259 | 260 | self.groups = 1 261 | 262 | self.conv1 = nn.Conv2d( 263 | features, 264 | features, 265 | kernel_size=3, 266 | stride=1, 267 | padding=1, 268 | bias=not self.bn, 269 | groups=self.groups, 270 | ) 271 | 272 | self.conv2 = nn.Conv2d( 273 | features, 274 | features, 275 | kernel_size=3, 276 | stride=1, 277 | padding=1, 278 | bias=not self.bn, 279 | groups=self.groups, 280 | ) 281 | 282 | if self.bn == True: 283 | self.bn1 = nn.BatchNorm2d(features) 284 | self.bn2 = nn.BatchNorm2d(features) 285 | 286 | self.activation = activation 287 | 288 | self.skip_add = nn.quantized.FloatFunctional() 289 | 290 | def forward(self, x): 291 | """Forward pass. 292 | 293 | Args: 294 | x (tensor): input 295 | 296 | Returns: 297 | tensor: output 298 | """ 299 | 300 | out = self.activation(x) 301 | out = self.conv1(out) 302 | if self.bn == True: 303 | out = self.bn1(out) 304 | 305 | out = self.activation(out) 306 | out = self.conv2(out) 307 | if self.bn == True: 308 | out = self.bn2(out) 309 | 310 | if self.groups > 1: 311 | out = self.conv_merge(out) 312 | 313 | return self.skip_add.add(out, x) 314 | 315 | # return out + x 316 | 317 | 318 | class FeatureFusionBlock_custom(nn.Module): 319 | """Feature fusion block.""" 320 | 321 | def __init__( 322 | self, 323 | features, 324 | activation, 325 | deconv=False, 326 | bn=False, 327 | expand=False, 328 | align_corners=True, 329 | ): 330 | """Init. 331 | 332 | Args: 333 | features (int): number of features 334 | """ 335 | super(FeatureFusionBlock_custom, self).__init__() 336 | 337 | self.deconv = deconv 338 | self.align_corners = align_corners 339 | 340 | self.groups = 1 341 | 342 | self.expand = expand 343 | out_features = features 344 | if self.expand == True: 345 | out_features = features // 2 346 | 347 | self.out_conv = nn.Conv2d( 348 | features, 349 | out_features, 350 | kernel_size=1, 351 | stride=1, 352 | padding=0, 353 | bias=True, 354 | groups=1, 355 | ) 356 | 357 | self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) 358 | self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) 359 | 360 | self.skip_add = nn.quantized.FloatFunctional() 361 | 362 | def forward(self, *xs): 363 | """Forward pass. 364 | 365 | Returns: 366 | tensor: output 367 | """ 368 | output = xs[0] 369 | 370 | if len(xs) == 2: 371 | res = self.resConfUnit1(xs[1]) 372 | output = self.skip_add.add(output, res) 373 | # output += res 374 | 375 | output = self.resConfUnit2(output) 376 | 377 | output = nn.functional.interpolate( 378 | output, scale_factor=2, mode="bilinear", align_corners=self.align_corners 379 | ) 380 | 381 | output = self.out_conv(output) 382 | 383 | return output 384 | -------------------------------------------------------------------------------- /dpt/midas_net.py: -------------------------------------------------------------------------------- 1 | """MidashNet: Network for monocular depth estimation trained by mixing several datasets. 
2 | This file contains code that is adapted from 3 | https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .base_model import BaseModel 9 | from .blocks import FeatureFusionBlock, Interpolate, _make_encoder 10 | 11 | 12 | class MidasNet_large(BaseModel): 13 | """Network for monocular depth estimation.""" 14 | 15 | def __init__(self, path=None, features=256, non_negative=True): 16 | """Init. 17 | 18 | Args: 19 | path (str, optional): Path to saved model. Defaults to None. 20 | features (int, optional): Number of features. Defaults to 256. 21 | backbone (str, optional): Backbone network for encoder. Defaults to resnet50 22 | """ 23 | print("Loading weights: ", path) 24 | 25 | super(MidasNet_large, self).__init__() 26 | 27 | use_pretrained = False if path is None else True 28 | 29 | self.pretrained, self.scratch = _make_encoder( 30 | backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained 31 | ) 32 | 33 | self.scratch.refinenet4 = FeatureFusionBlock(features) 34 | self.scratch.refinenet3 = FeatureFusionBlock(features) 35 | self.scratch.refinenet2 = FeatureFusionBlock(features) 36 | self.scratch.refinenet1 = FeatureFusionBlock(features) 37 | 38 | self.scratch.output_conv = nn.Sequential( 39 | nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), 40 | Interpolate(scale_factor=2, mode="bilinear"), 41 | nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), 42 | nn.ReLU(True), 43 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 44 | nn.ReLU(True) if non_negative else nn.Identity(), 45 | ) 46 | 47 | if path: 48 | self.load(path) 49 | 50 | def forward(self, x): 51 | """Forward pass. 52 | 53 | Args: 54 | x (tensor): input data (image) 55 | 56 | Returns: 57 | tensor: depth 58 | """ 59 | 60 | layer_1 = self.pretrained.layer1(x) 61 | layer_2 = self.pretrained.layer2(layer_1) 62 | layer_3 = self.pretrained.layer3(layer_2) 63 | layer_4 = self.pretrained.layer4(layer_3) 64 | 65 | layer_1_rn = self.scratch.layer1_rn(layer_1) 66 | layer_2_rn = self.scratch.layer2_rn(layer_2) 67 | layer_3_rn = self.scratch.layer3_rn(layer_3) 68 | layer_4_rn = self.scratch.layer4_rn(layer_4) 69 | 70 | path_4 = self.scratch.refinenet4(layer_4_rn) 71 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 72 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 73 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 74 | 75 | out = self.scratch.output_conv(path_1) 76 | 77 | return torch.squeeze(out, dim=1) 78 | -------------------------------------------------------------------------------- /dpt/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .base_model import BaseModel 6 | from .blocks import ( 7 | FeatureFusionBlock, 8 | FeatureFusionBlock_custom, 9 | Interpolate, 10 | _make_encoder, 11 | forward_vit, 12 | ) 13 | 14 | 15 | def _make_fusion_block(features, use_bn): 16 | return FeatureFusionBlock_custom( 17 | features, 18 | nn.ReLU(False), 19 | deconv=False, 20 | bn=use_bn, 21 | expand=False, 22 | align_corners=True, 23 | ) 24 | 25 | 26 | class DPT(BaseModel): 27 | def __init__( 28 | self, 29 | head, 30 | features=256, 31 | backbone="vitb_rn50_384", 32 | readout="project", 33 | channels_last=False, 34 | use_bn=False, 35 | enable_attention_hooks=False, 36 | ): 37 | 38 | super(DPT, self).__init__() 39 | 40 | self.channels_last = 
channels_last 41 | 42 | hooks = { 43 | "vitb_rn50_384": [0, 1, 8, 11], 44 | "vitb16_384": [2, 5, 8, 11], 45 | "vitl16_384": [5, 11, 17, 23], 46 | } 47 | 48 | # Instantiate backbone and reassemble blocks 49 | self.pretrained, self.scratch = _make_encoder( 50 | backbone, 51 | features, 52 | False, # Set to true of you want to train from scratch, uses ImageNet weights 53 | groups=1, 54 | expand=False, 55 | exportable=False, 56 | hooks=hooks[backbone], 57 | use_readout=readout, 58 | enable_attention_hooks=enable_attention_hooks, 59 | ) 60 | 61 | self.scratch.refinenet1 = _make_fusion_block(features, use_bn) 62 | self.scratch.refinenet2 = _make_fusion_block(features, use_bn) 63 | self.scratch.refinenet3 = _make_fusion_block(features, use_bn) 64 | self.scratch.refinenet4 = _make_fusion_block(features, use_bn) 65 | 66 | self.scratch.output_conv = head 67 | 68 | def forward(self, x): 69 | if self.channels_last == True: 70 | x.contiguous(memory_format=torch.channels_last) 71 | 72 | layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) 73 | 74 | layer_1_rn = self.scratch.layer1_rn(layer_1) 75 | layer_2_rn = self.scratch.layer2_rn(layer_2) 76 | layer_3_rn = self.scratch.layer3_rn(layer_3) 77 | layer_4_rn = self.scratch.layer4_rn(layer_4) 78 | 79 | path_4 = self.scratch.refinenet4(layer_4_rn) 80 | path_3 = self.scratch.refinenet3(path_4, layer_3_rn) 81 | path_2 = self.scratch.refinenet2(path_3, layer_2_rn) 82 | path_1 = self.scratch.refinenet1(path_2, layer_1_rn) 83 | 84 | out = self.scratch.output_conv(path_1) 85 | 86 | return out 87 | 88 | 89 | class DPTDepthModel(DPT): 90 | def __init__( 91 | self, path=None, non_negative=True, scale=1.0, shift=0.0, invert=False, **kwargs 92 | ): 93 | features = kwargs["features"] if "features" in kwargs else 256 94 | 95 | self.scale = scale 96 | self.shift = shift 97 | self.invert = invert 98 | 99 | head = nn.Sequential( 100 | nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), 101 | Interpolate(scale_factor=2, mode="bilinear", align_corners=True), 102 | nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), 103 | nn.ReLU(True), 104 | nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), 105 | nn.ReLU(True) if non_negative else nn.Identity(), 106 | nn.Identity(), 107 | ) 108 | 109 | super().__init__(head, **kwargs) 110 | 111 | if path is not None: 112 | self.load(path) 113 | 114 | def forward(self, x): 115 | inv_depth = super().forward(x).squeeze(dim=1) 116 | 117 | if self.invert: 118 | depth = self.scale * inv_depth + self.shift 119 | depth[depth < 1e-8] = 1e-8 120 | depth = 1.0 / depth 121 | return depth 122 | else: 123 | return inv_depth 124 | 125 | 126 | class DPTSegmentationModel(DPT): 127 | def __init__(self, num_classes, path=None, **kwargs): 128 | 129 | features = kwargs["features"] if "features" in kwargs else 256 130 | 131 | kwargs["use_bn"] = True 132 | 133 | head = nn.Sequential( 134 | nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), 135 | nn.BatchNorm2d(features), 136 | nn.ReLU(True), 137 | nn.Dropout(0.1, False), 138 | nn.Conv2d(features, num_classes, kernel_size=1), 139 | Interpolate(scale_factor=2, mode="bilinear", align_corners=True), 140 | ) 141 | 142 | super().__init__(head, **kwargs) 143 | 144 | self.auxlayer = nn.Sequential( 145 | nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), 146 | nn.BatchNorm2d(features), 147 | nn.ReLU(True), 148 | nn.Dropout(0.1, False), 149 | nn.Conv2d(features, num_classes, kernel_size=1), 150 | ) 151 | 152 | if path is not None: 153 
| self.load(path) 154 | -------------------------------------------------------------------------------- /dpt/transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import math 4 | 5 | 6 | def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): 7 | """Rezise the sample to ensure the given size. Keeps aspect ratio. 8 | 9 | Args: 10 | sample (dict): sample 11 | size (tuple): image size 12 | 13 | Returns: 14 | tuple: new size 15 | """ 16 | shape = list(sample["disparity"].shape) 17 | 18 | if shape[0] >= size[0] and shape[1] >= size[1]: 19 | return sample 20 | 21 | scale = [0, 0] 22 | scale[0] = size[0] / shape[0] 23 | scale[1] = size[1] / shape[1] 24 | 25 | scale = max(scale) 26 | 27 | shape[0] = math.ceil(scale * shape[0]) 28 | shape[1] = math.ceil(scale * shape[1]) 29 | 30 | # resize 31 | sample["image"] = cv2.resize( 32 | sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method 33 | ) 34 | 35 | sample["disparity"] = cv2.resize( 36 | sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST 37 | ) 38 | sample["mask"] = cv2.resize( 39 | sample["mask"].astype(np.float32), 40 | tuple(shape[::-1]), 41 | interpolation=cv2.INTER_NEAREST, 42 | ) 43 | sample["mask"] = sample["mask"].astype(bool) 44 | 45 | return tuple(shape) 46 | 47 | 48 | class Resize(object): 49 | """Resize sample to given size (width, height).""" 50 | 51 | def __init__( 52 | self, 53 | width, 54 | height, 55 | resize_target=True, 56 | keep_aspect_ratio=False, 57 | ensure_multiple_of=1, 58 | resize_method="lower_bound", 59 | image_interpolation_method=cv2.INTER_AREA, 60 | ): 61 | """Init. 62 | 63 | Args: 64 | width (int): desired output width 65 | height (int): desired output height 66 | resize_target (bool, optional): 67 | True: Resize the full sample (image, mask, target). 68 | False: Resize image only. 69 | Defaults to True. 70 | keep_aspect_ratio (bool, optional): 71 | True: Keep the aspect ratio of the input sample. 72 | Output sample might not have the given width and height, and 73 | resize behaviour depends on the parameter 'resize_method'. 74 | Defaults to False. 75 | ensure_multiple_of (int, optional): 76 | Output width and height is constrained to be multiple of this parameter. 77 | Defaults to 1. 78 | resize_method (str, optional): 79 | "lower_bound": Output will be at least as large as the given size. 80 | "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) 81 | "minimal": Scale as least as possible. (Output size might be smaller than given size.) 82 | Defaults to "lower_bound". 
83 | """ 84 | self.__width = width 85 | self.__height = height 86 | 87 | self.__resize_target = resize_target 88 | self.__keep_aspect_ratio = keep_aspect_ratio 89 | self.__multiple_of = ensure_multiple_of 90 | self.__resize_method = resize_method 91 | self.__image_interpolation_method = image_interpolation_method 92 | 93 | def constrain_to_multiple_of(self, x, min_val=0, max_val=None): 94 | y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) 95 | 96 | if max_val is not None and y > max_val: 97 | y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) 98 | 99 | if y < min_val: 100 | y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) 101 | 102 | return y 103 | 104 | def get_size(self, width, height): 105 | # determine new height and width 106 | scale_height = self.__height / height 107 | scale_width = self.__width / width 108 | 109 | if self.__keep_aspect_ratio: 110 | if self.__resize_method == "lower_bound": 111 | # scale such that output size is lower bound 112 | if scale_width > scale_height: 113 | # fit width 114 | scale_height = scale_width 115 | else: 116 | # fit height 117 | scale_width = scale_height 118 | elif self.__resize_method == "upper_bound": 119 | # scale such that output size is upper bound 120 | if scale_width < scale_height: 121 | # fit width 122 | scale_height = scale_width 123 | else: 124 | # fit height 125 | scale_width = scale_height 126 | elif self.__resize_method == "minimal": 127 | # scale as least as possbile 128 | if abs(1 - scale_width) < abs(1 - scale_height): 129 | # fit width 130 | scale_height = scale_width 131 | else: 132 | # fit height 133 | scale_width = scale_height 134 | else: 135 | raise ValueError( 136 | f"resize_method {self.__resize_method} not implemented" 137 | ) 138 | 139 | if self.__resize_method == "lower_bound": 140 | new_height = self.constrain_to_multiple_of( 141 | scale_height * height, min_val=self.__height 142 | ) 143 | new_width = self.constrain_to_multiple_of( 144 | scale_width * width, min_val=self.__width 145 | ) 146 | elif self.__resize_method == "upper_bound": 147 | new_height = self.constrain_to_multiple_of( 148 | scale_height * height, max_val=self.__height 149 | ) 150 | new_width = self.constrain_to_multiple_of( 151 | scale_width * width, max_val=self.__width 152 | ) 153 | elif self.__resize_method == "minimal": 154 | new_height = self.constrain_to_multiple_of(scale_height * height) 155 | new_width = self.constrain_to_multiple_of(scale_width * width) 156 | else: 157 | raise ValueError(f"resize_method {self.__resize_method} not implemented") 158 | 159 | return (new_width, new_height) 160 | 161 | def __call__(self, sample): 162 | width, height = self.get_size( 163 | sample["image"].shape[1], sample["image"].shape[0] 164 | ) 165 | 166 | # resize sample 167 | sample["image"] = cv2.resize( 168 | sample["image"], 169 | (width, height), 170 | interpolation=self.__image_interpolation_method, 171 | ) 172 | 173 | if self.__resize_target: 174 | if "disparity" in sample: 175 | sample["disparity"] = cv2.resize( 176 | sample["disparity"], 177 | (width, height), 178 | interpolation=cv2.INTER_NEAREST, 179 | ) 180 | 181 | if "depth" in sample: 182 | sample["depth"] = cv2.resize( 183 | sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST 184 | ) 185 | 186 | sample["mask"] = cv2.resize( 187 | sample["mask"].astype(np.float32), 188 | (width, height), 189 | interpolation=cv2.INTER_NEAREST, 190 | ) 191 | sample["mask"] = sample["mask"].astype(bool) 192 | 193 | return sample 194 | 195 
| 196 | class NormalizeImage(object): 197 | """Normlize image by given mean and std.""" 198 | 199 | def __init__(self, mean, std): 200 | self.__mean = mean 201 | self.__std = std 202 | 203 | def __call__(self, sample): 204 | sample["image"] = (sample["image"] - self.__mean) / self.__std 205 | 206 | return sample 207 | 208 | 209 | class PrepareForNet(object): 210 | """Prepare sample for usage as network input.""" 211 | 212 | def __init__(self): 213 | pass 214 | 215 | def __call__(self, sample): 216 | image = np.transpose(sample["image"], (2, 0, 1)) 217 | sample["image"] = np.ascontiguousarray(image).astype(np.float32) 218 | 219 | if "mask" in sample: 220 | sample["mask"] = sample["mask"].astype(np.float32) 221 | sample["mask"] = np.ascontiguousarray(sample["mask"]) 222 | 223 | if "disparity" in sample: 224 | disparity = sample["disparity"].astype(np.float32) 225 | sample["disparity"] = np.ascontiguousarray(disparity) 226 | 227 | if "depth" in sample: 228 | depth = sample["depth"].astype(np.float32) 229 | sample["depth"] = np.ascontiguousarray(depth) 230 | 231 | return sample 232 | -------------------------------------------------------------------------------- /dpt/vit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import timm 4 | import types 5 | import math 6 | import torch.nn.functional as F 7 | 8 | 9 | activations = {} 10 | 11 | 12 | def get_activation(name): 13 | def hook(model, input, output): 14 | activations[name] = output 15 | 16 | return hook 17 | 18 | 19 | attention = {} 20 | 21 | 22 | def get_attention(name): 23 | def hook(module, input, output): 24 | x = input[0] 25 | B, N, C = x.shape 26 | qkv = ( 27 | module.qkv(x) 28 | .reshape(B, N, 3, module.num_heads, C // module.num_heads) 29 | .permute(2, 0, 3, 1, 4) 30 | ) 31 | q, k, v = ( 32 | qkv[0], 33 | qkv[1], 34 | qkv[2], 35 | ) # make torchscript happy (cannot use tensor as tuple) 36 | 37 | attn = (q @ k.transpose(-2, -1)) * module.scale 38 | 39 | attn = attn.softmax(dim=-1) # [:,:,1,1:] 40 | attention[name] = attn 41 | 42 | return hook 43 | 44 | 45 | def get_mean_attention_map(attn, token, shape): 46 | attn = attn[:, :, token, 1:] 47 | attn = attn.unflatten(2, torch.Size([shape[2] // 16, shape[3] // 16])).float() 48 | attn = torch.nn.functional.interpolate( 49 | attn, size=shape[2:], mode="bicubic", align_corners=False 50 | ).squeeze(0) 51 | 52 | all_attn = torch.mean(attn, 0) 53 | 54 | return all_attn 55 | 56 | 57 | class Slice(nn.Module): 58 | def __init__(self, start_index=1): 59 | super(Slice, self).__init__() 60 | self.start_index = start_index 61 | 62 | def forward(self, x): 63 | return x[:, self.start_index :] 64 | 65 | 66 | class AddReadout(nn.Module): 67 | def __init__(self, start_index=1): 68 | super(AddReadout, self).__init__() 69 | self.start_index = start_index 70 | 71 | def forward(self, x): 72 | if self.start_index == 2: 73 | readout = (x[:, 0] + x[:, 1]) / 2 74 | else: 75 | readout = x[:, 0] 76 | return x[:, self.start_index :] + readout.unsqueeze(1) 77 | 78 | 79 | class ProjectReadout(nn.Module): 80 | def __init__(self, in_features, start_index=1): 81 | super(ProjectReadout, self).__init__() 82 | self.start_index = start_index 83 | 84 | self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) 85 | 86 | def forward(self, x): 87 | readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :]) 88 | features = torch.cat((x[:, self.start_index :], readout), -1) 89 | 90 | return self.project(features) 91 | 
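# Illustrative sketch (not part of the original DPT source): the three readout
# modules above only differ in how they treat the ViT class/readout token before
# the patch tokens are reassembled into a 2D feature map. Assuming a 384x384
# input to ViT-B/16 (24x24 = 576 patch tokens, embedding dimension 768):
def _readout_shape_demo():
    tokens = torch.randn(2, 1 + 576, 768)                        # [batch, readout token + patch tokens, dim]
    assert Slice()(tokens).shape == (2, 576, 768)                # readout token dropped
    assert AddReadout()(tokens).shape == (2, 576, 768)           # readout token added to every patch token
    assert ProjectReadout(768)(tokens).shape == (2, 576, 768)    # readout token concatenated, then projected back to dim 768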
92 | 93 | class Transpose(nn.Module): 94 | def __init__(self, dim0, dim1): 95 | super(Transpose, self).__init__() 96 | self.dim0 = dim0 97 | self.dim1 = dim1 98 | 99 | def forward(self, x): 100 | x = x.transpose(self.dim0, self.dim1) 101 | return x 102 | 103 | 104 | def forward_vit(pretrained, x): 105 | b, c, h, w = x.shape 106 | 107 | glob = pretrained.model.forward_flex(x) 108 | 109 | layer_1 = pretrained.activations["1"] 110 | layer_2 = pretrained.activations["2"] 111 | layer_3 = pretrained.activations["3"] 112 | layer_4 = pretrained.activations["4"] 113 | 114 | layer_1 = pretrained.act_postprocess1[0:2](layer_1) 115 | layer_2 = pretrained.act_postprocess2[0:2](layer_2) 116 | layer_3 = pretrained.act_postprocess3[0:2](layer_3) 117 | layer_4 = pretrained.act_postprocess4[0:2](layer_4) 118 | 119 | unflatten = nn.Sequential( 120 | nn.Unflatten( 121 | 2, 122 | torch.Size( 123 | [ 124 | h // pretrained.model.patch_size[1], 125 | w // pretrained.model.patch_size[0], 126 | ] 127 | ), 128 | ) 129 | ) 130 | 131 | if layer_1.ndim == 3: 132 | layer_1 = unflatten(layer_1) 133 | if layer_2.ndim == 3: 134 | layer_2 = unflatten(layer_2) 135 | if layer_3.ndim == 3: 136 | layer_3 = unflatten(layer_3) 137 | if layer_4.ndim == 3: 138 | layer_4 = unflatten(layer_4) 139 | 140 | layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1) 141 | layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2) 142 | layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3) 143 | layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4) 144 | 145 | return layer_1, layer_2, layer_3, layer_4 146 | 147 | 148 | def _resize_pos_embed(self, posemb, gs_h, gs_w): 149 | posemb_tok, posemb_grid = ( 150 | posemb[:, : self.start_index], 151 | posemb[0, self.start_index :], 152 | ) 153 | 154 | gs_old = int(math.sqrt(len(posemb_grid))) 155 | 156 | posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) 157 | posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") 158 | posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) 159 | 160 | posemb = torch.cat([posemb_tok, posemb_grid], dim=1) 161 | 162 | return posemb 163 | 164 | 165 | def forward_flex(self, x): 166 | b, c, h, w = x.shape 167 | 168 | pos_embed = self._resize_pos_embed( 169 | self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] 170 | ) 171 | 172 | B = x.shape[0] 173 | 174 | if hasattr(self.patch_embed, "backbone"): 175 | x = self.patch_embed.backbone(x) 176 | if isinstance(x, (list, tuple)): 177 | x = x[-1] # last feature if backbone outputs list/tuple of features 178 | 179 | x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) 180 | 181 | if getattr(self, "dist_token", None) is not None: 182 | cls_tokens = self.cls_token.expand( 183 | B, -1, -1 184 | ) # stole cls_tokens impl from Phil Wang, thanks 185 | dist_token = self.dist_token.expand(B, -1, -1) 186 | x = torch.cat((cls_tokens, dist_token, x), dim=1) 187 | else: 188 | cls_tokens = self.cls_token.expand( 189 | B, -1, -1 190 | ) # stole cls_tokens impl from Phil Wang, thanks 191 | x = torch.cat((cls_tokens, x), dim=1) 192 | 193 | x = x + pos_embed 194 | x = self.pos_drop(x) 195 | 196 | for blk in self.blocks: 197 | x = blk(x) 198 | 199 | x = self.norm(x) 200 | 201 | return x 202 | 203 | 204 | def get_readout_oper(vit_features, features, use_readout, start_index=1): 205 | if use_readout == "ignore": 206 | readout_oper = [Slice(start_index)] 
* len(features) 207 | elif use_readout == "add": 208 | readout_oper = [AddReadout(start_index)] * len(features) 209 | elif use_readout == "project": 210 | readout_oper = [ 211 | ProjectReadout(vit_features, start_index) for out_feat in features 212 | ] 213 | else: 214 | assert ( 215 | False 216 | ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" 217 | 218 | return readout_oper 219 | 220 | 221 | def _make_vit_b16_backbone( 222 | model, 223 | features=[96, 192, 384, 768], 224 | size=[384, 384], 225 | hooks=[2, 5, 8, 11], 226 | vit_features=768, 227 | use_readout="ignore", 228 | start_index=1, 229 | enable_attention_hooks=False, 230 | ): 231 | pretrained = nn.Module() 232 | 233 | pretrained.model = model 234 | pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) 235 | pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) 236 | pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) 237 | pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) 238 | 239 | pretrained.activations = activations 240 | 241 | if enable_attention_hooks: 242 | pretrained.model.blocks[hooks[0]].attn.register_forward_hook( 243 | get_attention("attn_1") 244 | ) 245 | pretrained.model.blocks[hooks[1]].attn.register_forward_hook( 246 | get_attention("attn_2") 247 | ) 248 | pretrained.model.blocks[hooks[2]].attn.register_forward_hook( 249 | get_attention("attn_3") 250 | ) 251 | pretrained.model.blocks[hooks[3]].attn.register_forward_hook( 252 | get_attention("attn_4") 253 | ) 254 | pretrained.attention = attention 255 | 256 | readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) 257 | 258 | # 32, 48, 136, 384 259 | pretrained.act_postprocess1 = nn.Sequential( 260 | readout_oper[0], 261 | Transpose(1, 2), 262 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 263 | nn.Conv2d( 264 | in_channels=vit_features, 265 | out_channels=features[0], 266 | kernel_size=1, 267 | stride=1, 268 | padding=0, 269 | ), 270 | nn.ConvTranspose2d( 271 | in_channels=features[0], 272 | out_channels=features[0], 273 | kernel_size=4, 274 | stride=4, 275 | padding=0, 276 | bias=True, 277 | dilation=1, 278 | groups=1, 279 | ), 280 | ) 281 | 282 | pretrained.act_postprocess2 = nn.Sequential( 283 | readout_oper[1], 284 | Transpose(1, 2), 285 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 286 | nn.Conv2d( 287 | in_channels=vit_features, 288 | out_channels=features[1], 289 | kernel_size=1, 290 | stride=1, 291 | padding=0, 292 | ), 293 | nn.ConvTranspose2d( 294 | in_channels=features[1], 295 | out_channels=features[1], 296 | kernel_size=2, 297 | stride=2, 298 | padding=0, 299 | bias=True, 300 | dilation=1, 301 | groups=1, 302 | ), 303 | ) 304 | 305 | pretrained.act_postprocess3 = nn.Sequential( 306 | readout_oper[2], 307 | Transpose(1, 2), 308 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 309 | nn.Conv2d( 310 | in_channels=vit_features, 311 | out_channels=features[2], 312 | kernel_size=1, 313 | stride=1, 314 | padding=0, 315 | ), 316 | ) 317 | 318 | pretrained.act_postprocess4 = nn.Sequential( 319 | readout_oper[3], 320 | Transpose(1, 2), 321 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 322 | nn.Conv2d( 323 | in_channels=vit_features, 324 | out_channels=features[3], 325 | kernel_size=1, 326 | stride=1, 327 | padding=0, 328 | ), 329 | nn.Conv2d( 330 | in_channels=features[3], 331 | out_channels=features[3], 332 | kernel_size=3, 333 | stride=2, 
334 | padding=1, 335 | ), 336 | ) 337 | 338 | pretrained.model.start_index = start_index 339 | pretrained.model.patch_size = [16, 16] 340 | 341 | # We inject this function into the VisionTransformer instances so that 342 | # we can use it with interpolated position embeddings without modifying the library source. 343 | pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) 344 | pretrained.model._resize_pos_embed = types.MethodType( 345 | _resize_pos_embed, pretrained.model 346 | ) 347 | 348 | return pretrained 349 | 350 | 351 | def _make_vit_b_rn50_backbone( 352 | model, 353 | features=[256, 512, 768, 768], 354 | size=[384, 384], 355 | hooks=[0, 1, 8, 11], 356 | vit_features=768, 357 | use_vit_only=False, 358 | use_readout="ignore", 359 | start_index=1, 360 | enable_attention_hooks=False, 361 | ): 362 | pretrained = nn.Module() 363 | 364 | pretrained.model = model 365 | 366 | if use_vit_only == True: 367 | pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) 368 | pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) 369 | else: 370 | pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( 371 | get_activation("1") 372 | ) 373 | pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( 374 | get_activation("2") 375 | ) 376 | 377 | pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) 378 | pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) 379 | 380 | if enable_attention_hooks: 381 | pretrained.model.blocks[2].attn.register_forward_hook(get_attention("attn_1")) 382 | pretrained.model.blocks[5].attn.register_forward_hook(get_attention("attn_2")) 383 | pretrained.model.blocks[8].attn.register_forward_hook(get_attention("attn_3")) 384 | pretrained.model.blocks[11].attn.register_forward_hook(get_attention("attn_4")) 385 | pretrained.attention = attention 386 | 387 | pretrained.activations = activations 388 | 389 | readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) 390 | 391 | if use_vit_only == True: 392 | pretrained.act_postprocess1 = nn.Sequential( 393 | readout_oper[0], 394 | Transpose(1, 2), 395 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 396 | nn.Conv2d( 397 | in_channels=vit_features, 398 | out_channels=features[0], 399 | kernel_size=1, 400 | stride=1, 401 | padding=0, 402 | ), 403 | nn.ConvTranspose2d( 404 | in_channels=features[0], 405 | out_channels=features[0], 406 | kernel_size=4, 407 | stride=4, 408 | padding=0, 409 | bias=True, 410 | dilation=1, 411 | groups=1, 412 | ), 413 | ) 414 | 415 | pretrained.act_postprocess2 = nn.Sequential( 416 | readout_oper[1], 417 | Transpose(1, 2), 418 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 419 | nn.Conv2d( 420 | in_channels=vit_features, 421 | out_channels=features[1], 422 | kernel_size=1, 423 | stride=1, 424 | padding=0, 425 | ), 426 | nn.ConvTranspose2d( 427 | in_channels=features[1], 428 | out_channels=features[1], 429 | kernel_size=2, 430 | stride=2, 431 | padding=0, 432 | bias=True, 433 | dilation=1, 434 | groups=1, 435 | ), 436 | ) 437 | else: 438 | pretrained.act_postprocess1 = nn.Sequential( 439 | nn.Identity(), nn.Identity(), nn.Identity() 440 | ) 441 | pretrained.act_postprocess2 = nn.Sequential( 442 | nn.Identity(), nn.Identity(), nn.Identity() 443 | ) 444 | 445 | pretrained.act_postprocess3 = nn.Sequential( 446 | readout_oper[2], 447 | Transpose(1, 2), 448 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 
16])), 449 | nn.Conv2d( 450 | in_channels=vit_features, 451 | out_channels=features[2], 452 | kernel_size=1, 453 | stride=1, 454 | padding=0, 455 | ), 456 | ) 457 | 458 | pretrained.act_postprocess4 = nn.Sequential( 459 | readout_oper[3], 460 | Transpose(1, 2), 461 | nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), 462 | nn.Conv2d( 463 | in_channels=vit_features, 464 | out_channels=features[3], 465 | kernel_size=1, 466 | stride=1, 467 | padding=0, 468 | ), 469 | nn.Conv2d( 470 | in_channels=features[3], 471 | out_channels=features[3], 472 | kernel_size=3, 473 | stride=2, 474 | padding=1, 475 | ), 476 | ) 477 | 478 | pretrained.model.start_index = start_index 479 | pretrained.model.patch_size = [16, 16] 480 | 481 | # We inject this function into the VisionTransformer instances so that 482 | # we can use it with interpolated position embeddings without modifying the library source. 483 | pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) 484 | 485 | # We inject this function into the VisionTransformer instances so that 486 | # we can use it with interpolated position embeddings without modifying the library source. 487 | pretrained.model._resize_pos_embed = types.MethodType( 488 | _resize_pos_embed, pretrained.model 489 | ) 490 | 491 | return pretrained 492 | 493 | 494 | def _make_pretrained_vitb_rn50_384( 495 | pretrained, 496 | use_readout="ignore", 497 | hooks=None, 498 | use_vit_only=False, 499 | enable_attention_hooks=False, 500 | ): 501 | model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) 502 | 503 | hooks = [0, 1, 8, 11] if hooks == None else hooks 504 | return _make_vit_b_rn50_backbone( 505 | model, 506 | features=[256, 512, 768, 768], 507 | size=[384, 384], 508 | hooks=hooks, 509 | use_vit_only=use_vit_only, 510 | use_readout=use_readout, 511 | enable_attention_hooks=enable_attention_hooks, 512 | ) 513 | 514 | 515 | def _make_pretrained_vitl16_384( 516 | pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False 517 | ): 518 | model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) 519 | 520 | hooks = [5, 11, 17, 23] if hooks == None else hooks 521 | return _make_vit_b16_backbone( 522 | model, 523 | features=[256, 512, 1024, 1024], 524 | hooks=hooks, 525 | vit_features=1024, 526 | use_readout=use_readout, 527 | enable_attention_hooks=enable_attention_hooks, 528 | ) 529 | 530 | 531 | def _make_pretrained_vitb16_384( 532 | pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False 533 | ): 534 | model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) 535 | 536 | hooks = [2, 5, 8, 11] if hooks == None else hooks 537 | return _make_vit_b16_backbone( 538 | model, 539 | features=[96, 192, 384, 768], 540 | hooks=hooks, 541 | use_readout=use_readout, 542 | enable_attention_hooks=enable_attention_hooks, 543 | ) 544 | 545 | 546 | def _make_pretrained_deitb16_384( 547 | pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False 548 | ): 549 | model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained) 550 | 551 | hooks = [2, 5, 8, 11] if hooks == None else hooks 552 | return _make_vit_b16_backbone( 553 | model, 554 | features=[96, 192, 384, 768], 555 | hooks=hooks, 556 | use_readout=use_readout, 557 | enable_attention_hooks=enable_attention_hooks, 558 | ) 559 | 560 | 561 | def _make_pretrained_deitb16_distil_384( 562 | pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False 563 | ): 564 | model = 
timm.create_model( 565 | "vit_deit_base_distilled_patch16_384", pretrained=pretrained 566 | ) 567 | 568 | hooks = [2, 5, 8, 11] if hooks == None else hooks 569 | return _make_vit_b16_backbone( 570 | model, 571 | features=[96, 192, 384, 768], 572 | hooks=hooks, 573 | use_readout=use_readout, 574 | start_index=2, 575 | enable_attention_hooks=enable_attention_hooks, 576 | ) 577 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import argparse 4 | from tqdm import tqdm 5 | from dataloader.kitti import KITTI 6 | from camera_model import CameraModel 7 | from depth_model import DepthModel 8 | from visual_odometry import VisualOdometry 9 | from traj_utils import plot_trajectory, save_trajectory 10 | import torch 11 | 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | "-d", "--data_path", 17 | default=r"dataset\sequences_jpg", 18 | help="path to dataset" 19 | ) 20 | parser.add_argument( 21 | "-s", "--sequence", 22 | default=00, 23 | help="sequence to be evaluated", 24 | ) 25 | parser.add_argument( 26 | "-p", 27 | "--pose_path", 28 | default=r"dataset\poses", 29 | help="path to ground truth poses", 30 | ) 31 | parser.add_argument( 32 | "-m", "--model_weights", 33 | default=None, 34 | help="path to model weights" 35 | ) 36 | parser.add_argument( 37 | "-t", "--model_type", 38 | default="dpt_hybrid_kitti", 39 | help="model type [dpt_large|dpt_hybrid|dpt_hybrid_kitti]", 40 | ) 41 | parser.add_argument( 42 | "-disp", "--display_traj", 43 | default=False, 44 | help="display trajectory during motion estimation if True", 45 | ) 46 | parser.add_argument( 47 | "-seed", "--SEED", 48 | default=2, 49 | help="Random seed (int)", 50 | ) 51 | 52 | parser.add_argument("--kitti_crop", dest="kitti_crop", action="store_true") 53 | parser.add_argument("--absolute_depth", dest="absolute_depth", action="store_true") 54 | parser.add_argument("--optimize", dest="optimize", action="store_true") 55 | parser.add_argument("--no-optimize", dest="optimize", action="store_false") 56 | parser.set_defaults(optimize=True) 57 | parser.set_defaults(kitti_crop=False) 58 | parser.set_defaults(absolute_depth=False) 59 | 60 | args = parser.parse_args() 61 | 62 | # Set random seed 63 | np.random.seed(args.SEED) 64 | torch.cuda.manual_seed(args.SEED) 65 | torch.manual_seed(args.SEED) 66 | 67 | # Create KITTI dataloader 68 | dataloader = KITTI( 69 | data_path=args.data_path, 70 | pose_path=args.pose_path, 71 | sequence=args.sequence, 72 | ) 73 | 74 | # Create camera model object 75 | cam = CameraModel(params=dataloader.cam_params) 76 | 77 | # Create network model to estimate depth 78 | depth_model = DepthModel(model_type=args.model_type) 79 | 80 | # Initialize VO with camera model and depth model 81 | vo = VisualOdometry(cam, depth_model) 82 | 83 | # Initialize graph trajectory 84 | trajectory = 255 + np.zeros((700, 700, 3), dtype=np.uint8) 85 | 86 | # Initialize lists 87 | estimated_trajectory = [] 88 | gt_trajectory = [] 89 | poses = [] 90 | 91 | for _ in tqdm(range(len(dataloader)), desc="Sequence {}: ".format(args.sequence)): 92 | # Get frame, ground truth pose and frame_id from dataset 93 | frame, pose, frame_id = dataloader.get_next_data() 94 | 95 | # Apply VO motion estimation algorithm 96 | vo.update(frame, frame_id) 97 | 98 | # Get estimated translation 99 | estimated_t = vo.t.flatten() 100 | [x, y, z] = 
estimated_t 101 | [x_true, y_true, z_true] = [pose[0], pose[1], pose[2]] 102 | 103 | # Store all estimated poses (4x4) 104 | poses.append(vo.pose) 105 | 106 | # Store trajectories 107 | estimated_trajectory.append(estimated_t) 108 | gt_trajectory.append(pose) 109 | 110 | # Draw trajectory 111 | if args.display_traj: 112 | cv2.circle(trajectory, (int(x)+350, int(-z)+610), 1, (255, 0, 0), 1) 113 | cv2.circle(trajectory, (int(x_true)+350, int(-z_true)+610), 1, (0, 0, 255), 2) 114 | cv2.rectangle(trajectory, (10, 20), (600, 81), (255, 255, 255), -1) # background to display MSE 115 | cv2.putText(trajectory, "Ground truth (RED)", (20, 40), cv2.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255), 1, 8) 116 | cv2.putText(trajectory, "Estimated (BLUE)", (20, 60), cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 0, 0), 1, 8) 117 | # compute and display distance 118 | MSE = np.linalg.norm(np.array([x, z]) - np.array([x_true, z_true])) 119 | cv2.putText(trajectory, "Frobenius Norm: {:.2f}".format(MSE), (20, 80), cv2.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 0), 1, 8) 120 | 121 | cv2.imshow("Camera", frame) 122 | cv2.imshow("Visual Odometry", trajectory) 123 | if cv2.waitKey(1) & 0xFF == ord('q'): 124 | break 125 | 126 | # Save predicted poses 127 | save_trajectory(poses, args.sequence, save_dir="results") 128 | 129 | # Save image map 130 | if args.display_traj: 131 | cv2.imwrite("results/maps/map_{}.png".format(args.sequence), trajectory) 132 | 133 | # Plot estimated trajectory 134 | plot_trajectory(gt_trajectory, estimated_trajectory, 135 | save_name="results/plots/plot_{}.png".format(args.sequence)) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.8.1 2 | torchvision==0.9.1 3 | opencv-python==4.5.2.54 4 | timm==0.4.5 5 | tqdm==4.64.0 6 | matplotlib==3.5.2 7 | sklearn==0.0 8 | numpy==1.23.1 -------------------------------------------------------------------------------- /results/maps/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aofrancani/DPT-VO/aef64207243fbd29ec17e1c108def8ac3e19b68d/results/maps/.placeholder -------------------------------------------------------------------------------- /results/plots/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aofrancani/DPT-VO/aef64207243fbd29ec17e1c108def8ac3e19b68d/results/plots/.placeholder -------------------------------------------------------------------------------- /scale_recovery.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code adapted and modified from DF-VO: 3 | -https://github.com/Huangying-Zhan/DF-VO 4 | """ 5 | 6 | import numpy as np 7 | from sklearn import linear_model 8 | import cv2 9 | 10 | 11 | def image_shape(img): 12 | """Return image shape 13 | 14 | Args: 15 | img (array, [HxWx(c) or HxW]): image 16 | 17 | Returns: 18 | a tuple containing 19 | - **h** (int) : image height 20 | - **w** (int) : image width 21 | - **c** (int) : image channel 22 | """ 23 | if len(img.shape) == 3: 24 | return img.shape 25 | elif len(img.shape) == 2: 26 | h, w = img.shape 27 | return h, w, 1 28 | 29 | 30 | def find_scale_from_depth(cam_intrinsics, kp1, kp2, T_21, depth2, ransac_method="depth_ratio"): 31 | """Compute VO scaling factor for T_21 32 | 33 | Args: 34 | kp1 (array, [Nx2]): reference kp 35 | kp2 (array, [Nx2]): current kp 36 | 
T_21 (array, [4x4]): relative pose; from view 1 to view 2 37 | depth2 (array, [HxW]): depth 2 38 | 39 | Returns: 40 | scale (float): scaling factor 41 | """ 42 | # Triangulation 43 | img_h, img_w, _ = image_shape(depth2) 44 | kp1_norm = kp1.copy() 45 | kp2_norm = kp2.copy() 46 | 47 | kp1_norm[:, 0] = \ 48 | (kp1[:, 0] - cam_intrinsics.cx) / cam_intrinsics.fx 49 | kp1_norm[:, 1] = \ 50 | (kp1[:, 1] - cam_intrinsics.cy) / cam_intrinsics.fy 51 | kp2_norm[:, 0] = \ 52 | (kp2[:, 0] - cam_intrinsics.cx) / cam_intrinsics.fx 53 | kp2_norm[:, 1] = \ 54 | (kp2[:, 1] - cam_intrinsics.cy) / cam_intrinsics.fy 55 | 56 | # triangulation 57 | _, _, X2_tri = triangulation(kp1_norm, kp2_norm, np.eye(4), T_21) 58 | 59 | # Triangulation outlier removal 60 | depth2_tri = convert_sparse3D_to_depth(kp2, X2_tri, img_h, img_w) 61 | depth2_tri[depth2_tri < 0] = 0 62 | # self.timers.end('triangulation') 63 | 64 | # common mask filtering 65 | non_zero_mask_pred2 = (depth2 > 0) 66 | non_zero_mask_tri2 = (depth2_tri > 0) 67 | valid_mask2 = non_zero_mask_pred2 * non_zero_mask_tri2 68 | 69 | depth_pred_non_zero = np.concatenate([depth2[valid_mask2]]) 70 | depth_tri_non_zero = np.concatenate([depth2_tri[valid_mask2]]) 71 | depth_ratio = depth_tri_non_zero / depth_pred_non_zero 72 | 73 | 74 | # Estimate scale (ransac) 75 | if valid_mask2.sum() > 10: 76 | # RANSAC scaling solver 77 | # self.timers.start('scale ransac', 'scale_recovery') 78 | ransac = linear_model.RANSACRegressor( 79 | base_estimator=linear_model.LinearRegression( 80 | fit_intercept=False), 81 | min_samples=3, # minimum number of min_samples 82 | max_trials=100, # maximum number of trials 83 | stop_probability=0.99, # the probability that the algorithm produces a useful result 84 | residual_threshold=0.1, # inlier threshold value 85 | ) 86 | if ransac_method == "depth_ratio": 87 | ransac.fit( 88 | depth_ratio.reshape(-1, 1), 89 | np.ones((depth_ratio.shape[0], 1)) 90 | ) 91 | elif ransac_method == "abs_diff": 92 | ransac.fit( 93 | depth_tri_non_zero.reshape(-1, 1), 94 | depth_pred_non_zero.reshape(-1, 1), 95 | ) 96 | scale = ransac.estimator_.coef_[0, 0] 97 | 98 | else: 99 | scale = -1.0 100 | return scale 101 | 102 | 103 | def triangulation(kp1, kp2, T_1w, T_2w): 104 | """Triangulation to get 3D points 105 | 106 | Args: 107 | kp1 (array, [Nx2]): keypoint in view 1 (normalized) 108 | kp2 (array, [Nx2]): keypoints in view 2 (normalized) 109 | T_1w (array, [4x4]): pose of view 1 w.r.t i.e. T_1w (from w to 1) 110 | T_2w (array, [4x4]): pose of view 2 w.r.t world, i.e. 
T_2w (from w to 2) 111 | 112 | Returns: 113 | a tuple containing 114 | - **X** (array, [3xN]): 3D coordinates of the keypoints w.r.t world coordinate 115 | - **X1** (array, [3xN]): 3D coordinates of the keypoints w.r.t view1 coordinate 116 | - **X2** (array, [3xN]): 3D coordinates of the keypoints w.r.t view2 coordinate 117 | """ 118 | kp1_3D = np.ones((3, kp1.shape[0])) 119 | kp2_3D = np.ones((3, kp2.shape[0])) 120 | kp1_3D[0], kp1_3D[1] = kp1[:, 0].copy(), kp1[:, 1].copy() 121 | kp2_3D[0], kp2_3D[1] = kp2[:, 0].copy(), kp2[:, 1].copy() 122 | X = cv2.triangulatePoints(T_1w[:3], T_2w[:3], kp1_3D[:2], kp2_3D[:2]) 123 | X /= X[3] 124 | X1 = T_1w[:3] @ X 125 | X2 = T_2w[:3] @ X 126 | return X[:3], X1, X2 127 | 128 | 129 | def convert_sparse3D_to_depth(kp, XYZ, height, width): 130 | """Convert sparse 3D keypoint to depth map 131 | 132 | Args: 133 | kp (array, [Nx2]): keypoints 134 | XYZ (array, [3xN]): 3D coorindates for the keypoints 135 | height (int): image height 136 | width (int): image width 137 | 138 | Returns: 139 | depth (array, [HxW]): depth map 140 | """ 141 | # initialize depth map 142 | depth = np.zeros((height, width)) 143 | kp_int = kp.astype(np.int) 144 | 145 | # remove out of region keypoints 146 | y_idx = (kp_int[:, 0] >= 0) * (kp_int[:, 0] < width) 147 | kp_int = kp_int[y_idx] 148 | x_idx = (kp_int[:, 1] >= 0) * (kp_int[:, 1] < height) 149 | kp_int = kp_int[x_idx] 150 | 151 | XYZ = XYZ[:, y_idx] 152 | XYZ = XYZ[:, x_idx] 153 | 154 | depth[kp_int[:, 1], kp_int[:, 0]] = XYZ[2] 155 | return depth 156 | -------------------------------------------------------------------------------- /seq_00.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aofrancani/DPT-VO/aef64207243fbd29ec17e1c108def8ac3e19b68d/seq_00.gif -------------------------------------------------------------------------------- /traj_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_trajectory(gt_poses, estimated_poses, save_name): 6 | """ 7 | Plot estimated and ground truth trajectory. 
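    Example (an illustrative sketch; the two toy trajectories below are hypothetical, only the [x, y, z] ordering of each pose follows this repository):
        gt = [[0.0, 0.0, 0.0], [0.1, 0.0, 0.9], [0.3, 0.0, 1.8]]
        est = [[0.0, 0.0, 0.0], [0.12, 0.0, 0.88], [0.28, 0.0, 1.75]]
        plot_trajectory(gt, est, save_name="results/plots/plot_00.png")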
8 | 9 | Args: 10 | gt_poses {list}: list with ground truth poses of trajectory [x_true, y_true, z_true] 11 | estimated_poses {list}: list with estimated poses of trajectory [x, y, z] 12 | """ 13 | plt.figure() 14 | # Plot estimated trajectory 15 | plt.plot([x[0] for x in estimated_poses], [z[2] for z in estimated_poses], "b") 16 | # Plot ground truth trajectory 17 | plt.plot([x[0] for x in gt_poses], [z[2] for z in gt_poses], "r") 18 | 19 | plt.grid() 20 | plt.title("Visual Odometry") 21 | plt.xlabel("Translation in x direction [m]") 22 | plt.ylabel("Translation in z direction [m]") 23 | plt.legend(["estimated", "ground truth"]) 24 | plt.savefig(save_name) 25 | 26 | 27 | def save_trajectory(poses, sequence, save_dir): 28 | """ 29 | Save predicted poses in .txt file 30 | 31 | Args: 32 | poses {ndarray}: list with all 4x4 pose matrix 33 | sequence {str}: sequence of KITTI dataset 34 | save_dir {str}: path to save pose 35 | """ 36 | # create directory 37 | if not os.path.exists(save_dir): 38 | os.makedirs(save_dir) 39 | 40 | output_filename = os.path.join(save_dir, "{}.txt".format(sequence)) 41 | with open(output_filename, "w") as f: 42 | for pose in poses: 43 | pose = pose.flatten()[:12] 44 | line = " ".join([str(x) for x in pose]) + "\n" 45 | # line = f"{pose[0]:.4f}" + " " + f"{pose[1]:.4f}" + " " + f"{pose[2]:.4f}" + "\n" 46 | f.write(line) 47 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aofrancani/DPT-VO/aef64207243fbd29ec17e1c108def8ac3e19b68d/util/__init__.py -------------------------------------------------------------------------------- /util/gric.py: -------------------------------------------------------------------------------- 1 | '''''' 2 | ''' 3 | @Author: Huangying Zhan (huangying.zhan.work@gmail.com) 4 | @Date: 2020-03-01 5 | @Copyright: Copyright (C) Huangying Zhan 2020. All rights reserved. Please refer to the license file. 
6 | @LastEditTime: 2020-05-27 7 | @LastEditors: Huangying Zhan 8 | @Description: This file contains functions related to GRIC computation 9 | ''' 10 | 11 | import numpy as np 12 | 13 | 14 | def compute_fundamental_residual(F, kp1, kp2): 15 | """ 16 | Compute fundamental matrix residual 17 | 18 | Args: 19 | F (array, [3x3]): Fundamental matrix (from view-1 to view-2) 20 | kp1 (array, [Nx2]): keypoint 1 21 | kp2 (array, [Nx2]): keypoint 2 22 | 23 | Returns: 24 | res (array, [N]): residual 25 | """ 26 | # get homogeneous keypoints (3xN array) 27 | m0 = np.ones((3, kp1.shape[0])) 28 | m0[:2] = np.transpose(kp1, (1, 0)) 29 | m1 = np.ones((3, kp2.shape[0])) 30 | m1[:2] = np.transpose(kp2, (1, 0)) 31 | 32 | Fm0 = F @ m0 # 3xN 33 | Ftm1 = F.T @ m1 # 3xN 34 | 35 | m1Fm0 = (np.transpose(Fm0, (1, 0)) @ m1).diagonal() 36 | res = m1Fm0 ** 2 / (np.sum(Fm0[:2] ** 2, axis=0) + np.sum(Ftm1[:2] ** 2, axis=0)) 37 | return res 38 | 39 | 40 | def compute_homography_residual(H_in, kp1, kp2): 41 | """ 42 | Compute homography matrix residual 43 | 44 | Args: 45 | H (array, [3x3]): homography matrix (Transformation from view-1 to view-2) 46 | kp1 (array, [Nx2]): keypoint 1 47 | kp2 (array, [Nx2]): keypoint 2 48 | 49 | Returns: 50 | res (array, [N]): residual 51 | """ 52 | n = kp1.shape[0] 53 | H = H_in.flatten() 54 | 55 | # get homogeneous keypoints (3xN array) 56 | m0 = np.ones((3, kp1.shape[0])) 57 | m0[:2] = np.transpose(kp1, (1, 0)) 58 | m1 = np.ones((3, kp2.shape[0])) 59 | m1[:2] = np.transpose(kp2, (1, 0)) 60 | 61 | G0 = np.zeros((3, n)) 62 | G1 = np.zeros((3, n)) 63 | 64 | G0[0] = H[0] - m1[0] * H[6] 65 | G0[1] = H[1] - m1[0] * H[7] 66 | G0[2] = -m0[0] * H[6] - m0[1] * H[7] - H[8] 67 | 68 | G1[0] = H[3] - m1[1] * H[6] 69 | G1[1] = H[4] - m1[1] * H[7] 70 | G1[2] = -m0[0] * H[6] - m0[1] * H[7] - H[8] 71 | 72 | magG0 = np.sqrt(G0[0] * G0[0] + G0[1] * G0[1] + G0[2] * G0[2]) 73 | magG1 = np.sqrt(G1[0] * G1[0] + G1[1] * G1[1] + G1[2] * G1[2]) 74 | magG0G1 = G0[0] * G1[0] + G0[1] * G1[1] 75 | 76 | alpha = np.arccos(magG0G1 / (magG0 * magG1)) 77 | 78 | alg = np.zeros((2, n)) 79 | alg[0] = m0[0] * H[0] + m0[1] * H[1] + H[2] - \ 80 | m1[0] * (m0[0] * H[6] + m0[1] * H[7] + H[8]) 81 | 82 | alg[1] = m0[0] * H[3] + m0[1] * H[4] + H[5] - \ 83 | m1[1] * (m0[0] * H[6] + m0[1] * H[7] + H[8]) 84 | 85 | D1 = alg[0] / magG0 86 | D2 = alg[1] / magG1 87 | 88 | res = (D1 * D1 + D2 * D2 - 2.0 * D1 * D2 * np.cos(alpha)) / np.sin(alpha) 89 | 90 | return res 91 | 92 | 93 | def calc_GRIC(res, sigma, n, model): 94 | """Calculate GRIC 95 | 96 | Args: 97 | res (array, [N]): residual 98 | sigma (float): assumed variance of the error 99 | n (int): number of residuals 100 | model (str): model type 101 | - FMat 102 | - EMat 103 | - HMat 104 | """ 105 | R = 4 106 | sigmasq1 = 1. / sigma ** 2 107 | 108 | K = { 109 | "FMat": 7, 110 | "EMat": 5, 111 | "HMat": 8, 112 | }[model] 113 | D = { 114 | "FMat": 3, 115 | "EMat": 3, 116 | "HMat": 2, 117 | }[model] 118 | 119 | lam3RD = 2.0 * (R - D) 120 | 121 | sum_ = 0 122 | for i in range(n): 123 | tmp = res[i] * sigmasq1 124 | if tmp <= lam3RD: 125 | sum_ += tmp 126 | else: 127 | sum_ += lam3RD 128 | 129 | sum_ += n * D * np.log(R) + K * np.log(R * n) 130 | 131 | return sum_ 132 | -------------------------------------------------------------------------------- /util/io.py: -------------------------------------------------------------------------------- 1 | """Utils for monoDepth. 
2 | """ 3 | import sys 4 | import re 5 | import numpy as np 6 | import cv2 7 | import torch 8 | import matplotlib as mpl 9 | 10 | from PIL import Image 11 | 12 | 13 | from .pallete import get_mask_pallete 14 | 15 | def read_pfm(path): 16 | """Read pfm file. 17 | 18 | Args: 19 | path (str): path to file 20 | 21 | Returns: 22 | tuple: (data, scale) 23 | """ 24 | with open(path, "rb") as file: 25 | 26 | color = None 27 | width = None 28 | height = None 29 | scale = None 30 | endian = None 31 | 32 | header = file.readline().rstrip() 33 | if header.decode("ascii") == "PF": 34 | color = True 35 | elif header.decode("ascii") == "Pf": 36 | color = False 37 | else: 38 | raise Exception("Not a PFM file: " + path) 39 | 40 | dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) 41 | if dim_match: 42 | width, height = list(map(int, dim_match.groups())) 43 | else: 44 | raise Exception("Malformed PFM header.") 45 | 46 | scale = float(file.readline().decode("ascii").rstrip()) 47 | if scale < 0: 48 | # little-endian 49 | endian = "<" 50 | scale = -scale 51 | else: 52 | # big-endian 53 | endian = ">" 54 | 55 | data = np.fromfile(file, endian + "f") 56 | shape = (height, width, 3) if color else (height, width) 57 | 58 | data = np.reshape(data, shape) 59 | data = np.flipud(data) 60 | 61 | return data, scale 62 | 63 | 64 | def write_pfm(path, image, scale=1): 65 | """Write pfm file. 66 | 67 | Args: 68 | path (str): pathto file 69 | image (array): data 70 | scale (int, optional): Scale. Defaults to 1. 71 | """ 72 | 73 | with open(path, "wb") as file: 74 | color = None 75 | 76 | if image.dtype.name != "float32": 77 | raise Exception("Image dtype must be float32.") 78 | 79 | image = np.flipud(image) 80 | 81 | if len(image.shape) == 3 and image.shape[2] == 3: # color image 82 | color = True 83 | elif ( 84 | len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 85 | ): # greyscale 86 | color = False 87 | else: 88 | raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") 89 | 90 | file.write("PF\n" if color else "Pf\n".encode()) 91 | file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) 92 | 93 | endian = image.dtype.byteorder 94 | 95 | if endian == "<" or endian == "=" and sys.byteorder == "little": 96 | scale = -scale 97 | 98 | file.write("%f\n".encode() % scale) 99 | 100 | image.tofile(file) 101 | 102 | 103 | def read_image(path): 104 | """Read image and output RGB image (0-1). 105 | 106 | Args: 107 | path (str): path to file 108 | 109 | Returns: 110 | array: RGB image (0-1) 111 | """ 112 | img = cv2.imread(path) 113 | 114 | if img.ndim == 2: 115 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) 116 | 117 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 118 | 119 | return img 120 | 121 | 122 | def resize_image(img): 123 | """Resize image and make it fit for network. 
124 | 125 | Args: 126 | img (array): image 127 | 128 | Returns: 129 | tensor: data ready for network 130 | """ 131 | height_orig = img.shape[0] 132 | width_orig = img.shape[1] 133 | 134 | if width_orig > height_orig: 135 | scale = width_orig / 384 136 | else: 137 | scale = height_orig / 384 138 | 139 | height = (np.ceil(height_orig / scale / 32) * 32).astype(int) 140 | width = (np.ceil(width_orig / scale / 32) * 32).astype(int) 141 | 142 | img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) 143 | 144 | img_resized = ( 145 | torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() 146 | ) 147 | img_resized = img_resized.unsqueeze(0) 148 | 149 | return img_resized 150 | 151 | 152 | def resize_depth(depth, width, height): 153 | """Resize depth map and bring to CPU (numpy). 154 | 155 | Args: 156 | depth (tensor): depth 157 | width (int): image width 158 | height (int): image height 159 | 160 | Returns: 161 | array: processed depth 162 | """ 163 | depth = torch.squeeze(depth[0, :, :, :]).to("cpu") 164 | 165 | depth_resized = cv2.resize( 166 | depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC 167 | ) 168 | 169 | return depth_resized 170 | 171 | 172 | def write_depth(path, depth, bits=1, absolute_depth=False): 173 | """Write depth map to pfm and png file. 174 | 175 | Args: 176 | path (str): filepath without extension 177 | depth (array): depth 178 | """ 179 | write_pfm(path + ".pfm", depth.astype(np.float32)) 180 | 181 | if absolute_depth: 182 | out = depth 183 | else: 184 | depth_min = depth.min() 185 | depth_max = depth.max() 186 | 187 | max_val = (2 ** (8 * bits)) - 1 188 | 189 | if depth_max - depth_min > np.finfo("float").eps: 190 | out = max_val * (depth - depth_min) / (depth_max - depth_min) 191 | else: 192 | out = np.zeros(depth.shape, dtype=depth.dtype) 193 | 194 | if bits == 1: 195 | cv2.imwrite(path + ".png", out.astype("uint8"), [cv2.IMWRITE_PNG_COMPRESSION, 0]) 196 | elif bits == 2: 197 | # out = 1 / (out + 1e-3) 198 | # out[out == 0] = 0 199 | # vmax = np.percentile(out, 90) 200 | # normalizer = mpl.colors.Normalize(vmin=0, vmax=90) 201 | # normalizer = mpl.colors.Normalize(vmin=0, vmax=90) 202 | # mapper = mpl.cm.ScalarMappable(norm=normalizer, cmap='magma') 203 | # mapper = mpl.cm.ScalarMappable(cmap='magma') 204 | # out = (mapper.to_rgba(out)[:, :, :3]*255).astype(np.uint8) 205 | cv2.imwrite(path + ".png", cv2.cvtColor(out.astype("uint8"), cv2.COLOR_RGB2BGR), [cv2.IMWRITE_PNG_COMPRESSION, 0]) 206 | 207 | # cv2.imwrite(path + ".png", out.astype("uint16"), [cv2.IMWRITE_PNG_COMPRESSION, 0]) 208 | 209 | return 210 | 211 | 212 | def write_segm_img(path, image, labels, palette="detail", alpha=0.5): 213 | """Write depth map to pfm and png file. 
214 | 215 | Args: 216 | path (str): filepath without extension 217 | image (array): input image 218 | labels (array): labeling of the image 219 | """ 220 | 221 | mask = get_mask_pallete(labels, "ade20k") 222 | 223 | img = Image.fromarray(np.uint8(255*image)).convert("RGBA") 224 | seg = mask.convert("RGBA") 225 | 226 | out = Image.blend(img, seg, alpha) 227 | 228 | out.save(path + ".png") 229 | 230 | return 231 | -------------------------------------------------------------------------------- /util/misc.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | from dpt.vit import get_mean_attention_map 4 | 5 | def visualize_attention(input, model, prediction, model_type): 6 | input = (input + 1.0)/2.0 7 | 8 | attn1 = model.pretrained.attention["attn_1"] 9 | attn2 = model.pretrained.attention["attn_2"] 10 | attn3 = model.pretrained.attention["attn_3"] 11 | attn4 = model.pretrained.attention["attn_4"] 12 | 13 | plt.subplot(3,4,1), plt.imshow(input.squeeze().permute(1,2,0)), plt.title("Input", fontsize=8), plt.axis("off") 14 | plt.subplot(3,4,2), plt.imshow(prediction), plt.set_cmap("inferno"), plt.title("Prediction", fontsize=8), plt.axis("off") 15 | 16 | if model_type == "dpt_hybrid": 17 | h = [3,6,9,12] 18 | else: 19 | h = [6,12,18,24] 20 | 21 | # upper left 22 | plt.subplot(345), 23 | ax1 = plt.imshow(get_mean_attention_map(attn1, 1, input.shape)) 24 | plt.ylabel("Upper left corner", fontsize=8) 25 | plt.title(f"Layer {h[0]}", fontsize=8) 26 | gc = plt.gca() 27 | gc.axes.xaxis.set_ticklabels([]) 28 | gc.axes.yaxis.set_ticklabels([]) 29 | gc.axes.xaxis.set_ticks([]) 30 | gc.axes.yaxis.set_ticks([]) 31 | 32 | 33 | plt.subplot(346), 34 | plt.imshow(get_mean_attention_map(attn2, 1, input.shape)) 35 | plt.title(f"Layer {h[1]}", fontsize=8) 36 | plt.axis("off"), 37 | 38 | plt.subplot(347), 39 | plt.imshow(get_mean_attention_map(attn3, 1, input.shape)) 40 | plt.title(f"Layer {h[2]}", fontsize=8) 41 | plt.axis("off"), 42 | 43 | 44 | plt.subplot(348), 45 | plt.imshow(get_mean_attention_map(attn4, 1, input.shape)) 46 | plt.title(f"Layer {h[3]}", fontsize=8) 47 | plt.axis("off"), 48 | 49 | 50 | # lower right 51 | plt.subplot(3,4,9), plt.imshow(get_mean_attention_map(attn1, -1, input.shape)) 52 | plt.ylabel("Lower right corner", fontsize=8) 53 | gc = plt.gca() 54 | gc.axes.xaxis.set_ticklabels([]) 55 | gc.axes.yaxis.set_ticklabels([]) 56 | gc.axes.xaxis.set_ticks([]) 57 | gc.axes.yaxis.set_ticks([]) 58 | 59 | plt.subplot(3,4,10), plt.imshow(get_mean_attention_map(attn2, -1, input.shape)), plt.axis("off") 60 | plt.subplot(3,4,11), plt.imshow(get_mean_attention_map(attn3, -1, input.shape)), plt.axis("off") 61 | plt.subplot(3,4,12), plt.imshow(get_mean_attention_map(attn4, -1, input.shape)), plt.axis("off") 62 | plt.tight_layout() 63 | plt.show() 64 | -------------------------------------------------------------------------------- /util/pallete.py: -------------------------------------------------------------------------------- 1 | ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 | ## Created by: Hang Zhang 3 | ## ECE Department, Rutgers University 4 | ## Email: zhang.hang@rutgers.edu 5 | ## Copyright (c) 2017 6 | ## 7 | ## This source code is licensed under the MIT-style license found in the 8 | ## LICENSE file in the root directory of this source tree 9 | ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 10 | 11 | from PIL import Image 12 | 13 | def get_mask_pallete(npimg, 
dataset='detail'): 14 | """Get image color pallete for visualizing masks""" 15 | # recovery boundary 16 | if dataset == 'pascal_voc': 17 | npimg[npimg==21] = 255 18 | # put colormap 19 | out_img = Image.fromarray(npimg.squeeze().astype('uint8')) 20 | if dataset == 'ade20k': 21 | out_img.putpalette(adepallete) 22 | elif dataset == 'citys': 23 | out_img.putpalette(citypallete) 24 | elif dataset in ('detail', 'pascal_voc', 'pascal_aug'): 25 | out_img.putpalette(vocpallete) 26 | return out_img 27 | 28 | def _get_voc_pallete(num_cls): 29 | n = num_cls 30 | pallete = [0]*(n*3) 31 | for j in range(0,n): 32 | lab = j 33 | pallete[j*3+0] = 0 34 | pallete[j*3+1] = 0 35 | pallete[j*3+2] = 0 36 | i = 0 37 | while (lab > 0): 38 | pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i)) 39 | pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i)) 40 | pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i)) 41 | i = i + 1 42 | lab >>= 3 43 | return pallete 44 | 45 | vocpallete = _get_voc_pallete(256) 46 | 47 | adepallete = [0,0,0,120,120,120,180,120,120,6,230,230,80,50,50,4,200,3,120,120,80,140,140,140,204,5,255,230,230,230,4,250,7,224,5,255,235,255,7,150,5,61,120,120,70,8,255,51,255,6,82,143,255,140,204,255,4,255,51,7,204,70,3,0,102,200,61,230,250,255,6,51,11,102,255,255,7,71,255,9,224,9,7,230,220,220,220,255,9,92,112,9,255,8,255,214,7,255,224,255,184,6,10,255,71,255,41,10,7,255,255,224,255,8,102,8,255,255,61,6,255,194,7,255,122,8,0,255,20,255,8,41,255,5,153,6,51,255,235,12,255,160,150,20,0,163,255,140,140,140,250,10,15,20,255,0,31,255,0,255,31,0,255,224,0,153,255,0,0,0,255,255,71,0,0,235,255,0,173,255,31,0,255,11,200,200,255,82,0,0,255,245,0,61,255,0,255,112,0,255,133,255,0,0,255,163,0,255,102,0,194,255,0,0,143,255,51,255,0,0,82,255,0,255,41,0,255,173,10,0,255,173,255,0,0,255,153,255,92,0,255,0,255,255,0,245,255,0,102,255,173,0,255,0,20,255,184,184,0,31,255,0,255,61,0,71,255,255,0,204,0,255,194,0,255,82,0,10,255,0,112,255,51,0,255,0,194,255,0,122,255,0,255,163,255,153,0,0,255,10,255,112,0,143,255,0,82,0,255,163,255,0,255,235,0,8,184,170,133,0,255,0,255,92,184,0,255,255,0,31,0,184,255,0,214,255,255,0,112,92,255,0,0,224,255,112,224,255,70,184,160,163,0,255,153,0,255,71,255,0,255,0,163,255,204,0,255,0,143,0,255,235,133,255,0,255,0,235,245,0,255,255,0,122,255,245,0,10,190,212,214,255,0,0,204,255,20,0,255,255,255,0,0,153,255,0,41,255,0,255,204,41,0,255,41,255,0,173,0,255,0,245,255,71,0,255,122,0,255,0,255,184,0,92,255,184,255,0,0,133,255,255,214,0,25,194,194,102,255,0,92,0,255] 48 | 49 | citypallete = [ 50 | 
128,64,128,244,35,232,70,70,70,102,102,156,190,153,153,153,153,153,250,170,30,220,220,0,107,142,35,152,251,152,70,130,180,220,20,60,255,0,0,0,0,142,0,0,70,0,60,100,0,80,100,0,0,230,119,11,32,128,192,0,0,64,128,128,64,128,0,192,128,128,192,128,64,64,0,192,64,0,64,192,0,192,192,0,64,64,128,192,64,128,64,192,128,192,192,128,0,0,64,128,0,64,0,128,64,128,128,64,0,0,192,128,0,192,0,128,192,128,128,192,64,0,64,192,0,64,64,128,64,192,128,64,64,0,192,192,0,192,64,128,192,192,128,192,0,64,64,128,64,64,0,192,64,128,192,64,0,64,192,128,64,192,0,192,192,128,192,192,64,64,64,192,64,64,64,192,64,192,192,64,64,64,192,192,64,192,64,192,192,192,192,192,32,0,0,160,0,0,32,128,0,160,128,0,32,0,128,160,0,128,32,128,128,160,128,128,96,0,0,224,0,0,96,128,0,224,128,0,96,0,128,224,0,128,96,128,128,224,128,128,32,64,0,160,64,0,32,192,0,160,192,0,32,64,128,160,64,128,32,192,128,160,192,128,96,64,0,224,64,0,96,192,0,224,192,0,96,64,128,224,64,128,96,192,128,224,192,128,32,0,64,160,0,64,32,128,64,160,128,64,32,0,192,160,0,192,32,128,192,160,128,192,96,0,64,224,0,64,96,128,64,224,128,64,96,0,192,224,0,192,96,128,192,224,128,192,32,64,64,160,64,64,32,192,64,160,192,64,32,64,192,160,64,192,32,192,192,160,192,192,96,64,64,224,64,64,96,192,64,224,192,64,96,64,192,224,64,192,96,192,192,224,192,192,0,32,0,128,32,0,0,160,0,128,160,0,0,32,128,128,32,128,0,160,128,128,160,128,64,32,0,192,32,0,64,160,0,192,160,0,64,32,128,192,32,128,64,160,128,192,160,128,0,96,0,128,96,0,0,224,0,128,224,0,0,96,128,128,96,128,0,224,128,128,224,128,64,96,0,192,96,0,64,224,0,192,224,0,64,96,128,192,96,128,64,224,128,192,224,128,0,32,64,128,32,64,0,160,64,128,160,64,0,32,192,128,32,192,0,160,192,128,160,192,64,32,64,192,32,64,64,160,64,192,160,64,64,32,192,192,32,192,64,160,192,192,160,192,0,96,64,128,96,64,0,224,64,128,224,64,0,96,192,128,96,192,0,224,192,128,224,192,64,96,64,192,96,64,64,224,64,192,224,64,64,96,192,192,96,192,64,224,192,192,224,192,32,32,0,160,32,0,32,160,0,160,160,0,32,32,128,160,32,128,32,160,128,160,160,128,96,32,0,224,32,0,96,160,0,224,160,0,96,32,128,224,32,128,96,160,128,224,160,128,32,96,0,160,96,0,32,224,0,160,224,0,32,96,128,160,96,128,32,224,128,160,224,128,96,96,0,224,96,0,96,224,0,224,224,0,96,96,128,224,96,128,96,224,128,224,224,128,32,32,64,160,32,64,32,160,64,160,160,64,32,32,192,160,32,192,32,160,192,160,160,192,96,32,64,224,32,64,96,160,64,224,160,64,96,32,192,224,32,192,96,160,192,224,160,192,32,96,64,160,96,64,32,224,64,160,224,64,32,96,192,160,96,192,32,224,192,160,224,192,96,96,64,224,96,64,96,224,64,224,224,64,96,96,192,224,96,192,96,224,192,0,0,0] 51 | -------------------------------------------------------------------------------- /util/png_to_jpg.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from PIL import Image 4 | from tqdm import tqdm 5 | 6 | """ Convert png to jpg images for KITTI dataset 7 | 8 | Data structure: 9 | |---dataset 10 | |---sequences 11 | |---00 12 | |---image_0 13 | |---000000.png 14 | |---000001.png 15 | |---... 16 | |---image_1 17 | |... 18 | |---image_2 19 | |---... 20 | |---image_3 21 | |---... 22 | |---01 23 | |---... 24 | """ 25 | 26 | dataset_dir = f"..\dataset\sequences" 27 | new_root_dir = f"..\dataset\sequences_jpg" 28 | image_nb = 0 # number of image folder (image_0, image_1, ... image_3) 29 | 30 | # create new directory 31 | if not os.path.exists(new_root_dir): 32 | os.makedirs(new_root_dir) 33 | 34 | for seq_nb in range(22): # list all sequences ["00", ... 
"21"] 35 | # sequence as 2-digit string 36 | seq = "{:02d}".format(seq_nb) 37 | 38 | # create seq in save directory 39 | seq_dir = os.path.join(new_root_dir, seq) 40 | if not os.path.exists(seq_dir): 41 | os.makedirs(seq_dir) 42 | 43 | # create image_nb directory 44 | img_nb_dir = os.path.join(seq_dir, "image_{}".format(image_nb)) 45 | if not os.path.exists(img_nb_dir): 46 | os.makedirs(img_nb_dir) 47 | 48 | images_list = glob(os.path.join(dataset_dir, seq, "image_{}".format(image_nb), "*")) # paths to png images in seq 49 | 50 | for i in tqdm(range(len(images_list)), desc="Sequence {}: ".format(seq)): 51 | img_path = images_list[i] 52 | img_name = os.path.basename(img_path) 53 | 54 | # read png image and convert to jpg using PIL 55 | img = Image.open(img_path) 56 | save_dir = os.path.join(img_nb_dir, img_name.replace(".png", ".jpg")) 57 | img.save(save_dir) 58 | -------------------------------------------------------------------------------- /visual_odometry.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from util.gric import * 4 | from scale_recovery import find_scale_from_depth 5 | 6 | 7 | class VisualOdometry: 8 | """ 9 | Monocular Visual Odometry: 10 | 1) Capture new frame I_k 11 | 2) Extract and match features between I_{k-1} and I_k 12 | 3) Compute essential matrix for image pair I_{k-1}, I_k 13 | 4) Decompose essential matrix into R_k and t_k, and form T_k 14 | 5) Compute relative scale and rescale tk accordingly 15 | 6) Concatenate transformation by computing Ck ¼ Ck1Tk 16 | 7) Repeat from 1). 17 | 18 | Main theory source: 19 | D. Scaramuzza and F. Fraundorfer, "Visual Odometry [Tutorial]" 20 | https://rpg.ifi.uzh.ch/visual_odometry_tutorial.html 21 | 22 | Code ref.: 23 | https://github.com/uoip/monoVO-python 24 | """ 25 | 26 | def __init__(self, cam, depth_model): 27 | self.cam = cam # camera object 28 | self.prev_frame = None # previous frame 29 | self.feat_ref = None # reference features (first frame) 30 | self.feat_curr = None # features from current image 31 | self.detector_method = "FAST" # detector method 32 | self.matching_method = "OF_PyrLK" # feature matching method 33 | self.min_num_features = 2500 34 | self.R = np.eye(3) 35 | self.t = np.zeros((3, 1)) 36 | self.depth_model = depth_model 37 | self.prev_depth = None 38 | self.pose = np.zeros((4, 4)) # pose matrix [R | t; 0 1] 39 | 40 | def detect_features(self, frame): 41 | """ 42 | Point-feature detector: search for salient keypoints that are likely to match well in other image frames. 43 | - corner detectors: Moravec, Forstner, Harris, Shi-Tomasi, and FAST. 44 | - blob detectors: SIFT, SURF, and CENSURE. 45 | 46 | Args: 47 | frame {ndarray}: frame to be processed 48 | """ 49 | if self.detector_method == "FAST": 50 | detector = cv2.FastFeatureDetector_create() # threshold=25, nonmaxSuppression=True) 51 | return detector.detect(frame) 52 | 53 | elif self.detector_method == "ORB": 54 | detector = cv2.ORB_create(nfeatures=2000) 55 | kp1, des1 = detector.detectAndCompute(frame, None) 56 | return kp1 57 | 58 | def feature_matching(self, frame): 59 | """ 60 | The feature-matching: looks for corresponding features in other images. 
61 | 62 | Args: 63 | frame {ndarray}: frame to be processed 64 | """ 65 | if self.matching_method == "OF_PyrLK": 66 | # Calculate optical flow for a sparse feature set using the iterative Lucas-Kanade method with pyramids 67 | kp2, st, err = cv2.calcOpticalFlowPyrLK(self.prev_frame, frame, 68 | self.feat_ref, None, 69 | winSize=(21, 21), 70 | criteria=( 71 | cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 30, 0.01)) 72 | 73 | st = st.reshape(st.shape[0]) # status of points from frame to frame 74 | # Keypoints 75 | kp1 = self.feat_ref[st == 1] 76 | kp2 = kp2[st == 1] 77 | return kp1, kp2 78 | 79 | def motion_estimation_init(self, frame): 80 | """ 81 | Processes first frame to initialize the reference features and the matrix R and t. 82 | Only for frame_id == 0. 83 | 84 | Args: 85 | frame {ndarray}: frame to be processed 86 | frame_id {int}: integer corresponding to the frame id 87 | """ 88 | feat_ref = self.detect_features(frame) 89 | self.feat_ref = np.array([x.pt for x in feat_ref], dtype=np.float32) 90 | 91 | def motion_estimation(self, frame, frame_id): 92 | """ 93 | Estimates the motion from current frame and computed keypoints 94 | 95 | Args: 96 | frame {ndarray}: frame to be processed 97 | frame_id {int}: integer corresponding to the frame id 98 | pose {list}: list with ground truth pose [x, y, z] 99 | """ 100 | self.feat_ref, self.feat_curr = self.feature_matching(frame) 101 | # # Estimating an essential matrix (E): geometric relations between two images 102 | R, t = compute_pose_2d2d(self.feat_ref, self.feat_curr, self.cam) 103 | 104 | # estimate depth from a single image frame 105 | depth = self.depth_model.compute_depth(frame) 106 | depth = preprocess_depth(depth, crop=[[0.3, 1], [0, 1]], depth_range=[0, 50]) 107 | 108 | if frame_id == 1: 109 | self.R = R 110 | self.t = t 111 | 112 | else: 113 | E_pose = np.eye(4) 114 | E_pose[:3, :3] = R 115 | E_pose[: 3, 3:] = t 116 | 117 | # estimate scale 118 | scale = find_scale_from_depth( 119 | self.cam, 120 | self.feat_ref, 121 | self.feat_curr, 122 | np.linalg.inv(E_pose), 123 | depth 124 | ) 125 | 126 | if np.linalg.norm(t) == 0 or scale == -1.0: 127 | R, t = compute_pose_3d2d( 128 | self.feat_ref, 129 | self.feat_curr, 130 | self.prev_depth, 131 | self.cam 132 | ) # pose: from cur->ref 133 | scale = 1.0 134 | 135 | # estimate camera motion 136 | self.t = self.t + scale * self.R.dot(t) 137 | self.R = self.R.dot(R) 138 | 139 | self.prev_depth = depth 140 | 141 | # check if number of features is enough (some features are lost in time due to the moving scene) 142 | if self.feat_ref.shape[0] < self.min_num_features: 143 | self.feat_curr = self.detect_features(frame) 144 | self.feat_curr = np.array([x.pt for x in self.feat_curr], dtype=np.float32) 145 | 146 | # update reference features 147 | self.feat_ref = self.feat_curr 148 | 149 | def update(self, frame, frame_id): 150 | """ 151 | Computes the camera motion between the current image and the previous one. 
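    Illustrative usage (a minimal sketch mirroring the loop in main.py; cam, depth_model and dataloader are assumed to be constructed as in that script):
        vo = VisualOdometry(cam, depth_model)
        for _ in range(len(dataloader)):
            frame, gt_pose, frame_id = dataloader.get_next_data()
            vo.update(frame, frame_id)
            estimated_t = vo.t.flatten()  # current translation [x, y, z]
            current_pose = vo.pose        # 4x4 pose matrix [R | t; 0 1]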
152 | """ 153 | # Process first frame to get reference features 154 | if frame_id == 0: 155 | self.motion_estimation_init(frame) 156 | else: 157 | self.motion_estimation(frame, frame_id) 158 | 159 | self.prev_frame = frame 160 | 161 | # Pose matrix 162 | pose = np.eye(4) 163 | pose[:3, :3] = self.R 164 | pose[: 3, 3:] = self.t 165 | self.pose = pose 166 | 167 | 168 | def preprocess_depth(depth, crop, depth_range): 169 | """ 170 | Preprocess depth map with cropping and capping range 171 | Code adapted from DF-VO: 172 | -https://github.com/Huangying-Zhan/DF-VO 173 | 174 | Args: 175 | depth (array, [HxW]): depth map 176 | crop (list): normalized crop regions [[y0, y1], [x0, x1]]. non-cropped regions set to 0. 177 | depth_range (list): a list with float numbers [min_depth, max_depth] 178 | 179 | Returns: 180 | depth (array, [HxW]): processed depth map 181 | """ 182 | # normalize depth 183 | # depth_min = depth.min() 184 | # depth_max = depth.max() 185 | # max_val = (2 ** (8 * 1)) - 1 186 | # if depth_max - depth_min > np.finfo("float").eps: 187 | # depth = max_val * (depth - depth_min) / (depth_max - depth_min) 188 | # else: 189 | # depth = np.zeros(depth.shape, dtype=depth.dtype) 190 | 191 | # print("depth_max: ", depth.max()) 192 | # print("depth_min: ", depth.min()) 193 | 194 | # set cropping region 195 | min_depth, max_depth = depth_range 196 | h, w = depth.shape 197 | y0, y1 = int(h * crop[0][0]), int(h * crop[0][1]) 198 | x0, x1 = int(w * crop[1][0]), int(w * crop[1][1]) 199 | depth_mask = np.zeros((h, w)) 200 | depth_mask[y0:y1, x0:x1] = 1 201 | 202 | # set range mask 203 | depth_range_mask = (depth < max_depth) * (depth > min_depth) 204 | 205 | # set invalid pixel to zero depth 206 | valid_mask = depth_mask * depth_range_mask 207 | depth = depth * valid_mask 208 | return depth 209 | 210 | 211 | def compute_pose_3d2d(kp1, kp2, depth_1, cam_intrinsics): 212 | """ 213 | Compute pose from 3d-2d correspondences 214 | Code adapted from DF-VO: 215 | -https://github.com/Huangying-Zhan/DF-VO 216 | 217 | Args: 218 | cam_intrinsics: camera intrinsics 219 | kp1 (array, [Nx2]): keypoints for view-1 220 | kp2 (array, [Nx2]): keypoints for view-2 221 | depth_1 (array, [HxW]): depths for view-1 222 | 223 | Returns: 224 | R (array, [3x3]): rotation matrix 225 | t (array, [3x1]): translation vector 226 | """ 227 | max_depth = 50 228 | min_depth = 0 229 | 230 | outputs = {} 231 | height, width = depth_1.shape 232 | 233 | # Filter keypoints outside image region 234 | x_idx = (kp1[:, 0] >= 0) * (kp1[:, 0] < width) 235 | kp1 = kp1[x_idx] 236 | kp2 = kp2[x_idx] 237 | x_idx = (kp2[:, 0] >= 0) * (kp2[:, 0] < width) 238 | kp1 = kp1[x_idx] 239 | kp2 = kp2[x_idx] 240 | y_idx = (kp1[:, 1] >= 0) * (kp1[:, 1] < height) 241 | kp1 = kp1[y_idx] 242 | kp2 = kp2[y_idx] 243 | y_idx = (kp2[:, 1] >= 0) * (kp2[:, 1] < height) 244 | kp1 = kp1[y_idx] 245 | kp2 = kp2[y_idx] 246 | 247 | # Filter keypoints outside depth range 248 | kp1_int = kp1.astype(np.int) 249 | kp_depths = depth_1[kp1_int[:, 1], kp1_int[:, 0]] 250 | non_zero_mask = (kp_depths != 0) 251 | depth_range_mask = (kp_depths < max_depth) * (kp_depths > min_depth) 252 | valid_kp_mask = non_zero_mask * depth_range_mask 253 | 254 | kp1 = kp1[valid_kp_mask] 255 | kp2 = kp2[valid_kp_mask] 256 | 257 | # Get 3D coordinates for kp1 258 | XYZ_kp1 = unprojection_kp(kp1, kp_depths[valid_kp_mask], cam_intrinsics) 259 | 260 | # initialize ransac setup 261 | best_rt = [] 262 | best_inlier = 0 263 | max_ransac_iter = 3 264 | 265 | for _ in range(max_ransac_iter): 266 | # shuffle 
kp (only useful when random seed is fixed) 267 | new_list = np.arange(0, kp2.shape[0], 1) 268 | np.random.shuffle(new_list) 269 | new_XYZ = XYZ_kp1.copy()[new_list] 270 | new_kp2 = kp2.copy()[new_list] 271 | 272 | if new_kp2.shape[0] > 4: 273 | # PnP solver 274 | flag, r, t, inlier = cv2.solvePnPRansac( 275 | objectPoints=new_XYZ, 276 | imagePoints=new_kp2, 277 | cameraMatrix=cam_intrinsics.mat, 278 | distCoeffs=None, 279 | iterationsCount=100, # number of iteration 280 | reprojectionError=1, # inlier threshold value 281 | ) 282 | 283 | # save best pose estimation 284 | if flag and inlier.shape[0] > best_inlier: 285 | best_rt = [r, t] 286 | best_inlier = inlier.shape[0] 287 | 288 | # format pose 289 | R = np.eye(3) 290 | t = np.zeros((3, 1)) 291 | if len(best_rt) != 0: 292 | r, t = best_rt 293 | R = cv2.Rodrigues(r)[0] 294 | E_pose = np.eye(4) 295 | E_pose[:3, :3] = R 296 | E_pose[: 3, 3:] = t 297 | E_pose = np.linalg.inv(E_pose) 298 | R = E_pose[:3, :3] 299 | t = E_pose[: 3, 3:] 300 | return R, t 301 | 302 | 303 | def unprojection_kp(kp, kp_depth, cam_intrinsics): 304 | """ 305 | Convert kp to XYZ 306 | Code from DF-VO: 307 | -https://github.com/Huangying-Zhan/DF-VO 308 | 309 | Args: 310 | kp (array, [Nx2]): [x, y] keypoints 311 | kp_depth (array, [Nx2]): keypoint depth 312 | cam_intrinsics (Intrinsics): camera intrinsics 313 | 314 | Returns: 315 | XYZ (array, [Nx3]): 3D coordinates 316 | """ 317 | N = kp.shape[0] 318 | # initialize regular grid 319 | XYZ = np.ones((N, 3, 1)) 320 | XYZ[:, :2, 0] = kp 321 | 322 | inv_K = np.ones((1, 3, 3)) 323 | inv_K[0] = np.linalg.inv(cam_intrinsics.mat) # cam_intrinsics.inv_mat 324 | inv_K = np.repeat(inv_K, N, axis=0) 325 | 326 | XYZ = np.matmul(inv_K, XYZ)[:, :, 0] 327 | XYZ[:, 0] = XYZ[:, 0] * kp_depth 328 | XYZ[:, 1] = XYZ[:, 1] * kp_depth 329 | XYZ[:, 2] = XYZ[:, 2] * kp_depth 330 | return XYZ 331 | 332 | 333 | def compute_pose_2d2d(kp_ref, kp_cur, cam_intrinsics): 334 | """ 335 | Compute the pose from view2 to view1 336 | Code adapted from DF-VO: 337 | -https://github.com/Huangying-Zhan/DF-VO 338 | 339 | Args: 340 | kp_ref (array, [Nx2]): keypoints for reference view 341 | kp_cur (array, [Nx2]): keypoints for current view 342 | cam_intrinsics (Intrinsics): camera intrinsics 343 | is_iterative (bool): is iterative stage 344 | 345 | Returns: 346 | a dictionary containing 347 | - **pose** (SE3): relative pose from current to reference view 348 | - **best_inliers** (array, [N]): boolean inlier mask 349 | """ 350 | principal_points = (cam_intrinsics.cx, cam_intrinsics.cy) 351 | 352 | # initialize ransac setup 353 | R = np.eye(3) 354 | t = np.zeros((3, 1)) 355 | best_Rt = [R, t] 356 | best_inlier_cnt = 0 357 | max_ransac_iter = 3 358 | best_inliers = np.ones((kp_ref.shape[0], 1)) == 1 359 | 360 | # method GRIC of validating E-tracker 361 | if kp_cur.shape[0] > 10: 362 | H, H_inliers = cv2.findHomography( 363 | kp_cur, 364 | kp_ref, 365 | method=cv2.RANSAC, 366 | confidence=0.99, 367 | ransacReprojThreshold=1, 368 | ) 369 | 370 | H_res = compute_homography_residual(H, kp_cur, kp_ref) 371 | H_gric = calc_GRIC( 372 | res=H_res, 373 | sigma=0.8, 374 | n=kp_cur.shape[0], 375 | model="HMat" 376 | ) 377 | valid_case = True 378 | else: 379 | valid_case = False 380 | 381 | if valid_case: 382 | num_valid_case = 0 383 | for i in range(max_ransac_iter): # repeat ransac for several times for stable result 384 | # shuffle kp_cur and kp_ref (only useful when random seed is fixed) 385 | new_list = np.arange(0, kp_cur.shape[0], 1) 386 | np.random.shuffle(new_list) 387 | 
new_kp_cur = kp_cur.copy()[new_list] 388 | new_kp_ref = kp_ref.copy()[new_list] 389 | 390 | E, inliers = cv2.findEssentialMat( 391 | new_kp_cur, 392 | new_kp_ref, 393 | focal=cam_intrinsics.fx, 394 | pp=principal_points, 395 | method=cv2.RANSAC, 396 | prob=0.99, 397 | threshold=0.2, 398 | ) 399 | 400 | # get F from E 401 | K = cam_intrinsics.mat 402 | F = np.linalg.inv(K.T) @ E @ np.linalg.inv(K) 403 | E_res = compute_fundamental_residual(F, new_kp_cur, new_kp_ref) 404 | 405 | E_gric = calc_GRIC( 406 | res=E_res, 407 | sigma=0.8, 408 | n=kp_cur.shape[0], 409 | model='EMat' 410 | ) 411 | valid_case = H_gric > E_gric 412 | 413 | # inlier check 414 | inlier_check = inliers.sum() > best_inlier_cnt 415 | 416 | # save best_E 417 | if inlier_check: 418 | best_E = E 419 | best_inlier_cnt = inliers.sum() 420 | 421 | revert_new_list = np.zeros_like(new_list) 422 | for cnt, i in enumerate(new_list): 423 | revert_new_list[i] = cnt 424 | best_inliers = inliers[list(revert_new_list)] 425 | num_valid_case += (valid_case * 1) 426 | 427 | major_valid = num_valid_case > (max_ransac_iter / 2) 428 | if major_valid: 429 | cheirality_cnt, R, t, _ = cv2.recoverPose(best_E, kp_cur, kp_ref, 430 | focal=cam_intrinsics.fx, 431 | pp=principal_points, 432 | ) 433 | 434 | # cheirality_check 435 | if cheirality_cnt > kp_cur.shape[0] * 0.1: 436 | best_Rt = [R, t] 437 | 438 | R, t = best_Rt 439 | return R, t 440 | -------------------------------------------------------------------------------- /weights/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aofrancani/DPT-VO/aef64207243fbd29ec17e1c108def8ac3e19b68d/weights/.placeholder --------------------------------------------------------------------------------