├── LICENSE ├── README.md ├── carla.py ├── docs ├── .DS_Store ├── assets │ ├── method.png │ ├── paper-thumbnail.png │ ├── problem.png │ ├── results.png │ ├── sedan_to_sedan_trim.mp4 │ ├── sedan_to_suv_our_method_trim.mp4 │ ├── sedan_to_suv_trim.mp4 │ ├── tzofi2023view.bib │ ├── tzofi2023view.pdf │ └── viewpoint_problem.png └── index.html ├── drivesim.py ├── nusccalib.json ├── requirements.txt └── tools ├── augmentations.py ├── av_types.py ├── common.py ├── data_tools.py ├── misc.py ├── sensor_models.py ├── transformations.py └── transforms.py /LICENSE: -------------------------------------------------------------------------------- 1 | NVIDIA Source Code License for Towards Viewpoint Robustness for Bird's Eye View Segmentation 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means any person or entity that distributes its Work. 6 | “Software” means the original work of authorship made available under this License. 7 | “Work” means the Software and any additions to or derivative works of the Software that are made available under this License. 8 | The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this License, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work. 9 | Works, including the Software, are “made available” under this License by including in or with the Work either (a) a copyright notice referencing the applicability of this License to the Work, or (b) a copy of this License. 10 | 11 | 2. License Grant 12 | 13 | 2.1 Copyright Grant. Subject to the terms and conditions of this License, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form. 14 | 15 | 3. Limitations 16 | 17 | 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this License, (b) you include a complete copy of this License with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work. 18 | 19 | 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this License (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself. 20 | 21 | 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for research or evaluation purposes only. 22 | 23 | 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this License from such Licensor (including the grant in Section 2.1) will terminate immediately. 24 | 25 | 3.5 Trademarks. 
This License does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this License. 26 | 27 | 3.6 Termination. If you violate any term of this License, then your rights under this License (including the grant in Section 2.1) will terminate immediately. 28 | 29 | 4. Disclaimer of Warranty. 30 | 31 | THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE. 32 | 33 | 5. Limitation of Liability. 34 | 35 | EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Towards Viewpoint Robustness for Bird's Eye View Segmentation 2 | 3 | Datasets from CARLA and NVIDIA DRIVE Sim are provided [here](https://drive.google.com/drive/folders/1FQGl9oHyMb7CspUBSFQvpByZD9myLync) for future work in quantifying viewpoint robustness. This repository contains the code for preprocessing and loading this data into your ML pipeline! 4 | 5 | > __Towards Viewpoint Robustness in Bird's Eye View Segmentation__ 6 | > [Tzofi Klinghoffer](https://tzofi.github.io/), [Jonah Philion](https://www.cs.toronto.edu/~jphilion/), [Wenzheng Chen](https://www.cs.toronto.edu/~wenzheng/), [Or Litany](https://orlitany.github.io/), [Zan Gojcic](https://zgojcic.github.io/), [Jungseock Joo](https://www.jsjoo.com/), [Ramesh Raskar](https://www.media.mit.edu/people/raskar/overview/), [Sanja Fidler](https://www.cs.utoronto.ca/~fidler/), [Jose M. Alvarez](https://alvarezlopezjosem.github.io/) 7 | > _International Conference on Computer Vision (ICCV), 2023_ 8 | > __[Project page](https://nvlabs.github.io/viewpoint-robustness) / [Paper](https://nvlabs.github.io/viewpoint-robustness/docs/assets/tzofi2023view.pdf) / [BibTeX](https://nvlabs.github.io/viewpoint-robustness/docs/assets/tzofi2023view.bib)__ 9 | 10 | For business inquiries, please submit the [NVIDIA research licensing form](https://www.nvidia.com/en-us/research/inquiries/). 11 | 12 | ## Preparation 13 | ```pip install -r requirements.txt``` 14 | 15 | ## Usage 16 | 17 | We provide two datasets, one rendered in CARLA, which includes train and test subsets across 36 different viewpoints, and one rendered in DRIVE Sim, which includes test subsets across 11 viewpoints. 18 | 19 | ### Using CARLA Data 20 | 21 | The provided dataset is rendered from a front-facing camera using the NuScenes camera configuration (specified in nusccalib.json). The dataset includes train and test splits across 36 different viewpoints. Train data is collected in CARLA's Town03 and test data is collected in CARLA's Town05. Each split contains 25k images. To create your dataloader: 22 | 23 | 1.
[ Downloading ] Download the data from the above Google drive link and untar it (tar -xvf filename) 24 | 2. [ Data Loading ] Run the following command to instantiate a dataset: 25 | ``` 26 | python carla.py path_to_data_eg_pitch0 27 | ``` 28 | 29 | The viewpoint change is described by the folder name. pitch0 is the default view and all other changes in viewpoint are applied to it, e.g. pitch-4 changes the viewpoint from the default view to -4 degrees pitch (all other extrinsic parameters left the same), yaw16 changes the viewpoint from the default view to +16 degrees yaw, height15 changes the viewpoint from the default to +15 inches height, and pitch\_height-8\_12 changes the viewpoint from the default to -8 degrees pitch AND +12 inches height. The default extrinsics and corresponding adjustments are stored in each info.json file. A minimal Python sketch of wrapping this dataset in a PyTorch dataloader is shown after the Thanks section below. 30 | 31 | ### Using NVIDIA DRIVE Sim Data 32 | 33 | The provided dataset is rendered from a front-facing 120 degree camera with an ftheta lens. In our work, we rectify this data to that of a 50 degree pinhole camera model. Once rectified, we can then load the data and use it to evaluate our model. Please refer to the following steps: 34 | 35 | 1. [ Downloading ] Download the data from the above Google drive link and unzip it 36 | 2. [ Optional: data is already rectified ][ Rectifying ] Run the following command to rectify all data (this will take a while to run): 37 | ``` 38 | python drivesim.py --dataroot ./DRIVESim_datasets --session 5f8f235e-9304-11ed-8a70-6479f09080c1 --dataset_idx 0 --rectify 1 39 | ``` 40 | 3. [ Optional ][ Visualization ] Run the following command to visualize the images and ground truth: 41 | ``` 42 | python drivesim.py --dataroot ./DRIVESim_datasets --session 5f8f235e-9304-11ed-8a70-6479f09080c1 --dataset_idx 0 --vis 1 43 | ``` 44 | 4. [ Data Loading ] Run the following command to instantiate a dataset with the rectified data: 45 | ``` 46 | python drivesim.py --dataroot ./DRIVESim_datasets --session 5f8f235e-9304-11ed-8a70-6479f09080c1 --dataset_idx 0 --frames rgb_jpeg_rectified 47 | ``` 48 | 49 | Dataset idx refers to which test dataset to load, and corresponds to: 50 | 0: default view 51 | 2: +1.5 m depth 52 | 3: +0.2 m height 53 | 4: +0.4 m height 54 | 5: +0.6 m height 55 | 6: +0.8 m height 56 | 7: -5 degrees pitch (looking downwards) 57 | 8: -10 degrees pitch (looking downwards) 58 | 9: -10 degrees pitch (looking downwards) and +0.6 m height 59 | 10: +5 degrees pitch (looking upwards) 60 | 11: +10 degrees pitch (looking upwards) 61 | 62 | The above visualization step visualizes the unrectified images. The codebase also includes a function for visualizing rectified images to verify the data prior to training. 63 | 64 | ## Rendering CARLA Data 65 | 66 | The code that we use to render the CARLA datasets is provided in [this git issue](https://github.com/NVlabs/viewpoint-robustness/issues/4). While we only rendered RGB images and corresponding 3D bounding box labels in our datasets, depth, segmentation, and other data can also be rendered. 67 | 68 | ## Thanks 69 | 70 | Many thanks to [Alperen Degirmenci](https://scholar.harvard.edu/adegirmenci/home) and [Maying Shen](https://mayings.github.io/). 71 | 72 | This project makes use of a number of awesome projects, including: 73 | * [Lift, Splat, Shoot](https://nv-tlabs.github.io/lift-splat-shoot/) 74 | * [Cross View Transformers](https://github.com/bradyz/cross_view_transformers) 75 | * [Worldsheet](https://worldsheet.github.io/) 76 | 77 | Many thanks to the authors of these papers!
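**Example: loading the CARLA data in Python.** The snippet below is a minimal, illustrative sketch (not part of the released code) showing how the CARLA dataset can be wrapped in a PyTorch `DataLoader`, reusing the `grid_conf` and `data_aug_conf` values from the `__main__` block of `carla.py`. The `./data/pitch0` path is an assumed example location; point it at whichever untarred viewpoint folder you downloaded, and run from the repository root so that `./nusccalib.json` resolves.

```python
# Illustrative sketch only: mirrors the configuration used in carla.py's __main__ block.
# "./data/pitch0" is an assumed path; replace it with the folder you untarred from the
# Google Drive link above.
import os
import torch
from carla import CARLADataset

grid_conf = {
    'xbound': [-50.0, 50.0, 0.5],
    'ybound': [-50.0, 50.0, 0.5],
    'zbound': [-10.0, 10.0, 20.0],
    'dbound': [4.0, 45.0, 1.0],
}
data_aug_conf = {
    'resize_lim': (0.339, 0.703),
    'final_dim': (128, 352),
    'rot_lim': (-5.4, 5.4),
    'H': 512, 'W': 512,
    'rand_flip': True,
    'bot_pct_lim': (0.0, 0.22),
    'cams': ['CAM_FRONT'],
    'Ncams': 1,
}

# carla.py joins the supplied root with "home/carla/town03" for the train split.
dataset = CARLADataset(os.path.join("./data/pitch0", "home/carla/town03"),
                       True, data_aug_conf, grid_conf, ret_boxes=False)
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

# With ret_boxes=False, each batch matches CARLADataset.__getitem__'s return tuple.
imgs, rots, trans, intrins, post_rots, post_trans, binimg, path = next(iter(loader))
```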
78 | 79 | ## License and Citation 80 | 81 | ```bibtex 82 | @inproceedings{tzofi2023view, 83 | author = {Klinghoffer, Tzofi and Philion, Jonah and Chen, Wenzheng and Litany, Or and Gojcic, Zan 84 | and Joo, Jungseock and Raskar, Ramesh and Fidler, Sanja and Alvarez, Jose M}, 85 | title = {Towards Viewpoint Robustness in Bird's Eye View Segmentation}, 86 | booktitle = {International Conference on Computer Vision}, 87 | year = {2023} 88 | } 89 | ``` 90 | 91 | Copyright © 2023, NVIDIA Corporation. All rights reserved. 92 | -------------------------------------------------------------------------------- /carla.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. Alvarez 5 | """ 6 | 7 | import cv2 8 | import json 9 | import numpy as np 10 | import os 11 | import sys 12 | import torch 13 | from pathlib import Path 14 | from PIL import Image 15 | from pyquaternion import Quaternion 16 | from nuscenes.utils.data_classes import Box 17 | 18 | from tools.misc import img_transform, normalize_img, gen_dx_bx, choose_cams, parse_calibration, sample_augmentation 19 | 20 | CARLA_CAMORDER = { 21 | 'CAM_FRONT': 0, 22 | 'CAM_FRONT_RIGHT': 1, 23 | 'CAM_BACK_RIGHT': 2, 24 | 'CAM_BACK': 3, 25 | 'CAM_BACK_LEFT': 4, 26 | 'CAM_FRONT_LEFT': 5, 27 | } 28 | 29 | class CARLADataset(object): 30 | """ 31 | - dataroot: path to data directory 32 | - is_train: flag for training 33 | - data_aug_conf: configuration 34 | - grid_conf: configuration for bev grid 35 | - ret_boxes: flag for returning the bev seg boxes from getitem() 36 | - limit: max number of images 37 | - pitch, yaw, height: these are specified if we want to override the pitch/yaw/height specified in the info.json files (info.json contains correct extrinsics) 38 | """ 39 | def __init__(self, dataroot, is_train, data_aug_conf, grid_conf, ret_boxes, limit=None, pitch=0, yaw=0, height=0): 40 | self.pitch = pitch 41 | self.yaw = yaw 42 | self.height = height 43 | self.dataroot = dataroot 44 | self.is_train = is_train 45 | self.data_aug_conf = data_aug_conf 46 | self.ret_boxes = ret_boxes 47 | self.grid_conf = grid_conf 48 | self.ixes = self.get_ixes() 49 | if limit: 50 | self.ixes = self.ixes[:limit] 51 | 52 | # hard code this for now 53 | with open('./nusccalib.json', 'r') as reader: 54 | self.nusccalib = json.load(reader) 55 | 56 | dx, bx, nx = gen_dx_bx(grid_conf['xbound'], grid_conf['ybound'], grid_conf['zbound']) 57 | self.dx, self.bx, self.nx = dx.numpy(), bx.numpy(), nx.numpy() 58 | 59 | print('Carla sim:', len(self), 'is train:', self.is_train) 60 | print('CARLA BEV size:', len(self), '| is_train:', self.is_train) 61 | 62 | def get_ixes(self): 63 | timesteps = [] 64 | for path in Path(self.dataroot).rglob('info.json'): 65 | f = str(path.parents[0]) 66 | imgix = set( 67 | [int(fo.split('_')[0]) for fo in os.listdir(f) if fo != 'info.json' and fo[-4:] == '.jpg']) 68 | if len(imgix) == 0: 69 | imgix = set( 70 | [int(fo.split('_')[0]) for fo in os.listdir(f) if fo != 'info.json' and fo[-4:] == '.png']) 71 | for img in imgix: 72 | timesteps.append((f, img)) 73 | 74 | timesteps = sorted(timesteps, key=lambda x: (x[0], x[1])) 75 | return timesteps 76 | 77 | def get_image_data(self, f, fo, cams, calib, cam_adjust, 
use_cam_name=False): 78 | imgs = [] 79 | rots = [] 80 | trans = [] 81 | intrins = [] 82 | post_rots = [] 83 | post_trans = [] 84 | cal = parse_calibration(calib, width=self.data_aug_conf['W'], height=self.data_aug_conf['H'], 85 | cam_adjust=cam_adjust, pitch=self.pitch, yaw=self.yaw, height_adjust=self.height) 86 | path = None 87 | for cam in cams: 88 | if use_cam_name: 89 | path = os.path.join(f, f'{fo:04}_{cam}.jpg') 90 | else: 91 | path = os.path.join(f, f'{fo:04}_{CARLA_CAMORDER[cam]:02}.jpg') 92 | if not os.path.isfile(path): 93 | path = path[:-4] + ".png" 94 | img = Image.open(path) 95 | 96 | post_rot = torch.eye(2) 97 | post_tran = torch.zeros(2) 98 | 99 | intrin = torch.Tensor(cal[cam]['intrins']) 100 | rot = torch.Tensor(cal[cam]['rot'].rotation_matrix) 101 | tran = torch.Tensor(cal[cam]['trans']) 102 | 103 | # augmentation (resize, crop, horizontal flip, rotate) 104 | resize, resize_dims, crop, flip, rotate = sample_augmentation(self.data_aug_conf, self.is_train) 105 | img, post_rot2, post_tran2 = img_transform(img, post_rot, post_tran, 106 | resize=resize, 107 | resize_dims=resize_dims, 108 | crop=crop, 109 | flip=flip, 110 | rotate=rotate, 111 | ) 112 | 113 | # for convenience, make augmentation matrices 3x3 114 | post_tran = torch.zeros(3) 115 | post_rot = torch.eye(3) 116 | post_tran[:2] = post_tran2 117 | post_rot[:2, :2] = post_rot2 118 | 119 | imgs.append(normalize_img(img)) 120 | intrins.append(intrin) 121 | rots.append(rot) 122 | trans.append(tran) 123 | post_rots.append(post_rot) 124 | post_trans.append(post_tran) 125 | 126 | return (torch.stack(imgs), torch.stack(rots), torch.stack(trans), 127 | torch.stack(intrins), torch.stack(post_rots), torch.stack(post_trans), path) 128 | 129 | def get_binimg(self, gt): 130 | def get_box(box): 131 | diffw = box[:3, 1] - box[:3, 2] 132 | diffl = box[:3, 0] - box[:3, 1] 133 | diffh = box[:3, 4] - box[:3, 0] 134 | 135 | center = (box[:3, 4] + box[:3, 2]) / 2 136 | # carla flips y axis 137 | center[1] = -center[1] 138 | 139 | dims = [np.linalg.norm(diffw), np.linalg.norm(diffl), np.linalg.norm(diffh)] 140 | 141 | rot = np.zeros((3, 3)) 142 | rot[:, 1] = diffw / dims[0] 143 | rot[:, 0] = diffl / dims[1] 144 | rot[:, 2] = diffh / dims[2] 145 | 146 | quat = Quaternion(matrix=rot) 147 | # again, carla flips y axis 148 | newquat = Quaternion(quat.w, -quat.x, quat.y, -quat.z) 149 | 150 | nbox = Box(center, dims, newquat) 151 | return nbox 152 | 153 | boxes = [get_box(box) for box in gt] 154 | 155 | img = np.zeros((int(self.nx[0]), int(self.nx[1]))) 156 | for nbox in boxes: 157 | pts = nbox.bottom_corners()[:2].T 158 | pts = np.round( 159 | (pts - self.bx[:2] + self.dx[:2] / 2.) 
/ self.dx[:2] 160 | ).astype(np.int32) 161 | pts[:, [1, 0]] = pts[:, [0, 1]] 162 | cv2.fillPoly(img, [pts], 1.0) 163 | 164 | return torch.Tensor(img).unsqueeze(0), boxes 165 | 166 | def __len__(self): 167 | return len(self.ixes) 168 | 169 | def __getitem__(self, index): 170 | f, fo = self.ixes[index] 171 | 172 | cams = choose_cams(self.is_train, self.data_aug_conf) 173 | 174 | with open(os.path.join(f, 'info.json'), 'r') as reader: 175 | gt = json.load(reader) 176 | if not 'cam_adjust' in gt: 177 | gt['cam_adjust'] = {k: {'fov': 0.0, 'yaw': 0.0} for k in CARLA_CAMORDER} 178 | 179 | imgs, rots, trans, intrins, post_rots, post_trans, path = self.get_image_data(f, fo, cams, self.nusccalib[gt['scene_calib']], 180 | gt['cam_adjust'], 181 | use_cam_name=False) 182 | binimg, boxes = self.get_binimg(np.array(gt['boxes'][fo])) 183 | 184 | if self.ret_boxes: 185 | if len(boxes) > 0: 186 | pts = torch.cat([torch.Tensor(box.bottom_corners()[:3]) for box in boxes], 1) 187 | else: 188 | return imgs, rots, trans, intrins, post_rots, post_trans, binimg, binimg 189 | return imgs, rots, trans, intrins, post_rots, post_trans, pts, binimg 190 | 191 | return imgs, rots, trans, intrins, post_rots, post_trans, binimg, path 192 | 193 | def worker_rnd_init(x): 194 | np.random.seed(13 + x) 195 | 196 | def compile_data(version, dataroot, data_aug_conf, grid_conf, bsz, 197 | nworkers, pitch=0, yaw=0, height=0,limit=5000): 198 | traindata = CARLADataset(dataroot, True, data_aug_conf, grid_conf, ret_boxes=False, pitch=pitch, yaw=yaw, height=height) 199 | val_root = "/".join(dataroot.split("/")[:-1]) + "/town05" 200 | valdata = CARLADataset(val_root, True, data_aug_conf, grid_conf, ret_boxes=False, limit=limit, pitch=pitch, yaw=yaw, height=height) 201 | 202 | trainloader = torch.utils.data.DataLoader(traindata, batch_size=bsz, 203 | shuffle=True, 204 | num_workers=nworkers, 205 | drop_last=True, 206 | worker_init_fn=worker_rnd_init) 207 | valloader = torch.utils.data.DataLoader(valdata, batch_size=bsz, 208 | shuffle=False, 209 | num_workers=nworkers) 210 | return trainloader, valloader 211 | 212 | if __name__ == "__main__": 213 | H=512 214 | W=512 215 | resize_lim=(0.339, 0.703) 216 | final_dim=(128, 352) 217 | bot_pct_lim=(0.0, 0.22) 218 | rot_lim=(-5.4, 5.4) 219 | rand_flip=True 220 | ncams=1 221 | xbound=[-50.0, 50.0, 0.5] 222 | ybound=[-50.0, 50.0, 0.5] 223 | zbound=[-10.0, 10.0, 20.0] 224 | dbound=[4.0, 45.0, 1.0] 225 | grid_conf = { 226 | 'xbound': xbound, 227 | 'ybound': ybound, 228 | 'zbound': zbound, 229 | 'dbound': dbound, 230 | } 231 | data_aug_conf = { 232 | 'resize_lim': resize_lim, 233 | 'final_dim': final_dim, 234 | 'rot_lim': rot_lim, 235 | 'H': H, 'W': W, 236 | 'rand_flip': rand_flip, 237 | 'bot_pct_lim': bot_pct_lim, 238 | 'cams': ['CAM_FRONT'], 239 | 'Ncams': ncams, 240 | } 241 | 242 | data = CARLADataset(os.path.join(sys.argv[1], "home/carla/town03"), 243 | True, 244 | data_aug_conf, 245 | grid_conf, 246 | ret_boxes=False) 247 | -------------------------------------------------------------------------------- /docs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/.DS_Store -------------------------------------------------------------------------------- /docs/assets/method.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/method.png -------------------------------------------------------------------------------- /docs/assets/paper-thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/paper-thumbnail.png -------------------------------------------------------------------------------- /docs/assets/problem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/problem.png -------------------------------------------------------------------------------- /docs/assets/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/results.png -------------------------------------------------------------------------------- /docs/assets/sedan_to_sedan_trim.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/sedan_to_sedan_trim.mp4 -------------------------------------------------------------------------------- /docs/assets/sedan_to_suv_our_method_trim.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/sedan_to_suv_our_method_trim.mp4 -------------------------------------------------------------------------------- /docs/assets/sedan_to_suv_trim.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/sedan_to_suv_trim.mp4 -------------------------------------------------------------------------------- /docs/assets/tzofi2023view.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{tzofi2023view, 2 | author = {Klinghoffer, Tzofi and Philion, Jonah and Chen, Wenzheng and Litany, Or and Gojcic, Zan 3 | and Joo, Jungseock and Raskar, Ramesh and Fidler, Sanja and Alvarez, Jose M}, 4 | title = {Towards Viewpoint Robustness in Bird's Eye View Segmentation}, 5 | booktitle = {International Conference on Computer Vision}, 6 | year = {2023} 7 | } -------------------------------------------------------------------------------- /docs/assets/tzofi2023view.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/tzofi2023view.pdf -------------------------------------------------------------------------------- /docs/assets/viewpoint_problem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/viewpoint-robustness/dda573520dc366ad045940f7de95c34ac4443621/docs/assets/viewpoint_problem.png -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 
6 | 253 | 254 | 255 | Towards Viewpoint Robustness in Bird's Eye View Segmentation 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 |
268 |
269 |

Towards Viewpoint Robustness in Bird's Eye View Segmentation

270 |
271 | 272 |
273 |
274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 |
284 |
285 | 286 |
287 |
1MIT
288 |
2NVIDIA
289 |
3University of Toronto
290 |
4Vector Institute
291 |
5UCLA
292 |
293 | 294 | 297 | 298 |
299 | 317 |
318 |
319 | 320 | 321 |
322 |
323 |
324 |
325 | 326 |
327 |
328 |
329 |
330 | 331 |
332 |

Abstract

333 |
334 |

335 | Autonomous vehicles (AV) require that neural networks used for perception be robust to different viewpoints if they are to be deployed across many types of vehicles without the repeated cost of data collection and labeling for each. AV companies typically focus on collecting data from diverse scenarios and locations, but not camera rig configurations, due to cost. As a result, only a small number of rig variations exist across most fleets. In this paper, we study how AV perception models are affected by changes in camera viewpoint and propose a way to scale them across vehicle types without repeated data collection and labeling. Using bird's eye view (BEV) segmentation as a motivating task, we find through extensive experiments that existing perception models are surprisingly sensitive to changes in camera viewpoint. When trained with data from one camera rig, small changes to pitch, yaw, depth, or height of the camera at inference time lead to large drops in performance. We introduce a technique for novel view synthesis and use it to transform collected data to the viewpoint of target rigs, allowing us to train BEV segmentation models for diverse target rigs without any additional data collection or labeling cost. To analyze the impact of viewpoint changes, we leverage synthetic data to mitigate other gaps (content, ISP, etc). Our approach is then trained on real data and evaluated on synthetic data, enabling evaluation on diverse target rigs. We release all data for use in future work. Our method is able to recover an average of 14.7% of the IoU that is otherwise lost when deploying to new rigs. 336 |

337 |
338 | 339 |
340 |
341 | 342 | 343 | 344 |

345 | We find that the performance of state-of-the-art methods for bird’s eye view (BEV) segmentation quickly drops with small changes to viewpoint at inference. Above we see predictions from Cross View Transformers trained on data from a source rig (top). The target rig pitch is reduced by 10° (bottom), leading to a 17% drop in IoU. To eliminate the viewpoint robustness problem, we propose a new method for novel view synthesis and use it to transform collected data from the source to target rig automatically. 346 |

347 |
348 |
349 | 350 |
351 |

Paper

352 |
353 |
354 |
355 | 356 |
357 |
358 |

Towards Viewpoint Robustness in Bird's Eye View Segmentation

359 |

Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose Alvarez

360 | 361 | 362 |
description arXiv version
363 |
insert_comment BibTeX
364 |
365 |
366 |
367 | 368 |
369 |
370 |

371 | Technical Video 372 |

373 |
374 |
375 | 376 |
377 |
378 |
379 |
380 | 381 |
382 |

The Viewpoint Robustness Problem

383 |
384 |
385 | 386 | 387 | 388 |

389 | We train a source BEV model using Lift Splat Shoot and Cross View Transformers, denoted at point 0 on the x axis of each graph. We then test the model across different target rigs where the camera pitch, yaw, height, or pitch and height are changed, as denoted by the different points along the x axes. We also train each model on the target rig directly and refer to this model as the “oracle”, as it reflects the expected upper bound IoU for each viewpoint. 390 |

391 |
392 |

393 | To quantify the impact of training a perception model on data from one camera viewpoint (the source rig) and testing on another camera viewpoint (the target rig), we run extensive experiments in simulation. We train on data from a source rig and then vary either the pitch, yaw, height, or pitch and height together. We find that even small changes in camera viewpoint lead to significant drops in perception model accuracy. 394 |
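The viewpoint gap above is reported as intersection-over-union (IoU) between predicted and ground-truth BEV occupancy maps. As a point of reference only, the helper below is a hedged sketch of such a metric, not the evaluation code used for the paper; the function name and the 0.5 threshold are assumptions.

```python
# Hedged sketch: IoU between predicted logits and a {0,1} ground-truth BEV map,
# both shaped (B, 1, H, W). Not the released evaluation code.
import torch

def bev_iou(pred_logits: torch.Tensor, target: torch.Tensor, thresh: float = 0.5) -> float:
    pred = (pred_logits.sigmoid() > thresh).float()
    intersection = (pred * target).sum()
    union = ((pred + target) > 0).float().sum().clamp(min=1)
    return (intersection / union).item()
```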

395 |
396 | 397 |
398 |

Method

399 |
400 |
401 | 402 | 403 | 404 |

405 | Current methods for bird’s eye view (BEV) segmentation are trained on data captured from one set of camera rigs (the source rig). At inference time, these models perform well on that camera rig, but, according to our analysis, even small changes in camera viewpoint lead to large drops in BEV segmentation accuracy. Our solution is to use novel view synthesis to augment the training dataset. We find this simple solution drastically improves the robustness of BEV segmentation models to data from a target camera rig, even when no real data from the target rig is available during training. 406 |

407 |
408 |

409 | We propose a method for novel view synthesis for complex autonomous vehicle (AV) scenes. Our approach builds on Worldsheet, a recent method for novel view synthesis that uses monocular depth estimation to transform the scene into a textured mesh, which can be used to render novel views. We use LiDAR data to supervise depth estimation, automasking to improve the quality of the LiDAR depth maps, SSIM loss to improve training robustness, and apply only the minimum loss between neighboring frames to improve performance on scenes with occlusions. More ablations are included in the paper. 410 |
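To make the loss terms above concrete, the following is a hedged sketch, not the authors' implementation: a photometric loss that blends SSIM with L1 and keeps only the per-pixel minimum over the reprojections from neighboring frames, which is one common way to handle occlusions. The function names and the 0.85/0.15 weighting are assumptions rather than values taken from the paper; LiDAR depth supervision and automasking would be added on top of this term.

```python
# Hedged sketch of an SSIM + L1 photometric loss with a per-pixel minimum over
# neighboring-frame reprojections; not the paper's exact code.
import torch
import torch.nn.functional as F

def ssim_map(x, y, C1=0.01 ** 2, C2=0.03 ** 2):
    # Structural dissimilarity with 3x3 average pooling; inputs are (B, C, H, W) in [0, 1].
    mu_x, mu_y = F.avg_pool2d(x, 3, 1, 1), F.avg_pool2d(y, 3, 1, 1)
    sigma_x = F.avg_pool2d(x * x, 3, 1, 1) - mu_x ** 2
    sigma_y = F.avg_pool2d(y * y, 3, 1, 1) - mu_y ** 2
    sigma_xy = F.avg_pool2d(x * y, 3, 1, 1) - mu_x * mu_y
    num = (2 * mu_x * mu_y + C1) * (2 * sigma_xy + C2)
    den = (mu_x ** 2 + mu_y ** 2 + C1) * (sigma_x + sigma_y + C2)
    return ((1 - num / den) / 2).clamp(0, 1)

def photometric_loss(pred, target, alpha=0.85):
    # Weighted SSIM + L1, reduced over channels -> (B, 1, H, W).
    l1 = (pred - target).abs().mean(1, keepdim=True)
    return alpha * ssim_map(pred, target).mean(1, keepdim=True) + (1 - alpha) * l1

def min_reprojection_loss(warped_neighbors, target):
    # warped_neighbors: list of neighboring frames rendered/warped into the target view.
    # The per-pixel minimum ignores pixels occluded in one neighbor but visible in another.
    losses = torch.stack([photometric_loss(w, target) for w in warped_neighbors], dim=0)
    return losses.min(dim=0).values.mean()
```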

411 |
412 | 413 |
414 |

Results

415 |
416 |
417 | 418 | 419 | 420 |

421 | Shown above are the novel view synthesis results (rectified) obtained with our method. We transform images from the source rig to each of the target viewpoints and then use them for BEV segmentation training. 422 |

423 |
424 |

425 | We find that our method is able to produce realistic changes in camera viewpoint from monocular images. We then use these images to re-train the BEV segmentation model for the target viewpoint. As a result, BEV segmentation accuracy (IoU) significantly increases, removing the viewpoint domain gap. The improvement in performance achieved with our method is highlighted in the videos below. Without our method, we see that the BEV segmentation model is unable to generalize to a different viewpoint (middle), whereas, with our method, it is able to perform well on the target viewpoint (right). 426 |

427 |
428 |
429 | 432 | 435 | 438 |

439 | (Left) Performance of model trained and tested on sedan viewpoint, (Middle) Performance of model trained on sedan and tested on SUV viewpoint, (Right) Performance of model trained on sedan transformed to SUV with our approach and tested on SUV. 440 |

441 |
442 | 443 |
444 | 445 |
446 |

Citation

447 |
448 |
@inproceedings{tzofi2023view,
449 |     author = {Klinghoffer, Tzofi and Philion, Jonah and Chen, Wenzheng and Litany, Or and Gojcic, Zan
450 |         and Joo, Jungseock and Raskar, Ramesh and Fidler, Sanja and Alvarez, Jose M},
451 |     title = {Towards Viewpoint Robustness in Bird's Eye View Segmentation},
452 |     booktitle = {International Conference on Computer Vision},
453 |     year = {2023}
454 | }
455 | 
456 | 
457 |
458 | 459 |
460 |

Acknowledgements

461 |
462 |
463 |

464 | We thank Alperen Degirmenci for his valuable help with AV data preparation and Maying Shen for her support with experiments. 465 |

466 |
467 |
468 |
469 | 470 | 471 | -------------------------------------------------------------------------------- /drivesim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. Alvarez 5 | """ 6 | 7 | import cv2 8 | import json 9 | import numpy as np 10 | import os 11 | import time 12 | import torch 13 | import torchvision 14 | from numpy.polynomial.polynomial import Polynomial 15 | from pathlib import Path 16 | from PIL import Image 17 | 18 | from tools.augmentations import StrongAug, GeometricAug 19 | from tools.common import get_view_matrix 20 | from tools.data_tools import parse, camera_intrinsic_parameters, sensor_to_rig, compute_ftheta_fov, compute_cuboid_corners, \ 21 | degree_to_radian, parse_rig_sensors_from_file, parse_drivesim 22 | from tools.misc import img_transform, normalize_img, gen_dx_bx, sample_augmentation 23 | from tools.sensor_models import FThetaCamera, IdealPinholeCamera, rectify 24 | from tools.transforms import Sample 25 | 26 | """ 27 | Maps datasets (e.g. fov120_0, ... fov120_11) to extrinsics (pitch, height, x; x is depth) 28 | pitch units are degrees, height and x are in meters 29 | positive pitch ==> camera looks up 30 | positive x ==> camera moves forward 31 | positive height ==> camera moves up 32 | """ 33 | DATASETS = { 34 | '0': [0,0,0], 35 | '2': [0,0,1.5], 36 | '3': [0,0.2,0], 37 | '4': [0,0.4,0], 38 | '5': [0,0.6,0], 39 | '6': [0,0.8,0], 40 | '7': [5,0,0], 41 | '8': [10,0,0], 42 | '9': [10,0.6,0], 43 | '10': [-5,0,0], 44 | '11': [-10,0,0] 45 | } 46 | 47 | CAMORDER = { 48 | 'CAM_FRONT': 0, 49 | 'CAM_FRONT_RIGHT': 1, 50 | 'CAM_BACK_RIGHT': 2, 51 | 'CAM_BACK': 3, 52 | 'CAM_BACK_LEFT': 4, 53 | 'CAM_FRONT_LEFT': 5, 54 | } 55 | 56 | class DRIVESimDataset(object): 57 | """ 58 | Logic for loading DRIVE Sim data. Assumes data has first been rectified from FTheta to Pinhole model. 59 | 60 | Args: 61 | - dataroot: path to data directory 62 | - im_path: path to subdirectory containing images of interest 63 | - sessions: session id (stored as list) 64 | - data_aug_conf: data configuration 65 | - grid_conf: bev segmentation configuration 66 | - ret_boxes: boolean specifying whether boxes are returned in getitem() 67 | - pitch: pitch of data 68 | - height: height of data 69 | - x: depth of data 70 | - camname: camera name (str) 71 | - rectified_fov: field of view (either to rectify to if rectifying or of the rectified data) 72 | - start: start index to load data 73 | - stop: stop index to load data (e.g. 
will load images[start:stop] 74 | - viz: boolean specifying whether to visualize data when getitem() is called, will save images locally 75 | """ 76 | def __init__(self, 77 | dataroot, 78 | im_path, 79 | sessions, 80 | data_aug_conf, 81 | grid_conf, 82 | ret_boxes, 83 | pitch=0, 84 | height=0, 85 | x=0, 86 | camname="camera:front:wide:120fov", 87 | rectified_fov=50, 88 | start=0, 89 | stop=None, 90 | viz=False, 91 | rectify_data=False): 92 | self.camname = camname 93 | self.rectified_fov = rectified_fov 94 | self.path = im_path 95 | self.camera = im_path.split("/")[-2] 96 | self.viz = viz 97 | self.rectify_data = rectify_data 98 | 99 | self.pitch = pitch 100 | self.height = height 101 | self.x = x 102 | 103 | self.dataroot = dataroot 104 | self.sessions = sessions 105 | self.data_aug_conf = data_aug_conf 106 | self.ret_boxes = ret_boxes 107 | self.grid_conf = grid_conf 108 | self.ixes, self.data = self.get_ixes() 109 | if stop is None: 110 | self.ixes = self.ixes[start:] 111 | else: 112 | self.ixes = self.ixes[start:stop] 113 | 114 | dx, bx, nx = gen_dx_bx(grid_conf['xbound'], grid_conf['ybound'], grid_conf['zbound']) 115 | self.dx, self.bx, self.nx = dx.numpy(), bx.numpy(), nx.numpy() 116 | 117 | print('Dataset:', len(self)) 118 | 119 | bev = {'h': 200, 'w': 200, 'h_meters': 100, 'w_meters': 100, 'offset': 0.0} 120 | self.view = get_view_matrix(**bev) 121 | 122 | def get_ixes(self): 123 | train = [] 124 | val = [] 125 | test = [] 126 | samples = [] 127 | data = {} 128 | start = time.time() 129 | frame_count = 0 130 | bbox_count = 0 131 | for i, f in enumerate(self.sessions): 132 | sqlite = None 133 | ignore = [] 134 | fcount = 0 135 | bcount = 0 136 | sqlite = os.path.join(self.dataroot, "dataset.sqlite") 137 | self.session_rig = os.path.join(self.dataroot, "sql_template", "session_rigs", f) + ".json" 138 | data, fcount, bcount = self.parse_drivesim(data, os.path.join(self.dataroot, "sql_template", "session_rigs", f) + ".json", \ 139 | os.path.join(self.dataroot, "sql_template", "features", f, self.camera), \ 140 | self.path) 141 | imgix = [(self.path, sqlite, fo) for fo in os.listdir(self.path) \ 142 | if fo[-5:] == '.jpeg' and os.path.join(self.path,fo) not in ignore] 143 | samples.extend(imgix) 144 | frame_count += fcount 145 | bbox_count += bcount 146 | 147 | print("Frames: {}, Average boxes per frame: {}".format(frame_count, float(bbox_count)/float(frame_count))) 148 | return samples, data 149 | 150 | def get_image_data(self, path, intrin, rot, tran, extrin): 151 | cams = ["CAM_FRONT"] 152 | augment = 'none' 153 | xform = { 154 | 'none': [], 155 | 'strong': [StrongAug()], 156 | 'geometric': [StrongAug(), GeometricAug()], 157 | }[augment] + [torchvision.transforms.ToTensor()] 158 | 159 | self.img_transform = torchvision.transforms.Compose(xform) 160 | 161 | imgs = [] 162 | rots = [] 163 | trans = [] 164 | intrins = [] 165 | extrins = [] 166 | cam_rig = [] 167 | cam_channel = [] 168 | 169 | for cam in cams: 170 | image = Image.open(path) 171 | if image.size != (1924, 1084): 172 | image = image.resize((1924,1084)) 173 | 174 | intrin = torch.Tensor(intrin) 175 | rot = torch.Tensor(rot) 176 | tran = torch.Tensor(tran) 177 | extrin = torch.Tensor(extrin) 178 | 179 | h = self.data_aug_conf['H'] 180 | w = self.data_aug_conf['W'] 181 | top_crop=46 182 | h_resize = h 183 | w_resize = w 184 | image_new = image.resize((w_resize, h_resize), resample=Image.Resampling.BILINEAR) 185 | I = np.float32(intrin) 186 | I[0, 0] *= w_resize / image.width 187 | I[0, 2] *= w_resize / image.width 188 | I[1, 1] 
*= h_resize / image.height 189 | I[1, 2] *= h_resize / image.height 190 | 191 | img = self.img_transform(image_new) 192 | imgs.append(img) 193 | 194 | intrins.append(torch.tensor(I)) 195 | extrins.append(extrin.tolist()) 196 | rots.append(rot) 197 | trans.append(tran) 198 | cam_rig.append(CAMORDER[cam]) 199 | cam_channel.append(cam) 200 | 201 | return { 202 | 'cam_ids': torch.LongTensor(cam_rig), 203 | 'cam_channels': cam_channel, 204 | 'intrinsics': torch.stack(intrins,0), 205 | 'extrinsics': torch.tensor(np.float32(extrins)), 206 | 'rots': rots, 207 | 'trans': trans, 208 | 'image': torch.stack(imgs,0), 209 | } 210 | 211 | def get_binimg(self, boxes, rig2sensor, ph_model): 212 | thickness = 1 213 | img = np.zeros((int(self.nx[0]), int(self.nx[1]))) 214 | boxz = [] 215 | points = [] 216 | for i, cuboid3d in enumerate(boxes): 217 | cuboid3d_array = np.array( 218 | [ 219 | cuboid3d.center.x, 220 | cuboid3d.center.y, 221 | cuboid3d.center.z, 222 | cuboid3d.dimension.x, 223 | cuboid3d.dimension.y, 224 | cuboid3d.dimension.z, 225 | cuboid3d.orientation.yaw, 226 | cuboid3d.orientation.pitch, 227 | cuboid3d.orientation.roll, 228 | ] 229 | ) 230 | cuboid3d_array[6:9] = degree_to_radian(cuboid3d_array[6:9]) 231 | cuboid3d_corners = compute_cuboid_corners( 232 | cuboid3d_array, use_polar=False, use_mrp=False) 233 | 234 | points.append(torch.tensor(cuboid3d_corners[4])) 235 | points.append(torch.tensor(cuboid3d_corners[5])) 236 | points.append(torch.tensor(cuboid3d_corners[6])) 237 | points.append(torch.tensor(cuboid3d_corners[7])) 238 | 239 | corners = cuboid3d_corners[4:8].T 240 | 241 | pts = corners[:2].T 242 | pts = np.round( 243 | (pts - self.bx[:2] + self.dx[:2] / 2.) / self.dx[:2] 244 | ).astype(np.int32) 245 | pts[:, [1, 0]] = pts[:, [0, 1]] 246 | cv2.fillPoly(img, [pts], 1.0) 247 | 248 | return torch.Tensor(img).unsqueeze(0), torch.stack(points, -1).float() 249 | 250 | def rectify_drivesim(self, key, rig_path): 251 | DOWNSAMPLE = 4.0 252 | rig = parse_rig_sensors_from_file(rig_path)[self.camname] 253 | sensor2rig = sensor_to_rig(rig) 254 | fisheye_intrins = camera_intrinsic_parameters(rig) 255 | size = np.array([float(rig['properties']['width']), float(rig['properties']['height'])]) 256 | bwpoly = fisheye_intrins[4:] 257 | fov_x, fov_y, _ = compute_ftheta_fov(fisheye_intrins) 258 | fx = (fisheye_intrins[2] / (2.0 * np.tan(float(fov_x / 2.0 )))) / DOWNSAMPLE 259 | fy = (fisheye_intrins[3] / (2.0 * np.tan(float(fov_y / 2.0 )))) / DOWNSAMPLE 260 | 261 | fx = ((float(fisheye_intrins[3]) / DOWNSAMPLE) / 2.0 / np.tan(float(fov_x / 2.0 ))) 262 | fy = ((float(fisheye_intrins[2]) / DOWNSAMPLE) / 2.0 / np.tan(float(fov_y / 2.0 ))) 263 | 264 | focal = (float(size[0]) / DOWNSAMPLE) / 2.0 / np.tan(np.radians(120.0/2.0)) 265 | cx = fisheye_intrins[0] / DOWNSAMPLE 266 | cy = fisheye_intrins[1] / DOWNSAMPLE 267 | intrins = np.array([[fx,0,cx],[0,fy,cy],[0,0,1]]) 268 | 269 | properties = rig['properties'] 270 | bw_poly = Polynomial(fisheye_intrins[4:]) 271 | downsample_pixel_map = np.polynomial.Polynomial([0.0, DOWNSAMPLE]) 272 | new_bw_poly = bw_poly(downsample_pixel_map) 273 | ftheta_model = FThetaCamera( 274 | cx=float(properties['cx']) / DOWNSAMPLE, 275 | cy=float(properties['cy']) / DOWNSAMPLE, 276 | width=size[0] / DOWNSAMPLE, 277 | height=size[1] / DOWNSAMPLE, 278 | bw_poly=new_bw_poly.convert().coef, 279 | ) 280 | 281 | target_fov = np.radians(self.rectified_fov) 282 | target_foc = (ftheta_model.width / 2) / np.tan(target_fov / 2) 283 | ph_model = IdealPinholeCamera( 284 | fov_x_deg=None, 285 | 
fov_y_deg=None, 286 | width=ftheta_model.width, 287 | height=ftheta_model.height, 288 | f_x=target_foc, 289 | f_y=target_foc, 290 | ) 291 | 292 | img = cv2.imread(key) 293 | img = rectify(img, ftheta_model, ph_model) 294 | 295 | fname = key.split("/")[-1] 296 | #save_path = "/".join(key.split("/")[:-1]).replace("rgb_jpeg", "rgb_jpeg_rectified") 297 | save_path = "/".join(key.split("/")[:-1]).replace("rgb_half_jpeg-100-xavierisp", "rgb_jpeg_rectified2") 298 | if not os.path.exists(save_path): 299 | path = Path(save_path) 300 | path.mkdir(parents=True) 301 | cv2.imwrite(os.path.join(save_path, fname), img) 302 | 303 | def parse_drivesim(self, data, rig_path, json_path, img_path): 304 | DOWNSAMPLE = 2.0 # images are 4x smaller than specified in the json, but then upsampled 2x in get_image_data() 305 | rig = parse_rig_sensors_from_file(rig_path)[self.camname] 306 | 307 | rig["nominalSensor2Rig_FLU"]["roll-pitch-yaw"][1] += self.pitch 308 | rig["nominalSensor2Rig_FLU"]["t"][2] += self.height 309 | rig["nominalSensor2Rig_FLU"]["t"][0] += self.x 310 | 311 | sensor2rig, _ = sensor_to_rig(rig) 312 | print("DriveSIM x {}, y {}, z {}".format(sensor2rig[0,3],sensor2rig[1,3],sensor2rig[2,3])) 313 | fisheye_intrins = camera_intrinsic_parameters(rig) 314 | fov_x, fov_y, _ = compute_ftheta_fov(fisheye_intrins) 315 | size = np.array([float(rig['properties']['width']), float(rig['properties']['height'])]) 316 | 317 | target_fov = np.radians(self.rectified_fov) 318 | target_foc = ((size[0]/DOWNSAMPLE) / 2) / np.tan(target_fov / 2) 319 | cx = fisheye_intrins[0] / DOWNSAMPLE 320 | cy = fisheye_intrins[1] / DOWNSAMPLE 321 | intrins = np.array([[target_foc,0,cx],[0,target_foc,cy],[0,0,1]]) 322 | 323 | ph_model = IdealPinholeCamera( 324 | fov_x_deg=None, 325 | fov_y_deg=None, 326 | width=size[0] / DOWNSAMPLE, 327 | height=size[1] / DOWNSAMPLE, 328 | f_x=target_foc, 329 | f_y=target_foc, 330 | ) 331 | 332 | frame_count = 0 333 | bbox_count = 0 334 | 335 | for f in os.listdir(json_path): 336 | key = os.path.join(img_path, f[:-5]) + ".jpeg" 337 | if self.rectify_data: 338 | self.rectify_drivesim(key, rig_path) 339 | continue 340 | with open(os.path.join(json_path, f)) as fp: 341 | d = json.load(fp) 342 | boxes = [] 343 | for item in d: 344 | if item["label_family"] == "SHAPE2D" and item["label_name"] == "automobile": 345 | cuboid3d, ratio = parse_drivesim(json.loads(item["data"])) 346 | if float(ratio) < 0.5: 347 | boxes.append(cuboid3d) 348 | 349 | if len(boxes) > 0: 350 | data[key] = {"boxes": boxes, 351 | "intrins": intrins, 352 | "sensor2rig": sensor2rig, 353 | "fisheye_intrins": fisheye_intrins, 354 | "size": size, 355 | "ph_model": ph_model} 356 | frame_count += 1 357 | bbox_count += len(boxes) 358 | 359 | return data, frame_count, bbox_count 360 | 361 | def pinhole_project(self, points, img, intrin, sensor2rig, ph_model): 362 | intrin = intrin.detach().cpu().numpy() 363 | intrin = np.array(intrin, dtype=np.float64) 364 | rig2sensor = np.linalg.inv(sensor2rig) 365 | points = torch.swapaxes(points, 0, 1) 366 | for point in points: 367 | point = np.array([point[0],point[1],point[2],1]).T 368 | point = np.dot(rig2sensor, point)[:-1] 369 | cam_pixels = ph_model.ray2pixel(point.T)[0][0] 370 | x = int(cam_pixels[0]) 371 | y = int(cam_pixels[1]) 372 | img = cv2.circle(img, (x,y), radius=2, color=(0, 0, 255), thickness=2) 373 | return img 374 | 375 | def ftheta_project(self, points, img, sensor2rig): 376 | ftheta_model = FThetaCamera.from_rig(self.session_rig, self.camname) 377 | points = torch.swapaxes(points, 0, 1) 
378 | for i, point in enumerate(points): 379 | rig2sensor = np.linalg.inv(sensor2rig) 380 | point = np.array([point[0],point[1],point[2],1]).T 381 | point = np.dot(rig2sensor, point)[:-1] 382 | cam_pixels = ftheta_model.ray2pixel(point.T)[0] / 4.0 383 | x = int(cam_pixels[0]) 384 | y = int(cam_pixels[1]) 385 | img = cv2.circle(img, (x,y), radius=2, color=(0, 0, 255), thickness=2) 386 | return img 387 | 388 | def is_point_in_fov(self, point, rig2sensor, ph_model): 389 | point = np.array([point[0], point[1], point[2], 1]).T 390 | cam_point = np.dot(rig2sensor, point)[:-1] 391 | cam_pixels = ph_model.ray2pixel(cam_point.T)[0][0] 392 | x = int(cam_pixels[0]) 393 | y = int(cam_pixels[1]) 394 | if x >= 0 and x <= ph_model.width and \ 395 | y >= 0 and y <= ph_model.height: 396 | return True 397 | return False 398 | 399 | def is_box_in_fov(self, cuboid3d, rig2sensor, ph_model): 400 | cuboid3d_array = np.array( 401 | [ 402 | cuboid3d.center.x, 403 | cuboid3d.center.y, 404 | cuboid3d.center.z, 405 | cuboid3d.dimension.x, 406 | cuboid3d.dimension.y, 407 | cuboid3d.dimension.z, 408 | cuboid3d.orientation.yaw, 409 | cuboid3d.orientation.pitch, 410 | cuboid3d.orientation.roll, 411 | ] 412 | ) 413 | cuboid3d_array[6:9] = degree_to_radian(cuboid3d_array[6:9]) 414 | cuboid3d_corners = compute_cuboid_corners( 415 | cuboid3d_array, use_polar=False, use_mrp=False)#[:8] 416 | 417 | inside = 0 418 | for corner in cuboid3d_corners: 419 | inside += int(self.is_point_in_fov(corner, rig2sensor, ph_model)) 420 | 421 | return bool(inside) 422 | 423 | def __len__(self): 424 | return len(self.ixes) 425 | 426 | def __getitem__(self, index): 427 | ipath, sqlite, fname = self.ixes[index] 428 | 429 | data = self.data[os.path.join(ipath,fname)] 430 | boxes = data["boxes"] 431 | intrins = data["intrins"] 432 | sensor2rig = data["sensor2rig"] 433 | ph_model = data["ph_model"] 434 | 435 | sample = self.get_image_data(os.path.join(ipath, fname), intrins, sensor2rig[:3,:3], sensor2rig[:3,3], sensor2rig) 436 | binimg, points = self.get_binimg(boxes, np.linalg.inv(sensor2rig), ph_model) 437 | 438 | # Visualizations (save image with projected points and ground truth bev segmentation map) 439 | if self.viz: 440 | im = cv2.imread(os.path.join(ipath,fname)) 441 | im = self.ftheta_project(points, im, sensor2rig) 442 | idx = fname.split(".")[0] 443 | cv2.imwrite("viz_{}.png".format(idx.zfill(5)), im) 444 | cv2.imwrite("gt_{}.png".format(idx.zfill(5)), np.rollaxis(binimg.detach().cpu().numpy(), 0, 3) *255) 445 | 446 | if self.ret_boxes: 447 | pts = points 448 | return imgs, rots, trans, intrins, post_rots, post_trans, pts, binimg 449 | 450 | data = Sample( 451 | view=self.view, 452 | bev=binimg, 453 | **sample 454 | ) 455 | 456 | return data 457 | 458 | def compile_data(dataroot, im_path, sessions, pitch=0, height=0, x=0, viz=False, rectify_data=False): 459 | xbound=[-50.0, 50.0, 0.5] 460 | ybound=[-50.0, 50.0, 0.5] 461 | zbound=[-10.0, 10.0, 20.0] 462 | dbound=[4.0, 45.0, 1.0] 463 | grid_conf = { 464 | 'xbound': xbound, 465 | 'ybound': ybound, 466 | 'zbound': zbound, 467 | 'dbound': dbound, 468 | } 469 | data_aug_conf = { 470 | 'H': 224, 'W': 480, 471 | 'cams': ['CAM_FRONT'], 472 | 'Ncams': 1, 473 | } 474 | data = DRIVESimDataset(dataroot, 475 | im_path, 476 | sessions, 477 | data_aug_conf, 478 | grid_conf, 479 | ret_boxes=False, 480 | pitch=pitch, 481 | height=height, 482 | x=x, 483 | start=0, 484 | stop=None, 485 | viz=viz, 486 | rectify_data=rectify_data) 487 | 488 | return data 489 | 490 | if __name__ == "__main__": 491 | import 
configargparse 492 | parser = configargparse.ArgumentParser() 493 | parser.add_argument('--dataroot', type=str, default='./DRIVESim_datasets/', 494 | help='path to top level data directory, i.e. DRIVESim_datasets') 495 | parser.add_argument("--session", type=str, default='5f8f235e-9304-11ed-8a70-6479f09080c1', 496 | help='Session number (specified the folder containing all frames)') 497 | parser.add_argument("--dataset_idx", type=str, default='0', 498 | help='index of dataset, i.e. the number after 120fov_ (120fov_0, ... , 120fov_11)') 499 | parser.add_argument("--frames", type=str, default='rgb_half_jpeg-100-xavierisp', 500 | help='Either rgb_half_jpeg-100-xavierisp if loading non-rectified or rgb_jpeg_rectified if loading rectified') 501 | parser.add_argument("--vis", type=int, default=0, 502 | help='whether or not to visualize data') 503 | parser.add_argument("--rectify", type=int, default=0, 504 | help='whether or not to rectify data') 505 | args = parser.parse_args() 506 | 507 | im_path = os.path.join(args.dataroot, "frames", args.session, "camera_front_wide_120fov_" + args.dataset_idx, args.frames) 508 | pitch, height, x = DATASETS[args.dataset_idx] 509 | dataset = compile_data(args.dataroot, im_path, [args.session], pitch=pitch, height=height, x=x, viz=bool(args.vis), rectify_data=bool(args.rectify)) 510 | 511 | # Only visualize first image ==> this can be changed to iterate the entire dataset if desired 512 | if args.vis: 513 | sample = dataset[0] 514 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==4.5.5.64 2 | torch==1.11.0 3 | torchvision==0.12.0 4 | pyquaternion==0.9.9 5 | torchmetrics==0.7.2 6 | efficientnet-pytorch==0.7.1 7 | nuscenes-devkit 8 | tensorboardX 9 | Pillow 10 | protobuf==3.20.1 11 | -------------------------------------------------------------------------------- /tools/augmentations.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://github.com/eriklindernoren/PyTorch-YOLOv3/blob/master/pytorchyolo/utils/transforms.py 3 | """ 4 | import imgaug.augmenters as iaa 5 | import torchvision 6 | import numpy as np 7 | 8 | 9 | class AugBase(torchvision.transforms.ToTensor): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | self.augment = self.get_augment().augment_image 14 | 15 | def __call__(self, x): 16 | if not isinstance(x, np.ndarray): 17 | x = np.array(x) 18 | 19 | return self.augment(x) 20 | 21 | 22 | class StrongAug(AugBase): 23 | def get_augment(self): 24 | return iaa.Sequential([ 25 | iaa.Dropout([0.0, 0.01]), 26 | iaa.Sharpen((0.0, 0.1)), 27 | iaa.AddToBrightness((-60, 40)), 28 | iaa.AddToHue((-20, 20)), 29 | ]) 30 | 31 | 32 | class GeometricAug(AugBase): 33 | def get_augment(self): 34 | return iaa.Affine(rotate=(-2.5, 2.5), 35 | translate_percent=(-0.05, 0.05), 36 | scale=(0.95, 1.05), 37 | mode='symmetric') 38 | -------------------------------------------------------------------------------- /tools/av_types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. 
Alvarez 5 | """ 6 | 7 | """Type definitions for Obstacle3D.""" 8 | from typing import Dict, List, NamedTuple, Tuple 9 | class TopDownLidarGT(NamedTuple): 10 | """Collection of top-down 2D lidar GT features transformed into the rig frame.""" 11 | # Lidar label IDs (used for association between camera/lidar GT labels). 12 | label_ids: List[str] 13 | # label names (automobile, etc.). 14 | label_names: List[str] 15 | # 2-level list of x(col) and y(row) coordinates (in the rig frame). 16 | # The outer list is over objects, the inner list is over vertices. 17 | vertices: List[List[Tuple[float, float]]] 18 | # The number of vertices in the each list inside `vertices` 19 | vertex_count: List[int] 20 | class Center(NamedTuple): 21 | """Describes a Center coordinate. 22 | contains center x, y, z 23 | """ 24 | x: float 25 | y: float 26 | z: float 27 | class Orientation(NamedTuple): 28 | """Describes an Orientation coordinate. 29 | Contains roll, pitch, and yaw. 30 | """ 31 | yaw: float 32 | pitch: float 33 | roll: float 34 | class Dimension(NamedTuple): 35 | """Describes a Dimension coordinate. 36 | This contains dimension along x, y, z. 37 | """ 38 | x: float 39 | y: float 40 | z: float 41 | class BBox2D(NamedTuple): 42 | """Describes a single 2D bounding box. 43 | x1, y1, x2, y2 correspond to x_min, y_min, x_max, y_max 44 | """ 45 | x1: float 46 | y1: float 47 | x2: float 48 | y2: float 49 | @classmethod 50 | def from_ilf(cls, box2d): 51 | """ 52 | Parses the bounding box 2D from the dataset. 53 | Returns: 54 | BBox2D: an instance of BBox2D class. 55 | """ 56 | return cls(x1=box2d[0][0], y1=box2d[0][1], x2=box2d[1][0], y2=box2d[1][1]) 57 | class Cuboid3D(NamedTuple): 58 | """Describes a single 3D cuboid. 59 | contains Center, Dimension, and Orientation 60 | classes. 61 | """ 62 | center: Center 63 | orientation: Orientation 64 | dimension: Dimension 65 | @classmethod 66 | def from_ilf(cls, cuboid3d): 67 | """ 68 | Parses the cuboid3d structure from the dataset. 69 | Converts from a json structure to python 70 | Cuboid3D structure. 71 | Returns: 72 | Cuboid3D: an instance of Cuboid3D class. 73 | """ 74 | center = Center(cuboid3d.center.x, cuboid3d.center.y, cuboid3d.center.z) 75 | orientation = Orientation( 76 | cuboid3d.orientation.x, cuboid3d.orientation.y, cuboid3d.orientation.z 77 | ) 78 | dimension = Dimension( 79 | cuboid3d.dimensions.x, cuboid3d.dimensions.y, cuboid3d.dimensions.z 80 | ) 81 | return cls(center=center, orientation=orientation, dimension=dimension) 82 | class Cuboid3DGT(NamedTuple): 83 | """Collection of 2D bounding box and 3D cuboid GT features.""" 84 | # Lidar label IDs: Used for 2D bbox and 3D Cuboid association. 85 | label_ids: List[str] 86 | # automobile, etc. 87 | label_names: List[str] 88 | # list of 2D bounding boxes from multiple cameras 89 | bbox2ds: List[Dict[str, BBox2D]] 90 | # list of 3D cuboids 91 | cuboid3ds: List[Cuboid3D] 92 | # visibility of 3D Cuboids 93 | # Values range between [0,1] that indicates the level of occlusion 94 | # If cuboid is occluded it will have a value closer to 0 95 | # If cuboid is not occluded it will have a value near 1 96 | # defaults to -1.0 if not present when feature parsing which means ignore 97 | visibility: List[float] 98 | # Per-camera visibility attribute. One vis float per cam if the 99 | # cuboid is visible in that camera. 
Jira AMP-454 is task to track 100 | # a merging of this attribute with the above visibility[List[float]] 101 | visibility_per_camera: List[Dict[str, float]] 102 | 103 | -------------------------------------------------------------------------------- /tools/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | from pathlib import Path 5 | from pyquaternion import Quaternion 6 | 7 | 8 | INTERPOLATION = cv2.LINE_8 9 | 10 | 11 | def get_split(split, dataset_name): 12 | split_dir = Path(__file__).parent / 'splits' / dataset_name 13 | split_path = split_dir / f'{split}.txt' 14 | 15 | return split_path.read_text().strip().split('\n') 16 | 17 | 18 | def get_view_matrix(h=200, w=200, h_meters=100.0, w_meters=100.0, offset=0.0): 19 | sh = h / h_meters 20 | sw = w / w_meters 21 | 22 | return np.float32([ 23 | [ 0., -sw, w/2.], 24 | [-sh, 0., h*offset+h/2.], 25 | [ 0., 0., 1.] 26 | ]) 27 | 28 | 29 | def get_transformation_matrix(R, t, inv=False): 30 | pose = np.eye(4, dtype=np.float32) 31 | pose[:3, :3] = R if not inv else R.T 32 | pose[:3, -1] = t if not inv else R.T @ -t 33 | 34 | return pose 35 | 36 | 37 | def get_pose(rotation, translation, inv=False, flat=False): 38 | if flat: 39 | yaw = Quaternion(rotation).yaw_pitch_roll[0] 40 | R = Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).rotation_matrix 41 | else: 42 | R = Quaternion(rotation).rotation_matrix 43 | 44 | t = np.array(translation, dtype=np.float32) 45 | 46 | return get_transformation_matrix(R, t, inv=inv) 47 | 48 | 49 | def encode(x): 50 | """ 51 | (h, w, c) np.uint8 {0, 255} 52 | """ 53 | n = x.shape[2] 54 | 55 | # assert n < 16 56 | assert x.ndim == 3 57 | assert x.dtype == np.uint8 58 | assert all(x in [0, 255] for x in np.unique(x)) 59 | 60 | shift = np.arange(n, dtype=np.int32)[None, None] 61 | 62 | binary = (x > 0) 63 | binary = (binary << shift).sum(-1) 64 | binary = binary.astype(np.int32) 65 | 66 | return binary 67 | 68 | 69 | def decode(img, n): 70 | """ 71 | returns (h, w, n) np.int32 {0, 1} 72 | """ 73 | shift = np.arange(n, dtype=np.int32)[None, None] 74 | 75 | x = np.array(img)[..., None] 76 | x = (x >> shift) & 1 77 | 78 | return x 79 | 80 | 81 | if __name__ == '__main__': 82 | from PIL import Image 83 | 84 | n = 12 85 | 86 | x = np.random.rand(64, 64, n) 87 | x = 255 * (x > 0.5).astype(np.uint8) 88 | 89 | x_encoded = encode(x) 90 | x_img = Image.fromarray(x_encoded) 91 | x_img.save('tmp.png') 92 | x_loaded = Image.open('tmp.png') 93 | x_decoded = 255 * decode(x_loaded, 12) 94 | x_decoded = x_decoded[..., :n] 95 | 96 | print(abs(x_decoded - x).max()) 97 | -------------------------------------------------------------------------------- /tools/data_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. 
Alvarez 5 | """ 6 | import cv2 7 | import torch 8 | import numpy as np 9 | from typing import Dict, List, Optional, Set, Tuple, Union 10 | from tools.av_types import BBox2D, Cuboid3D, Cuboid3DGT, Center, Orientation, Dimension 11 | from tools.transformations import euler_2_so3, transform_point_cloud, lat_lng_alt_2_ecef, axis_angle_trans_2_se3 12 | from google import protobuf 13 | 14 | import json 15 | import base64 16 | import logging 17 | import os 18 | import hashlib 19 | 20 | import tqdm 21 | 22 | import pyarrow.parquet as pq 23 | 24 | from PIL import Image 25 | from google.protobuf import text_format 26 | from scipy.optimize import curve_fit 27 | from numpy.polynomial.polynomial import Polynomial 28 | 29 | ArrayLike = Union[torch.Tensor, np.ndarray] 30 | MIN_ORIENTATION_ARROW_LENGTH = 3 31 | 32 | def draw_3d_cuboids_on_bev_image( 33 | image: np.ndarray, cuboids: list, meter_to_pixel_scale: int = 4, thickness: int = 2 34 | ) -> np.ndarray: 35 | """Draw 3d cuboids on top-down image. 36 | Args: 37 | image (np.ndarray): an image of shape [W, H, 3] where objects will be drawn. 38 | cuboids: an list of 3d cuboids, where each cuboid is an numpy array 39 | of shape [10, 3] containing 10 3d vertices. 40 | meter_to_pixel_scale (int): Default value is 4. 41 | thickness (int): Default is 2. 42 | Outputs: 43 | image (np.ndarray): an image of shape [W, H, 3]. 44 | """ 45 | if image.shape[0] == 3: 46 | image = np.transpose(image, (1, 2, 0)) 47 | image = np.ascontiguousarray(image, dtype=np.uint8) 48 | img_width, img_height = image.shape[0:2] 49 | #for corners, source, class_id, visibility, _ in cuboids: 50 | for corners in cuboids: 51 | # Rotate 180 degree 52 | corners = -1 * corners 53 | # Convert meters to pixels 54 | corners = corners * meter_to_pixel_scale 55 | # Move center to top left 56 | corners[:, 1] += img_height / 2 57 | corners[:, 0] += img_width / 2 58 | # Swap x and y coordinates 59 | corners = np.int32(corners[:, [1, 0]].squeeze()) 60 | #class_id = class_id.type(torch.uint8) 61 | boundary_color = [0.0, 1.0, 0.0] # Green 62 | #COLOR_RGB_FP32[class_id] 63 | body_color = [0.0, 0.0, 1.0] # Blue 64 | """ 65 | if source == 1: 66 | body_color = [0.0, 0.0, 1.0] # Blue 67 | elif source == 2: 68 | body_color = [0.0, 1.0, 0.0] # Green 69 | else: 70 | body_color = cm.hot(100 + int((1 - visibility) * 200))[0:3] # Red 71 | """ 72 | cv2.fillPoly(image, [corners[0:4]], body_color) 73 | cv2.polylines(image, [corners[0:4]], True, boundary_color, thickness * 2) 74 | if corners.shape[0] > 8: 75 | start_point = corners[8, :] 76 | end_point = corners[9, :] 77 | cv2.arrowedLine( 78 | image, tuple(start_point), tuple(end_point), boundary_color, thickness 79 | ) 80 | return image 81 | 82 | def create_bev_image( 83 | max_x_viz_range_meters: int = 200, 84 | max_y_viz_range_meters: int = 100, 85 | meter_to_pixel_scale: int = 4, 86 | step_meters: int = 20, 87 | color: Optional[list] = None, 88 | thickness: int = 2, 89 | ) -> np.ndarray: 90 | """Draw a bev image. 91 | Args: 92 | max_viz_range_meters (int): maximum visualization range. Default is 120 meters. 93 | meter_to_pixel_scale (int): Default is 4 94 | step_meters (int): steps to draw a spatial grid. 95 | color (tuple): self-explained. 96 | thickness (scalar): self-explained. Default is 2. 97 | Return: 98 | top_down_image (np.ndarray): an image of shape W x H x 3 where predictions are drawn. 
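    Example:
        A minimal usage sketch (added for illustration; the argument values
        below are arbitrary and not taken from the original code):

        >>> canvas = create_bev_image(max_x_viz_range_meters=100,
        ...                           max_y_viz_range_meters=50,
        ...                           meter_to_pixel_scale=4)
        >>> canvas.shape  # (2 * 100 * 4, 2 * 50 * 4, 3)
        (800, 400, 3)
        >>> # Cuboids in rig coordinates can then be rasterized on top with
        >>> # draw_3d_cuboids_on_bev_image(canvas, cuboids, meter_to_pixel_scale=4).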
99 | """ 100 | # Create a back background image 101 | top_down_image = np.zeros( 102 | [ 103 | 2 * max_x_viz_range_meters * meter_to_pixel_scale, 104 | 2 * max_y_viz_range_meters * meter_to_pixel_scale, 105 | 3, 106 | ] 107 | ) 108 | # Get bev center coordinates 109 | cx, cy = ( 110 | max_y_viz_range_meters * meter_to_pixel_scale, 111 | max_x_viz_range_meters * meter_to_pixel_scale, 112 | ) 113 | # Draw circular rings 114 | for d in range(step_meters, max_x_viz_range_meters, step_meters): 115 | radius = d * meter_to_pixel_scale 116 | ring_color = [1.0, 1.0, 1.0] 117 | text_color = [1.0, 1.0, 1.0] 118 | ring_thickness = 1 119 | cv2.circle(top_down_image, (cx, cy), radius, ring_color, ring_thickness) 120 | cv2.putText( 121 | top_down_image, 122 | str(d), 123 | (cx + 50, cy - radius), 124 | cv2.FONT_HERSHEY_SIMPLEX, 125 | 1, 126 | text_color, 127 | 2, 128 | ) 129 | # Draw ego car 130 | scale = meter_to_pixel_scale 131 | ego_car_half_length = 2.0 # meters 132 | ego_car_half_width = 1.0 # meters 133 | ego_corners = np.array( 134 | [ 135 | [cx - ego_car_half_width * scale, cy - ego_car_half_length * scale], 136 | [cx - ego_car_half_width * scale, cy + ego_car_half_length * scale], 137 | [cx + ego_car_half_width * scale, cy + ego_car_half_length * scale], 138 | [cx + ego_car_half_width * scale, cy - ego_car_half_length * scale], 139 | ], 140 | np.int32, 141 | ) 142 | ego_corners = ego_corners.reshape((-1, 1, 2)) 143 | ego_color = [1.0, 1.0, 1.0] 144 | cv2.fillPoly(top_down_image, [ego_corners], ego_color) 145 | ego_start = (cx, cy) 146 | ego_end = (cx, cy - int(3 * ego_car_half_length * meter_to_pixel_scale)) 147 | cv2.arrowedLine(top_down_image, ego_start, ego_end, ego_color, thickness) 148 | return top_down_image 149 | 150 | def numericallyStable2Norm2D(x, y): 151 | absX = abs(x) 152 | absY = abs(y) 153 | minimum = min(absX, absY) 154 | maximum = max(absX, absY) 155 | 156 | if maximum <= np.float32(0.0): 157 | return np.float32(0.0) 158 | 159 | oneOverMaximum = np.float32(1.0) / maximum 160 | minMaxRatio = np.float32(minimum) * oneOverMaximum 161 | return maximum * np.sqrt(np.float32(1.0) + minMaxRatio * minMaxRatio) 162 | 163 | def project_camera_rays_2_img(points, cam_metadata): 164 | ''' Projects the points in the camera coordinate system to the image plane 165 | 166 | Args: 167 | points (np.array): point coordinates in the camera coordinate system [n,3] 168 | intrinsic (np.array): camera intrinsic parameters (size depends on the camera model) 169 | img_width (float): image width in pixels 170 | img_height (float): image hight in pixels 171 | camera_model (string): camera model used for projection. Must be one of ['pinhole', 'f_theta'] 172 | Out: 173 | points_img (np.array): pixel coordinates of the image projections [m,2] 174 | valid (np.array): array of boolean flags. 
True for point that project to the image plane 175 | ''' 176 | 177 | intrinsic = cam_metadata['intrinsic'] 178 | camera_model = cam_metadata['camera_model'] 179 | img_width = cam_metadata['img_width'] 180 | img_height = cam_metadata['img_height'] 181 | 182 | if camera_model == "pinhole": 183 | 184 | # Camera coordinates system is FLU and image is RDF 185 | normalized_points = -points[:,1:3] / points[:,0:1] 186 | f_u, f_v, c_u, c_v, k1, k2, k3, k4, k5 = intrinsic 187 | u_n = normalized_points[:,0] 188 | v_n = normalized_points[:,1] 189 | 190 | r2 = np.square(u_n) + np.square(v_n) 191 | r4 = r2 * r2 192 | r6 = r4 * r2 193 | 194 | r_d = 1.0 + k1 * r2 + k2 * r4 + k5 * r6 195 | 196 | # If the radial distortion is too large, the computed coordinates will be unreasonable 197 | kMinRadialDistortion = 0.8 198 | kMaxRadialDistortion = 1.2 199 | 200 | invalid_idx = np.where(np.logical_or(np.less_equal(r_d,kMinRadialDistortion),np.greater_equal(r_d,kMaxRadialDistortion)))[0] 201 | 202 | u_nd = u_n * r_d + 2.0 * k3 * u_n * v_n + k4 * (r2 + 2.0 * u_n * u_n) 203 | v_nd = v_n * r_d + k3 * (r2 + 2.0 * v_n * v_n) + 2.0 * k4 * u_n * v_n 204 | 205 | u_d = u_nd * f_u + c_u 206 | v_d = v_nd * f_v + c_v 207 | 208 | valid_flag = np.ones_like(u_d) 209 | valid_flag[points[:,0] <0] = 0 210 | 211 | # Replace the invalid ones 212 | r2_sqrt_rcp = 1.0 / np.sqrt(r2) 213 | clipping_radius = np.sqrt(img_width**2 + img_height**2) 214 | u_d[invalid_idx] = u_n[invalid_idx] * r2_sqrt_rcp[invalid_idx] * clipping_radius + c_u 215 | v_d[invalid_idx] = v_n[invalid_idx] * r2_sqrt_rcp[invalid_idx] * clipping_radius + c_v 216 | valid_flag[invalid_idx] = 0 217 | 218 | # Change the flags of the pixels that project outside of an image 219 | valid_flag[u_d < 0 ] = 0 220 | valid_flag[v_d < 0 ] = 0 221 | valid_flag[u_d > img_width] = 0 222 | valid_flag[v_d > img_height] = 0 223 | 224 | 225 | return np.concatenate((u_d[:,None], v_d[:,None]),axis=1), np.where(valid_flag == 1)[0] 226 | 227 | elif camera_model == "f_theta": 228 | 229 | # Initialize the forward polynomial 230 | fw_poly = Polynomial(intrinsic[9:14]) 231 | 232 | xy_norm = np.zeros((points.shape[0], 1)) 233 | 234 | for i, point in enumerate(points): 235 | xy_norm[i] = numericallyStable2Norm2D(point[0], point[1]) 236 | 237 | cos_alpha = points[:, 2:] / np.linalg.norm(points, axis=1, keepdims=True) 238 | alpha = np.arccos(np.clip(cos_alpha, -1 + 1e-7, 1 - 1e-7)) 239 | delta = np.zeros_like(cos_alpha) 240 | valid = alpha <= intrinsic[16] 241 | 242 | delta[valid] = fw_poly(alpha[valid]) 243 | 244 | # For outside the model (which need to do linear extrapolation) 245 | delta[~valid] = (intrinsic[14] + (alpha[~valid] - intrinsic[16]) * intrinsic[15]) 246 | 247 | # Determine the bad points with a norm of zero, and avoid division by zero 248 | bad_norm = xy_norm <= 0 249 | xy_norm[bad_norm] = 1 250 | delta[bad_norm] = 0 251 | 252 | # compute pixel relative to center 253 | scale = delta / xy_norm 254 | pixel = scale * points 255 | 256 | # Handle the edge cases (ray along image plane normal) 257 | edge_case_cond = (xy_norm <= 0.0).squeeze() 258 | pixel[edge_case_cond, :] = points[edge_case_cond, :] 259 | points_img = pixel 260 | points_img[:, :2] += intrinsic[0:2] 261 | 262 | # Mark the points that do not fall on the camera plane as invalid 263 | x_ok = np.logical_and(0 <= points_img[:, 0], points_img[:, 0] < img_width) 264 | y_ok = np.logical_and(0 <= points_img[:, 1], points_img[:, 1] < img_height) 265 | z_ok = points_img[:,2] > 0.0 266 | valid = np.logical_and(np.logical_and(x_ok, y_ok), 
z_ok) 267 | 268 | return points_img, np.where(valid==True)[0] 269 | 270 | 271 | def backwards_polynomial(pixel_norms, intrinsic): 272 | ret = 0 273 | for k, coeff in enumerate(intrinsic): 274 | 275 | ret += coeff * pixel_norms**k 276 | 277 | return ret 278 | 279 | def pixel_2_camera_ray(pixel_coords, intrinsic, camera_model): 280 | ''' Convert the pixel coordinates to a 3D ray in the camera coordinate system. 281 | 282 | Args: 283 | pixel_coords (np.array): pixel coordinates of the selected points [n,2] 284 | intrinsic (np.array): camera intrinsic parameters (size depends on the camera model) 285 | camera_model (string): camera model used for projection. Must be one of ['pinhole', 'f_theta'] 286 | 287 | Out: 288 | camera_rays (np.array): rays in the camera coordinate system [n,3] 289 | ''' 290 | 291 | camera_rays = np.ones((pixel_coords.shape[0],3)) 292 | 293 | if camera_model == 'pinhole': 294 | camera_rays[:,0] = (pixel_coords[:,0] + 0.5 - intrinsic[2]) / intrinsic[0] 295 | camera_rays[:,1] = (pixel_coords[:,1] + 0.5 - intrinsic[5]) / intrinsic[4] 296 | 297 | elif camera_model == "f_theta": 298 | pixel_offsets = np.ones((pixel_coords.shape[0],2)) 299 | pixel_offsets[:,0] = pixel_coords[:,0] - intrinsic[0] 300 | pixel_offsets[:,1] = pixel_coords[:,1] - intrinsic[1] 301 | 302 | pixel_norms = np.linalg.norm(pixel_offsets, axis=1, keepdims=True) 303 | 304 | alphas = backwards_polynomial(pixel_norms, intrinsic[4:9]) 305 | camera_rays[:,0:1] = (np.sin(alphas) * pixel_offsets[:,0:1]) / pixel_norms 306 | camera_rays[:,1:2] = (np.sin(alphas) * pixel_offsets[:,1:2]) / pixel_norms 307 | camera_rays[:,2:3] = np.cos(alphas) 308 | 309 | # special case: ray is perpendicular to image plane normal 310 | valid = (pixel_norms > np.finfo(np.float32).eps).squeeze() 311 | camera_rays[~valid, :] = (0, 0, 1) # This is what DW sets these rays to 312 | 313 | return camera_rays 314 | 315 | def compute_fw_polynomial(intrinsic): 316 | 317 | img_width = intrinsic[2] 318 | img_height = intrinsic[3] 319 | cxcy = np.array(intrinsic[0:2]) 320 | 321 | max_value = 0.0 322 | value = np.linalg.norm(np.asarray([0.0, 0.0], dtype=cxcy.dtype) - cxcy) 323 | max_value = max(max_value, value) 324 | value = np.linalg.norm(np.asarray([0.0, img_height], dtype=cxcy.dtype) - cxcy) 325 | max_value = max(max_value, value) 326 | value = np.linalg.norm(np.asarray([img_width, 0.0], dtype=cxcy.dtype) - cxcy) 327 | max_value = max(max_value, value) 328 | value = np.linalg.norm(np.asarray([img_width, img_height], dtype=cxcy.dtype) - cxcy) 329 | max_value = max(max_value, value) 330 | 331 | SAMPLE_COUNT = 500 332 | samples_x = [] 333 | samples_b = [] 334 | step = max_value / SAMPLE_COUNT 335 | x = step 336 | 337 | for _ in range(0, SAMPLE_COUNT): 338 | p = np.asarray([cxcy[0] + x, cxcy[1]], dtype=np.float64).reshape(-1,2) 339 | ray = pixel_2_camera_ray(p, intrinsic, 'f_theta') 340 | xy_norm = np.linalg.norm(ray[0, :2]) 341 | theta = np.arctan2(float(xy_norm), float(ray[0, 2])) 342 | samples_x.append(theta) 343 | samples_b.append(float(x)) 344 | x += step 345 | 346 | x = np.asarray(samples_x, dtype=np.float64) 347 | y = np.asarray(samples_b, dtype=np.float64) 348 | # Fit a 4th degree polynomial. The polynomial function is as follows: 349 | 350 | def f(x, b, x1, x2, x3, x4): 351 | """4th degree polynomial.""" 352 | return b + x1 * x + x2 * (x ** 2) + x3 * (x ** 3) + x4 * (x ** 4) 353 | 354 | # The constant in the polynomial should be zero, so add the `bounds` condition. 
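    # Added note: the bounds passed to curve_fit below constrain only the
    # constant term b to the interval [0, eps], so the fitted forward
    # (angle -> pixel distance) polynomial passes through the origin, i.e.
    # an angle of 0 maps to a pixel distance of 0. The remaining four
    # coefficients are left unconstrained, and the constant term is then
    # zeroed exactly when the coefficient array is assembled in the return
    # statement below.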
355 | coeffs, _ = curve_fit( 356 | f, 357 | x, 358 | y, 359 | bounds=( 360 | [0, -np.inf, -np.inf, -np.inf, -np.inf], 361 | [np.finfo(np.float64).eps, np.inf, np.inf, np.inf, np.inf], 362 | ), 363 | ) 364 | 365 | return np.array([np.float32(val) if i > 0 else 0 for i, val in enumerate(coeffs)], dtype=np.float32) 366 | 367 | 368 | def compute_ftheta_fov(intrinsic): 369 | """Computes the FOV of this camera model.""" 370 | max_x = intrinsic[2] - 1 371 | max_y = intrinsic[3] - 1 372 | 373 | point_left = np.asarray([0.0, intrinsic[1]]).reshape(-1,2) 374 | point_right = np.asarray([max_x, intrinsic[1]]).reshape(-1,2) 375 | point_top = np.asarray([intrinsic[0], 0.0]).reshape(-1,2) 376 | point_bottom = np.asarray([intrinsic[0], max_y]).reshape(-1,2) 377 | 378 | fov_left = _get_pixel_fov(point_left, intrinsic) 379 | fov_right = _get_pixel_fov(point_right, intrinsic) 380 | fov_top = _get_pixel_fov(point_top, intrinsic) 381 | fov_bottom = _get_pixel_fov(point_bottom, intrinsic) 382 | 383 | v_fov = fov_top + fov_bottom 384 | hz_fov = fov_left + fov_right 385 | max_angle = _compute_max_angle(intrinsic) 386 | 387 | return v_fov, hz_fov, max_angle 388 | 389 | 390 | def _get_pixel_fov(pt, intrinsic): 391 | """Gets the FOV for a given point. Used internally for FOV computation of the F-theta camera. 392 | 393 | Args: 394 | pt (np.ndarray): 2D pixel. 395 | 396 | Returns: 397 | fov (float): the FOV of the pixel. 398 | """ 399 | ray = pixel_2_camera_ray(pt, intrinsic, 'f_theta') 400 | fov = np.arctan2(np.linalg.norm(ray[:, :2], axis=1), ray[:, 2]) 401 | return fov 402 | 403 | 404 | def _compute_max_angle(intrinsic): 405 | 406 | p = np.asarray( 407 | [[0, 0], [intrinsic[2] - 1, 0], [0, intrinsic[3] - 1], [intrinsic[2] - 1, intrinsic[3] - 1]], dtype=np.float32 408 | ) 409 | 410 | return max( 411 | max(_get_pixel_fov(p[0:1, ...], intrinsic), _get_pixel_fov(p[1:2, ...], intrinsic)), 412 | max(_get_pixel_fov(p[2:3, ...], intrinsic), _get_pixel_fov(p[3:4, ...], intrinsic)), 413 | ) 414 | 415 | def get_sensor_to_sensor_flu(sensor): 416 | """Compute a rotation transformation matrix that rotates sensor to Front-Left-Up format. 417 | 418 | Args: 419 | sensor (str): sensor name. 420 | 421 | Returns: 422 | np.ndarray: the resulting rotation matrix. 423 | """ 424 | if "cam" in sensor: 425 | rot = [ 426 | [0.0, 0.0, 1.0, 0.0], 427 | [-1.0, 0.0, 0.0, 0.0], 428 | [0.0, -1.0, 0.0, 0.0], 429 | [0.0, 0.0, 0.0, 1.0], 430 | ] 431 | else: 432 | rot = np.eye(4, dtype=np.float32) 433 | 434 | return np.asarray(rot, dtype=np.float32) 435 | 436 | def parse_rig_sensors_from_dict(rig): 437 | """Parses the provided rig dictionary into a dictionary indexed by sensor name. 438 | 439 | Args: 440 | rig (Dict): Complete rig file as a dictionary. 441 | 442 | Returns: 443 | (Dict): Dictionary of sensor rigs indexed by sensor name. 444 | """ 445 | # Parse the properties from the rig file 446 | sensors = rig["rig"]["sensors"] 447 | 448 | sensors_dict = {sensor["name"]: sensor for sensor in sensors} 449 | return sensors_dict 450 | 451 | 452 | def parse_rig_sensors_from_file(rig_fp): 453 | """Parses the provided rig file into a dictionary indexed by sensor name. 454 | 455 | Args: 456 | rig_fp (str): Filepath to rig file. 457 | 458 | Returns: 459 | (Dict): Dictionary of sensor rigs indexed by sensor name. 
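    Example:
        Illustrative sketch only; the rig path and the sensor name below are
        hypothetical placeholders rather than files shipped with this repo:

        >>> sensors = parse_rig_sensors_from_file("rig.json")
        >>> cam = sensors["camera_front_wide_120fov"]
        >>> cam_to_rig, R = sensor_to_rig(cam)
        >>> intrinsics = camera_intrinsic_parameters(cam)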
460 | """ 461 | # Read the json file 462 | with open(rig_fp, "r") as fp: 463 | rig = json.load(fp) 464 | 465 | return parse_rig_sensors_from_dict(rig) 466 | 467 | 468 | def sensor_to_rig(sensor): 469 | 470 | sensor_name = sensor["name"] 471 | sensor_to_FLU = get_sensor_to_sensor_flu(sensor_name) 472 | 473 | nominal_T = sensor["nominalSensor2Rig_FLU"]["t"] 474 | nominal_R = sensor["nominalSensor2Rig_FLU"]["roll-pitch-yaw"] 475 | 476 | correction_T = np.zeros(3, dtype=np.float32) 477 | correction_R = np.zeros(3, dtype=np.float32) 478 | 479 | if ("correction_rig_T" in sensor.keys()): 480 | correction_T = sensor["correction_rig_T"] 481 | 482 | if ("correction_sensor_R_FLU" in sensor.keys()): 483 | assert "roll-pitch-yaw" in sensor["correction_sensor_R_FLU"].keys(), str(sensor["correction_sensor_R_FLU"]) 484 | correction_R = sensor["correction_sensor_R_FLU"]["roll-pitch-yaw"] 485 | 486 | nominal_R = euler_2_so3(nominal_R) 487 | correction_R = euler_2_so3(correction_R) 488 | 489 | #from worldsheet.render_utils import rotationMatrixToEulerAngles 490 | #from transformations import so3_2_axis_angle 491 | #print(np.degrees(rotationMatrixToEulerAngles(nominal_R))) 492 | #print(np.degrees(rotationMatrixToEulerAngles(correction_R))) 493 | R = nominal_R @ correction_R 494 | #print(np.degrees(rotationMatrixToEulerAngles(R))) 495 | #exit() 496 | T = np.array(nominal_T, dtype=np.float32) + np.array(correction_T, dtype=np.float32) 497 | 498 | transform = np.eye(4, dtype=np.float32) 499 | transform[:3, :3] = R 500 | transform[:3, 3] = T 501 | 502 | sensor_to_rig = transform @ sensor_to_FLU 503 | 504 | return sensor_to_rig, R 505 | 506 | 507 | def camera_intrinsic_parameters(sensor: dict, 508 | logger: Optional[logging.Logger] = None 509 | ) -> np.ndarray: 510 | """ Parses the provided rig-style camera sensor dictionary into FTheta camera intrinsic parameters. 
511 | 512 | Note: currenlty only 5th-order 'pixeldistance-to-angle' ("bw-poly") FTheta are supported, possibly 513 | available 6th-order term will be dropped with a warning 514 | 515 | Args: 516 | sensor: the dictionary of the sensor parameters read from the rig file 517 | logger: if provided, the logger to issue warnings in (e.g., on not supported coeffiecients) 518 | Returns: 519 | intrinsic: array of FTheta intrinsics [cx, cy, width, height, [bwpoly]] 520 | """ 521 | 522 | assert sensor['properties'][ 523 | 'Model'] == 'ftheta', "unsupported camera model (only supporting FTheta)" 524 | 525 | cx = float(sensor['properties']['cx']) 526 | cy = float(sensor['properties']['cy']) 527 | width = float(sensor['properties']['width']) 528 | height = float(sensor['properties']['height']) 529 | 530 | if 'bw-poly' in sensor['properties']: 531 | # Legacy 5-th order backwards-polynomial 532 | bwpoly = [ 533 | np.float32(val) for val in sensor['properties']['bw-poly'].split() 534 | ] 535 | assert len( 536 | bwpoly 537 | ) == 5, "expecting fifth-order coefficients for 'bw-poly / 'pixeldistance-to-angle' polynomial" 538 | elif 'polynomial' in sensor['properties']: 539 | # Two-way forward / backward polynomial encoding 540 | assert sensor['properties']['polynomial-type'] == 'pixeldistance-to-angle', \ 541 | f"currently only supporting 'pixeldistance-to-angle' polynomial type, received '{sensor['properties']['polynomial-type']}'" 542 | 543 | bwpoly = [ 544 | np.float32(val) 545 | for val in sensor['properties']['polynomial'].split() 546 | ] 547 | 548 | if len(bwpoly) > 5: 549 | # WAR: 6th-order polynomials are currently not supported in the software-stack, drop highest order coeffient for now 550 | # TODO: extend internal camera model and NGP with support for 6th-order polynomials 551 | if logger: 552 | logger.warn( 553 | f"> encountered higher-order distortion polynomial for camera '{sensor['name']}', restricting to 5th-order, dropping coefficients '{bwpoly[5:]}' - parsed model might be inaccurate" 554 | ) 555 | 556 | bwpoly = bwpoly[:5] 557 | 558 | # Affine term is currently not supported, issue a warning if it differs from identity 559 | # TODO: properly incorporate c,d,e coefficients of affine term [c, d; e, 1] into software stack (internal camera models + NGP) 560 | A = np.matrix([[np.float32(sensor['properties'].get('c', 1.0)), np.float32(sensor['properties'].get('d', 0.0))], \ 561 | [np.float32(sensor['properties'].get('e', 0.0)), np.float32(1.0)]]) 562 | 563 | if (A != np.identity(2, dtype=np.float32)).any(): 564 | if logger: 565 | logger.warn( 566 | f"> *not* considering non-identity affine term '{A}' for '{sensor['name']}' - parsed model might be inaccurate" 567 | ) 568 | 569 | else: 570 | raise ValueError("unsupported distortion polynomial type") 571 | 572 | intrinsic = [cx, cy, width, height] + bwpoly 573 | 574 | return np.array(intrinsic, dtype=np.float32) 575 | 576 | def degree_to_radian(angles: ArrayLike): 577 | """Convert angles in degrees to radians. 578 | Args: 579 | angles (torch.Tensor | np.ndarray): An array in degrees. 580 | Returns: 581 | angles (torch.Tensor | np.ndarray): An array in radians. Range will be [0, 2 * pi). 
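    Example:
        A small sketch added for illustration (the values are arbitrary); the
        same call also works with torch tensors:

        >>> rad = degree_to_radian(np.array([0.0, 90.0, -90.0, 450.0]))
        >>> np.allclose(rad, [0.0, np.pi / 2, 3 * np.pi / 2, np.pi / 2])
        True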
582 | """ 583 | if isinstance(angles, np.ndarray): 584 | opset = np 585 | elif isinstance(angles, torch.Tensor): 586 | opset = torch 587 | else: 588 | raise TypeError("Unsupported data type.") 589 | angles = opset.remainder(angles, 360) 590 | angles = opset.divide(angles * np.pi, 180) 591 | return angles 592 | 593 | def euler_to_rotation_matrix(euler_angles: ArrayLike) -> np.ndarray: 594 | """Convert euler angles to a rotation matrix. 595 | Args: 596 | euler_angles: An array of euler angles using the Tait-Bryan convention 597 | of yaw, pitch, roll (the ZYX sequence of intrinsic rotations), as 598 | described in: 599 | https://en.wikipedia.org/wiki/Euler_angles#Conventions. 600 | Angles are assumed to be in radians. 601 | Returns: 602 | rotation_matrix: A 3x3 rotation matrix as a numpy array. 603 | """ 604 | sin_yaw, sin_pitch, sin_roll = np.sin(euler_angles) 605 | cos_yaw, cos_pitch, cos_roll = np.cos(euler_angles) 606 | # Calculations according to the Tait-Bryan column and ZYX row of the 607 | # following table: 608 | # https://en.wikipedia.org/wiki/Euler_angles#Rotation_matrix 609 | rotation_matrix = np.array( 610 | [ 611 | [ 612 | cos_yaw * cos_pitch, 613 | cos_yaw * sin_pitch * sin_roll - cos_roll * sin_yaw, 614 | sin_yaw * sin_roll + cos_yaw * cos_roll * sin_pitch, 615 | ], 616 | [ 617 | cos_pitch * sin_yaw, 618 | cos_yaw * cos_roll + sin_yaw * sin_pitch * sin_roll, 619 | cos_roll * sin_yaw * sin_pitch - cos_yaw * sin_roll, 620 | ], 621 | [-sin_pitch, cos_pitch * sin_roll, cos_pitch * cos_roll], 622 | ] 623 | ) 624 | return rotation_matrix 625 | 626 | def mrp_to_quat(mrp: ArrayLike) -> ArrayLike: 627 | """MRP to Quat. 628 | Convert modified Rodrigues parameters (MRP) 629 | representation to quaternion rotation. 630 | Args: 631 | mrp (torch.Tensor | np.ndarray): MRP based rotation 632 | Return: 633 | quat (torch.Tensor | np.ndarray): quaternion based rotation 634 | """ 635 | if isinstance(mrp, np.ndarray): 636 | opset = np 637 | elif isinstance(mrp, torch.Tensor): 638 | opset = torch 639 | else: 640 | raise TypeError("Unsupported data type for MRP.") 641 | if len(mrp.shape) == 1: 642 | # mrp is a 1d array 643 | assert mrp.shape[0] == 3 644 | quat = opset.zeros(4, dtype=mrp.dtype) 645 | mrp_squared_plus = 1 + mrp[0] ** 2 + mrp[1] ** 2 + mrp[2] ** 2 646 | quat[0] = 2 * mrp[0] / mrp_squared_plus 647 | quat[1] = 2 * mrp[1] / mrp_squared_plus 648 | quat[2] = 2 * mrp[2] / mrp_squared_plus 649 | quat[3] = (2 - mrp_squared_plus) / mrp_squared_plus 650 | elif len(mrp.shape) == 2: 651 | assert mrp.shape[1] == 3 652 | quat = opset.zeros((mrp.shape[0], 4), dtype=mrp.dtype) 653 | mrp_squared_plus = 1 + mrp[:, 0:1] ** 2 + mrp[:, 1:2] ** 2 + mrp[:, 2:3] ** 2 654 | quat[:, 0:1] = 2 * mrp[:, 0:1] / mrp_squared_plus 655 | quat[:, 1:2] = 2 * mrp[:, 1:2] / mrp_squared_plus 656 | quat[:, 2:3] = 2 * mrp[:, 2:3] / mrp_squared_plus 657 | quat[:, 3:4] = (2 - mrp_squared_plus) / mrp_squared_plus 658 | return quat 659 | 660 | def quat_to_rotation(quat: ArrayLike) -> ArrayLike: 661 | """Quat to Rotation matrix conversion. 662 | Convert quaternion rotation representation to Rotation matrix. 
663 | Args: 664 | quat (torch.Tensor | np.ndarray): quaternion based rotation 665 | Return: 666 | rot_matrix (torch.Tensor | np.ndarray): 3x3 rotation matrix 667 | """ 668 | # TOTO(bala) Looks like this quat to rotation implementation is inconsistent with 669 | # scipy.spatial.transform.Rotation.from_quat 670 | # Jira AMP-313 671 | if isinstance(quat, np.ndarray): 672 | opset = np 673 | elif isinstance(quat, torch.Tensor): 674 | opset = torch 675 | else: 676 | raise TypeError("Unsupported data type.") 677 | # Order is changed to match the convention of scipy.spatial.transform.Rotation.from_quat 678 | q0 = quat[3] 679 | q1 = quat[0] 680 | q2 = quat[1] 681 | q3 = quat[2] 682 | rot_matrix = opset.zeros((3, 3), dtype=quat.dtype) 683 | # First row of the rotation matrix 684 | rot_matrix[0][0] = 1 - 2 * (q2 * q2 + q3 * q3) 685 | rot_matrix[0][1] = 2 * (q1 * q2 - q0 * q3) 686 | rot_matrix[0][2] = 2 * (q1 * q3 + q0 * q2) 687 | # Second row of the rotation matrix 688 | rot_matrix[1][0] = 2 * (q1 * q2 + q0 * q3) 689 | rot_matrix[1][1] = 1 - 2 * (q1 * q1 + q3 * q3) 690 | rot_matrix[1][2] = 2 * (q2 * q3 - q0 * q1) 691 | # Third row of the rotation matrix 692 | rot_matrix[2][0] = 2 * (q1 * q3 - q0 * q2) 693 | rot_matrix[2][1] = 2 * (q2 * q3 + q0 * q1) 694 | rot_matrix[2][2] = 1 - 2 * (q1 * q1 + q2 * q2) 695 | # 3x3 rotation matrix 696 | return rot_matrix 697 | 698 | def compute_cuboid_corners( 699 | cuboid: ArrayLike, use_polar: bool = False, use_mrp: bool = True 700 | ) -> ArrayLike: 701 | """Compute 8 cuboid vertices. 702 | Args: 703 | cuboid (torch.Tensor | np.ndarray): It can be an array of 9 elements 704 | (x/radius, y/angle, z, w, l, h, mrp_0/x_angle, mrp_1/y_angle, mrp_2/z_angle) 705 | representing a 3d cuboid. (mrp_0, mrp_1, mrp_2) triplet is the MRP representation from 706 | https://link.springer.com/content/pdf/10.1007/s10851-017-0765-x.pdf 707 | Or it is also can be an array of 15 elements 708 | (x/radius, y/angle, z, w, l, h, 9 elements in rotation matrix). 709 | use_polar (bool): flag whether center_x/center_y is represented polar coordinates. 710 | Default `False` means using Cartesian coordinates. 711 | use_mrp (bool): flag whether the rotation is represented as MRP (see above for details) 712 | or as (yaw,pitch,roll). Default `True` means MRP representation is expected. 
713 | Returns: 714 | corners ((torch.Tensor | np.ndarray)): an array of 3d vertices of shape [8, 3] 715 | """ 716 | if isinstance(cuboid, np.ndarray): 717 | opset = np 718 | elif isinstance(cuboid, torch.Tensor): 719 | opset = torch 720 | else: 721 | raise TypeError("Unsupported data type.") 722 | if cuboid.shape[0] == 15: 723 | rot = cuboid[6::].reshape(3, 3) 724 | else: 725 | rot = cuboid[6:9] 726 | if use_mrp: 727 | rot = quat_to_rotation(mrp_to_quat(rot)) 728 | else: 729 | rot = euler_to_rotation_matrix(rot) 730 | if use_polar: 731 | u = cuboid[0] 732 | v = cuboid[1] 733 | center_x = u * opset.cos(v) 734 | center_y = u * opset.sin(v) 735 | else: 736 | center_x = cuboid[0] 737 | center_y = cuboid[1] 738 | center_z = cuboid[2] 739 | dim_0 = cuboid[3] / 2 740 | dim_1 = cuboid[4] / 2 741 | dim_2 = cuboid[5] / 2 742 | t = opset.zeros((3, 1), dtype=cuboid.dtype) 743 | t[0][0] = center_x 744 | t[1][0] = center_y 745 | t[2][0] = center_z 746 | corners = opset.zeros((3, 10), dtype=cuboid.dtype) 747 | corners[0, 0] = center_x + dim_0 # Front, Left, Top 748 | corners[1, 0] = center_y + dim_1 749 | corners[2, 0] = center_z + dim_2 750 | corners[0, 1] = center_x + dim_0 # Front, right, top 751 | corners[1, 1] = center_y - dim_1 752 | corners[2, 1] = center_z + dim_2 753 | corners[0, 2] = center_x - dim_0 # Back, right, top 754 | corners[1, 2] = center_y - dim_1 755 | corners[2, 2] = center_z + dim_2 756 | corners[0, 3] = center_x - dim_0 # Back, left, top 757 | corners[1, 3] = center_y + dim_1 758 | corners[2, 3] = center_z + dim_2 759 | corners[0, 4] = center_x + dim_0 # Front, left, bot 760 | corners[1, 4] = center_y + dim_1 761 | corners[2, 4] = center_z - dim_2 762 | corners[0, 5] = center_x + dim_0 # Front, right, bot 763 | corners[1, 5] = center_y - dim_1 764 | corners[2, 5] = center_z - dim_2 765 | corners[0, 6] = center_x - dim_0 # Back, right, bot 766 | corners[1, 6] = center_y - dim_1 767 | corners[2, 6] = center_z - dim_2 768 | corners[0, 7] = center_x - dim_0 # Back, leftt, bot 769 | corners[1, 7] = center_y + dim_1 770 | corners[2, 7] = center_z - dim_2 771 | corners[0, 8] = center_x 772 | corners[1, 8] = center_y 773 | corners[2, 8] = center_z 774 | corners[0, 9] = center_x + max(MIN_ORIENTATION_ARROW_LENGTH, 2 * dim_0) 775 | corners[1, 9] = center_y 776 | corners[2, 9] = center_z 777 | corners = opset.matmul(rot, (corners - t)) + t 778 | corners = corners.T 779 | return corners 780 | 781 | def shape2d_flatten_vertices(shape2d): 782 | """Retrieves and flattens the vertices of a Shape2D object into a single list of pairs. 783 | Args: 784 | shape2d(protobuf): Shape2D protobuf with Repeated field of vertices. 785 | Returns: 786 | list of list of floats. 787 | """ 788 | vertices = getattr(shape2d, shape2d.WhichOneof("shape2d")).vertices 789 | return [[vertex.x, vertex.y] for vertex in vertices] 790 | 791 | def shape2d_flatten_cuboid2d_vertices(cuboid2d): 792 | """Flattens the vertices into a single list pairs as expected by the rest of the code. 793 | Args: 794 | cuboid2d(protobuf): cuboid2d protobuf with repeated field of vertices. 795 | Returns: 796 | list of list of floats. 797 | """ 798 | return [[vertex.x, vertex.y] for vertex in cuboid2d] 799 | 800 | def attribute_as_dict(attributes: protobuf.pyext._message.RepeatedCompositeContainer): 801 | """Converts interchange attributes in a label to a dict mapping attribute name to value. 802 | Handles extraction of the attribute based on the type of the attribute and extracts the list 803 | out in the case where it's an enum_list. 
804 | Args: 805 | attributes: (proto) label with attributes to be extracted. 806 | Returns: 807 | Dict: parsed dictionary mapping attribute.name to attribute.value 808 | """ 809 | attribute_dict = {} 810 | for attribute in attributes: 811 | which_one = attribute.WhichOneof("attr") 812 | value = getattr(attribute, which_one) 813 | if which_one == "enums_list": 814 | value = value.enums_list 815 | attribute_dict[attribute.name] = value 816 | return attribute_dict 817 | 818 | def parse(features, frames) -> Cuboid3DGT: 819 | """Parse method for camera features. 820 | Args: 821 | scene (Dict): A dictionary mapping a sensor name to a tuple consisting of the list 822 | of Features belonging to that frame and a Frame object containing metadata about the 823 | sensor frame. 824 | Raises: 825 | AssertionError: If scene doesn't contain a sensor defined in the feature_sensors list. 826 | AssertionError: If label is not in interchange format. 827 | Returns: 828 | result (Cuboid3DGT): A collection of the cuboid3d labels accumulated over all feature 829 | rows corresponding to this scene. 830 | """ 831 | """ 832 | if row.label_data_type == LabelDataType.from_string("SHAPE2D:BOX2D"): 833 | shape2d = row.data.shape2d 834 | for attr in shape2d.attributes: 835 | if attr.name == "cuboid3d_rig": 836 | cuboid3d = Cuboid3D.from_ilf(attr.cuboid3d) 837 | """ 838 | 839 | result = Cuboid3DGT(*[[] for _ in Cuboid3DGT._fields]) 840 | default_box2ds = {} 841 | for row in features: 842 | if row.label_data_type == LabelDataType.from_string("SHAPE2D:BOX2D"): 843 | shape3d = row.data.shape2d 844 | attributes = attribute_as_dict(shape3d.attributes) 845 | label_name = attributes.get("label_name", None) 846 | occlusion_ratio = attributes.get("occlusion_ratio", None) 847 | #print(label_name) 848 | if "vehicle" not in label_name and "truck" not in label_name and "automobile" not in label_name: 849 | continue 850 | #print(attributes.get("cuboid3d_rig")) 851 | #print(type(attributes.get("cuboid3d_rig"))) 852 | #exit() 853 | cuboid3d = Cuboid3D.from_ilf(attributes.get("cuboid3d_rig", None)) 854 | result.cuboid3ds.append(cuboid3d) 855 | result.visibility.append(occlusion_ratio) 856 | return result 857 | 858 | def parse_drivesim(data) -> Cuboid3DGT: 859 | cuboid3d = None 860 | ratio = None 861 | for attr in data["shape2d"]["attributes"]: 862 | if attr["name"] == "occlusion_ratio": 863 | ratio = attr["numerical"] 864 | elif attr["name"] == "cuboid3d_rig": 865 | cuboid3d = attr["cuboid3d"] 866 | center = Center(cuboid3d['center']['x'], cuboid3d['center']['y'], cuboid3d['center']['z']) 867 | orientation = Orientation( 868 | cuboid3d['orientation']['x'], cuboid3d['orientation']['y'], cuboid3d['orientation']['z'] 869 | ) 870 | dimension = Dimension( 871 | cuboid3d['dimensions']['x'], cuboid3d['dimensions']['y'], cuboid3d['dimensions']['z'] 872 | ) 873 | cuboid3d = Cuboid3D(center=center, orientation=orientation, dimension=dimension) 874 | return cuboid3d, ratio 875 | 876 | """ 877 | cuboid3d = data["shape3d"]["cuboid3d"] 878 | center = Center(cuboid3d['center']['x'], cuboid3d['center']['y'], cuboid3d['center']['z']) 879 | orientation = Orientation( 880 | cuboid3d['orientation']['x'], cuboid3d['orientation']['y'], cuboid3d['orientation']['z'] 881 | ) 882 | dimension = Dimension( 883 | cuboid3d['dimensions']['x'], cuboid3d['dimensions']['y'], cuboid3d['dimensions']['z'] 884 | ) 885 | cuboid3d = Cuboid3D(center=center, orientation=orientation, dimension=dimension) 886 | visibility = None 887 | for attr in data["shape3d"]["attributes"]: 888 
| if attr["name"] == "camera_front_wide_120fov_visibility": 889 | visibility = attr["numerical"] 890 | return cuboid3d, visibility 891 | """ 892 | 893 | """ 894 | def read_2d_boxes(shape2d, frames: Frame, denormalize=True) -> Optional[BBox2D]: 895 | if False: #self._use_human_label: 896 | vertices = shape2d_flatten_vertices(shape2d) 897 | x_min, x_max = vertices[0][0], vertices[1][0] 898 | y_min, y_max = vertices[0][1], vertices[1][1] 899 | else: 900 | attributes = attribute_as_dict(shape2d.attributes) 901 | cuboid2d = attributes.get("cuboid2d", None) 902 | if cuboid2d is None: 903 | return None 904 | cuboid_vertices = shape2d_flatten_cuboid2d_vertices( 905 | cuboid2d.vertices_object 906 | ) 907 | if cuboid_vertices == []: 908 | return None 909 | x_min = min(cuboid_vertices, key=lambda x: x[0])[0] 910 | x_max = max(cuboid_vertices, key=lambda x: x[0])[0] 911 | y_min = min(cuboid_vertices, key=lambda x: x[1])[1] 912 | y_max = max(cuboid_vertices, key=lambda x: x[1])[1] 913 | if denormalize: 914 | x_min = x_min * frames.original_width 915 | x_max = x_max * frames.original_width 916 | y_min = y_min * frames.original_height 917 | y_max = y_max * frames.original_height 918 | box2d = BBox2D.from_ilf([[x_min, y_min], [x_max, y_max]]) 919 | return box2d 920 | """ 921 | -------------------------------------------------------------------------------- /tools/misc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. 
Alvarez 5 | """ 6 | import os 7 | import numpy as np 8 | import torch 9 | import torchvision 10 | from tqdm import tqdm 11 | from pyquaternion import Quaternion 12 | from PIL import Image 13 | from functools import reduce 14 | import matplotlib as mpl 15 | mpl.use('Agg') 16 | import matplotlib.pyplot as plt 17 | from nuscenes.utils.data_classes import LidarPointCloud 18 | from nuscenes.utils.geometry_utils import transform_matrix 19 | from nuscenes.map_expansion.map_api import NuScenesMap 20 | 21 | def pixel2ray(u, v, u0, v0, coef): 22 | xdiff = u - u0 23 | ydiff = v - v0 24 | norm = np.sqrt(xdiff*xdiff + ydiff*ydiff) 25 | if not isinstance(norm, float) or norm > 1e-6: 26 | xdiff /= norm 27 | ydiff /= norm 28 | 29 | alpha = coef[0] + coef[1]*norm + coef[2]*norm*norm + coef[3]*norm*norm*norm\ 30 | + coef[4]*norm*norm*norm*norm 31 | alphaz = np.cos(alpha) 32 | 33 | radicand = 1.0 - alphaz*alphaz 34 | if isinstance(radicand, float) and radicand < 1e-6: 35 | radicand = 1e-6 36 | m = np.sqrt(radicand) 37 | 38 | return m*xdiff, m*ydiff, alphaz 39 | 40 | def single_dewarp(u, v, u0, v0, fx, fy, coef): 41 | x, y, alphaz = pixel2ray(u, v, u0, v0, coef) 42 | 43 | x = x / alphaz * fx + u0 44 | y = y / alphaz * fy + v0 45 | 46 | return x, y 47 | 48 | def ray2pixel(finalu, finalv, scale, coef, u0, v0): 49 | alpha = np.arccos(scale / np.sqrt(finalu*finalu + finalv*finalv + scale*scale)) 50 | delta = coef[0] + coef[1]*alpha + coef[2]*alpha*alpha + coef[3]*alpha*alpha*alpha\ 51 | + coef[4]*alpha*alpha*alpha*alpha 52 | xynorm = np.sqrt(finalu*finalu + finalv*finalv) 53 | kfac = delta / xynorm 54 | 55 | u = kfac*finalu + u0 56 | v = kfac*finalv + v0 57 | return u, v 58 | 59 | def single_towarp(u, v, coef, u0, v0, fx, fy): 60 | newu = (u - u0) / fx 61 | newv = (v - v0) / fy 62 | scale = 1.0 / np.sqrt(newu*newu + newv*newv + 1.0) 63 | newu *= scale 64 | newv *= scale 65 | return ray2pixel(newu, newv, scale, coef, u0, v0) 66 | 67 | def dewarp_img(img, u0, v0, fx, fy, coef, kfac, fwpoly): 68 | xs = kfac*np.array([np.arange(img.width) for _ in range(img.height)]) 69 | ys = kfac*np.array([np.arange(img.height) for _ in range(img.width)]).T 70 | 71 | # x, y = single_dewarp(xs, ys, u0*kfac, v0*kfac, focal*kfac, coef) 72 | # x /= kfac 73 | # y /= kfac 74 | 75 | # kept = np.logical_and(x >= 0, x < img.width) 76 | # kept = np.logical_and(kept, y >= 0) 77 | # kept = np.logical_and(kept, y < img.height) 78 | 79 | # final = np.zeros((img.height, img.width, 3), dtype=np.uint8) 80 | # final[y[kept].astype(int), x[kept].astype(int), :] = np.array(img)[kept] 81 | 82 | # # checker 83 | # inx, iny = 400.0, 200.0 84 | # print('checking', inx, iny) 85 | # outx, outy = single_dewarp(400.0, 200.0, u0*kfac, v0*kfac, focal*kfac, coef) 86 | # print('middle', outx, outy) 87 | # outx, outy = single_towarp(outx, outy, fwpoly, u0*kfac, v0*kfac, focal*kfac) 88 | # print('back', outx, outy) 89 | 90 | x, y = single_towarp(xs, ys, fwpoly, u0*kfac, v0*kfac, fx*kfac, fy*kfac) 91 | x /= kfac 92 | y /= kfac 93 | 94 | kept = np.logical_and(x >= 0, x < img.width) 95 | kept = np.logical_and(kept, y >= 0) 96 | kept = np.logical_and(kept, y < img.height) 97 | final = np.zeros((img.height, img.width, 3), dtype=np.uint8) 98 | final[kept] = np.array(img)[y[kept].astype(int), x[kept].astype(int), :] 99 | 100 | return final 101 | 102 | def choose_cams(is_train, data_aug_conf): 103 | if is_train and data_aug_conf['Ncams'] < len(data_aug_conf['cams']): 104 | cams = np.random.choice(data_aug_conf['cams'], data_aug_conf['Ncams'], 105 | replace=False) 106 | 
else: 107 | cams = data_aug_conf['cams'] 108 | return cams 109 | 110 | def parse_calibration(calib, width, height, cam_adjust, pitch=0, yaw=0, height_adjust=0): 111 | info = {} 112 | for k, cal in calib.items(): 113 | if k == 'LIDAR_TOP': 114 | continue 115 | if pitch == 0: 116 | pitch = cam_adjust[k]['pitch'] 117 | if yaw == 0: 118 | yaw = cam_adjust[k]['yaw'] 119 | if height_adjust == 0: 120 | height_adjust = cam_adjust[k]['height'] 121 | 122 | trans = [cal['trans'][0], -cal['trans'][1], cal['trans'][2] + height_adjust] 123 | intrins = np.identity(3) 124 | intrins[0, 2] = width / 2.0 125 | intrins[1, 2] = height / 2.0 126 | intrins[0, 0] = intrins[1, 1] = width / (2.0 * np.tan((cal['fov'] + cam_adjust[k]['fov']) * np.pi / 360.0)) 127 | 128 | coordmat = np.array([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) 129 | rot = np.matmul( 130 | Quaternion(axis=[0, 0, 1], angle=np.radians(-(cal['yaw'] + yaw))).rotation_matrix, 131 | np.linalg.inv(coordmat)) 132 | if pitch != 0: 133 | rot = np.matmul( 134 | Quaternion(axis=[0, 1, 0], angle=np.radians(-(pitch))).rotation_matrix, 135 | np.linalg.inv(coordmat)) 136 | 137 | quat = Quaternion(matrix=rot) 138 | 139 | info[k] = {'trans': trans, 'intrins': intrins, 140 | 'rot': quat} 141 | return info 142 | 143 | def sample_augmentation(data_aug_conf, is_train): 144 | H, W = data_aug_conf['H'], data_aug_conf['W'] 145 | fH, fW = data_aug_conf['final_dim'] 146 | if is_train: 147 | resize = np.random.uniform(*data_aug_conf['resize_lim']) 148 | resize_dims = (int(W * resize), int(H * resize)) 149 | newW, newH = resize_dims 150 | crop_h = int((1 - np.random.uniform(*data_aug_conf['bot_pct_lim'])) * newH) - fH 151 | crop_w = int(np.random.uniform(0, max(0, newW - fW))) 152 | crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) 153 | flip = False 154 | if data_aug_conf['rand_flip'] and np.random.choice([0, 1]): 155 | flip = True 156 | rotate = np.random.uniform(*data_aug_conf['rot_lim']) 157 | else: 158 | resize = max(fH / H, fW / W) 159 | resize_dims = (int(W * resize), int(H * resize)) 160 | newW, newH = resize_dims 161 | crop_h = int((1 - np.mean(data_aug_conf['bot_pct_lim'])) * newH) - fH 162 | crop_w = int(max(0, newW - fW) / 2) 163 | crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) 164 | flip = False 165 | rotate = 0 166 | return resize, resize_dims, crop, flip, rotate 167 | 168 | def get_lidar_data(nusc, sample_rec, nsweeps, min_distance): 169 | """ 170 | Returns at most nsweeps of lidar in the ego frame. 171 | Returned tensor is 5(x, y, z, reflectance, dt) x N 172 | Adapted from https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/utils/data_classes.py#L56 173 | """ 174 | points = np.zeros((5, 0)) 175 | 176 | # Get reference pose and timestamp. 177 | ref_sd_token = sample_rec['data']['LIDAR_TOP'] 178 | ref_sd_rec = nusc.get('sample_data', ref_sd_token) 179 | ref_pose_rec = nusc.get('ego_pose', ref_sd_rec['ego_pose_token']) 180 | ref_cs_rec = nusc.get('calibrated_sensor', ref_sd_rec['calibrated_sensor_token']) 181 | ref_time = 1e-6 * ref_sd_rec['timestamp'] 182 | 183 | # Homogeneous transformation matrix from global to _current_ ego car frame. 184 | car_from_global = transform_matrix(ref_pose_rec['translation'], Quaternion(ref_pose_rec['rotation']), 185 | inverse=True) 186 | 187 | # Aggregate current and previous sweeps. 188 | sample_data_token = sample_rec['data']['LIDAR_TOP'] 189 | current_sd_rec = nusc.get('sample_data', sample_data_token) 190 | for _ in range(nsweeps): 191 | # Load up the pointcloud and remove points close to the sensor. 
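        # Added note: each iteration below loads one sweep in its own lidar
        # sensor frame, maps it into the reference ego frame via the chained
        # transform car_from_global @ global_from_car @ car_from_current, and
        # appends a per-point time-lag row, preserving the 5 x N layout
        # (x, y, z, reflectance, dt) described in the docstring above.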
192 | current_pc = LidarPointCloud.from_file(os.path.join(nusc.dataroot, current_sd_rec['filename'])) 193 | current_pc.remove_close(min_distance) 194 | 195 | # Get past pose. 196 | current_pose_rec = nusc.get('ego_pose', current_sd_rec['ego_pose_token']) 197 | global_from_car = transform_matrix(current_pose_rec['translation'], 198 | Quaternion(current_pose_rec['rotation']), inverse=False) 199 | 200 | # Homogeneous transformation matrix from sensor coordinate frame to ego car frame. 201 | current_cs_rec = nusc.get('calibrated_sensor', current_sd_rec['calibrated_sensor_token']) 202 | car_from_current = transform_matrix(current_cs_rec['translation'], Quaternion(current_cs_rec['rotation']), 203 | inverse=False) 204 | 205 | # Fuse four transformation matrices into one and perform transform. 206 | trans_matrix = reduce(np.dot, [car_from_global, global_from_car, car_from_current]) 207 | current_pc.transform(trans_matrix) 208 | 209 | # Add time vector which can be used as a temporal feature. 210 | time_lag = ref_time - 1e-6 * current_sd_rec['timestamp'] 211 | times = time_lag * np.ones((1, current_pc.nbr_points())) 212 | 213 | new_points = np.concatenate((current_pc.points, times), 0) 214 | points = np.concatenate((points, new_points), 1) 215 | 216 | # Abort if there are no previous sweeps. 217 | if current_sd_rec['prev'] == '': 218 | break 219 | else: 220 | current_sd_rec = nusc.get('sample_data', current_sd_rec['prev']) 221 | 222 | return points 223 | 224 | 225 | def ego_to_cam(points, rot, trans, intrins): 226 | """Transform points (3 x N) from ego frame into a pinhole camera 227 | """ 228 | print(points.dtype, rot.dtype, trans.dtype, intrins.dtype) 229 | points = points - trans.unsqueeze(1) 230 | points = rot.permute(1, 0).matmul(points) 231 | 232 | points = intrins.matmul(points) 233 | points[:2] /= points[2:3] 234 | 235 | return points 236 | 237 | 238 | def cam_to_ego(points, rot, trans, intrins): 239 | """Transform points (3 x N) from pinhole camera with depth 240 | to the ego frame 241 | """ 242 | points = torch.cat((points[:2] * points[2:3], points[2:3])) 243 | points = intrins.inverse().matmul(points) 244 | 245 | points = rot.matmul(points) 246 | points += trans.unsqueeze(1) 247 | 248 | return points 249 | 250 | 251 | def get_only_in_img_mask(pts, H, W): 252 | """pts should be 3 x N 253 | """ 254 | return (pts[2] > 0) &\ 255 | (pts[0] > 1) & (pts[0] < W - 1) &\ 256 | (pts[1] > 1) & (pts[1] < H - 1) 257 | 258 | 259 | def get_rot(h): 260 | return torch.Tensor([ 261 | [np.cos(h), np.sin(h)], 262 | [-np.sin(h), np.cos(h)], 263 | ]) 264 | 265 | 266 | def img_transform(img, post_rot, post_tran, 267 | resize, resize_dims, crop, 268 | flip, rotate): 269 | # adjust image 270 | img = img.resize(resize_dims) 271 | img = img.crop(crop) 272 | if flip: 273 | img = img.transpose(method=Image.FLIP_LEFT_RIGHT) 274 | img = img.rotate(rotate) 275 | 276 | # post-homography transformation 277 | post_rot *= resize 278 | post_tran -= torch.Tensor(crop[:2]) 279 | if flip: 280 | A = torch.Tensor([[-1, 0], [0, 1]]) 281 | b = torch.Tensor([crop[2] - crop[0], 0]) 282 | post_rot = A.matmul(post_rot) 283 | post_tran = A.matmul(post_tran) + b 284 | A = get_rot(rotate/180*np.pi) 285 | b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 286 | b = A.matmul(-b) + b 287 | post_rot = A.matmul(post_rot) 288 | post_tran = A.matmul(post_tran) + b 289 | 290 | return img, post_rot, post_tran 291 | 292 | 293 | class NormalizeInverse(torchvision.transforms.Normalize): 294 | # 
https://discuss.pytorch.org/t/simple-way-to-inverse-transform-normalization/4821/8 295 | def __init__(self, mean, std): 296 | mean = torch.as_tensor(mean) 297 | std = torch.as_tensor(std) 298 | std_inv = 1 / (std + 1e-7) 299 | mean_inv = -mean * std_inv 300 | super().__init__(mean=mean_inv, std=std_inv) 301 | 302 | def __call__(self, tensor): 303 | return super().__call__(tensor.clone()) 304 | 305 | 306 | denormalize_img = torchvision.transforms.Compose(( 307 | NormalizeInverse(mean=[0.485, 0.456, 0.406], 308 | std=[0.229, 0.224, 0.225]), 309 | torchvision.transforms.ToPILImage(), 310 | )) 311 | 312 | 313 | normalize_img = torchvision.transforms.Compose(( 314 | torchvision.transforms.ToTensor(), 315 | torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], 316 | std=[0.229, 0.224, 0.225]), 317 | )) 318 | 319 | 320 | def gen_dx_bx(xbound, ybound, zbound): 321 | dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) 322 | bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) 323 | #nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) 324 | nx = torch.Tensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) 325 | 326 | return dx, bx, nx 327 | 328 | 329 | def cumsum_trick(x, geom_feats, ranks): 330 | x = x.cumsum(0) 331 | kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) 332 | kept[:-1] = (ranks[1:] != ranks[:-1]) 333 | 334 | x, geom_feats = x[kept], geom_feats[kept] 335 | x = torch.cat((x[:1], x[1:] - x[:-1])) 336 | 337 | return x, geom_feats 338 | 339 | 340 | class QuickCumsum(torch.autograd.Function): 341 | @staticmethod 342 | def forward(ctx, x, geom_feats, ranks): 343 | x = x.cumsum(0) 344 | kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) 345 | kept[:-1] = (ranks[1:] != ranks[:-1]) 346 | 347 | x, geom_feats = x[kept], geom_feats[kept] 348 | x = torch.cat((x[:1], x[1:] - x[:-1])) 349 | 350 | # save kept for backward 351 | ctx.save_for_backward(kept) 352 | 353 | # no gradient for geom_feats 354 | ctx.mark_non_differentiable(geom_feats) 355 | 356 | return x, geom_feats 357 | 358 | @staticmethod 359 | def backward(ctx, gradx, gradgeom): 360 | kept, = ctx.saved_tensors 361 | back = torch.cumsum(kept, 0) 362 | back[kept] -= 1 363 | 364 | val = gradx[back] 365 | 366 | return val, None, None 367 | 368 | 369 | class SimpleLoss(torch.nn.Module): 370 | def __init__(self, pos_weight): 371 | super(SimpleLoss, self).__init__() 372 | self.loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([pos_weight])) 373 | 374 | def forward(self, ypred, ytgt): 375 | loss = self.loss_fn(ypred, ytgt) 376 | return loss 377 | 378 | 379 | def get_batch_iou(preds, binimgs): 380 | """Assumes preds has NOT been sigmoided yet 381 | """ 382 | with torch.no_grad(): 383 | pred = (preds > 0) 384 | tgt = binimgs.bool() 385 | intersect = (pred & tgt).sum().float().item() 386 | union = (pred | tgt).sum().float().item() 387 | return intersect, union, intersect / union if (union > 0) else 1.0 388 | 389 | 390 | def get_val_info(model, valloader, loss_fn, device, use_tqdm=False): 391 | model.eval() 392 | total_loss = 0.0 393 | total_intersect = 0.0 394 | total_union = 0 395 | print('running eval...') 396 | loader = tqdm(valloader) if use_tqdm else valloader 397 | with torch.no_grad(): 398 | for batch in loader: 399 | allimgs, rots, trans, intrins, post_rots, post_trans, binimgs = batch 400 | preds = model(allimgs.to(device), rots.to(device), 401 | trans.to(device), intrins.to(device), post_rots.to(device), 
402 | post_trans.to(device)) 403 | binimgs = binimgs.to(device) 404 | 405 | # loss 406 | total_loss += loss_fn(preds, binimgs).item() * preds.shape[0] 407 | 408 | # iou 409 | intersect, union, _ = get_batch_iou(preds, binimgs) 410 | total_intersect += intersect 411 | total_union += union 412 | 413 | model.train() 414 | return { 415 | 'loss': total_loss / len(valloader.dataset), 416 | 'iou': total_intersect / total_union, 417 | } 418 | 419 | 420 | def add_ego(bx, dx): 421 | # approximate rear axel 422 | W = 1.85 423 | pts = np.array([ 424 | [-4.084/2.+0.5, W/2.], 425 | [4.084/2.+0.5, W/2.], 426 | [4.084/2.+0.5, -W/2.], 427 | [-4.084/2.+0.5, -W/2.], 428 | ]) 429 | pts = (pts - bx) / dx 430 | pts[:, [0,1]] = pts[:, [1,0]] 431 | plt.fill(pts[:, 0], pts[:, 1], '#76b900') 432 | 433 | 434 | def get_nusc_maps(map_folder): 435 | nusc_maps = {map_name: NuScenesMap(dataroot=map_folder, 436 | map_name=map_name) for map_name in [ 437 | "singapore-hollandvillage", 438 | "singapore-queenstown", 439 | "boston-seaport", 440 | "singapore-onenorth", 441 | ]} 442 | return nusc_maps 443 | 444 | 445 | def plot_nusc_map(rec, nusc_maps, nusc, scene2map, dx, bx): 446 | egopose = nusc.get('ego_pose', nusc.get('sample_data', rec['data']['LIDAR_TOP'])['ego_pose_token']) 447 | map_name = scene2map[nusc.get('scene', rec['scene_token'])['name']] 448 | 449 | rot = Quaternion(egopose['rotation']).rotation_matrix 450 | rot = np.arctan2(rot[1, 0], rot[0, 0]) 451 | center = np.array([egopose['translation'][0], egopose['translation'][1], np.cos(rot), np.sin(rot)]) 452 | 453 | poly_names = ['road_segment', 'lane'] 454 | line_names = ['road_divider', 'lane_divider'] 455 | lmap = get_local_map(nusc_maps[map_name], center, 456 | 50.0, poly_names, line_names) 457 | for name in poly_names: 458 | for la in lmap[name]: 459 | pts = (la - bx) / dx 460 | plt.fill(pts[:, 1], pts[:, 0], c=(1.00, 0.50, 0.31), alpha=0.2) 461 | for la in lmap['road_divider']: 462 | pts = (la - bx) / dx 463 | plt.plot(pts[:, 1], pts[:, 0], c=(0.0, 0.0, 1.0), alpha=0.5) 464 | for la in lmap['lane_divider']: 465 | pts = (la - bx) / dx 466 | plt.plot(pts[:, 1], pts[:, 0], c=(159./255., 0.0, 1.0), alpha=0.5) 467 | 468 | 469 | def get_local_map(nmap, center, stretch, layer_names, line_names): 470 | # need to get the map here... 
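    # Added note: `center` is expected as [x, y, cos(yaw), sin(yaw)] in global
    # map coordinates (see plot_nusc_map above). The code below queries every
    # record inside a square patch of side 2 * stretch, collects polygon
    # exteriors for `layer_names` and polyline nodes for `line_names`, and
    # finally rotates/translates all points into the ego-aligned local frame.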
471 | box_coords = ( 472 | center[0] - stretch, 473 | center[1] - stretch, 474 | center[0] + stretch, 475 | center[1] + stretch, 476 | ) 477 | 478 | polys = {} 479 | 480 | # polygons 481 | records_in_patch = nmap.get_records_in_patch(box_coords, 482 | layer_names=layer_names, 483 | mode='intersect') 484 | for layer_name in layer_names: 485 | polys[layer_name] = [] 486 | for token in records_in_patch[layer_name]: 487 | poly_record = nmap.get(layer_name, token) 488 | if layer_name == 'drivable_area': 489 | polygon_tokens = poly_record['polygon_tokens'] 490 | else: 491 | polygon_tokens = [poly_record['polygon_token']] 492 | 493 | for polygon_token in polygon_tokens: 494 | polygon = nmap.extract_polygon(polygon_token) 495 | polys[layer_name].append(np.array(polygon.exterior.xy).T) 496 | 497 | # lines 498 | for layer_name in line_names: 499 | polys[layer_name] = [] 500 | for record in getattr(nmap, layer_name): 501 | token = record['token'] 502 | 503 | line = nmap.extract_line(record['line_token']) 504 | if line.is_empty: # Skip lines without nodes 505 | continue 506 | xs, ys = line.xy 507 | 508 | polys[layer_name].append( 509 | np.array([xs, ys]).T 510 | ) 511 | 512 | # convert to local coordinates in place 513 | rot = get_rot(np.arctan2(center[3], center[2])).T 514 | for layer_name in polys: 515 | for rowi in range(len(polys[layer_name])): 516 | polys[layer_name][rowi] -= center[:2] 517 | polys[layer_name][rowi] = np.dot(polys[layer_name][rowi], rot) 518 | 519 | return polys 520 | 521 | -------------------------------------------------------------------------------- /tools/sensor_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. Alvarez 5 | """ 6 | 7 | """Camera model definitions.""" 8 | import cv2 9 | import json 10 | import logging 11 | from typing import Any, Dict, Tuple, Union 12 | import numpy as np 13 | from numpy.polynomial.polynomial import Polynomial 14 | from scipy.optimize import curve_fit 15 | logger = logging.getLogger("dlav.autonet_v2.sdk.sensors.camera.camera_models") 16 | 17 | 18 | class IdealPinholeCamera: 19 | """Reperesents an ideal pinhole camera with no distortions. 20 | You can either pass in the fov or you can pass in the actual 21 | focal point parameters. It is the users choice. If you pass 22 | in the fov, then the f_x, f_y parameters are computed for you. 23 | Otherwise, they are directly inserted into the intrinsic matrix. 24 | """ 25 | def __init__( 26 | self, 27 | fov_x_deg: Union[float, int] = None, 28 | fov_y_deg: Union[float, int] = None, 29 | f_x: Union[float, int] = None, 30 | f_y: Union[float, int] = None, 31 | width: int = 3848, 32 | height: int = 2168, 33 | ): 34 | """The __init__ function. 35 | Args: 36 | fov_x_deg (Union[float, int]): the horizontal FOV in degrees. 37 | fov_y_deg (Union[float, int]): the vertical FOV in degrees. 38 | f_x (Union[float, int]): the f_x value of the intrinsic calibration 39 | matrix 40 | f_y (Union[float, int]): the f_y value of the intrinsic calibration 41 | matrix 42 | width (int): the width of the image. Defaults to 3848 43 | height (int): the height of the image. 
Defaults to 2168 44 | """ 45 | if f_x and fov_x_deg or f_y and fov_y_deg: 46 | raise ValueError( 47 | "Either f_x,f_y or fov_x_deg, fov_y_deg can" 48 | "be passed in but not both. User must select which" 49 | "operational mode you intend to use. If you want to" 50 | "directly insert fx,fy into the intrinsic calibration" 51 | "matrix then do not pass in fov_x_deg or fov_y_deg" 52 | "and if you want to compute f_x, f_y from the FOV then" 53 | "do not pass in f_x, f_y" 54 | ) 55 | self._width = width 56 | self._height = height 57 | self._cx = width / 2 58 | self._cy = height / 2 59 | # You can pass in the values directly. 60 | if f_x and f_y: 61 | self._f_x = f_x 62 | self._f_y = f_y 63 | else: 64 | self._focal_from_fov(fov_x_deg, fov_y_deg) 65 | # The intrinsics matrix 66 | self._k = np.asarray( 67 | [[self._f_x, 0, self._cx], [0, self._f_y, self._cy], [0, 0, 1]], 68 | dtype=np.float32, 69 | ) 70 | # The inverse of the intrinsics matrix (for backprojection) 71 | self._k_inv = np.asarray( 72 | [ 73 | [1.0 / self._f_x, 0, -self._cx / self._f_x], 74 | [0, 1.0 / self._f_y, -self._cy / self._f_y], 75 | [0, 0, 1], 76 | ], 77 | dtype=np.float32, 78 | ) 79 | @property 80 | def width(self) -> int: 81 | """Returns the width of the sensor.""" 82 | return self._width 83 | @property 84 | def height(self) -> int: 85 | """"Returns the height of the sensor.""" 86 | return self._height 87 | def _focal_from_fov( 88 | self, fov_x_deg: Union[float, int], fov_y_deg: Union[float, int] 89 | ): 90 | """Compute the focal length from horizontal and vertical FOVs. 91 | Args: 92 | fov_x_deg (Union[float, int]): the horizontal FOV in degrees. 93 | fov_y_deg (Union[float, int]): the vertical FOV in degrees. 94 | """ 95 | fov_x = np.radians(fov_x_deg) 96 | fov_y = np.radians(fov_y_deg) 97 | self._f_x = self._width / (2.0 * np.tan(fov_x * 0.5)) 98 | self._f_y = self._height / (2.0 * np.tan(fov_y * 0.5)) 99 | def ray2pixel(self, rays: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 100 | """Project 3D rays to 2D pixel coordinates. 101 | Args: 102 | rays (np.ndarray): the rays as (N, 3) where N corresponds to 103 | the number of rays and 3 is the (x,y,z) coordinates for each 104 | ray. 105 | Returns: 106 | projected (np.ndarray): Shape (N,2) the projected pixel coordinates 107 | where N is the number of points and 2 corresponds to the (x,y) dimensions. 108 | valid (np.ndarray): of Shape (N,) the validity flag for each projected pixel. 109 | Valid is a boolean array that can be used for indexing rays 110 | that are within FOV. 111 | """ 112 | if np.ndim(rays) == 1: 113 | rays = rays[np.newaxis, :] 114 | rays = rays.astype(np.float32) 115 | r = rays / rays[:, 2:] 116 | projected = np.matmul(self._k, r.T).T 117 | x_ok = np.logical_and(0 <= projected[:, 0], projected[:, 0] < self._width) 118 | y_ok = np.logical_and(0 <= projected[:, 1], projected[:, 1] < self._height) 119 | valid = np.logical_and(x_ok, y_ok) 120 | return projected[:, :2], valid 121 | def pixel2ray(self, pixels: np.ndarray) -> np.ndarray: 122 | """Backproject 2D pixels into 3D rays. 123 | Args: 124 | pixels (np.ndarray): the pixels to backproject. Size of (n_points, 2), where the first 125 | column contains the `x` values, and the second column contains the `y` values. 126 | Returns: 127 | rays (np.ndarray): the backprojected 3D rays. 
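        Example:
            A hedged usage sketch (added for illustration; the FOV values are
            arbitrary, and the default 3848 x 2168 sensor size is assumed):

            >>> cam = IdealPinholeCamera(fov_x_deg=120, fov_y_deg=70)
            >>> rays = cam.pixel2ray(np.array([[1924.0, 1084.0]]))
            >>> rays.shape
            (1, 3)
            >>> np.allclose(rays[0], [0.0, 0.0, 1.0])  # principal point -> optical axis
            True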
128 | """ 129 | if np.ndim(pixels) == 1: 130 | pixels = pixels[np.newaxis, :] 131 | pixels = pixels.astype(np.float32) 132 | # Add the third component of ones 133 | pixels = np.c_[pixels, np.ones((pixels.shape[0], 1), dtype=np.float32)] 134 | rays = np.matmul(self._k_inv, pixels.T).T 135 | # Normalize the rays 136 | norm = np.linalg.norm(rays, axis=1, keepdims=True) 137 | norm[norm == 0] = 1 138 | return rays / norm 139 | class FThetaCamera: 140 | """Defines an FTheta camera model.""" 141 | @classmethod 142 | def from_rig(cls, rig_file: str, sensor_name: str): 143 | """Helper method to initialize a new object using a rig file and the sensor's name. 144 | Args: 145 | rig_file (str): the rig file path. 146 | sensor_name (str): the name of the sensor. 147 | Returns: 148 | FThetaCamera: the newly created object. 149 | """ 150 | with open(rig_file, "r") as fp: 151 | rig = json.load(fp) 152 | # Parse the properties from the rig file 153 | sensors = rig["rig"]["sensors"] 154 | sensor = None 155 | sensor_found = False 156 | for sensor in sensors: 157 | if sensor["name"] == sensor_name: 158 | sensor_found = True 159 | break 160 | if not sensor_found: 161 | raise ValueError(f"The camera '{sensor_name}' was not found in the rig!") 162 | return cls.from_dict(sensor) 163 | @classmethod 164 | def from_dict(cls, rig_dict: Dict[str, Any]): 165 | """Helper method to initialize a new object using a dictionary of the rig. 166 | Args: 167 | rig_dict (dict): the sensor dictionary to initialize with. 168 | Returns: 169 | FThetaCamera: the newly created object. 170 | """ 171 | cx, cy, width, height, bw_poly = FThetaCamera.get_ftheta_parameters_from_json( 172 | rig_dict 173 | ) 174 | return cls(cx, cy, width, height, bw_poly) 175 | @classmethod 176 | def from_intrinsics_array(cls, intrinsics: np.ndarray): 177 | """Helper method to initialize a new object using an array of intrinsics. 178 | Args: 179 | intrinsics (np.ndarray): the intrinsics array. The ordering is expected to be 180 | "cx, cy, width, height, bw_poly". This is the same ordering as the `intrinsics` 181 | property of this class. 182 | Returns: 183 | FThetaCaamera: the newly created object. 184 | """ 185 | return cls( 186 | cx=intrinsics[0], 187 | cy=intrinsics[1], 188 | width=intrinsics[2], 189 | height=intrinsics[3], 190 | bw_poly=intrinsics[4:], 191 | ) 192 | def __init__( 193 | self, cx: float, cy: float, width: int, height: int, bw_poly: np.ndarray 194 | ): 195 | """The __init__ method. 196 | Args: 197 | cx (float): optical center x. 198 | cy (float): optical center y. 199 | width (int): the width of the image. 200 | height (int): the height of the image. 201 | bw_poly (np.ndarray): the backward polynomial of the FTheta model. 
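        The backward polynomial maps the pixel distance from the optical center to the ray
        angle in radians (the "pixeldistance-to-angle" convention used by the rig files).
        Illustrative example (the coefficients below are made up for demonstration, not a real rig):
            >>> import numpy as np
            >>> cam = FThetaCamera(cx=1924.0, cy=1084.0, width=3848, height=2168,
            ...                    bw_poly=np.array([0.0, 5.0e-4, 0.0, 0.0, 0.0]))
            >>> cam.width, cam.height
            (3848, 2168)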
202 | """ 203 | self._center = np.asarray([cx, cy], dtype=np.float32) 204 | self._width = int(width) 205 | self._height = int(height) 206 | assert isinstance(bw_poly, np.ndarray) or isinstance(bw_poly, list) 207 | self.bw_poly = Polynomial(bw_poly) 208 | self._fw_poly = self._compute_fw_poly() 209 | # Other properties that need to be computed 210 | self._horizontal_fov = None 211 | self._vertical_fov = None 212 | self._max_angle = None 213 | self._max_ray_angle = None 214 | # Populate the array of intrinsics 215 | self._intrinsics = np.append([cx, cy, width, height], bw_poly).astype( 216 | np.float32 217 | ) 218 | self._update_calibrated_camera() 219 | @staticmethod 220 | def get_ftheta_parameters_from_json(rig_dict: Dict[str, Any]) -> Tuple[Any]: 221 | """Helper method for obtaining FTheta camera model parameters from a rig dict. 222 | Args: 223 | rig_dict (Dict[str, Any]): the rig dictionary to parse. 224 | Raises: 225 | ValueError: if the provided rig is not supported. 226 | AssertionError: if the provided model is supported, but cannot be parsed properly. 227 | Returns: 228 | Tuple[Any]: the values `cx`, `cy`, `width`, `height` and `bw_poly` that were parsed. 229 | """ 230 | props = rig_dict["properties"] 231 | if props["Model"] != "ftheta": 232 | raise ValueError("The given camera is not an FTheta camera") 233 | cx = float(props["cx"]) 234 | cy = float(props["cy"]) 235 | width = int(props["width"]) 236 | height = int(props["height"]) 237 | if "bw-poly" in props: # Is this a regular rig? 238 | poly = props["bw-poly"] 239 | elif "polynomial" in props: # Is this a VT rig? 240 | # VT rigs have a slightly different format, so need to handle these 241 | # specifically. Refer to the following thread for more details: 242 | # https://nvidia.slack.com/archives/C017LLEG763/p1633304770105300 243 | poly_type = props["polynomial-type"] 244 | assert poly_type == "pixeldistance-to-angle", ( 245 | "Encountered an unsupported VT rig. Only `pixeldistance-to-angle` " 246 | f"polynomials are supported (got {poly_type}). Rig:\n{rig_dict}" 247 | ) 248 | linear_c = float(props["linear-c"]) if "linear-c" in props else None 249 | linear_d = float(props["linear-d"]) if "linear-d" in props else None 250 | linear_e = float(props["linear-e"]) if "linear-e" in props else None 251 | # If we had all the terms present, sanity check to make sure they are [1, 0, 0] 252 | if linear_c is not None and linear_d is not None and linear_e is not None: 253 | assert ( 254 | linear_c == 1.0 255 | ), f"Expected `linear-c` term to be 1.0 (got {linear_c}. Rig:\n{rig_dict})" 256 | assert ( 257 | linear_d == 0.0 258 | ), f"Expected `linear-d` term to be 0.0 (got {linear_d}. Rig:\n{rig_dict})" 259 | assert ( 260 | linear_e == 0.0 261 | ), f"Expected `linear-e` term to be 0.0 (got {linear_e}. Rig:\n{rig_dict})" 262 | # If we're here, then it means we can parse the rig successfully. 263 | poly = props["polynomial"] 264 | else: 265 | raise ValueError( 266 | f"Unable to parse the rig. Only FTheta rigs are supported! 
Rig:\n{rig_dict}" 267 | ) 268 | bw_poly = [np.float32(val) for val in poly.split()] 269 | return cx, cy, width, height, bw_poly 270 | @property 271 | def fov(self) -> tuple: 272 | """Returns a tuple of horizontal and vertical fov of the sensor.""" 273 | if self._vertical_fov is None or self._horizontal_fov is None: 274 | self._compute_fov() 275 | return self._horizontal_fov, self._vertical_fov 276 | @property 277 | def width(self) -> int: 278 | """Returns the width of the sensor.""" 279 | return self._width 280 | @property 281 | def height(self) -> int: 282 | """"Returns the height of the sensor.""" 283 | return self._height 284 | @property 285 | def center(self) -> np.ndarray: 286 | """"Returns the center of the sensor.""" 287 | return self._center 288 | @property 289 | def intrinsics(self) -> np.ndarray: 290 | """Obtain an array of the intrinsics of this camera model. 291 | Returns: 292 | np.ndarray: an array of intrinsics. The ordering is "cx, cy, width, height, bw_poly". 293 | dtype is np.float32. 294 | """ 295 | return self._intrinsics 296 | def __str__(self): 297 | """Returns a string representation of this object.""" 298 | return ( 299 | f"FTheta camera model:\n\t{self.bw_poly}\n\t" 300 | f"center={self._center}\n\twidth={self._width}\n\theight={self._height}\n\t" 301 | f"h_fov={np.degrees(self._horizontal_fov)}\n\tv_fov={np.degrees(self._vertical_fov)}" 302 | ) 303 | def _update_calibrated_camera(self): 304 | """Updates the internals of this object after calulating various properties.""" 305 | self._compute_fov() 306 | self._max_ray_angle = (self._max_angle).copy() 307 | is_fw_poly_slope_negative_in_domain = False 308 | ray_angle = (np.float32(self._max_ray_angle)).copy() 309 | deg2rad = np.pi / 180.0 310 | while ray_angle >= np.float32(0.0): 311 | temp_dval = self._fw_poly.deriv()(self._max_ray_angle).item() 312 | if temp_dval < 0: 313 | is_fw_poly_slope_negative_in_domain = True 314 | ray_angle -= deg2rad * np.float32(1.0) 315 | if is_fw_poly_slope_negative_in_domain: 316 | ray_angle = (np.float32(self._max_ray_angle)).copy() 317 | while ray_angle >= np.float32(0.0): 318 | ray_angle -= deg2rad * np.float32(1.0) 319 | raise ArithmeticError( 320 | "FThetaCamera: derivative of distortion within image interior is negative" 321 | ) 322 | # Evaluate the forward polynomial at point (self._max_ray_angle, 0) 323 | # Also evaluate its derivative at the same point 324 | val = self._fw_poly(self._max_ray_angle).item() 325 | dval = self._fw_poly.deriv()(self._max_ray_angle).item() 326 | if dval < 0: 327 | raise ArithmeticError( 328 | "FThetaCamera: derivative of distortion at edge of image is negative" 329 | ) 330 | self._max_ray_distortion = np.asarray([val, dval], dtype=np.float32) 331 | def _compute_fw_poly(self): 332 | """Computes the forward polynomial for this camera. 
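        The forward polynomial is the approximate inverse of `bw_poly`: it maps the angle
        between a ray and the optical axis (in radians) to the pixel distance from the optical
        center, and is obtained by sampling `pixel2ray` along the image x-axis and fitting a
        4th-degree polynomial whose bias term is forced to zero.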
333 | This function is a replication of the logic in the following file from the DW repo: 334 | src/dw/calibration/cameramodel/CameraModels.cpp 335 | """ 336 | def get_max_value(p0, p1): 337 | return np.linalg.norm( 338 | np.asarray([p0, p1], dtype=self._center.dtype) - self._center 339 | ) 340 | max_value = 0.0 341 | size = (self._width, self._height) 342 | value = get_max_value(0.0, 0.0) 343 | max_value = max(max_value, value) 344 | value = get_max_value(0.0, size[1]) 345 | max_value = max(max_value, value) 346 | value = get_max_value(size[0], 0.0) 347 | max_value = max(max_value, value) 348 | value = get_max_value(size[0], size[1]) 349 | max_value = max(max_value, value) 350 | SAMPLE_COUNT = 500 351 | samples_x = [] 352 | samples_b = [] 353 | step = max_value / SAMPLE_COUNT 354 | x = step 355 | for _ in range(0, SAMPLE_COUNT): 356 | p = np.asarray([self._center[0] + x, self._center[1]], dtype=np.float32) 357 | ray, _ = self.pixel2ray(p) 358 | xy_norm = np.linalg.norm(ray[0, :2]) 359 | theta = np.arctan2(float(xy_norm), float(ray[0, 2])) 360 | samples_x.append(theta) 361 | samples_b.append(float(x)) 362 | x += step 363 | x = np.asarray(samples_x, dtype=np.float64) 364 | y = np.asarray(samples_b, dtype=np.float64) 365 | # Fit a 4th degree polynomial. The polynomial function is as follows: 366 | def f(x, b, x1, x2, x3, x4): 367 | """4th degree polynomial.""" 368 | return b + x * (x1 + x * (x2 + x * (x3 + x * x4))) 369 | coeffs, _ = curve_fit( 370 | f, 371 | x, 372 | y, 373 | bounds=( 374 | [0, -np.inf, -np.inf, -np.inf, -np.inf], 375 | [np.finfo(np.float64).eps, np.inf, np.inf, np.inf, np.inf], 376 | ), 377 | ) 378 | # Return the polynomial and hardcode the bias value to 0 379 | return Polynomial( 380 | [np.float32(val) if i > 0 else 0 for i, val in enumerate(coeffs)] 381 | ) 382 | def pixel2ray(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 383 | """Backproject 2D pixels into 3D rays. 384 | Args: 385 | x (np.ndarray): the pixels to backproject. Size of (n_points, 2), where the first 386 | column contains the `x` values, and the second column contains the `y` values. 387 | Returns: 388 | rays (np.ndarray): the backprojected 3D rays. Size of (n_points, 3). 389 | valid (np.ndarray): bool flag indicating the validity of each backprojected pixel. 390 | """ 391 | # Make sure x is n x 2 392 | if np.ndim(x) == 1: 393 | x = x[np.newaxis, :] 394 | # Fix the type 395 | x = x.astype(np.float32) 396 | xd = x - self._center 397 | xd_norm = np.linalg.norm(xd, axis=1, keepdims=True) 398 | alpha = self.bw_poly(xd_norm) 399 | sin_alpha = np.sin(alpha) 400 | rx = sin_alpha * xd[:, 0:1] / xd_norm 401 | ry = sin_alpha * xd[:, 1:] / xd_norm 402 | rz = np.cos(alpha) 403 | rays = np.hstack((rx, ry, rz)) 404 | # special case: ray is perpendicular to image plane normal 405 | valid = (xd_norm > np.finfo(np.float32).eps).squeeze() 406 | rays[~valid, :] = (0, 0, 1) # This is what DW sets these rays to 407 | # note: 408 | # if constant coefficient of bwPoly is non-zero, 409 | # the resulting ray might not be normalized. 410 | return rays, valid 411 | def ray2pixel(self, rays: np.ndarray) -> np.ndarray: 412 | """Project 3D rays to 2D pixel coordinates. 413 | Args: 414 | rays (np.ndarray): the rays. 415 | Returns: 416 | result (np.ndarray): the projected pixel coordinates. 
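        For rays outside the calibrated angular range, the distortion is linearly extrapolated
        from its value and slope at the maximum ray angle.
        Illustrative round-trip (the intrinsics are made up, not a real rig):
            >>> import numpy as np
            >>> cam = FThetaCamera(cx=1924.0, cy=1084.0, width=3848, height=2168,
            ...                    bw_poly=np.array([0.0, 5.0e-4, 0.0, 0.0, 0.0]))
            >>> rays, _ = cam.pixel2ray(np.array([[2100.0, 1200.0]]))
            >>> np.allclose(cam.ray2pixel(rays), [[2100.0, 1200.0]], atol=1.0)
            True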
417 | """ 418 | # Make sure the input shape is (n_points, 3) 419 | if np.ndim(rays) == 1: 420 | rays = rays[np.newaxis, :] 421 | # Fix the type 422 | rays = rays.astype(np.float32) 423 | xy_norm = np.linalg.norm(rays[:, :2], axis=1, keepdims=True) 424 | cos_alpha = rays[:, 2:] / np.linalg.norm(rays, axis=1, keepdims=True) 425 | alpha = np.empty_like(cos_alpha) 426 | cos_alpha_condition = np.logical_and( 427 | cos_alpha > np.float32(-1.0), cos_alpha < np.float32(1.0) 428 | ).squeeze() 429 | alpha[cos_alpha_condition] = np.arccos(cos_alpha[cos_alpha_condition]) 430 | alpha[~cos_alpha_condition] = xy_norm[~cos_alpha_condition] 431 | delta = np.empty_like(cos_alpha) 432 | alpha_cond = alpha <= self._max_ray_angle 433 | delta[alpha_cond] = self._fw_poly(alpha[alpha_cond]) 434 | # For outside the model (which need to do linear extrapolation) 435 | delta[~alpha_cond] = ( 436 | self._max_ray_distortion[0] 437 | + (alpha[~alpha_cond] - self._max_ray_angle) * self._max_ray_distortion[1] 438 | ) 439 | # Determine the bad points with a norm of zero, and avoid division by zero 440 | bad_norm = xy_norm <= 0 441 | xy_norm[bad_norm] = 1 442 | delta[bad_norm] = 0 443 | # compute pixel relative to center 444 | scale = delta / xy_norm 445 | pixel = scale * rays 446 | # Handle the edge cases (ray along image plane normal) 447 | edge_case_cond = (xy_norm <= np.float32(0.0)).squeeze() 448 | pixel[edge_case_cond, :] = rays[edge_case_cond, :] 449 | result = pixel[:, :2] + self._center 450 | return result 451 | def _get_pixel_fov(self, pt: np.ndarray) -> float: 452 | """Gets the FOV for a given point. Used internally for FOV computation. 453 | Args: 454 | pt (np.ndarray): 2D pixel. 455 | Returns: 456 | fov (float): the FOV of the pixel. 457 | """ 458 | ray, _ = self.pixel2ray(pt) 459 | fov = np.arctan2(np.linalg.norm(ray[:, :2], axis=1), ray[:, 2]) 460 | return fov 461 | def _compute_fov(self): 462 | """Computes the FOV of this camera model.""" 463 | max_x = self._width - 1 464 | max_y = self._height - 1 465 | point_left = np.asarray([0, self._center[1]], dtype=np.float32) 466 | point_right = np.asarray([max_x, self._center[1]], dtype=np.float32) 467 | point_top = np.asarray([self._center[0], 0], dtype=np.float32) 468 | point_bottom = np.asarray([self._center[0], max_y], dtype=np.float32) 469 | fov_left = self._get_pixel_fov(point_left) 470 | fov_right = self._get_pixel_fov(point_right) 471 | fov_top = self._get_pixel_fov(point_top) 472 | fov_bottom = self._get_pixel_fov(point_bottom) 473 | self._vertical_fov = fov_top + fov_bottom 474 | self._horizontal_fov = fov_left + fov_right 475 | self._compute_max_angle() 476 | def _compute_max_angle(self): 477 | """Computes the maximum ray angle for this camera.""" 478 | max_x = self._width - 1 479 | max_y = self._height - 1 480 | p = np.asarray( 481 | [[0, 0], [max_x, 0], [0, max_y], [max_x, max_y]], dtype=np.float32 482 | ) 483 | self._max_angle = max( 484 | max(self._get_pixel_fov(p[0, ...]), self._get_pixel_fov(p[1, ...])), 485 | max(self._get_pixel_fov(p[2, ...]), self._get_pixel_fov(p[3, ...])), 486 | ) 487 | def is_ray_inside_fov(self, ray: np.ndarray) -> bool: 488 | """Determines whether a given ray is inside the FOV of this camera. 489 | Args: 490 | ray (np.ndarray): the 3D ray. 491 | Returns: 492 | bool: whether the ray is inside the FOV. 
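        For an (N, 3) input this returns a boolean array of shape (N,); the angle of each ray
        is compared against the maximum corner angle of the image.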
493 | """ 494 | if np.ndim(ray) == 1: 495 | ray = ray[np.newaxis, :] 496 | ray_angle = np.arctan2(np.linalg.norm(ray[:, :2], axis=1), ray[:, 2]) 497 | return ray_angle <= self._max_angle 498 | 499 | def rectify(img, model, ph_model: IdealPinholeCamera): 500 | """Apply rectification to the frame via backward sampling. 501 | Args: 502 | ph_model: pinhole camera model used to rectify the image. 503 | Returns: 504 | A copy of the CameraFrame object with updated `data`, `model`, 505 | and `car_mask` attributes. 506 | """ 507 | # we assume it's an FTheta camera model 508 | assert isinstance(model, FThetaCamera) 509 | ftheta_model: FThetaCamera = model 510 | # compute rays through each pixel in pinhole image 511 | ii, jj = np.meshgrid( 512 | np.arange(ph_model.height), np.arange(ph_model.width), indexing="ij" 513 | ) 514 | ph_coords = np.stack((jj, ii), axis=-1) # (H, W, 2) 515 | ph_coords = ph_coords.reshape(-1, 2) # (H*W, 2) 516 | ph_rays = ph_model.pixel2ray(ph_coords) 517 | # project rays into ftheta image 518 | ftheta_coords = ftheta_model.ray2pixel(ph_rays) 519 | # sample ftheta RGB and car mask 520 | map_xy = ftheta_coords.reshape((ph_model.height, ph_model.width, 2)) 521 | sampled_rgb = cv2.remap( 522 | img, map_xy[..., 0], map_xy[..., 1], interpolation=cv2.INTER_LINEAR 523 | ) 524 | # return new camera frame container 525 | """ 526 | rect = CameraFrame( 527 | timestamp=None, 528 | frame_number=None, 529 | sensor_name=None, 530 | data=sampled_rgb, 531 | rig=None, 532 | model=ph_model, 533 | ) 534 | """ 535 | return sampled_rgb 536 | 537 | def ph_pixel2ray(pixels: np.ndarray, k_inv) -> np.ndarray: 538 | """Backproject 2D pixels into 3D rays. 539 | Args: 540 | pixels (np.ndarray): the pixels to backproject. Size of (n_points, 2), where the first 541 | column contains the `x` values, and the second column contains the `y` values. 542 | Returns: 543 | rays (np.ndarray): the backprojected 3D rays. 544 | """ 545 | if np.ndim(pixels) == 1: 546 | pixels = pixels[np.newaxis, :] 547 | pixels = pixels.astype(np.float32) 548 | # Add the third component of ones 549 | pixels = np.c_[pixels, np.ones((pixels.shape[0], 1), dtype=np.float32)] 550 | rays = np.matmul(k_inv, pixels.T).T 551 | # Normalize the rays 552 | norm = np.linalg.norm(rays, axis=1, keepdims=True) 553 | norm[norm == 0] = 1 554 | return rays / norm 555 | 556 | def ftheta_pixel2ray(x: np.ndarray, center, bw_poly) -> Tuple[np.ndarray, np.ndarray]: 557 | """Backproject 2D pixels into 3D rays. 558 | Args: 559 | x (np.ndarray): the pixels to backproject. Size of (n_points, 2), where the first 560 | column contains the `x` values, and the second column contains the `y` values. 561 | Returns: 562 | rays (np.ndarray): the backprojected 3D rays. Size of (n_points, 3). 563 | valid (np.ndarray): bool flag indicating the validity of each backprojected pixel. 
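    This is a standalone copy of `FThetaCamera.pixel2ray` for callers that only have the raw
    `center` and `bw_poly` values rather than a constructed camera object.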
564 | """ 565 | # Make sure x is n x 2 566 | if np.ndim(x) == 1: 567 | x = x[np.newaxis, :] 568 | # Fix the type 569 | x = x.astype(np.float32) 570 | xd = x - center 571 | xd_norm = np.linalg.norm(xd, axis=1, keepdims=True) 572 | alpha = bw_poly(xd_norm) 573 | sin_alpha = np.sin(alpha) 574 | rx = sin_alpha * xd[:, 0:1] / xd_norm 575 | ry = sin_alpha * xd[:, 1:] / xd_norm 576 | rz = np.cos(alpha) 577 | rays = np.hstack((rx, ry, rz)) 578 | # special case: ray is perpendicular to image plane normal 579 | valid = (xd_norm > np.finfo(np.float32).eps).squeeze() 580 | rays[~valid, :] = (0, 0, 1) # This is what DW sets these rays to 581 | # note: 582 | # if constant coefficient of bwPoly is non-zero, 583 | # the resulting ray might not be normalized. 584 | return rays, valid 585 | 586 | def ftheta_ray2pixel(rays: np.ndarray, max_ray_angle, max_ray_distortion, center, fw_poly) -> np.ndarray: 587 | """Project 3D rays to 2D pixel coordinates. 588 | Args: 589 | rays (np.ndarray): the rays. 590 | Returns: 591 | result (np.ndarray): the projected pixel coordinates. 592 | """ 593 | # Make sure the input shape is (n_points, 3) 594 | if np.ndim(rays) == 1: 595 | rays = rays[np.newaxis, :] 596 | # Fix the type 597 | rays = rays.astype(np.float32) 598 | xy_norm = np.linalg.norm(rays[:, :2], axis=1, keepdims=True) 599 | cos_alpha = rays[:, 2:] / np.linalg.norm(rays, axis=1, keepdims=True) 600 | alpha = np.empty_like(cos_alpha) 601 | cos_alpha_condition = np.logical_and( 602 | cos_alpha > np.float32(-1.0), cos_alpha < np.float32(1.0) 603 | ).squeeze() 604 | alpha[cos_alpha_condition] = np.arccos(cos_alpha[cos_alpha_condition]) 605 | alpha[~cos_alpha_condition] = xy_norm[~cos_alpha_condition] 606 | delta = np.empty_like(cos_alpha) 607 | alpha_cond = alpha <= max_ray_angle 608 | delta[alpha_cond] = fw_poly(alpha[alpha_cond]) 609 | # For outside the model (which need to do linear extrapolation) 610 | delta[~alpha_cond] = ( 611 | max_ray_distortion[0] 612 | + (alpha[~alpha_cond] - max_ray_angle) * max_ray_distortion[1] 613 | ) 614 | # Determine the bad points with a norm of zero, and avoid division by zero 615 | bad_norm = xy_norm <= 0 616 | xy_norm[bad_norm] = 1 617 | delta[bad_norm] = 0 618 | # compute pixel relative to center 619 | scale = delta / xy_norm 620 | pixel = scale * rays 621 | # Handle the edge cases (ray along image plane normal) 622 | edge_case_cond = (xy_norm <= np.float32(0.0)).squeeze() 623 | pixel[edge_case_cond, :] = rays[edge_case_cond, :] 624 | result = pixel[:, :2] + center 625 | return result 626 | 627 | def compute_fw_poly(height, width, center): 628 | """Computes the forward polynomial for this camera. 
629 | This function is a replication of the logic in the following file from the DW repo: 630 | src/dw/calibration/cameramodel/CameraModels.cpp 631 | """ 632 | def get_max_value(p0, p1): 633 | return np.linalg.norm( 634 | np.asarray([p0, p1], dtype=center.dtype) - center 635 | ) 636 | max_value = 0.0 637 | size = (width, height) 638 | value = get_max_value(0.0, 0.0) 639 | max_value = max(max_value, value) 640 | value = get_max_value(0.0, size[1]) 641 | max_value = max(max_value, value) 642 | value = get_max_value(size[0], 0.0) 643 | max_value = max(max_value, value) 644 | value = get_max_value(size[0], size[1]) 645 | max_value = max(max_value, value) 646 | SAMPLE_COUNT = 500 647 | samples_x = [] 648 | samples_b = [] 649 | step = max_value / SAMPLE_COUNT 650 | x = step 651 | for _ in range(0, SAMPLE_COUNT): 652 | p = np.asarray([center[0] + x, center[1]], dtype=np.float32) 653 | ray, _ = ftheta_pixel2ray(p) 654 | xy_norm = np.linalg.norm(ray[0, :2]) 655 | theta = np.arctan2(float(xy_norm), float(ray[0, 2])) 656 | samples_x.append(theta) 657 | samples_b.append(float(x)) 658 | x += step 659 | x = np.asarray(samples_x, dtype=np.float64) 660 | y = np.asarray(samples_b, dtype=np.float64) 661 | # Fit a 4th degree polynomial. The polynomial function is as follows: 662 | def f(x, b, x1, x2, x3, x4): 663 | """4th degree polynomial.""" 664 | return b + x * (x1 + x * (x2 + x * (x3 + x * x4))) 665 | coeffs, _ = curve_fit( 666 | f, 667 | x, 668 | y, 669 | bounds=( 670 | [0, -np.inf, -np.inf, -np.inf, -np.inf], 671 | [np.finfo(np.float64).eps, np.inf, np.inf, np.inf, np.inf], 672 | ), 673 | ) 674 | # Return the polynomial and hardcode the bias value to 0 675 | return Polynomial( 676 | [np.float32(val) if i > 0 else 0 for i, val in enumerate(coeffs)] 677 | ) 678 | 679 | def rectify_fast(img, height, width, cx, cy, fx, fy, bw_poly): 680 | """Apply rectification to the frame via backward sampling. 681 | Args: 682 | ph_model: pinhole camera model used to rectify the image. 683 | Returns: 684 | A copy of the CameraFrame object with updated `data`, `model`, 685 | and `car_mask` attributes. 686 | """ 687 | # ASSUME PH AND FTHETA SHARE THE SAME CENTER 688 | center = np.asarray([cx, cy], dtype=np.float32) 689 | k_inv = np.asarray( 690 | [ 691 | [1.0 / fx, 0, -cx / fx], 692 | [0, 1.0 / fy, -cy / fy], 693 | [0, 0, 1], 694 | ], 695 | dtype=np.float32, 696 | ) 697 | # compute rays through each pixel in pinhole image 698 | ii, jj = np.meshgrid( 699 | np.arange(height), np.arange(width), indexing="ij" 700 | ) 701 | ph_coords = np.stack((jj, ii), axis=-1) # (H, W, 2) 702 | ph_coords = ph_coords.reshape(-1, 2) # (H*W, 2) 703 | ph_rays = ph_pixel2ray(ph_coords, k_inv) 704 | # project rays into ftheta image 705 | ftheta_coords = ftheta_ray2pixel(ph_rays) 706 | # sample ftheta RGB and car mask 707 | map_xy = ftheta_coords.reshape((height, width, 2)) 708 | sampled_rgb = cv2.remap( 709 | img, map_xy[..., 0], map_xy[..., 1], interpolation=cv2.INTER_LINEAR 710 | ) 711 | return sampled_rgb 712 | -------------------------------------------------------------------------------- /tools/transformations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2023 NVIDIA Corporation. All rights reserved. 3 | Licensed under the NVIDIA Source Code License. See LICENSE at https://github.com/NVlabs/viewpoint-robustness 4 | Authors: Tzofi Klinghoffer, Jonah Philion, Wenzheng Chen, Or Litany, Zan Gojcic, Jungseock Joo, Ramesh Raskar, Sanja Fidler, Jose M. 
Alvarez 5 | """ 6 | import numpy as np 7 | 8 | from scipy.spatial.transform import Rotation as R 9 | 10 | def so3_trans_2_se3(so3, trans): 11 | """Create a 4x4 rigid transformation matrix given so3 rotation and translation. 12 | 13 | Args: 14 | so3: rotation matrix [n,3,3] 15 | trans: x, y, z translation [n, 3] 16 | 17 | Returns: 18 | np.ndarray: the constructed transformation matrix [n,4,4] 19 | """ 20 | 21 | if so3.ndim > 2: 22 | T = np.eye(4) 23 | T = np.tile(T,(so3.shape[0], 1, 1)) 24 | T[:, 0:3, 0:3] = so3 25 | T[:, 0:3, 3] = trans.reshape(-1,3,) 26 | 27 | else: 28 | T = np.eye(4) 29 | T[0:3, 0:3] = so3 30 | T[0:3, 3] = trans.reshape(3,) 31 | 32 | return T 33 | 34 | def axis_angle_trans_2_se3(rot_axis, rot_angle, trans, degrees=True): 35 | ''' Converts the axis/angle rotation and translation to a se3 transformation matrix 36 | Args: 37 | translation (np.array): translation vectors (x,y,z) [n,3] 38 | axis (np.array): the rotation axes [n,3] 39 | angle float/double: rotation angles either in degrees or radians [n,1] 40 | degrees bool: True if angle is given in degrees else False 41 | 42 | Out: 43 | (np array): transformations in a se3 matrix representation [n,4,4] 44 | ''' 45 | 46 | return so3_trans_2_se3(axis_angle_2_so3(rot_axis, rot_angle, degrees), trans) 47 | 48 | 49 | 50 | 51 | def euler_trans_2_se3(euler_angles, trans, degrees=True, seq='xyz'): 52 | """Create a 4x4 rigid transformation matrix given euler angles and translation. 53 | 54 | Args: 55 | euler_angles (np.array): euler angles [n,3] 56 | trans (Sequence[float]): x, y, z translation. 57 | seq string: sequence in which the euler angles are given 58 | 59 | Returns: 60 | np.ndarray: the constructed transformation matrix. 61 | """ 62 | 63 | return so3_trans_2_se3(euler_2_so3(euler_angles, degrees), trans) 64 | 65 | 66 | 67 | def axis_angle_2_so3(axis, angle, degrees=True): 68 | ''' Converts the axis angle representation of the so3 rotation matrix 69 | Args: 70 | axis (np.array): the rotation axes [n,3] 71 | angle float/double: rotation angles either in degrees or radians [n] 72 | degrees bool: True if angle is given in degrees else False 73 | 74 | Out: 75 | (np array): rotations given so3 matrix representation [n,3,3] 76 | ''' 77 | # Treat angle (radians) below this as 0. 
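    # Rotations with a (near-)zero angle are treated as the identity regardless of the axis.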
78 | cutoff_angle = 1e-9 if not degrees else np.rad2deg(1e-9) 79 | angle[angle < cutoff_angle] = 0.0 80 | 81 | # Scale the axis to have the norm representing the angle 82 | axis_angle = (angle/np.linalg.norm(axis, axis=1, keepdims=True)) * axis 83 | 84 | return R.from_rotvec(axis_angle, degrees=degrees).as_matrix() 85 | 86 | 87 | def euler_2_so3(euler_angles, degrees=True, seq='xyz'): 88 | ''' Converts the euler angles representation to the so3 rotation matrix 89 | Args: 90 | euler_angles (np.array): euler angles [n,3] 91 | degrees bool: True if angle is given in degrees else False 92 | seq string: sequence in which the euler angles are given 93 | 94 | Out: 95 | (np array): rotations given so3 matrix representation [n,3,3] 96 | ''' 97 | 98 | return R.from_euler(seq=seq, angles=euler_angles, degrees=degrees).as_matrix().astype(np.float32) 99 | 100 | 101 | def axis_angle_2_quaternion(axis, angle, degrees=True): 102 | ''' Converts the axis angle representation of the rotation to a unit quaternion 103 | Args: 104 | axis (np.array): the rotation axis [n,3] 105 | angle float/double: rotation angle either in degrees or radians [n,1] 106 | degrees bool: True if angle is given in degrees else False 107 | 108 | Out: 109 | (np array): rotation given in unit quaternion [n,4] 110 | ''' 111 | return axis_angle_2_so3(axis, angle, degrees).as_quat() 112 | 113 | 114 | def so3_2_axis_angle(so3, degrees=True): 115 | ''' Converts the so3 representation to axis_angle 116 | Args: 117 | so3 (np.array): the rotation matrices [n,3,3] 118 | degrees bool: True if angle should be given in degrees 119 | 120 | Out: 121 | axis (np array): the rotation axis [n,3] 122 | angle (np array): the rotation angles, either in degrees (if degrees=True) or radians [n,] 123 | ''' 124 | rot_vec = R.from_matrix(so3).as_rotvec() #degrees=degrees) 125 | 126 | angle = np.linalg.norm(rot_vec, axis=-1, keepdims=True) 127 | axis = rot_vec / angle 128 | 129 | return axis, angle 130 | 131 | 132 | def euclidean_2_spherical_coords(coords): 133 | 134 | r = np.linalg.norm(coords, axis=-1, keepdims=True) 135 | el = np.arctan(coords[:,2]/np.linalg.norm(coords[:,:2], axis=-1)).reshape(-1,1) 136 | az = np.arctan2(coords[:,1],coords[:,0]).reshape(-1,1) 137 | 138 | return np.concatenate((r,az,el),axis=-1) 139 | 140 | def spherical_2_direction(spherical_coords): 141 | 142 | dx = np.cos(spherical_coords[:,2]) * np.cos(spherical_coords[:,1]) 143 | dy = np.cos(spherical_coords[:,2]) * np.sin(spherical_coords[:,1]) 144 | dz = np.sin(spherical_coords[:,2]) 145 | 146 | return np.concatenate((dx[:,None],dy[:,None],dz[:,None]),axis=-1) 147 | 148 | def transform_point_cloud(pc, T): 149 | ''' Transform the point cloud with the provided transformation matrix 150 | Args: 151 | pc (np.array): point cloud coordinates (x,y,z) [n,3] 152 | T (np.array): se3 transformation matrix [4,4] 153 | 154 | Out: 155 | (np array): transformed point cloud coordinated [n,3] 156 | ''' 157 | return (T[:3,:3] @ pc[:,:3].transpose() + T[:3,3:4]).transpose() 158 | 159 | 160 | 161 | def local_ENU_2_ECEF_orientation(theta, phi): 162 | ''' Computes the rotation matrix between the world_pose and ECEF coordinate system 163 | Args: 164 | theta (np.array): theta coordinates in radians [n,1] 165 | phi (np.array): phi coordinates in radians [n,1] 166 | Out: 167 | (np.array): rotation from world pose to ECEF in so3 representation [n,3,3] 168 | ''' 169 | z_dir = np.concatenate([(np.sin(theta)*np.cos(phi))[:,None], 170 | (np.sin(theta)*np.sin(phi))[:,None], 171 | (np.cos(theta))[:,None] ],axis=1) 172 
| z_dir = z_dir/np.linalg.norm(z_dir, axis=-1, keepdims=True) 173 | 174 | y_dir = np.concatenate([-(np.cos(theta)*np.cos(phi))[:,None], 175 | -(np.cos(theta)*np.sin(phi))[:,None], 176 | (np.sin(theta))[:,None] ],axis=1) 177 | y_dir = y_dir/np.linalg.norm(y_dir, axis=-1, keepdims=True) 178 | 179 | x_dir = np.cross(y_dir, z_dir) 180 | 181 | return np.concatenate([x_dir[:,:,None], y_dir[:,:,None], z_dir[:,:,None]], axis = -1) 182 | 183 | 184 | def lat_lng_alt_2_ECEF_elipsoidal(lat_lng_alt, a, b): 185 | ''' Converts the GPS (lat,long, alt) coordinates to the ECEF ones based on the ellipsoidal earth model 186 | Args: 187 | lat_lng_alt (np.array): latitude, longitude and altitude coordinate (in degrees and meters) [n,3] 188 | a (float/double): Semi-major axis of the ellipsoid 189 | b (float/double): Semi-minor axis of the ellipsoid 190 | Out: 191 | (np.array): ECEF coordinates[n,3] 192 | ''' 193 | 194 | phi = np.deg2rad(lat_lng_alt[:, 0]) 195 | gamma = np.deg2rad(lat_lng_alt[:, 1]) 196 | 197 | cos_phi = np.cos(phi) 198 | sin_phi = np.sin(phi) 199 | cos_gamma = np.cos(gamma) 200 | sin_gamma = np.sin(gamma) 201 | e_square = (a * a - b * b) / (a * a) 202 | 203 | N = a / np.sqrt(1 - e_square * sin_phi * sin_phi) 204 | 205 | 206 | x = (N + lat_lng_alt[:, 2]) * cos_phi * cos_gamma 207 | y = (N + lat_lng_alt[:, 2]) * cos_phi * sin_gamma 208 | z = (N * (b*b)/(a*a) + lat_lng_alt[:, 2]) * sin_phi 209 | 210 | return np.concatenate([x[:,None] ,y[:,None], z[:,None]], axis=1 ) 211 | 212 | def translation_2_lat_lng_alt_spherical(translation, earth_radius): 213 | ''' Computes the translation in the ECEF to latitude, longitude, altitude based on the spherical earth model 214 | Args: 215 | translation (np.array): translation in the ECEF coordinate frame (in meters) [n,3] 216 | earth_radius (float/double): earth radius 217 | Out: 218 | (np.array): latitude, longitude and altitude [n,3] 219 | ''' 220 | altitude = np.linalg.norm(translation, axis=-1) - earth_radius 221 | latitude = np.rad2deg(90 - np.arccos(translation[:,2] / np.linalg.norm(translation, axis=-1, keepdims=True))) 222 | longitude = np.rad2deg(np.arctan2(translation[:,1],translation[:,0])) 223 | 224 | return np.concatenate([latitude[:,None], longitude[:,None], altitude[:,None]], axis=1) 225 | 226 | def translation_2_lat_lng_alt_ellipsoidal(translation, a, f): 227 | ''' Computes the translation in the ECEF to latitude, longitude, altitude based on the ellipsoidal earth model 228 | Args: 229 | translation (np.array): translation in the ECEF coordinate frame (in meters) [n,3] 230 | a (float/double): Semi-major axis of the ellipsoid 231 | f (float/double): flattening factor of the earth 232 | radius 233 | Out: 234 | (np.array): latitude, longitude and altitude [n,3] 235 | ''' 236 | 237 | # Compute support parameters 238 | f0 = (1 - f) * (1 - f) 239 | f1 = 1 - f0 240 | f2 = 1 / f0 - 1 241 | 242 | z_div_1_f = translation[:,2] / (1 - f) 243 | x2y2 = np.square(translation[:,0]) + np.square(translation[:,1]) 244 | 245 | x2y2z2 = x2y2 + z_div_1_f*z_div_1_f 246 | x2y2z2_pow_3_2 = x2y2z2 * np.sqrt(x2y2z2) 247 | 248 | gamma = (x2y2z2_pow_3_2 + a * f2 * z_div_1_f * z_div_1_f) / (x2y2z2_pow_3_2 - a * f1 * x2y2) * translation[:,2] / np.sqrt(x2y2) 249 | 250 | longitude = np.rad2deg(np.arctan2(translation[:,1], translation[:,0])) 251 | latitude = np.rad2deg(np.arctan(gamma)) 252 | altitude = np.sqrt(1 + np.square(gamma)) * (np.sqrt(x2y2) - a / np.sqrt(1 + f0 * np.square(gamma))) 253 | 254 | return np.concatenate([latitude[:,None], longitude[:,None], altitude[:,None]], 
axis=1) 255 | 256 | def lat_lng_alt_2_ecef(lat_lng_alt, orientation_axis, orientation_angle, earth_model='WGS84'): 257 | ''' Computes the transformation from the world pose coordiante system to the earth centered earth fixed (ECEF) one 258 | Args: 259 | lat_lng_alt (np.array): latitude, longitude and altitude coordinate (in degrees and meters) [n,3] 260 | orientation_axis (np.array): orientation in the local ENU coordinate system [n,3] 261 | orientation_angle (np.array): orientation angle of the local ENU coordinate system in degrees [n,1] 262 | earth_model (string): earth model used for conversion (spheric will be unaccurate when maps are large) 263 | Out: 264 | trans (np.array): transformation parameters from world pose to ECEF coordinate system in se3 form (n, 4, 4) 265 | ''' 266 | n = lat_lng_alt.shape[0] 267 | trans = np.tile(np.eye(4).reshape(1,4,4),[n,1,1]) 268 | 269 | theta = np.deg2rad(90. - lat_lng_alt[:, 0]) 270 | phi = np.deg2rad(lat_lng_alt[:, 1]) 271 | 272 | R_enu_ecef = local_ENU_2_ECEF_orientation(theta, phi) 273 | 274 | 275 | if earth_model == 'WGS84': 276 | a = 6378137.0 277 | flattening = 1.0 / 298.257223563 278 | b = a * (1.0 - flattening) 279 | translation = lat_lng_alt_2_ECEF_elipsoidal(lat_lng_alt, a, b) 280 | 281 | elif earth_model == 'sphere': 282 | earth_radius = 6378137.0 # Earth radius in meters 283 | z_dir = np.concatenate([(np.sin(theta)*np.cos(phi))[:,None], 284 | (np.sin(theta)*np.sin(phi))[:,None], 285 | (np.cos(theta))[:,None] ],axis=1) 286 | 287 | translation = (earth_radius + lat_lng_alt[:, -1])[:,None] * z_dir 288 | 289 | else: 290 | raise ValueError ("Selected ellipsoid not implemented!") 291 | 292 | world_pose_orientation = axis_angle_2_so3(orientation_axis, orientation_angle) 293 | 294 | trans[:,:3,:3] = R_enu_ecef @ world_pose_orientation 295 | trans[:,:3,3] = translation 296 | 297 | return trans 298 | 299 | 300 | def ecef_2_lat_lng_alt(trans, earth_model='WGS84'): 301 | ''' Converts the transformation from the earth centered earth fixed (ECEF) coordinate frame to the world pose 302 | Args: 303 | trans (np.array): transformation parameters in ECEF [n,4,4] 304 | earth_model (string): earth model used for conversion (spheric will be unaccurate when maps are large) 305 | Out: 306 | lat_lng_alt (np.array): latitude, longitude and altitude coordinate (in degrees and meters) [n,3] 307 | orientation_axis (np.array): orientation in the local ENU coordinate system [n,3] 308 | orientation_angle (np.array): orientation angle of the local ENU coordinate system in degrees [n,1] 309 | ''' 310 | 311 | translation = trans[:,:3,3] 312 | rotation = trans[:,:3,:3] 313 | 314 | if earth_model == 'WGS84': 315 | a = 6378137.0 316 | flattening = 1.0 / 298.257223563 317 | lat_lng_alt = translation_2_lat_lng_alt_ellipsoidal(translation, a, flattening) 318 | 319 | elif earth_model == 'sphere': 320 | earth_radius = 6378137.0 # Earth radius in meters 321 | lat_lng_alt = translation_2_lat_lng_alt_spherical(translation, earth_radius) 322 | 323 | else: 324 | raise ValueError ("Selected ellipsoid not implemented!") 325 | 326 | 327 | # Compute the orientation axis and angle 328 | theta = np.deg2rad((90. 
- lat_lng_alt[:, 0])) 329 | phi = np.deg2rad(lat_lng_alt[:, 1]) 330 | 331 | R_ecef_enu = local_ENU_2_ECEF_orientation(theta, phi).transpose(0,2,1) 332 | 333 | orientation = R_ecef_enu @ rotation 334 | orientation_axis, orientation_angle = so3_2_axis_angle(orientation) 335 | 336 | 337 | return lat_lng_alt, orientation_axis, orientation_angle 338 | 339 | def ecef_2_ENU(loc_ref_point: np.ndarray, earth_model: str ='WGS84'): 340 | ''' 341 | Compute the transformation matrix that transforms points from the ECEF to a local ENU coordinate frame 342 | Args: 343 | loc_ref_point: GPS coordinates of the local reference point of the map [1,3] 344 | earth_model: earth model used for conversion (spheric will be unaccurate when maps are large) 345 | Out: 346 | T_ecef_enu: transformation matrix from ECEF to ENU [4,4] 347 | ''' 348 | 349 | # initialize the transformation to identity 350 | T_ecef_enu = np.eye(4) 351 | 352 | if earth_model == 'WGS84': 353 | a = 6378137.0 354 | flattening = 1.0 / 298.257223563 355 | b = a * (1.0 - flattening) 356 | translation = lat_lng_alt_2_ECEF_elipsoidal(loc_ref_point, a, b).reshape(3,1) 357 | 358 | elif earth_model == 'sphere': 359 | earth_radius = 6378137.0 # Earth radius in meters 360 | z_dir = np.concatenate([(np.sin(loc_ref_point[1])*np.cos(loc_ref_point[0]))[:,None], 361 | (np.sin(loc_ref_point[1])*np.sin(loc_ref_point[0]))[:,None], 362 | (np.cos(loc_ref_point[0]))[:,None] ],axis=1) 363 | 364 | translation = ((earth_radius + loc_ref_point[:, -1])[:,None] * z_dir).reshape(3,1) 365 | 366 | else: 367 | raise ValueError ("Selected ellipsoid not implemented!") 368 | 369 | rad_lat = np.deg2rad(loc_ref_point[:, 0]) 370 | rad_lon = np.deg2rad(loc_ref_point[:, 1]) 371 | T_ecef_enu[:3,:3] = np.array([[-np.sin(rad_lon), np.cos(rad_lon), 0], 372 | [ -np.sin(rad_lat)*np.cos(rad_lon), -np.sin(rad_lat)*np.sin(rad_lon), np.cos(rad_lat)], 373 | [ np.cos(rad_lat)*np.cos(rad_lon), np.cos(rad_lat)*np.sin(rad_lon), np.sin(rad_lat)]]) 374 | 375 | T_ecef_enu[:3,3:4] = -T_ecef_enu[:3,:3] @ translation 376 | 377 | return T_ecef_enu 378 | 379 | -------------------------------------------------------------------------------- /tools/transforms.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import torch 4 | import torchvision 5 | import numpy as np 6 | 7 | from PIL import Image 8 | from tools.common import encode, decode 9 | from tools.augmentations import StrongAug, GeometricAug 10 | 11 | 12 | class Sample(dict): 13 | def __init__( 14 | self, 15 | #token, 16 | #scene, 17 | intrinsics, 18 | extrinsics, 19 | image, 20 | view, 21 | bev, 22 | **kwargs 23 | ): 24 | super().__init__(**kwargs) 25 | 26 | # Used to create path in save/load 27 | #self.token = token 28 | #self.scene = scene 29 | 30 | self.view = view 31 | self.bev = bev 32 | 33 | self.image = image 34 | self.intrinsics = intrinsics 35 | self.extrinsics = extrinsics 36 | 37 | def __getattr__(self, key): 38 | return super().__getitem__(key) 39 | 40 | def __setattr__(self, key, val): 41 | self[key] = val 42 | 43 | return super().__setattr__(key, val) 44 | 45 | 46 | class SaveDataTransform: 47 | """ 48 | All data to be saved to .json must be passed in as native Python lists 49 | """ 50 | def __init__(self, labels_dir): 51 | self.labels_dir = pathlib.Path(labels_dir) 52 | 53 | def get_cameras(self, batch: Sample): 54 | return { 55 | 'images': batch.images, 56 | 'intrinsics': batch.intrinsics, 57 | 'extrinsics': batch.extrinsics 58 | } 59 | 60 | def get_bev(self, batch: Sample): 
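        """Write the BEV label (plus optional 'aux' and 'visibility' arrays) for this sample
        into the scene directory and return their relative file names together with the
        sample's `view`."""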
61 | result = { 62 | 'view': batch.view, 63 | } 64 | 65 | scene_dir = self.labels_dir / batch.scene 66 | 67 | bev_path = f'bev_{batch.token}.png' 68 | Image.fromarray(encode(batch.bev)).save(scene_dir / bev_path) 69 | 70 | result['bev'] = bev_path 71 | 72 | # Auxilliary labels 73 | if batch.get('aux') is not None: 74 | aux_path = f'aux_{batch.token}.npz' 75 | np.savez_compressed(scene_dir / aux_path, aux=batch.aux) 76 | 77 | result['aux'] = aux_path 78 | 79 | # Visibility mask 80 | if batch.get('visibility') is not None: 81 | visibility_path = f'visibility_{batch.token}.png' 82 | Image.fromarray(batch.visibility).save(scene_dir / visibility_path) 83 | 84 | result['visibility'] = visibility_path 85 | 86 | return result 87 | 88 | def __call__(self, batch): 89 | """ 90 | Save sensor/label data and return any additional info to be saved to json 91 | """ 92 | result = {} 93 | result.update(self.get_cameras(batch)) 94 | result.update(self.get_bev(batch)) 95 | result.update({k: v for k, v in batch.items() if k not in result}) 96 | 97 | return result 98 | 99 | 100 | class LoadDataTransform(torchvision.transforms.ToTensor): 101 | def __init__(self, dataset_dir, labels_dir, image_config, num_classes, augment='none'): 102 | super().__init__() 103 | 104 | self.dataset_dir = pathlib.Path(dataset_dir) 105 | self.labels_dir = pathlib.Path(labels_dir) 106 | self.image_config = image_config 107 | self.num_classes = num_classes 108 | 109 | xform = { 110 | 'none': [], 111 | 'strong': [StrongAug()], 112 | 'geometric': [StrongAug(), GeometricAug()], 113 | }[augment] + [torchvision.transforms.ToTensor()] 114 | 115 | self.img_transform = torchvision.transforms.Compose(xform) 116 | self.to_tensor = super().__call__ 117 | 118 | def get_cameras(self, sample: Sample, h, w, top_crop): 119 | """ 120 | Note: we invert I and E here for convenience. 
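        The intrinsics are rescaled to the target resolution and the principal point is
        shifted to account for the top crop before being returned as tensors.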
121 | """ 122 | images = list() 123 | intrinsics = list() 124 | 125 | for image_path, I_original in zip(sample.images, sample.intrinsics): 126 | h_resize = h + top_crop 127 | w_resize = w 128 | 129 | image = Image.open(self.dataset_dir / image_path) 130 | 131 | image_new = image.resize((w_resize, h_resize), resample=Image.BILINEAR) 132 | image_new = image_new.crop((0, top_crop, image_new.width, image_new.height)) 133 | 134 | I = np.float32(I_original) 135 | I[0, 0] *= w_resize / image.width 136 | I[0, 2] *= w_resize / image.width 137 | I[1, 1] *= h_resize / image.height 138 | I[1, 2] *= h_resize / image.height 139 | I[1, 2] -= top_crop 140 | 141 | images.append(self.img_transform(image_new)) 142 | intrinsics.append(torch.tensor(I)) 143 | 144 | return { 145 | 'cam_idx': torch.LongTensor(sample.cam_ids), 146 | 'image': torch.stack(images, 0), 147 | 'intrinsics': torch.stack(intrinsics, 0), 148 | 'extrinsics': torch.tensor(np.float32(sample.extrinsics)), 149 | } 150 | 151 | def get_bev(self, sample: Sample): 152 | scene_dir = self.labels_dir / sample.scene 153 | bev = None 154 | 155 | if sample.bev is not None: 156 | bev = Image.open(scene_dir / sample.bev) 157 | bev = decode(bev, self.num_classes) 158 | bev = (255 * bev).astype(np.uint8) 159 | bev = self.to_tensor(bev) 160 | 161 | result = { 162 | 'bev': bev, 163 | 'view': torch.tensor(sample.view), 164 | } 165 | 166 | if 'visibility' in sample: 167 | visibility = Image.open(scene_dir / sample.visibility) 168 | result['visibility'] = np.array(visibility, dtype=np.uint8) 169 | 170 | if 'aux' in sample: 171 | aux = np.load(scene_dir / sample.aux)['aux'] 172 | result['center'] = self.to_tensor(aux[..., 1]) 173 | 174 | if 'pose' in sample: 175 | result['pose'] = np.float32(sample['pose']) 176 | 177 | return result 178 | 179 | def __call__(self, batch): 180 | if not isinstance(batch, Sample): 181 | batch = Sample(**batch) 182 | 183 | result = dict() 184 | result.update(self.get_cameras(batch, **self.image_config)) 185 | result.update(self.get_bev(batch)) 186 | 187 | return result 188 | --------------------------------------------------------------------------------
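A minimal sketch of how `LoadDataTransform` might be constructed for the datasets above (the paths, image size, and class count are placeholder assumptions, not values prescribed by this repository):

```python
from tools.transforms import LoadDataTransform

# Placeholder configuration -- adjust to the downloaded dataset layout.
transform = LoadDataTransform(
    dataset_dir="data/carla",                            # root containing the rendered images
    labels_dir="data/carla/labels",                      # root containing bev_*.png / aux_*.npz per scene
    image_config={"h": 224, "w": 480, "top_crop": 46},   # forwarded to get_cameras()
    num_classes=12,                                      # channels encoded in the saved BEV masks
    augment="none",                                      # or "strong" / "geometric"
)

# Each dataset record is expected to carry the keys used above
# (images, intrinsics, extrinsics, cam_ids, bev, view, scene, ...):
# sample = transform(record)
```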