├── assets └── BEVSAN.png ├── layers ├── backbones │ ├── __init__.py │ ├── depth_net.py │ ├── lss_fpn.py │ └── multi_head_fpn.py ├── __init__.py ├── heads │ ├── __init__.py │ ├── mh_depth_head.py │ └── bev_depth_head.py └── voxel_mix.py ├── ops └── voxel_pooling │ ├── __init__.py │ ├── src │ ├── voxel_pooling_forward.cpp │ └── voxel_pooling_forward_cuda.cu │ └── voxel_pooling.py ├── requirements.txt ├── test ├── data │ └── nuscenes │ │ └── samples │ │ ├── CAM_BACK │ │ └── n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg │ │ ├── CAM_FRONT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg │ │ ├── CAM_BACK_LEFT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg │ │ ├── CAM_BACK_RIGHT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg │ │ ├── CAM_FRONT_LEFT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg │ │ └── CAM_FRONT_RIGHT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg ├── test_ops │ └── test_voxel_pooling.py ├── test_layers │ ├── test_backbone.py │ └── test_head.py └── test_dataset │ └── test_nusc_mv_det_dataset.py ├── requirements-dev.txt ├── .pre-commit-config.yaml ├── LICENSE.md ├── utils └── torch_dist.py ├── README.md ├── setup.py ├── exps ├── bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py ├── bev_depth_lss_r50_256x704_128x128_24e_2key.py └── bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py ├── .gitignore ├── models ├── bev_depth.py └── uda_depth.py ├── callbacks └── ema.py ├── scripts ├── gen_depth_gt.py └── gen_info.py ├── evaluators └── det_mv_evaluators.py └── dataset └── nusc_mv_det_dataset.py /assets/BEVSAN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/assets/BEVSAN.png -------------------------------------------------------------------------------- /layers/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .lss_fpn import LSSFPN 2 | 3 | __all__ = ['LSSFPN'] 4 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .heads.bev_depth_head import BEVDepthHead 2 | 3 | __all__ = ['BEVDepthHead'] 4 | -------------------------------------------------------------------------------- /layers/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .bev_depth_head import BEVDepthHead 2 | 3 | __all__ = ['BEVDepthHead'] 4 | -------------------------------------------------------------------------------- /ops/voxel_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxel_pooling import voxel_pooling 2 | 3 | __all__ = ['voxel_pooling'] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | numpy 3 | nuscenes-devkit 4 | opencv-python-headless 5 | pandas 6 | pytorch-lightning==1.6.0 7 | scikit-image 8 | scipy 9 | setuptools==59.5.0 10 | tensorboardX 11 | torch==1.9.0 12 | torchvision==0.10.0 13 | -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # code formatter 2 | # force to use same version of the formatter, can be changed only by maintainer. 
3 | 4 | anybadge 5 | autoflake==1.4 6 | black==20.8b1 7 | flake8 8 | gitlint 9 | isort==4.3.21 10 | nbsphinx 11 | pre-commit 12 | pre-commit==2.7.1 13 | pylint==2.3.1 14 | pytest 15 | pytest-cov 16 | radon==4.2.0 17 | recommonmark 18 | seed-isort-config 19 | setuptools 20 | 21 | # ----- document usage 22 | sphinx==3.5.4 23 | sphinx-material 24 | sphinx_markdown_tables 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/flake8 3 | rev: 3.8.3 4 | hooks: 5 | - id: flake8 6 | - repo: https://github.com/PyCQA/isort 7 | rev: 5.10.1 8 | hooks: 9 | - id: isort 10 | - repo: https://github.com/pre-commit/mirrors-yapf 11 | rev: v0.30.0 12 | hooks: 13 | - id: yapf 14 | - repo: https://github.com/pre-commit/pre-commit-hooks 15 | rev: v3.1.0 16 | hooks: 17 | - id: trailing-whitespace 18 | - id: check-yaml 19 | - id: end-of-file-fixer 20 | - id: requirements-txt-fixer 21 | - id: double-quote-string-fixer 22 | - id: check-merge-conflict 23 | - id: fix-encoding-pragma 24 | args: ["--remove"] 25 | - id: mixed-line-ending 26 | args: ["--fix=lf"] 27 | - repo: https://github.com/codespell-project/codespell 28 | rev: v2.1.0 29 | hooks: 30 | - id: codespell 31 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Megvii-BaseDetection 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /utils/torch_dist.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: zeming li 3 | @contact: zengarden2009@gmail.com 4 | """ 5 | from torch import distributed as dist 6 | 7 | 8 | def get_rank() -> int: 9 | if not dist.is_available(): 10 | return 0 11 | if not dist.is_initialized(): 12 | return 0 13 | return dist.get_rank() 14 | 15 | 16 | def get_world_size() -> int: 17 | if not dist.is_available(): 18 | return 1 19 | if not dist.is_initialized(): 20 | return 1 21 | return dist.get_world_size() 22 | 23 | 24 | def synchronize(): 25 | """Helper function to synchronize (barrier) 26 | among all processes when using distributed training""" 27 | if not dist.is_available(): 28 | return 29 | if not dist.is_initialized(): 30 | return 31 | current_world_size = dist.get_world_size() 32 | if current_world_size == 1: 33 | return 34 | dist.barrier() 35 | 36 | 37 | def all_gather_object(obj): 38 | world_size = get_world_size() 39 | if world_size < 2: 40 | return [obj] 41 | output = [None for _ in range(world_size)] 42 | dist.all_gather_object(output, obj) 43 | return output 44 | 45 | 46 | def is_available() -> bool: 47 | return dist.is_available() 48 | -------------------------------------------------------------------------------- /test/test_ops/test_voxel_pooling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import torch 5 | 6 | from ops.voxel_pooling import voxel_pooling 7 | 8 | 9 | class TestLSSFPN(unittest.TestCase): 10 | @pytest.mark.skipif(condition=torch.cuda.is_available() is False, 11 | reason='No gpu available.') 12 | def test_voxel_pooling(self): 13 | import numpy as np 14 | 15 | np.random.seed(0) 16 | torch.manual_seed(0) 17 | geom_xyz = torch.rand([2, 6, 10, 10, 10, 3]) * 160 - 80 18 | geom_xyz[..., 2] /= 100 19 | geom_xyz = geom_xyz.reshape(2, -1, 3) 20 | features = torch.rand([2, 6, 10, 10, 10, 80]) - 0.5 21 | gt_features = features.reshape(2, -1, 80) 22 | gt_bev_featuremap = features.new_zeros(2, 128, 128, 80) 23 | for i in range(2): 24 | for j in range(geom_xyz.shape[1]): 25 | x = geom_xyz[i, j, 0].int() 26 | y = geom_xyz[i, j, 1].int() 27 | z = geom_xyz[i, j, 2].int() 28 | if x < 0 or x >= 128 or y < 0 or y >= 128 or z < 0 or z >= 1: 29 | continue 30 | gt_bev_featuremap[i, y, x, :] += gt_features[i, j, :] 31 | gt_bev_featuremap = gt_bev_featuremap.permute(0, 3, 1, 2).cuda() 32 | bev_featuremap = voxel_pooling( 33 | geom_xyz.cuda().int(), features.cuda(), 34 | torch.tensor([128, 128, 1], dtype=torch.int, device='cuda')) 35 | assert torch.allclose(gt_bev_featuremap.cuda(), 36 | bev_featuremap, 37 | rtol=1e-3) 38 | -------------------------------------------------------------------------------- /test/test_layers/test_backbone.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import torch 5 | 6 | from layers.backbones.lss_fpn import LSSFPN 7 | 8 | 9 | class TestLSSFPN(unittest.TestCase): 10 | def setUp(self) -> None: 11 | backbone_conf = { 12 | 'x_bound': [-10, 10, 0.5], 13 | 'y_bound': [-10, 10, 0.5], 14 | 'z_bound': [-5, 3, 8], 15 | 'd_bound': [2.0, 22, 1.0], 16 | 'final_dim': [64, 64], 17 | 'output_channels': 18 | 10, 19 | 'downsample_factor': 20 | 16, 21 | 'img_backbone_conf': 22 | dict(type='ResNet', 23 | depth=18, 24 | frozen_stages=0, 25 | out_indices=[0, 1, 2, 3], 26 | 
norm_eval=False, 27 | base_channels=8), 28 | 'img_neck_conf': 29 | dict( 30 | type='SECONDFPN', 31 | in_channels=[8, 16, 32, 64], 32 | upsample_strides=[0.25, 0.5, 1, 2], 33 | out_channels=[16, 16, 16, 16], 34 | ), 35 | 'depth_net_conf': 36 | dict(in_channels=64, mid_channels=64), 37 | } 38 | self.lss_fpn = LSSFPN(**backbone_conf).cuda() 39 | 40 | @pytest.mark.skipif(torch.cuda.is_available() is False, 41 | reason='No gpu available.') 42 | def test_forward(self): 43 | sweep_imgs = torch.rand(2, 2, 6, 3, 64, 64).cuda() 44 | sensor2ego_mats = torch.rand(2, 2, 6, 4, 4).cuda() 45 | intrin_mats = torch.rand(2, 2, 6, 4, 4).cuda() 46 | ida_mats = torch.rand(2, 2, 6, 4, 4).cuda() 47 | sensor2sensor_mats = torch.rand(2, 2, 6, 4, 4).cuda() 48 | bda_mat = torch.rand(2, 4, 4).cuda() 49 | mats_dict = dict() 50 | mats_dict['sensor2ego_mats'] = sensor2ego_mats 51 | mats_dict['intrin_mats'] = intrin_mats 52 | mats_dict['ida_mats'] = ida_mats 53 | mats_dict['sensor2sensor_mats'] = sensor2sensor_mats 54 | mats_dict['bda_mat'] = bda_mat 55 | preds = self.lss_fpn.forward(sweep_imgs, mats_dict) 56 | assert preds.shape == torch.Size([2, 20, 40, 40]) 57 | -------------------------------------------------------------------------------- /ops/voxel_pooling/src/voxel_pooling_forward.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Megvii Inc. All rights reserved. 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | extern THCState *state; 11 | 12 | #define CHECK_CUDA(x) \ 13 | TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 14 | #define CHECK_CONTIGUOUS(x) \ 15 | TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") 16 | #define CHECK_INPUT(x) \ 17 | CHECK_CUDA(x); \ 18 | CHECK_CONTIGUOUS(x) 19 | 20 | int voxel_pooling_forward_wrapper(int batch_size, int num_points, int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z, at::Tensor geom_xyz_tensor, 21 | at::Tensor input_features_tensor, at::Tensor output_features_tensor, at::Tensor pos_memo_tensor); 22 | 23 | void voxel_pooling_forward_kernel_launcher(int batch_size, int num_points, int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z, const int *geom_xyz, const float *input_features, 24 | float *output_features, int *pos_memo, cudaStream_t stream); 25 | 26 | int voxel_pooling_forward_wrapper(int batch_size, int num_points, int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z, at::Tensor geom_xyz_tensor, 27 | at::Tensor input_features_tensor, at::Tensor output_features_tensor, at::Tensor pos_memo_tensor) { 28 | CHECK_INPUT(geom_xyz_tensor); 29 | CHECK_INPUT(input_features_tensor); 30 | const int *geom_xyz = geom_xyz_tensor.data_ptr(); 31 | const float *input_features = input_features_tensor.data_ptr(); 32 | float *output_features = output_features_tensor.data_ptr(); 33 | int *pos_memo = pos_memo_tensor.data_ptr(); 34 | 35 | cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); 36 | voxel_pooling_forward_kernel_launcher(batch_size, num_points, num_channels, num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, input_features, 37 | output_features, pos_memo, stream); 38 | return 1; 39 | } 40 | 41 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 42 | m.def("voxel_pooling_forward_wrapper", &voxel_pooling_forward_wrapper, "voxel_pooling_forward_wrapper"); 43 | } 44 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
 1 | # BEV-SAN: Accurate BEV 3D Object Detection via Slice Attention Networks
 2 | The official release of BEV-SAN is now available.
 3 | 
 4 | ![Python 3.7](https://img.shields.io/badge/Python-3.7-red)
 5 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2212.01231)
 6 | 
 7 | 
 8 | 
 9 | ## Updates!!
10 | * 【2023/10/24】 We have published the code base of our work. Updates are coming soon.
11 | * 【2023/02/24】 Our work has been accepted by CVPR 2023.
12 | ## Quick Start
13 | ### Installation
14 | **Step 0.** Install [PyTorch](https://pytorch.org/) (v1.9.0).
15 | 
16 | **Step 1.** Install [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) (v1.0.0rc4).
17 | 
18 | **Step 2.** Install requirements.
19 | ```shell
20 | pip install -r requirements.txt
21 | ```
22 | **Step 3.** Install BEVDepth (GPU required).
23 | ```shell
24 | python setup.py develop
25 | ```
26 | 
27 | ### Data Preparation
28 | **Step 0.** Download the official nuScenes dataset.
29 | 
30 | **Step 1.** Symlink the dataset root to `./data/`.
31 | ```
32 | ln -s [nuscenes root] ./data/
33 | ```
34 | The directory will then look as follows.
35 | ```
36 | BEVDepth
37 | ├── data
38 | │   ├── nuScenes
39 | │   │   ├── maps
40 | │   │   ├── samples
41 | │   │   ├── sweeps
42 | │   │   ├── v1.0-test
43 | │   │   ├── v1.0-trainval
44 | ```
45 | **Step 2.** Prepare infos.
46 | ```
47 | python scripts/gen_info.py
48 | ```
49 | **Step 3.** Prepare depth ground truth.
50 | ```
51 | python scripts/gen_depth_gt.py
52 | ```
53 | 
54 | ### Tutorials
55 | **Train.**
56 | ```
57 | python [EXP_PATH] --amp_backend native -b 8 --gpus 8
58 | ```
59 | **Eval.**
60 | ```
61 | python [EXP_PATH] --ckpt_path [CKPT_PATH] -e -b 8 --gpus 8
62 | ```
63 | 
64 | 
65 | 
66 | ## Cite BEV-SAN
67 | If you use BEV-SAN in your research, please cite our work with the following BibTeX entry:
68 | 
69 | ```latex
70 | @misc{chi2022bevsan,
71 |       title={BEV-SAN: Accurate BEV 3D Object Detection via Slice Attention Networks},
72 |       author={Xiaowei Chi and Jiaming Liu and Ming Lu and Rongyu Zhang and Zhaoqing Wang and Yandong Guo and Shanghang Zhang},
73 |       year={2022},
74 |       eprint={2212.01231},
75 |       archivePrefix={arXiv},
76 |       primaryClass={cs.CV}
77 | }
78 | ```
79 | 
80 | ## Thanks
81 | Our code is based on [BEVDepth](https://github.com/Megvii-BaseDetection/BEVDepth).
82 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | from setuptools import find_packages, setup
 5 | from torch.utils.cpp_extension import (BuildExtension, CppExtension,
 6 |                                        CUDAExtension)
 7 | 
 8 | with open('README.md', 'r') as fh:
 9 |     long_description = fh.read()
10 | 
11 | 
12 | def make_cuda_ext(name,
13 |                   module,
14 |                   sources,
15 |                   sources_cuda=[],
16 |                   extra_args=[],
17 |                   extra_include_path=[]):
18 | 
19 |     define_macros = []
20 |     extra_compile_args = {'cxx': [] + extra_args}
21 | 
22 |     if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
23 |         define_macros += [('WITH_CUDA', None)]
24 |         extension = CUDAExtension
25 |         extra_compile_args['nvcc'] = extra_args + [
26 |             '-D__CUDA_NO_HALF_OPERATORS__',
27 |             '-D__CUDA_NO_HALF_CONVERSIONS__',
28 |             '-D__CUDA_NO_HALF2_OPERATORS__',
29 |         ]
30 |         sources += sources_cuda
31 |     else:
32 |         print('Compiling {} without CUDA'.format(name))
33 |         extension = CppExtension
34 |         # raise EnvironmentError('CUDA is required to compile MMDetection!')
35 | 
36 |     return
extension( 37 | name='{}.{}'.format(module, name), 38 | sources=[os.path.join(*module.split('.'), p) for p in sources], 39 | include_dirs=extra_include_path, 40 | define_macros=define_macros, 41 | extra_compile_args=extra_compile_args, 42 | ) 43 | 44 | 45 | setup( 46 | name='BEVDepth', 47 | version='0.0.1', 48 | author='Megvii', 49 | author_email='liyinhao@megvii.com', 50 | description='Code for BEVDepth', 51 | long_description=long_description, 52 | long_description_content_type='text/markdown', 53 | url=None, 54 | packages=find_packages(), 55 | classifiers=[ 56 | 'Programming Language :: Python :: 3', 57 | 'Operating System :: OS Independent', 58 | ], 59 | install_requires=[], 60 | ext_modules=[ 61 | make_cuda_ext( 62 | name='voxel_pooling_ext', 63 | module='ops.voxel_pooling', 64 | sources=['src/voxel_pooling_forward.cpp'], 65 | sources_cuda=['src/voxel_pooling_forward_cuda.cu'], 66 | ), 67 | ], 68 | cmdclass={'build_ext': BuildExtension}, 69 | ) 70 | -------------------------------------------------------------------------------- /test/test_dataset/test_nusc_mv_det_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from dataset.nusc_mv_det_dataset import NuscMVDetDataset 7 | 8 | CLASSES = [ 9 | 'car', 10 | 'truck', 11 | 'construction_vehicle', 12 | 'bus', 13 | 'trailer', 14 | 'barrier', 15 | 'motorcycle', 16 | 'bicycle', 17 | 'pedestrian', 18 | 'traffic_cone', 19 | ] 20 | H = 900 21 | W = 1600 22 | final_dim = (256, 704) 23 | img_conf = dict(img_mean=[123.675, 116.28, 103.53], 24 | img_std=[58.395, 57.12, 57.375], 25 | to_rgb=True) 26 | ida_aug_conf = { 27 | 'resize_lim': (0.4, 0.4), 28 | 'final_dim': 29 | final_dim, 30 | 'rot_lim': (0, 0), 31 | 'H': 32 | H, 33 | 'W': 34 | W, 35 | 'rand_flip': 36 | True, 37 | 'bot_pct_lim': (0.0, 0.0), 38 | 'cams': [ 39 | 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 40 | 'CAM_BACK', 'CAM_BACK_RIGHT' 41 | ], 42 | 'Ncams': 43 | 6, 44 | } 45 | 46 | bda_aug_conf = { 47 | 'rot_lim': (0, 0), 48 | 'scale_lim': (1, 1), 49 | 'flip_dx_ratio': 0, 50 | 'flip_dy_ratio': 0 51 | } 52 | 53 | 54 | class TestNuscMVDetData(unittest.TestCase): 55 | def test_voxel_pooling(self): 56 | np.random.seed(0) 57 | torch.random.manual_seed(0) 58 | nusc = NuscMVDetDataset(ida_aug_conf, 59 | bda_aug_conf, 60 | CLASSES, 61 | './test/data/nuscenes', 62 | './test/data/nuscenes/infos.pkl', 63 | True, 64 | sweep_idxes=[4]) 65 | ret_list = nusc[0] 66 | assert torch.isclose(ret_list[0].mean(), 67 | torch.tensor(-0.4667), 68 | rtol=1e-3) 69 | assert torch.isclose(ret_list[1].mean(), 70 | torch.tensor(0.1678), 71 | rtol=1e-3) 72 | assert torch.isclose(ret_list[2].mean(), 73 | torch.tensor(230.0464), 74 | rtol=1e-3) 75 | assert torch.isclose(ret_list[3].mean(), 76 | torch.tensor(8.3250), 77 | rtol=1e-3) 78 | assert torch.isclose(ret_list[4].mean(), torch.tensor(0.25), rtol=1e-3) 79 | assert torch.isclose(ret_list[5].mean(), torch.tensor(0.25), rtol=1e-3) 80 | -------------------------------------------------------------------------------- /ops/voxel_pooling/src/voxel_pooling_forward_cuda.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Megvii Inc. All rights reserved. 
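// Sum-pools point features into a BEV grid: every point whose integer (x, y, z) coordinate
// falls inside the voxel grid adds its feature vector to output cell (batch, y, x) via
// atomicAdd, and records that cell in pos_memo so voxel_pooling.py can route gradients back.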
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | #define THREADS_PER_BLOCK 256
 7 | #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 8 | 
 9 | __global__ void voxel_pooling_forward_kernel(int batch_size, int num_points, int num_channels, int num_voxel_x,
10 |                                              int num_voxel_y, int num_voxel_z, const int *geom_xyz,
11 |                                              const float *input_features, float *output_features, int *pos_memo) {
12 |   // Each thread processes one point and accumulates all of its channels into the BEV grid.
13 |   int blk_idx = blockIdx.x;
14 |   int thd_idx = threadIdx.x;
15 |   int pt_idx = blk_idx * blockDim.x + thd_idx;
16 |   if (pt_idx >= batch_size * num_points) {
17 |     return;
18 |   } else {
19 |     int batch_idx = pt_idx / num_points;
20 |     int x = geom_xyz[pt_idx * 3];
21 |     int y = geom_xyz[pt_idx * 3 + 1];
22 |     int z = geom_xyz[pt_idx * 3 + 2];
23 |     // If the coord of the current voxel is out of boundary, return.
24 |     if (x < 0 || x >= num_voxel_x || y < 0 || y >= num_voxel_y || z < 0 || z >= num_voxel_z) {
25 |       return;
26 |     }
27 |     pos_memo[pt_idx * 3] = batch_idx;
28 |     pos_memo[pt_idx * 3 + 1] = y;
29 |     pos_memo[pt_idx * 3 + 2] = x;
30 |     for (int channel_idx = 0; channel_idx < num_channels; channel_idx++) {
31 |       atomicAdd(
32 |           &output_features[(batch_idx * num_voxel_y * num_voxel_x + y * num_voxel_x + x) * num_channels + channel_idx],
33 |           input_features[pt_idx * num_channels + channel_idx]);
34 |     }
35 |   }
36 | }
37 | 
38 | void voxel_pooling_forward_kernel_launcher(int batch_size, int num_points, int num_channels, int num_voxel_x,
39 |                                            int num_voxel_y, int num_voxel_z, const int *geom_xyz,
40 |                                            const float *input_features, float *output_features, int *pos_memo,
41 |                                            cudaStream_t stream) {
42 |   cudaError_t err;
43 | 
44 |   dim3 blocks(DIVUP(batch_size * num_points, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
45 |   dim3 threads(THREADS_PER_BLOCK);
46 | 
47 |   voxel_pooling_forward_kernel<<<blocks, threads, 0, stream>>>(batch_size, num_points, num_channels, num_voxel_x,
48 |                                                                num_voxel_y, num_voxel_z, geom_xyz, input_features,
49 |                                                                output_features, pos_memo);
50 |   // cudaDeviceSynchronize();  // for using printf in kernel function
51 |   err = cudaGetLastError();
52 |   if (cudaSuccess != err) {
53 |     fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
54 |     exit(-1);
55 |   }
56 | }
57 | 
--------------------------------------------------------------------------------
/ops/voxel_pooling/voxel_pooling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Megvii Inc. All rights reserved.
 2 | import torch
 3 | from torch.autograd import Function
 4 | 
 5 | from . import voxel_pooling_ext
 6 | 
 7 | 
 8 | class VoxelPooling(Function):
 9 |     @staticmethod
10 |     def forward(ctx, geom_xyz: torch.Tensor, input_features: torch.Tensor,
11 |                 voxel_num: torch.Tensor) -> torch.Tensor:
12 |         """Forward function for `voxel_pooling`.
13 | 
14 |         Args:
15 |             geom_xyz (Tensor): xyz coord for each voxel with the shape
16 |                 of [B, N, 3].
17 |             input_features (Tensor): feature for each voxel with the
18 |                 shape of [B, N, C].
19 |             voxel_num (Tensor): Number of voxels for each dim with the
20 |                 shape of [3].
21 | 
22 |         Returns:
23 |             Tensor: (B, C, H, W) bev feature map.
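
        Example (a minimal sketch mirroring test/test_ops/test_voxel_pooling.py;
        shapes are illustrative and a CUDA device is assumed, since the op is CUDA-only):
            geom_xyz = torch.randint(0, 128, (2, 6000, 3),
                                     dtype=torch.int, device='cuda')
            geom_xyz[..., 2] = 0  # single z slice, matching voxel_num[2] == 1
            features = torch.rand(2, 6000, 80, device='cuda')
            voxel_num = torch.tensor([128, 128, 1], dtype=torch.int,
                                     device='cuda')
            bev = voxel_pooling(geom_xyz, features, voxel_num)
            # bev.shape == (2, 80, 128, 128)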
24 | """ 25 | assert geom_xyz.is_contiguous() 26 | assert input_features.is_contiguous() 27 | # no gradient for input_features and geom_feats 28 | ctx.mark_non_differentiable(geom_xyz) 29 | grad_input_features = torch.zeros_like(input_features) 30 | geom_xyz = geom_xyz.reshape(geom_xyz.shape[0], -1, geom_xyz.shape[-1]) 31 | input_features = input_features.reshape( 32 | (geom_xyz.shape[0], -1, input_features.shape[-1])) 33 | assert geom_xyz.shape[1] == input_features.shape[1] 34 | batch_size = input_features.shape[0] 35 | num_points = input_features.shape[1] 36 | num_channels = input_features.shape[2] 37 | output_features = input_features.new_zeros(batch_size, voxel_num[1], 38 | voxel_num[0], num_channels) 39 | # Save the position of bev_feature_map for each input point. 40 | pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1 41 | voxel_pooling_ext.voxel_pooling_forward_wrapper( 42 | batch_size, 43 | num_points, 44 | num_channels, 45 | voxel_num[0], 46 | voxel_num[1], 47 | voxel_num[2], 48 | geom_xyz, 49 | input_features, 50 | output_features, 51 | pos_memo, 52 | ) 53 | # save grad_input_features and pos_memo for backward 54 | ctx.save_for_backward(grad_input_features, pos_memo) 55 | return output_features.permute(0, 3, 1, 2) 56 | 57 | @staticmethod 58 | def backward(ctx, grad_output_features): 59 | (grad_input_features, pos_memo) = ctx.saved_tensors 60 | kept = (pos_memo != -1)[..., 0] 61 | grad_input_features_shape = grad_input_features.shape 62 | grad_input_features = grad_input_features.reshape( 63 | grad_input_features.shape[0], -1, grad_input_features.shape[-1]) 64 | grad_input_features[kept] = grad_output_features[ 65 | pos_memo[kept][..., 0].long(), :, pos_memo[kept][..., 1].long(), 66 | pos_memo[kept][..., 2].long()] 67 | grad_input_features = grad_input_features.reshape( 68 | grad_input_features_shape) 69 | return None, grad_input_features, None 70 | 71 | 72 | voxel_pooling = VoxelPooling.apply 73 | -------------------------------------------------------------------------------- /exps/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 
2 | """ 3 | mAP: 0.3589 4 | mATE: 0.6119 5 | mASE: 0.2692 6 | mAOE: 0.5074 7 | mAVE: 0.4086 8 | mAAE: 0.2009 9 | NDS: 0.4797 10 | Eval time: 183.3s 11 | Per-class results: 12 | Object Class AP ATE ASE AOE AVE AAE 13 | car 0.559 0.475 0.157 0.112 0.370 0.205 14 | truck 0.270 0.659 0.196 0.103 0.356 0.181 15 | bus 0.374 0.651 0.184 0.072 0.846 0.326 16 | trailer 0.179 0.963 0.227 0.512 0.294 0.127 17 | construction_vehicle 0.081 0.825 0.481 1.352 0.094 0.345 18 | pedestrian 0.363 0.690 0.297 0.831 0.491 0.244 19 | motorcycle 0.354 0.580 0.255 0.545 0.615 0.164 20 | bicycle 0.301 0.447 0.280 0.920 0.203 0.015 21 | traffic_cone 0.539 0.435 0.324 nan nan nan 22 | barrier 0.569 0.394 0.293 0.120 nan nan 23 | """ 24 | from argparse import ArgumentParser, Namespace 25 | 26 | import pytorch_lightning as pl 27 | import torch 28 | 29 | from callbacks.ema import EMACallback 30 | from exps.bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da import \ 31 | BEVDepthLightningModel as BaseBEVDepthLightningModel 32 | 33 | 34 | class BEVDepthLightningModel(BaseBEVDepthLightningModel): 35 | def __init__(self, **kwargs): 36 | super().__init__(**kwargs) 37 | self.data_use_cbgs = True 38 | 39 | def configure_optimizers(self): 40 | lr = self.basic_lr_per_img * \ 41 | self.batch_size_per_device * self.gpus 42 | optimizer = torch.optim.AdamW(self.model.parameters(), 43 | lr=lr, 44 | weight_decay=1e-2) 45 | return [optimizer] 46 | 47 | 48 | def main(args: Namespace) -> None: 49 | if args.seed is not None: 50 | pl.seed_everything(args.seed) 51 | 52 | model = BEVDepthLightningModel(**vars(args)) 53 | train_dataloader = model.train_dataloader() 54 | ema_callback = EMACallback(len(train_dataloader.dataset) * args.max_epochs) 55 | trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback]) 56 | if args.evaluate: 57 | trainer.test(model, ckpt_path=args.ckpt_path) 58 | else: 59 | trainer.fit(model) 60 | 61 | 62 | def run_cli(): 63 | parent_parser = ArgumentParser(add_help=False) 64 | parent_parser = pl.Trainer.add_argparse_args(parent_parser) 65 | parent_parser.add_argument('-e', 66 | '--evaluate', 67 | dest='evaluate', 68 | action='store_true', 69 | help='evaluate model on validation set') 70 | parent_parser.add_argument('-b', '--batch_size_per_device', type=int) 71 | parent_parser.add_argument('--seed', 72 | type=int, 73 | default=0, 74 | help='seed for initializing training.') 75 | parent_parser.add_argument('--ckpt_path', type=str) 76 | parser = BEVDepthLightningModel.add_model_specific_args(parent_parser) 77 | parser.set_defaults(profiler='simple', 78 | deterministic=False, 79 | max_epochs=20, 80 | accelerator='ddp', 81 | num_sanity_val_steps=0, 82 | gradient_clip_val=5, 83 | limit_val_batches=0, 84 | enable_checkpointing=False, 85 | precision=16, 86 | default_root_dir='./outputs/bev_depth_lss_r50_' 87 | '256x704_128x128_20e_cbgs_2key_da_ema') 88 | args = parser.parse_args() 89 | main(args) 90 | 91 | 92 | if __name__ == '__main__': 93 | run_cli() 94 | -------------------------------------------------------------------------------- /exps/bev_depth_lss_r50_256x704_128x128_24e_2key.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 
2 | """ 3 | mAP: 0.3304 4 | mATE: 0.7021 5 | mASE: 0.2795 6 | mAOE: 0.5346 7 | mAVE: 0.5530 8 | mAAE: 0.2274 9 | NDS: 0.4355 10 | Eval time: 171.8s 11 | 12 | Per-class results: 13 | Object Class AP ATE ASE AOE AVE AAE 14 | car 0.499 0.540 0.165 0.211 0.650 0.233 15 | truck 0.278 0.719 0.218 0.265 0.547 0.215 16 | bus 0.386 0.661 0.211 0.171 1.132 0.274 17 | trailer 0.168 1.034 0.235 0.548 0.408 0.168 18 | construction_vehicle 0.075 1.124 0.510 1.177 0.111 0.385 19 | pedestrian 0.284 0.757 0.298 0.966 0.578 0.301 20 | motorcycle 0.335 0.624 0.263 0.621 0.734 0.237 21 | bicycle 0.305 0.554 0.264 0.653 0.263 0.006 22 | traffic_cone 0.462 0.516 0.355 nan nan nan 23 | barrier 0.512 0.491 0.275 0.200 nan nan 24 | """ 25 | from argparse import ArgumentParser, Namespace 26 | 27 | import pytorch_lightning as pl 28 | 29 | from callbacks.ema import EMACallback 30 | from exps.bev_depth_lss_r50_256x704_128x128_24e import \ 31 | BEVDepthLightningModel as BaseBEVDepthLightningModel 32 | # from models.bev_depth import BEVDepth 33 | from models.mh_depth import BEVDepth 34 | 35 | 36 | class BEVDepthLightningModel(BaseBEVDepthLightningModel): 37 | def __init__(self, **kwargs): 38 | super().__init__(**kwargs) 39 | self.key_idxes = [-1] 40 | self.head_conf['bev_backbone_conf']['in_channels'] = 80 * ( 41 | len(self.key_idxes) + 1) 42 | self.head_conf['bev_neck_conf']['in_channels'] = [ 43 | 80 * (len(self.key_idxes) + 1), 160, 320, 640 44 | ] 45 | self.head_conf['train_cfg']['code_weight'] = [ 46 | 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 47 | ] 48 | self.model = BEVDepth(self.backbone_conf, 49 | self.head_conf, 50 | is_train_depth=True) 51 | 52 | 53 | def main(args: Namespace) -> None: 54 | if args.seed is not None: 55 | pl.seed_everything(args.seed) 56 | 57 | model = BEVDepthLightningModel(**vars(args)) 58 | train_dataloader = model.train_dataloader() 59 | ema_callback = EMACallback(len(train_dataloader.dataset) * args.max_epochs) 60 | trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback]) 61 | if args.evaluate: 62 | trainer.test(model, ckpt_path=args.ckpt_path) 63 | else: 64 | trainer.fit(model) 65 | 66 | 67 | def run_cli(): 68 | parent_parser = ArgumentParser(add_help=False) 69 | parent_parser = pl.Trainer.add_argparse_args(parent_parser) 70 | parent_parser.add_argument('-e', 71 | '--evaluate', 72 | dest='evaluate', 73 | action='store_true', 74 | help='evaluate model on validation set') 75 | parent_parser.add_argument('-b', '--batch_size_per_device', type=int) 76 | parent_parser.add_argument('--seed', 77 | type=int, 78 | default=0, 79 | help='seed for initializing training.') 80 | parent_parser.add_argument('--ckpt_path', type=str) 81 | parser = BEVDepthLightningModel.add_model_specific_args(parent_parser) 82 | parser.set_defaults( 83 | profiler='simple', 84 | deterministic=False, 85 | max_epochs=24, 86 | accelerator='ddp', 87 | num_sanity_val_steps=0, 88 | gradient_clip_val=5, 89 | limit_val_batches=0, 90 | enable_checkpointing=True, 91 | precision=16, 92 | default_root_dir='./outputs/bev_depth_lss_r50_256x704_128x128_24e_2key' 93 | ) 94 | args = parser.parse_args() 95 | main(args) 96 | 97 | 98 | if __name__ == '__main__': 99 | run_cli() 100 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Linux ### 2 | *~ 3 | 4 | # temporary files which can be created if a process still has a handle open of a deleted file 5 | .fuse_hidden* 6 | 7 | # KDE directory 
preferences 8 | .directory 9 | 10 | # Linux trash folder which might appear on any partition or disk 11 | .Trash-* 12 | 13 | # .nfs files are created when an open file is removed but is still being accessed 14 | .nfs* 15 | 16 | ### PyCharm ### 17 | # User-specific stuff 18 | .idea 19 | 20 | # CMake 21 | cmake-build-*/ 22 | 23 | # Mongo Explorer plugin 24 | .idea/**/mongoSettings.xml 25 | 26 | # File-based project format 27 | *.iws 28 | 29 | # IntelliJ 30 | out/ 31 | 32 | # mpeltonen/sbt-idea plugin 33 | .idea_modules/ 34 | 35 | # JIRA plugin 36 | atlassian-ide-plugin.xml 37 | 38 | # Cursive Clojure plugin 39 | .idea/replstate.xml 40 | 41 | # Crashlytics plugin (for Android Studio and IntelliJ) 42 | com_crashlytics_export_strings.xml 43 | crashlytics.properties 44 | crashlytics-build.properties 45 | fabric.properties 46 | 47 | # Editor-based Rest Client 48 | .idea/httpRequests 49 | 50 | # Android studio 3.1+ serialized cache file 51 | .idea/caches/build_file_checksums.ser 52 | 53 | # JetBrains templates 54 | **___jb_tmp___ 55 | 56 | ### Python ### 57 | # Byte-compiled / optimized / DLL files 58 | __pycache__/ 59 | *.py[cod] 60 | *$py.class 61 | 62 | # C extensions 63 | *.so 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | dist/ 70 | downloads/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | pip-wheel-metadata/ 80 | share/python-wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | MANIFEST 85 | 86 | # PyInstaller 87 | # Usually these files are written by a python script from a template 88 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 89 | *.manifest 90 | *.spec 91 | 92 | # Installer logs 93 | pip-log.txt 94 | pip-delete-this-directory.txt 95 | 96 | # Unit test / coverage reports 97 | htmlcov/ 98 | .tox/ 99 | .nox/ 100 | .coverage 101 | .coverage.* 102 | .cache 103 | nosetests.xml 104 | coverage.xml 105 | *.cover 106 | .hypothesis/ 107 | .pytest_cache/ 108 | 109 | # Translations 110 | *.mo 111 | *.pot 112 | 113 | # Django stuff: 114 | *.log 115 | local_settings.py 116 | db.sqlite3 117 | 118 | # Flask stuff: 119 | instance/ 120 | .webassets-cache 121 | 122 | # Scrapy stuff: 123 | .scrapy 124 | 125 | # Sphinx documentation 126 | docs/_build/ 127 | docs/build/ 128 | 129 | # PyBuilder 130 | target/ 131 | 132 | # Jupyter Notebook 133 | .ipynb_checkpoints 134 | 135 | # IPython 136 | profile_default/ 137 | ipython_config.py 138 | 139 | # pyenv 140 | .python-version 141 | 142 | # pipenv 143 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 144 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 145 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 146 | # install all needed dependencies. 
147 | #Pipfile.lock 148 | 149 | # celery beat schedule file 150 | celerybeat-schedule 151 | 152 | # SageMath parsed files 153 | *.sage.py 154 | 155 | # Environments 156 | .env 157 | .venv 158 | env/ 159 | venv/ 160 | ENV/ 161 | env.bak/ 162 | venv.bak/ 163 | 164 | # Spyder project settings 165 | .spyderproject 166 | .spyproject 167 | 168 | # Rope project settings 169 | .ropeproject 170 | 171 | # mkdocs documentation 172 | /site 173 | 174 | # mypy 175 | .mypy_cache/ 176 | .dmypy.json 177 | dmypy.json 178 | 179 | # Pyre type checker 180 | .pyre/ 181 | 182 | ### Vim ### 183 | # Swap 184 | [._]*.s[a-v][a-z] 185 | [._]*.sw[a-p] 186 | [._]s[a-rt-v][a-z] 187 | [._]ss[a-gi-z] 188 | [._]sw[a-p] 189 | 190 | # Session 191 | Session.vim 192 | 193 | # Temporary 194 | .netrwhist 195 | # Auto-generated tag files 196 | tags 197 | # Persistent undo 198 | [._]*.un~ 199 | 200 | ### Researcher ### 201 | # output 202 | train_log 203 | docs/api 204 | .code-workspace.code-workspace 205 | output 206 | outputs 207 | instant_test_output 208 | inference_test_output 209 | *.pkl 210 | *.npy 211 | *.pth 212 | events.out.tfevents* 213 | 214 | # vscode 215 | *.code-workspace 216 | .vscode 217 | 218 | # vim 219 | .vim 220 | -------------------------------------------------------------------------------- /layers/voxel_mix.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | class SELayer(nn.Module): 5 | def __init__(self, channel, reduction=16): 6 | super(SELayer, self).__init__() 7 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 8 | self.fc = nn.Sequential( 9 | nn.Linear(channel, channel // reduction, bias=False), 10 | nn.ReLU(inplace=True), 11 | nn.Linear(channel // reduction, channel, bias=False), 12 | nn.Sigmoid() 13 | ) 14 | def forward(self, x): 15 | b, c, _, _ = x.size() 16 | y = self.avg_pool(x).view(b, c) 17 | y = self.fc(y).view(b, c, 1, 1) 18 | return x * y.expand_as(x) 19 | 20 | class voxel_mix_net(nn.Module): 21 | # [4,240,128,128] -> [4,80,128,128] 22 | def __init__(self): 23 | super(voxel_mix_net, self).__init__() 24 | in_channels = 80*6 25 | out_channels = 80 26 | local_global_channels = 80*3 27 | mix_channels = 80*2 28 | stride = 1 29 | self.se = SELayer(in_channels) 30 | self.residual_function = nn.Sequential( 31 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 32 | nn.BatchNorm2d(out_channels), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False), 35 | nn.BatchNorm2d(out_channels) 36 | ) 37 | self.shortcut = nn.Sequential( 38 | nn.Conv2d(in_channels, out_channels, stride=stride, kernel_size=1, bias=False), 39 | nn.BatchNorm2d(out_channels) 40 | ) 41 | self.relu = nn.ReLU(inplace=True) 42 | 43 | self.gl_se = SELayer(local_global_channels) 44 | self.gl_residual = nn.Sequential( 45 | nn.Conv2d(local_global_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 46 | nn.BatchNorm2d(out_channels), 47 | nn.ReLU(inplace=True), 48 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False), 49 | nn.BatchNorm2d(out_channels) 50 | ) 51 | self.gl_shortcut = nn.Sequential( 52 | nn.Conv2d(local_global_channels, out_channels, stride=stride, kernel_size=1, bias=False), 53 | nn.BatchNorm2d(out_channels) 54 | ) 55 | self.relu2 = nn.ReLU(inplace=True) 56 | 57 | # self.mix_se = SELayer(mix_channels) 58 | # self.mix_residual = nn.Sequential( 59 | # nn.Conv2d(mix_channels, out_channels, kernel_size=3, 
stride=stride, padding=1, bias=False), 60 | # nn.BatchNorm2d(out_channels), 61 | # nn.ReLU(inplace=True), 62 | # nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False), 63 | # nn.BatchNorm2d(out_channels) 64 | # ) 65 | # self.mix_shortcut = nn.Sequential( 66 | # nn.Conv2d(mix_channels, out_channels, stride=stride, kernel_size=1, bias=False), 67 | # nn.BatchNorm2d(out_channels) 68 | # ) 69 | # self.relu3 = nn.ReLU(inplace=True) 70 | 71 | def forward(self, input): 72 | local_depth = input[:,:-80*3,:,:] 73 | lc_feature = self.se(local_depth) 74 | lc_feature = self.relu(self.residual_function(lc_feature) + self.shortcut(lc_feature)) 75 | 76 | glob_depth = input[:,-80*3:,:,:] 77 | gl_feature = self.gl_se(glob_depth) 78 | gl_feature = self.relu2(self.gl_residual(gl_feature) + self.gl_shortcut(gl_feature)) 79 | 80 | # mix_feature = torch.cat([lc_feature,gl_feature],dim=1) 81 | # result = self.mix_se(mix_feature) 82 | # result = self.relu2(self.mix_residual(result) + self.mix_shortcut(mix_feature)) 83 | # return result 84 | gl_feature = self.gl_se(glob_depth) 85 | gl_feature = self.relu2(self.gl_residual(gl_feature) + self.gl_shortcut(gl_feature)) 86 | 87 | return gl_feature 88 | 89 | if __name__ == '__main__': 90 | 91 | mixer = voxel_mix_net() 92 | voxel_feature = torch.randn((4, 80, 128,128)) 93 | mix_feature = torch.cat([voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature],dim=1) 94 | out = mixer(mix_feature) 95 | out = out.type(torch.HalfTensor) 96 | exit(0) 97 | -------------------------------------------------------------------------------- /models/bev_depth.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from layers.backbones.lss_fpn import LSSFPN 4 | from layers.heads.bev_depth_head import BEVDepthHead 5 | 6 | __all__ = ['BEVDepth'] 7 | 8 | 9 | class BEVDepth(nn.Module): 10 | """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`. 11 | 12 | Args: 13 | backbone_conf (dict): Config of backbone. 14 | head_conf (dict): Config of head. 15 | is_train_depth (bool): Whether to return depth. 16 | Default: False. 17 | """ 18 | 19 | # TODO: Reduce grid_conf and data_aug_conf 20 | def __init__(self, backbone_conf, head_conf, is_train_depth=False): 21 | super(BEVDepth, self).__init__() 22 | self.backbone = LSSFPN(**backbone_conf) 23 | self.head = BEVDepthHead(**head_conf) 24 | self.is_train_depth = is_train_depth 25 | 26 | def forward( 27 | self, 28 | x, 29 | mats_dict, 30 | timestamps=None, 31 | ): 32 | """Forward function for BEVDepth 33 | 34 | Args: 35 | x (Tensor): Input ferature map. 36 | mats_dict(dict): 37 | sensor2ego_mats(Tensor): Transformation matrix from 38 | camera to ego with shape of (B, num_sweeps, 39 | num_cameras, 4, 4). 40 | intrin_mats(Tensor): Intrinsic matrix with shape 41 | of (B, num_sweeps, num_cameras, 4, 4). 42 | ida_mats(Tensor): Transformation matrix for ida with 43 | shape of (B, num_sweeps, num_cameras, 4, 4). 44 | sensor2sensor_mats(Tensor): Transformation matrix 45 | from key frame camera to sweep frame camera with 46 | shape of (B, num_sweeps, num_cameras, 4, 4). 47 | bda_mat(Tensor): Rotation matrix for bda with shape 48 | of (B, 4, 4). 49 | timestamps (long): Timestamp. 50 | Default: None. 51 | 52 | Returns: 53 | tuple(list[dict]): Output results for tasks. 
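
        Example (a minimal sketch; tensor shapes follow
        test/test_layers/test_backbone.py and the 256x704 configs under exps/,
        and backbone_conf/head_conf are assumed to be filled in as in those files):
            model = BEVDepth(backbone_conf, head_conf)
            sweep_imgs = torch.rand(2, 2, 6, 3, 256, 704)  # (B, sweeps, cams, 3, H, W)
            mats_dict = {k: torch.rand(2, 2, 6, 4, 4) for k in
                         ('sensor2ego_mats', 'intrin_mats', 'ida_mats',
                          'sensor2sensor_mats')}
            mats_dict['bda_mat'] = torch.rand(2, 4, 4)
            preds = model(sweep_imgs, mats_dict)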
54 | """ 55 | if self.is_train_depth and self.training: 56 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 57 | mats_dict, 58 | timestamps, 59 | is_return_depth=True) 60 | preds = self.head(x) 61 | return preds, depth_pred, img_feats, x, voxel_feats, camera_feats 62 | else: 63 | x = self.backbone(x, mats_dict, timestamps) 64 | preds = self.head(x) 65 | return preds 66 | 67 | def get_targets(self, gt_boxes, gt_labels): 68 | """Generate training targets for a single sample. 69 | 70 | Args: 71 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 72 | gt_labels_3d (torch.Tensor): Labels of boxes. 73 | 74 | Returns: 75 | tuple[list[torch.Tensor]]: Tuple of target including \ 76 | the following results in order. 77 | 78 | - list[torch.Tensor]: Heatmap scores. 79 | - list[torch.Tensor]: Ground truth boxes. 80 | - list[torch.Tensor]: Indexes indicating the position \ 81 | of the valid boxes. 82 | - list[torch.Tensor]: Masks indicating which boxes \ 83 | are valid. 84 | """ 85 | return self.head.get_targets(gt_boxes, gt_labels) 86 | 87 | def loss(self, targets, preds_dicts): 88 | """Loss function for BEVDepth. 89 | 90 | Args: 91 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 92 | truth gt boxes. 93 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 94 | preds_dicts (dict): Output of forward function. 95 | 96 | Returns: 97 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 98 | """ 99 | return self.head.loss(targets, preds_dicts) 100 | 101 | def get_bboxes(self, preds_dicts, img_metas=None, img=None, rescale=False): 102 | """Generate bboxes from bbox head predictions. 103 | 104 | Args: 105 | preds_dicts (tuple[list[dict]]): Prediction results. 106 | img_metas (list[dict]): Point cloud and image's meta info. 107 | 108 | Returns: 109 | list[dict]: Decoded bbox, scores and labels after nms. 110 | """ 111 | return self.head.get_bboxes(preds_dicts, img_metas, img, rescale) 112 | -------------------------------------------------------------------------------- /callbacks/ema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 3 | import math 4 | import os 5 | from copy import deepcopy 6 | 7 | import torch 8 | import torch.nn as nn 9 | from pytorch_lightning.callbacks import Callback 10 | 11 | __all__ = ['ModelEMA', 'is_parallel'] 12 | 13 | 14 | def is_parallel(model): 15 | """check if model is in parallel mode.""" 16 | parallel_type = ( 17 | nn.parallel.DataParallel, 18 | nn.parallel.DistributedDataParallel, 19 | ) 20 | return isinstance(model, parallel_type) 21 | 22 | 23 | class ModelEMA: 24 | """ 25 | Model Exponential Moving Average from https://github.com/rwightman/ 26 | pytorch-image-models Keep a moving average of everything in 27 | the model state_dict (parameters and buffers). 28 | This is intended to allow functionality like 29 | https://www.tensorflow.org/api_docs/python/tf/train/ 30 | ExponentialMovingAverage 31 | A smoothed version of the weights is necessary for some training 32 | schemes to perform well. 33 | This class is sensitive where it is initialized in the sequence 34 | of model init, GPU assignment and distributed training wrappers. 35 | """ 36 | def __init__(self, model, decay=0.9999, updates=0): 37 | """ 38 | Args: 39 | model (nn.Module): model to apply EMA. 40 | decay (float): ema decay reate. 41 | updates (int): counter of EMA updates. 
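
        Example (a minimal sketch of manual use; in this repo EMACallback
        below constructs and updates ModelEMA automatically):
            ema = ModelEMA(model, decay=0.9990)
            for batch in loader:
                ...                                    # forward/backward/optimizer step on model
                ema.update(trainer=None, model=model)  # trainer is not used by update()
            smoothed_state_dict = ema.ema.state_dict()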
42 | """ 43 | # Create EMA(FP32) 44 | self.ema = deepcopy( 45 | model.module if is_parallel(model) else model).eval() 46 | self.updates = updates 47 | # decay exponential ramp (to help early epochs) 48 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 49 | for p in self.ema.parameters(): 50 | p.requires_grad_(False) 51 | 52 | def update(self, trainer, model): 53 | # Update EMA parameters 54 | with torch.no_grad(): 55 | self.updates += 1 56 | d = self.decay(self.updates) 57 | 58 | msd = model.module.state_dict() if is_parallel( 59 | model) else model.state_dict() # model state_dict 60 | for k, v in self.ema.state_dict().items(): 61 | if v.dtype.is_floating_point: 62 | v *= d 63 | v += (1.0 - d) * msd[k].detach() 64 | 65 | 66 | class EMACallback(Callback): 67 | def __init__(self, len_updates) -> None: 68 | super().__init__() 69 | self.len_updates = len_updates 70 | 71 | def on_fit_start(self, trainer, pl_module): 72 | # Todo (@lizeming@megvii.com): delete manually specified device 73 | from torch.nn.modules.batchnorm import SyncBatchNorm 74 | 75 | bn_model_list = list() 76 | bn_model_dist_group_list = list() 77 | for model_ref in trainer.model.modules(): 78 | if isinstance(model_ref, SyncBatchNorm): 79 | bn_model_list.append(model_ref) 80 | bn_model_dist_group_list.append(model_ref.process_group) 81 | model_ref.process_group = None 82 | trainer.ema_model = ModelEMA(trainer.model.module.module.model.cuda(), 83 | 0.9990) 84 | 85 | for bn_model, dist_group in zip(bn_model_list, 86 | bn_model_dist_group_list): 87 | bn_model.process_group = dist_group 88 | trainer.ema_model.updates = self.len_updates 89 | 90 | def on_train_batch_end(self, 91 | trainer, 92 | pl_module, 93 | outputs, 94 | batch, 95 | batch_idx, 96 | unused=0): 97 | trainer.ema_model.update(trainer, trainer.model.module.module.model) 98 | 99 | def on_train_epoch_end(self, trainer, pl_module) -> None: 100 | state_dict = trainer.ema_model.ema.state_dict() 101 | state_dict_keys = list(state_dict.keys()) 102 | # TODO: Change to more elegant way. 103 | for state_dict_key in state_dict_keys: 104 | new_key = 'model.' + state_dict_key 105 | state_dict[new_key] = state_dict.pop(state_dict_key) 106 | checkpoint = { 107 | # the epoch and global step are saved for 108 | # compatibility but they are not relevant for restoration 109 | 'epoch': trainer.current_epoch, 110 | 'global_step': trainer.global_step, 111 | 'state_dict': state_dict 112 | } 113 | torch.save( 114 | checkpoint, 115 | os.path.join(trainer.log_dir, f'{trainer.current_epoch}.pth')) 116 | -------------------------------------------------------------------------------- /scripts/gen_depth_gt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | 4 | import mmcv 5 | import numpy as np 6 | from nuscenes.utils.data_classes import LidarPointCloud 7 | from nuscenes.utils.geometry_utils import view_points 8 | from pyquaternion import Quaternion 9 | 10 | 11 | # https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/nuscenes.py#L834 12 | def map_pointcloud_to_image( 13 | pc, 14 | im, 15 | lidar_calibrated_sensor, 16 | lidar_ego_pose, 17 | cam_calibrated_sensor, 18 | cam_ego_pose, 19 | min_dist: float = 0.0, 20 | ): 21 | 22 | # Points live in the point sensor frame. So they need to be 23 | # transformed via global to the image plane. 24 | # First step: transform the pointcloud to the ego vehicle 25 | # frame for the timestamp of the sweep. 
26 | 27 | pc = LidarPointCloud(pc.T) 28 | pc.rotate(Quaternion(lidar_calibrated_sensor['rotation']).rotation_matrix) 29 | pc.translate(np.array(lidar_calibrated_sensor['translation'])) 30 | 31 | # Second step: transform from ego to the global frame. 32 | pc.rotate(Quaternion(lidar_ego_pose['rotation']).rotation_matrix) 33 | pc.translate(np.array(lidar_ego_pose['translation'])) 34 | 35 | # Third step: transform from global into the ego vehicle 36 | # frame for the timestamp of the image. 37 | pc.translate(-np.array(cam_ego_pose['translation'])) 38 | pc.rotate(Quaternion(cam_ego_pose['rotation']).rotation_matrix.T) 39 | 40 | # Fourth step: transform from ego into the camera. 41 | pc.translate(-np.array(cam_calibrated_sensor['translation'])) 42 | pc.rotate(Quaternion(cam_calibrated_sensor['rotation']).rotation_matrix.T) 43 | 44 | # Fifth step: actually take a "picture" of the point cloud. 45 | # Grab the depths (camera frame z axis points away from the camera). 46 | depths = pc.points[2, :] 47 | coloring = depths 48 | 49 | # Take the actual picture (matrix multiplication with camera-matrix 50 | # + renormalization). 51 | points = view_points(pc.points[:3, :], 52 | np.array(cam_calibrated_sensor['camera_intrinsic']), 53 | normalize=True) 54 | 55 | # Remove points that are either outside or behind the camera. 56 | # Leave a margin of 1 pixel for aesthetic reasons. Also make 57 | # sure points are at least 1m in front of the camera to avoid 58 | # seeing the lidar points on the camera casing for non-keyframes 59 | # which are slightly out of sync. 60 | mask = np.ones(depths.shape[0], dtype=bool) 61 | mask = np.logical_and(mask, depths > min_dist) 62 | mask = np.logical_and(mask, points[0, :] > 1) 63 | mask = np.logical_and(mask, points[0, :] < im.shape[1] - 1) 64 | mask = np.logical_and(mask, points[1, :] > 1) 65 | mask = np.logical_and(mask, points[1, :] < im.shape[0] - 1) 66 | points = points[:, mask] 67 | coloring = coloring[mask] 68 | 69 | return points, coloring 70 | 71 | 72 | data_root = 'data/nuScenes' 73 | info_path = 'data/nuScenes/nuscenes_12hz_infos_train.pkl' 74 | # data3d_nusc = NuscMVDetData() 75 | 76 | lidar_key = 'LIDAR_TOP' 77 | cam_keys = [ 78 | 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 79 | 'CAM_BACK', 'CAM_BACK_LEFT' 80 | ] 81 | 82 | 83 | def worker(info): 84 | lidar_path = info['lidar_infos'][lidar_key]['filename'] 85 | points = np.fromfile(os.path.join(data_root, lidar_path), 86 | dtype=np.float32, 87 | count=-1).reshape(-1, 5)[..., :4] 88 | lidar_calibrated_sensor = info['lidar_infos'][lidar_key][ 89 | 'calibrated_sensor'] 90 | lidar_ego_pose = info['lidar_infos'][lidar_key]['ego_pose'] 91 | for i, cam_key in enumerate(cam_keys): 92 | cam_calibrated_sensor = info['cam_infos'][cam_key]['calibrated_sensor'] 93 | cam_ego_pose = info['cam_infos'][cam_key]['ego_pose'] 94 | img = mmcv.imread( 95 | os.path.join(data_root, info['cam_infos'][cam_key]['filename'])) 96 | pts_img, depth = map_pointcloud_to_image( 97 | points.copy(), img, lidar_calibrated_sensor.copy(), 98 | lidar_ego_pose.copy(), cam_calibrated_sensor, cam_ego_pose) 99 | file_name = os.path.split(info['cam_infos'][cam_key]['filename'])[-1] 100 | np.concatenate([pts_img[:2, :].T, depth[:, None]], 101 | axis=1).astype(np.float32).flatten().tofile( 102 | os.path.join(data_root, 'depth_gt', 103 | f'{file_name}.bin')) 104 | # plt.savefig(f"{sample_idx}") 105 | 106 | 107 | if __name__ == '__main__': 108 | po = Pool(24) 109 | mmcv.mkdir_or_exist(os.path.join(data_root, 'depth_gt')) 110 | infos = 
mmcv.load(info_path) 111 | # import ipdb; ipdb.set_trace() 112 | for info in infos: 113 | po.apply_async(func=worker, args=(info, )) 114 | po.close() 115 | po.join() 116 | -------------------------------------------------------------------------------- /scripts/gen_info.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | import numpy as np 3 | from nuscenes.nuscenes import NuScenes 4 | from nuscenes.utils import splits 5 | from tqdm import tqdm 6 | 7 | 8 | def generate_info(nusc, scenes): 9 | infos = list() 10 | for cur_scene in tqdm(nusc.scene): 11 | if cur_scene['name'] not in scenes: 12 | continue 13 | first_sample_token = cur_scene['first_sample_token'] 14 | cur_sample = nusc.get('sample', first_sample_token) 15 | while True: 16 | info = dict() 17 | cam_info = dict() 18 | info['sample_token'] = cur_sample['token'] 19 | info['timestamp'] = cur_sample['timestamp'] 20 | info['scene_token'] = cur_sample['scene_token'] 21 | cam_names = [ 22 | 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 23 | 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT' 24 | ] 25 | lidar_names = ['LIDAR_TOP'] 26 | cam_infos = dict() 27 | lidar_infos = dict() 28 | for cam_name in cam_names: 29 | cam_data = nusc.get('sample_data', 30 | cur_sample['data'][cam_name]) 31 | cam_info = dict() 32 | cam_info['sample_token'] = cam_data['sample_token'] 33 | cam_info['ego_pose'] = nusc.get('ego_pose', 34 | cam_data['ego_pose_token']) 35 | cam_info['timestamp'] = cam_data['timestamp'] 36 | cam_info['is_key_frame'] = cam_data['is_key_frame'] 37 | cam_info['height'] = cam_data['height'] 38 | cam_info['width'] = cam_data['width'] 39 | cam_info['filename'] = cam_data['filename'] 40 | cam_info['calibrated_sensor'] = nusc.get( 41 | 'calibrated_sensor', cam_data['calibrated_sensor_token']) 42 | cam_infos[cam_name] = cam_info 43 | for lidar_name in lidar_names: 44 | lidar_data = nusc.get('sample_data', 45 | cur_sample['data'][lidar_name]) 46 | lidar_info = dict() 47 | lidar_info['sample_token'] = lidar_data['sample_token'] 48 | lidar_info['ego_pose'] = nusc.get('ego_pose', 49 | lidar_data['ego_pose_token']) 50 | lidar_info['timestamp'] = lidar_data['timestamp'] 51 | lidar_info['filename'] = lidar_data['filename'] 52 | lidar_info['calibrated_sensor'] = nusc.get( 53 | 'calibrated_sensor', lidar_data['calibrated_sensor_token']) 54 | lidar_infos[lidar_name] = lidar_info 55 | 56 | sweeps = list() 57 | info['cam_infos'] = cam_infos 58 | info['lidar_infos'] = lidar_infos 59 | cam_datas = list() 60 | for i in range(6): 61 | sweeps.append(dict()) 62 | for cam_name in cam_names: 63 | cam_datas.append( 64 | nusc.get('sample_data', cur_sample['data'][cam_name])) 65 | for k, cam_data in enumerate(cam_datas): 66 | for j in range(6): 67 | if cam_data['prev'] == '': 68 | break 69 | else: 70 | cam_data = nusc.get('sample_data', cam_data['prev']) 71 | cam_info = dict() 72 | cam_info['sample_token'] = cam_data['sample_token'] 73 | assert cam_info['sample_token'] == cam_info[ 74 | 'sample_token'] 75 | cam_info['ego_pose'] = nusc.get( 76 | 'ego_pose', cam_data['ego_pose_token']) 77 | cam_info['timestamp'] = cam_data['timestamp'] 78 | cam_info['is_key_frame'] = cam_data['is_key_frame'] 79 | cam_info['height'] = cam_data['height'] 80 | cam_info['width'] = cam_data['width'] 81 | cam_info['filename'] = cam_data['filename'] 82 | cam_info['calibrated_sensor'] = nusc.get( 83 | 'calibrated_sensor', 84 | cam_data['calibrated_sensor_token']) 85 | sweeps[j][cam_names[k]] = cam_info 86 | # Remove empty sweeps. 
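            # sweeps[j] holds the j-th previous frame for each camera; truncate the list at
            # the first index where no camera had a previous frame left.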
87 | for i, sweep in enumerate(sweeps): 88 | if len(sweep.keys()) == 0: 89 | sweeps = sweeps[:i] 90 | break 91 | info['sweeps'] = sweeps 92 | ann_infos = list() 93 | for ann in cur_sample['anns']: 94 | ann_info = nusc.get('sample_annotation', ann) 95 | velocity = nusc.box_velocity(ann_info['token']) 96 | if np.any(np.isnan(velocity)): 97 | velocity = np.zeros(3) 98 | ann_info['velocity'] = velocity 99 | ann_infos.append(ann_info) 100 | info['ann_infos'] = ann_infos 101 | infos.append(info) 102 | if cur_sample['next'] == '': 103 | break 104 | else: 105 | cur_sample = nusc.get('sample', cur_sample['next']) 106 | return infos 107 | 108 | 109 | def main(): 110 | nusc = NuScenes(version='v1.0-trainval', 111 | dataroot='./data/nuScenes/', 112 | verbose=True) 113 | train_scenes = splits.train 114 | val_scenes = splits.val 115 | train_infos = generate_info(nusc, train_scenes) 116 | val_infos = generate_info(nusc, val_scenes) 117 | mmcv.dump(train_infos, './data/nuScenes/nuscenes_12hz_infos_train.pkl') 118 | mmcv.dump(val_infos, './data/nuScenes/nuscenes_12hz_infos_val.pkl') 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /test/test_layers/test_head.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import torch 5 | from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes 6 | 7 | from layers.heads.bev_depth_head import BEVDepthHead 8 | 9 | 10 | class TestLSSFPN(unittest.TestCase): 11 | def setUp(self) -> None: 12 | bev_backbone = dict( 13 | type='ResNet', 14 | in_channels=10, 15 | depth=18, 16 | num_stages=3, 17 | strides=(1, 2, 2), 18 | dilations=(1, 1, 1), 19 | out_indices=[0, 1, 2], 20 | norm_eval=False, 21 | base_channels=20, 22 | ) 23 | 24 | bev_neck = dict(type='SECONDFPN', 25 | in_channels=[10, 20, 40, 80], 26 | upsample_strides=[1, 2, 4, 8], 27 | out_channels=[8, 8, 8, 8]) 28 | 29 | TASKS = [ 30 | dict(num_class=1, class_names=['car']), 31 | dict(num_class=2, class_names=['truck', 'construction_vehicle']), 32 | dict(num_class=2, class_names=['bus', 'trailer']), 33 | dict(num_class=1, class_names=['barrier']), 34 | dict(num_class=2, class_names=['motorcycle', 'bicycle']), 35 | dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), 36 | ] 37 | 38 | common_heads = dict(reg=(2, 2), 39 | height=(1, 2), 40 | dim=(3, 2), 41 | rot=(2, 2), 42 | vel=(2, 2)) 43 | 44 | bbox_coder = dict( 45 | type='CenterPointBBoxCoder', 46 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 47 | max_num=500, 48 | score_threshold=0.1, 49 | out_size_factor=32, 50 | voxel_size=[0.2, 0.2, 8], 51 | pc_range=[-51.2, -51.2, -5, 51.2, 51.2, 3], 52 | code_size=9, 53 | ) 54 | 55 | train_cfg = dict( 56 | point_cloud_range=[-51.2, -51.2, -5, 51.2, 51.2, 3], 57 | grid_size=[512, 512, 1], 58 | voxel_size=[0.2, 0.2, 8], 59 | out_size_factor=32, 60 | dense_reg=1, 61 | gaussian_overlap=0.1, 62 | max_objs=500, 63 | min_radius=2, 64 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5], 65 | ) 66 | 67 | test_cfg = dict( 68 | post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 69 | max_per_img=500, 70 | max_pool_nms=False, 71 | min_radius=[4, 12, 10, 1, 0.85, 0.175], 72 | score_threshold=0.1, 73 | out_size_factor=4, 74 | voxel_size=[0.2, 0.2, 8], 75 | nms_type='circle', 76 | pre_max_size=1000, 77 | post_max_size=83, 78 | nms_thr=0.2, 79 | ) 80 | 81 | head_conf = { 82 | 'bev_backbone_conf': bev_backbone, 83 
| 'bev_neck_conf': bev_neck, 84 | 'tasks': TASKS, 85 | 'common_heads': common_heads, 86 | 'bbox_coder': bbox_coder, 87 | 'train_cfg': train_cfg, 88 | 'test_cfg': test_cfg, 89 | 'in_channels': 32, # Equal to bev_neck output_channels. 90 | 'loss_cls': dict(type='GaussianFocalLoss', reduction='mean'), 91 | 'loss_bbox': dict(type='L1Loss', 92 | reduction='mean', 93 | loss_weight=0.25), 94 | 'gaussian_overlap': 0.1, 95 | 'min_radius': 2, 96 | } 97 | self.bevdet_head = BEVDepthHead(**head_conf).cuda() 98 | 99 | @pytest.mark.skipif(torch.cuda.is_available() is False, 100 | reason='No gpu available.') 101 | def test_forward(self): 102 | x = torch.rand(2, 10, 32, 32).cuda() 103 | ret_results = self.bevdet_head.forward(x) 104 | assert len(ret_results) == 6 105 | assert ret_results[0][0]['reg'].shape == torch.Size([2, 2, 32, 32]) 106 | assert ret_results[0][0]['height'].shape == torch.Size([2, 1, 32, 32]) 107 | assert ret_results[0][0]['dim'].shape == torch.Size([2, 3, 32, 32]) 108 | assert ret_results[0][0]['rot'].shape == torch.Size([2, 2, 32, 32]) 109 | assert ret_results[0][0]['vel'].shape == torch.Size([2, 2, 32, 32]) 110 | assert ret_results[0][0]['heatmap'].shape == torch.Size([2, 1, 32, 32]) 111 | 112 | @pytest.mark.skipif(torch.cuda.is_available() is False, 113 | reason='No gpu available.') 114 | def test_get_targets(self): 115 | gt_boxes_3d_0 = torch.rand(10, 9).cuda() 116 | gt_boxes_3d_1 = torch.rand(15, 9).cuda() 117 | gt_boxes_3d_0[:, :2] *= 10 118 | gt_boxes_3d_1[:, :2] *= 10 119 | gt_labels_3d_0 = torch.randint(0, 10, (10, )).cuda() 120 | gt_labels_3d_1 = torch.randint(0, 10, (15, )).cuda() 121 | gt_boxes_3d = [gt_boxes_3d_0, gt_boxes_3d_1] 122 | gt_labels_3d = [gt_labels_3d_0, gt_labels_3d_1] 123 | heatmaps, anno_boxes, inds, masks = self.bevdet_head.get_targets( 124 | gt_boxes_3d, gt_labels_3d) 125 | assert len(heatmaps) == 6 126 | assert len(anno_boxes) == 6 127 | assert len(inds) == 6 128 | assert len(masks) == 6 129 | assert heatmaps[0].shape == torch.Size([2, 1, 16, 16]) 130 | assert anno_boxes[0].shape == torch.Size([2, 500, 10]) 131 | assert inds[0].shape == torch.Size([2, 500]) 132 | assert masks[0].shape == torch.Size([2, 500]) 133 | 134 | @pytest.mark.skipif(torch.cuda.is_available() is False, 135 | reason='No gpu available.') 136 | def test_get_bboxes(self): 137 | x = torch.rand(2, 10, 32, 32).cuda() 138 | ret_results = self.bevdet_head.forward(x) 139 | img_metas = [ 140 | dict(box_type_3d=LiDARInstance3DBoxes), 141 | dict(box_type_3d=LiDARInstance3DBoxes) 142 | ] 143 | pred_bboxes = self.bevdet_head.get_bboxes(ret_results, 144 | img_metas=img_metas) 145 | assert len(pred_bboxes) == 2 146 | assert len(pred_bboxes[0]) == 3 147 | assert pred_bboxes[0][1].shape == torch.Size([498]) 148 | assert pred_bboxes[0][2].shape == torch.Size([498]) 149 | -------------------------------------------------------------------------------- /models/uda_depth.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | # from layers.backbones.lss_fpn import LSSFPN 4 | from layers.backbones.uda_fpn import LSSFPN 5 | from layers.heads.bev_depth_head import BEVDepthHead 6 | 7 | __all__ = ['BEVDepth'] 8 | 9 | 10 | class BEVDepth(nn.Module): 11 | """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`. 12 | 13 | Args: 14 | backbone_conf (dict): Config of backbone. 15 | head_conf (dict): Config of head. 16 | is_train_depth (bool): Whether to return depth. 17 | Default: False. 
18 | """ 19 | 20 | # TODO: Reduce grid_conf and data_aug_conf 21 | def __init__(self, backbone_conf, head_conf, is_train_depth=False): 22 | super(BEVDepth, self).__init__() 23 | self.backbone = LSSFPN(**backbone_conf) 24 | self.head = BEVDepthHead(**head_conf) 25 | self.is_train_depth = is_train_depth 26 | 27 | def forward( 28 | self, 29 | x, 30 | mats_dict, 31 | depth_label=None, 32 | timestamps=None, 33 | ): 34 | """Forward function for BEVDepth 35 | 36 | Args: 37 | x (Tensor): Input ferature map. 38 | mats_dict(dict): 39 | sensor2ego_mats(Tensor): Transformation matrix from 40 | camera to ego with shape of (B, num_sweeps, 41 | num_cameras, 4, 4). 42 | intrin_mats(Tensor): Intrinsic matrix with shape 43 | of (B, num_sweeps, num_cameras, 4, 4). 44 | ida_mats(Tensor): Transformation matrix for ida with 45 | shape of (B, num_sweeps, num_cameras, 4, 4). 46 | sensor2sensor_mats(Tensor): Transformation matrix 47 | from key frame camera to sweep frame camera with 48 | shape of (B, num_sweeps, num_cameras, 4, 4). 49 | bda_mat(Tensor): Rotation matrix for bda with shape 50 | of (B, 4, 4). 51 | timestamps (long): Timestamp. 52 | Default: None. 53 | 54 | Returns: 55 | tuple(list[dict]): Output results for tasks. 56 | """ 57 | if self.is_train_depth and self.training: 58 | if depth_label is None: 59 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 60 | mats_dict, 61 | depth_label=None, 62 | timestamps=timestamps, 63 | is_return_depth=True) 64 | else: 65 | # print("Your are forwarding with out depthNet") 66 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 67 | mats_dict, 68 | depth_label, 69 | timestamps, 70 | is_return_depth=True) 71 | preds = self.head(x) 72 | return preds, depth_pred, img_feats, x, voxel_feats, camera_feats 73 | else: 74 | if depth_label is None: 75 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 76 | mats_dict, 77 | depth_label=None, 78 | timestamps=timestamps, 79 | is_return_depth=True) 80 | else: 81 | # print("Your are forwarding with out depthNet") 82 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 83 | mats_dict, 84 | depth_label, 85 | timestamps, 86 | is_return_depth=True) 87 | # origin eval code here 88 | # if depth_label is None: 89 | # x = self.backbone(x, mats_dict, timestamps) 90 | # else: 91 | # x = self.backbone(x, 92 | # mats_dict, 93 | # depth_label=depth_label, 94 | # timestamps=timestamps) 95 | preds = self.head(x) 96 | return preds, depth_pred, img_feats, x, voxel_feats, camera_feats 97 | 98 | def get_targets(self, gt_boxes, gt_labels): 99 | """Generate training targets for a single sample. 100 | 101 | Args: 102 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 103 | gt_labels_3d (torch.Tensor): Labels of boxes. 104 | 105 | Returns: 106 | tuple[list[torch.Tensor]]: Tuple of target including \ 107 | the following results in order. 108 | 109 | - list[torch.Tensor]: Heatmap scores. 110 | - list[torch.Tensor]: Ground truth boxes. 111 | - list[torch.Tensor]: Indexes indicating the position \ 112 | of the valid boxes. 113 | - list[torch.Tensor]: Masks indicating which boxes \ 114 | are valid. 115 | """ 116 | return self.head.get_targets(gt_boxes, gt_labels) 117 | 118 | def loss(self, targets, preds_dicts): 119 | """Loss function for BEVDepth. 120 | 121 | Args: 122 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 123 | truth gt boxes. 124 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 125 | preds_dicts (dict): Output of forward function. 
126 | 127 | Returns: 128 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 129 | """ 130 | return self.head.loss(targets, preds_dicts) 131 | 132 | def get_bboxes(self, preds_dicts, img_metas=None, img=None, rescale=False): 133 | """Generate bboxes from bbox head predictions. 134 | 135 | Args: 136 | preds_dicts (tuple[list[dict]]): Prediction results. 137 | img_metas (list[dict]): Point cloud and image's meta info. 138 | 139 | Returns: 140 | list[dict]: Decoded bbox, scores and labels after nms. 141 | """ 142 | return self.head.get_bboxes(preds_dicts, img_metas, img, rescale) 143 | -------------------------------------------------------------------------------- /exps/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 2 | """ 3 | mAP: 0.3484 4 | mATE: 0.6159 5 | mASE: 0.2716 6 | mAOE: 0.4144 7 | mAVE: 0.4402 8 | mAAE: 0.1954 9 | NDS: 0.4805 10 | Eval time: 110.7s 11 | Per-class results: 12 | Object Class AP ATE ASE AOE AVE AAE 13 | car 0.553 0.480 0.157 0.117 0.386 0.205 14 | truck 0.252 0.645 0.202 0.097 0.381 0.185 15 | bus 0.378 0.674 0.197 0.090 0.871 0.298 16 | trailer 0.163 0.932 0.230 0.409 0.543 0.098 17 | construction_vehicle 0.076 0.878 0.495 1.015 0.103 0.344 18 | pedestrian 0.361 0.694 0.300 0.816 0.491 0.247 19 | motorcycle 0.319 0.569 0.252 0.431 0.552 0.181 20 | bicycle 0.286 0.457 0.255 0.630 0.194 0.006 21 | traffic_cone 0.536 0.438 0.339 nan nan nan 22 | barrier 0.559 0.392 0.289 0.124 nan nan 23 | """ 24 | from argparse import ArgumentParser, Namespace 25 | 26 | import pytorch_lightning as pl 27 | import torch 28 | import torch.nn as nn 29 | from torch.cuda.amp.autocast_mode import autocast 30 | from torch.optim.lr_scheduler import MultiStepLR 31 | 32 | from callbacks.ema import EMACallback 33 | from exps.bev_depth_lss_r50_256x704_128x128_24e_2key import \ 34 | BEVDepthLightningModel as BaseBEVDepthLightningModel 35 | # from layers.backbones.lss_fpn import LSSFPN as BaseLSSFPN 36 | # from layers.heads.bev_depth_head import BEVDepthHead 37 | # from models.bev_depth import BEVDepth as BaseBEVDepth 38 | from models.mh_depth import BEVDepth as BaseBEVDepth 39 | from layers.heads.mh_depth_head import BEVDepthHead 40 | from layers.backbones.multi_head_fpn import LSSFPN as BaseLSSFPN 41 | 42 | class DepthAggregation(nn.Module): 43 | """ 44 | pixel cloud feature extraction 45 | """ 46 | def __init__(self, in_channels, mid_channels, out_channels): 47 | super(DepthAggregation, self).__init__() 48 | 49 | self.reduce_conv = nn.Sequential( 50 | nn.Conv2d(in_channels, 51 | mid_channels, 52 | kernel_size=3, 53 | stride=1, 54 | padding=1, 55 | bias=False), 56 | nn.BatchNorm2d(mid_channels), 57 | nn.ReLU(inplace=True), 58 | ) 59 | 60 | self.conv = nn.Sequential( 61 | nn.Conv2d(mid_channels, 62 | mid_channels, 63 | kernel_size=3, 64 | stride=1, 65 | padding=1, 66 | bias=False), 67 | nn.BatchNorm2d(mid_channels), 68 | nn.ReLU(inplace=True), 69 | nn.Conv2d(mid_channels, 70 | mid_channels, 71 | kernel_size=3, 72 | stride=1, 73 | padding=1, 74 | bias=False), 75 | nn.BatchNorm2d(mid_channels), 76 | nn.ReLU(inplace=True), 77 | ) 78 | 79 | self.out_conv = nn.Sequential( 80 | nn.Conv2d(mid_channels, 81 | out_channels, 82 | kernel_size=3, 83 | stride=1, 84 | padding=1, 85 | bias=True), 86 | # nn.BatchNorm3d(out_channels), 87 | # nn.ReLU(inplace=True), 88 | ) 89 | 90 | @autocast(False) 91 | def forward(self, x): 92 | x = self.reduce_conv(x) 93 | x = 
self.conv(x) + x 94 | x = self.out_conv(x) 95 | return x 96 | 97 | 98 | class LSSFPN(BaseLSSFPN): 99 | def __init__(self, **kwargs): 100 | super().__init__(**kwargs) 101 | self.depth_aggregation_net = self._configure_depth_aggregation_net() 102 | 103 | def _configure_depth_aggregation_net(self): 104 | """build pixel cloud feature extractor""" 105 | return DepthAggregation(self.output_channels, self.output_channels, 106 | self.output_channels) 107 | 108 | def _forward_voxel_net(self, img_feat_with_depth): 109 | # BEVConv2D [n, c, d, h, w] -> [n, h, c, w, d] 110 | img_feat_with_depth = img_feat_with_depth.permute( 111 | 0, 3, 1, 4, 2).contiguous() # [n, c, d, h, w] -> [n, h, c, w, d] 112 | n, h, c, w, d = img_feat_with_depth.shape 113 | img_feat_with_depth = img_feat_with_depth.view(-1, c, w, d) 114 | img_feat_with_depth = ( 115 | self.depth_aggregation_net(img_feat_with_depth).view( 116 | n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float()) 117 | return img_feat_with_depth 118 | 119 | 120 | class BEVDepth(BaseBEVDepth): 121 | def __init__(self, backbone_conf, head_conf, is_train_depth=True): 122 | super(BaseBEVDepth, self).__init__() 123 | self.backbone = LSSFPN(**backbone_conf) 124 | self.head = BEVDepthHead(**head_conf) 125 | self.is_train_depth = is_train_depth 126 | 127 | 128 | class BEVDepthLightningModel(BaseBEVDepthLightningModel): 129 | def __init__(self, **kwargs): 130 | super().__init__(**kwargs) 131 | self.model = BEVDepth(self.backbone_conf, 132 | self.head_conf, 133 | is_train_depth=True) 134 | self.data_use_cbgs = True 135 | 136 | def configure_optimizers(self): 137 | lr = self.basic_lr_per_img * \ 138 | self.batch_size_per_device * self.gpus 139 | optimizer = torch.optim.AdamW(self.model.parameters(), 140 | lr=lr, 141 | weight_decay=1e-7) 142 | scheduler = MultiStepLR(optimizer, [19, 22]) 143 | return [[optimizer], [scheduler]] 144 | 145 | 146 | def main(args: Namespace) -> None: 147 | if args.seed is not None: 148 | pl.seed_everything(args.seed) 149 | 150 | model = BEVDepthLightningModel(**vars(args)) 151 | train_dataloader = model.train_dataloader() 152 | ema_callback = EMACallback(len(train_dataloader.dataset) * args.max_epochs) 153 | trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback]) 154 | if args.evaluate: 155 | trainer.test(model, ckpt_path=args.ckpt_path) 156 | else: 157 | #.load_from_checkpoint(args.ckpt_path, strict=False) 158 | trainer.fit(model) 159 | 160 | 161 | def run_cli(): 162 | parent_parser = ArgumentParser(add_help=False) 163 | parent_parser = pl.Trainer.add_argparse_args(parent_parser) 164 | parent_parser.add_argument('-e', 165 | '--evaluate', 166 | dest='evaluate', 167 | action='store_true', 168 | help='evaluate model on validation set') 169 | parent_parser.add_argument('-b', '--batch_size_per_device', type=int) 170 | parent_parser.add_argument('--seed', 171 | type=int, 172 | default=0, 173 | help='seed for initializing training.') 174 | parent_parser.add_argument('--ckpt_path', type=str) 175 | parser = BEVDepthLightningModel.add_model_specific_args(parent_parser) 176 | parser.set_defaults(profiler='simple', 177 | deterministic=False, 178 | max_epochs=25, 179 | accelerator='ddp', 180 | num_sanity_val_steps=0, 181 | gradient_clip_val=5, 182 | limit_val_batches=0, 183 | enable_checkpointing=True, 184 | precision=16, 185 | default_root_dir='./outputs/SAN-CBGS50-3SE') 186 | args = parser.parse_args() 187 | main(args) 188 | 189 | 190 | if __name__ == '__main__': 191 | run_cli() 192 | 
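
A minimal sketch of the reshape trick used by the overridden _forward_voxel_net above, assuming nothing beyond plain PyTorch: a stock nn.Conv2d stands in for DepthAggregation, and the free function name forward_voxel_net is only for illustration. Folding the (batch, image-row) pair into the conv batch lets a cheap 2D convolution aggregate evidence over the (width, depth-bin) plane of each row, and the inverse permute restores the original [n, c, d, h, w] frustum layout.

import torch
import torch.nn as nn

# Stand-in for DepthAggregation: any channel-preserving 2D conv is enough
# to exercise the permute/reshape round trip.
depth_aggregation_net = nn.Conv2d(8, 8, kernel_size=3, padding=1)

def forward_voxel_net(img_feat_with_depth):
    # [n, c, d, h, w] -> [n, h, c, w, d]: every image row h becomes its own
    # 2D slice, so the aggregation conv mixes information over (w, d).
    x = img_feat_with_depth.permute(0, 3, 1, 4, 2).contiguous()
    n, h, c, w, d = x.shape
    x = x.view(-1, c, w, d)            # fold (n, h) into the batch dimension
    x = depth_aggregation_net(x)       # per-row 2D convolution
    # Unfold the batch and permute back to the original frustum layout.
    return x.view(n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float()

feat = torch.rand(2, 8, 4, 16, 44)     # (n, c, d, h, w)
out = forward_voxel_net(feat)
assert out.shape == feat.shape         # layout is preserved end to end
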
-------------------------------------------------------------------------------- /layers/backbones/depth_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from mmcv.cnn import build_conv_layer 6 | from mmdet3d.models import build_neck 7 | from mmdet.models import build_backbone 8 | from mmdet.models.backbones.resnet import BasicBlock 9 | 10 | class _ASPPModule(nn.Module): 11 | def __init__(self, inplanes, planes, kernel_size, padding, dilation, 12 | BatchNorm): 13 | super(_ASPPModule, self).__init__() 14 | self.atrous_conv = nn.Conv2d(inplanes, 15 | planes, 16 | kernel_size=kernel_size, 17 | stride=1, 18 | padding=padding, 19 | dilation=dilation, 20 | bias=False) 21 | self.bn = BatchNorm(planes) 22 | self.relu = nn.ReLU() 23 | 24 | self._init_weight() 25 | 26 | def forward(self, x): 27 | x = self.atrous_conv(x) 28 | x = self.bn(x) 29 | 30 | return self.relu(x) 31 | 32 | def _init_weight(self): 33 | for m in self.modules(): 34 | if isinstance(m, nn.Conv2d): 35 | torch.nn.init.kaiming_normal_(m.weight) 36 | elif isinstance(m, nn.BatchNorm2d): 37 | m.weight.data.fill_(1) 38 | m.bias.data.zero_() 39 | 40 | 41 | class ASPP(nn.Module): 42 | def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): 43 | super(ASPP, self).__init__() 44 | 45 | dilations = [1, 6, 12, 18] 46 | 47 | self.aspp1 = _ASPPModule(inplanes, 48 | mid_channels, 49 | 1, 50 | padding=0, 51 | dilation=dilations[0], 52 | BatchNorm=BatchNorm) 53 | self.aspp2 = _ASPPModule(inplanes, 54 | mid_channels, 55 | 3, 56 | padding=dilations[1], 57 | dilation=dilations[1], 58 | BatchNorm=BatchNorm) 59 | self.aspp3 = _ASPPModule(inplanes, 60 | mid_channels, 61 | 3, 62 | padding=dilations[2], 63 | dilation=dilations[2], 64 | BatchNorm=BatchNorm) 65 | self.aspp4 = _ASPPModule(inplanes, 66 | mid_channels, 67 | 3, 68 | padding=dilations[3], 69 | dilation=dilations[3], 70 | BatchNorm=BatchNorm) 71 | 72 | self.global_avg_pool = nn.Sequential( 73 | nn.AdaptiveAvgPool2d((1, 1)), 74 | nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 75 | BatchNorm(mid_channels), 76 | nn.ReLU(), 77 | ) 78 | self.conv1 = nn.Conv2d(int(mid_channels * 5), 79 | mid_channels, 80 | 1, 81 | bias=False) 82 | self.bn1 = BatchNorm(mid_channels) 83 | self.relu = nn.ReLU() 84 | self.dropout = nn.Dropout(0.5) 85 | self._init_weight() 86 | 87 | def forward(self, x): 88 | x1 = self.aspp1(x) 89 | x2 = self.aspp2(x) 90 | x3 = self.aspp3(x) 91 | x4 = self.aspp4(x) 92 | x5 = self.global_avg_pool(x) 93 | x5 = F.interpolate(x5, 94 | size=x4.size()[2:], 95 | mode='bilinear', 96 | align_corners=True) 97 | x = torch.cat((x1, x2, x3, x4, x5), dim=1) 98 | 99 | x = self.conv1(x) 100 | x = self.bn1(x) 101 | x = self.relu(x) 102 | 103 | return self.dropout(x) 104 | 105 | def _init_weight(self): 106 | for m in self.modules(): 107 | if isinstance(m, nn.Conv2d): 108 | torch.nn.init.kaiming_normal_(m.weight) 109 | elif isinstance(m, nn.BatchNorm2d): 110 | m.weight.data.fill_(1) 111 | m.bias.data.zero_() 112 | 113 | class Mlp(nn.Module): 114 | def __init__(self, 115 | in_features, 116 | hidden_features=None, 117 | out_features=None, 118 | act_layer=nn.ReLU, 119 | drop=0.0): 120 | super().__init__() 121 | out_features = out_features or in_features 122 | hidden_features = hidden_features or in_features 123 | self.fc1 = nn.Linear(in_features, hidden_features) 124 | self.act = act_layer() 125 | self.drop1 = nn.Dropout(drop) 126 | self.fc2 = 
nn.Linear(hidden_features, out_features) 127 | self.drop2 = nn.Dropout(drop) 128 | 129 | def forward(self, x): 130 | x = self.fc1(x) 131 | x = self.act(x) 132 | x = self.drop1(x) 133 | x = self.fc2(x) 134 | x = self.drop2(x) 135 | return x 136 | 137 | class SELayer(nn.Module): 138 | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): 139 | super().__init__() 140 | self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) 141 | self.act1 = act_layer() 142 | self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) 143 | self.gate = gate_layer() 144 | 145 | def forward(self, x, x_se): 146 | x_se = self.conv_reduce(x_se) 147 | x_se = self.act1(x_se) 148 | x_se = self.conv_expand(x_se) 149 | return x * self.gate(x_se) 150 | 151 | class DepthNet(nn.Module): 152 | def __init__(self, in_channels, mid_channels, context_channels, 153 | depth_channels): 154 | super(DepthNet, self).__init__() 155 | self.reduce_conv = nn.Sequential( 156 | nn.Conv2d(in_channels, 157 | mid_channels, 158 | kernel_size=3, 159 | stride=1, 160 | padding=1), 161 | nn.BatchNorm2d(mid_channels), 162 | nn.ReLU(inplace=True), 163 | ) 164 | self.context_conv = nn.Conv2d(mid_channels, 165 | context_channels, 166 | kernel_size=1, 167 | stride=1, 168 | padding=0) 169 | self.bn = nn.BatchNorm1d(27) 170 | self.depth_mlp = Mlp(27, mid_channels, mid_channels) 171 | self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware 172 | self.context_mlp = Mlp(27, mid_channels, mid_channels) 173 | self.context_se = SELayer(mid_channels) # NOTE: add camera-aware 174 | self.depth_conv = nn.Sequential( 175 | BasicBlock(mid_channels, mid_channels), 176 | BasicBlock(mid_channels, mid_channels), 177 | BasicBlock(mid_channels, mid_channels), 178 | ASPP(mid_channels, mid_channels), 179 | build_conv_layer(cfg=dict( 180 | type='DCN', 181 | in_channels=mid_channels, 182 | out_channels=mid_channels, 183 | kernel_size=3, 184 | padding=1, 185 | groups=4, 186 | im2col_step=128, 187 | )), 188 | nn.Conv2d(mid_channels, 189 | depth_channels, 190 | kernel_size=1, 191 | stride=1, 192 | padding=0), 193 | ) 194 | 195 | def forward(self, x, mats_dict): 196 | intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3] 197 | batch_size = intrins.shape[0] 198 | num_cams = intrins.shape[2] 199 | ida = mats_dict['ida_mats'][:, 0:1, ...] 
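        # The camera-aware MLP input built below concatenates 15 stacked
        # scalars (4 intrinsics: fx, fy, cx, cy; 6 image-augmentation terms
        # from ida; 5 BEV-augmentation terms from bda) with the flattened
        # 3x4 sensor2ego matrix (12 values), giving the 27 features expected
        # by nn.BatchNorm1d(27) and Mlp(27, ...).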
200 | sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :] 201 | bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4, 202 | 4).repeat(1, 1, num_cams, 1, 1) 203 | mlp_input = torch.cat( 204 | [ 205 | torch.stack( 206 | [ 207 | intrins[:, 0:1, ..., 0, 0], 208 | intrins[:, 0:1, ..., 1, 1], 209 | intrins[:, 0:1, ..., 0, 2], 210 | intrins[:, 0:1, ..., 1, 2], 211 | ida[:, 0:1, ..., 0, 0], 212 | ida[:, 0:1, ..., 0, 1], 213 | ida[:, 0:1, ..., 0, 3], 214 | ida[:, 0:1, ..., 1, 0], 215 | ida[:, 0:1, ..., 1, 1], 216 | ida[:, 0:1, ..., 1, 3], 217 | bda[:, 0:1, ..., 0, 0], 218 | bda[:, 0:1, ..., 0, 1], 219 | bda[:, 0:1, ..., 1, 0], 220 | bda[:, 0:1, ..., 1, 1], 221 | bda[:, 0:1, ..., 2, 2], 222 | ], 223 | dim=-1, 224 | ), 225 | sensor2ego.view(batch_size, 1, num_cams, -1), 226 | ], 227 | -1, 228 | ) 229 | mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) 230 | x = self.reduce_conv(x) 231 | context_se = self.context_mlp(mlp_input)[..., None, None] 232 | context = self.context_se(x, context_se) 233 | context = self.context_conv(context) 234 | depth_se = self.depth_mlp(mlp_input)[..., None, None] 235 | depth = self.depth_se(x, depth_se) 236 | depth_mid = depth 237 | depth = self.depth_conv(depth) 238 | return torch.cat([depth, context], dim=1), depth_mid 239 | -------------------------------------------------------------------------------- /evaluators/det_mv_evaluators.py: -------------------------------------------------------------------------------- 1 | '''Modified from # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa 2 | ''' 3 | import os.path as osp 4 | import tempfile 5 | 6 | import mmcv 7 | import numpy as np 8 | import pyquaternion 9 | from nuscenes.utils.data_classes import Box 10 | from pyquaternion import Quaternion 11 | 12 | __all__ = ['DetMVNuscEvaluator'] 13 | 14 | 15 | class DetMVNuscEvaluator(): 16 | ErrNameMapping = { 17 | 'trans_err': 'mATE', 18 | 'scale_err': 'mASE', 19 | 'orient_err': 'mAOE', 20 | 'vel_err': 'mAVE', 21 | 'attr_err': 'mAAE', 22 | } 23 | 24 | DefaultAttribute = { 25 | 'car': 'vehicle.parked', 26 | 'pedestrian': 'pedestrian.moving', 27 | 'trailer': 'vehicle.parked', 28 | 'truck': 'vehicle.parked', 29 | 'bus': 'vehicle.moving', 30 | 'motorcycle': 'cycle.without_rider', 31 | 'construction_vehicle': 'vehicle.parked', 32 | 'bicycle': 'cycle.without_rider', 33 | 'barrier': '', 34 | 'traffic_cone': '', 35 | } 36 | 37 | def __init__( 38 | self, 39 | class_names, 40 | eval_version='detection_cvpr_2019', 41 | data_root='./data/nuScenes', 42 | version='v1.0-trainval', 43 | modality=dict(use_lidar=False, 44 | use_camera=True, 45 | use_radar=False, 46 | use_map=False, 47 | use_external=False), 48 | output_dir=None, 49 | ) -> None: 50 | self.eval_version = eval_version 51 | self.data_root = data_root 52 | if self.eval_version is not None: 53 | from nuscenes.eval.detection.config import config_factory 54 | 55 | self.eval_detection_configs = config_factory(self.eval_version) 56 | self.version = version 57 | self.class_names = class_names 58 | self.modality = modality 59 | self.output_dir = output_dir 60 | 61 | def _evaluate_single(self, 62 | result_path, 63 | logger=None, 64 | metric='bbox', 65 | result_name='pts_bbox'): 66 | """Evaluation for a single model in nuScenes protocol. 67 | 68 | Args: 69 | result_path (str): Path of the result file. 70 | logger (logging.Logger | str | None): Logger used for printing 71 | related information during evaluation. 
Default: None. 72 | metric (str): Metric name used for evaluation. Default: 'bbox'. 73 | result_name (str): Result name in the metric prefix. 74 | Default: 'pts_bbox'. 75 | 76 | Returns: 77 | dict: Dictionary of evaluation details. 78 | """ 79 | from nuscenes import NuScenes 80 | from nuscenes.eval.detection.evaluate import NuScenesEval 81 | 82 | output_dir = osp.join(*osp.split(result_path)[:-1]) 83 | nusc = NuScenes(version=self.version, 84 | dataroot=self.data_root, 85 | verbose=False) 86 | eval_set_map = { 87 | 'v1.0-mini': 'mini_val', 88 | 'v1.0-trainval': 'val', 89 | } 90 | nusc_eval = NuScenesEval(nusc, 91 | config=self.eval_detection_configs, 92 | result_path=result_path, 93 | eval_set=eval_set_map[self.version], 94 | output_dir=output_dir, 95 | verbose=False) 96 | nusc_eval.main(render_curves=False) 97 | 98 | # record metrics 99 | metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) 100 | detail = dict() 101 | metric_prefix = f'{result_name}_NuScenes' 102 | for class_name in self.class_names: 103 | for k, v in metrics['label_aps'][class_name].items(): 104 | val = float('{:.4f}'.format(v)) 105 | detail['{}/{}_AP_dist_{}'.format(metric_prefix, class_name, 106 | k)] = val 107 | for k, v in metrics['label_tp_errors'][class_name].items(): 108 | val = float('{:.4f}'.format(v)) 109 | detail['{}/{}_{}'.format(metric_prefix, class_name, k)] = val 110 | for k, v in metrics['tp_errors'].items(): 111 | val = float('{:.4f}'.format(v)) 112 | detail['{}/{}'.format(metric_prefix, 113 | self.ErrNameMapping[k])] = val 114 | 115 | detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] 116 | detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] 117 | return detail 118 | 119 | def format_results(self, 120 | results, 121 | img_metas, 122 | result_names=['img_bbox'], 123 | jsonfile_prefix=None, 124 | **kwargs): 125 | """Format the results to json (standard format for COCO evaluation). 126 | 127 | Args: 128 | results (list[tuple | numpy.ndarray]): Testing results of the 129 | dataset. 130 | jsonfile_prefix (str | None): The prefix of json files. It includes 131 | the file path and the prefix of filename, e.g., "a/b/prefix". 132 | If not specified, a temp file will be created. Default: None. 133 | 134 | Returns: 135 | tuple: (result_files, tmp_dir), result_files is a dict containing \ 136 | the json filepaths, tmp_dir is the temporal directory created \ 137 | for saving json files when jsonfile_prefix is not specified. 138 | """ 139 | assert isinstance(results, list), 'results must be a list' 140 | 141 | if jsonfile_prefix is None: 142 | tmp_dir = tempfile.TemporaryDirectory() 143 | jsonfile_prefix = osp.join(tmp_dir.name, 'results') 144 | else: 145 | tmp_dir = None 146 | 147 | # currently the output prediction results could be in two formats 148 | # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) 149 | # 2. list of dict('pts_bbox' or 'img_bbox': 150 | # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) 151 | # this is a workaround to enable evaluation of both formats on nuScenes 152 | # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 153 | # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict 154 | result_files = dict() 155 | # refactor this. 
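        # For every requested result name (2D entries are skipped), the
        # detections are serialized via _format_bbox into results_nusc.json
        # under self.output_dir when it is set, otherwise under the
        # temporary jsonfile_prefix created above.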
156 | for rasult_name in result_names: 157 | # not evaluate 2D predictions on nuScenes 158 | if '2d' in rasult_name: 159 | continue 160 | print(f'\nFormating bboxes of {rasult_name}') 161 | tmp_file_ = osp.join(jsonfile_prefix, rasult_name) 162 | if self.output_dir: 163 | result_files.update({ 164 | rasult_name: 165 | self._format_bbox(results, img_metas, self.output_dir) 166 | }) 167 | else: 168 | result_files.update({ 169 | rasult_name: 170 | self._format_bbox(results, img_metas, tmp_file_) 171 | }) 172 | return result_files, tmp_dir 173 | 174 | def evaluate( 175 | self, 176 | results, 177 | img_metas, 178 | metric='bbox', 179 | logger=None, 180 | jsonfile_prefix=None, 181 | result_names=['img_bbox'], 182 | show=False, 183 | out_dir=None, 184 | pipeline=None, 185 | ): 186 | """Evaluation in nuScenes protocol. 187 | 188 | Args: 189 | results (list[dict]): Testing results of the dataset. 190 | metric (str | list[str]): Metrics to be evaluated. 191 | logger (logging.Logger | str | None): Logger used for printing 192 | related information during evaluation. Default: None. 193 | jsonfile_prefix (str | None): The prefix of json files. It includes 194 | the file path and the prefix of filename, e.g., "a/b/prefix". 195 | If not specified, a temp file will be created. Default: None. 196 | show (bool): Whether to visualize. 197 | Default: False. 198 | out_dir (str): Path to save the visualization results. 199 | Default: None. 200 | pipeline (list[dict], optional): raw data loading for showing. 201 | Default: None. 202 | 203 | Returns: 204 | dict[str, float]: Results of each evaluation metric. 205 | """ 206 | result_files, tmp_dir = self.format_results(results, img_metas, 207 | result_names, 208 | jsonfile_prefix) 209 | if isinstance(result_files, dict): 210 | for name in result_names: 211 | print('Evaluating bboxes of {}'.format(name)) 212 | self._evaluate_single(result_files[name]) 213 | elif isinstance(result_files, str): 214 | self._evaluate_single(result_files) 215 | 216 | if tmp_dir is not None: 217 | tmp_dir.cleanup() 218 | 219 | def _format_bbox(self, results, img_metas, jsonfile_prefix=None): 220 | """Convert the results to the standard format. 221 | 222 | Args: 223 | results (list[dict]): Testing results of the dataset. 224 | jsonfile_prefix (str): The prefix of the output jsonfile. 225 | You can specify the output directory/filename by 226 | modifying the jsonfile_prefix. Default: None. 227 | 228 | Returns: 229 | str: Path of the output json file. 
230 | """ 231 | nusc_annos = {} 232 | mapped_class_names = self.class_names 233 | 234 | print('Start to convert detection format...') 235 | 236 | for sample_id, det in enumerate(mmcv.track_iter_progress(results)): 237 | boxes, scores, labels = det 238 | boxes = boxes 239 | sample_token = img_metas[sample_id]['token'] 240 | trans = np.array(img_metas[sample_id]['ego2global_translation']) 241 | rot = Quaternion(img_metas[sample_id]['ego2global_rotation']) 242 | annos = list() 243 | for i, box in enumerate(boxes): 244 | name = mapped_class_names[labels[i]] 245 | center = box[:3] 246 | wlh = box[[4, 3, 5]] 247 | box_yaw = box[6] 248 | box_vel = box[7:].tolist() 249 | box_vel.append(0) 250 | quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw) 251 | nusc_box = Box(center, wlh, quat, velocity=box_vel) 252 | nusc_box.rotate(rot) 253 | nusc_box.translate(trans) 254 | if np.sqrt(nusc_box.velocity[0]**2 + 255 | nusc_box.velocity[1]**2) > 0.2: 256 | if name in [ 257 | 'car', 258 | 'construction_vehicle', 259 | 'bus', 260 | 'truck', 261 | 'trailer', 262 | ]: 263 | attr = 'vehicle.moving' 264 | elif name in ['bicycle', 'motorcycle']: 265 | attr = 'cycle.with_rider' 266 | else: 267 | attr = self.DefaultAttribute[name] 268 | else: 269 | if name in ['pedestrian']: 270 | attr = 'pedestrian.standing' 271 | elif name in ['bus']: 272 | attr = 'vehicle.stopped' 273 | else: 274 | attr = self.DefaultAttribute[name] 275 | nusc_anno = dict( 276 | sample_token=sample_token, 277 | translation=nusc_box.center.tolist(), 278 | size=nusc_box.wlh.tolist(), 279 | rotation=nusc_box.orientation.elements.tolist(), 280 | velocity=nusc_box.velocity[:2], 281 | detection_name=name, 282 | detection_score=float(scores[i]), 283 | attribute_name=attr, 284 | ) 285 | annos.append(nusc_anno) 286 | # other views results of the same frame should be concatenated 287 | if sample_token in nusc_annos: 288 | nusc_annos[sample_token].extend(annos) 289 | else: 290 | nusc_annos[sample_token] = annos 291 | nusc_submissions = { 292 | 'meta': self.modality, 293 | 'results': nusc_annos, 294 | } 295 | mmcv.mkdir_or_exist(jsonfile_prefix) 296 | res_path = osp.join(jsonfile_prefix, 'results_nusc.json') 297 | print('Results writes to', res_path) 298 | mmcv.dump(nusc_submissions, res_path) 299 | return res_path 300 | -------------------------------------------------------------------------------- /layers/heads/mh_depth_head.py: -------------------------------------------------------------------------------- 1 | """Inherited from `https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/centerpoint_head.py`""" # noqa 2 | import torch 3 | from mmdet3d.core import draw_heatmap_gaussian, gaussian_radius 4 | from mmdet3d.models.dense_heads.centerpoint_head import CenterHead 5 | from mmdet3d.models.utils import clip_sigmoid 6 | from mmdet.core import reduce_mean 7 | from mmdet.models import build_backbone 8 | from mmdet3d.models import build_neck 9 | from torch.cuda.amp import autocast 10 | 11 | __all__ = ['BEVDepthHead'] 12 | 13 | bev_backbone_conf = dict( 14 | type='ResNet', 15 | in_channels=80, 16 | depth=18, 17 | num_stages=3, 18 | strides=(1, 2, 2), 19 | dilations=(1, 1, 1), 20 | out_indices=[0, 1, 2], 21 | norm_eval=False, 22 | base_channels=160, 23 | ) 24 | 25 | bev_neck_conf = dict(type='SECONDFPN', 26 | in_channels=[160, 320, 640], 27 | upsample_strides=[2, 4, 8], 28 | out_channels=[64, 64, 128]) 29 | 30 | 31 | class BEVDepthHead(CenterHead): 32 | """Head for BevDepth. 
33 | 34 | Args: 35 | in_channels(int): Number of channels after bev_neck. 36 | tasks(dict): Tasks for head. 37 | bbox_coder(dict): Config of bbox coder. 38 | common_heads(dict): Config of head for each task. 39 | loss_cls(dict): Config of classification loss. 40 | loss_bbox(dict): Config of regression loss. 41 | gaussian_overlap(float): Gaussian overlap used for `get_targets`. 42 | min_radius(int): Min radius used for `get_targets`. 43 | train_cfg(dict): Config used in the training process. 44 | test_cfg(dict): Config used in the test process. 45 | bev_backbone_conf(dict): Cnfig of bev_backbone. 46 | bev_neck_conf(dict): Cnfig of bev_neck. 47 | """ 48 | def __init__( 49 | self, 50 | in_channels=256, 51 | tasks=None, 52 | bbox_coder=None, 53 | common_heads=dict(), 54 | loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), 55 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 56 | gaussian_overlap=0.1, 57 | min_radius=2, 58 | train_cfg=None, 59 | test_cfg=None, 60 | bev_backbone_conf=bev_backbone_conf, 61 | bev_neck_conf=bev_neck_conf, 62 | separate_head=dict(type='SeparateHead', 63 | init_bias=-2.19, 64 | final_kernel=3), 65 | ): 66 | super(BEVDepthHead, self).__init__( 67 | in_channels=in_channels, 68 | tasks=tasks, 69 | bbox_coder=bbox_coder, 70 | common_heads=common_heads, 71 | loss_cls=loss_cls, 72 | loss_bbox=loss_bbox, 73 | separate_head=separate_head, 74 | ) 75 | self.trunk = build_backbone(bev_backbone_conf) 76 | self.trunk.init_weights() 77 | self.neck = build_neck(bev_neck_conf) 78 | self.neck.init_weights() 79 | del self.trunk.maxpool 80 | self.gaussian_overlap = gaussian_overlap 81 | self.min_radius = min_radius 82 | self.train_cfg = train_cfg 83 | self.test_cfg = test_cfg 84 | 85 | @autocast(False) 86 | def forward(self, x): 87 | """Forward pass. 88 | 89 | Args: 90 | feats (list[torch.Tensor]): Multi-level features, e.g., 91 | features produced by FPN. 92 | 93 | Returns: 94 | tuple(list[dict]): Output results for tasks. 95 | """ 96 | # FPN 97 | trunk_outs = [x] 98 | if self.trunk.deep_stem: 99 | x = self.trunk.stem(x) 100 | else: 101 | x = self.trunk.conv1(x) 102 | x = self.trunk.norm1(x) 103 | x = self.trunk.relu(x) 104 | for i, layer_name in enumerate(self.trunk.res_layers): 105 | res_layer = getattr(self.trunk, layer_name) 106 | x = res_layer(x) 107 | if i in self.trunk.out_indices: 108 | trunk_outs.append(x) 109 | fpn_output = self.neck(trunk_outs) 110 | ret_values = super().forward(fpn_output) 111 | return ret_values 112 | 113 | def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): 114 | """Generate training targets for a single sample. 115 | 116 | Args: 117 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 118 | gt_labels_3d (torch.Tensor): Labels of boxes. 119 | 120 | Returns: 121 | tuple[list[torch.Tensor]]: Tuple of target including \ 122 | the following results in order. 123 | 124 | - list[torch.Tensor]: Heatmap scores. 125 | - list[torch.Tensor]: Ground truth boxes. 126 | - list[torch.Tensor]: Indexes indicating the position \ 127 | of the valid boxes. 128 | - list[torch.Tensor]: Masks indicating which boxes \ 129 | are valid. 
130 | """ 131 | max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] 132 | grid_size = torch.tensor(self.train_cfg['grid_size']) 133 | pc_range = torch.tensor(self.train_cfg['point_cloud_range']) 134 | voxel_size = torch.tensor(self.train_cfg['voxel_size']) 135 | 136 | feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] 137 | 138 | # reorganize the gt_dict by tasks 139 | task_masks = [] 140 | flag = 0 141 | for class_name in self.class_names: 142 | task_masks.append([ 143 | torch.where(gt_labels_3d == class_name.index(i) + flag) 144 | for i in class_name 145 | ]) 146 | flag += len(class_name) 147 | 148 | task_boxes = [] 149 | task_classes = [] 150 | flag2 = 0 151 | for idx, mask in enumerate(task_masks): 152 | task_box = [] 153 | task_class = [] 154 | for m in mask: 155 | task_box.append(gt_bboxes_3d[m]) 156 | # 0 is background for each task, so we need to add 1 here. 157 | task_class.append(gt_labels_3d[m] + 1 - flag2) 158 | task_boxes.append( 159 | torch.cat(task_box, axis=0).to(gt_bboxes_3d.device)) 160 | task_classes.append( 161 | torch.cat(task_class).long().to(gt_bboxes_3d.device)) 162 | flag2 += len(mask) 163 | draw_gaussian = draw_heatmap_gaussian 164 | heatmaps, anno_boxes, inds, masks = [], [], [], [] 165 | 166 | for idx, task_head in enumerate(self.task_heads): 167 | heatmap = gt_bboxes_3d.new_zeros( 168 | (len(self.class_names[idx]), feature_map_size[1], 169 | feature_map_size[0]), 170 | device='cuda') 171 | 172 | anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), 173 | dtype=torch.float32, 174 | device='cuda') 175 | 176 | ind = gt_labels_3d.new_zeros((max_objs), 177 | dtype=torch.int64, 178 | device='cuda') 179 | mask = gt_bboxes_3d.new_zeros((max_objs), 180 | dtype=torch.uint8, 181 | device='cuda') 182 | 183 | num_objs = min(task_boxes[idx].shape[0], max_objs) 184 | 185 | for k in range(num_objs): 186 | cls_id = task_classes[idx][k] - 1 187 | 188 | width = task_boxes[idx][k][3] 189 | length = task_boxes[idx][k][4] 190 | width = width / voxel_size[0] / self.train_cfg[ 191 | 'out_size_factor'] 192 | length = length / voxel_size[1] / self.train_cfg[ 193 | 'out_size_factor'] 194 | 195 | if width > 0 and length > 0: 196 | radius = gaussian_radius( 197 | (length, width), 198 | min_overlap=self.train_cfg['gaussian_overlap']) 199 | radius = max(self.train_cfg['min_radius'], int(radius)) 200 | 201 | # be really careful for the coordinate system of 202 | # your box annotation. 
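                    # The box center (x, y) is converted from metric
                    # coordinates to feature-map grid indices: subtract the
                    # range origin, then divide by voxel_size * out_size_factor.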
203 | x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ 204 | 1], task_boxes[idx][k][2] 205 | 206 | coor_x = ( 207 | x - pc_range[0] 208 | ) / voxel_size[0] / self.train_cfg['out_size_factor'] 209 | coor_y = ( 210 | y - pc_range[1] 211 | ) / voxel_size[1] / self.train_cfg['out_size_factor'] 212 | 213 | center = torch.tensor([coor_x, coor_y], 214 | dtype=torch.float32, 215 | device='cuda') 216 | center_int = center.to(torch.int32) 217 | 218 | # throw out not in range objects to avoid out of array 219 | # area when creating the heatmap 220 | if not (0 <= center_int[0] < feature_map_size[0] 221 | and 0 <= center_int[1] < feature_map_size[1]): 222 | continue 223 | 224 | draw_gaussian(heatmap[cls_id], center_int, radius) 225 | 226 | new_idx = k 227 | x, y = center_int[0], center_int[1] 228 | 229 | assert y * feature_map_size[0] + x < feature_map_size[ 230 | 0] * feature_map_size[1] 231 | 232 | ind[new_idx] = y * feature_map_size[0] + x 233 | mask[new_idx] = 1 234 | # TODO: support other outdoor dataset 235 | vx, vy = task_boxes[idx][k][7:] 236 | rot = task_boxes[idx][k][6] 237 | box_dim = task_boxes[idx][k][3:6] 238 | if self.norm_bbox: 239 | box_dim = box_dim.log() 240 | anno_box[new_idx] = torch.cat([ 241 | center - torch.tensor([x, y], device='cuda'), 242 | z.unsqueeze(0), 243 | box_dim, 244 | torch.sin(rot).unsqueeze(0), 245 | torch.cos(rot).unsqueeze(0), 246 | vx.unsqueeze(0), 247 | vy.unsqueeze(0), 248 | ]) 249 | 250 | heatmaps.append(heatmap) 251 | anno_boxes.append(anno_box) 252 | masks.append(mask) 253 | inds.append(ind) 254 | return heatmaps, anno_boxes, inds, masks 255 | 256 | def loss(self, targets, preds_dicts, **kwargs): 257 | """Loss function for BEVDepthHead. 258 | 259 | Args: 260 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 261 | truth gt boxes. 262 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 263 | preds_dicts (dict): Output of forward function. 264 | 265 | Returns: 266 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
267 | """ 268 | heatmaps, anno_boxes, inds, masks = targets 269 | return_loss = 0 270 | for task_id, preds_dict in enumerate(preds_dicts): 271 | # heatmap focal loss 272 | preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) 273 | num_pos = heatmaps[task_id].eq(1).float().sum().item() 274 | cls_avg_factor = torch.clamp(reduce_mean( 275 | heatmaps[task_id].new_tensor(num_pos)), 276 | min=1).item() 277 | loss_heatmap = self.loss_cls(preds_dict[0]['heatmap'], 278 | heatmaps[task_id], 279 | avg_factor=cls_avg_factor) 280 | target_box = anno_boxes[task_id] 281 | # reconstruct the anno_box from multiple reg heads 282 | preds_dict[0]['anno_box'] = torch.cat( 283 | ( 284 | preds_dict[0]['reg'], 285 | preds_dict[0]['height'], 286 | preds_dict[0]['dim'], 287 | preds_dict[0]['rot'], 288 | preds_dict[0]['vel'], 289 | ), 290 | dim=1, 291 | ) 292 | 293 | # Regression loss for dimension, offset, height, rotation 294 | num = masks[task_id].float().sum() 295 | ind = inds[task_id] 296 | pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() 297 | pred = pred.view(pred.size(0), -1, pred.size(3)) 298 | pred = self._gather_feat(pred, ind) 299 | mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() 300 | num = torch.clamp(reduce_mean(target_box.new_tensor(num)), 301 | min=1e-4).item() 302 | isnotnan = (~torch.isnan(target_box)).float() 303 | mask *= isnotnan 304 | code_weights = self.train_cfg['code_weights'] 305 | bbox_weights = mask * mask.new_tensor(code_weights) 306 | loss_bbox = self.loss_bbox(pred, 307 | target_box, 308 | bbox_weights, 309 | avg_factor=num) 310 | return_loss += loss_bbox 311 | return_loss += loss_heatmap 312 | return return_loss 313 | -------------------------------------------------------------------------------- /layers/heads/bev_depth_head.py: -------------------------------------------------------------------------------- 1 | """Inherited from `https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/centerpoint_head.py`""" # noqa 2 | import torch 3 | from mmdet3d.core import draw_heatmap_gaussian, gaussian_radius 4 | from mmdet3d.models.dense_heads.centerpoint_head import CenterHead 5 | from mmdet3d.models.utils import clip_sigmoid 6 | from mmdet.core import reduce_mean 7 | from mmdet.models import build_backbone 8 | from mmdet3d.models import build_neck 9 | from torch.cuda.amp import autocast 10 | 11 | __all__ = ['BEVDepthHead'] 12 | 13 | bev_backbone_conf = dict( 14 | type='ResNet', 15 | in_channels=80, 16 | depth=18, 17 | num_stages=3, 18 | strides=(1, 2, 2), 19 | dilations=(1, 1, 1), 20 | out_indices=[0, 1, 2], 21 | norm_eval=False, 22 | base_channels=160, 23 | ) 24 | 25 | bev_neck_conf = dict(type='SECONDFPN', 26 | in_channels=[160, 320, 640], 27 | upsample_strides=[2, 4, 8], 28 | out_channels=[64, 64, 128]) 29 | 30 | 31 | class BEVDepthHead(CenterHead): 32 | """Head for BevDepth. 33 | 34 | Args: 35 | in_channels(int): Number of channels after bev_neck. 36 | tasks(dict): Tasks for head. 37 | bbox_coder(dict): Config of bbox coder. 38 | common_heads(dict): Config of head for each task. 39 | loss_cls(dict): Config of classification loss. 40 | loss_bbox(dict): Config of regression loss. 41 | gaussian_overlap(float): Gaussian overlap used for `get_targets`. 42 | min_radius(int): Min radius used for `get_targets`. 43 | train_cfg(dict): Config used in the training process. 44 | test_cfg(dict): Config used in the test process. 45 | bev_backbone_conf(dict): Cnfig of bev_backbone. 46 | bev_neck_conf(dict): Cnfig of bev_neck. 
47 | """ 48 | def __init__( 49 | self, 50 | in_channels=256, 51 | tasks=None, 52 | bbox_coder=None, 53 | common_heads=dict(), 54 | loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), 55 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 56 | gaussian_overlap=0.1, 57 | min_radius=2, 58 | train_cfg=None, 59 | test_cfg=None, 60 | bev_backbone_conf=bev_backbone_conf, 61 | bev_neck_conf=bev_neck_conf, 62 | separate_head=dict(type='SeparateHead', 63 | init_bias=-2.19, 64 | final_kernel=3), 65 | ): 66 | super(BEVDepthHead, self).__init__( 67 | in_channels=in_channels, 68 | tasks=tasks, 69 | bbox_coder=bbox_coder, 70 | common_heads=common_heads, 71 | loss_cls=loss_cls, 72 | loss_bbox=loss_bbox, 73 | separate_head=separate_head, 74 | ) 75 | self.trunk = build_backbone(bev_backbone_conf) 76 | self.trunk.init_weights() 77 | self.neck = build_neck(bev_neck_conf) 78 | self.neck.init_weights() 79 | del self.trunk.maxpool 80 | self.gaussian_overlap = gaussian_overlap 81 | self.min_radius = min_radius 82 | self.train_cfg = train_cfg 83 | self.test_cfg = test_cfg 84 | 85 | @autocast(False) 86 | def forward(self, x): 87 | """Forward pass. 88 | 89 | Args: 90 | feats (list[torch.Tensor]): Multi-level features, e.g., 91 | features produced by FPN. 92 | 93 | Returns: 94 | tuple(list[dict]): Output results for tasks. 95 | """ 96 | # FPN 97 | trunk_outs = [x] 98 | if self.trunk.deep_stem: 99 | x = self.trunk.stem(x) 100 | else: 101 | x = self.trunk.conv1(x) 102 | x = self.trunk.norm1(x) 103 | x = self.trunk.relu(x) 104 | for i, layer_name in enumerate(self.trunk.res_layers): 105 | res_layer = getattr(self.trunk, layer_name) 106 | x = res_layer(x) 107 | if i in self.trunk.out_indices: 108 | trunk_outs.append(x) 109 | fpn_output = self.neck(trunk_outs) 110 | ret_values = super().forward(fpn_output) 111 | return ret_values 112 | 113 | def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): 114 | """Generate training targets for a single sample. 115 | 116 | Args: 117 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 118 | gt_labels_3d (torch.Tensor): Labels of boxes. 119 | 120 | Returns: 121 | tuple[list[torch.Tensor]]: Tuple of target including \ 122 | the following results in order. 123 | 124 | - list[torch.Tensor]: Heatmap scores. 125 | - list[torch.Tensor]: Ground truth boxes. 126 | - list[torch.Tensor]: Indexes indicating the position \ 127 | of the valid boxes. 128 | - list[torch.Tensor]: Masks indicating which boxes \ 129 | are valid. 130 | """ 131 | max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] 132 | grid_size = torch.tensor(self.train_cfg['grid_size']) 133 | pc_range = torch.tensor(self.train_cfg['point_cloud_range']) 134 | voxel_size = torch.tensor(self.train_cfg['voxel_size']) 135 | 136 | feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] 137 | 138 | # reorganize the gt_dict by tasks 139 | task_masks = [] 140 | flag = 0 141 | for class_name in self.class_names: 142 | task_masks.append([ 143 | torch.where(gt_labels_3d == class_name.index(i) + flag) 144 | for i in class_name 145 | ]) 146 | flag += len(class_name) 147 | 148 | task_boxes = [] 149 | task_classes = [] 150 | flag2 = 0 151 | for idx, mask in enumerate(task_masks): 152 | task_box = [] 153 | task_class = [] 154 | for m in mask: 155 | task_box.append(gt_bboxes_3d[m]) 156 | # 0 is background for each task, so we need to add 1 here. 
157 | task_class.append(gt_labels_3d[m] + 1 - flag2) 158 | task_boxes.append( 159 | torch.cat(task_box, axis=0).to(gt_bboxes_3d.device)) 160 | task_classes.append( 161 | torch.cat(task_class).long().to(gt_bboxes_3d.device)) 162 | flag2 += len(mask) 163 | draw_gaussian = draw_heatmap_gaussian 164 | heatmaps, anno_boxes, inds, masks = [], [], [], [] 165 | 166 | for idx, task_head in enumerate(self.task_heads): 167 | heatmap = gt_bboxes_3d.new_zeros( 168 | (len(self.class_names[idx]), feature_map_size[1], 169 | feature_map_size[0]), 170 | device='cuda') 171 | 172 | anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), 173 | dtype=torch.float32, 174 | device='cuda') 175 | 176 | ind = gt_labels_3d.new_zeros((max_objs), 177 | dtype=torch.int64, 178 | device='cuda') 179 | mask = gt_bboxes_3d.new_zeros((max_objs), 180 | dtype=torch.uint8, 181 | device='cuda') 182 | 183 | num_objs = min(task_boxes[idx].shape[0], max_objs) 184 | 185 | for k in range(num_objs): 186 | cls_id = task_classes[idx][k] - 1 187 | 188 | width = task_boxes[idx][k][3] 189 | length = task_boxes[idx][k][4] 190 | width = width / voxel_size[0] / self.train_cfg[ 191 | 'out_size_factor'] 192 | length = length / voxel_size[1] / self.train_cfg[ 193 | 'out_size_factor'] 194 | 195 | if width > 0 and length > 0: 196 | radius = gaussian_radius( 197 | (length, width), 198 | min_overlap=self.train_cfg['gaussian_overlap']) 199 | radius = max(self.train_cfg['min_radius'], int(radius)) 200 | 201 | # be really careful for the coordinate system of 202 | # your box annotation. 203 | x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ 204 | 1], task_boxes[idx][k][2] 205 | 206 | coor_x = ( 207 | x - pc_range[0] 208 | ) / voxel_size[0] / self.train_cfg['out_size_factor'] 209 | coor_y = ( 210 | y - pc_range[1] 211 | ) / voxel_size[1] / self.train_cfg['out_size_factor'] 212 | 213 | center = torch.tensor([coor_x, coor_y], 214 | dtype=torch.float32, 215 | device='cuda') 216 | center_int = center.to(torch.int32) 217 | 218 | # throw out not in range objects to avoid out of array 219 | # area when creating the heatmap 220 | if not (0 <= center_int[0] < feature_map_size[0] 221 | and 0 <= center_int[1] < feature_map_size[1]): 222 | continue 223 | 224 | draw_gaussian(heatmap[cls_id], center_int, radius) 225 | 226 | new_idx = k 227 | x, y = center_int[0], center_int[1] 228 | 229 | assert y * feature_map_size[0] + x < feature_map_size[ 230 | 0] * feature_map_size[1] 231 | 232 | ind[new_idx] = y * feature_map_size[0] + x 233 | mask[new_idx] = 1 234 | # TODO: support other outdoor dataset 235 | vx, vy = task_boxes[idx][k][7:] 236 | rot = task_boxes[idx][k][6] 237 | box_dim = task_boxes[idx][k][3:6] 238 | if self.norm_bbox: 239 | box_dim = box_dim.log() 240 | anno_box[new_idx] = torch.cat([ 241 | center - torch.tensor([x, y], device='cuda'), 242 | z.unsqueeze(0), 243 | box_dim, 244 | torch.sin(rot).unsqueeze(0), 245 | torch.cos(rot).unsqueeze(0), 246 | vx.unsqueeze(0), 247 | vy.unsqueeze(0), 248 | ]) 249 | 250 | heatmaps.append(heatmap) 251 | anno_boxes.append(anno_box) 252 | masks.append(mask) 253 | inds.append(ind) 254 | return heatmaps, anno_boxes, inds, masks 255 | 256 | def loss(self, targets, preds_dicts, **kwargs): 257 | """Loss function for BEVDepthHead. 258 | 259 | Args: 260 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 261 | truth gt boxes. 262 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 263 | preds_dicts (dict): Output of forward function. 264 | 265 | Returns: 266 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
267 | """ 268 | heatmaps, anno_boxes, inds, masks = targets 269 | return_loss = 0 270 | for task_id, preds_dict in enumerate(preds_dicts): 271 | # heatmap focal loss 272 | preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) 273 | num_pos = heatmaps[task_id].eq(1).float().sum().item() 274 | cls_avg_factor = torch.clamp(reduce_mean( 275 | heatmaps[task_id].new_tensor(num_pos)), 276 | min=1).item() 277 | loss_heatmap = self.loss_cls(preds_dict[0]['heatmap'], 278 | heatmaps[task_id], 279 | avg_factor=cls_avg_factor) 280 | target_box = anno_boxes[task_id] 281 | # reconstruct the anno_box from multiple reg heads 282 | preds_dict[0]['anno_box'] = torch.cat( 283 | ( 284 | preds_dict[0]['reg'], 285 | preds_dict[0]['height'], 286 | preds_dict[0]['dim'], 287 | preds_dict[0]['rot'], 288 | preds_dict[0]['vel'], 289 | ), 290 | dim=1, 291 | ) 292 | 293 | # Regression loss for dimension, offset, height, rotation 294 | num = masks[task_id].float().sum() 295 | ind = inds[task_id] 296 | pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() 297 | pred = pred.view(pred.size(0), -1, pred.size(3)) 298 | pred = self._gather_feat(pred, ind) 299 | mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() 300 | num = torch.clamp(reduce_mean(target_box.new_tensor(num)), 301 | min=1e-4).item() 302 | isnotnan = (~torch.isnan(target_box)).float() 303 | mask *= isnotnan 304 | code_weights = self.train_cfg['code_weights'] 305 | bbox_weights = mask * mask.new_tensor(code_weights) 306 | loss_bbox = self.loss_bbox(pred, 307 | target_box, 308 | bbox_weights, 309 | avg_factor=num) 310 | return_loss += loss_bbox 311 | return_loss += loss_heatmap 312 | return return_loss 313 | -------------------------------------------------------------------------------- /layers/backbones/lss_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 
2 | import torch 3 | import torch.nn.functional as F 4 | from mmcv.cnn import build_conv_layer 5 | from mmdet3d.models import build_neck 6 | from mmdet.models import build_backbone 7 | from mmdet.models.backbones.resnet import BasicBlock 8 | from torch import nn 9 | 10 | from ops.voxel_pooling import voxel_pooling 11 | 12 | __all__ = ['LSSFPN'] 13 | 14 | 15 | class _ASPPModule(nn.Module): 16 | def __init__(self, inplanes, planes, kernel_size, padding, dilation, 17 | BatchNorm): 18 | super(_ASPPModule, self).__init__() 19 | self.atrous_conv = nn.Conv2d(inplanes, 20 | planes, 21 | kernel_size=kernel_size, 22 | stride=1, 23 | padding=padding, 24 | dilation=dilation, 25 | bias=False) 26 | self.bn = BatchNorm(planes) 27 | self.relu = nn.ReLU() 28 | 29 | self._init_weight() 30 | 31 | def forward(self, x): 32 | x = self.atrous_conv(x) 33 | x = self.bn(x) 34 | 35 | return self.relu(x) 36 | 37 | def _init_weight(self): 38 | for m in self.modules(): 39 | if isinstance(m, nn.Conv2d): 40 | torch.nn.init.kaiming_normal_(m.weight) 41 | elif isinstance(m, nn.BatchNorm2d): 42 | m.weight.data.fill_(1) 43 | m.bias.data.zero_() 44 | 45 | 46 | class ASPP(nn.Module): 47 | def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): 48 | super(ASPP, self).__init__() 49 | 50 | dilations = [1, 6, 12, 18] 51 | 52 | self.aspp1 = _ASPPModule(inplanes, 53 | mid_channels, 54 | 1, 55 | padding=0, 56 | dilation=dilations[0], 57 | BatchNorm=BatchNorm) 58 | self.aspp2 = _ASPPModule(inplanes, 59 | mid_channels, 60 | 3, 61 | padding=dilations[1], 62 | dilation=dilations[1], 63 | BatchNorm=BatchNorm) 64 | self.aspp3 = _ASPPModule(inplanes, 65 | mid_channels, 66 | 3, 67 | padding=dilations[2], 68 | dilation=dilations[2], 69 | BatchNorm=BatchNorm) 70 | self.aspp4 = _ASPPModule(inplanes, 71 | mid_channels, 72 | 3, 73 | padding=dilations[3], 74 | dilation=dilations[3], 75 | BatchNorm=BatchNorm) 76 | 77 | self.global_avg_pool = nn.Sequential( 78 | nn.AdaptiveAvgPool2d((1, 1)), 79 | nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 80 | BatchNorm(mid_channels), 81 | nn.ReLU(), 82 | ) 83 | self.conv1 = nn.Conv2d(int(mid_channels * 5), 84 | mid_channels, 85 | 1, 86 | bias=False) 87 | self.bn1 = BatchNorm(mid_channels) 88 | self.relu = nn.ReLU() 89 | self.dropout = nn.Dropout(0.5) 90 | self._init_weight() 91 | 92 | def forward(self, x): 93 | x1 = self.aspp1(x) 94 | x2 = self.aspp2(x) 95 | x3 = self.aspp3(x) 96 | x4 = self.aspp4(x) 97 | x5 = self.global_avg_pool(x) 98 | x5 = F.interpolate(x5, 99 | size=x4.size()[2:], 100 | mode='bilinear', 101 | align_corners=True) 102 | x = torch.cat((x1, x2, x3, x4, x5), dim=1) 103 | 104 | x = self.conv1(x) 105 | x = self.bn1(x) 106 | x = self.relu(x) 107 | 108 | return self.dropout(x) 109 | 110 | def _init_weight(self): 111 | for m in self.modules(): 112 | if isinstance(m, nn.Conv2d): 113 | torch.nn.init.kaiming_normal_(m.weight) 114 | elif isinstance(m, nn.BatchNorm2d): 115 | m.weight.data.fill_(1) 116 | m.bias.data.zero_() 117 | 118 | 119 | class Mlp(nn.Module): 120 | def __init__(self, 121 | in_features, 122 | hidden_features=None, 123 | out_features=None, 124 | act_layer=nn.ReLU, 125 | drop=0.0): 126 | super().__init__() 127 | out_features = out_features or in_features 128 | hidden_features = hidden_features or in_features 129 | self.fc1 = nn.Linear(in_features, hidden_features) 130 | self.act = act_layer() 131 | self.drop1 = nn.Dropout(drop) 132 | self.fc2 = nn.Linear(hidden_features, out_features) 133 | self.drop2 = nn.Dropout(drop) 134 | 135 | def forward(self, x): 136 
| x = self.fc1(x) 137 | x = self.act(x) 138 | x = self.drop1(x) 139 | x = self.fc2(x) 140 | x = self.drop2(x) 141 | return x 142 | 143 | 144 | class SELayer(nn.Module): 145 | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): 146 | super().__init__() 147 | self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) 148 | self.act1 = act_layer() 149 | self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) 150 | self.gate = gate_layer() 151 | 152 | def forward(self, x, x_se): 153 | x_se = self.conv_reduce(x_se) 154 | x_se = self.act1(x_se) 155 | x_se = self.conv_expand(x_se) 156 | return x * self.gate(x_se) 157 | 158 | 159 | class DepthNet(nn.Module): 160 | def __init__(self, in_channels, mid_channels, context_channels, 161 | depth_channels): 162 | super(DepthNet, self).__init__() 163 | self.reduce_conv = nn.Sequential( 164 | nn.Conv2d(in_channels, 165 | mid_channels, 166 | kernel_size=3, 167 | stride=1, 168 | padding=1), 169 | nn.BatchNorm2d(mid_channels), 170 | nn.ReLU(inplace=True), 171 | ) 172 | self.context_conv = nn.Conv2d(mid_channels, 173 | context_channels, 174 | kernel_size=1, 175 | stride=1, 176 | padding=0) 177 | self.bn = nn.BatchNorm1d(27) 178 | self.depth_mlp = Mlp(27, mid_channels, mid_channels) 179 | self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware 180 | self.context_mlp = Mlp(27, mid_channels, mid_channels) 181 | self.context_se = SELayer(mid_channels) # NOTE: add camera-aware 182 | self.depth_conv = nn.Sequential( 183 | BasicBlock(mid_channels, mid_channels), 184 | BasicBlock(mid_channels, mid_channels), 185 | BasicBlock(mid_channels, mid_channels), 186 | ASPP(mid_channels, mid_channels), 187 | build_conv_layer(cfg=dict( 188 | type='DCN', 189 | in_channels=mid_channels, 190 | out_channels=mid_channels, 191 | kernel_size=3, 192 | padding=1, 193 | groups=4, 194 | im2col_step=128, 195 | )), 196 | nn.Conv2d(mid_channels, 197 | depth_channels, 198 | kernel_size=1, 199 | stride=1, 200 | padding=0), 201 | ) 202 | 203 | def forward(self, x, mats_dict): 204 | intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3] 205 | batch_size = intrins.shape[0] 206 | num_cams = intrins.shape[2] 207 | ida = mats_dict['ida_mats'][:, 0:1, ...] 
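        # Camera-aware conditioning: the 27-dim mlp_input assembled just below is,
        # per camera, 4 intrinsic terms (fx, fy, cx, cy) + 6 ida terms (the 2x2
        # rotation/scale block plus two translations) + 5 bda terms (the 2x2
        # rotation/scale block plus the z scale) + 12 sensor2ego terms (the
        # flattened 3x4 extrinsic), which matches nn.BatchNorm1d(27) and
        # Mlp(27, ...) in __init__.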
208 | sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :] 209 | bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4, 210 | 4).repeat(1, 1, num_cams, 1, 1) 211 | mlp_input = torch.cat( 212 | [ 213 | torch.stack( 214 | [ 215 | intrins[:, 0:1, ..., 0, 0], 216 | intrins[:, 0:1, ..., 1, 1], 217 | intrins[:, 0:1, ..., 0, 2], 218 | intrins[:, 0:1, ..., 1, 2], 219 | ida[:, 0:1, ..., 0, 0], 220 | ida[:, 0:1, ..., 0, 1], 221 | ida[:, 0:1, ..., 0, 3], 222 | ida[:, 0:1, ..., 1, 0], 223 | ida[:, 0:1, ..., 1, 1], 224 | ida[:, 0:1, ..., 1, 3], 225 | bda[:, 0:1, ..., 0, 0], 226 | bda[:, 0:1, ..., 0, 1], 227 | bda[:, 0:1, ..., 1, 0], 228 | bda[:, 0:1, ..., 1, 1], 229 | bda[:, 0:1, ..., 2, 2], 230 | ], 231 | dim=-1, 232 | ), 233 | sensor2ego.view(batch_size, 1, num_cams, -1), 234 | ], 235 | -1, 236 | ) 237 | mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) 238 | x = self.reduce_conv(x) 239 | context_se = self.context_mlp(mlp_input)[..., None, None] 240 | context = self.context_se(x, context_se) 241 | context = self.context_conv(context) 242 | depth_se = self.depth_mlp(mlp_input)[..., None, None] 243 | depth = self.depth_se(x, depth_se) 244 | depth_mid = depth 245 | depth = self.depth_conv(depth) 246 | return torch.cat([depth, context], dim=1), depth_mid 247 | 248 | 249 | class LSSFPN(nn.Module): 250 | def __init__(self, x_bound, y_bound, z_bound, d_bound, final_dim, 251 | downsample_factor, output_channels, img_backbone_conf, 252 | img_neck_conf, depth_net_conf): 253 | """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`. 254 | 255 | Args: 256 | x_bound (list): Boundaries for x. 257 | y_bound (list): Boundaries for y. 258 | z_bound (list): Boundaries for z. 259 | d_bound (list): Boundaries for d. 260 | final_dim (list): Dimension for input images. 261 | downsample_factor (int): Downsample factor between feature map 262 | and input image. 263 | output_channels (int): Number of channels for the output 264 | feature map. 265 | img_backbone_conf (dict): Config for image backbone. 266 | img_neck_conf (dict): Config for image neck. 267 | depth_net_conf (dict): Config for depth net. 
268 | """ 269 | 270 | super(LSSFPN, self).__init__() 271 | self.downsample_factor = downsample_factor 272 | self.d_bound = d_bound 273 | self.final_dim = final_dim 274 | self.output_channels = output_channels 275 | 276 | self.register_buffer( 277 | 'voxel_size', 278 | torch.Tensor([row[2] for row in [x_bound, y_bound, z_bound]])) 279 | self.register_buffer( 280 | 'voxel_coord', 281 | torch.Tensor([ 282 | row[0] + row[2] / 2.0 for row in [x_bound, y_bound, z_bound] 283 | ])) 284 | self.register_buffer( 285 | 'voxel_num', 286 | torch.LongTensor([(row[1] - row[0]) / row[2] 287 | for row in [x_bound, y_bound, z_bound]])) 288 | self.register_buffer('frustum', self.create_frustum()) 289 | self.depth_channels, _, _, _ = self.frustum.shape 290 | 291 | self.img_backbone = build_backbone(img_backbone_conf) 292 | self.img_neck = build_neck(img_neck_conf) 293 | self.depth_net = self._configure_depth_net(depth_net_conf) 294 | 295 | self.img_neck.init_weights() 296 | self.img_backbone.init_weights() 297 | 298 | def _configure_depth_net(self, depth_net_conf): 299 | return DepthNet( 300 | depth_net_conf['in_channels'], 301 | depth_net_conf['mid_channels'], 302 | self.output_channels, 303 | self.depth_channels, 304 | ) 305 | 306 | def create_frustum(self): 307 | """Generate frustum""" 308 | # make grid in image plane 309 | ogfH, ogfW = self.final_dim 310 | fH, fW = ogfH // self.downsample_factor, ogfW // self.downsample_factor 311 | d_coords = torch.arange(*self.d_bound, 312 | dtype=torch.float).view(-1, 1, 313 | 1).expand(-1, fH, fW) 314 | D, _, _ = d_coords.shape 315 | x_coords = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view( 316 | 1, 1, fW).expand(D, fH, fW) 317 | y_coords = torch.linspace(0, ogfH - 1, fH, 318 | dtype=torch.float).view(1, fH, 319 | 1).expand(D, fH, fW) 320 | paddings = torch.ones_like(d_coords) 321 | 322 | # D x H x W x 3 323 | frustum = torch.stack((x_coords, y_coords, d_coords, paddings), -1) 324 | return frustum 325 | 326 | def get_geometry(self, sensor2ego_mat, intrin_mat, ida_mat, bda_mat): 327 | """Transfer points from camera coord to ego coord. 328 | 329 | Args: 330 | rots(Tensor): Rotation matrix from camera to ego. 331 | trans(Tensor): Translation matrix from camera to ego. 332 | intrins(Tensor): Intrinsic matrix. 333 | post_rots_ida(Tensor): Rotation matrix for ida. 334 | post_trans_ida(Tensor): Translation matrix for ida 335 | post_rot_bda(Tensor): Rotation matrix for bda. 336 | 337 | Returns: 338 | Tensors: points ego coord. 
339 | """ 340 | batch_size, num_cams, _, _ = sensor2ego_mat.shape 341 | 342 | # undo post-transformation 343 | # B x N x D x H x W x 3 344 | points = self.frustum 345 | ida_mat = ida_mat.view(batch_size, num_cams, 1, 1, 1, 4, 4) 346 | points = ida_mat.inverse().matmul(points.unsqueeze(-1)) 347 | # cam_to_ego 348 | points = torch.cat( 349 | (points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], 350 | points[:, :, :, :, :, 2:]), 5) 351 | 352 | combine = sensor2ego_mat.matmul(torch.inverse(intrin_mat)) 353 | points = combine.view(batch_size, num_cams, 1, 1, 1, 4, 354 | 4).matmul(points) 355 | if bda_mat is not None: 356 | bda_mat = bda_mat.unsqueeze(1).repeat(1, num_cams, 1, 1).view( 357 | batch_size, num_cams, 1, 1, 1, 4, 4) 358 | points = (bda_mat @ points).squeeze(-1) 359 | else: 360 | points = points.squeeze(-1) 361 | return points[..., :3] 362 | 363 | def get_cam_feats(self, imgs): 364 | """Get feature maps from images.""" 365 | batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape 366 | 367 | imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams, 368 | num_channels, imH, imW) 369 | img_feats = self.img_neck(self.img_backbone(imgs))[0] 370 | img_feats = img_feats.reshape(batch_size, num_sweeps, num_cams, 371 | img_feats.shape[1], img_feats.shape[2], 372 | img_feats.shape[3]) 373 | return img_feats 374 | 375 | def _forward_depth_net(self, feat, mats_dict): 376 | return self.depth_net(feat, mats_dict) 377 | 378 | def _forward_voxel_net(self, img_feat_with_depth): 379 | return img_feat_with_depth 380 | 381 | def _forward_single_sweep(self, 382 | sweep_index, 383 | sweep_imgs, 384 | mats_dict, 385 | is_return_depth=False): 386 | """Forward function for single sweep. 387 | 388 | Args: 389 | sweep_index (int): Index of sweeps. 390 | sweep_imgs (Tensor): Input images. 391 | mats_dict (dict): 392 | sensor2ego_mats(Tensor): Transformation matrix from 393 | camera to ego with shape of (B, num_sweeps, 394 | num_cameras, 4, 4). 395 | intrin_mats(Tensor): Intrinsic matrix with shape 396 | of (B, num_sweeps, num_cameras, 4, 4). 397 | ida_mats(Tensor): Transformation matrix for ida with 398 | shape of (B, num_sweeps, num_cameras, 4, 4). 399 | sensor2sensor_mats(Tensor): Transformation matrix 400 | from key frame camera to sweep frame camera with 401 | shape of (B, num_sweeps, num_cameras, 4, 4). 402 | bda_mat(Tensor): Rotation matrix for bda with shape 403 | of (B, 4, 4). 404 | is_return_depth (bool, optional): Whether to return depth. 405 | Default: False. 406 | 407 | Returns: 408 | Tensor: BEV feature map. 409 | """ 410 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 411 | img_width = sweep_imgs.shape 412 | img_feats = self.get_cam_feats(sweep_imgs) 413 | source_features = img_feats[:, 0, ...] 
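        # Lift step: key-frame features are flattened to (B * num_cams, C, H, W)
        # and passed through the depth net, which returns depth logits (first
        # depth_channels channels) concatenated with context features (next
        # output_channels channels), plus the intermediate camera-aware feature
        # (cp_feature / depth_mid).  The outer product of the softmaxed depth bins
        # and the context features below builds a per-camera frustum volume of
        # shape (B * num_cams, output_channels, D, fH, fW).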
414 | depth_feature, cp_feature = self._forward_depth_net( 415 | source_features.reshape(batch_size * num_cams, 416 | source_features.shape[2], 417 | source_features.shape[3], 418 | source_features.shape[4]), 419 | mats_dict, 420 | ) 421 | depth = depth_feature[:, :self.depth_channels].softmax(1) 422 | img_feat_with_depth = depth.unsqueeze( 423 | 1) * depth_feature[:, self.depth_channels:( 424 | self.depth_channels + self.output_channels)].unsqueeze(2) 425 | 426 | # get voxel feature 427 | voxel_feature = img_feat_with_depth 428 | 429 | img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth) 430 | 431 | img_feat_with_depth = img_feat_with_depth.reshape( 432 | batch_size, 433 | num_cams, 434 | img_feat_with_depth.shape[1], 435 | img_feat_with_depth.shape[2], 436 | img_feat_with_depth.shape[3], 437 | img_feat_with_depth.shape[4], 438 | ) 439 | geom_xyz = self.get_geometry( 440 | mats_dict['sensor2ego_mats'][:, sweep_index, ...], 441 | mats_dict['intrin_mats'][:, sweep_index, ...], 442 | mats_dict['ida_mats'][:, sweep_index, ...], 443 | mats_dict.get('bda_mat', None), 444 | ) 445 | img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2) 446 | geom_xyz = ((geom_xyz - (self.voxel_coord - self.voxel_size / 2.0)) / 447 | self.voxel_size).int() 448 | feature_map = voxel_pooling(geom_xyz, img_feat_with_depth.contiguous(), 449 | self.voxel_num.cuda()) 450 | if is_return_depth: 451 | return feature_map.contiguous(), depth, source_features, voxel_feature, cp_feature 452 | return feature_map.contiguous() 453 | 454 | def forward(self, 455 | sweep_imgs, 456 | mats_dict, 457 | timestamps=None, 458 | is_return_depth=False): 459 | """Forward function. 460 | 461 | Args: 462 | sweep_imgs(Tensor): Input images with shape of (B, num_sweeps, 463 | num_cameras, 3, H, W). 464 | mats_dict(dict): 465 | sensor2ego_mats(Tensor): Transformation matrix from 466 | camera to ego with shape of (B, num_sweeps, 467 | num_cameras, 4, 4). 468 | intrin_mats(Tensor): Intrinsic matrix with shape 469 | of (B, num_sweeps, num_cameras, 4, 4). 470 | ida_mats(Tensor): Transformation matrix for ida with 471 | shape of (B, num_sweeps, num_cameras, 4, 4). 472 | sensor2sensor_mats(Tensor): Transformation matrix 473 | from key frame camera to sweep frame camera with 474 | shape of (B, num_sweeps, num_cameras, 4, 4). 475 | bda_mat(Tensor): Rotation matrix for bda with shape 476 | of (B, 4, 4). 477 | timestamps(Tensor): Timestamp for all images with the shape of(B, 478 | num_sweeps, num_cameras). 479 | 480 | Return: 481 | Tensor: bev feature map. 
482 | """ 483 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 484 | img_width = sweep_imgs.shape 485 | 486 | key_frame_res = self._forward_single_sweep( 487 | 0, 488 | sweep_imgs[:, 0:1, ...], 489 | mats_dict, 490 | is_return_depth=is_return_depth) 491 | if num_sweeps == 1: 492 | return key_frame_res 493 | 494 | key_frame_feature = key_frame_res[ 495 | 0] if is_return_depth else key_frame_res 496 | 497 | ret_feature_list = [key_frame_feature] 498 | for sweep_index in range(1, num_sweeps): 499 | with torch.no_grad(): 500 | feature_map = self._forward_single_sweep( 501 | sweep_index, 502 | sweep_imgs[:, sweep_index:sweep_index + 1, ...], 503 | mats_dict, 504 | is_return_depth=False) 505 | ret_feature_list.append(feature_map) 506 | 507 | if is_return_depth: 508 | return torch.cat(ret_feature_list, 1), key_frame_res[1], key_frame_res[2], key_frame_res[3], key_frame_res[4] 509 | else: 510 | return torch.cat(ret_feature_list, 1) 511 | -------------------------------------------------------------------------------- /layers/backbones/multi_head_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | from mmcv.cnn import build_conv_layer 5 | from mmdet3d.models import build_neck 6 | from mmdet.models import build_backbone 7 | from mmdet.models.backbones.resnet import BasicBlock 8 | from torch import nn 9 | import time 10 | 11 | from ops.voxel_pooling import voxel_pooling 12 | from layers.voxel_mix import voxel_mix_net 13 | __all__ = ['LSSFPN'] 14 | 15 | 16 | class _ASPPModule(nn.Module): 17 | def __init__(self, inplanes, planes, kernel_size, padding, dilation, 18 | BatchNorm): 19 | super(_ASPPModule, self).__init__() 20 | self.atrous_conv = nn.Conv2d(inplanes, 21 | planes, 22 | kernel_size=kernel_size, 23 | stride=1, 24 | padding=padding, 25 | dilation=dilation, 26 | bias=False) 27 | self.bn = BatchNorm(planes) 28 | self.relu = nn.ReLU() 29 | 30 | self._init_weight() 31 | 32 | def forward(self, x): 33 | x = self.atrous_conv(x) 34 | x = self.bn(x) 35 | 36 | return self.relu(x) 37 | 38 | def _init_weight(self): 39 | for m in self.modules(): 40 | if isinstance(m, nn.Conv2d): 41 | torch.nn.init.kaiming_normal_(m.weight) 42 | elif isinstance(m, nn.BatchNorm2d): 43 | m.weight.data.fill_(1) 44 | m.bias.data.zero_() 45 | 46 | 47 | class ASPP(nn.Module): 48 | def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): 49 | super(ASPP, self).__init__() 50 | 51 | dilations = [1, 6, 12, 18] 52 | 53 | self.aspp1 = _ASPPModule(inplanes, 54 | mid_channels, 55 | 1, 56 | padding=0, 57 | dilation=dilations[0], 58 | BatchNorm=BatchNorm) 59 | self.aspp2 = _ASPPModule(inplanes, 60 | mid_channels, 61 | 3, 62 | padding=dilations[1], 63 | dilation=dilations[1], 64 | BatchNorm=BatchNorm) 65 | self.aspp3 = _ASPPModule(inplanes, 66 | mid_channels, 67 | 3, 68 | padding=dilations[2], 69 | dilation=dilations[2], 70 | BatchNorm=BatchNorm) 71 | self.aspp4 = _ASPPModule(inplanes, 72 | mid_channels, 73 | 3, 74 | padding=dilations[3], 75 | dilation=dilations[3], 76 | BatchNorm=BatchNorm) 77 | 78 | self.global_avg_pool = nn.Sequential( 79 | nn.AdaptiveAvgPool2d((1, 1)), 80 | nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 81 | BatchNorm(mid_channels), 82 | nn.ReLU(), 83 | ) 84 | self.conv1 = nn.Conv2d(int(mid_channels * 5), 85 | mid_channels, 86 | 1, 87 | bias=False) 88 | self.bn1 = BatchNorm(mid_channels) 89 | self.relu = nn.ReLU() 90 | self.dropout = 
nn.Dropout(0.5) 91 | self._init_weight() 92 | 93 | def forward(self, x): 94 | x1 = self.aspp1(x) 95 | x2 = self.aspp2(x) 96 | x3 = self.aspp3(x) 97 | x4 = self.aspp4(x) 98 | x5 = self.global_avg_pool(x) 99 | x5 = F.interpolate(x5, 100 | size=x4.size()[2:], 101 | mode='bilinear', 102 | align_corners=True) 103 | x = torch.cat((x1, x2, x3, x4, x5), dim=1) 104 | 105 | x = self.conv1(x) 106 | x = self.bn1(x) 107 | x = self.relu(x) 108 | 109 | return self.dropout(x) 110 | 111 | def _init_weight(self): 112 | for m in self.modules(): 113 | if isinstance(m, nn.Conv2d): 114 | torch.nn.init.kaiming_normal_(m.weight) 115 | elif isinstance(m, nn.BatchNorm2d): 116 | m.weight.data.fill_(1) 117 | m.bias.data.zero_() 118 | 119 | 120 | class Mlp(nn.Module): 121 | def __init__(self, 122 | in_features, 123 | hidden_features=None, 124 | out_features=None, 125 | act_layer=nn.ReLU, 126 | drop=0.0): 127 | super().__init__() 128 | out_features = out_features or in_features 129 | hidden_features = hidden_features or in_features 130 | self.fc1 = nn.Linear(in_features, hidden_features) 131 | self.act = act_layer() 132 | self.drop1 = nn.Dropout(drop) 133 | self.fc2 = nn.Linear(hidden_features, out_features) 134 | self.drop2 = nn.Dropout(drop) 135 | 136 | def forward(self, x): 137 | x = self.fc1(x) 138 | x = self.act(x) 139 | x = self.drop1(x) 140 | x = self.fc2(x) 141 | x = self.drop2(x) 142 | return x 143 | 144 | 145 | class SELayer(nn.Module): 146 | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): 147 | super().__init__() 148 | self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) 149 | self.act1 = act_layer() 150 | self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) 151 | self.gate = gate_layer() 152 | 153 | def forward(self, x, x_se): 154 | x_se = self.conv_reduce(x_se) 155 | x_se = self.act1(x_se) 156 | x_se = self.conv_expand(x_se) 157 | return x * self.gate(x_se) 158 | 159 | 160 | class DepthNet(nn.Module): 161 | def __init__(self, in_channels, mid_channels, context_channels, 162 | depth_channels): 163 | super(DepthNet, self).__init__() 164 | self.reduce_conv = nn.Sequential( 165 | nn.Conv2d(in_channels, 166 | mid_channels, 167 | kernel_size=3, 168 | stride=1, 169 | padding=1), 170 | nn.BatchNorm2d(mid_channels), 171 | nn.ReLU(inplace=True), 172 | ) 173 | self.context_conv = nn.Conv2d(mid_channels, 174 | context_channels, 175 | kernel_size=1, 176 | stride=1, 177 | padding=0) 178 | self.bn = nn.BatchNorm1d(27) 179 | self.depth_mlp = Mlp(27, mid_channels, mid_channels) 180 | self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware 181 | self.context_mlp = Mlp(27, mid_channels, mid_channels) 182 | self.context_se = SELayer(mid_channels) # NOTE: add camera-aware 183 | self.depth_conv = nn.Sequential( 184 | BasicBlock(mid_channels, mid_channels), 185 | BasicBlock(mid_channels, mid_channels), 186 | BasicBlock(mid_channels, mid_channels), 187 | ASPP(mid_channels, mid_channels), 188 | build_conv_layer(cfg=dict( 189 | type='DCN', 190 | in_channels=mid_channels, 191 | out_channels=mid_channels, 192 | kernel_size=3, 193 | padding=1, 194 | groups=4, 195 | im2col_step=128, 196 | )), 197 | nn.Conv2d(mid_channels, 198 | depth_channels, 199 | kernel_size=1, 200 | stride=1, 201 | padding=0), 202 | ) 203 | 204 | def forward(self, x, mats_dict): 205 | intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3] 206 | batch_size = intrins.shape[0] 207 | num_cams = intrins.shape[2] 208 | ida = mats_dict['ida_mats'][:, 0:1, ...] 
209 | sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :] 210 | bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4, 211 | 4).repeat(1, 1, num_cams, 1, 1) 212 | mlp_input = torch.cat( 213 | [ 214 | torch.stack( 215 | [ 216 | intrins[:, 0:1, ..., 0, 0], 217 | intrins[:, 0:1, ..., 1, 1], 218 | intrins[:, 0:1, ..., 0, 2], 219 | intrins[:, 0:1, ..., 1, 2], 220 | ida[:, 0:1, ..., 0, 0], 221 | ida[:, 0:1, ..., 0, 1], 222 | ida[:, 0:1, ..., 0, 3], 223 | ida[:, 0:1, ..., 1, 0], 224 | ida[:, 0:1, ..., 1, 1], 225 | ida[:, 0:1, ..., 1, 3], 226 | bda[:, 0:1, ..., 0, 0], 227 | bda[:, 0:1, ..., 0, 1], 228 | bda[:, 0:1, ..., 1, 0], 229 | bda[:, 0:1, ..., 1, 1], 230 | bda[:, 0:1, ..., 2, 2], 231 | ], 232 | dim=-1, 233 | ), 234 | sensor2ego.view(batch_size, 1, num_cams, -1), 235 | ], 236 | -1, 237 | ) 238 | mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) 239 | x = self.reduce_conv(x) 240 | context_se = self.context_mlp(mlp_input)[..., None, None] 241 | context = self.context_se(x, context_se) 242 | context = self.context_conv(context) 243 | depth_se = self.depth_mlp(mlp_input)[..., None, None] 244 | depth = self.depth_se(x, depth_se) 245 | depth = self.depth_conv(depth) 246 | return torch.cat([depth, context], dim=1) 247 | 248 | 249 | class LSSFPN(nn.Module): 250 | def __init__(self, x_bound, y_bound, z_bound, d_bound, final_dim, 251 | downsample_factor, output_channels, img_backbone_conf, 252 | img_neck_conf, depth_net_conf): 253 | """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`. 254 | 255 | Args: 256 | x_bound (list): Boundaries for x. 257 | y_bound (list): Boundaries for y. 258 | z_bound (list): Boundaries for z. 259 | d_bound (list): Boundaries for d. 260 | final_dim (list): Dimension for input images. 261 | downsample_factor (int): Downsample factor between feature map 262 | and input image. 263 | output_channels (int): Number of channels for the output 264 | feature map. 265 | img_backbone_conf (dict): Config for image backbone. 266 | img_neck_conf (dict): Config for image neck. 267 | depth_net_conf (dict): Config for depth net. 268 | """ 269 | 270 | super(LSSFPN, self).__init__() 271 | self.downsample_factor = downsample_factor 272 | self.d_bound = d_bound 273 | self.final_dim = final_dim 274 | self.output_channels = output_channels 275 | 276 | # TODO hard code here, only for test!!!! 
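        # Slice-attention setup (flagged by the TODO above, defined just below):
        # each z_bounds entry is a [z_min, z_max, z_step] triple; the first six
        # pick out narrow height bands while the last three cover (most of) the
        # full height range.  voxel_size, voxel_coord and voxel_num are therefore
        # registered as per-slab tensors and indexed one slab at a time in
        # _forward_single_sweep.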
277 | self.z_bounds = [[-6, -3, 3],[-3, -2, 2],[-2, -1, 1],[-1, 0, 1],[0, 2, 2],[2, 4, 2],[-6, 4, 10],[-5, 3, 8],[-4, 2, 6]] 278 | self.register_buffer( 279 | f'voxel_size', 280 | torch.Tensor([[row[2] for row in [x_bound, y_bound, zb]] for zb in self.z_bounds]) 281 | ) 282 | self.register_buffer( 283 | f'voxel_coord', 284 | torch.Tensor([[row[0] + row[2] / 2.0 for row in [x_bound, y_bound, zb]] for zb in self.z_bounds])) 285 | self.register_buffer( 286 | f'voxel_num', 287 | torch.LongTensor([[(row[1] - row[0]) / row[2] for row in [x_bound, y_bound, zb]]for zb in self.z_bounds])) 288 | self.voxel_mix_net = voxel_mix_net() 289 | 290 | self.register_buffer('frustum', self.create_frustum()) 291 | self.depth_channels, _, _, _ = self.frustum.shape 292 | 293 | self.img_backbone = build_backbone(img_backbone_conf) 294 | self.img_neck = build_neck(img_neck_conf) 295 | self.depth_net = self._configure_depth_net(depth_net_conf) 296 | 297 | self.img_neck.init_weights() 298 | self.img_backbone.init_weights() 299 | 300 | def _configure_depth_net(self, depth_net_conf): 301 | return DepthNet( 302 | depth_net_conf['in_channels'], 303 | depth_net_conf['mid_channels'], 304 | self.output_channels, 305 | self.depth_channels, 306 | ) 307 | 308 | def create_frustum(self): 309 | """Generate frustum""" 310 | # make grid in image plane 311 | ogfH, ogfW = self.final_dim 312 | fH, fW = ogfH // self.downsample_factor, ogfW // self.downsample_factor 313 | d_coords = torch.arange(*self.d_bound, 314 | dtype=torch.float).view(-1, 1, 315 | 1).expand(-1, fH, fW) 316 | D, _, _ = d_coords.shape 317 | x_coords = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view( 318 | 1, 1, fW).expand(D, fH, fW) 319 | y_coords = torch.linspace(0, ogfH - 1, fH, 320 | dtype=torch.float).view(1, fH, 321 | 1).expand(D, fH, fW) 322 | paddings = torch.ones_like(d_coords) 323 | 324 | # D x H x W x 3 325 | frustum = torch.stack((x_coords, y_coords, d_coords, paddings), -1) 326 | return frustum 327 | 328 | def get_geometry(self, sensor2ego_mat, intrin_mat, ida_mat, bda_mat): 329 | """Transfer points from camera coord to ego coord. 330 | 331 | Args: 332 | rots(Tensor): Rotation matrix from camera to ego. 333 | trans(Tensor): Translation matrix from camera to ego. 334 | intrins(Tensor): Intrinsic matrix. 335 | post_rots_ida(Tensor): Rotation matrix for ida. 336 | post_trans_ida(Tensor): Translation matrix for ida 337 | post_rot_bda(Tensor): Rotation matrix for bda. 338 | 339 | Returns: 340 | Tensors: points ego coord. 341 | """ 342 | batch_size, num_cams, _, _ = sensor2ego_mat.shape 343 | 344 | # undo post-transformation 345 | # B x N x D x H x W x 3 346 | points = self.frustum 347 | ida_mat = ida_mat.view(batch_size, num_cams, 1, 1, 1, 4, 4) 348 | points = ida_mat.inverse().matmul(points.unsqueeze(-1)) 349 | # cam_to_ego 350 | points = torch.cat( 351 | (points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], 352 | points[:, :, :, :, :, 2:]), 5) 353 | 354 | combine = sensor2ego_mat.matmul(torch.inverse(intrin_mat)) 355 | points = combine.view(batch_size, num_cams, 1, 1, 1, 4, 356 | 4).matmul(points) 357 | if bda_mat is not None: 358 | bda_mat = bda_mat.unsqueeze(1).repeat(1, num_cams, 1, 1).view( 359 | batch_size, num_cams, 1, 1, 1, 4, 4) 360 | points = (bda_mat @ points).squeeze(-1) 361 | else: 362 | points = points.squeeze(-1) 363 | return points[..., :3] 364 | 365 | def get_cam_feats(self, imgs): 366 | """Get feature maps from images. 
367 | return img_feats [batch,sweep,cam,d,h,w] 368 | """ 369 | 370 | batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape 371 | 372 | imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams, 373 | num_channels, imH, imW) 374 | img_feats = self.img_neck(self.img_backbone(imgs))[0] 375 | img_feats = img_feats.reshape(batch_size, num_sweeps, num_cams, 376 | img_feats.shape[1], img_feats.shape[2], 377 | img_feats.shape[3]) 378 | return img_feats 379 | 380 | def _forward_depth_net(self, feat, mats_dict): 381 | return self.depth_net(feat, mats_dict) 382 | 383 | def _forward_voxel_net(self, img_feat_with_depth): 384 | return img_feat_with_depth 385 | 386 | def _forward_single_sweep(self, 387 | sweep_index, 388 | sweep_imgs, 389 | mats_dict, 390 | is_return_depth=False): 391 | """Forward function for single sweep. 392 | 393 | Args: 394 | sweep_index (int): Index of sweeps. 395 | sweep_imgs (Tensor): Input images. 396 | mats_dict (dict): 397 | sensor2ego_mats(Tensor): Transformation matrix from 398 | camera to ego with shape of (B, num_sweeps, 399 | num_cameras, 4, 4). 400 | intrin_mats(Tensor): Intrinsic matrix with shape 401 | of (B, num_sweeps, num_cameras, 4, 4). 402 | ida_mats(Tensor): Transformation matrix for ida with 403 | shape of (B, num_sweeps, num_cameras, 4, 4). 404 | sensor2sensor_mats(Tensor): Transformation matrix 405 | from key frame camera to sweep frame camera with 406 | shape of (B, num_sweeps, num_cameras, 4, 4). 407 | bda_mat(Tensor): Rotation matrix for bda with shape 408 | of (B, 4, 4). 409 | is_return_depth (bool, optional): Whether to return depth. 410 | Default: False. 411 | 412 | Returns: 413 | Tensor: BEV feature map. 414 | """ 415 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 416 | img_width = sweep_imgs.shape 417 | img_feats = self.get_cam_feats(sweep_imgs) 418 | source_features = img_feats[:, 0, ...] 419 | depth_feature = self._forward_depth_net( 420 | source_features.reshape(batch_size * num_cams, 421 | source_features.shape[2], 422 | source_features.shape[3], 423 | source_features.shape[4]), 424 | mats_dict, 425 | ) 426 | depth = depth_feature[:, :self.depth_channels].softmax(1) 427 | img_feat_with_depth = depth.unsqueeze( 428 | 1) * depth_feature[:, self.depth_channels:( 429 | self.depth_channels + self.output_channels)].unsqueeze(2) 430 | 431 | img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth) 432 | 433 | img_feat_with_depth = img_feat_with_depth.reshape( 434 | batch_size, 435 | num_cams, 436 | img_feat_with_depth.shape[1], 437 | img_feat_with_depth.shape[2], 438 | img_feat_with_depth.shape[3], 439 | img_feat_with_depth.shape[4], 440 | ) 441 | geom_xyz = self.get_geometry( 442 | mats_dict['sensor2ego_mats'][:, sweep_index, ...], 443 | mats_dict['intrin_mats'][:, sweep_index, ...], 444 | mats_dict['ida_mats'][:, sweep_index, ...], 445 | mats_dict.get('bda_mat', None), 446 | ) 447 | img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2) 448 | 449 | # TODO hard code here!!!! careful!! 
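        # Multi-slab pooling: the same frustum features are voxel-pooled once per
        # z_bounds slab, the per-slab BEV maps are concatenated along the channel
        # dimension, and voxel_mix_net fuses the stack back into a single BEV
        # feature map.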
450 | feature_maps = [] 451 | for i in range(len(self.z_bounds)): 452 | tmp_geom_xyz = ((geom_xyz - (self.voxel_coord[i] - self.voxel_size[i] / 2.0)) / 453 | self.voxel_size[i]).int() 454 | feature_maps.append(voxel_pooling(tmp_geom_xyz, img_feat_with_depth.contiguous(), 455 | self.voxel_num[i].cuda())) 456 | overlap_feature = torch.cat(feature_maps,dim=1) 457 | feature_map = self.voxel_mix_net(overlap_feature) 458 | feature_map = feature_map.type(torch.cuda.FloatTensor) 459 | if is_return_depth: 460 | return feature_map.contiguous(), depth, source_features 461 | return feature_map.contiguous() 462 | 463 | def forward(self, 464 | sweep_imgs, 465 | mats_dict, 466 | timestamps=None, 467 | is_return_depth=False): 468 | """Forward function. 469 | 470 | Args: 471 | sweep_imgs(Tensor): Input images with shape of (B, num_sweeps, 472 | num_cameras, 3, H, W). 473 | mats_dict(dict): 474 | sensor2ego_mats(Tensor): Transformation matrix from 475 | camera to ego with shape of (B, num_sweeps, 476 | num_cameras, 4, 4). 477 | intrin_mats(Tensor): Intrinsic matrix with shape 478 | of (B, num_sweeps, num_cameras, 4, 4). 479 | ida_mats(Tensor): Transformation matrix for ida with 480 | shape of (B, num_sweeps, num_cameras, 4, 4). 481 | sensor2sensor_mats(Tensor): Transformation matrix 482 | from key frame camera to sweep frame camera with 483 | shape of (B, num_sweeps, num_cameras, 4, 4). 484 | bda_mat(Tensor): Rotation matrix for bda with shape 485 | of (B, 4, 4). 486 | timestamps(Tensor): Timestamp for all images with the shape of(B, 487 | num_sweeps, num_cameras). 488 | 489 | Return: 490 | Tensor: bev feature map. 491 | """ 492 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 493 | img_width = sweep_imgs.shape 494 | 495 | key_frame_res = self._forward_single_sweep( 496 | 0, 497 | sweep_imgs[:, 0:1, ...], 498 | mats_dict, 499 | is_return_depth=is_return_depth) 500 | if num_sweeps == 1: 501 | return key_frame_res 502 | 503 | key_frame_feature = key_frame_res[ 504 | 0] if is_return_depth else key_frame_res 505 | 506 | ret_feature_list = [key_frame_feature] 507 | for sweep_index in range(1, num_sweeps): 508 | with torch.no_grad(): 509 | feature_map = self._forward_single_sweep( 510 | sweep_index, 511 | sweep_imgs[:, sweep_index:sweep_index + 1, ...], 512 | mats_dict, 513 | is_return_depth=False) 514 | ret_feature_list.append(feature_map) 515 | 516 | if is_return_depth: 517 | return torch.cat(ret_feature_list, 1), key_frame_res[1], key_frame_res[2] 518 | else: 519 | return torch.cat(ret_feature_list, 1) 520 | -------------------------------------------------------------------------------- /dataset/nusc_mv_det_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mmcv 4 | import numpy as np 5 | import torch 6 | from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes 7 | from nuscenes.utils.data_classes import Box 8 | from PIL import Image 9 | from pyquaternion import Quaternion 10 | from torch.utils.data import Dataset 11 | 12 | __all__ = ['NuscMVDetDataset'] 13 | 14 | map_name_from_general_to_detection = { 15 | 'human.pedestrian.adult': 'pedestrian', 16 | 'human.pedestrian.child': 'pedestrian', 17 | 'human.pedestrian.wheelchair': 'ignore', 18 | 'human.pedestrian.stroller': 'ignore', 19 | 'human.pedestrian.personal_mobility': 'ignore', 20 | 'human.pedestrian.police_officer': 'pedestrian', 21 | 'human.pedestrian.construction_worker': 'pedestrian', 22 | 'animal': 'ignore', 23 | 'vehicle.car': 'car', 24 | 
'vehicle.motorcycle': 'motorcycle', 25 | 'vehicle.bicycle': 'bicycle', 26 | 'vehicle.bus.bendy': 'bus', 27 | 'vehicle.bus.rigid': 'bus', 28 | 'vehicle.truck': 'truck', 29 | 'vehicle.construction': 'construction_vehicle', 30 | 'vehicle.emergency.ambulance': 'ignore', 31 | 'vehicle.emergency.police': 'ignore', 32 | 'vehicle.trailer': 'trailer', 33 | 'movable_object.barrier': 'barrier', 34 | 'movable_object.trafficcone': 'traffic_cone', 35 | 'movable_object.pushable_pullable': 'ignore', 36 | 'movable_object.debris': 'ignore', 37 | 'static_object.bicycle_rack': 'ignore', 38 | } 39 | 40 | 41 | def get_rot(h): 42 | return torch.Tensor([ 43 | [np.cos(h), np.sin(h)], 44 | [-np.sin(h), np.cos(h)], 45 | ]) 46 | 47 | 48 | def img_transform(img, resize, resize_dims, crop, flip, rotate): 49 | ida_rot = torch.eye(2) 50 | ida_tran = torch.zeros(2) 51 | # adjust image 52 | img = img.resize(resize_dims) 53 | img = img.crop(crop) 54 | if flip: 55 | img = img.transpose(method=Image.FLIP_LEFT_RIGHT) 56 | img = img.rotate(rotate) 57 | 58 | # post-homography transformation 59 | ida_rot *= resize 60 | ida_tran -= torch.Tensor(crop[:2]) 61 | if flip: 62 | A = torch.Tensor([[-1, 0], [0, 1]]) 63 | b = torch.Tensor([crop[2] - crop[0], 0]) 64 | ida_rot = A.matmul(ida_rot) 65 | ida_tran = A.matmul(ida_tran) + b 66 | A = get_rot(rotate / 180 * np.pi) 67 | b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 68 | b = A.matmul(-b) + b 69 | ida_rot = A.matmul(ida_rot) 70 | ida_tran = A.matmul(ida_tran) + b 71 | ida_mat = ida_rot.new_zeros(4, 4) 72 | ida_mat[3, 3] = 1 73 | ida_mat[2, 2] = 1 74 | ida_mat[:2, :2] = ida_rot 75 | ida_mat[:2, 3] = ida_tran 76 | return img, ida_mat 77 | 78 | 79 | def bev_transform(gt_boxes, rotate_angle, scale_ratio, flip_dx, flip_dy): 80 | rotate_angle = torch.tensor(rotate_angle / 180 * np.pi) 81 | rot_sin = torch.sin(rotate_angle) 82 | rot_cos = torch.cos(rotate_angle) 83 | rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], 84 | [0, 0, 1]]) 85 | scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0], 86 | [0, 0, scale_ratio]]) 87 | flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) 88 | if flip_dx: 89 | flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) 90 | if flip_dy: 91 | flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) 92 | rot_mat = flip_mat @ (scale_mat @ rot_mat) 93 | if gt_boxes.shape[0] > 0: 94 | gt_boxes[:, :3] = (rot_mat @ gt_boxes[:, :3].unsqueeze(-1)).squeeze(-1) 95 | gt_boxes[:, 3:6] *= scale_ratio 96 | gt_boxes[:, 6] += rotate_angle 97 | if flip_dx: 98 | gt_boxes[:, 6] = 2 * torch.asin(torch.tensor(1.0)) - gt_boxes[:, 6] 99 | if flip_dy: 100 | gt_boxes[:, 6] = -gt_boxes[:, 6] 101 | gt_boxes[:, 7:] = ( 102 | rot_mat[:2, :2] @ gt_boxes[:, 7:].unsqueeze(-1)).squeeze(-1) 103 | return gt_boxes, rot_mat 104 | 105 | 106 | def depth_transform(cam_depth, resize, resize_dims, crop, flip, rotate): 107 | """Transform depth based on ida augmentation configuration. 108 | Args: 109 | cam_depth (np array): Nx3, 3: x,y,d. 110 | resize (float): Resize factor. 111 | resize_dims (list): Final dimension. 112 | crop (list): x1, y1, x2, y2 113 | flip (bool): Whether to flip. 114 | rotate (float): Rotation value. 
115 | Returns: 116 | np array: [h/down_ratio, w/down_ratio, d] 117 | """ 118 | 119 | H, W = resize_dims 120 | cam_depth[:, :2] = cam_depth[:, :2] * resize 121 | cam_depth[:, 0] -= crop[0] 122 | cam_depth[:, 1] -= crop[1] 123 | if flip: 124 | cam_depth[:, 0] = resize_dims[1] - cam_depth[:, 0] 125 | 126 | cam_depth[:, 0] -= W / 2.0 127 | cam_depth[:, 1] -= H / 2.0 128 | 129 | h = rotate / 180 * np.pi 130 | rot_matrix = [ 131 | [np.cos(h), np.sin(h)], 132 | [-np.sin(h), np.cos(h)], 133 | ] 134 | cam_depth[:, :2] = np.matmul(rot_matrix, cam_depth[:, :2].T).T 135 | 136 | cam_depth[:, 0] += W / 2.0 137 | cam_depth[:, 1] += H / 2.0 138 | 139 | depth_coords = cam_depth[:, :2].astype(np.int16) 140 | 141 | depth_map = np.zeros(resize_dims) 142 | valid_mask = ((depth_coords[:, 1] < resize_dims[0]) 143 | & (depth_coords[:, 0] < resize_dims[1]) 144 | & (depth_coords[:, 1] >= 0) 145 | & (depth_coords[:, 0] >= 0)) 146 | depth_map[depth_coords[valid_mask, 1], 147 | depth_coords[valid_mask, 0]] = cam_depth[valid_mask, 2] 148 | 149 | return torch.Tensor(depth_map) 150 | 151 | 152 | class NuscMVDetDataset(Dataset): 153 | def __init__(self, 154 | ida_aug_conf, 155 | bda_aug_conf, 156 | classes, 157 | data_root, 158 | info_path, 159 | is_train, 160 | use_cbgs=False, 161 | num_sweeps=1, 162 | img_conf=dict(img_mean=[123.675, 116.28, 103.53], 163 | img_std=[58.395, 57.12, 57.375], 164 | to_rgb=True), 165 | return_depth=False, 166 | sweep_idxes=list(), 167 | key_idxes=list()): 168 | """Dataset used for bevdetection task. 169 | Args: 170 | ida_aug_conf (dict): Config for ida augmentation. 171 | bda_aug_conf (dict): Config for bda augmentation. 172 | classes (list): Class names. 173 | use_cbgs (bool): Whether to use cbgs strategy, 174 | Default: False. 175 | num_sweeps (int): Number of sweeps to be used for each sample. 176 | default: 1. 177 | img_conf (dict): Config for image. 178 | return_depth (bool): Whether to use depth gt. 179 | default: False. 180 | sweep_idxes (list): List of sweep idxes to be used. 181 | default: list(). 182 | key_idxes (list): List of key idxes to be used. 183 | default: list(). 184 | """ 185 | super().__init__() 186 | self.infos = mmcv.load(info_path) 187 | self.is_train = is_train 188 | self.ida_aug_conf = ida_aug_conf 189 | self.bda_aug_conf = bda_aug_conf 190 | self.data_root = data_root 191 | self.classes = classes 192 | self.use_cbgs = use_cbgs 193 | if self.use_cbgs: 194 | self.cat2id = {name: i for i, name in enumerate(self.classes)} 195 | self.sample_indices = self._get_sample_indices() 196 | self.num_sweeps = num_sweeps 197 | self.img_mean = np.array(img_conf['img_mean'], np.float32) 198 | self.img_std = np.array(img_conf['img_std'], np.float32) 199 | self.to_rgb = img_conf['to_rgb'] 200 | self.return_depth = return_depth 201 | assert sum([sweep_idx >= 0 for sweep_idx in sweep_idxes]) \ 202 | == len(sweep_idxes), 'All `sweep_idxes` must greater \ 203 | than or equal to 0.' 204 | 205 | self.sweeps_idx = sweep_idxes 206 | assert sum([key_idx < 0 for key_idx in key_idxes]) == len(key_idxes),\ 207 | 'All `key_idxes` must less than 0.' 208 | self.key_idxes = [0] + key_idxes 209 | 210 | def _get_sample_indices(self): 211 | """Load annotations from ann_file. 212 | Args: 213 | ann_file (str): Path of the annotation file. 214 | Returns: 215 | list[dict]: List of annotations after class sampling. 
216 | """ 217 | class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()} 218 | for idx, info in enumerate(self.infos): 219 | gt_names = set( 220 | [ann_info['category_name'] for ann_info in info['ann_infos']]) 221 | for gt_name in gt_names: 222 | gt_name = map_name_from_general_to_detection[gt_name] 223 | if gt_name not in self.classes: 224 | continue 225 | class_sample_idxs[self.cat2id[gt_name]].append(idx) 226 | duplicated_samples = sum( 227 | [len(v) for _, v in class_sample_idxs.items()]) 228 | class_distribution = { 229 | k: len(v) / duplicated_samples 230 | for k, v in class_sample_idxs.items() 231 | } 232 | 233 | sample_indices = [] 234 | 235 | frac = 1.0 / len(self.classes) 236 | ratios = [frac / v for v in class_distribution.values()] 237 | for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios): 238 | sample_indices += np.random.choice(cls_inds, 239 | int(len(cls_inds) * 240 | ratio)).tolist() 241 | return sample_indices 242 | 243 | def sample_ida_augmentation(self): 244 | """Generate ida augmentation values based on ida_config.""" 245 | H, W = self.ida_aug_conf['H'], self.ida_aug_conf['W'] 246 | fH, fW = self.ida_aug_conf['final_dim'] 247 | if self.is_train: 248 | resize = np.random.uniform(*self.ida_aug_conf['resize_lim']) 249 | resize_dims = (int(W * resize), int(H * resize)) 250 | newW, newH = resize_dims 251 | crop_h = int( 252 | (1 - np.random.uniform(*self.ida_aug_conf['bot_pct_lim'])) * 253 | newH) - fH 254 | crop_w = int(np.random.uniform(0, max(0, newW - fW))) 255 | crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) 256 | flip = False 257 | if self.ida_aug_conf['rand_flip'] and np.random.choice([0, 1]): 258 | flip = True 259 | rotate_ida = np.random.uniform(*self.ida_aug_conf['rot_lim']) 260 | else: 261 | resize = max(fH / H, fW / W) 262 | resize_dims = (int(W * resize), int(H * resize)) 263 | newW, newH = resize_dims 264 | crop_h = int( 265 | (1 - np.mean(self.ida_aug_conf['bot_pct_lim'])) * newH) - fH 266 | crop_w = int(max(0, newW - fW) / 2) 267 | crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) 268 | flip = False 269 | rotate_ida = 0 270 | return resize, resize_dims, crop, flip, rotate_ida 271 | 272 | def sample_bda_augmentation(self): 273 | """Generate bda augmentation values based on bda_config.""" 274 | if self.is_train: 275 | rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim']) 276 | scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim']) 277 | flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio'] 278 | flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio'] 279 | else: 280 | rotate_bda = 0 281 | scale_bda = 1.0 282 | flip_dx = False 283 | flip_dy = False 284 | return rotate_bda, scale_bda, flip_dx, flip_dy 285 | 286 | def get_image(self, cam_infos, cams): 287 | """Given data and cam_names, return image data needed. 288 | Args: 289 | sweeps_data (list): Raw data used to generate the data we needed. 290 | cams (list): Camera names. 291 | Returns: 292 | Tensor: Image data after processing. 293 | Tensor: Transformation matrix from camera to ego. 294 | Tensor: Intrinsic matrix. 295 | Tensor: Transformation matrix for ida. 296 | Tensor: Transformation matrix from key 297 | frame camera to sweep frame camera. 298 | Tensor: timestamps. 299 | dict: meta infos needed for evaluation. 
300 | """ 301 | assert len(cam_infos) > 0 302 | sweep_imgs = list() 303 | sweep_sensor2ego_mats = list() 304 | sweep_intrin_mats = list() 305 | sweep_ida_mats = list() 306 | sweep_sensor2sensor_mats = list() 307 | sweep_timestamps = list() 308 | gt_depth = list() 309 | for cam in cams: 310 | imgs = list() 311 | sensor2ego_mats = list() 312 | intrin_mats = list() 313 | ida_mats = list() 314 | sensor2sensor_mats = list() 315 | timestamps = list() 316 | key_info = cam_infos[0] 317 | resize, resize_dims, crop, flip, \ 318 | rotate_ida = self.sample_ida_augmentation( 319 | ) 320 | for sweep_idx, cam_info in enumerate(cam_infos): 321 | 322 | img = Image.open( 323 | os.path.join(self.data_root, cam_info[cam]['filename'])) 324 | # img = Image.fromarray(img) 325 | w, x, y, z = cam_info[cam]['calibrated_sensor']['rotation'] 326 | # sweep sensor to sweep ego 327 | sweepsensor2sweepego_rot = torch.Tensor( 328 | Quaternion(w, x, y, z).rotation_matrix) 329 | sweepsensor2sweepego_tran = torch.Tensor( 330 | cam_info[cam]['calibrated_sensor']['translation']) 331 | sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros( 332 | (4, 4)) 333 | sweepsensor2sweepego[3, 3] = 1 334 | sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot 335 | sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran 336 | # sweep ego to global 337 | w, x, y, z = cam_info[cam]['ego_pose']['rotation'] 338 | sweepego2global_rot = torch.Tensor( 339 | Quaternion(w, x, y, z).rotation_matrix) 340 | sweepego2global_tran = torch.Tensor( 341 | cam_info[cam]['ego_pose']['translation']) 342 | sweepego2global = sweepego2global_rot.new_zeros((4, 4)) 343 | sweepego2global[3, 3] = 1 344 | sweepego2global[:3, :3] = sweepego2global_rot 345 | sweepego2global[:3, -1] = sweepego2global_tran 346 | 347 | # global sensor to cur ego 348 | w, x, y, z = key_info[cam]['ego_pose']['rotation'] 349 | keyego2global_rot = torch.Tensor( 350 | Quaternion(w, x, y, z).rotation_matrix) 351 | keyego2global_tran = torch.Tensor( 352 | key_info[cam]['ego_pose']['translation']) 353 | keyego2global = keyego2global_rot.new_zeros((4, 4)) 354 | keyego2global[3, 3] = 1 355 | keyego2global[:3, :3] = keyego2global_rot 356 | keyego2global[:3, -1] = keyego2global_tran 357 | global2keyego = keyego2global.inverse() 358 | 359 | # cur ego to sensor 360 | w, x, y, z = key_info[cam]['calibrated_sensor']['rotation'] 361 | keysensor2keyego_rot = torch.Tensor( 362 | Quaternion(w, x, y, z).rotation_matrix) 363 | keysensor2keyego_tran = torch.Tensor( 364 | key_info[cam]['calibrated_sensor']['translation']) 365 | keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4)) 366 | keysensor2keyego[3, 3] = 1 367 | keysensor2keyego[:3, :3] = keysensor2keyego_rot 368 | keysensor2keyego[:3, -1] = keysensor2keyego_tran 369 | keyego2keysensor = keysensor2keyego.inverse() 370 | keysensor2sweepsensor = ( 371 | keyego2keysensor @ global2keyego @ sweepego2global 372 | @ sweepsensor2sweepego).inverse() 373 | sweepsensor2keyego = global2keyego @ sweepego2global @\ 374 | sweepsensor2sweepego 375 | sensor2ego_mats.append(sweepsensor2keyego) 376 | sensor2sensor_mats.append(keysensor2sweepsensor) 377 | intrin_mat = torch.zeros((4, 4)) 378 | intrin_mat[3, 3] = 1 379 | intrin_mat[:3, :3] = torch.Tensor( 380 | cam_info[cam]['calibrated_sensor']['camera_intrinsic']) 381 | if self.return_depth and sweep_idx == 0: 382 | file_name = os.path.split(cam_info[cam]['filename'])[-1] 383 | point_depth = np.fromfile(os.path.join( 384 | self.data_root, 'depth_gt', f'{file_name}.bin'), 385 | dtype=np.float32, 386 | 
count=-1).reshape(-1, 3) 387 | point_depth_augmented = depth_transform( 388 | point_depth, resize, self.ida_aug_conf['final_dim'], 389 | crop, flip, rotate_ida) 390 | gt_depth.append(point_depth_augmented) 391 | img, ida_mat = img_transform( 392 | img, 393 | resize=resize, 394 | resize_dims=resize_dims, 395 | crop=crop, 396 | flip=flip, 397 | rotate=rotate_ida, 398 | ) 399 | ida_mats.append(ida_mat) 400 | img = mmcv.imnormalize(np.array(img), self.img_mean, 401 | self.img_std, self.to_rgb) 402 | img = torch.from_numpy(img).permute(2, 0, 1) 403 | imgs.append(img) 404 | intrin_mats.append(intrin_mat) 405 | timestamps.append(cam_info[cam]['timestamp']) 406 | sweep_imgs.append(torch.stack(imgs)) 407 | sweep_sensor2ego_mats.append(torch.stack(sensor2ego_mats)) 408 | sweep_intrin_mats.append(torch.stack(intrin_mats)) 409 | sweep_ida_mats.append(torch.stack(ida_mats)) 410 | sweep_sensor2sensor_mats.append(torch.stack(sensor2sensor_mats)) 411 | sweep_timestamps.append(torch.tensor(timestamps)) 412 | # Get mean pose of all cams. 413 | ego2global_rotation = np.mean( 414 | [key_info[cam]['ego_pose']['rotation'] for cam in cams], 0) 415 | ego2global_translation = np.mean( 416 | [key_info[cam]['ego_pose']['translation'] for cam in cams], 0) 417 | img_metas = dict( 418 | box_type_3d=LiDARInstance3DBoxes, 419 | ego2global_translation=ego2global_translation, 420 | ego2global_rotation=ego2global_rotation, 421 | ) 422 | 423 | ret_list = [ 424 | torch.stack(sweep_imgs).permute(1, 0, 2, 3, 4), 425 | torch.stack(sweep_sensor2ego_mats).permute(1, 0, 2, 3), 426 | torch.stack(sweep_intrin_mats).permute(1, 0, 2, 3), 427 | torch.stack(sweep_ida_mats).permute(1, 0, 2, 3), 428 | torch.stack(sweep_sensor2sensor_mats).permute(1, 0, 2, 3), 429 | torch.stack(sweep_timestamps).permute(1, 0), 430 | img_metas, 431 | ] 432 | if self.return_depth: 433 | ret_list.append(torch.stack(gt_depth)) 434 | return ret_list 435 | 436 | def get_gt(self, info, cams): 437 | """Generate gt labels from info. 438 | Args: 439 | info(dict): Infos needed to generate gt labels. 440 | cams(list): Camera names. 441 | Returns: 442 | Tensor: GT bboxes. 443 | Tensor: GT labels. 444 | """ 445 | ego2global_rotation = np.mean( 446 | [info['cam_infos'][cam]['ego_pose']['rotation'] for cam in cams], 447 | 0) 448 | ego2global_translation = np.mean([ 449 | info['cam_infos'][cam]['ego_pose']['translation'] for cam in cams 450 | ], 0) 451 | trans = -np.array(ego2global_translation) 452 | rot = Quaternion(ego2global_rotation).inverse 453 | gt_boxes = list() 454 | gt_labels = list() 455 | for ann_info in info['ann_infos']: 456 | # Use ego coordinate. 
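            # Annotations are stored in global coordinates; the mean ego pose of
            # all cameras (computed above) is inverted to move each box into the
            # key-frame ego frame, and boxes outside self.classes or without any
            # lidar/radar points are skipped.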
457 | if (map_name_from_general_to_detection[ann_info['category_name']] 458 | not in self.classes 459 | or ann_info['num_lidar_pts'] + ann_info['num_radar_pts'] <= 460 | 0): 461 | continue 462 | box = Box( 463 | ann_info['translation'], 464 | ann_info['size'], 465 | Quaternion(ann_info['rotation']), 466 | velocity=ann_info['velocity'], 467 | ) 468 | box.translate(trans) 469 | box.rotate(rot) 470 | box_xyz = np.array(box.center) 471 | box_dxdydz = np.array(box.wlh)[[1, 0, 2]] 472 | box_yaw = np.array([box.orientation.yaw_pitch_roll[0]]) 473 | box_velo = np.array(box.velocity[:2]) 474 | gt_box = np.concatenate([box_xyz, box_dxdydz, box_yaw, box_velo]) 475 | gt_boxes.append(gt_box) 476 | gt_labels.append( 477 | self.classes.index(map_name_from_general_to_detection[ 478 | ann_info['category_name']])) 479 | return torch.Tensor(gt_boxes), torch.tensor(gt_labels) 480 | 481 | def choose_cams(self): 482 | """Choose cameras randomly. 483 | Returns: 484 | list: Cameras to be used. 485 | """ 486 | if self.is_train and self.ida_aug_conf['Ncams'] < len( 487 | self.ida_aug_conf['cams']): 488 | cams = np.random.choice(self.ida_aug_conf['cams'], 489 | self.ida_aug_conf['Ncams'], 490 | replace=False) 491 | else: 492 | cams = self.ida_aug_conf['cams'] 493 | return cams 494 | 495 | def __getitem__(self, idx): 496 | if self.use_cbgs: 497 | idx = self.sample_indices[idx] 498 | cam_infos = list() 499 | # TODO: Check if it still works when number of cameras is reduced. 500 | cams = self.choose_cams() 501 | for key_idx in self.key_idxes: 502 | cur_idx = key_idx + idx 503 | # Handle scenarios when current idx doesn't have previous key 504 | # frame or previous key frame is from another scene. 505 | if cur_idx < 0: 506 | cur_idx = idx 507 | elif self.infos[cur_idx]['scene_token'] != self.infos[idx][ 508 | 'scene_token']: 509 | cur_idx = idx 510 | info = self.infos[cur_idx] 511 | cam_infos.append(info['cam_infos']) 512 | for sweep_idx in self.sweeps_idx: 513 | if len(info['sweeps']) == 0: 514 | cam_infos.append(info['cam_infos']) 515 | else: 516 | # Handle scenarios when current sweep doesn't have all 517 | # cam keys. 518 | for i in range(min(len(info['sweeps']) - 1, sweep_idx), -1, 519 | -1): 520 | if sum([cam in info['sweeps'][i] 521 | for cam in cams]) == len(cams): 522 | cam_infos.append(info['sweeps'][i]) 523 | break 524 | image_data_list = self.get_image(cam_infos, cams) 525 | ret_list = list() 526 | ( 527 | sweep_imgs, 528 | sweep_sensor2ego_mats, 529 | sweep_intrins, 530 | sweep_ida_mats, 531 | sweep_sensor2sensor_mats, 532 | sweep_timestamps, 533 | img_metas, 534 | ) = image_data_list[:7] 535 | img_metas['token'] = self.infos[idx]['sample_token'] 536 | if self.is_train: 537 | gt_boxes, gt_labels = self.get_gt(self.infos[idx], cams) 538 | # Temporary solution for test. 
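            # Both branches currently load GT boxes (see the comment above), so
            # gt_boxes / gt_labels are also returned for validation samples and
            # are passed through bev_transform below with the identity eval-time
            # bda parameters.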
539 | else: 540 | gt_boxes, gt_labels = self.get_gt(self.infos[idx], cams) 541 | 542 | rotate_bda, scale_bda, flip_dx, flip_dy = self.sample_bda_augmentation( 543 | ) 544 | bda_mat = sweep_imgs.new_zeros(4, 4) 545 | bda_mat[3, 3] = 1 546 | gt_boxes, bda_rot = bev_transform(gt_boxes, rotate_bda, scale_bda, 547 | flip_dx, flip_dy) 548 | bda_mat[:3, :3] = bda_rot 549 | ret_list = [ 550 | sweep_imgs, 551 | sweep_sensor2ego_mats, 552 | sweep_intrins, 553 | sweep_ida_mats, 554 | sweep_sensor2sensor_mats, 555 | bda_mat, 556 | sweep_timestamps, 557 | img_metas, 558 | gt_boxes, 559 | gt_labels, 560 | self.infos[idx], 561 | ] 562 | if self.return_depth: 563 | ret_list.append(image_data_list[7]) 564 | return ret_list 565 | 566 | def __str__(self): 567 | return f"""NuscData: {len(self)} samples. Split: \ 568 | {"train" if self.is_train else "val"}. 569 | Augmentation Conf: {self.ida_aug_conf}""" 570 | 571 | def __len__(self): 572 | if self.use_cbgs: 573 | return len(self.sample_indices) 574 | else: 575 | return len(self.infos) 576 | 577 | 578 | def collate_fn(data, is_return_depth=False): 579 | imgs_batch = list() 580 | sensor2ego_mats_batch = list() 581 | intrin_mats_batch = list() 582 | ida_mats_batch = list() 583 | sensor2sensor_mats_batch = list() 584 | bda_mat_batch = list() 585 | timestamps_batch = list() 586 | gt_boxes_batch = list() 587 | gt_labels_batch = list() 588 | img_metas_batch = list() 589 | depth_labels_batch = list() 590 | infos_batch = list() 591 | for iter_data in data: 592 | ( 593 | sweep_imgs, 594 | sweep_sensor2ego_mats, 595 | sweep_intrins, 596 | sweep_ida_mats, 597 | sweep_sensor2sensor_mats, 598 | bda_mat, 599 | sweep_timestamps, 600 | img_metas, 601 | gt_boxes, 602 | gt_labels, 603 | infos, 604 | ) = iter_data[:11] 605 | if is_return_depth: 606 | gt_depth = iter_data[11] 607 | depth_labels_batch.append(gt_depth) 608 | imgs_batch.append(sweep_imgs) 609 | sensor2ego_mats_batch.append(sweep_sensor2ego_mats) 610 | intrin_mats_batch.append(sweep_intrins) 611 | ida_mats_batch.append(sweep_ida_mats) 612 | sensor2sensor_mats_batch.append(sweep_sensor2sensor_mats) 613 | bda_mat_batch.append(bda_mat) 614 | timestamps_batch.append(sweep_timestamps) 615 | img_metas_batch.append(img_metas) 616 | gt_boxes_batch.append(gt_boxes) 617 | gt_labels_batch.append(gt_labels) 618 | infos_batch.append(infos) 619 | mats_dict = dict() 620 | mats_dict['sensor2ego_mats'] = torch.stack(sensor2ego_mats_batch) 621 | mats_dict['intrin_mats'] = torch.stack(intrin_mats_batch) 622 | mats_dict['ida_mats'] = torch.stack(ida_mats_batch) 623 | mats_dict['sensor2sensor_mats'] = torch.stack(sensor2sensor_mats_batch) 624 | mats_dict['bda_mat'] = torch.stack(bda_mat_batch) 625 | ret_list = [ 626 | torch.stack(imgs_batch), 627 | mats_dict, 628 | torch.stack(timestamps_batch), 629 | img_metas_batch, 630 | gt_boxes_batch, 631 | gt_labels_batch, 632 | infos_batch 633 | ] 634 | if is_return_depth: 635 | ret_list.append(torch.stack(depth_labels_batch)) 636 | return ret_list --------------------------------------------------------------------------------
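# A minimal usage sketch (not part of the repository): wiring NuscMVDetDataset and
# collate_fn into a PyTorch DataLoader.  The augmentation configs, paths and class
# list below are illustrative placeholders that would normally come from one of the
# config files under exps/.
from functools import partial

from torch.utils.data import DataLoader

from dataset.nusc_mv_det_dataset import NuscMVDetDataset, collate_fn

# Illustrative augmentation configs; the keys are the ones read by
# sample_ida_augmentation / sample_bda_augmentation, the values are placeholders.
ida_aug_conf = dict(H=900, W=1600, final_dim=(256, 704),
                    resize_lim=(0.386, 0.55), bot_pct_lim=(0.0, 0.0),
                    rand_flip=True, rot_lim=(-5.4, 5.4),
                    cams=['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
                          'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
                    Ncams=6)
bda_aug_conf = dict(rot_lim=(-22.5, 22.5), scale_lim=(0.95, 1.05),
                    flip_dx_ratio=0.5, flip_dy_ratio=0.5)
CLASSES = ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
           'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']

train_loader = DataLoader(
    NuscMVDetDataset(ida_aug_conf=ida_aug_conf,
                     bda_aug_conf=bda_aug_conf,
                     classes=CLASSES,
                     data_root='data/nuScenes',                               # placeholder path
                     info_path='data/nuScenes/nuscenes_infos_train.pkl',      # placeholder path
                     is_train=True,
                     use_cbgs=True,
                     return_depth=True),
    batch_size=4,
    shuffle=True,
    num_workers=4,
    drop_last=True,
    # collate_fn takes (data, is_return_depth); bind the flag so the DataLoader
    # can call it with the batch only.
    collate_fn=partial(collate_fn, is_return_depth=True),
)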