├── assets └── BEVSAN.png ├── layers ├── backbones │ ├── __init__.py │ ├── depth_net.py │ ├── lss_fpn.py │ └── multi_head_fpn.py ├── __init__.py ├── heads │ ├── __init__.py │ ├── mh_depth_head.py │ └── bev_depth_head.py └── voxel_mix.py ├── ops └── voxel_pooling │ ├── __init__.py │ ├── src │ ├── voxel_pooling_forward.cpp │ └── voxel_pooling_forward_cuda.cu │ └── voxel_pooling.py ├── requirements.txt ├── test ├── data │ └── nuscenes │ │ └── samples │ │ ├── CAM_BACK │ │ └── n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg │ │ ├── CAM_FRONT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg │ │ ├── CAM_BACK_LEFT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg │ │ ├── CAM_BACK_RIGHT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg │ │ ├── CAM_FRONT_LEFT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg │ │ └── CAM_FRONT_RIGHT │ │ └── n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg ├── test_ops │ └── test_voxel_pooling.py ├── test_layers │ ├── test_backbone.py │ └── test_head.py └── test_dataset │ └── test_nusc_mv_det_dataset.py ├── requirements-dev.txt ├── .pre-commit-config.yaml ├── LICENSE.md ├── utils └── torch_dist.py ├── README.md ├── setup.py ├── exps ├── bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py ├── bev_depth_lss_r50_256x704_128x128_24e_2key.py └── bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py ├── .gitignore ├── models ├── bev_depth.py └── uda_depth.py ├── callbacks └── ema.py ├── scripts ├── gen_depth_gt.py └── gen_info.py ├── evaluators └── det_mv_evaluators.py └── dataset └── nusc_mv_det_dataset.py /assets/BEVSAN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/assets/BEVSAN.png -------------------------------------------------------------------------------- /layers/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .lss_fpn import LSSFPN 2 | 3 | __all__ = ['LSSFPN'] 4 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .heads.bev_depth_head import BEVDepthHead 2 | 3 | __all__ = ['BEVDepthHead'] 4 | -------------------------------------------------------------------------------- /layers/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .bev_depth_head import BEVDepthHead 2 | 3 | __all__ = ['BEVDepthHead'] 4 | -------------------------------------------------------------------------------- /ops/voxel_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | from .voxel_pooling import voxel_pooling 2 | 3 | __all__ = ['voxel_pooling'] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numba 2 | numpy 3 | nuscenes-devkit 4 | opencv-python-headless 5 | pandas 6 | pytorch-lightning==1.6.0 7 | scikit-image 8 | scipy 9 | setuptools==59.5.0 10 | tensorboardX 11 | torch==1.9.0 12 | torchvision==0.10.0 13 | -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_BACK/n015-2018-07-18-11-07-57+0800__CAM_BACK__1531883530437525.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_FRONT/n015-2018-07-18-11-07-57+0800__CAM_FRONT__1531883530412470.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-07-18-11-07-57+0800__CAM_BACK_LEFT__1531883530447423.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_BACK_RIGHT__1531883530427893.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_LEFT__1531883530404844.jpg -------------------------------------------------------------------------------- /test/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/litwellchi/BEV-SAN/HEAD/test/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-07-18-11-07-57+0800__CAM_FRONT_RIGHT__1531883530420339.jpg -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # code formatter 2 | # force to use same version of the formatter, can be changed only by maintainer. 
3 | 4 | anybadge 5 | autoflake==1.4 6 | black==20.8b1 7 | flake8 8 | gitlint 9 | isort==4.3.21 10 | nbsphinx 11 | pre-commit 12 | pre-commit==2.7.1 13 | pylint==2.3.1 14 | pytest 15 | pytest-cov 16 | radon==4.2.0 17 | recommonmark 18 | seed-isort-config 19 | setuptools 20 | 21 | # ----- document usage 22 | sphinx==3.5.4 23 | sphinx-material 24 | sphinx_markdown_tables 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/PyCQA/flake8 3 | rev: 3.8.3 4 | hooks: 5 | - id: flake8 6 | - repo: https://github.com/PyCQA/isort 7 | rev: 5.10.1 8 | hooks: 9 | - id: isort 10 | - repo: https://github.com/pre-commit/mirrors-yapf 11 | rev: v0.30.0 12 | hooks: 13 | - id: yapf 14 | - repo: https://github.com/pre-commit/pre-commit-hooks 15 | rev: v3.1.0 16 | hooks: 17 | - id: trailing-whitespace 18 | - id: check-yaml 19 | - id: end-of-file-fixer 20 | - id: requirements-txt-fixer 21 | - id: double-quote-string-fixer 22 | - id: check-merge-conflict 23 | - id: fix-encoding-pragma 24 | args: ["--remove"] 25 | - id: mixed-line-ending 26 | args: ["--fix=lf"] 27 | - repo: https://github.com/codespell-project/codespell 28 | rev: v2.1.0 29 | hooks: 30 | - id: codespell 31 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Megvii-BaseDetection 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /utils/torch_dist.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: zeming li 3 | @contact: zengarden2009@gmail.com 4 | """ 5 | from torch import distributed as dist 6 | 7 | 8 | def get_rank() -> int: 9 | if not dist.is_available(): 10 | return 0 11 | if not dist.is_initialized(): 12 | return 0 13 | return dist.get_rank() 14 | 15 | 16 | def get_world_size() -> int: 17 | if not dist.is_available(): 18 | return 1 19 | if not dist.is_initialized(): 20 | return 1 21 | return dist.get_world_size() 22 | 23 | 24 | def synchronize(): 25 | """Helper function to synchronize (barrier) 26 | among all processes when using distributed training""" 27 | if not dist.is_available(): 28 | return 29 | if not dist.is_initialized(): 30 | return 31 | current_world_size = dist.get_world_size() 32 | if current_world_size == 1: 33 | return 34 | dist.barrier() 35 | 36 | 37 | def all_gather_object(obj): 38 | world_size = get_world_size() 39 | if world_size < 2: 40 | return [obj] 41 | output = [None for _ in range(world_size)] 42 | dist.all_gather_object(output, obj) 43 | return output 44 | 45 | 46 | def is_available() -> bool: 47 | return dist.is_available() 48 | -------------------------------------------------------------------------------- /test/test_ops/test_voxel_pooling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import torch 5 | 6 | from ops.voxel_pooling import voxel_pooling 7 | 8 | 9 | class TestLSSFPN(unittest.TestCase): 10 | @pytest.mark.skipif(condition=torch.cuda.is_available() is False, 11 | reason='No gpu available.') 12 | def test_voxel_pooling(self): 13 | import numpy as np 14 | 15 | np.random.seed(0) 16 | torch.manual_seed(0) 17 | geom_xyz = torch.rand([2, 6, 10, 10, 10, 3]) * 160 - 80 18 | geom_xyz[..., 2] /= 100 19 | geom_xyz = geom_xyz.reshape(2, -1, 3) 20 | features = torch.rand([2, 6, 10, 10, 10, 80]) - 0.5 21 | gt_features = features.reshape(2, -1, 80) 22 | gt_bev_featuremap = features.new_zeros(2, 128, 128, 80) 23 | for i in range(2): 24 | for j in range(geom_xyz.shape[1]): 25 | x = geom_xyz[i, j, 0].int() 26 | y = geom_xyz[i, j, 1].int() 27 | z = geom_xyz[i, j, 2].int() 28 | if x < 0 or x >= 128 or y < 0 or y >= 128 or z < 0 or z >= 1: 29 | continue 30 | gt_bev_featuremap[i, y, x, :] += gt_features[i, j, :] 31 | gt_bev_featuremap = gt_bev_featuremap.permute(0, 3, 1, 2).cuda() 32 | bev_featuremap = voxel_pooling( 33 | geom_xyz.cuda().int(), features.cuda(), 34 | torch.tensor([128, 128, 1], dtype=torch.int, device='cuda')) 35 | assert torch.allclose(gt_bev_featuremap.cuda(), 36 | bev_featuremap, 37 | rtol=1e-3) 38 | -------------------------------------------------------------------------------- /test/test_layers/test_backbone.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import torch 5 | 6 | from layers.backbones.lss_fpn import LSSFPN 7 | 8 | 9 | class TestLSSFPN(unittest.TestCase): 10 | def setUp(self) -> None: 11 | backbone_conf = { 12 | 'x_bound': [-10, 10, 0.5], 13 | 'y_bound': [-10, 10, 0.5], 14 | 'z_bound': [-5, 3, 8], 15 | 'd_bound': [2.0, 22, 1.0], 16 | 'final_dim': [64, 64], 17 | 'output_channels': 18 | 10, 19 | 'downsample_factor': 20 | 16, 21 | 'img_backbone_conf': 22 | dict(type='ResNet', 23 | depth=18, 24 | frozen_stages=0, 25 | out_indices=[0, 1, 2, 3], 26 | 
norm_eval=False, 27 | base_channels=8), 28 | 'img_neck_conf': 29 | dict( 30 | type='SECONDFPN', 31 | in_channels=[8, 16, 32, 64], 32 | upsample_strides=[0.25, 0.5, 1, 2], 33 | out_channels=[16, 16, 16, 16], 34 | ), 35 | 'depth_net_conf': 36 | dict(in_channels=64, mid_channels=64), 37 | } 38 | self.lss_fpn = LSSFPN(**backbone_conf).cuda() 39 | 40 | @pytest.mark.skipif(torch.cuda.is_available() is False, 41 | reason='No gpu available.') 42 | def test_forward(self): 43 | sweep_imgs = torch.rand(2, 2, 6, 3, 64, 64).cuda() 44 | sensor2ego_mats = torch.rand(2, 2, 6, 4, 4).cuda() 45 | intrin_mats = torch.rand(2, 2, 6, 4, 4).cuda() 46 | ida_mats = torch.rand(2, 2, 6, 4, 4).cuda() 47 | sensor2sensor_mats = torch.rand(2, 2, 6, 4, 4).cuda() 48 | bda_mat = torch.rand(2, 4, 4).cuda() 49 | mats_dict = dict() 50 | mats_dict['sensor2ego_mats'] = sensor2ego_mats 51 | mats_dict['intrin_mats'] = intrin_mats 52 | mats_dict['ida_mats'] = ida_mats 53 | mats_dict['sensor2sensor_mats'] = sensor2sensor_mats 54 | mats_dict['bda_mat'] = bda_mat 55 | preds = self.lss_fpn.forward(sweep_imgs, mats_dict) 56 | assert preds.shape == torch.Size([2, 20, 40, 40]) 57 | -------------------------------------------------------------------------------- /ops/voxel_pooling/src/voxel_pooling_forward.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Megvii Inc. All rights reserved. 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | extern THCState *state; 11 | 12 | #define CHECK_CUDA(x) \ 13 | TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") 14 | #define CHECK_CONTIGUOUS(x) \ 15 | TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") 16 | #define CHECK_INPUT(x) \ 17 | CHECK_CUDA(x); \ 18 | CHECK_CONTIGUOUS(x) 19 | 20 | int voxel_pooling_forward_wrapper(int batch_size, int num_points, int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z, at::Tensor geom_xyz_tensor, 21 | at::Tensor input_features_tensor, at::Tensor output_features_tensor, at::Tensor pos_memo_tensor); 22 | 23 | void voxel_pooling_forward_kernel_launcher(int batch_size, int num_points, int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z, const int *geom_xyz, const float *input_features, 24 | float *output_features, int *pos_memo, cudaStream_t stream); 25 | 26 | int voxel_pooling_forward_wrapper(int batch_size, int num_points, int num_channels, int num_voxel_x, int num_voxel_y, int num_voxel_z, at::Tensor geom_xyz_tensor, 27 | at::Tensor input_features_tensor, at::Tensor output_features_tensor, at::Tensor pos_memo_tensor) { 28 | CHECK_INPUT(geom_xyz_tensor); 29 | CHECK_INPUT(input_features_tensor); 30 | const int *geom_xyz = geom_xyz_tensor.data_ptr(); 31 | const float *input_features = input_features_tensor.data_ptr(); 32 | float *output_features = output_features_tensor.data_ptr(); 33 | int *pos_memo = pos_memo_tensor.data_ptr(); 34 | 35 | cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream(); 36 | voxel_pooling_forward_kernel_launcher(batch_size, num_points, num_channels, num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, input_features, 37 | output_features, pos_memo, stream); 38 | return 1; 39 | } 40 | 41 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 42 | m.def("voxel_pooling_forward_wrapper", &voxel_pooling_forward_wrapper, "voxel_pooling_forward_wrapper"); 43 | } 44 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
 1 | # BEV-SAN: Accurate BEV 3D Object Detection via Slice Attention Networks
 2 | The official release of BEV-SAN is now available.
 3 | 
 4 | ![Python 3.7](https://img.shields.io/badge/Python-3.7-red)
 5 | [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2212.01231)
 6 | 
 7 | 
 8 | 
 9 | ## Updates!!
10 | * 【2023/10/24】 We have published the code base of our work. Updates are coming soon.
11 | * 【2023/02/24】 Our work has been accepted by CVPR 2023.
12 | ## Quick Start
13 | ### Installation
14 | **Step 0.** Install [PyTorch](https://pytorch.org/) (v1.9.0).
15 | 
16 | **Step 1.** Install [MMDetection3D](https://github.com/open-mmlab/mmdetection3d) (v1.0.0rc4).
17 | 
18 | **Step 2.** Install requirements.
19 | ```shell
20 | pip install -r requirements.txt
21 | ```
22 | **Step 3.** Install BEVDepth (GPU required).
23 | ```shell
24 | python setup.py develop
25 | ```
26 | 
27 | ### Data Preparation
28 | **Step 0.** Download the official nuScenes dataset.
29 | 
30 | **Step 1.** Symlink the dataset root to `./data/`.
31 | ```
32 | ln -s [nuscenes root] ./data/
33 | ```
34 | The directory will then look as follows.
35 | ```
36 | BEVDepth
37 | ├── data
38 | │   ├── nuScenes
39 | │   │   ├── maps
40 | │   │   ├── samples
41 | │   │   ├── sweeps
42 | │   │   ├── v1.0-test
43 | │   │   ├── v1.0-trainval
44 | ```
45 | **Step 2.** Prepare infos.
46 | ```
47 | python scripts/gen_info.py
48 | ```
49 | **Step 3.** Prepare depth ground truth.
50 | ```
51 | python scripts/gen_depth_gt.py
52 | ```
53 | 
54 | ### Tutorials
55 | **Train.**
56 | ```
57 | python [EXP_PATH] --amp_backend native -b 8 --gpus 8
58 | ```
59 | **Eval.**
60 | ```
61 | python [EXP_PATH] --ckpt_path [CKPT_PATH] -e -b 8 --gpus 8
62 | ```
63 | 
64 | 
65 | 
66 | ## Cite BEV-SAN
67 | If you use BEV-SAN in your research, please cite our work with the following BibTeX entry:
68 | 
69 | ```latex
70 | @misc{chi2022bevsan,
71 |       title={BEV-SAN: Accurate BEV 3D Object Detection via Slice Attention Networks},
72 |       author={Xiaowei Chi and Jiaming Liu and Ming Lu and Rongyu Zhang and Zhaoqing Wang and Yandong Guo and Shanghang Zhang},
73 |       year={2022},
74 |       eprint={2212.01231},
75 |       archivePrefix={arXiv},
76 |       primaryClass={cs.CV}
77 | }
78 | ```
79 | 
80 | ## Thanks
81 | Our code is based on [BEVDepth](https://github.com/Megvii-BaseDetection/BEVDepth).
82 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | from setuptools import find_packages, setup
 5 | from torch.utils.cpp_extension import (BuildExtension, CppExtension,
 6 |                                        CUDAExtension)
 7 | 
 8 | with open('README.md', 'r') as fh:
 9 |     long_description = fh.read()
10 | 
11 | 
12 | def make_cuda_ext(name,
13 |                   module,
14 |                   sources,
15 |                   sources_cuda=[],
16 |                   extra_args=[],
17 |                   extra_include_path=[]):
18 | 
19 |     define_macros = []
20 |     extra_compile_args = {'cxx': [] + extra_args}
21 | 
22 |     if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
23 |         define_macros += [('WITH_CUDA', None)]
24 |         extension = CUDAExtension
25 |         extra_compile_args['nvcc'] = extra_args + [
26 |             '-D__CUDA_NO_HALF_OPERATORS__',
27 |             '-D__CUDA_NO_HALF_CONVERSIONS__',
28 |             '-D__CUDA_NO_HALF2_OPERATORS__',
29 |         ]
30 |         sources += sources_cuda
31 |     else:
32 |         print('Compiling {} without CUDA'.format(name))
33 |         extension = CppExtension
34 |         # raise EnvironmentError('CUDA is required to compile MMDetection!')
35 | 
36 |     return
extension( 37 | name='{}.{}'.format(module, name), 38 | sources=[os.path.join(*module.split('.'), p) for p in sources], 39 | include_dirs=extra_include_path, 40 | define_macros=define_macros, 41 | extra_compile_args=extra_compile_args, 42 | ) 43 | 44 | 45 | setup( 46 | name='BEVDepth', 47 | version='0.0.1', 48 | author='Megvii', 49 | author_email='liyinhao@megvii.com', 50 | description='Code for BEVDepth', 51 | long_description=long_description, 52 | long_description_content_type='text/markdown', 53 | url=None, 54 | packages=find_packages(), 55 | classifiers=[ 56 | 'Programming Language :: Python :: 3', 57 | 'Operating System :: OS Independent', 58 | ], 59 | install_requires=[], 60 | ext_modules=[ 61 | make_cuda_ext( 62 | name='voxel_pooling_ext', 63 | module='ops.voxel_pooling', 64 | sources=['src/voxel_pooling_forward.cpp'], 65 | sources_cuda=['src/voxel_pooling_forward_cuda.cu'], 66 | ), 67 | ], 68 | cmdclass={'build_ext': BuildExtension}, 69 | ) 70 | -------------------------------------------------------------------------------- /test/test_dataset/test_nusc_mv_det_dataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from dataset.nusc_mv_det_dataset import NuscMVDetDataset 7 | 8 | CLASSES = [ 9 | 'car', 10 | 'truck', 11 | 'construction_vehicle', 12 | 'bus', 13 | 'trailer', 14 | 'barrier', 15 | 'motorcycle', 16 | 'bicycle', 17 | 'pedestrian', 18 | 'traffic_cone', 19 | ] 20 | H = 900 21 | W = 1600 22 | final_dim = (256, 704) 23 | img_conf = dict(img_mean=[123.675, 116.28, 103.53], 24 | img_std=[58.395, 57.12, 57.375], 25 | to_rgb=True) 26 | ida_aug_conf = { 27 | 'resize_lim': (0.4, 0.4), 28 | 'final_dim': 29 | final_dim, 30 | 'rot_lim': (0, 0), 31 | 'H': 32 | H, 33 | 'W': 34 | W, 35 | 'rand_flip': 36 | True, 37 | 'bot_pct_lim': (0.0, 0.0), 38 | 'cams': [ 39 | 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 40 | 'CAM_BACK', 'CAM_BACK_RIGHT' 41 | ], 42 | 'Ncams': 43 | 6, 44 | } 45 | 46 | bda_aug_conf = { 47 | 'rot_lim': (0, 0), 48 | 'scale_lim': (1, 1), 49 | 'flip_dx_ratio': 0, 50 | 'flip_dy_ratio': 0 51 | } 52 | 53 | 54 | class TestNuscMVDetData(unittest.TestCase): 55 | def test_voxel_pooling(self): 56 | np.random.seed(0) 57 | torch.random.manual_seed(0) 58 | nusc = NuscMVDetDataset(ida_aug_conf, 59 | bda_aug_conf, 60 | CLASSES, 61 | './test/data/nuscenes', 62 | './test/data/nuscenes/infos.pkl', 63 | True, 64 | sweep_idxes=[4]) 65 | ret_list = nusc[0] 66 | assert torch.isclose(ret_list[0].mean(), 67 | torch.tensor(-0.4667), 68 | rtol=1e-3) 69 | assert torch.isclose(ret_list[1].mean(), 70 | torch.tensor(0.1678), 71 | rtol=1e-3) 72 | assert torch.isclose(ret_list[2].mean(), 73 | torch.tensor(230.0464), 74 | rtol=1e-3) 75 | assert torch.isclose(ret_list[3].mean(), 76 | torch.tensor(8.3250), 77 | rtol=1e-3) 78 | assert torch.isclose(ret_list[4].mean(), torch.tensor(0.25), rtol=1e-3) 79 | assert torch.isclose(ret_list[5].mean(), torch.tensor(0.25), rtol=1e-3) 80 | -------------------------------------------------------------------------------- /ops/voxel_pooling/src/voxel_pooling_forward_cuda.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Megvii Inc. All rights reserved. 
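// Sum-pools point features into a BEV grid: every point whose integer (x, y, z) coordinate
// falls inside the voxel grid adds its feature vector to output cell (batch, y, x) via
// atomicAdd, and records that cell in pos_memo so voxel_pooling.py can route gradients back.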
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <cuda_runtime.h>
 5 | 
 6 | #define THREADS_PER_BLOCK 256
 7 | #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 8 | 
 9 | __global__ void voxel_pooling_forward_kernel(int batch_size, int num_points, int num_channels, int num_voxel_x,
10 |                                              int num_voxel_y, int num_voxel_z, const int *geom_xyz,
11 |                                              const float *input_features, float *output_features, int *pos_memo) {
12 |   // Each thread processes one point and accumulates all of its channels into the BEV grid.
13 |   int blk_idx = blockIdx.x;
14 |   int thd_idx = threadIdx.x;
15 |   int pt_idx = blk_idx * blockDim.x + thd_idx;
16 |   if (pt_idx >= batch_size * num_points) {
17 |     return;
18 |   } else {
19 |     int batch_idx = pt_idx / num_points;
20 |     int x = geom_xyz[pt_idx * 3];
21 |     int y = geom_xyz[pt_idx * 3 + 1];
22 |     int z = geom_xyz[pt_idx * 3 + 2];
23 |     // If the coord of the current voxel is out of boundary, return.
24 |     if (x < 0 || x >= num_voxel_x || y < 0 || y >= num_voxel_y || z < 0 || z >= num_voxel_z) {
25 |       return;
26 |     }
27 |     pos_memo[pt_idx * 3] = batch_idx;
28 |     pos_memo[pt_idx * 3 + 1] = y;
29 |     pos_memo[pt_idx * 3 + 2] = x;
30 |     for (int channel_idx = 0; channel_idx < num_channels; channel_idx++) {
31 |       atomicAdd(
32 |           &output_features[(batch_idx * num_voxel_y * num_voxel_x + y * num_voxel_x + x) * num_channels + channel_idx],
33 |           input_features[pt_idx * num_channels + channel_idx]);
34 |     }
35 |   }
36 | }
37 | 
38 | void voxel_pooling_forward_kernel_launcher(int batch_size, int num_points, int num_channels, int num_voxel_x,
39 |                                            int num_voxel_y, int num_voxel_z, const int *geom_xyz,
40 |                                            const float *input_features, float *output_features, int *pos_memo,
41 |                                            cudaStream_t stream) {
42 |   cudaError_t err;
43 | 
44 |   dim3 blocks(DIVUP(batch_size * num_points, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
45 |   dim3 threads(THREADS_PER_BLOCK);
46 | 
47 |   voxel_pooling_forward_kernel<<<blocks, threads, 0, stream>>>(batch_size, num_points, num_channels, num_voxel_x,
48 |                                                                num_voxel_y, num_voxel_z, geom_xyz, input_features,
49 |                                                                output_features, pos_memo);
50 |   // cudaDeviceSynchronize();  // for using printf in kernel function
51 |   err = cudaGetLastError();
52 |   if (cudaSuccess != err) {
53 |     fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
54 |     exit(-1);
55 |   }
56 | }
57 | 
--------------------------------------------------------------------------------
/ops/voxel_pooling/voxel_pooling.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Megvii Inc. All rights reserved.
 2 | import torch
 3 | from torch.autograd import Function
 4 | 
 5 | from . import voxel_pooling_ext
 6 | 
 7 | 
 8 | class VoxelPooling(Function):
 9 |     @staticmethod
10 |     def forward(ctx, geom_xyz: torch.Tensor, input_features: torch.Tensor,
11 |                 voxel_num: torch.Tensor) -> torch.Tensor:
12 |         """Forward function for `voxel_pooling`.
13 | 
14 |         Args:
15 |             geom_xyz (Tensor): xyz coord for each voxel with the shape
16 |                 of [B, N, 3].
17 |             input_features (Tensor): feature for each voxel with the
18 |                 shape of [B, N, C].
19 |             voxel_num (Tensor): Number of voxels for each dim with the
20 |                 shape of [3].
21 | 
22 |         Returns:
23 |             Tensor: (B, C, H, W) bev feature map.
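
        Example (a minimal sketch mirroring test/test_ops/test_voxel_pooling.py;
        shapes are illustrative and a CUDA device is assumed, since the op is CUDA-only):
            geom_xyz = torch.randint(0, 128, (2, 6000, 3),
                                     dtype=torch.int, device='cuda')
            geom_xyz[..., 2] = 0  # single z slice, matching voxel_num[2] == 1
            features = torch.rand(2, 6000, 80, device='cuda')
            voxel_num = torch.tensor([128, 128, 1], dtype=torch.int,
                                     device='cuda')
            bev = voxel_pooling(geom_xyz, features, voxel_num)
            # bev.shape == (2, 80, 128, 128)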
24 | """ 25 | assert geom_xyz.is_contiguous() 26 | assert input_features.is_contiguous() 27 | # no gradient for input_features and geom_feats 28 | ctx.mark_non_differentiable(geom_xyz) 29 | grad_input_features = torch.zeros_like(input_features) 30 | geom_xyz = geom_xyz.reshape(geom_xyz.shape[0], -1, geom_xyz.shape[-1]) 31 | input_features = input_features.reshape( 32 | (geom_xyz.shape[0], -1, input_features.shape[-1])) 33 | assert geom_xyz.shape[1] == input_features.shape[1] 34 | batch_size = input_features.shape[0] 35 | num_points = input_features.shape[1] 36 | num_channels = input_features.shape[2] 37 | output_features = input_features.new_zeros(batch_size, voxel_num[1], 38 | voxel_num[0], num_channels) 39 | # Save the position of bev_feature_map for each input point. 40 | pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1 41 | voxel_pooling_ext.voxel_pooling_forward_wrapper( 42 | batch_size, 43 | num_points, 44 | num_channels, 45 | voxel_num[0], 46 | voxel_num[1], 47 | voxel_num[2], 48 | geom_xyz, 49 | input_features, 50 | output_features, 51 | pos_memo, 52 | ) 53 | # save grad_input_features and pos_memo for backward 54 | ctx.save_for_backward(grad_input_features, pos_memo) 55 | return output_features.permute(0, 3, 1, 2) 56 | 57 | @staticmethod 58 | def backward(ctx, grad_output_features): 59 | (grad_input_features, pos_memo) = ctx.saved_tensors 60 | kept = (pos_memo != -1)[..., 0] 61 | grad_input_features_shape = grad_input_features.shape 62 | grad_input_features = grad_input_features.reshape( 63 | grad_input_features.shape[0], -1, grad_input_features.shape[-1]) 64 | grad_input_features[kept] = grad_output_features[ 65 | pos_memo[kept][..., 0].long(), :, pos_memo[kept][..., 1].long(), 66 | pos_memo[kept][..., 2].long()] 67 | grad_input_features = grad_input_features.reshape( 68 | grad_input_features_shape) 69 | return None, grad_input_features, None 70 | 71 | 72 | voxel_pooling = VoxelPooling.apply 73 | -------------------------------------------------------------------------------- /exps/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da_ema.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 
2 | """ 3 | mAP: 0.3589 4 | mATE: 0.6119 5 | mASE: 0.2692 6 | mAOE: 0.5074 7 | mAVE: 0.4086 8 | mAAE: 0.2009 9 | NDS: 0.4797 10 | Eval time: 183.3s 11 | Per-class results: 12 | Object Class AP ATE ASE AOE AVE AAE 13 | car 0.559 0.475 0.157 0.112 0.370 0.205 14 | truck 0.270 0.659 0.196 0.103 0.356 0.181 15 | bus 0.374 0.651 0.184 0.072 0.846 0.326 16 | trailer 0.179 0.963 0.227 0.512 0.294 0.127 17 | construction_vehicle 0.081 0.825 0.481 1.352 0.094 0.345 18 | pedestrian 0.363 0.690 0.297 0.831 0.491 0.244 19 | motorcycle 0.354 0.580 0.255 0.545 0.615 0.164 20 | bicycle 0.301 0.447 0.280 0.920 0.203 0.015 21 | traffic_cone 0.539 0.435 0.324 nan nan nan 22 | barrier 0.569 0.394 0.293 0.120 nan nan 23 | """ 24 | from argparse import ArgumentParser, Namespace 25 | 26 | import pytorch_lightning as pl 27 | import torch 28 | 29 | from callbacks.ema import EMACallback 30 | from exps.bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da import \ 31 | BEVDepthLightningModel as BaseBEVDepthLightningModel 32 | 33 | 34 | class BEVDepthLightningModel(BaseBEVDepthLightningModel): 35 | def __init__(self, **kwargs): 36 | super().__init__(**kwargs) 37 | self.data_use_cbgs = True 38 | 39 | def configure_optimizers(self): 40 | lr = self.basic_lr_per_img * \ 41 | self.batch_size_per_device * self.gpus 42 | optimizer = torch.optim.AdamW(self.model.parameters(), 43 | lr=lr, 44 | weight_decay=1e-2) 45 | return [optimizer] 46 | 47 | 48 | def main(args: Namespace) -> None: 49 | if args.seed is not None: 50 | pl.seed_everything(args.seed) 51 | 52 | model = BEVDepthLightningModel(**vars(args)) 53 | train_dataloader = model.train_dataloader() 54 | ema_callback = EMACallback(len(train_dataloader.dataset) * args.max_epochs) 55 | trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback]) 56 | if args.evaluate: 57 | trainer.test(model, ckpt_path=args.ckpt_path) 58 | else: 59 | trainer.fit(model) 60 | 61 | 62 | def run_cli(): 63 | parent_parser = ArgumentParser(add_help=False) 64 | parent_parser = pl.Trainer.add_argparse_args(parent_parser) 65 | parent_parser.add_argument('-e', 66 | '--evaluate', 67 | dest='evaluate', 68 | action='store_true', 69 | help='evaluate model on validation set') 70 | parent_parser.add_argument('-b', '--batch_size_per_device', type=int) 71 | parent_parser.add_argument('--seed', 72 | type=int, 73 | default=0, 74 | help='seed for initializing training.') 75 | parent_parser.add_argument('--ckpt_path', type=str) 76 | parser = BEVDepthLightningModel.add_model_specific_args(parent_parser) 77 | parser.set_defaults(profiler='simple', 78 | deterministic=False, 79 | max_epochs=20, 80 | accelerator='ddp', 81 | num_sanity_val_steps=0, 82 | gradient_clip_val=5, 83 | limit_val_batches=0, 84 | enable_checkpointing=False, 85 | precision=16, 86 | default_root_dir='./outputs/bev_depth_lss_r50_' 87 | '256x704_128x128_20e_cbgs_2key_da_ema') 88 | args = parser.parse_args() 89 | main(args) 90 | 91 | 92 | if __name__ == '__main__': 93 | run_cli() 94 | -------------------------------------------------------------------------------- /exps/bev_depth_lss_r50_256x704_128x128_24e_2key.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 
2 | """ 3 | mAP: 0.3304 4 | mATE: 0.7021 5 | mASE: 0.2795 6 | mAOE: 0.5346 7 | mAVE: 0.5530 8 | mAAE: 0.2274 9 | NDS: 0.4355 10 | Eval time: 171.8s 11 | 12 | Per-class results: 13 | Object Class AP ATE ASE AOE AVE AAE 14 | car 0.499 0.540 0.165 0.211 0.650 0.233 15 | truck 0.278 0.719 0.218 0.265 0.547 0.215 16 | bus 0.386 0.661 0.211 0.171 1.132 0.274 17 | trailer 0.168 1.034 0.235 0.548 0.408 0.168 18 | construction_vehicle 0.075 1.124 0.510 1.177 0.111 0.385 19 | pedestrian 0.284 0.757 0.298 0.966 0.578 0.301 20 | motorcycle 0.335 0.624 0.263 0.621 0.734 0.237 21 | bicycle 0.305 0.554 0.264 0.653 0.263 0.006 22 | traffic_cone 0.462 0.516 0.355 nan nan nan 23 | barrier 0.512 0.491 0.275 0.200 nan nan 24 | """ 25 | from argparse import ArgumentParser, Namespace 26 | 27 | import pytorch_lightning as pl 28 | 29 | from callbacks.ema import EMACallback 30 | from exps.bev_depth_lss_r50_256x704_128x128_24e import \ 31 | BEVDepthLightningModel as BaseBEVDepthLightningModel 32 | # from models.bev_depth import BEVDepth 33 | from models.mh_depth import BEVDepth 34 | 35 | 36 | class BEVDepthLightningModel(BaseBEVDepthLightningModel): 37 | def __init__(self, **kwargs): 38 | super().__init__(**kwargs) 39 | self.key_idxes = [-1] 40 | self.head_conf['bev_backbone_conf']['in_channels'] = 80 * ( 41 | len(self.key_idxes) + 1) 42 | self.head_conf['bev_neck_conf']['in_channels'] = [ 43 | 80 * (len(self.key_idxes) + 1), 160, 320, 640 44 | ] 45 | self.head_conf['train_cfg']['code_weight'] = [ 46 | 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 47 | ] 48 | self.model = BEVDepth(self.backbone_conf, 49 | self.head_conf, 50 | is_train_depth=True) 51 | 52 | 53 | def main(args: Namespace) -> None: 54 | if args.seed is not None: 55 | pl.seed_everything(args.seed) 56 | 57 | model = BEVDepthLightningModel(**vars(args)) 58 | train_dataloader = model.train_dataloader() 59 | ema_callback = EMACallback(len(train_dataloader.dataset) * args.max_epochs) 60 | trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback]) 61 | if args.evaluate: 62 | trainer.test(model, ckpt_path=args.ckpt_path) 63 | else: 64 | trainer.fit(model) 65 | 66 | 67 | def run_cli(): 68 | parent_parser = ArgumentParser(add_help=False) 69 | parent_parser = pl.Trainer.add_argparse_args(parent_parser) 70 | parent_parser.add_argument('-e', 71 | '--evaluate', 72 | dest='evaluate', 73 | action='store_true', 74 | help='evaluate model on validation set') 75 | parent_parser.add_argument('-b', '--batch_size_per_device', type=int) 76 | parent_parser.add_argument('--seed', 77 | type=int, 78 | default=0, 79 | help='seed for initializing training.') 80 | parent_parser.add_argument('--ckpt_path', type=str) 81 | parser = BEVDepthLightningModel.add_model_specific_args(parent_parser) 82 | parser.set_defaults( 83 | profiler='simple', 84 | deterministic=False, 85 | max_epochs=24, 86 | accelerator='ddp', 87 | num_sanity_val_steps=0, 88 | gradient_clip_val=5, 89 | limit_val_batches=0, 90 | enable_checkpointing=True, 91 | precision=16, 92 | default_root_dir='./outputs/bev_depth_lss_r50_256x704_128x128_24e_2key' 93 | ) 94 | args = parser.parse_args() 95 | main(args) 96 | 97 | 98 | if __name__ == '__main__': 99 | run_cli() 100 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Linux ### 2 | *~ 3 | 4 | # temporary files which can be created if a process still has a handle open of a deleted file 5 | .fuse_hidden* 6 | 7 | # KDE directory 
preferences 8 | .directory 9 | 10 | # Linux trash folder which might appear on any partition or disk 11 | .Trash-* 12 | 13 | # .nfs files are created when an open file is removed but is still being accessed 14 | .nfs* 15 | 16 | ### PyCharm ### 17 | # User-specific stuff 18 | .idea 19 | 20 | # CMake 21 | cmake-build-*/ 22 | 23 | # Mongo Explorer plugin 24 | .idea/**/mongoSettings.xml 25 | 26 | # File-based project format 27 | *.iws 28 | 29 | # IntelliJ 30 | out/ 31 | 32 | # mpeltonen/sbt-idea plugin 33 | .idea_modules/ 34 | 35 | # JIRA plugin 36 | atlassian-ide-plugin.xml 37 | 38 | # Cursive Clojure plugin 39 | .idea/replstate.xml 40 | 41 | # Crashlytics plugin (for Android Studio and IntelliJ) 42 | com_crashlytics_export_strings.xml 43 | crashlytics.properties 44 | crashlytics-build.properties 45 | fabric.properties 46 | 47 | # Editor-based Rest Client 48 | .idea/httpRequests 49 | 50 | # Android studio 3.1+ serialized cache file 51 | .idea/caches/build_file_checksums.ser 52 | 53 | # JetBrains templates 54 | **___jb_tmp___ 55 | 56 | ### Python ### 57 | # Byte-compiled / optimized / DLL files 58 | __pycache__/ 59 | *.py[cod] 60 | *$py.class 61 | 62 | # C extensions 63 | *.so 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | dist/ 70 | downloads/ 71 | eggs/ 72 | .eggs/ 73 | lib/ 74 | lib64/ 75 | parts/ 76 | sdist/ 77 | var/ 78 | wheels/ 79 | pip-wheel-metadata/ 80 | share/python-wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | MANIFEST 85 | 86 | # PyInstaller 87 | # Usually these files are written by a python script from a template 88 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 89 | *.manifest 90 | *.spec 91 | 92 | # Installer logs 93 | pip-log.txt 94 | pip-delete-this-directory.txt 95 | 96 | # Unit test / coverage reports 97 | htmlcov/ 98 | .tox/ 99 | .nox/ 100 | .coverage 101 | .coverage.* 102 | .cache 103 | nosetests.xml 104 | coverage.xml 105 | *.cover 106 | .hypothesis/ 107 | .pytest_cache/ 108 | 109 | # Translations 110 | *.mo 111 | *.pot 112 | 113 | # Django stuff: 114 | *.log 115 | local_settings.py 116 | db.sqlite3 117 | 118 | # Flask stuff: 119 | instance/ 120 | .webassets-cache 121 | 122 | # Scrapy stuff: 123 | .scrapy 124 | 125 | # Sphinx documentation 126 | docs/_build/ 127 | docs/build/ 128 | 129 | # PyBuilder 130 | target/ 131 | 132 | # Jupyter Notebook 133 | .ipynb_checkpoints 134 | 135 | # IPython 136 | profile_default/ 137 | ipython_config.py 138 | 139 | # pyenv 140 | .python-version 141 | 142 | # pipenv 143 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 144 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 145 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 146 | # install all needed dependencies. 
147 | #Pipfile.lock 148 | 149 | # celery beat schedule file 150 | celerybeat-schedule 151 | 152 | # SageMath parsed files 153 | *.sage.py 154 | 155 | # Environments 156 | .env 157 | .venv 158 | env/ 159 | venv/ 160 | ENV/ 161 | env.bak/ 162 | venv.bak/ 163 | 164 | # Spyder project settings 165 | .spyderproject 166 | .spyproject 167 | 168 | # Rope project settings 169 | .ropeproject 170 | 171 | # mkdocs documentation 172 | /site 173 | 174 | # mypy 175 | .mypy_cache/ 176 | .dmypy.json 177 | dmypy.json 178 | 179 | # Pyre type checker 180 | .pyre/ 181 | 182 | ### Vim ### 183 | # Swap 184 | [._]*.s[a-v][a-z] 185 | [._]*.sw[a-p] 186 | [._]s[a-rt-v][a-z] 187 | [._]ss[a-gi-z] 188 | [._]sw[a-p] 189 | 190 | # Session 191 | Session.vim 192 | 193 | # Temporary 194 | .netrwhist 195 | # Auto-generated tag files 196 | tags 197 | # Persistent undo 198 | [._]*.un~ 199 | 200 | ### Researcher ### 201 | # output 202 | train_log 203 | docs/api 204 | .code-workspace.code-workspace 205 | output 206 | outputs 207 | instant_test_output 208 | inference_test_output 209 | *.pkl 210 | *.npy 211 | *.pth 212 | events.out.tfevents* 213 | 214 | # vscode 215 | *.code-workspace 216 | .vscode 217 | 218 | # vim 219 | .vim 220 | -------------------------------------------------------------------------------- /layers/voxel_mix.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | class SELayer(nn.Module): 5 | def __init__(self, channel, reduction=16): 6 | super(SELayer, self).__init__() 7 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 8 | self.fc = nn.Sequential( 9 | nn.Linear(channel, channel // reduction, bias=False), 10 | nn.ReLU(inplace=True), 11 | nn.Linear(channel // reduction, channel, bias=False), 12 | nn.Sigmoid() 13 | ) 14 | def forward(self, x): 15 | b, c, _, _ = x.size() 16 | y = self.avg_pool(x).view(b, c) 17 | y = self.fc(y).view(b, c, 1, 1) 18 | return x * y.expand_as(x) 19 | 20 | class voxel_mix_net(nn.Module): 21 | # [4,240,128,128] -> [4,80,128,128] 22 | def __init__(self): 23 | super(voxel_mix_net, self).__init__() 24 | in_channels = 80*6 25 | out_channels = 80 26 | local_global_channels = 80*3 27 | mix_channels = 80*2 28 | stride = 1 29 | self.se = SELayer(in_channels) 30 | self.residual_function = nn.Sequential( 31 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 32 | nn.BatchNorm2d(out_channels), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False), 35 | nn.BatchNorm2d(out_channels) 36 | ) 37 | self.shortcut = nn.Sequential( 38 | nn.Conv2d(in_channels, out_channels, stride=stride, kernel_size=1, bias=False), 39 | nn.BatchNorm2d(out_channels) 40 | ) 41 | self.relu = nn.ReLU(inplace=True) 42 | 43 | self.gl_se = SELayer(local_global_channels) 44 | self.gl_residual = nn.Sequential( 45 | nn.Conv2d(local_global_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 46 | nn.BatchNorm2d(out_channels), 47 | nn.ReLU(inplace=True), 48 | nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False), 49 | nn.BatchNorm2d(out_channels) 50 | ) 51 | self.gl_shortcut = nn.Sequential( 52 | nn.Conv2d(local_global_channels, out_channels, stride=stride, kernel_size=1, bias=False), 53 | nn.BatchNorm2d(out_channels) 54 | ) 55 | self.relu2 = nn.ReLU(inplace=True) 56 | 57 | # self.mix_se = SELayer(mix_channels) 58 | # self.mix_residual = nn.Sequential( 59 | # nn.Conv2d(mix_channels, out_channels, kernel_size=3, 
stride=stride, padding=1, bias=False), 60 | # nn.BatchNorm2d(out_channels), 61 | # nn.ReLU(inplace=True), 62 | # nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False), 63 | # nn.BatchNorm2d(out_channels) 64 | # ) 65 | # self.mix_shortcut = nn.Sequential( 66 | # nn.Conv2d(mix_channels, out_channels, stride=stride, kernel_size=1, bias=False), 67 | # nn.BatchNorm2d(out_channels) 68 | # ) 69 | # self.relu3 = nn.ReLU(inplace=True) 70 | 71 | def forward(self, input): 72 | local_depth = input[:,:-80*3,:,:] 73 | lc_feature = self.se(local_depth) 74 | lc_feature = self.relu(self.residual_function(lc_feature) + self.shortcut(lc_feature)) 75 | 76 | glob_depth = input[:,-80*3:,:,:] 77 | gl_feature = self.gl_se(glob_depth) 78 | gl_feature = self.relu2(self.gl_residual(gl_feature) + self.gl_shortcut(gl_feature)) 79 | 80 | # mix_feature = torch.cat([lc_feature,gl_feature],dim=1) 81 | # result = self.mix_se(mix_feature) 82 | # result = self.relu2(self.mix_residual(result) + self.mix_shortcut(mix_feature)) 83 | # return result 84 | gl_feature = self.gl_se(glob_depth) 85 | gl_feature = self.relu2(self.gl_residual(gl_feature) + self.gl_shortcut(gl_feature)) 86 | 87 | return gl_feature 88 | 89 | if __name__ == '__main__': 90 | 91 | mixer = voxel_mix_net() 92 | voxel_feature = torch.randn((4, 80, 128,128)) 93 | mix_feature = torch.cat([voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature,voxel_feature],dim=1) 94 | out = mixer(mix_feature) 95 | out = out.type(torch.HalfTensor) 96 | exit(0) 97 | -------------------------------------------------------------------------------- /models/bev_depth.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from layers.backbones.lss_fpn import LSSFPN 4 | from layers.heads.bev_depth_head import BEVDepthHead 5 | 6 | __all__ = ['BEVDepth'] 7 | 8 | 9 | class BEVDepth(nn.Module): 10 | """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`. 11 | 12 | Args: 13 | backbone_conf (dict): Config of backbone. 14 | head_conf (dict): Config of head. 15 | is_train_depth (bool): Whether to return depth. 16 | Default: False. 17 | """ 18 | 19 | # TODO: Reduce grid_conf and data_aug_conf 20 | def __init__(self, backbone_conf, head_conf, is_train_depth=False): 21 | super(BEVDepth, self).__init__() 22 | self.backbone = LSSFPN(**backbone_conf) 23 | self.head = BEVDepthHead(**head_conf) 24 | self.is_train_depth = is_train_depth 25 | 26 | def forward( 27 | self, 28 | x, 29 | mats_dict, 30 | timestamps=None, 31 | ): 32 | """Forward function for BEVDepth 33 | 34 | Args: 35 | x (Tensor): Input ferature map. 36 | mats_dict(dict): 37 | sensor2ego_mats(Tensor): Transformation matrix from 38 | camera to ego with shape of (B, num_sweeps, 39 | num_cameras, 4, 4). 40 | intrin_mats(Tensor): Intrinsic matrix with shape 41 | of (B, num_sweeps, num_cameras, 4, 4). 42 | ida_mats(Tensor): Transformation matrix for ida with 43 | shape of (B, num_sweeps, num_cameras, 4, 4). 44 | sensor2sensor_mats(Tensor): Transformation matrix 45 | from key frame camera to sweep frame camera with 46 | shape of (B, num_sweeps, num_cameras, 4, 4). 47 | bda_mat(Tensor): Rotation matrix for bda with shape 48 | of (B, 4, 4). 49 | timestamps (long): Timestamp. 50 | Default: None. 51 | 52 | Returns: 53 | tuple(list[dict]): Output results for tasks. 
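
        Example (a minimal sketch; tensor shapes follow
        test/test_layers/test_backbone.py and the 256x704 configs under exps/,
        and backbone_conf/head_conf are assumed to be filled in as in those files):
            model = BEVDepth(backbone_conf, head_conf)
            sweep_imgs = torch.rand(2, 2, 6, 3, 256, 704)  # (B, sweeps, cams, 3, H, W)
            mats_dict = {k: torch.rand(2, 2, 6, 4, 4) for k in
                         ('sensor2ego_mats', 'intrin_mats', 'ida_mats',
                          'sensor2sensor_mats')}
            mats_dict['bda_mat'] = torch.rand(2, 4, 4)
            preds = model(sweep_imgs, mats_dict)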
54 | """ 55 | if self.is_train_depth and self.training: 56 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 57 | mats_dict, 58 | timestamps, 59 | is_return_depth=True) 60 | preds = self.head(x) 61 | return preds, depth_pred, img_feats, x, voxel_feats, camera_feats 62 | else: 63 | x = self.backbone(x, mats_dict, timestamps) 64 | preds = self.head(x) 65 | return preds 66 | 67 | def get_targets(self, gt_boxes, gt_labels): 68 | """Generate training targets for a single sample. 69 | 70 | Args: 71 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 72 | gt_labels_3d (torch.Tensor): Labels of boxes. 73 | 74 | Returns: 75 | tuple[list[torch.Tensor]]: Tuple of target including \ 76 | the following results in order. 77 | 78 | - list[torch.Tensor]: Heatmap scores. 79 | - list[torch.Tensor]: Ground truth boxes. 80 | - list[torch.Tensor]: Indexes indicating the position \ 81 | of the valid boxes. 82 | - list[torch.Tensor]: Masks indicating which boxes \ 83 | are valid. 84 | """ 85 | return self.head.get_targets(gt_boxes, gt_labels) 86 | 87 | def loss(self, targets, preds_dicts): 88 | """Loss function for BEVDepth. 89 | 90 | Args: 91 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 92 | truth gt boxes. 93 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 94 | preds_dicts (dict): Output of forward function. 95 | 96 | Returns: 97 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 98 | """ 99 | return self.head.loss(targets, preds_dicts) 100 | 101 | def get_bboxes(self, preds_dicts, img_metas=None, img=None, rescale=False): 102 | """Generate bboxes from bbox head predictions. 103 | 104 | Args: 105 | preds_dicts (tuple[list[dict]]): Prediction results. 106 | img_metas (list[dict]): Point cloud and image's meta info. 107 | 108 | Returns: 109 | list[dict]: Decoded bbox, scores and labels after nms. 110 | """ 111 | return self.head.get_bboxes(preds_dicts, img_metas, img, rescale) 112 | -------------------------------------------------------------------------------- /callbacks/ema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 3 | import math 4 | import os 5 | from copy import deepcopy 6 | 7 | import torch 8 | import torch.nn as nn 9 | from pytorch_lightning.callbacks import Callback 10 | 11 | __all__ = ['ModelEMA', 'is_parallel'] 12 | 13 | 14 | def is_parallel(model): 15 | """check if model is in parallel mode.""" 16 | parallel_type = ( 17 | nn.parallel.DataParallel, 18 | nn.parallel.DistributedDataParallel, 19 | ) 20 | return isinstance(model, parallel_type) 21 | 22 | 23 | class ModelEMA: 24 | """ 25 | Model Exponential Moving Average from https://github.com/rwightman/ 26 | pytorch-image-models Keep a moving average of everything in 27 | the model state_dict (parameters and buffers). 28 | This is intended to allow functionality like 29 | https://www.tensorflow.org/api_docs/python/tf/train/ 30 | ExponentialMovingAverage 31 | A smoothed version of the weights is necessary for some training 32 | schemes to perform well. 33 | This class is sensitive where it is initialized in the sequence 34 | of model init, GPU assignment and distributed training wrappers. 35 | """ 36 | def __init__(self, model, decay=0.9999, updates=0): 37 | """ 38 | Args: 39 | model (nn.Module): model to apply EMA. 40 | decay (float): ema decay reate. 41 | updates (int): counter of EMA updates. 
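
        Example (a minimal sketch of manual use; in this repo EMACallback
        below constructs and updates ModelEMA automatically):
            ema = ModelEMA(model, decay=0.9990)
            for batch in loader:
                ...                                    # forward/backward/optimizer step on model
                ema.update(trainer=None, model=model)  # trainer is not used by update()
            smoothed_state_dict = ema.ema.state_dict()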
42 | """ 43 | # Create EMA(FP32) 44 | self.ema = deepcopy( 45 | model.module if is_parallel(model) else model).eval() 46 | self.updates = updates 47 | # decay exponential ramp (to help early epochs) 48 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 49 | for p in self.ema.parameters(): 50 | p.requires_grad_(False) 51 | 52 | def update(self, trainer, model): 53 | # Update EMA parameters 54 | with torch.no_grad(): 55 | self.updates += 1 56 | d = self.decay(self.updates) 57 | 58 | msd = model.module.state_dict() if is_parallel( 59 | model) else model.state_dict() # model state_dict 60 | for k, v in self.ema.state_dict().items(): 61 | if v.dtype.is_floating_point: 62 | v *= d 63 | v += (1.0 - d) * msd[k].detach() 64 | 65 | 66 | class EMACallback(Callback): 67 | def __init__(self, len_updates) -> None: 68 | super().__init__() 69 | self.len_updates = len_updates 70 | 71 | def on_fit_start(self, trainer, pl_module): 72 | # Todo (@lizeming@megvii.com): delete manually specified device 73 | from torch.nn.modules.batchnorm import SyncBatchNorm 74 | 75 | bn_model_list = list() 76 | bn_model_dist_group_list = list() 77 | for model_ref in trainer.model.modules(): 78 | if isinstance(model_ref, SyncBatchNorm): 79 | bn_model_list.append(model_ref) 80 | bn_model_dist_group_list.append(model_ref.process_group) 81 | model_ref.process_group = None 82 | trainer.ema_model = ModelEMA(trainer.model.module.module.model.cuda(), 83 | 0.9990) 84 | 85 | for bn_model, dist_group in zip(bn_model_list, 86 | bn_model_dist_group_list): 87 | bn_model.process_group = dist_group 88 | trainer.ema_model.updates = self.len_updates 89 | 90 | def on_train_batch_end(self, 91 | trainer, 92 | pl_module, 93 | outputs, 94 | batch, 95 | batch_idx, 96 | unused=0): 97 | trainer.ema_model.update(trainer, trainer.model.module.module.model) 98 | 99 | def on_train_epoch_end(self, trainer, pl_module) -> None: 100 | state_dict = trainer.ema_model.ema.state_dict() 101 | state_dict_keys = list(state_dict.keys()) 102 | # TODO: Change to more elegant way. 103 | for state_dict_key in state_dict_keys: 104 | new_key = 'model.' + state_dict_key 105 | state_dict[new_key] = state_dict.pop(state_dict_key) 106 | checkpoint = { 107 | # the epoch and global step are saved for 108 | # compatibility but they are not relevant for restoration 109 | 'epoch': trainer.current_epoch, 110 | 'global_step': trainer.global_step, 111 | 'state_dict': state_dict 112 | } 113 | torch.save( 114 | checkpoint, 115 | os.path.join(trainer.log_dir, f'{trainer.current_epoch}.pth')) 116 | -------------------------------------------------------------------------------- /scripts/gen_depth_gt.py: -------------------------------------------------------------------------------- 1 | import os 2 | from multiprocessing import Pool 3 | 4 | import mmcv 5 | import numpy as np 6 | from nuscenes.utils.data_classes import LidarPointCloud 7 | from nuscenes.utils.geometry_utils import view_points 8 | from pyquaternion import Quaternion 9 | 10 | 11 | # https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/nuscenes.py#L834 12 | def map_pointcloud_to_image( 13 | pc, 14 | im, 15 | lidar_calibrated_sensor, 16 | lidar_ego_pose, 17 | cam_calibrated_sensor, 18 | cam_ego_pose, 19 | min_dist: float = 0.0, 20 | ): 21 | 22 | # Points live in the point sensor frame. So they need to be 23 | # transformed via global to the image plane. 24 | # First step: transform the pointcloud to the ego vehicle 25 | # frame for the timestamp of the sweep. 
26 | 27 | pc = LidarPointCloud(pc.T) 28 | pc.rotate(Quaternion(lidar_calibrated_sensor['rotation']).rotation_matrix) 29 | pc.translate(np.array(lidar_calibrated_sensor['translation'])) 30 | 31 | # Second step: transform from ego to the global frame. 32 | pc.rotate(Quaternion(lidar_ego_pose['rotation']).rotation_matrix) 33 | pc.translate(np.array(lidar_ego_pose['translation'])) 34 | 35 | # Third step: transform from global into the ego vehicle 36 | # frame for the timestamp of the image. 37 | pc.translate(-np.array(cam_ego_pose['translation'])) 38 | pc.rotate(Quaternion(cam_ego_pose['rotation']).rotation_matrix.T) 39 | 40 | # Fourth step: transform from ego into the camera. 41 | pc.translate(-np.array(cam_calibrated_sensor['translation'])) 42 | pc.rotate(Quaternion(cam_calibrated_sensor['rotation']).rotation_matrix.T) 43 | 44 | # Fifth step: actually take a "picture" of the point cloud. 45 | # Grab the depths (camera frame z axis points away from the camera). 46 | depths = pc.points[2, :] 47 | coloring = depths 48 | 49 | # Take the actual picture (matrix multiplication with camera-matrix 50 | # + renormalization). 51 | points = view_points(pc.points[:3, :], 52 | np.array(cam_calibrated_sensor['camera_intrinsic']), 53 | normalize=True) 54 | 55 | # Remove points that are either outside or behind the camera. 56 | # Leave a margin of 1 pixel for aesthetic reasons. Also make 57 | # sure points are at least 1m in front of the camera to avoid 58 | # seeing the lidar points on the camera casing for non-keyframes 59 | # which are slightly out of sync. 60 | mask = np.ones(depths.shape[0], dtype=bool) 61 | mask = np.logical_and(mask, depths > min_dist) 62 | mask = np.logical_and(mask, points[0, :] > 1) 63 | mask = np.logical_and(mask, points[0, :] < im.shape[1] - 1) 64 | mask = np.logical_and(mask, points[1, :] > 1) 65 | mask = np.logical_and(mask, points[1, :] < im.shape[0] - 1) 66 | points = points[:, mask] 67 | coloring = coloring[mask] 68 | 69 | return points, coloring 70 | 71 | 72 | data_root = 'data/nuScenes' 73 | info_path = 'data/nuScenes/nuscenes_12hz_infos_train.pkl' 74 | # data3d_nusc = NuscMVDetData() 75 | 76 | lidar_key = 'LIDAR_TOP' 77 | cam_keys = [ 78 | 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 79 | 'CAM_BACK', 'CAM_BACK_LEFT' 80 | ] 81 | 82 | 83 | def worker(info): 84 | lidar_path = info['lidar_infos'][lidar_key]['filename'] 85 | points = np.fromfile(os.path.join(data_root, lidar_path), 86 | dtype=np.float32, 87 | count=-1).reshape(-1, 5)[..., :4] 88 | lidar_calibrated_sensor = info['lidar_infos'][lidar_key][ 89 | 'calibrated_sensor'] 90 | lidar_ego_pose = info['lidar_infos'][lidar_key]['ego_pose'] 91 | for i, cam_key in enumerate(cam_keys): 92 | cam_calibrated_sensor = info['cam_infos'][cam_key]['calibrated_sensor'] 93 | cam_ego_pose = info['cam_infos'][cam_key]['ego_pose'] 94 | img = mmcv.imread( 95 | os.path.join(data_root, info['cam_infos'][cam_key]['filename'])) 96 | pts_img, depth = map_pointcloud_to_image( 97 | points.copy(), img, lidar_calibrated_sensor.copy(), 98 | lidar_ego_pose.copy(), cam_calibrated_sensor, cam_ego_pose) 99 | file_name = os.path.split(info['cam_infos'][cam_key]['filename'])[-1] 100 | np.concatenate([pts_img[:2, :].T, depth[:, None]], 101 | axis=1).astype(np.float32).flatten().tofile( 102 | os.path.join(data_root, 'depth_gt', 103 | f'{file_name}.bin')) 104 | # plt.savefig(f"{sample_idx}") 105 | 106 | 107 | if __name__ == '__main__': 108 | po = Pool(24) 109 | mmcv.mkdir_or_exist(os.path.join(data_root, 'depth_gt')) 110 | infos = 
mmcv.load(info_path) 111 | # import ipdb; ipdb.set_trace() 112 | for info in infos: 113 | po.apply_async(func=worker, args=(info, )) 114 | po.close() 115 | po.join() 116 | -------------------------------------------------------------------------------- /scripts/gen_info.py: -------------------------------------------------------------------------------- 1 | import mmcv 2 | import numpy as np 3 | from nuscenes.nuscenes import NuScenes 4 | from nuscenes.utils import splits 5 | from tqdm import tqdm 6 | 7 | 8 | def generate_info(nusc, scenes): 9 | infos = list() 10 | for cur_scene in tqdm(nusc.scene): 11 | if cur_scene['name'] not in scenes: 12 | continue 13 | first_sample_token = cur_scene['first_sample_token'] 14 | cur_sample = nusc.get('sample', first_sample_token) 15 | while True: 16 | info = dict() 17 | cam_info = dict() 18 | info['sample_token'] = cur_sample['token'] 19 | info['timestamp'] = cur_sample['timestamp'] 20 | info['scene_token'] = cur_sample['scene_token'] 21 | cam_names = [ 22 | 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 23 | 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT' 24 | ] 25 | lidar_names = ['LIDAR_TOP'] 26 | cam_infos = dict() 27 | lidar_infos = dict() 28 | for cam_name in cam_names: 29 | cam_data = nusc.get('sample_data', 30 | cur_sample['data'][cam_name]) 31 | cam_info = dict() 32 | cam_info['sample_token'] = cam_data['sample_token'] 33 | cam_info['ego_pose'] = nusc.get('ego_pose', 34 | cam_data['ego_pose_token']) 35 | cam_info['timestamp'] = cam_data['timestamp'] 36 | cam_info['is_key_frame'] = cam_data['is_key_frame'] 37 | cam_info['height'] = cam_data['height'] 38 | cam_info['width'] = cam_data['width'] 39 | cam_info['filename'] = cam_data['filename'] 40 | cam_info['calibrated_sensor'] = nusc.get( 41 | 'calibrated_sensor', cam_data['calibrated_sensor_token']) 42 | cam_infos[cam_name] = cam_info 43 | for lidar_name in lidar_names: 44 | lidar_data = nusc.get('sample_data', 45 | cur_sample['data'][lidar_name]) 46 | lidar_info = dict() 47 | lidar_info['sample_token'] = lidar_data['sample_token'] 48 | lidar_info['ego_pose'] = nusc.get('ego_pose', 49 | lidar_data['ego_pose_token']) 50 | lidar_info['timestamp'] = lidar_data['timestamp'] 51 | lidar_info['filename'] = lidar_data['filename'] 52 | lidar_info['calibrated_sensor'] = nusc.get( 53 | 'calibrated_sensor', lidar_data['calibrated_sensor_token']) 54 | lidar_infos[lidar_name] = lidar_info 55 | 56 | sweeps = list() 57 | info['cam_infos'] = cam_infos 58 | info['lidar_infos'] = lidar_infos 59 | cam_datas = list() 60 | for i in range(6): 61 | sweeps.append(dict()) 62 | for cam_name in cam_names: 63 | cam_datas.append( 64 | nusc.get('sample_data', cur_sample['data'][cam_name])) 65 | for k, cam_data in enumerate(cam_datas): 66 | for j in range(6): 67 | if cam_data['prev'] == '': 68 | break 69 | else: 70 | cam_data = nusc.get('sample_data', cam_data['prev']) 71 | cam_info = dict() 72 | cam_info['sample_token'] = cam_data['sample_token'] 73 | assert cam_info['sample_token'] == cam_info[ 74 | 'sample_token'] 75 | cam_info['ego_pose'] = nusc.get( 76 | 'ego_pose', cam_data['ego_pose_token']) 77 | cam_info['timestamp'] = cam_data['timestamp'] 78 | cam_info['is_key_frame'] = cam_data['is_key_frame'] 79 | cam_info['height'] = cam_data['height'] 80 | cam_info['width'] = cam_data['width'] 81 | cam_info['filename'] = cam_data['filename'] 82 | cam_info['calibrated_sensor'] = nusc.get( 83 | 'calibrated_sensor', 84 | cam_data['calibrated_sensor_token']) 85 | sweeps[j][cam_names[k]] = cam_info 86 | # Remove empty sweeps. 
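            # sweeps[j] holds the j-th previous frame for each camera; truncate the list at
            # the first index where no camera had a previous frame left.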
87 | for i, sweep in enumerate(sweeps): 88 | if len(sweep.keys()) == 0: 89 | sweeps = sweeps[:i] 90 | break 91 | info['sweeps'] = sweeps 92 | ann_infos = list() 93 | for ann in cur_sample['anns']: 94 | ann_info = nusc.get('sample_annotation', ann) 95 | velocity = nusc.box_velocity(ann_info['token']) 96 | if np.any(np.isnan(velocity)): 97 | velocity = np.zeros(3) 98 | ann_info['velocity'] = velocity 99 | ann_infos.append(ann_info) 100 | info['ann_infos'] = ann_infos 101 | infos.append(info) 102 | if cur_sample['next'] == '': 103 | break 104 | else: 105 | cur_sample = nusc.get('sample', cur_sample['next']) 106 | return infos 107 | 108 | 109 | def main(): 110 | nusc = NuScenes(version='v1.0-trainval', 111 | dataroot='./data/nuScenes/', 112 | verbose=True) 113 | train_scenes = splits.train 114 | val_scenes = splits.val 115 | train_infos = generate_info(nusc, train_scenes) 116 | val_infos = generate_info(nusc, val_scenes) 117 | mmcv.dump(train_infos, './data/nuScenes/nuscenes_12hz_infos_train.pkl') 118 | mmcv.dump(val_infos, './data/nuScenes/nuscenes_12hz_infos_val.pkl') 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /test/test_layers/test_head.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pytest 4 | import torch 5 | from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes 6 | 7 | from layers.heads.bev_depth_head import BEVDepthHead 8 | 9 | 10 | class TestLSSFPN(unittest.TestCase): 11 | def setUp(self) -> None: 12 | bev_backbone = dict( 13 | type='ResNet', 14 | in_channels=10, 15 | depth=18, 16 | num_stages=3, 17 | strides=(1, 2, 2), 18 | dilations=(1, 1, 1), 19 | out_indices=[0, 1, 2], 20 | norm_eval=False, 21 | base_channels=20, 22 | ) 23 | 24 | bev_neck = dict(type='SECONDFPN', 25 | in_channels=[10, 20, 40, 80], 26 | upsample_strides=[1, 2, 4, 8], 27 | out_channels=[8, 8, 8, 8]) 28 | 29 | TASKS = [ 30 | dict(num_class=1, class_names=['car']), 31 | dict(num_class=2, class_names=['truck', 'construction_vehicle']), 32 | dict(num_class=2, class_names=['bus', 'trailer']), 33 | dict(num_class=1, class_names=['barrier']), 34 | dict(num_class=2, class_names=['motorcycle', 'bicycle']), 35 | dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), 36 | ] 37 | 38 | common_heads = dict(reg=(2, 2), 39 | height=(1, 2), 40 | dim=(3, 2), 41 | rot=(2, 2), 42 | vel=(2, 2)) 43 | 44 | bbox_coder = dict( 45 | type='CenterPointBBoxCoder', 46 | post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 47 | max_num=500, 48 | score_threshold=0.1, 49 | out_size_factor=32, 50 | voxel_size=[0.2, 0.2, 8], 51 | pc_range=[-51.2, -51.2, -5, 51.2, 51.2, 3], 52 | code_size=9, 53 | ) 54 | 55 | train_cfg = dict( 56 | point_cloud_range=[-51.2, -51.2, -5, 51.2, 51.2, 3], 57 | grid_size=[512, 512, 1], 58 | voxel_size=[0.2, 0.2, 8], 59 | out_size_factor=32, 60 | dense_reg=1, 61 | gaussian_overlap=0.1, 62 | max_objs=500, 63 | min_radius=2, 64 | code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5], 65 | ) 66 | 67 | test_cfg = dict( 68 | post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], 69 | max_per_img=500, 70 | max_pool_nms=False, 71 | min_radius=[4, 12, 10, 1, 0.85, 0.175], 72 | score_threshold=0.1, 73 | out_size_factor=4, 74 | voxel_size=[0.2, 0.2, 8], 75 | nms_type='circle', 76 | pre_max_size=1000, 77 | post_max_size=83, 78 | nms_thr=0.2, 79 | ) 80 | 81 | head_conf = { 82 | 'bev_backbone_conf': bev_backbone, 83 
| 'bev_neck_conf': bev_neck, 84 | 'tasks': TASKS, 85 | 'common_heads': common_heads, 86 | 'bbox_coder': bbox_coder, 87 | 'train_cfg': train_cfg, 88 | 'test_cfg': test_cfg, 89 | 'in_channels': 32, # Equal to bev_neck output_channels. 90 | 'loss_cls': dict(type='GaussianFocalLoss', reduction='mean'), 91 | 'loss_bbox': dict(type='L1Loss', 92 | reduction='mean', 93 | loss_weight=0.25), 94 | 'gaussian_overlap': 0.1, 95 | 'min_radius': 2, 96 | } 97 | self.bevdet_head = BEVDepthHead(**head_conf).cuda() 98 | 99 | @pytest.mark.skipif(torch.cuda.is_available() is False, 100 | reason='No gpu available.') 101 | def test_forward(self): 102 | x = torch.rand(2, 10, 32, 32).cuda() 103 | ret_results = self.bevdet_head.forward(x) 104 | assert len(ret_results) == 6 105 | assert ret_results[0][0]['reg'].shape == torch.Size([2, 2, 32, 32]) 106 | assert ret_results[0][0]['height'].shape == torch.Size([2, 1, 32, 32]) 107 | assert ret_results[0][0]['dim'].shape == torch.Size([2, 3, 32, 32]) 108 | assert ret_results[0][0]['rot'].shape == torch.Size([2, 2, 32, 32]) 109 | assert ret_results[0][0]['vel'].shape == torch.Size([2, 2, 32, 32]) 110 | assert ret_results[0][0]['heatmap'].shape == torch.Size([2, 1, 32, 32]) 111 | 112 | @pytest.mark.skipif(torch.cuda.is_available() is False, 113 | reason='No gpu available.') 114 | def test_get_targets(self): 115 | gt_boxes_3d_0 = torch.rand(10, 9).cuda() 116 | gt_boxes_3d_1 = torch.rand(15, 9).cuda() 117 | gt_boxes_3d_0[:, :2] *= 10 118 | gt_boxes_3d_1[:, :2] *= 10 119 | gt_labels_3d_0 = torch.randint(0, 10, (10, )).cuda() 120 | gt_labels_3d_1 = torch.randint(0, 10, (15, )).cuda() 121 | gt_boxes_3d = [gt_boxes_3d_0, gt_boxes_3d_1] 122 | gt_labels_3d = [gt_labels_3d_0, gt_labels_3d_1] 123 | heatmaps, anno_boxes, inds, masks = self.bevdet_head.get_targets( 124 | gt_boxes_3d, gt_labels_3d) 125 | assert len(heatmaps) == 6 126 | assert len(anno_boxes) == 6 127 | assert len(inds) == 6 128 | assert len(masks) == 6 129 | assert heatmaps[0].shape == torch.Size([2, 1, 16, 16]) 130 | assert anno_boxes[0].shape == torch.Size([2, 500, 10]) 131 | assert inds[0].shape == torch.Size([2, 500]) 132 | assert masks[0].shape == torch.Size([2, 500]) 133 | 134 | @pytest.mark.skipif(torch.cuda.is_available() is False, 135 | reason='No gpu available.') 136 | def test_get_bboxes(self): 137 | x = torch.rand(2, 10, 32, 32).cuda() 138 | ret_results = self.bevdet_head.forward(x) 139 | img_metas = [ 140 | dict(box_type_3d=LiDARInstance3DBoxes), 141 | dict(box_type_3d=LiDARInstance3DBoxes) 142 | ] 143 | pred_bboxes = self.bevdet_head.get_bboxes(ret_results, 144 | img_metas=img_metas) 145 | assert len(pred_bboxes) == 2 146 | assert len(pred_bboxes[0]) == 3 147 | assert pred_bboxes[0][1].shape == torch.Size([498]) 148 | assert pred_bboxes[0][2].shape == torch.Size([498]) 149 | -------------------------------------------------------------------------------- /models/uda_depth.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | # from layers.backbones.lss_fpn import LSSFPN 4 | from layers.backbones.uda_fpn import LSSFPN 5 | from layers.heads.bev_depth_head import BEVDepthHead 6 | 7 | __all__ = ['BEVDepth'] 8 | 9 | 10 | class BEVDepth(nn.Module): 11 | """Source code of `BEVDepth`, `https://arxiv.org/abs/2112.11790`. 12 | 13 | Args: 14 | backbone_conf (dict): Config of backbone. 15 | head_conf (dict): Config of head. 16 | is_train_depth (bool): Whether to return depth. 17 | Default: False. 
18 | """ 19 | 20 | # TODO: Reduce grid_conf and data_aug_conf 21 | def __init__(self, backbone_conf, head_conf, is_train_depth=False): 22 | super(BEVDepth, self).__init__() 23 | self.backbone = LSSFPN(**backbone_conf) 24 | self.head = BEVDepthHead(**head_conf) 25 | self.is_train_depth = is_train_depth 26 | 27 | def forward( 28 | self, 29 | x, 30 | mats_dict, 31 | depth_label=None, 32 | timestamps=None, 33 | ): 34 | """Forward function for BEVDepth 35 | 36 | Args: 37 | x (Tensor): Input ferature map. 38 | mats_dict(dict): 39 | sensor2ego_mats(Tensor): Transformation matrix from 40 | camera to ego with shape of (B, num_sweeps, 41 | num_cameras, 4, 4). 42 | intrin_mats(Tensor): Intrinsic matrix with shape 43 | of (B, num_sweeps, num_cameras, 4, 4). 44 | ida_mats(Tensor): Transformation matrix for ida with 45 | shape of (B, num_sweeps, num_cameras, 4, 4). 46 | sensor2sensor_mats(Tensor): Transformation matrix 47 | from key frame camera to sweep frame camera with 48 | shape of (B, num_sweeps, num_cameras, 4, 4). 49 | bda_mat(Tensor): Rotation matrix for bda with shape 50 | of (B, 4, 4). 51 | timestamps (long): Timestamp. 52 | Default: None. 53 | 54 | Returns: 55 | tuple(list[dict]): Output results for tasks. 56 | """ 57 | if self.is_train_depth and self.training: 58 | if depth_label is None: 59 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 60 | mats_dict, 61 | depth_label=None, 62 | timestamps=timestamps, 63 | is_return_depth=True) 64 | else: 65 | # print("Your are forwarding with out depthNet") 66 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 67 | mats_dict, 68 | depth_label, 69 | timestamps, 70 | is_return_depth=True) 71 | preds = self.head(x) 72 | return preds, depth_pred, img_feats, x, voxel_feats, camera_feats 73 | else: 74 | if depth_label is None: 75 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 76 | mats_dict, 77 | depth_label=None, 78 | timestamps=timestamps, 79 | is_return_depth=True) 80 | else: 81 | # print("Your are forwarding with out depthNet") 82 | x, depth_pred, img_feats, voxel_feats, camera_feats = self.backbone(x, 83 | mats_dict, 84 | depth_label, 85 | timestamps, 86 | is_return_depth=True) 87 | # origin eval code here 88 | # if depth_label is None: 89 | # x = self.backbone(x, mats_dict, timestamps) 90 | # else: 91 | # x = self.backbone(x, 92 | # mats_dict, 93 | # depth_label=depth_label, 94 | # timestamps=timestamps) 95 | preds = self.head(x) 96 | return preds, depth_pred, img_feats, x, voxel_feats, camera_feats 97 | 98 | def get_targets(self, gt_boxes, gt_labels): 99 | """Generate training targets for a single sample. 100 | 101 | Args: 102 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 103 | gt_labels_3d (torch.Tensor): Labels of boxes. 104 | 105 | Returns: 106 | tuple[list[torch.Tensor]]: Tuple of target including \ 107 | the following results in order. 108 | 109 | - list[torch.Tensor]: Heatmap scores. 110 | - list[torch.Tensor]: Ground truth boxes. 111 | - list[torch.Tensor]: Indexes indicating the position \ 112 | of the valid boxes. 113 | - list[torch.Tensor]: Masks indicating which boxes \ 114 | are valid. 115 | """ 116 | return self.head.get_targets(gt_boxes, gt_labels) 117 | 118 | def loss(self, targets, preds_dicts): 119 | """Loss function for BEVDepth. 120 | 121 | Args: 122 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 123 | truth gt boxes. 124 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 125 | preds_dicts (dict): Output of forward function. 
126 | 127 | Returns: 128 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 129 | """ 130 | return self.head.loss(targets, preds_dicts) 131 | 132 | def get_bboxes(self, preds_dicts, img_metas=None, img=None, rescale=False): 133 | """Generate bboxes from bbox head predictions. 134 | 135 | Args: 136 | preds_dicts (tuple[list[dict]]): Prediction results. 137 | img_metas (list[dict]): Point cloud and image's meta info. 138 | 139 | Returns: 140 | list[dict]: Decoded bbox, scores and labels after nms. 141 | """ 142 | return self.head.get_bboxes(preds_dicts, img_metas, img, rescale) 143 | -------------------------------------------------------------------------------- /exps/bev_depth_lss_r50_256x704_128x128_20e_cbgs_2key_da.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 2 | """ 3 | mAP: 0.3484 4 | mATE: 0.6159 5 | mASE: 0.2716 6 | mAOE: 0.4144 7 | mAVE: 0.4402 8 | mAAE: 0.1954 9 | NDS: 0.4805 10 | Eval time: 110.7s 11 | Per-class results: 12 | Object Class AP ATE ASE AOE AVE AAE 13 | car 0.553 0.480 0.157 0.117 0.386 0.205 14 | truck 0.252 0.645 0.202 0.097 0.381 0.185 15 | bus 0.378 0.674 0.197 0.090 0.871 0.298 16 | trailer 0.163 0.932 0.230 0.409 0.543 0.098 17 | construction_vehicle 0.076 0.878 0.495 1.015 0.103 0.344 18 | pedestrian 0.361 0.694 0.300 0.816 0.491 0.247 19 | motorcycle 0.319 0.569 0.252 0.431 0.552 0.181 20 | bicycle 0.286 0.457 0.255 0.630 0.194 0.006 21 | traffic_cone 0.536 0.438 0.339 nan nan nan 22 | barrier 0.559 0.392 0.289 0.124 nan nan 23 | """ 24 | from argparse import ArgumentParser, Namespace 25 | 26 | import pytorch_lightning as pl 27 | import torch 28 | import torch.nn as nn 29 | from torch.cuda.amp.autocast_mode import autocast 30 | from torch.optim.lr_scheduler import MultiStepLR 31 | 32 | from callbacks.ema import EMACallback 33 | from exps.bev_depth_lss_r50_256x704_128x128_24e_2key import \ 34 | BEVDepthLightningModel as BaseBEVDepthLightningModel 35 | # from layers.backbones.lss_fpn import LSSFPN as BaseLSSFPN 36 | # from layers.heads.bev_depth_head import BEVDepthHead 37 | # from models.bev_depth import BEVDepth as BaseBEVDepth 38 | from models.mh_depth import BEVDepth as BaseBEVDepth 39 | from layers.heads.mh_depth_head import BEVDepthHead 40 | from layers.backbones.multi_head_fpn import LSSFPN as BaseLSSFPN 41 | 42 | class DepthAggregation(nn.Module): 43 | """ 44 | pixel cloud feature extraction 45 | """ 46 | def __init__(self, in_channels, mid_channels, out_channels): 47 | super(DepthAggregation, self).__init__() 48 | 49 | self.reduce_conv = nn.Sequential( 50 | nn.Conv2d(in_channels, 51 | mid_channels, 52 | kernel_size=3, 53 | stride=1, 54 | padding=1, 55 | bias=False), 56 | nn.BatchNorm2d(mid_channels), 57 | nn.ReLU(inplace=True), 58 | ) 59 | 60 | self.conv = nn.Sequential( 61 | nn.Conv2d(mid_channels, 62 | mid_channels, 63 | kernel_size=3, 64 | stride=1, 65 | padding=1, 66 | bias=False), 67 | nn.BatchNorm2d(mid_channels), 68 | nn.ReLU(inplace=True), 69 | nn.Conv2d(mid_channels, 70 | mid_channels, 71 | kernel_size=3, 72 | stride=1, 73 | padding=1, 74 | bias=False), 75 | nn.BatchNorm2d(mid_channels), 76 | nn.ReLU(inplace=True), 77 | ) 78 | 79 | self.out_conv = nn.Sequential( 80 | nn.Conv2d(mid_channels, 81 | out_channels, 82 | kernel_size=3, 83 | stride=1, 84 | padding=1, 85 | bias=True), 86 | # nn.BatchNorm3d(out_channels), 87 | # nn.ReLU(inplace=True), 88 | ) 89 | 90 | @autocast(False) 91 | def forward(self, x): 92 | x = self.reduce_conv(x) 93 | x = 
self.conv(x) + x 94 | x = self.out_conv(x) 95 | return x 96 | 97 | 98 | class LSSFPN(BaseLSSFPN): 99 | def __init__(self, **kwargs): 100 | super().__init__(**kwargs) 101 | self.depth_aggregation_net = self._configure_depth_aggregation_net() 102 | 103 | def _configure_depth_aggregation_net(self): 104 | """build pixel cloud feature extractor""" 105 | return DepthAggregation(self.output_channels, self.output_channels, 106 | self.output_channels) 107 | 108 | def _forward_voxel_net(self, img_feat_with_depth): 109 | # BEVConv2D [n, c, d, h, w] -> [n, h, c, w, d] 110 | img_feat_with_depth = img_feat_with_depth.permute( 111 | 0, 3, 1, 4, 2).contiguous() # [n, c, d, h, w] -> [n, h, c, w, d] 112 | n, h, c, w, d = img_feat_with_depth.shape 113 | img_feat_with_depth = img_feat_with_depth.view(-1, c, w, d) 114 | img_feat_with_depth = ( 115 | self.depth_aggregation_net(img_feat_with_depth).view( 116 | n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float()) 117 | return img_feat_with_depth 118 | 119 | 120 | class BEVDepth(BaseBEVDepth): 121 | def __init__(self, backbone_conf, head_conf, is_train_depth=True): 122 | super(BaseBEVDepth, self).__init__() 123 | self.backbone = LSSFPN(**backbone_conf) 124 | self.head = BEVDepthHead(**head_conf) 125 | self.is_train_depth = is_train_depth 126 | 127 | 128 | class BEVDepthLightningModel(BaseBEVDepthLightningModel): 129 | def __init__(self, **kwargs): 130 | super().__init__(**kwargs) 131 | self.model = BEVDepth(self.backbone_conf, 132 | self.head_conf, 133 | is_train_depth=True) 134 | self.data_use_cbgs = True 135 | 136 | def configure_optimizers(self): 137 | lr = self.basic_lr_per_img * \ 138 | self.batch_size_per_device * self.gpus 139 | optimizer = torch.optim.AdamW(self.model.parameters(), 140 | lr=lr, 141 | weight_decay=1e-7) 142 | scheduler = MultiStepLR(optimizer, [19, 22]) 143 | return [[optimizer], [scheduler]] 144 | 145 | 146 | def main(args: Namespace) -> None: 147 | if args.seed is not None: 148 | pl.seed_everything(args.seed) 149 | 150 | model = BEVDepthLightningModel(**vars(args)) 151 | train_dataloader = model.train_dataloader() 152 | ema_callback = EMACallback(len(train_dataloader.dataset) * args.max_epochs) 153 | trainer = pl.Trainer.from_argparse_args(args, callbacks=[ema_callback]) 154 | if args.evaluate: 155 | trainer.test(model, ckpt_path=args.ckpt_path) 156 | else: 157 | #.load_from_checkpoint(args.ckpt_path, strict=False) 158 | trainer.fit(model) 159 | 160 | 161 | def run_cli(): 162 | parent_parser = ArgumentParser(add_help=False) 163 | parent_parser = pl.Trainer.add_argparse_args(parent_parser) 164 | parent_parser.add_argument('-e', 165 | '--evaluate', 166 | dest='evaluate', 167 | action='store_true', 168 | help='evaluate model on validation set') 169 | parent_parser.add_argument('-b', '--batch_size_per_device', type=int) 170 | parent_parser.add_argument('--seed', 171 | type=int, 172 | default=0, 173 | help='seed for initializing training.') 174 | parent_parser.add_argument('--ckpt_path', type=str) 175 | parser = BEVDepthLightningModel.add_model_specific_args(parent_parser) 176 | parser.set_defaults(profiler='simple', 177 | deterministic=False, 178 | max_epochs=25, 179 | accelerator='ddp', 180 | num_sanity_val_steps=0, 181 | gradient_clip_val=5, 182 | limit_val_batches=0, 183 | enable_checkpointing=True, 184 | precision=16, 185 | default_root_dir='./outputs/SAN-CBGS50-3SE') 186 | args = parser.parse_args() 187 | main(args) 188 | 189 | 190 | if __name__ == '__main__': 191 | run_cli() 192 | 
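
A minimal sketch of the reshape trick used by the overridden _forward_voxel_net above, assuming nothing beyond plain PyTorch: a stock nn.Conv2d stands in for DepthAggregation, and the free function name forward_voxel_net is only for illustration. Folding the (batch, image-row) pair into the conv batch lets a cheap 2D convolution aggregate evidence over the (width, depth-bin) plane of each row, and the inverse permute restores the original [n, c, d, h, w] frustum layout.

import torch
import torch.nn as nn

# Stand-in for DepthAggregation: any channel-preserving 2D conv is enough
# to exercise the permute/reshape round trip.
depth_aggregation_net = nn.Conv2d(8, 8, kernel_size=3, padding=1)

def forward_voxel_net(img_feat_with_depth):
    # [n, c, d, h, w] -> [n, h, c, w, d]: every image row h becomes its own
    # 2D slice, so the aggregation conv mixes information over (w, d).
    x = img_feat_with_depth.permute(0, 3, 1, 4, 2).contiguous()
    n, h, c, w, d = x.shape
    x = x.view(-1, c, w, d)            # fold (n, h) into the batch dimension
    x = depth_aggregation_net(x)       # per-row 2D convolution
    # Unfold the batch and permute back to the original frustum layout.
    return x.view(n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float()

feat = torch.rand(2, 8, 4, 16, 44)     # (n, c, d, h, w)
out = forward_voxel_net(feat)
assert out.shape == feat.shape         # layout is preserved end to end
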
-------------------------------------------------------------------------------- /layers/backbones/depth_net.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from mmcv.cnn import build_conv_layer 6 | from mmdet3d.models import build_neck 7 | from mmdet.models import build_backbone 8 | from mmdet.models.backbones.resnet import BasicBlock 9 | 10 | class _ASPPModule(nn.Module): 11 | def __init__(self, inplanes, planes, kernel_size, padding, dilation, 12 | BatchNorm): 13 | super(_ASPPModule, self).__init__() 14 | self.atrous_conv = nn.Conv2d(inplanes, 15 | planes, 16 | kernel_size=kernel_size, 17 | stride=1, 18 | padding=padding, 19 | dilation=dilation, 20 | bias=False) 21 | self.bn = BatchNorm(planes) 22 | self.relu = nn.ReLU() 23 | 24 | self._init_weight() 25 | 26 | def forward(self, x): 27 | x = self.atrous_conv(x) 28 | x = self.bn(x) 29 | 30 | return self.relu(x) 31 | 32 | def _init_weight(self): 33 | for m in self.modules(): 34 | if isinstance(m, nn.Conv2d): 35 | torch.nn.init.kaiming_normal_(m.weight) 36 | elif isinstance(m, nn.BatchNorm2d): 37 | m.weight.data.fill_(1) 38 | m.bias.data.zero_() 39 | 40 | 41 | class ASPP(nn.Module): 42 | def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): 43 | super(ASPP, self).__init__() 44 | 45 | dilations = [1, 6, 12, 18] 46 | 47 | self.aspp1 = _ASPPModule(inplanes, 48 | mid_channels, 49 | 1, 50 | padding=0, 51 | dilation=dilations[0], 52 | BatchNorm=BatchNorm) 53 | self.aspp2 = _ASPPModule(inplanes, 54 | mid_channels, 55 | 3, 56 | padding=dilations[1], 57 | dilation=dilations[1], 58 | BatchNorm=BatchNorm) 59 | self.aspp3 = _ASPPModule(inplanes, 60 | mid_channels, 61 | 3, 62 | padding=dilations[2], 63 | dilation=dilations[2], 64 | BatchNorm=BatchNorm) 65 | self.aspp4 = _ASPPModule(inplanes, 66 | mid_channels, 67 | 3, 68 | padding=dilations[3], 69 | dilation=dilations[3], 70 | BatchNorm=BatchNorm) 71 | 72 | self.global_avg_pool = nn.Sequential( 73 | nn.AdaptiveAvgPool2d((1, 1)), 74 | nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 75 | BatchNorm(mid_channels), 76 | nn.ReLU(), 77 | ) 78 | self.conv1 = nn.Conv2d(int(mid_channels * 5), 79 | mid_channels, 80 | 1, 81 | bias=False) 82 | self.bn1 = BatchNorm(mid_channels) 83 | self.relu = nn.ReLU() 84 | self.dropout = nn.Dropout(0.5) 85 | self._init_weight() 86 | 87 | def forward(self, x): 88 | x1 = self.aspp1(x) 89 | x2 = self.aspp2(x) 90 | x3 = self.aspp3(x) 91 | x4 = self.aspp4(x) 92 | x5 = self.global_avg_pool(x) 93 | x5 = F.interpolate(x5, 94 | size=x4.size()[2:], 95 | mode='bilinear', 96 | align_corners=True) 97 | x = torch.cat((x1, x2, x3, x4, x5), dim=1) 98 | 99 | x = self.conv1(x) 100 | x = self.bn1(x) 101 | x = self.relu(x) 102 | 103 | return self.dropout(x) 104 | 105 | def _init_weight(self): 106 | for m in self.modules(): 107 | if isinstance(m, nn.Conv2d): 108 | torch.nn.init.kaiming_normal_(m.weight) 109 | elif isinstance(m, nn.BatchNorm2d): 110 | m.weight.data.fill_(1) 111 | m.bias.data.zero_() 112 | 113 | class Mlp(nn.Module): 114 | def __init__(self, 115 | in_features, 116 | hidden_features=None, 117 | out_features=None, 118 | act_layer=nn.ReLU, 119 | drop=0.0): 120 | super().__init__() 121 | out_features = out_features or in_features 122 | hidden_features = hidden_features or in_features 123 | self.fc1 = nn.Linear(in_features, hidden_features) 124 | self.act = act_layer() 125 | self.drop1 = nn.Dropout(drop) 126 | self.fc2 = 
nn.Linear(hidden_features, out_features) 127 | self.drop2 = nn.Dropout(drop) 128 | 129 | def forward(self, x): 130 | x = self.fc1(x) 131 | x = self.act(x) 132 | x = self.drop1(x) 133 | x = self.fc2(x) 134 | x = self.drop2(x) 135 | return x 136 | 137 | class SELayer(nn.Module): 138 | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): 139 | super().__init__() 140 | self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) 141 | self.act1 = act_layer() 142 | self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) 143 | self.gate = gate_layer() 144 | 145 | def forward(self, x, x_se): 146 | x_se = self.conv_reduce(x_se) 147 | x_se = self.act1(x_se) 148 | x_se = self.conv_expand(x_se) 149 | return x * self.gate(x_se) 150 | 151 | class DepthNet(nn.Module): 152 | def __init__(self, in_channels, mid_channels, context_channels, 153 | depth_channels): 154 | super(DepthNet, self).__init__() 155 | self.reduce_conv = nn.Sequential( 156 | nn.Conv2d(in_channels, 157 | mid_channels, 158 | kernel_size=3, 159 | stride=1, 160 | padding=1), 161 | nn.BatchNorm2d(mid_channels), 162 | nn.ReLU(inplace=True), 163 | ) 164 | self.context_conv = nn.Conv2d(mid_channels, 165 | context_channels, 166 | kernel_size=1, 167 | stride=1, 168 | padding=0) 169 | self.bn = nn.BatchNorm1d(27) 170 | self.depth_mlp = Mlp(27, mid_channels, mid_channels) 171 | self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware 172 | self.context_mlp = Mlp(27, mid_channels, mid_channels) 173 | self.context_se = SELayer(mid_channels) # NOTE: add camera-aware 174 | self.depth_conv = nn.Sequential( 175 | BasicBlock(mid_channels, mid_channels), 176 | BasicBlock(mid_channels, mid_channels), 177 | BasicBlock(mid_channels, mid_channels), 178 | ASPP(mid_channels, mid_channels), 179 | build_conv_layer(cfg=dict( 180 | type='DCN', 181 | in_channels=mid_channels, 182 | out_channels=mid_channels, 183 | kernel_size=3, 184 | padding=1, 185 | groups=4, 186 | im2col_step=128, 187 | )), 188 | nn.Conv2d(mid_channels, 189 | depth_channels, 190 | kernel_size=1, 191 | stride=1, 192 | padding=0), 193 | ) 194 | 195 | def forward(self, x, mats_dict): 196 | intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3] 197 | batch_size = intrins.shape[0] 198 | num_cams = intrins.shape[2] 199 | ida = mats_dict['ida_mats'][:, 0:1, ...] 
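        # The camera-aware MLP input built below concatenates 15 stacked
        # scalars (4 intrinsics: fx, fy, cx, cy; 6 image-augmentation terms
        # from ida; 5 BEV-augmentation terms from bda) with the flattened
        # 3x4 sensor2ego matrix (12 values), giving the 27 features expected
        # by nn.BatchNorm1d(27) and Mlp(27, ...).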
200 | sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :] 201 | bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4, 202 | 4).repeat(1, 1, num_cams, 1, 1) 203 | mlp_input = torch.cat( 204 | [ 205 | torch.stack( 206 | [ 207 | intrins[:, 0:1, ..., 0, 0], 208 | intrins[:, 0:1, ..., 1, 1], 209 | intrins[:, 0:1, ..., 0, 2], 210 | intrins[:, 0:1, ..., 1, 2], 211 | ida[:, 0:1, ..., 0, 0], 212 | ida[:, 0:1, ..., 0, 1], 213 | ida[:, 0:1, ..., 0, 3], 214 | ida[:, 0:1, ..., 1, 0], 215 | ida[:, 0:1, ..., 1, 1], 216 | ida[:, 0:1, ..., 1, 3], 217 | bda[:, 0:1, ..., 0, 0], 218 | bda[:, 0:1, ..., 0, 1], 219 | bda[:, 0:1, ..., 1, 0], 220 | bda[:, 0:1, ..., 1, 1], 221 | bda[:, 0:1, ..., 2, 2], 222 | ], 223 | dim=-1, 224 | ), 225 | sensor2ego.view(batch_size, 1, num_cams, -1), 226 | ], 227 | -1, 228 | ) 229 | mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) 230 | x = self.reduce_conv(x) 231 | context_se = self.context_mlp(mlp_input)[..., None, None] 232 | context = self.context_se(x, context_se) 233 | context = self.context_conv(context) 234 | depth_se = self.depth_mlp(mlp_input)[..., None, None] 235 | depth = self.depth_se(x, depth_se) 236 | depth_mid = depth 237 | depth = self.depth_conv(depth) 238 | return torch.cat([depth, context], dim=1), depth_mid 239 | -------------------------------------------------------------------------------- /evaluators/det_mv_evaluators.py: -------------------------------------------------------------------------------- 1 | '''Modified from # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa 2 | ''' 3 | import os.path as osp 4 | import tempfile 5 | 6 | import mmcv 7 | import numpy as np 8 | import pyquaternion 9 | from nuscenes.utils.data_classes import Box 10 | from pyquaternion import Quaternion 11 | 12 | __all__ = ['DetMVNuscEvaluator'] 13 | 14 | 15 | class DetMVNuscEvaluator(): 16 | ErrNameMapping = { 17 | 'trans_err': 'mATE', 18 | 'scale_err': 'mASE', 19 | 'orient_err': 'mAOE', 20 | 'vel_err': 'mAVE', 21 | 'attr_err': 'mAAE', 22 | } 23 | 24 | DefaultAttribute = { 25 | 'car': 'vehicle.parked', 26 | 'pedestrian': 'pedestrian.moving', 27 | 'trailer': 'vehicle.parked', 28 | 'truck': 'vehicle.parked', 29 | 'bus': 'vehicle.moving', 30 | 'motorcycle': 'cycle.without_rider', 31 | 'construction_vehicle': 'vehicle.parked', 32 | 'bicycle': 'cycle.without_rider', 33 | 'barrier': '', 34 | 'traffic_cone': '', 35 | } 36 | 37 | def __init__( 38 | self, 39 | class_names, 40 | eval_version='detection_cvpr_2019', 41 | data_root='./data/nuScenes', 42 | version='v1.0-trainval', 43 | modality=dict(use_lidar=False, 44 | use_camera=True, 45 | use_radar=False, 46 | use_map=False, 47 | use_external=False), 48 | output_dir=None, 49 | ) -> None: 50 | self.eval_version = eval_version 51 | self.data_root = data_root 52 | if self.eval_version is not None: 53 | from nuscenes.eval.detection.config import config_factory 54 | 55 | self.eval_detection_configs = config_factory(self.eval_version) 56 | self.version = version 57 | self.class_names = class_names 58 | self.modality = modality 59 | self.output_dir = output_dir 60 | 61 | def _evaluate_single(self, 62 | result_path, 63 | logger=None, 64 | metric='bbox', 65 | result_name='pts_bbox'): 66 | """Evaluation for a single model in nuScenes protocol. 67 | 68 | Args: 69 | result_path (str): Path of the result file. 70 | logger (logging.Logger | str | None): Logger used for printing 71 | related information during evaluation. 
Default: None. 72 | metric (str): Metric name used for evaluation. Default: 'bbox'. 73 | result_name (str): Result name in the metric prefix. 74 | Default: 'pts_bbox'. 75 | 76 | Returns: 77 | dict: Dictionary of evaluation details. 78 | """ 79 | from nuscenes import NuScenes 80 | from nuscenes.eval.detection.evaluate import NuScenesEval 81 | 82 | output_dir = osp.join(*osp.split(result_path)[:-1]) 83 | nusc = NuScenes(version=self.version, 84 | dataroot=self.data_root, 85 | verbose=False) 86 | eval_set_map = { 87 | 'v1.0-mini': 'mini_val', 88 | 'v1.0-trainval': 'val', 89 | } 90 | nusc_eval = NuScenesEval(nusc, 91 | config=self.eval_detection_configs, 92 | result_path=result_path, 93 | eval_set=eval_set_map[self.version], 94 | output_dir=output_dir, 95 | verbose=False) 96 | nusc_eval.main(render_curves=False) 97 | 98 | # record metrics 99 | metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) 100 | detail = dict() 101 | metric_prefix = f'{result_name}_NuScenes' 102 | for class_name in self.class_names: 103 | for k, v in metrics['label_aps'][class_name].items(): 104 | val = float('{:.4f}'.format(v)) 105 | detail['{}/{}_AP_dist_{}'.format(metric_prefix, class_name, 106 | k)] = val 107 | for k, v in metrics['label_tp_errors'][class_name].items(): 108 | val = float('{:.4f}'.format(v)) 109 | detail['{}/{}_{}'.format(metric_prefix, class_name, k)] = val 110 | for k, v in metrics['tp_errors'].items(): 111 | val = float('{:.4f}'.format(v)) 112 | detail['{}/{}'.format(metric_prefix, 113 | self.ErrNameMapping[k])] = val 114 | 115 | detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] 116 | detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] 117 | return detail 118 | 119 | def format_results(self, 120 | results, 121 | img_metas, 122 | result_names=['img_bbox'], 123 | jsonfile_prefix=None, 124 | **kwargs): 125 | """Format the results to json (standard format for COCO evaluation). 126 | 127 | Args: 128 | results (list[tuple | numpy.ndarray]): Testing results of the 129 | dataset. 130 | jsonfile_prefix (str | None): The prefix of json files. It includes 131 | the file path and the prefix of filename, e.g., "a/b/prefix". 132 | If not specified, a temp file will be created. Default: None. 133 | 134 | Returns: 135 | tuple: (result_files, tmp_dir), result_files is a dict containing \ 136 | the json filepaths, tmp_dir is the temporal directory created \ 137 | for saving json files when jsonfile_prefix is not specified. 138 | """ 139 | assert isinstance(results, list), 'results must be a list' 140 | 141 | if jsonfile_prefix is None: 142 | tmp_dir = tempfile.TemporaryDirectory() 143 | jsonfile_prefix = osp.join(tmp_dir.name, 'results') 144 | else: 145 | tmp_dir = None 146 | 147 | # currently the output prediction results could be in two formats 148 | # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) 149 | # 2. list of dict('pts_bbox' or 'img_bbox': 150 | # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) 151 | # this is a workaround to enable evaluation of both formats on nuScenes 152 | # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 153 | # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict 154 | result_files = dict() 155 | # refactor this. 
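        # For every requested result name (2D entries are skipped), the
        # detections are serialized via _format_bbox into results_nusc.json
        # under self.output_dir when it is set, otherwise under the
        # temporary jsonfile_prefix created above.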
156 | for rasult_name in result_names: 157 | # not evaluate 2D predictions on nuScenes 158 | if '2d' in rasult_name: 159 | continue 160 | print(f'\nFormating bboxes of {rasult_name}') 161 | tmp_file_ = osp.join(jsonfile_prefix, rasult_name) 162 | if self.output_dir: 163 | result_files.update({ 164 | rasult_name: 165 | self._format_bbox(results, img_metas, self.output_dir) 166 | }) 167 | else: 168 | result_files.update({ 169 | rasult_name: 170 | self._format_bbox(results, img_metas, tmp_file_) 171 | }) 172 | return result_files, tmp_dir 173 | 174 | def evaluate( 175 | self, 176 | results, 177 | img_metas, 178 | metric='bbox', 179 | logger=None, 180 | jsonfile_prefix=None, 181 | result_names=['img_bbox'], 182 | show=False, 183 | out_dir=None, 184 | pipeline=None, 185 | ): 186 | """Evaluation in nuScenes protocol. 187 | 188 | Args: 189 | results (list[dict]): Testing results of the dataset. 190 | metric (str | list[str]): Metrics to be evaluated. 191 | logger (logging.Logger | str | None): Logger used for printing 192 | related information during evaluation. Default: None. 193 | jsonfile_prefix (str | None): The prefix of json files. It includes 194 | the file path and the prefix of filename, e.g., "a/b/prefix". 195 | If not specified, a temp file will be created. Default: None. 196 | show (bool): Whether to visualize. 197 | Default: False. 198 | out_dir (str): Path to save the visualization results. 199 | Default: None. 200 | pipeline (list[dict], optional): raw data loading for showing. 201 | Default: None. 202 | 203 | Returns: 204 | dict[str, float]: Results of each evaluation metric. 205 | """ 206 | result_files, tmp_dir = self.format_results(results, img_metas, 207 | result_names, 208 | jsonfile_prefix) 209 | if isinstance(result_files, dict): 210 | for name in result_names: 211 | print('Evaluating bboxes of {}'.format(name)) 212 | self._evaluate_single(result_files[name]) 213 | elif isinstance(result_files, str): 214 | self._evaluate_single(result_files) 215 | 216 | if tmp_dir is not None: 217 | tmp_dir.cleanup() 218 | 219 | def _format_bbox(self, results, img_metas, jsonfile_prefix=None): 220 | """Convert the results to the standard format. 221 | 222 | Args: 223 | results (list[dict]): Testing results of the dataset. 224 | jsonfile_prefix (str): The prefix of the output jsonfile. 225 | You can specify the output directory/filename by 226 | modifying the jsonfile_prefix. Default: None. 227 | 228 | Returns: 229 | str: Path of the output json file. 
230 | """ 231 | nusc_annos = {} 232 | mapped_class_names = self.class_names 233 | 234 | print('Start to convert detection format...') 235 | 236 | for sample_id, det in enumerate(mmcv.track_iter_progress(results)): 237 | boxes, scores, labels = det 238 | boxes = boxes 239 | sample_token = img_metas[sample_id]['token'] 240 | trans = np.array(img_metas[sample_id]['ego2global_translation']) 241 | rot = Quaternion(img_metas[sample_id]['ego2global_rotation']) 242 | annos = list() 243 | for i, box in enumerate(boxes): 244 | name = mapped_class_names[labels[i]] 245 | center = box[:3] 246 | wlh = box[[4, 3, 5]] 247 | box_yaw = box[6] 248 | box_vel = box[7:].tolist() 249 | box_vel.append(0) 250 | quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw) 251 | nusc_box = Box(center, wlh, quat, velocity=box_vel) 252 | nusc_box.rotate(rot) 253 | nusc_box.translate(trans) 254 | if np.sqrt(nusc_box.velocity[0]**2 + 255 | nusc_box.velocity[1]**2) > 0.2: 256 | if name in [ 257 | 'car', 258 | 'construction_vehicle', 259 | 'bus', 260 | 'truck', 261 | 'trailer', 262 | ]: 263 | attr = 'vehicle.moving' 264 | elif name in ['bicycle', 'motorcycle']: 265 | attr = 'cycle.with_rider' 266 | else: 267 | attr = self.DefaultAttribute[name] 268 | else: 269 | if name in ['pedestrian']: 270 | attr = 'pedestrian.standing' 271 | elif name in ['bus']: 272 | attr = 'vehicle.stopped' 273 | else: 274 | attr = self.DefaultAttribute[name] 275 | nusc_anno = dict( 276 | sample_token=sample_token, 277 | translation=nusc_box.center.tolist(), 278 | size=nusc_box.wlh.tolist(), 279 | rotation=nusc_box.orientation.elements.tolist(), 280 | velocity=nusc_box.velocity[:2], 281 | detection_name=name, 282 | detection_score=float(scores[i]), 283 | attribute_name=attr, 284 | ) 285 | annos.append(nusc_anno) 286 | # other views results of the same frame should be concatenated 287 | if sample_token in nusc_annos: 288 | nusc_annos[sample_token].extend(annos) 289 | else: 290 | nusc_annos[sample_token] = annos 291 | nusc_submissions = { 292 | 'meta': self.modality, 293 | 'results': nusc_annos, 294 | } 295 | mmcv.mkdir_or_exist(jsonfile_prefix) 296 | res_path = osp.join(jsonfile_prefix, 'results_nusc.json') 297 | print('Results writes to', res_path) 298 | mmcv.dump(nusc_submissions, res_path) 299 | return res_path 300 | -------------------------------------------------------------------------------- /layers/heads/mh_depth_head.py: -------------------------------------------------------------------------------- 1 | """Inherited from `https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/centerpoint_head.py`""" # noqa 2 | import torch 3 | from mmdet3d.core import draw_heatmap_gaussian, gaussian_radius 4 | from mmdet3d.models.dense_heads.centerpoint_head import CenterHead 5 | from mmdet3d.models.utils import clip_sigmoid 6 | from mmdet.core import reduce_mean 7 | from mmdet.models import build_backbone 8 | from mmdet3d.models import build_neck 9 | from torch.cuda.amp import autocast 10 | 11 | __all__ = ['BEVDepthHead'] 12 | 13 | bev_backbone_conf = dict( 14 | type='ResNet', 15 | in_channels=80, 16 | depth=18, 17 | num_stages=3, 18 | strides=(1, 2, 2), 19 | dilations=(1, 1, 1), 20 | out_indices=[0, 1, 2], 21 | norm_eval=False, 22 | base_channels=160, 23 | ) 24 | 25 | bev_neck_conf = dict(type='SECONDFPN', 26 | in_channels=[160, 320, 640], 27 | upsample_strides=[2, 4, 8], 28 | out_channels=[64, 64, 128]) 29 | 30 | 31 | class BEVDepthHead(CenterHead): 32 | """Head for BevDepth. 
33 | 34 | Args: 35 | in_channels(int): Number of channels after bev_neck. 36 | tasks(dict): Tasks for head. 37 | bbox_coder(dict): Config of bbox coder. 38 | common_heads(dict): Config of head for each task. 39 | loss_cls(dict): Config of classification loss. 40 | loss_bbox(dict): Config of regression loss. 41 | gaussian_overlap(float): Gaussian overlap used for `get_targets`. 42 | min_radius(int): Min radius used for `get_targets`. 43 | train_cfg(dict): Config used in the training process. 44 | test_cfg(dict): Config used in the test process. 45 | bev_backbone_conf(dict): Cnfig of bev_backbone. 46 | bev_neck_conf(dict): Cnfig of bev_neck. 47 | """ 48 | def __init__( 49 | self, 50 | in_channels=256, 51 | tasks=None, 52 | bbox_coder=None, 53 | common_heads=dict(), 54 | loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), 55 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 56 | gaussian_overlap=0.1, 57 | min_radius=2, 58 | train_cfg=None, 59 | test_cfg=None, 60 | bev_backbone_conf=bev_backbone_conf, 61 | bev_neck_conf=bev_neck_conf, 62 | separate_head=dict(type='SeparateHead', 63 | init_bias=-2.19, 64 | final_kernel=3), 65 | ): 66 | super(BEVDepthHead, self).__init__( 67 | in_channels=in_channels, 68 | tasks=tasks, 69 | bbox_coder=bbox_coder, 70 | common_heads=common_heads, 71 | loss_cls=loss_cls, 72 | loss_bbox=loss_bbox, 73 | separate_head=separate_head, 74 | ) 75 | self.trunk = build_backbone(bev_backbone_conf) 76 | self.trunk.init_weights() 77 | self.neck = build_neck(bev_neck_conf) 78 | self.neck.init_weights() 79 | del self.trunk.maxpool 80 | self.gaussian_overlap = gaussian_overlap 81 | self.min_radius = min_radius 82 | self.train_cfg = train_cfg 83 | self.test_cfg = test_cfg 84 | 85 | @autocast(False) 86 | def forward(self, x): 87 | """Forward pass. 88 | 89 | Args: 90 | feats (list[torch.Tensor]): Multi-level features, e.g., 91 | features produced by FPN. 92 | 93 | Returns: 94 | tuple(list[dict]): Output results for tasks. 95 | """ 96 | # FPN 97 | trunk_outs = [x] 98 | if self.trunk.deep_stem: 99 | x = self.trunk.stem(x) 100 | else: 101 | x = self.trunk.conv1(x) 102 | x = self.trunk.norm1(x) 103 | x = self.trunk.relu(x) 104 | for i, layer_name in enumerate(self.trunk.res_layers): 105 | res_layer = getattr(self.trunk, layer_name) 106 | x = res_layer(x) 107 | if i in self.trunk.out_indices: 108 | trunk_outs.append(x) 109 | fpn_output = self.neck(trunk_outs) 110 | ret_values = super().forward(fpn_output) 111 | return ret_values 112 | 113 | def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): 114 | """Generate training targets for a single sample. 115 | 116 | Args: 117 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 118 | gt_labels_3d (torch.Tensor): Labels of boxes. 119 | 120 | Returns: 121 | tuple[list[torch.Tensor]]: Tuple of target including \ 122 | the following results in order. 123 | 124 | - list[torch.Tensor]: Heatmap scores. 125 | - list[torch.Tensor]: Ground truth boxes. 126 | - list[torch.Tensor]: Indexes indicating the position \ 127 | of the valid boxes. 128 | - list[torch.Tensor]: Masks indicating which boxes \ 129 | are valid. 
130 | """ 131 | max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] 132 | grid_size = torch.tensor(self.train_cfg['grid_size']) 133 | pc_range = torch.tensor(self.train_cfg['point_cloud_range']) 134 | voxel_size = torch.tensor(self.train_cfg['voxel_size']) 135 | 136 | feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] 137 | 138 | # reorganize the gt_dict by tasks 139 | task_masks = [] 140 | flag = 0 141 | for class_name in self.class_names: 142 | task_masks.append([ 143 | torch.where(gt_labels_3d == class_name.index(i) + flag) 144 | for i in class_name 145 | ]) 146 | flag += len(class_name) 147 | 148 | task_boxes = [] 149 | task_classes = [] 150 | flag2 = 0 151 | for idx, mask in enumerate(task_masks): 152 | task_box = [] 153 | task_class = [] 154 | for m in mask: 155 | task_box.append(gt_bboxes_3d[m]) 156 | # 0 is background for each task, so we need to add 1 here. 157 | task_class.append(gt_labels_3d[m] + 1 - flag2) 158 | task_boxes.append( 159 | torch.cat(task_box, axis=0).to(gt_bboxes_3d.device)) 160 | task_classes.append( 161 | torch.cat(task_class).long().to(gt_bboxes_3d.device)) 162 | flag2 += len(mask) 163 | draw_gaussian = draw_heatmap_gaussian 164 | heatmaps, anno_boxes, inds, masks = [], [], [], [] 165 | 166 | for idx, task_head in enumerate(self.task_heads): 167 | heatmap = gt_bboxes_3d.new_zeros( 168 | (len(self.class_names[idx]), feature_map_size[1], 169 | feature_map_size[0]), 170 | device='cuda') 171 | 172 | anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), 173 | dtype=torch.float32, 174 | device='cuda') 175 | 176 | ind = gt_labels_3d.new_zeros((max_objs), 177 | dtype=torch.int64, 178 | device='cuda') 179 | mask = gt_bboxes_3d.new_zeros((max_objs), 180 | dtype=torch.uint8, 181 | device='cuda') 182 | 183 | num_objs = min(task_boxes[idx].shape[0], max_objs) 184 | 185 | for k in range(num_objs): 186 | cls_id = task_classes[idx][k] - 1 187 | 188 | width = task_boxes[idx][k][3] 189 | length = task_boxes[idx][k][4] 190 | width = width / voxel_size[0] / self.train_cfg[ 191 | 'out_size_factor'] 192 | length = length / voxel_size[1] / self.train_cfg[ 193 | 'out_size_factor'] 194 | 195 | if width > 0 and length > 0: 196 | radius = gaussian_radius( 197 | (length, width), 198 | min_overlap=self.train_cfg['gaussian_overlap']) 199 | radius = max(self.train_cfg['min_radius'], int(radius)) 200 | 201 | # be really careful for the coordinate system of 202 | # your box annotation. 
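                    # The box center (x, y) is converted from metric
                    # coordinates to feature-map grid indices: subtract the
                    # range origin, then divide by voxel_size * out_size_factor.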
203 | x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ 204 | 1], task_boxes[idx][k][2] 205 | 206 | coor_x = ( 207 | x - pc_range[0] 208 | ) / voxel_size[0] / self.train_cfg['out_size_factor'] 209 | coor_y = ( 210 | y - pc_range[1] 211 | ) / voxel_size[1] / self.train_cfg['out_size_factor'] 212 | 213 | center = torch.tensor([coor_x, coor_y], 214 | dtype=torch.float32, 215 | device='cuda') 216 | center_int = center.to(torch.int32) 217 | 218 | # throw out not in range objects to avoid out of array 219 | # area when creating the heatmap 220 | if not (0 <= center_int[0] < feature_map_size[0] 221 | and 0 <= center_int[1] < feature_map_size[1]): 222 | continue 223 | 224 | draw_gaussian(heatmap[cls_id], center_int, radius) 225 | 226 | new_idx = k 227 | x, y = center_int[0], center_int[1] 228 | 229 | assert y * feature_map_size[0] + x < feature_map_size[ 230 | 0] * feature_map_size[1] 231 | 232 | ind[new_idx] = y * feature_map_size[0] + x 233 | mask[new_idx] = 1 234 | # TODO: support other outdoor dataset 235 | vx, vy = task_boxes[idx][k][7:] 236 | rot = task_boxes[idx][k][6] 237 | box_dim = task_boxes[idx][k][3:6] 238 | if self.norm_bbox: 239 | box_dim = box_dim.log() 240 | anno_box[new_idx] = torch.cat([ 241 | center - torch.tensor([x, y], device='cuda'), 242 | z.unsqueeze(0), 243 | box_dim, 244 | torch.sin(rot).unsqueeze(0), 245 | torch.cos(rot).unsqueeze(0), 246 | vx.unsqueeze(0), 247 | vy.unsqueeze(0), 248 | ]) 249 | 250 | heatmaps.append(heatmap) 251 | anno_boxes.append(anno_box) 252 | masks.append(mask) 253 | inds.append(ind) 254 | return heatmaps, anno_boxes, inds, masks 255 | 256 | def loss(self, targets, preds_dicts, **kwargs): 257 | """Loss function for BEVDepthHead. 258 | 259 | Args: 260 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 261 | truth gt boxes. 262 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 263 | preds_dicts (dict): Output of forward function. 264 | 265 | Returns: 266 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
267 | """ 268 | heatmaps, anno_boxes, inds, masks = targets 269 | return_loss = 0 270 | for task_id, preds_dict in enumerate(preds_dicts): 271 | # heatmap focal loss 272 | preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) 273 | num_pos = heatmaps[task_id].eq(1).float().sum().item() 274 | cls_avg_factor = torch.clamp(reduce_mean( 275 | heatmaps[task_id].new_tensor(num_pos)), 276 | min=1).item() 277 | loss_heatmap = self.loss_cls(preds_dict[0]['heatmap'], 278 | heatmaps[task_id], 279 | avg_factor=cls_avg_factor) 280 | target_box = anno_boxes[task_id] 281 | # reconstruct the anno_box from multiple reg heads 282 | preds_dict[0]['anno_box'] = torch.cat( 283 | ( 284 | preds_dict[0]['reg'], 285 | preds_dict[0]['height'], 286 | preds_dict[0]['dim'], 287 | preds_dict[0]['rot'], 288 | preds_dict[0]['vel'], 289 | ), 290 | dim=1, 291 | ) 292 | 293 | # Regression loss for dimension, offset, height, rotation 294 | num = masks[task_id].float().sum() 295 | ind = inds[task_id] 296 | pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() 297 | pred = pred.view(pred.size(0), -1, pred.size(3)) 298 | pred = self._gather_feat(pred, ind) 299 | mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() 300 | num = torch.clamp(reduce_mean(target_box.new_tensor(num)), 301 | min=1e-4).item() 302 | isnotnan = (~torch.isnan(target_box)).float() 303 | mask *= isnotnan 304 | code_weights = self.train_cfg['code_weights'] 305 | bbox_weights = mask * mask.new_tensor(code_weights) 306 | loss_bbox = self.loss_bbox(pred, 307 | target_box, 308 | bbox_weights, 309 | avg_factor=num) 310 | return_loss += loss_bbox 311 | return_loss += loss_heatmap 312 | return return_loss 313 | -------------------------------------------------------------------------------- /layers/heads/bev_depth_head.py: -------------------------------------------------------------------------------- 1 | """Inherited from `https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/dense_heads/centerpoint_head.py`""" # noqa 2 | import torch 3 | from mmdet3d.core import draw_heatmap_gaussian, gaussian_radius 4 | from mmdet3d.models.dense_heads.centerpoint_head import CenterHead 5 | from mmdet3d.models.utils import clip_sigmoid 6 | from mmdet.core import reduce_mean 7 | from mmdet.models import build_backbone 8 | from mmdet3d.models import build_neck 9 | from torch.cuda.amp import autocast 10 | 11 | __all__ = ['BEVDepthHead'] 12 | 13 | bev_backbone_conf = dict( 14 | type='ResNet', 15 | in_channels=80, 16 | depth=18, 17 | num_stages=3, 18 | strides=(1, 2, 2), 19 | dilations=(1, 1, 1), 20 | out_indices=[0, 1, 2], 21 | norm_eval=False, 22 | base_channels=160, 23 | ) 24 | 25 | bev_neck_conf = dict(type='SECONDFPN', 26 | in_channels=[160, 320, 640], 27 | upsample_strides=[2, 4, 8], 28 | out_channels=[64, 64, 128]) 29 | 30 | 31 | class BEVDepthHead(CenterHead): 32 | """Head for BevDepth. 33 | 34 | Args: 35 | in_channels(int): Number of channels after bev_neck. 36 | tasks(dict): Tasks for head. 37 | bbox_coder(dict): Config of bbox coder. 38 | common_heads(dict): Config of head for each task. 39 | loss_cls(dict): Config of classification loss. 40 | loss_bbox(dict): Config of regression loss. 41 | gaussian_overlap(float): Gaussian overlap used for `get_targets`. 42 | min_radius(int): Min radius used for `get_targets`. 43 | train_cfg(dict): Config used in the training process. 44 | test_cfg(dict): Config used in the test process. 45 | bev_backbone_conf(dict): Cnfig of bev_backbone. 46 | bev_neck_conf(dict): Cnfig of bev_neck. 
47 | """ 48 | def __init__( 49 | self, 50 | in_channels=256, 51 | tasks=None, 52 | bbox_coder=None, 53 | common_heads=dict(), 54 | loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), 55 | loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), 56 | gaussian_overlap=0.1, 57 | min_radius=2, 58 | train_cfg=None, 59 | test_cfg=None, 60 | bev_backbone_conf=bev_backbone_conf, 61 | bev_neck_conf=bev_neck_conf, 62 | separate_head=dict(type='SeparateHead', 63 | init_bias=-2.19, 64 | final_kernel=3), 65 | ): 66 | super(BEVDepthHead, self).__init__( 67 | in_channels=in_channels, 68 | tasks=tasks, 69 | bbox_coder=bbox_coder, 70 | common_heads=common_heads, 71 | loss_cls=loss_cls, 72 | loss_bbox=loss_bbox, 73 | separate_head=separate_head, 74 | ) 75 | self.trunk = build_backbone(bev_backbone_conf) 76 | self.trunk.init_weights() 77 | self.neck = build_neck(bev_neck_conf) 78 | self.neck.init_weights() 79 | del self.trunk.maxpool 80 | self.gaussian_overlap = gaussian_overlap 81 | self.min_radius = min_radius 82 | self.train_cfg = train_cfg 83 | self.test_cfg = test_cfg 84 | 85 | @autocast(False) 86 | def forward(self, x): 87 | """Forward pass. 88 | 89 | Args: 90 | feats (list[torch.Tensor]): Multi-level features, e.g., 91 | features produced by FPN. 92 | 93 | Returns: 94 | tuple(list[dict]): Output results for tasks. 95 | """ 96 | # FPN 97 | trunk_outs = [x] 98 | if self.trunk.deep_stem: 99 | x = self.trunk.stem(x) 100 | else: 101 | x = self.trunk.conv1(x) 102 | x = self.trunk.norm1(x) 103 | x = self.trunk.relu(x) 104 | for i, layer_name in enumerate(self.trunk.res_layers): 105 | res_layer = getattr(self.trunk, layer_name) 106 | x = res_layer(x) 107 | if i in self.trunk.out_indices: 108 | trunk_outs.append(x) 109 | fpn_output = self.neck(trunk_outs) 110 | ret_values = super().forward(fpn_output) 111 | return ret_values 112 | 113 | def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): 114 | """Generate training targets for a single sample. 115 | 116 | Args: 117 | gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. 118 | gt_labels_3d (torch.Tensor): Labels of boxes. 119 | 120 | Returns: 121 | tuple[list[torch.Tensor]]: Tuple of target including \ 122 | the following results in order. 123 | 124 | - list[torch.Tensor]: Heatmap scores. 125 | - list[torch.Tensor]: Ground truth boxes. 126 | - list[torch.Tensor]: Indexes indicating the position \ 127 | of the valid boxes. 128 | - list[torch.Tensor]: Masks indicating which boxes \ 129 | are valid. 130 | """ 131 | max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] 132 | grid_size = torch.tensor(self.train_cfg['grid_size']) 133 | pc_range = torch.tensor(self.train_cfg['point_cloud_range']) 134 | voxel_size = torch.tensor(self.train_cfg['voxel_size']) 135 | 136 | feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] 137 | 138 | # reorganize the gt_dict by tasks 139 | task_masks = [] 140 | flag = 0 141 | for class_name in self.class_names: 142 | task_masks.append([ 143 | torch.where(gt_labels_3d == class_name.index(i) + flag) 144 | for i in class_name 145 | ]) 146 | flag += len(class_name) 147 | 148 | task_boxes = [] 149 | task_classes = [] 150 | flag2 = 0 151 | for idx, mask in enumerate(task_masks): 152 | task_box = [] 153 | task_class = [] 154 | for m in mask: 155 | task_box.append(gt_bboxes_3d[m]) 156 | # 0 is background for each task, so we need to add 1 here. 
157 | task_class.append(gt_labels_3d[m] + 1 - flag2) 158 | task_boxes.append( 159 | torch.cat(task_box, axis=0).to(gt_bboxes_3d.device)) 160 | task_classes.append( 161 | torch.cat(task_class).long().to(gt_bboxes_3d.device)) 162 | flag2 += len(mask) 163 | draw_gaussian = draw_heatmap_gaussian 164 | heatmaps, anno_boxes, inds, masks = [], [], [], [] 165 | 166 | for idx, task_head in enumerate(self.task_heads): 167 | heatmap = gt_bboxes_3d.new_zeros( 168 | (len(self.class_names[idx]), feature_map_size[1], 169 | feature_map_size[0]), 170 | device='cuda') 171 | 172 | anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), 173 | dtype=torch.float32, 174 | device='cuda') 175 | 176 | ind = gt_labels_3d.new_zeros((max_objs), 177 | dtype=torch.int64, 178 | device='cuda') 179 | mask = gt_bboxes_3d.new_zeros((max_objs), 180 | dtype=torch.uint8, 181 | device='cuda') 182 | 183 | num_objs = min(task_boxes[idx].shape[0], max_objs) 184 | 185 | for k in range(num_objs): 186 | cls_id = task_classes[idx][k] - 1 187 | 188 | width = task_boxes[idx][k][3] 189 | length = task_boxes[idx][k][4] 190 | width = width / voxel_size[0] / self.train_cfg[ 191 | 'out_size_factor'] 192 | length = length / voxel_size[1] / self.train_cfg[ 193 | 'out_size_factor'] 194 | 195 | if width > 0 and length > 0: 196 | radius = gaussian_radius( 197 | (length, width), 198 | min_overlap=self.train_cfg['gaussian_overlap']) 199 | radius = max(self.train_cfg['min_radius'], int(radius)) 200 | 201 | # be really careful for the coordinate system of 202 | # your box annotation. 203 | x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ 204 | 1], task_boxes[idx][k][2] 205 | 206 | coor_x = ( 207 | x - pc_range[0] 208 | ) / voxel_size[0] / self.train_cfg['out_size_factor'] 209 | coor_y = ( 210 | y - pc_range[1] 211 | ) / voxel_size[1] / self.train_cfg['out_size_factor'] 212 | 213 | center = torch.tensor([coor_x, coor_y], 214 | dtype=torch.float32, 215 | device='cuda') 216 | center_int = center.to(torch.int32) 217 | 218 | # throw out not in range objects to avoid out of array 219 | # area when creating the heatmap 220 | if not (0 <= center_int[0] < feature_map_size[0] 221 | and 0 <= center_int[1] < feature_map_size[1]): 222 | continue 223 | 224 | draw_gaussian(heatmap[cls_id], center_int, radius) 225 | 226 | new_idx = k 227 | x, y = center_int[0], center_int[1] 228 | 229 | assert y * feature_map_size[0] + x < feature_map_size[ 230 | 0] * feature_map_size[1] 231 | 232 | ind[new_idx] = y * feature_map_size[0] + x 233 | mask[new_idx] = 1 234 | # TODO: support other outdoor dataset 235 | vx, vy = task_boxes[idx][k][7:] 236 | rot = task_boxes[idx][k][6] 237 | box_dim = task_boxes[idx][k][3:6] 238 | if self.norm_bbox: 239 | box_dim = box_dim.log() 240 | anno_box[new_idx] = torch.cat([ 241 | center - torch.tensor([x, y], device='cuda'), 242 | z.unsqueeze(0), 243 | box_dim, 244 | torch.sin(rot).unsqueeze(0), 245 | torch.cos(rot).unsqueeze(0), 246 | vx.unsqueeze(0), 247 | vy.unsqueeze(0), 248 | ]) 249 | 250 | heatmaps.append(heatmap) 251 | anno_boxes.append(anno_box) 252 | masks.append(mask) 253 | inds.append(ind) 254 | return heatmaps, anno_boxes, inds, masks 255 | 256 | def loss(self, targets, preds_dicts, **kwargs): 257 | """Loss function for BEVDepthHead. 258 | 259 | Args: 260 | gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground 261 | truth gt boxes. 262 | gt_labels_3d (list[torch.Tensor]): Labels of boxes. 263 | preds_dicts (dict): Output of forward function. 264 | 265 | Returns: 266 | dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
267 | """ 268 | heatmaps, anno_boxes, inds, masks = targets 269 | return_loss = 0 270 | for task_id, preds_dict in enumerate(preds_dicts): 271 | # heatmap focal loss 272 | preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) 273 | num_pos = heatmaps[task_id].eq(1).float().sum().item() 274 | cls_avg_factor = torch.clamp(reduce_mean( 275 | heatmaps[task_id].new_tensor(num_pos)), 276 | min=1).item() 277 | loss_heatmap = self.loss_cls(preds_dict[0]['heatmap'], 278 | heatmaps[task_id], 279 | avg_factor=cls_avg_factor) 280 | target_box = anno_boxes[task_id] 281 | # reconstruct the anno_box from multiple reg heads 282 | preds_dict[0]['anno_box'] = torch.cat( 283 | ( 284 | preds_dict[0]['reg'], 285 | preds_dict[0]['height'], 286 | preds_dict[0]['dim'], 287 | preds_dict[0]['rot'], 288 | preds_dict[0]['vel'], 289 | ), 290 | dim=1, 291 | ) 292 | 293 | # Regression loss for dimension, offset, height, rotation 294 | num = masks[task_id].float().sum() 295 | ind = inds[task_id] 296 | pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() 297 | pred = pred.view(pred.size(0), -1, pred.size(3)) 298 | pred = self._gather_feat(pred, ind) 299 | mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() 300 | num = torch.clamp(reduce_mean(target_box.new_tensor(num)), 301 | min=1e-4).item() 302 | isnotnan = (~torch.isnan(target_box)).float() 303 | mask *= isnotnan 304 | code_weights = self.train_cfg['code_weights'] 305 | bbox_weights = mask * mask.new_tensor(code_weights) 306 | loss_bbox = self.loss_bbox(pred, 307 | target_box, 308 | bbox_weights, 309 | avg_factor=num) 310 | return_loss += loss_bbox 311 | return_loss += loss_heatmap 312 | return return_loss 313 | -------------------------------------------------------------------------------- /layers/backbones/lss_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 
2 | import torch 3 | import torch.nn.functional as F 4 | from mmcv.cnn import build_conv_layer 5 | from mmdet3d.models import build_neck 6 | from mmdet.models import build_backbone 7 | from mmdet.models.backbones.resnet import BasicBlock 8 | from torch import nn 9 | 10 | from ops.voxel_pooling import voxel_pooling 11 | 12 | __all__ = ['LSSFPN'] 13 | 14 | 15 | class _ASPPModule(nn.Module): 16 | def __init__(self, inplanes, planes, kernel_size, padding, dilation, 17 | BatchNorm): 18 | super(_ASPPModule, self).__init__() 19 | self.atrous_conv = nn.Conv2d(inplanes, 20 | planes, 21 | kernel_size=kernel_size, 22 | stride=1, 23 | padding=padding, 24 | dilation=dilation, 25 | bias=False) 26 | self.bn = BatchNorm(planes) 27 | self.relu = nn.ReLU() 28 | 29 | self._init_weight() 30 | 31 | def forward(self, x): 32 | x = self.atrous_conv(x) 33 | x = self.bn(x) 34 | 35 | return self.relu(x) 36 | 37 | def _init_weight(self): 38 | for m in self.modules(): 39 | if isinstance(m, nn.Conv2d): 40 | torch.nn.init.kaiming_normal_(m.weight) 41 | elif isinstance(m, nn.BatchNorm2d): 42 | m.weight.data.fill_(1) 43 | m.bias.data.zero_() 44 | 45 | 46 | class ASPP(nn.Module): 47 | def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): 48 | super(ASPP, self).__init__() 49 | 50 | dilations = [1, 6, 12, 18] 51 | 52 | self.aspp1 = _ASPPModule(inplanes, 53 | mid_channels, 54 | 1, 55 | padding=0, 56 | dilation=dilations[0], 57 | BatchNorm=BatchNorm) 58 | self.aspp2 = _ASPPModule(inplanes, 59 | mid_channels, 60 | 3, 61 | padding=dilations[1], 62 | dilation=dilations[1], 63 | BatchNorm=BatchNorm) 64 | self.aspp3 = _ASPPModule(inplanes, 65 | mid_channels, 66 | 3, 67 | padding=dilations[2], 68 | dilation=dilations[2], 69 | BatchNorm=BatchNorm) 70 | self.aspp4 = _ASPPModule(inplanes, 71 | mid_channels, 72 | 3, 73 | padding=dilations[3], 74 | dilation=dilations[3], 75 | BatchNorm=BatchNorm) 76 | 77 | self.global_avg_pool = nn.Sequential( 78 | nn.AdaptiveAvgPool2d((1, 1)), 79 | nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 80 | BatchNorm(mid_channels), 81 | nn.ReLU(), 82 | ) 83 | self.conv1 = nn.Conv2d(int(mid_channels * 5), 84 | mid_channels, 85 | 1, 86 | bias=False) 87 | self.bn1 = BatchNorm(mid_channels) 88 | self.relu = nn.ReLU() 89 | self.dropout = nn.Dropout(0.5) 90 | self._init_weight() 91 | 92 | def forward(self, x): 93 | x1 = self.aspp1(x) 94 | x2 = self.aspp2(x) 95 | x3 = self.aspp3(x) 96 | x4 = self.aspp4(x) 97 | x5 = self.global_avg_pool(x) 98 | x5 = F.interpolate(x5, 99 | size=x4.size()[2:], 100 | mode='bilinear', 101 | align_corners=True) 102 | x = torch.cat((x1, x2, x3, x4, x5), dim=1) 103 | 104 | x = self.conv1(x) 105 | x = self.bn1(x) 106 | x = self.relu(x) 107 | 108 | return self.dropout(x) 109 | 110 | def _init_weight(self): 111 | for m in self.modules(): 112 | if isinstance(m, nn.Conv2d): 113 | torch.nn.init.kaiming_normal_(m.weight) 114 | elif isinstance(m, nn.BatchNorm2d): 115 | m.weight.data.fill_(1) 116 | m.bias.data.zero_() 117 | 118 | 119 | class Mlp(nn.Module): 120 | def __init__(self, 121 | in_features, 122 | hidden_features=None, 123 | out_features=None, 124 | act_layer=nn.ReLU, 125 | drop=0.0): 126 | super().__init__() 127 | out_features = out_features or in_features 128 | hidden_features = hidden_features or in_features 129 | self.fc1 = nn.Linear(in_features, hidden_features) 130 | self.act = act_layer() 131 | self.drop1 = nn.Dropout(drop) 132 | self.fc2 = nn.Linear(hidden_features, out_features) 133 | self.drop2 = nn.Dropout(drop) 134 | 135 | def forward(self, x): 136 
| x = self.fc1(x) 137 | x = self.act(x) 138 | x = self.drop1(x) 139 | x = self.fc2(x) 140 | x = self.drop2(x) 141 | return x 142 | 143 | 144 | class SELayer(nn.Module): 145 | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): 146 | super().__init__() 147 | self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) 148 | self.act1 = act_layer() 149 | self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) 150 | self.gate = gate_layer() 151 | 152 | def forward(self, x, x_se): 153 | x_se = self.conv_reduce(x_se) 154 | x_se = self.act1(x_se) 155 | x_se = self.conv_expand(x_se) 156 | return x * self.gate(x_se) 157 | 158 | 159 | class DepthNet(nn.Module): 160 | def __init__(self, in_channels, mid_channels, context_channels, 161 | depth_channels): 162 | super(DepthNet, self).__init__() 163 | self.reduce_conv = nn.Sequential( 164 | nn.Conv2d(in_channels, 165 | mid_channels, 166 | kernel_size=3, 167 | stride=1, 168 | padding=1), 169 | nn.BatchNorm2d(mid_channels), 170 | nn.ReLU(inplace=True), 171 | ) 172 | self.context_conv = nn.Conv2d(mid_channels, 173 | context_channels, 174 | kernel_size=1, 175 | stride=1, 176 | padding=0) 177 | self.bn = nn.BatchNorm1d(27) 178 | self.depth_mlp = Mlp(27, mid_channels, mid_channels) 179 | self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware 180 | self.context_mlp = Mlp(27, mid_channels, mid_channels) 181 | self.context_se = SELayer(mid_channels) # NOTE: add camera-aware 182 | self.depth_conv = nn.Sequential( 183 | BasicBlock(mid_channels, mid_channels), 184 | BasicBlock(mid_channels, mid_channels), 185 | BasicBlock(mid_channels, mid_channels), 186 | ASPP(mid_channels, mid_channels), 187 | build_conv_layer(cfg=dict( 188 | type='DCN', 189 | in_channels=mid_channels, 190 | out_channels=mid_channels, 191 | kernel_size=3, 192 | padding=1, 193 | groups=4, 194 | im2col_step=128, 195 | )), 196 | nn.Conv2d(mid_channels, 197 | depth_channels, 198 | kernel_size=1, 199 | stride=1, 200 | padding=0), 201 | ) 202 | 203 | def forward(self, x, mats_dict): 204 | intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3] 205 | batch_size = intrins.shape[0] 206 | num_cams = intrins.shape[2] 207 | ida = mats_dict['ida_mats'][:, 0:1, ...] 
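        # Camera-aware conditioning: the 27-dim mlp_input assembled just below is,
        # per camera, 4 intrinsic terms (fx, fy, cx, cy) + 6 ida terms (the 2x2
        # rotation/scale block plus two translations) + 5 bda terms (the 2x2
        # rotation/scale block plus the z scale) + 12 sensor2ego terms (the
        # flattened 3x4 extrinsic), which matches nn.BatchNorm1d(27) and
        # Mlp(27, ...) in __init__.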
208 | sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :] 209 | bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4, 210 | 4).repeat(1, 1, num_cams, 1, 1) 211 | mlp_input = torch.cat( 212 | [ 213 | torch.stack( 214 | [ 215 | intrins[:, 0:1, ..., 0, 0], 216 | intrins[:, 0:1, ..., 1, 1], 217 | intrins[:, 0:1, ..., 0, 2], 218 | intrins[:, 0:1, ..., 1, 2], 219 | ida[:, 0:1, ..., 0, 0], 220 | ida[:, 0:1, ..., 0, 1], 221 | ida[:, 0:1, ..., 0, 3], 222 | ida[:, 0:1, ..., 1, 0], 223 | ida[:, 0:1, ..., 1, 1], 224 | ida[:, 0:1, ..., 1, 3], 225 | bda[:, 0:1, ..., 0, 0], 226 | bda[:, 0:1, ..., 0, 1], 227 | bda[:, 0:1, ..., 1, 0], 228 | bda[:, 0:1, ..., 1, 1], 229 | bda[:, 0:1, ..., 2, 2], 230 | ], 231 | dim=-1, 232 | ), 233 | sensor2ego.view(batch_size, 1, num_cams, -1), 234 | ], 235 | -1, 236 | ) 237 | mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) 238 | x = self.reduce_conv(x) 239 | context_se = self.context_mlp(mlp_input)[..., None, None] 240 | context = self.context_se(x, context_se) 241 | context = self.context_conv(context) 242 | depth_se = self.depth_mlp(mlp_input)[..., None, None] 243 | depth = self.depth_se(x, depth_se) 244 | depth_mid = depth 245 | depth = self.depth_conv(depth) 246 | return torch.cat([depth, context], dim=1), depth_mid 247 | 248 | 249 | class LSSFPN(nn.Module): 250 | def __init__(self, x_bound, y_bound, z_bound, d_bound, final_dim, 251 | downsample_factor, output_channels, img_backbone_conf, 252 | img_neck_conf, depth_net_conf): 253 | """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`. 254 | 255 | Args: 256 | x_bound (list): Boundaries for x. 257 | y_bound (list): Boundaries for y. 258 | z_bound (list): Boundaries for z. 259 | d_bound (list): Boundaries for d. 260 | final_dim (list): Dimension for input images. 261 | downsample_factor (int): Downsample factor between feature map 262 | and input image. 263 | output_channels (int): Number of channels for the output 264 | feature map. 265 | img_backbone_conf (dict): Config for image backbone. 266 | img_neck_conf (dict): Config for image neck. 267 | depth_net_conf (dict): Config for depth net. 
268 | """ 269 | 270 | super(LSSFPN, self).__init__() 271 | self.downsample_factor = downsample_factor 272 | self.d_bound = d_bound 273 | self.final_dim = final_dim 274 | self.output_channels = output_channels 275 | 276 | self.register_buffer( 277 | 'voxel_size', 278 | torch.Tensor([row[2] for row in [x_bound, y_bound, z_bound]])) 279 | self.register_buffer( 280 | 'voxel_coord', 281 | torch.Tensor([ 282 | row[0] + row[2] / 2.0 for row in [x_bound, y_bound, z_bound] 283 | ])) 284 | self.register_buffer( 285 | 'voxel_num', 286 | torch.LongTensor([(row[1] - row[0]) / row[2] 287 | for row in [x_bound, y_bound, z_bound]])) 288 | self.register_buffer('frustum', self.create_frustum()) 289 | self.depth_channels, _, _, _ = self.frustum.shape 290 | 291 | self.img_backbone = build_backbone(img_backbone_conf) 292 | self.img_neck = build_neck(img_neck_conf) 293 | self.depth_net = self._configure_depth_net(depth_net_conf) 294 | 295 | self.img_neck.init_weights() 296 | self.img_backbone.init_weights() 297 | 298 | def _configure_depth_net(self, depth_net_conf): 299 | return DepthNet( 300 | depth_net_conf['in_channels'], 301 | depth_net_conf['mid_channels'], 302 | self.output_channels, 303 | self.depth_channels, 304 | ) 305 | 306 | def create_frustum(self): 307 | """Generate frustum""" 308 | # make grid in image plane 309 | ogfH, ogfW = self.final_dim 310 | fH, fW = ogfH // self.downsample_factor, ogfW // self.downsample_factor 311 | d_coords = torch.arange(*self.d_bound, 312 | dtype=torch.float).view(-1, 1, 313 | 1).expand(-1, fH, fW) 314 | D, _, _ = d_coords.shape 315 | x_coords = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view( 316 | 1, 1, fW).expand(D, fH, fW) 317 | y_coords = torch.linspace(0, ogfH - 1, fH, 318 | dtype=torch.float).view(1, fH, 319 | 1).expand(D, fH, fW) 320 | paddings = torch.ones_like(d_coords) 321 | 322 | # D x H x W x 3 323 | frustum = torch.stack((x_coords, y_coords, d_coords, paddings), -1) 324 | return frustum 325 | 326 | def get_geometry(self, sensor2ego_mat, intrin_mat, ida_mat, bda_mat): 327 | """Transfer points from camera coord to ego coord. 328 | 329 | Args: 330 | rots(Tensor): Rotation matrix from camera to ego. 331 | trans(Tensor): Translation matrix from camera to ego. 332 | intrins(Tensor): Intrinsic matrix. 333 | post_rots_ida(Tensor): Rotation matrix for ida. 334 | post_trans_ida(Tensor): Translation matrix for ida 335 | post_rot_bda(Tensor): Rotation matrix for bda. 336 | 337 | Returns: 338 | Tensors: points ego coord. 
339 | """ 340 | batch_size, num_cams, _, _ = sensor2ego_mat.shape 341 | 342 | # undo post-transformation 343 | # B x N x D x H x W x 3 344 | points = self.frustum 345 | ida_mat = ida_mat.view(batch_size, num_cams, 1, 1, 1, 4, 4) 346 | points = ida_mat.inverse().matmul(points.unsqueeze(-1)) 347 | # cam_to_ego 348 | points = torch.cat( 349 | (points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], 350 | points[:, :, :, :, :, 2:]), 5) 351 | 352 | combine = sensor2ego_mat.matmul(torch.inverse(intrin_mat)) 353 | points = combine.view(batch_size, num_cams, 1, 1, 1, 4, 354 | 4).matmul(points) 355 | if bda_mat is not None: 356 | bda_mat = bda_mat.unsqueeze(1).repeat(1, num_cams, 1, 1).view( 357 | batch_size, num_cams, 1, 1, 1, 4, 4) 358 | points = (bda_mat @ points).squeeze(-1) 359 | else: 360 | points = points.squeeze(-1) 361 | return points[..., :3] 362 | 363 | def get_cam_feats(self, imgs): 364 | """Get feature maps from images.""" 365 | batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape 366 | 367 | imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams, 368 | num_channels, imH, imW) 369 | img_feats = self.img_neck(self.img_backbone(imgs))[0] 370 | img_feats = img_feats.reshape(batch_size, num_sweeps, num_cams, 371 | img_feats.shape[1], img_feats.shape[2], 372 | img_feats.shape[3]) 373 | return img_feats 374 | 375 | def _forward_depth_net(self, feat, mats_dict): 376 | return self.depth_net(feat, mats_dict) 377 | 378 | def _forward_voxel_net(self, img_feat_with_depth): 379 | return img_feat_with_depth 380 | 381 | def _forward_single_sweep(self, 382 | sweep_index, 383 | sweep_imgs, 384 | mats_dict, 385 | is_return_depth=False): 386 | """Forward function for single sweep. 387 | 388 | Args: 389 | sweep_index (int): Index of sweeps. 390 | sweep_imgs (Tensor): Input images. 391 | mats_dict (dict): 392 | sensor2ego_mats(Tensor): Transformation matrix from 393 | camera to ego with shape of (B, num_sweeps, 394 | num_cameras, 4, 4). 395 | intrin_mats(Tensor): Intrinsic matrix with shape 396 | of (B, num_sweeps, num_cameras, 4, 4). 397 | ida_mats(Tensor): Transformation matrix for ida with 398 | shape of (B, num_sweeps, num_cameras, 4, 4). 399 | sensor2sensor_mats(Tensor): Transformation matrix 400 | from key frame camera to sweep frame camera with 401 | shape of (B, num_sweeps, num_cameras, 4, 4). 402 | bda_mat(Tensor): Rotation matrix for bda with shape 403 | of (B, 4, 4). 404 | is_return_depth (bool, optional): Whether to return depth. 405 | Default: False. 406 | 407 | Returns: 408 | Tensor: BEV feature map. 409 | """ 410 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 411 | img_width = sweep_imgs.shape 412 | img_feats = self.get_cam_feats(sweep_imgs) 413 | source_features = img_feats[:, 0, ...] 
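        # Lift step: key-frame features are flattened to (B * num_cams, C, H, W)
        # and passed through the depth net, which returns depth logits (first
        # depth_channels channels) concatenated with context features (next
        # output_channels channels), plus the intermediate camera-aware feature
        # (cp_feature / depth_mid).  The outer product of the softmaxed depth bins
        # and the context features below builds a per-camera frustum volume of
        # shape (B * num_cams, output_channels, D, fH, fW).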
414 | depth_feature, cp_feature = self._forward_depth_net( 415 | source_features.reshape(batch_size * num_cams, 416 | source_features.shape[2], 417 | source_features.shape[3], 418 | source_features.shape[4]), 419 | mats_dict, 420 | ) 421 | depth = depth_feature[:, :self.depth_channels].softmax(1) 422 | img_feat_with_depth = depth.unsqueeze( 423 | 1) * depth_feature[:, self.depth_channels:( 424 | self.depth_channels + self.output_channels)].unsqueeze(2) 425 | 426 | # get voxel feature 427 | voxel_feature = img_feat_with_depth 428 | 429 | img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth) 430 | 431 | img_feat_with_depth = img_feat_with_depth.reshape( 432 | batch_size, 433 | num_cams, 434 | img_feat_with_depth.shape[1], 435 | img_feat_with_depth.shape[2], 436 | img_feat_with_depth.shape[3], 437 | img_feat_with_depth.shape[4], 438 | ) 439 | geom_xyz = self.get_geometry( 440 | mats_dict['sensor2ego_mats'][:, sweep_index, ...], 441 | mats_dict['intrin_mats'][:, sweep_index, ...], 442 | mats_dict['ida_mats'][:, sweep_index, ...], 443 | mats_dict.get('bda_mat', None), 444 | ) 445 | img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2) 446 | geom_xyz = ((geom_xyz - (self.voxel_coord - self.voxel_size / 2.0)) / 447 | self.voxel_size).int() 448 | feature_map = voxel_pooling(geom_xyz, img_feat_with_depth.contiguous(), 449 | self.voxel_num.cuda()) 450 | if is_return_depth: 451 | return feature_map.contiguous(), depth, source_features, voxel_feature, cp_feature 452 | return feature_map.contiguous() 453 | 454 | def forward(self, 455 | sweep_imgs, 456 | mats_dict, 457 | timestamps=None, 458 | is_return_depth=False): 459 | """Forward function. 460 | 461 | Args: 462 | sweep_imgs(Tensor): Input images with shape of (B, num_sweeps, 463 | num_cameras, 3, H, W). 464 | mats_dict(dict): 465 | sensor2ego_mats(Tensor): Transformation matrix from 466 | camera to ego with shape of (B, num_sweeps, 467 | num_cameras, 4, 4). 468 | intrin_mats(Tensor): Intrinsic matrix with shape 469 | of (B, num_sweeps, num_cameras, 4, 4). 470 | ida_mats(Tensor): Transformation matrix for ida with 471 | shape of (B, num_sweeps, num_cameras, 4, 4). 472 | sensor2sensor_mats(Tensor): Transformation matrix 473 | from key frame camera to sweep frame camera with 474 | shape of (B, num_sweeps, num_cameras, 4, 4). 475 | bda_mat(Tensor): Rotation matrix for bda with shape 476 | of (B, 4, 4). 477 | timestamps(Tensor): Timestamp for all images with the shape of(B, 478 | num_sweeps, num_cameras). 479 | 480 | Return: 481 | Tensor: bev feature map. 
482 | """ 483 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 484 | img_width = sweep_imgs.shape 485 | 486 | key_frame_res = self._forward_single_sweep( 487 | 0, 488 | sweep_imgs[:, 0:1, ...], 489 | mats_dict, 490 | is_return_depth=is_return_depth) 491 | if num_sweeps == 1: 492 | return key_frame_res 493 | 494 | key_frame_feature = key_frame_res[ 495 | 0] if is_return_depth else key_frame_res 496 | 497 | ret_feature_list = [key_frame_feature] 498 | for sweep_index in range(1, num_sweeps): 499 | with torch.no_grad(): 500 | feature_map = self._forward_single_sweep( 501 | sweep_index, 502 | sweep_imgs[:, sweep_index:sweep_index + 1, ...], 503 | mats_dict, 504 | is_return_depth=False) 505 | ret_feature_list.append(feature_map) 506 | 507 | if is_return_depth: 508 | return torch.cat(ret_feature_list, 1), key_frame_res[1], key_frame_res[2], key_frame_res[3], key_frame_res[4] 509 | else: 510 | return torch.cat(ret_feature_list, 1) 511 | -------------------------------------------------------------------------------- /layers/backbones/multi_head_fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Megvii Inc. All rights reserved. 2 | import torch 3 | import torch.nn.functional as F 4 | from mmcv.cnn import build_conv_layer 5 | from mmdet3d.models import build_neck 6 | from mmdet.models import build_backbone 7 | from mmdet.models.backbones.resnet import BasicBlock 8 | from torch import nn 9 | import time 10 | 11 | from ops.voxel_pooling import voxel_pooling 12 | from layers.voxel_mix import voxel_mix_net 13 | __all__ = ['LSSFPN'] 14 | 15 | 16 | class _ASPPModule(nn.Module): 17 | def __init__(self, inplanes, planes, kernel_size, padding, dilation, 18 | BatchNorm): 19 | super(_ASPPModule, self).__init__() 20 | self.atrous_conv = nn.Conv2d(inplanes, 21 | planes, 22 | kernel_size=kernel_size, 23 | stride=1, 24 | padding=padding, 25 | dilation=dilation, 26 | bias=False) 27 | self.bn = BatchNorm(planes) 28 | self.relu = nn.ReLU() 29 | 30 | self._init_weight() 31 | 32 | def forward(self, x): 33 | x = self.atrous_conv(x) 34 | x = self.bn(x) 35 | 36 | return self.relu(x) 37 | 38 | def _init_weight(self): 39 | for m in self.modules(): 40 | if isinstance(m, nn.Conv2d): 41 | torch.nn.init.kaiming_normal_(m.weight) 42 | elif isinstance(m, nn.BatchNorm2d): 43 | m.weight.data.fill_(1) 44 | m.bias.data.zero_() 45 | 46 | 47 | class ASPP(nn.Module): 48 | def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): 49 | super(ASPP, self).__init__() 50 | 51 | dilations = [1, 6, 12, 18] 52 | 53 | self.aspp1 = _ASPPModule(inplanes, 54 | mid_channels, 55 | 1, 56 | padding=0, 57 | dilation=dilations[0], 58 | BatchNorm=BatchNorm) 59 | self.aspp2 = _ASPPModule(inplanes, 60 | mid_channels, 61 | 3, 62 | padding=dilations[1], 63 | dilation=dilations[1], 64 | BatchNorm=BatchNorm) 65 | self.aspp3 = _ASPPModule(inplanes, 66 | mid_channels, 67 | 3, 68 | padding=dilations[2], 69 | dilation=dilations[2], 70 | BatchNorm=BatchNorm) 71 | self.aspp4 = _ASPPModule(inplanes, 72 | mid_channels, 73 | 3, 74 | padding=dilations[3], 75 | dilation=dilations[3], 76 | BatchNorm=BatchNorm) 77 | 78 | self.global_avg_pool = nn.Sequential( 79 | nn.AdaptiveAvgPool2d((1, 1)), 80 | nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 81 | BatchNorm(mid_channels), 82 | nn.ReLU(), 83 | ) 84 | self.conv1 = nn.Conv2d(int(mid_channels * 5), 85 | mid_channels, 86 | 1, 87 | bias=False) 88 | self.bn1 = BatchNorm(mid_channels) 89 | self.relu = nn.ReLU() 90 | self.dropout = 
nn.Dropout(0.5) 91 | self._init_weight() 92 | 93 | def forward(self, x): 94 | x1 = self.aspp1(x) 95 | x2 = self.aspp2(x) 96 | x3 = self.aspp3(x) 97 | x4 = self.aspp4(x) 98 | x5 = self.global_avg_pool(x) 99 | x5 = F.interpolate(x5, 100 | size=x4.size()[2:], 101 | mode='bilinear', 102 | align_corners=True) 103 | x = torch.cat((x1, x2, x3, x4, x5), dim=1) 104 | 105 | x = self.conv1(x) 106 | x = self.bn1(x) 107 | x = self.relu(x) 108 | 109 | return self.dropout(x) 110 | 111 | def _init_weight(self): 112 | for m in self.modules(): 113 | if isinstance(m, nn.Conv2d): 114 | torch.nn.init.kaiming_normal_(m.weight) 115 | elif isinstance(m, nn.BatchNorm2d): 116 | m.weight.data.fill_(1) 117 | m.bias.data.zero_() 118 | 119 | 120 | class Mlp(nn.Module): 121 | def __init__(self, 122 | in_features, 123 | hidden_features=None, 124 | out_features=None, 125 | act_layer=nn.ReLU, 126 | drop=0.0): 127 | super().__init__() 128 | out_features = out_features or in_features 129 | hidden_features = hidden_features or in_features 130 | self.fc1 = nn.Linear(in_features, hidden_features) 131 | self.act = act_layer() 132 | self.drop1 = nn.Dropout(drop) 133 | self.fc2 = nn.Linear(hidden_features, out_features) 134 | self.drop2 = nn.Dropout(drop) 135 | 136 | def forward(self, x): 137 | x = self.fc1(x) 138 | x = self.act(x) 139 | x = self.drop1(x) 140 | x = self.fc2(x) 141 | x = self.drop2(x) 142 | return x 143 | 144 | 145 | class SELayer(nn.Module): 146 | def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): 147 | super().__init__() 148 | self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) 149 | self.act1 = act_layer() 150 | self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) 151 | self.gate = gate_layer() 152 | 153 | def forward(self, x, x_se): 154 | x_se = self.conv_reduce(x_se) 155 | x_se = self.act1(x_se) 156 | x_se = self.conv_expand(x_se) 157 | return x * self.gate(x_se) 158 | 159 | 160 | class DepthNet(nn.Module): 161 | def __init__(self, in_channels, mid_channels, context_channels, 162 | depth_channels): 163 | super(DepthNet, self).__init__() 164 | self.reduce_conv = nn.Sequential( 165 | nn.Conv2d(in_channels, 166 | mid_channels, 167 | kernel_size=3, 168 | stride=1, 169 | padding=1), 170 | nn.BatchNorm2d(mid_channels), 171 | nn.ReLU(inplace=True), 172 | ) 173 | self.context_conv = nn.Conv2d(mid_channels, 174 | context_channels, 175 | kernel_size=1, 176 | stride=1, 177 | padding=0) 178 | self.bn = nn.BatchNorm1d(27) 179 | self.depth_mlp = Mlp(27, mid_channels, mid_channels) 180 | self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware 181 | self.context_mlp = Mlp(27, mid_channels, mid_channels) 182 | self.context_se = SELayer(mid_channels) # NOTE: add camera-aware 183 | self.depth_conv = nn.Sequential( 184 | BasicBlock(mid_channels, mid_channels), 185 | BasicBlock(mid_channels, mid_channels), 186 | BasicBlock(mid_channels, mid_channels), 187 | ASPP(mid_channels, mid_channels), 188 | build_conv_layer(cfg=dict( 189 | type='DCN', 190 | in_channels=mid_channels, 191 | out_channels=mid_channels, 192 | kernel_size=3, 193 | padding=1, 194 | groups=4, 195 | im2col_step=128, 196 | )), 197 | nn.Conv2d(mid_channels, 198 | depth_channels, 199 | kernel_size=1, 200 | stride=1, 201 | padding=0), 202 | ) 203 | 204 | def forward(self, x, mats_dict): 205 | intrins = mats_dict['intrin_mats'][:, 0:1, ..., :3, :3] 206 | batch_size = intrins.shape[0] 207 | num_cams = intrins.shape[2] 208 | ida = mats_dict['ida_mats'][:, 0:1, ...] 
209 | sensor2ego = mats_dict['sensor2ego_mats'][:, 0:1, ..., :3, :] 210 | bda = mats_dict['bda_mat'].view(batch_size, 1, 1, 4, 211 | 4).repeat(1, 1, num_cams, 1, 1) 212 | mlp_input = torch.cat( 213 | [ 214 | torch.stack( 215 | [ 216 | intrins[:, 0:1, ..., 0, 0], 217 | intrins[:, 0:1, ..., 1, 1], 218 | intrins[:, 0:1, ..., 0, 2], 219 | intrins[:, 0:1, ..., 1, 2], 220 | ida[:, 0:1, ..., 0, 0], 221 | ida[:, 0:1, ..., 0, 1], 222 | ida[:, 0:1, ..., 0, 3], 223 | ida[:, 0:1, ..., 1, 0], 224 | ida[:, 0:1, ..., 1, 1], 225 | ida[:, 0:1, ..., 1, 3], 226 | bda[:, 0:1, ..., 0, 0], 227 | bda[:, 0:1, ..., 0, 1], 228 | bda[:, 0:1, ..., 1, 0], 229 | bda[:, 0:1, ..., 1, 1], 230 | bda[:, 0:1, ..., 2, 2], 231 | ], 232 | dim=-1, 233 | ), 234 | sensor2ego.view(batch_size, 1, num_cams, -1), 235 | ], 236 | -1, 237 | ) 238 | mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) 239 | x = self.reduce_conv(x) 240 | context_se = self.context_mlp(mlp_input)[..., None, None] 241 | context = self.context_se(x, context_se) 242 | context = self.context_conv(context) 243 | depth_se = self.depth_mlp(mlp_input)[..., None, None] 244 | depth = self.depth_se(x, depth_se) 245 | depth = self.depth_conv(depth) 246 | return torch.cat([depth, context], dim=1) 247 | 248 | 249 | class LSSFPN(nn.Module): 250 | def __init__(self, x_bound, y_bound, z_bound, d_bound, final_dim, 251 | downsample_factor, output_channels, img_backbone_conf, 252 | img_neck_conf, depth_net_conf): 253 | """Modified from `https://github.com/nv-tlabs/lift-splat-shoot`. 254 | 255 | Args: 256 | x_bound (list): Boundaries for x. 257 | y_bound (list): Boundaries for y. 258 | z_bound (list): Boundaries for z. 259 | d_bound (list): Boundaries for d. 260 | final_dim (list): Dimension for input images. 261 | downsample_factor (int): Downsample factor between feature map 262 | and input image. 263 | output_channels (int): Number of channels for the output 264 | feature map. 265 | img_backbone_conf (dict): Config for image backbone. 266 | img_neck_conf (dict): Config for image neck. 267 | depth_net_conf (dict): Config for depth net. 268 | """ 269 | 270 | super(LSSFPN, self).__init__() 271 | self.downsample_factor = downsample_factor 272 | self.d_bound = d_bound 273 | self.final_dim = final_dim 274 | self.output_channels = output_channels 275 | 276 | # TODO hard code here, only for test!!!! 
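        # Slice-attention setup (flagged by the TODO above, defined just below):
        # each z_bounds entry is a [z_min, z_max, z_step] triple; the first six
        # pick out narrow height bands while the last three cover (most of) the
        # full height range.  voxel_size, voxel_coord and voxel_num are therefore
        # registered as per-slab tensors and indexed one slab at a time in
        # _forward_single_sweep.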
277 | self.z_bounds = [[-6, -3, 3],[-3, -2, 2],[-2, -1, 1],[-1, 0, 1],[0, 2, 2],[2, 4, 2],[-6, 4, 10],[-5, 3, 8],[-4, 2, 6]] 278 | self.register_buffer( 279 | f'voxel_size', 280 | torch.Tensor([[row[2] for row in [x_bound, y_bound, zb]] for zb in self.z_bounds]) 281 | ) 282 | self.register_buffer( 283 | f'voxel_coord', 284 | torch.Tensor([[row[0] + row[2] / 2.0 for row in [x_bound, y_bound, zb]] for zb in self.z_bounds])) 285 | self.register_buffer( 286 | f'voxel_num', 287 | torch.LongTensor([[(row[1] - row[0]) / row[2] for row in [x_bound, y_bound, zb]]for zb in self.z_bounds])) 288 | self.voxel_mix_net = voxel_mix_net() 289 | 290 | self.register_buffer('frustum', self.create_frustum()) 291 | self.depth_channels, _, _, _ = self.frustum.shape 292 | 293 | self.img_backbone = build_backbone(img_backbone_conf) 294 | self.img_neck = build_neck(img_neck_conf) 295 | self.depth_net = self._configure_depth_net(depth_net_conf) 296 | 297 | self.img_neck.init_weights() 298 | self.img_backbone.init_weights() 299 | 300 | def _configure_depth_net(self, depth_net_conf): 301 | return DepthNet( 302 | depth_net_conf['in_channels'], 303 | depth_net_conf['mid_channels'], 304 | self.output_channels, 305 | self.depth_channels, 306 | ) 307 | 308 | def create_frustum(self): 309 | """Generate frustum""" 310 | # make grid in image plane 311 | ogfH, ogfW = self.final_dim 312 | fH, fW = ogfH // self.downsample_factor, ogfW // self.downsample_factor 313 | d_coords = torch.arange(*self.d_bound, 314 | dtype=torch.float).view(-1, 1, 315 | 1).expand(-1, fH, fW) 316 | D, _, _ = d_coords.shape 317 | x_coords = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view( 318 | 1, 1, fW).expand(D, fH, fW) 319 | y_coords = torch.linspace(0, ogfH - 1, fH, 320 | dtype=torch.float).view(1, fH, 321 | 1).expand(D, fH, fW) 322 | paddings = torch.ones_like(d_coords) 323 | 324 | # D x H x W x 3 325 | frustum = torch.stack((x_coords, y_coords, d_coords, paddings), -1) 326 | return frustum 327 | 328 | def get_geometry(self, sensor2ego_mat, intrin_mat, ida_mat, bda_mat): 329 | """Transfer points from camera coord to ego coord. 330 | 331 | Args: 332 | rots(Tensor): Rotation matrix from camera to ego. 333 | trans(Tensor): Translation matrix from camera to ego. 334 | intrins(Tensor): Intrinsic matrix. 335 | post_rots_ida(Tensor): Rotation matrix for ida. 336 | post_trans_ida(Tensor): Translation matrix for ida 337 | post_rot_bda(Tensor): Rotation matrix for bda. 338 | 339 | Returns: 340 | Tensors: points ego coord. 341 | """ 342 | batch_size, num_cams, _, _ = sensor2ego_mat.shape 343 | 344 | # undo post-transformation 345 | # B x N x D x H x W x 3 346 | points = self.frustum 347 | ida_mat = ida_mat.view(batch_size, num_cams, 1, 1, 1, 4, 4) 348 | points = ida_mat.inverse().matmul(points.unsqueeze(-1)) 349 | # cam_to_ego 350 | points = torch.cat( 351 | (points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], 352 | points[:, :, :, :, :, 2:]), 5) 353 | 354 | combine = sensor2ego_mat.matmul(torch.inverse(intrin_mat)) 355 | points = combine.view(batch_size, num_cams, 1, 1, 1, 4, 356 | 4).matmul(points) 357 | if bda_mat is not None: 358 | bda_mat = bda_mat.unsqueeze(1).repeat(1, num_cams, 1, 1).view( 359 | batch_size, num_cams, 1, 1, 1, 4, 4) 360 | points = (bda_mat @ points).squeeze(-1) 361 | else: 362 | points = points.squeeze(-1) 363 | return points[..., :3] 364 | 365 | def get_cam_feats(self, imgs): 366 | """Get feature maps from images. 
367 | return img_feats [batch,sweep,cam,d,h,w] 368 | """ 369 | 370 | batch_size, num_sweeps, num_cams, num_channels, imH, imW = imgs.shape 371 | 372 | imgs = imgs.flatten().view(batch_size * num_sweeps * num_cams, 373 | num_channels, imH, imW) 374 | img_feats = self.img_neck(self.img_backbone(imgs))[0] 375 | img_feats = img_feats.reshape(batch_size, num_sweeps, num_cams, 376 | img_feats.shape[1], img_feats.shape[2], 377 | img_feats.shape[3]) 378 | return img_feats 379 | 380 | def _forward_depth_net(self, feat, mats_dict): 381 | return self.depth_net(feat, mats_dict) 382 | 383 | def _forward_voxel_net(self, img_feat_with_depth): 384 | return img_feat_with_depth 385 | 386 | def _forward_single_sweep(self, 387 | sweep_index, 388 | sweep_imgs, 389 | mats_dict, 390 | is_return_depth=False): 391 | """Forward function for single sweep. 392 | 393 | Args: 394 | sweep_index (int): Index of sweeps. 395 | sweep_imgs (Tensor): Input images. 396 | mats_dict (dict): 397 | sensor2ego_mats(Tensor): Transformation matrix from 398 | camera to ego with shape of (B, num_sweeps, 399 | num_cameras, 4, 4). 400 | intrin_mats(Tensor): Intrinsic matrix with shape 401 | of (B, num_sweeps, num_cameras, 4, 4). 402 | ida_mats(Tensor): Transformation matrix for ida with 403 | shape of (B, num_sweeps, num_cameras, 4, 4). 404 | sensor2sensor_mats(Tensor): Transformation matrix 405 | from key frame camera to sweep frame camera with 406 | shape of (B, num_sweeps, num_cameras, 4, 4). 407 | bda_mat(Tensor): Rotation matrix for bda with shape 408 | of (B, 4, 4). 409 | is_return_depth (bool, optional): Whether to return depth. 410 | Default: False. 411 | 412 | Returns: 413 | Tensor: BEV feature map. 414 | """ 415 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 416 | img_width = sweep_imgs.shape 417 | img_feats = self.get_cam_feats(sweep_imgs) 418 | source_features = img_feats[:, 0, ...] 419 | depth_feature = self._forward_depth_net( 420 | source_features.reshape(batch_size * num_cams, 421 | source_features.shape[2], 422 | source_features.shape[3], 423 | source_features.shape[4]), 424 | mats_dict, 425 | ) 426 | depth = depth_feature[:, :self.depth_channels].softmax(1) 427 | img_feat_with_depth = depth.unsqueeze( 428 | 1) * depth_feature[:, self.depth_channels:( 429 | self.depth_channels + self.output_channels)].unsqueeze(2) 430 | 431 | img_feat_with_depth = self._forward_voxel_net(img_feat_with_depth) 432 | 433 | img_feat_with_depth = img_feat_with_depth.reshape( 434 | batch_size, 435 | num_cams, 436 | img_feat_with_depth.shape[1], 437 | img_feat_with_depth.shape[2], 438 | img_feat_with_depth.shape[3], 439 | img_feat_with_depth.shape[4], 440 | ) 441 | geom_xyz = self.get_geometry( 442 | mats_dict['sensor2ego_mats'][:, sweep_index, ...], 443 | mats_dict['intrin_mats'][:, sweep_index, ...], 444 | mats_dict['ida_mats'][:, sweep_index, ...], 445 | mats_dict.get('bda_mat', None), 446 | ) 447 | img_feat_with_depth = img_feat_with_depth.permute(0, 1, 3, 4, 5, 2) 448 | 449 | # TODO hard code here!!!! careful!! 
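        # Multi-slab pooling: the same frustum features are voxel-pooled once per
        # z_bounds slab, the per-slab BEV maps are concatenated along the channel
        # dimension, and voxel_mix_net fuses the stack back into a single BEV
        # feature map.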
450 | feature_maps = [] 451 | for i in range(len(self.z_bounds)): 452 | tmp_geom_xyz = ((geom_xyz - (self.voxel_coord[i] - self.voxel_size[i] / 2.0)) / 453 | self.voxel_size[i]).int() 454 | feature_maps.append(voxel_pooling(tmp_geom_xyz, img_feat_with_depth.contiguous(), 455 | self.voxel_num[i].cuda())) 456 | overlap_feature = torch.cat(feature_maps,dim=1) 457 | feature_map = self.voxel_mix_net(overlap_feature) 458 | feature_map = feature_map.type(torch.cuda.FloatTensor) 459 | if is_return_depth: 460 | return feature_map.contiguous(), depth, source_features 461 | return feature_map.contiguous() 462 | 463 | def forward(self, 464 | sweep_imgs, 465 | mats_dict, 466 | timestamps=None, 467 | is_return_depth=False): 468 | """Forward function. 469 | 470 | Args: 471 | sweep_imgs(Tensor): Input images with shape of (B, num_sweeps, 472 | num_cameras, 3, H, W). 473 | mats_dict(dict): 474 | sensor2ego_mats(Tensor): Transformation matrix from 475 | camera to ego with shape of (B, num_sweeps, 476 | num_cameras, 4, 4). 477 | intrin_mats(Tensor): Intrinsic matrix with shape 478 | of (B, num_sweeps, num_cameras, 4, 4). 479 | ida_mats(Tensor): Transformation matrix for ida with 480 | shape of (B, num_sweeps, num_cameras, 4, 4). 481 | sensor2sensor_mats(Tensor): Transformation matrix 482 | from key frame camera to sweep frame camera with 483 | shape of (B, num_sweeps, num_cameras, 4, 4). 484 | bda_mat(Tensor): Rotation matrix for bda with shape 485 | of (B, 4, 4). 486 | timestamps(Tensor): Timestamp for all images with the shape of(B, 487 | num_sweeps, num_cameras). 488 | 489 | Return: 490 | Tensor: bev feature map. 491 | """ 492 | batch_size, num_sweeps, num_cams, num_channels, img_height, \ 493 | img_width = sweep_imgs.shape 494 | 495 | key_frame_res = self._forward_single_sweep( 496 | 0, 497 | sweep_imgs[:, 0:1, ...], 498 | mats_dict, 499 | is_return_depth=is_return_depth) 500 | if num_sweeps == 1: 501 | return key_frame_res 502 | 503 | key_frame_feature = key_frame_res[ 504 | 0] if is_return_depth else key_frame_res 505 | 506 | ret_feature_list = [key_frame_feature] 507 | for sweep_index in range(1, num_sweeps): 508 | with torch.no_grad(): 509 | feature_map = self._forward_single_sweep( 510 | sweep_index, 511 | sweep_imgs[:, sweep_index:sweep_index + 1, ...], 512 | mats_dict, 513 | is_return_depth=False) 514 | ret_feature_list.append(feature_map) 515 | 516 | if is_return_depth: 517 | return torch.cat(ret_feature_list, 1), key_frame_res[1], key_frame_res[2] 518 | else: 519 | return torch.cat(ret_feature_list, 1) 520 | -------------------------------------------------------------------------------- /dataset/nusc_mv_det_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mmcv 4 | import numpy as np 5 | import torch 6 | from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes 7 | from nuscenes.utils.data_classes import Box 8 | from PIL import Image 9 | from pyquaternion import Quaternion 10 | from torch.utils.data import Dataset 11 | 12 | __all__ = ['NuscMVDetDataset'] 13 | 14 | map_name_from_general_to_detection = { 15 | 'human.pedestrian.adult': 'pedestrian', 16 | 'human.pedestrian.child': 'pedestrian', 17 | 'human.pedestrian.wheelchair': 'ignore', 18 | 'human.pedestrian.stroller': 'ignore', 19 | 'human.pedestrian.personal_mobility': 'ignore', 20 | 'human.pedestrian.police_officer': 'pedestrian', 21 | 'human.pedestrian.construction_worker': 'pedestrian', 22 | 'animal': 'ignore', 23 | 'vehicle.car': 'car', 24 | 
'vehicle.motorcycle': 'motorcycle', 25 | 'vehicle.bicycle': 'bicycle', 26 | 'vehicle.bus.bendy': 'bus', 27 | 'vehicle.bus.rigid': 'bus', 28 | 'vehicle.truck': 'truck', 29 | 'vehicle.construction': 'construction_vehicle', 30 | 'vehicle.emergency.ambulance': 'ignore', 31 | 'vehicle.emergency.police': 'ignore', 32 | 'vehicle.trailer': 'trailer', 33 | 'movable_object.barrier': 'barrier', 34 | 'movable_object.trafficcone': 'traffic_cone', 35 | 'movable_object.pushable_pullable': 'ignore', 36 | 'movable_object.debris': 'ignore', 37 | 'static_object.bicycle_rack': 'ignore', 38 | } 39 | 40 | 41 | def get_rot(h): 42 | return torch.Tensor([ 43 | [np.cos(h), np.sin(h)], 44 | [-np.sin(h), np.cos(h)], 45 | ]) 46 | 47 | 48 | def img_transform(img, resize, resize_dims, crop, flip, rotate): 49 | ida_rot = torch.eye(2) 50 | ida_tran = torch.zeros(2) 51 | # adjust image 52 | img = img.resize(resize_dims) 53 | img = img.crop(crop) 54 | if flip: 55 | img = img.transpose(method=Image.FLIP_LEFT_RIGHT) 56 | img = img.rotate(rotate) 57 | 58 | # post-homography transformation 59 | ida_rot *= resize 60 | ida_tran -= torch.Tensor(crop[:2]) 61 | if flip: 62 | A = torch.Tensor([[-1, 0], [0, 1]]) 63 | b = torch.Tensor([crop[2] - crop[0], 0]) 64 | ida_rot = A.matmul(ida_rot) 65 | ida_tran = A.matmul(ida_tran) + b 66 | A = get_rot(rotate / 180 * np.pi) 67 | b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 68 | b = A.matmul(-b) + b 69 | ida_rot = A.matmul(ida_rot) 70 | ida_tran = A.matmul(ida_tran) + b 71 | ida_mat = ida_rot.new_zeros(4, 4) 72 | ida_mat[3, 3] = 1 73 | ida_mat[2, 2] = 1 74 | ida_mat[:2, :2] = ida_rot 75 | ida_mat[:2, 3] = ida_tran 76 | return img, ida_mat 77 | 78 | 79 | def bev_transform(gt_boxes, rotate_angle, scale_ratio, flip_dx, flip_dy): 80 | rotate_angle = torch.tensor(rotate_angle / 180 * np.pi) 81 | rot_sin = torch.sin(rotate_angle) 82 | rot_cos = torch.cos(rotate_angle) 83 | rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], 84 | [0, 0, 1]]) 85 | scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0], 86 | [0, 0, scale_ratio]]) 87 | flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) 88 | if flip_dx: 89 | flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) 90 | if flip_dy: 91 | flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) 92 | rot_mat = flip_mat @ (scale_mat @ rot_mat) 93 | if gt_boxes.shape[0] > 0: 94 | gt_boxes[:, :3] = (rot_mat @ gt_boxes[:, :3].unsqueeze(-1)).squeeze(-1) 95 | gt_boxes[:, 3:6] *= scale_ratio 96 | gt_boxes[:, 6] += rotate_angle 97 | if flip_dx: 98 | gt_boxes[:, 6] = 2 * torch.asin(torch.tensor(1.0)) - gt_boxes[:, 6] 99 | if flip_dy: 100 | gt_boxes[:, 6] = -gt_boxes[:, 6] 101 | gt_boxes[:, 7:] = ( 102 | rot_mat[:2, :2] @ gt_boxes[:, 7:].unsqueeze(-1)).squeeze(-1) 103 | return gt_boxes, rot_mat 104 | 105 | 106 | def depth_transform(cam_depth, resize, resize_dims, crop, flip, rotate): 107 | """Transform depth based on ida augmentation configuration. 108 | Args: 109 | cam_depth (np array): Nx3, 3: x,y,d. 110 | resize (float): Resize factor. 111 | resize_dims (list): Final dimension. 112 | crop (list): x1, y1, x2, y2 113 | flip (bool): Whether to flip. 114 | rotate (float): Rotation value. 
115 | Returns: 116 | np array: [h/down_ratio, w/down_ratio, d] 117 | """ 118 | 119 | H, W = resize_dims 120 | cam_depth[:, :2] = cam_depth[:, :2] * resize 121 | cam_depth[:, 0] -= crop[0] 122 | cam_depth[:, 1] -= crop[1] 123 | if flip: 124 | cam_depth[:, 0] = resize_dims[1] - cam_depth[:, 0] 125 | 126 | cam_depth[:, 0] -= W / 2.0 127 | cam_depth[:, 1] -= H / 2.0 128 | 129 | h = rotate / 180 * np.pi 130 | rot_matrix = [ 131 | [np.cos(h), np.sin(h)], 132 | [-np.sin(h), np.cos(h)], 133 | ] 134 | cam_depth[:, :2] = np.matmul(rot_matrix, cam_depth[:, :2].T).T 135 | 136 | cam_depth[:, 0] += W / 2.0 137 | cam_depth[:, 1] += H / 2.0 138 | 139 | depth_coords = cam_depth[:, :2].astype(np.int16) 140 | 141 | depth_map = np.zeros(resize_dims) 142 | valid_mask = ((depth_coords[:, 1] < resize_dims[0]) 143 | & (depth_coords[:, 0] < resize_dims[1]) 144 | & (depth_coords[:, 1] >= 0) 145 | & (depth_coords[:, 0] >= 0)) 146 | depth_map[depth_coords[valid_mask, 1], 147 | depth_coords[valid_mask, 0]] = cam_depth[valid_mask, 2] 148 | 149 | return torch.Tensor(depth_map) 150 | 151 | 152 | class NuscMVDetDataset(Dataset): 153 | def __init__(self, 154 | ida_aug_conf, 155 | bda_aug_conf, 156 | classes, 157 | data_root, 158 | info_path, 159 | is_train, 160 | use_cbgs=False, 161 | num_sweeps=1, 162 | img_conf=dict(img_mean=[123.675, 116.28, 103.53], 163 | img_std=[58.395, 57.12, 57.375], 164 | to_rgb=True), 165 | return_depth=False, 166 | sweep_idxes=list(), 167 | key_idxes=list()): 168 | """Dataset used for bevdetection task. 169 | Args: 170 | ida_aug_conf (dict): Config for ida augmentation. 171 | bda_aug_conf (dict): Config for bda augmentation. 172 | classes (list): Class names. 173 | use_cbgs (bool): Whether to use cbgs strategy, 174 | Default: False. 175 | num_sweeps (int): Number of sweeps to be used for each sample. 176 | default: 1. 177 | img_conf (dict): Config for image. 178 | return_depth (bool): Whether to use depth gt. 179 | default: False. 180 | sweep_idxes (list): List of sweep idxes to be used. 181 | default: list(). 182 | key_idxes (list): List of key idxes to be used. 183 | default: list(). 184 | """ 185 | super().__init__() 186 | self.infos = mmcv.load(info_path) 187 | self.is_train = is_train 188 | self.ida_aug_conf = ida_aug_conf 189 | self.bda_aug_conf = bda_aug_conf 190 | self.data_root = data_root 191 | self.classes = classes 192 | self.use_cbgs = use_cbgs 193 | if self.use_cbgs: 194 | self.cat2id = {name: i for i, name in enumerate(self.classes)} 195 | self.sample_indices = self._get_sample_indices() 196 | self.num_sweeps = num_sweeps 197 | self.img_mean = np.array(img_conf['img_mean'], np.float32) 198 | self.img_std = np.array(img_conf['img_std'], np.float32) 199 | self.to_rgb = img_conf['to_rgb'] 200 | self.return_depth = return_depth 201 | assert sum([sweep_idx >= 0 for sweep_idx in sweep_idxes]) \ 202 | == len(sweep_idxes), 'All `sweep_idxes` must greater \ 203 | than or equal to 0.' 204 | 205 | self.sweeps_idx = sweep_idxes 206 | assert sum([key_idx < 0 for key_idx in key_idxes]) == len(key_idxes),\ 207 | 'All `key_idxes` must less than 0.' 208 | self.key_idxes = [0] + key_idxes 209 | 210 | def _get_sample_indices(self): 211 | """Load annotations from ann_file. 212 | Args: 213 | ann_file (str): Path of the annotation file. 214 | Returns: 215 | list[dict]: List of annotations after class sampling. 
216 | """ 217 | class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()} 218 | for idx, info in enumerate(self.infos): 219 | gt_names = set( 220 | [ann_info['category_name'] for ann_info in info['ann_infos']]) 221 | for gt_name in gt_names: 222 | gt_name = map_name_from_general_to_detection[gt_name] 223 | if gt_name not in self.classes: 224 | continue 225 | class_sample_idxs[self.cat2id[gt_name]].append(idx) 226 | duplicated_samples = sum( 227 | [len(v) for _, v in class_sample_idxs.items()]) 228 | class_distribution = { 229 | k: len(v) / duplicated_samples 230 | for k, v in class_sample_idxs.items() 231 | } 232 | 233 | sample_indices = [] 234 | 235 | frac = 1.0 / len(self.classes) 236 | ratios = [frac / v for v in class_distribution.values()] 237 | for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios): 238 | sample_indices += np.random.choice(cls_inds, 239 | int(len(cls_inds) * 240 | ratio)).tolist() 241 | return sample_indices 242 | 243 | def sample_ida_augmentation(self): 244 | """Generate ida augmentation values based on ida_config.""" 245 | H, W = self.ida_aug_conf['H'], self.ida_aug_conf['W'] 246 | fH, fW = self.ida_aug_conf['final_dim'] 247 | if self.is_train: 248 | resize = np.random.uniform(*self.ida_aug_conf['resize_lim']) 249 | resize_dims = (int(W * resize), int(H * resize)) 250 | newW, newH = resize_dims 251 | crop_h = int( 252 | (1 - np.random.uniform(*self.ida_aug_conf['bot_pct_lim'])) * 253 | newH) - fH 254 | crop_w = int(np.random.uniform(0, max(0, newW - fW))) 255 | crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) 256 | flip = False 257 | if self.ida_aug_conf['rand_flip'] and np.random.choice([0, 1]): 258 | flip = True 259 | rotate_ida = np.random.uniform(*self.ida_aug_conf['rot_lim']) 260 | else: 261 | resize = max(fH / H, fW / W) 262 | resize_dims = (int(W * resize), int(H * resize)) 263 | newW, newH = resize_dims 264 | crop_h = int( 265 | (1 - np.mean(self.ida_aug_conf['bot_pct_lim'])) * newH) - fH 266 | crop_w = int(max(0, newW - fW) / 2) 267 | crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) 268 | flip = False 269 | rotate_ida = 0 270 | return resize, resize_dims, crop, flip, rotate_ida 271 | 272 | def sample_bda_augmentation(self): 273 | """Generate bda augmentation values based on bda_config.""" 274 | if self.is_train: 275 | rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim']) 276 | scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim']) 277 | flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio'] 278 | flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio'] 279 | else: 280 | rotate_bda = 0 281 | scale_bda = 1.0 282 | flip_dx = False 283 | flip_dy = False 284 | return rotate_bda, scale_bda, flip_dx, flip_dy 285 | 286 | def get_image(self, cam_infos, cams): 287 | """Given data and cam_names, return image data needed. 288 | Args: 289 | sweeps_data (list): Raw data used to generate the data we needed. 290 | cams (list): Camera names. 291 | Returns: 292 | Tensor: Image data after processing. 293 | Tensor: Transformation matrix from camera to ego. 294 | Tensor: Intrinsic matrix. 295 | Tensor: Transformation matrix for ida. 296 | Tensor: Transformation matrix from key 297 | frame camera to sweep frame camera. 298 | Tensor: timestamps. 299 | dict: meta infos needed for evaluation. 
300 | """ 301 | assert len(cam_infos) > 0 302 | sweep_imgs = list() 303 | sweep_sensor2ego_mats = list() 304 | sweep_intrin_mats = list() 305 | sweep_ida_mats = list() 306 | sweep_sensor2sensor_mats = list() 307 | sweep_timestamps = list() 308 | gt_depth = list() 309 | for cam in cams: 310 | imgs = list() 311 | sensor2ego_mats = list() 312 | intrin_mats = list() 313 | ida_mats = list() 314 | sensor2sensor_mats = list() 315 | timestamps = list() 316 | key_info = cam_infos[0] 317 | resize, resize_dims, crop, flip, \ 318 | rotate_ida = self.sample_ida_augmentation( 319 | ) 320 | for sweep_idx, cam_info in enumerate(cam_infos): 321 | 322 | img = Image.open( 323 | os.path.join(self.data_root, cam_info[cam]['filename'])) 324 | # img = Image.fromarray(img) 325 | w, x, y, z = cam_info[cam]['calibrated_sensor']['rotation'] 326 | # sweep sensor to sweep ego 327 | sweepsensor2sweepego_rot = torch.Tensor( 328 | Quaternion(w, x, y, z).rotation_matrix) 329 | sweepsensor2sweepego_tran = torch.Tensor( 330 | cam_info[cam]['calibrated_sensor']['translation']) 331 | sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros( 332 | (4, 4)) 333 | sweepsensor2sweepego[3, 3] = 1 334 | sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot 335 | sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran 336 | # sweep ego to global 337 | w, x, y, z = cam_info[cam]['ego_pose']['rotation'] 338 | sweepego2global_rot = torch.Tensor( 339 | Quaternion(w, x, y, z).rotation_matrix) 340 | sweepego2global_tran = torch.Tensor( 341 | cam_info[cam]['ego_pose']['translation']) 342 | sweepego2global = sweepego2global_rot.new_zeros((4, 4)) 343 | sweepego2global[3, 3] = 1 344 | sweepego2global[:3, :3] = sweepego2global_rot 345 | sweepego2global[:3, -1] = sweepego2global_tran 346 | 347 | # global sensor to cur ego 348 | w, x, y, z = key_info[cam]['ego_pose']['rotation'] 349 | keyego2global_rot = torch.Tensor( 350 | Quaternion(w, x, y, z).rotation_matrix) 351 | keyego2global_tran = torch.Tensor( 352 | key_info[cam]['ego_pose']['translation']) 353 | keyego2global = keyego2global_rot.new_zeros((4, 4)) 354 | keyego2global[3, 3] = 1 355 | keyego2global[:3, :3] = keyego2global_rot 356 | keyego2global[:3, -1] = keyego2global_tran 357 | global2keyego = keyego2global.inverse() 358 | 359 | # cur ego to sensor 360 | w, x, y, z = key_info[cam]['calibrated_sensor']['rotation'] 361 | keysensor2keyego_rot = torch.Tensor( 362 | Quaternion(w, x, y, z).rotation_matrix) 363 | keysensor2keyego_tran = torch.Tensor( 364 | key_info[cam]['calibrated_sensor']['translation']) 365 | keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4)) 366 | keysensor2keyego[3, 3] = 1 367 | keysensor2keyego[:3, :3] = keysensor2keyego_rot 368 | keysensor2keyego[:3, -1] = keysensor2keyego_tran 369 | keyego2keysensor = keysensor2keyego.inverse() 370 | keysensor2sweepsensor = ( 371 | keyego2keysensor @ global2keyego @ sweepego2global 372 | @ sweepsensor2sweepego).inverse() 373 | sweepsensor2keyego = global2keyego @ sweepego2global @\ 374 | sweepsensor2sweepego 375 | sensor2ego_mats.append(sweepsensor2keyego) 376 | sensor2sensor_mats.append(keysensor2sweepsensor) 377 | intrin_mat = torch.zeros((4, 4)) 378 | intrin_mat[3, 3] = 1 379 | intrin_mat[:3, :3] = torch.Tensor( 380 | cam_info[cam]['calibrated_sensor']['camera_intrinsic']) 381 | if self.return_depth and sweep_idx == 0: 382 | file_name = os.path.split(cam_info[cam]['filename'])[-1] 383 | point_depth = np.fromfile(os.path.join( 384 | self.data_root, 'depth_gt', f'{file_name}.bin'), 385 | dtype=np.float32, 386 | 
count=-1).reshape(-1, 3) 387 | point_depth_augmented = depth_transform( 388 | point_depth, resize, self.ida_aug_conf['final_dim'], 389 | crop, flip, rotate_ida) 390 | gt_depth.append(point_depth_augmented) 391 | img, ida_mat = img_transform( 392 | img, 393 | resize=resize, 394 | resize_dims=resize_dims, 395 | crop=crop, 396 | flip=flip, 397 | rotate=rotate_ida, 398 | ) 399 | ida_mats.append(ida_mat) 400 | img = mmcv.imnormalize(np.array(img), self.img_mean, 401 | self.img_std, self.to_rgb) 402 | img = torch.from_numpy(img).permute(2, 0, 1) 403 | imgs.append(img) 404 | intrin_mats.append(intrin_mat) 405 | timestamps.append(cam_info[cam]['timestamp']) 406 | sweep_imgs.append(torch.stack(imgs)) 407 | sweep_sensor2ego_mats.append(torch.stack(sensor2ego_mats)) 408 | sweep_intrin_mats.append(torch.stack(intrin_mats)) 409 | sweep_ida_mats.append(torch.stack(ida_mats)) 410 | sweep_sensor2sensor_mats.append(torch.stack(sensor2sensor_mats)) 411 | sweep_timestamps.append(torch.tensor(timestamps)) 412 | # Get mean pose of all cams. 413 | ego2global_rotation = np.mean( 414 | [key_info[cam]['ego_pose']['rotation'] for cam in cams], 0) 415 | ego2global_translation = np.mean( 416 | [key_info[cam]['ego_pose']['translation'] for cam in cams], 0) 417 | img_metas = dict( 418 | box_type_3d=LiDARInstance3DBoxes, 419 | ego2global_translation=ego2global_translation, 420 | ego2global_rotation=ego2global_rotation, 421 | ) 422 | 423 | ret_list = [ 424 | torch.stack(sweep_imgs).permute(1, 0, 2, 3, 4), 425 | torch.stack(sweep_sensor2ego_mats).permute(1, 0, 2, 3), 426 | torch.stack(sweep_intrin_mats).permute(1, 0, 2, 3), 427 | torch.stack(sweep_ida_mats).permute(1, 0, 2, 3), 428 | torch.stack(sweep_sensor2sensor_mats).permute(1, 0, 2, 3), 429 | torch.stack(sweep_timestamps).permute(1, 0), 430 | img_metas, 431 | ] 432 | if self.return_depth: 433 | ret_list.append(torch.stack(gt_depth)) 434 | return ret_list 435 | 436 | def get_gt(self, info, cams): 437 | """Generate gt labels from info. 438 | Args: 439 | info(dict): Infos needed to generate gt labels. 440 | cams(list): Camera names. 441 | Returns: 442 | Tensor: GT bboxes. 443 | Tensor: GT labels. 444 | """ 445 | ego2global_rotation = np.mean( 446 | [info['cam_infos'][cam]['ego_pose']['rotation'] for cam in cams], 447 | 0) 448 | ego2global_translation = np.mean([ 449 | info['cam_infos'][cam]['ego_pose']['translation'] for cam in cams 450 | ], 0) 451 | trans = -np.array(ego2global_translation) 452 | rot = Quaternion(ego2global_rotation).inverse 453 | gt_boxes = list() 454 | gt_labels = list() 455 | for ann_info in info['ann_infos']: 456 | # Use ego coordinate. 
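            # Annotations are stored in global coordinates; the mean ego pose of
            # all cameras (computed above) is inverted to move each box into the
            # key-frame ego frame, and boxes outside self.classes or without any
            # lidar/radar points are skipped.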
457 | if (map_name_from_general_to_detection[ann_info['category_name']] 458 | not in self.classes 459 | or ann_info['num_lidar_pts'] + ann_info['num_radar_pts'] <= 460 | 0): 461 | continue 462 | box = Box( 463 | ann_info['translation'], 464 | ann_info['size'], 465 | Quaternion(ann_info['rotation']), 466 | velocity=ann_info['velocity'], 467 | ) 468 | box.translate(trans) 469 | box.rotate(rot) 470 | box_xyz = np.array(box.center) 471 | box_dxdydz = np.array(box.wlh)[[1, 0, 2]] 472 | box_yaw = np.array([box.orientation.yaw_pitch_roll[0]]) 473 | box_velo = np.array(box.velocity[:2]) 474 | gt_box = np.concatenate([box_xyz, box_dxdydz, box_yaw, box_velo]) 475 | gt_boxes.append(gt_box) 476 | gt_labels.append( 477 | self.classes.index(map_name_from_general_to_detection[ 478 | ann_info['category_name']])) 479 | return torch.Tensor(gt_boxes), torch.tensor(gt_labels) 480 | 481 | def choose_cams(self): 482 | """Choose cameras randomly. 483 | Returns: 484 | list: Cameras to be used. 485 | """ 486 | if self.is_train and self.ida_aug_conf['Ncams'] < len( 487 | self.ida_aug_conf['cams']): 488 | cams = np.random.choice(self.ida_aug_conf['cams'], 489 | self.ida_aug_conf['Ncams'], 490 | replace=False) 491 | else: 492 | cams = self.ida_aug_conf['cams'] 493 | return cams 494 | 495 | def __getitem__(self, idx): 496 | if self.use_cbgs: 497 | idx = self.sample_indices[idx] 498 | cam_infos = list() 499 | # TODO: Check if it still works when number of cameras is reduced. 500 | cams = self.choose_cams() 501 | for key_idx in self.key_idxes: 502 | cur_idx = key_idx + idx 503 | # Handle scenarios when current idx doesn't have previous key 504 | # frame or previous key frame is from another scene. 505 | if cur_idx < 0: 506 | cur_idx = idx 507 | elif self.infos[cur_idx]['scene_token'] != self.infos[idx][ 508 | 'scene_token']: 509 | cur_idx = idx 510 | info = self.infos[cur_idx] 511 | cam_infos.append(info['cam_infos']) 512 | for sweep_idx in self.sweeps_idx: 513 | if len(info['sweeps']) == 0: 514 | cam_infos.append(info['cam_infos']) 515 | else: 516 | # Handle scenarios when current sweep doesn't have all 517 | # cam keys. 518 | for i in range(min(len(info['sweeps']) - 1, sweep_idx), -1, 519 | -1): 520 | if sum([cam in info['sweeps'][i] 521 | for cam in cams]) == len(cams): 522 | cam_infos.append(info['sweeps'][i]) 523 | break 524 | image_data_list = self.get_image(cam_infos, cams) 525 | ret_list = list() 526 | ( 527 | sweep_imgs, 528 | sweep_sensor2ego_mats, 529 | sweep_intrins, 530 | sweep_ida_mats, 531 | sweep_sensor2sensor_mats, 532 | sweep_timestamps, 533 | img_metas, 534 | ) = image_data_list[:7] 535 | img_metas['token'] = self.infos[idx]['sample_token'] 536 | if self.is_train: 537 | gt_boxes, gt_labels = self.get_gt(self.infos[idx], cams) 538 | # Temporary solution for test. 
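            # Both branches currently load GT boxes (see the comment above), so
            # gt_boxes / gt_labels are also returned for validation samples and
            # are passed through bev_transform below with the identity eval-time
            # bda parameters.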
539 | else: 540 | gt_boxes, gt_labels = self.get_gt(self.infos[idx], cams) 541 | 542 | rotate_bda, scale_bda, flip_dx, flip_dy = self.sample_bda_augmentation( 543 | ) 544 | bda_mat = sweep_imgs.new_zeros(4, 4) 545 | bda_mat[3, 3] = 1 546 | gt_boxes, bda_rot = bev_transform(gt_boxes, rotate_bda, scale_bda, 547 | flip_dx, flip_dy) 548 | bda_mat[:3, :3] = bda_rot 549 | ret_list = [ 550 | sweep_imgs, 551 | sweep_sensor2ego_mats, 552 | sweep_intrins, 553 | sweep_ida_mats, 554 | sweep_sensor2sensor_mats, 555 | bda_mat, 556 | sweep_timestamps, 557 | img_metas, 558 | gt_boxes, 559 | gt_labels, 560 | self.infos[idx], 561 | ] 562 | if self.return_depth: 563 | ret_list.append(image_data_list[7]) 564 | return ret_list 565 | 566 | def __str__(self): 567 | return f"""NuscData: {len(self)} samples. Split: \ 568 | {"train" if self.is_train else "val"}. 569 | Augmentation Conf: {self.ida_aug_conf}""" 570 | 571 | def __len__(self): 572 | if self.use_cbgs: 573 | return len(self.sample_indices) 574 | else: 575 | return len(self.infos) 576 | 577 | 578 | def collate_fn(data, is_return_depth=False): 579 | imgs_batch = list() 580 | sensor2ego_mats_batch = list() 581 | intrin_mats_batch = list() 582 | ida_mats_batch = list() 583 | sensor2sensor_mats_batch = list() 584 | bda_mat_batch = list() 585 | timestamps_batch = list() 586 | gt_boxes_batch = list() 587 | gt_labels_batch = list() 588 | img_metas_batch = list() 589 | depth_labels_batch = list() 590 | infos_batch = list() 591 | for iter_data in data: 592 | ( 593 | sweep_imgs, 594 | sweep_sensor2ego_mats, 595 | sweep_intrins, 596 | sweep_ida_mats, 597 | sweep_sensor2sensor_mats, 598 | bda_mat, 599 | sweep_timestamps, 600 | img_metas, 601 | gt_boxes, 602 | gt_labels, 603 | infos, 604 | ) = iter_data[:11] 605 | if is_return_depth: 606 | gt_depth = iter_data[11] 607 | depth_labels_batch.append(gt_depth) 608 | imgs_batch.append(sweep_imgs) 609 | sensor2ego_mats_batch.append(sweep_sensor2ego_mats) 610 | intrin_mats_batch.append(sweep_intrins) 611 | ida_mats_batch.append(sweep_ida_mats) 612 | sensor2sensor_mats_batch.append(sweep_sensor2sensor_mats) 613 | bda_mat_batch.append(bda_mat) 614 | timestamps_batch.append(sweep_timestamps) 615 | img_metas_batch.append(img_metas) 616 | gt_boxes_batch.append(gt_boxes) 617 | gt_labels_batch.append(gt_labels) 618 | infos_batch.append(infos) 619 | mats_dict = dict() 620 | mats_dict['sensor2ego_mats'] = torch.stack(sensor2ego_mats_batch) 621 | mats_dict['intrin_mats'] = torch.stack(intrin_mats_batch) 622 | mats_dict['ida_mats'] = torch.stack(ida_mats_batch) 623 | mats_dict['sensor2sensor_mats'] = torch.stack(sensor2sensor_mats_batch) 624 | mats_dict['bda_mat'] = torch.stack(bda_mat_batch) 625 | ret_list = [ 626 | torch.stack(imgs_batch), 627 | mats_dict, 628 | torch.stack(timestamps_batch), 629 | img_metas_batch, 630 | gt_boxes_batch, 631 | gt_labels_batch, 632 | infos_batch 633 | ] 634 | if is_return_depth: 635 | ret_list.append(torch.stack(depth_labels_batch)) 636 | return ret_list --------------------------------------------------------------------------------
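# A minimal usage sketch (not part of the repository): wiring NuscMVDetDataset and
# collate_fn into a PyTorch DataLoader.  The augmentation configs, paths and class
# list below are illustrative placeholders that would normally come from one of the
# config files under exps/.
from functools import partial

from torch.utils.data import DataLoader

from dataset.nusc_mv_det_dataset import NuscMVDetDataset, collate_fn

# Illustrative augmentation configs; the keys are the ones read by
# sample_ida_augmentation / sample_bda_augmentation, the values are placeholders.
ida_aug_conf = dict(H=900, W=1600, final_dim=(256, 704),
                    resize_lim=(0.386, 0.55), bot_pct_lim=(0.0, 0.0),
                    rand_flip=True, rot_lim=(-5.4, 5.4),
                    cams=['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
                          'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
                    Ncams=6)
bda_aug_conf = dict(rot_lim=(-22.5, 22.5), scale_lim=(0.95, 1.05),
                    flip_dx_ratio=0.5, flip_dy_ratio=0.5)
CLASSES = ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
           'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone']

train_loader = DataLoader(
    NuscMVDetDataset(ida_aug_conf=ida_aug_conf,
                     bda_aug_conf=bda_aug_conf,
                     classes=CLASSES,
                     data_root='data/nuScenes',                               # placeholder path
                     info_path='data/nuScenes/nuscenes_infos_train.pkl',      # placeholder path
                     is_train=True,
                     use_cbgs=True,
                     return_depth=True),
    batch_size=4,
    shuffle=True,
    num_workers=4,
    drop_last=True,
    # collate_fn takes (data, is_return_depth); bind the flag so the DataLoader
    # can call it with the batch only.
    collate_fn=partial(collate_fn, is_return_depth=True),
)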