├── models ├── csrc │ ├── __init__.py │ ├── setup.py │ ├── msmv_sampling │ │ └── msmv_sampling.h │ └── wrapper.py ├── core │ ├── __init__.py │ └── hook │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── ema.py │ │ └── ema2.py ├── backbones │ ├── eva02 │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── drop.py │ │ ├── main.py │ │ ├── backbone.py │ │ ├── blocks.py │ │ ├── wrappers.py │ │ └── batch_norm.py │ ├── __init__.py │ └── second_3d.py ├── bbox │ ├── match_costs │ │ ├── __init__.py │ │ └── match_cost.py │ ├── coders │ │ ├── __init__.py │ │ └── nms_free_coder.py │ ├── __init__.py │ ├── assigners │ │ ├── __init__.py │ │ └── hungarian_assigner_3d.py │ └── utils.py ├── lidar_encoder │ ├── __init__.py │ └── sparse_encoder4x.py ├── neck │ └── __init__.py ├── __init__.py ├── opusv1 │ └── opus_sampling.py └── opusv1_fusion │ └── opus_sampling.py ├── demos ├── opusv1.png ├── opusv2.png ├── result.png └── teaser.png ├── dist_train.sh ├── dist_val.sh ├── loaders ├── __init__.py ├── pipelines │ └── __init__.py ├── utils.py ├── builder.py ├── nuscenes_dataset.py ├── ego_pose_dataset.py ├── nuscenes_occupancy_dataset.py ├── ray_metrics.py └── old_metrics.py ├── .gitignore ├── scripts ├── gen_fusion_pretrain_model.py ├── timing.py └── gen_sweep_info.py ├── LICENSE ├── lib └── dvr │ └── dvr.cpp ├── val.py ├── train.py ├── configs ├── opusv1_nusc-occ3d │ ├── opusv1-s_r50_704x256_8f_nusc-occ3d_100e.py │ ├── opusv1-l_r50_704x256_8f_nusc-occ3d_100e.py │ ├── opusv1-m_r50_704x256_8f_nusc-occ3d_100e.py │ └── opusv1-t_r50_704x256_8f_nusc-occ3d_100e.py └── opusv1-fusion_nusc-occ3d │ ├── opusv1-fusion-l_r50_704x256_8f_nusc-occ3d_100e.py │ └── opusv1-fusion-m_r50_704x256_8f_nusc-occ3d_100e.py └── utils.py /models/csrc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .hook import * -------------------------------------------------------------------------------- /models/backbones/eva02/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import EVA02 -------------------------------------------------------------------------------- /demos/opusv1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/opusv1.png -------------------------------------------------------------------------------- /demos/opusv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/opusv2.png -------------------------------------------------------------------------------- /demos/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/result.png -------------------------------------------------------------------------------- /demos/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/teaser.png -------------------------------------------------------------------------------- /models/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from .match_cost import BBox3DL1Cost 2 | 3 | __all__ = ['BBox3DL1Cost'] 
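As an illustration of how the cost exported above is consumed, here is a minimal usage sketch (not part of the repository; the weight value and tensor shapes are assumptions for demonstration) of building BBox3DL1Cost through mmdet's MATCH_COST registry, the same way HungarianAssigner3D builds its reg_cost:

import torch
from mmdet.core.bbox.match_costs import build_match_cost

import models.bbox.match_costs  # importing the package runs the @MATCH_COST.register_module() decorators

# Assumed weight value; the actual training configs may use a different one.
reg_cost = build_match_cost(dict(type='BBox3DL1Cost', weight=0.25))

# Assumed shapes: 10-dim normalized box encodings (see normalize_bbox in models/bbox/utils.py).
bbox_pred = torch.rand(900, 10)
gt_bboxes = torch.rand(15, 10)
cost = reg_cost(bbox_pred, gt_bboxes)  # pairwise weighted L1 cost, shape [900, 15]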
-------------------------------------------------------------------------------- /models/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_free_coder import NMSFreeCoder 2 | 3 | __all__ = ['NMSFreeCoder'] 4 | -------------------------------------------------------------------------------- /models/lidar_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_encoder4x import SparseEncoder8x 2 | 3 | __all__ = ['SparseEncoder8x'] -------------------------------------------------------------------------------- /models/bbox/__init__.py: -------------------------------------------------------------------------------- 1 | from .assigners import __all__ 2 | from .coders import __all__ 3 | from .match_costs import __all__ -------------------------------------------------------------------------------- /models/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] 4 | -------------------------------------------------------------------------------- /models/core/hook/__init__.py: -------------------------------------------------------------------------------- 1 | from .ema import MEGVIIEMAHook 2 | from .utils import is_parallel 3 | 4 | 5 | __all__ = ['MEGVIIEMAHook','is_parallel'] -------------------------------------------------------------------------------- /dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | GPUS=$1 4 | CONFIG=$2 5 | python3 -m torch.distributed.run --nproc_per_node $GPUS train.py --config $CONFIG ${@:3} 6 | -------------------------------------------------------------------------------- /models/neck/__init__.py: -------------------------------------------------------------------------------- 1 | from .second_fpn_3d import SECONDFPN_3d,SECONDFPN_3dv2,SECONDFPN_3dv3 2 | 3 | __all__ = ['SECONDFPN_3d','SECONDFPN_3dv2','SECONDFPN_3dv3'] -------------------------------------------------------------------------------- /models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .vovnet import VoVNet 2 | from .eva02 import EVA02 3 | from .second_3d import SECOND_3d 4 | 5 | __all__ = ['VoVNet', 'EVA02','SECOND_3d'] 6 | -------------------------------------------------------------------------------- /dist_val.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | GPUS=$1 4 | CONFIG=$2 5 | WEIGHT=$3 6 | python3 -m torch.distributed.run --nproc_per_node $GPUS val.py --config $CONFIG --weights $WEIGHT 7 | -------------------------------------------------------------------------------- /loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipelines import __all__ 2 | from .nuscenes_dataset import CustomNuScenesDataset 3 | from .nuscenes_occ3d_dataset import NuScenesOcc3DDataset 4 | from .nuscenes_occupancy_dataset import NuScenesOccupancyDataset 5 | -------------------------------------------------------------------------------- /loaders/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .loading import LoadMultiViewImageFromMultiSweeps 2 | from .transforms import PadMultiViewImage, 
NormalizeMultiviewImage, PhotoMetricDistortionMultiViewImage 3 | 4 | __all__ = [ 5 | 'LoadMultiViewImageFromMultiSweeps', 'PadMultiViewImage', 'NormalizeMultiviewImage', 6 | 'PhotoMetricDistortionMultiViewImage' 7 | ] -------------------------------------------------------------------------------- /models/core/hook/utils.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | __all__ = ['is_parallel'] 4 | 5 | 6 | def is_parallel(model): 7 | """check if model is in parallel mode.""" 8 | parallel_type = ( 9 | nn.parallel.DataParallel, 10 | nn.parallel.DistributedDataParallel, 11 | ) 12 | return isinstance(model, parallel_type) 13 | 14 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import __all__ 2 | from .bbox import __all__ 3 | from .lidar_encoder import __all__ 4 | from .neck import __all__ 5 | 6 | from .opusv1.opus import OPUSV1 7 | from .opusv1.opus_head import OPUSV1Head 8 | from .opusv1.opus_transformer import OPUSV1Transformer 9 | 10 | from .opusv1_fusion.opus import OPUSV1Fusion 11 | from .opusv1_fusion.opus_head import OPUSV1FusionHead 12 | from .opusv1_fusion.opus_transformer import OPUSV1FusionTransformer -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS generated files 2 | .DS_Store 3 | .DS_Store? 4 | ._* 5 | .Spotlight-V100 6 | .Trashes 7 | ehthumbs.db 8 | Thumbs.db 9 | 10 | # Compiled source 11 | build 12 | debug 13 | Debug 14 | release 15 | Release 16 | x64 17 | *.so 18 | *.whl 19 | 20 | # VS project files 21 | *.sln 22 | *.vcxproj 23 | *.vcxproj.filters 24 | *.vcxproj.user 25 | *.rc 26 | .vs 27 | 28 | # Byte-compiled / optimized / DLL files 29 | *__pycache__* 30 | *.py[cod] 31 | *$py.class 32 | 33 | # Distribution / packaging 34 | .Python 35 | build 36 | develop-eggs 37 | dist 38 | downloads 39 | 40 | # IDE 41 | .idea 42 | .vscode 43 | pyrightconfig.json 44 | 45 | # Custom 46 | data 47 | outputs 48 | prediction 49 | submission 50 | checkpoints 51 | pretrain 52 | visualizations -------------------------------------------------------------------------------- /loaders/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compose_ego2img(ego2global_t, 5 | ego2global_r, 6 | sensor2global_t, 7 | sensor2global_r, 8 | cam_intrinsic): 9 | R = np.linalg.inv(sensor2global_r) @ ego2global_r 10 | # (ego2global_t - sensor2global_t) @ _inv(sensor2global_r).T 11 | # = (ego2global_t - sensor2global_t) @ sensor2global_r 12 | T = (ego2global_t - sensor2global_t) @ sensor2global_r 13 | 14 | ego2cam_rt = np.eye(4) 15 | ego2cam_rt[:3, :3] = R 16 | ego2cam_rt[:3, 3] = T.T 17 | 18 | viewpad = np.eye(4) 19 | viewpad[:cam_intrinsic.shape[0], :cam_intrinsic.shape[1]] = cam_intrinsic 20 | ego2img = (viewpad @ ego2cam_rt).astype(np.float32) 21 | 22 | return ego2img -------------------------------------------------------------------------------- /scripts/gen_fusion_pretrain_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import re 3 | 4 | lidar_ckpt = torch.load('pretrain/dal-tiny-map66.9-nds71.1.pth') 5 | img_ckpt = torch.load('pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth') 6 | 7 | 
lidar_dict = lidar_ckpt['state_dict'] 8 | img_dict = img_ckpt['state_dict'] 9 | 10 | lidar_prefix_keys_list=['pts_backbone', 'pts_middle_encoder', 'pts_neck'] 11 | for key in list(lidar_dict.keys()): 12 | flag=False 13 | for prefix in lidar_prefix_keys_list: 14 | if key.startswith(prefix): 15 | flag=True 16 | break 17 | if not flag: 18 | del lidar_dict[key] 19 | 20 | img_prefix_keys_list=['backbone'] 21 | for prefix in img_prefix_keys_list: 22 | for key in img_dict: 23 | if key.startswith(prefix): 24 | new_key=re.sub('backbone', 'img_backbone', key) 25 | lidar_dict[new_key] = img_dict[key] 26 | 27 | torch.save({'state_dict': lidar_dict}, 'pretrain/fusion_pretrain_model.pth') 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Multimedia Computing Group, Nanjing University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /models/csrc/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | def get_ext_modules(): 6 | return [ 7 | CUDAExtension( 8 | name='_msmv_sampling_cuda', 9 | sources=[ 10 | 'msmv_sampling/msmv_sampling.cpp', 11 | 'msmv_sampling/msmv_sampling_forward.cu', 12 | 'msmv_sampling/msmv_sampling_backward.cu' 13 | ], 14 | include_dirs=['msmv_sampling'], 15 | extra_compile_args=dict( 16 | nvcc=[ 17 | "-gencode=arch=compute_60,code=sm_60", 18 | "-gencode=arch=compute_61,code=sm_61", 19 | "-gencode=arch=compute_70,code=sm_70", 20 | "-gencode=arch=compute_75,code=sm_75", 21 | "-gencode=arch=compute_80,code=sm_80", 22 | "-gencode=arch=compute_86,code=sm_86", 23 | "-gencode=arch=compute_86,code=compute_86", 24 | ] 25 | ) 26 | ) 27 | ] 28 | 29 | 30 | setup( 31 | name='csrc', 32 | ext_modules=get_ext_modules(), 33 | cmdclass={'build_ext': BuildExtension} 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /models/backbones/eva02/fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import fvcore.nn.weight_init as weight_init 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | 7 | def _assert_strides_are_log2_contiguous(strides): 8 | """ 9 | Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". 10 | """ 11 | for i, stride in enumerate(strides[1:], 1): 12 | assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format( 13 | stride, strides[i - 1] 14 | ) 15 | 16 | 17 | class LastLevelMaxPool(nn.Module): 18 | """ 19 | This module is used in the original FPN to generate a downsampled 20 | P6 feature from P5. 21 | """ 22 | 23 | def __init__(self): 24 | super().__init__() 25 | self.num_levels = 1 26 | self.in_feature = "p5" 27 | 28 | def forward(self, x): 29 | return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] 30 | 31 | 32 | class LastLevelP6P7(nn.Module): 33 | """ 34 | This module is used in RetinaNet to generate extra layers, P6 and P7 from 35 | C5 feature. 36 | """ 37 | 38 | def __init__(self, in_channels, out_channels, in_feature="res5"): 39 | super().__init__() 40 | self.num_levels = 2 41 | self.in_feature = in_feature 42 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 43 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 44 | for module in [self.p6, self.p7]: 45 | weight_init.c2_xavier_fill(module) 46 | 47 | def forward(self, c5): 48 | p6 = self.p6(c5) 49 | p7 = self.p7(F.relu(p6)) 50 | return [p6, p7] 51 | -------------------------------------------------------------------------------- /models/backbones/eva02/drop.py: -------------------------------------------------------------------------------- 1 | # https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py 2 | import torch.nn as nn 3 | 4 | 5 | def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): 6 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 7 | 8 | This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, 9 | the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... 10 | See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for 11 | changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 12 | 'survival rate' as the argument. 13 | 14 | """ 15 | if drop_prob == 0. or not training: 16 | return x 17 | keep_prob = 1 - drop_prob 18 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 19 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 20 | if keep_prob > 0.0 and scale_by_keep: 21 | random_tensor.div_(keep_prob) 22 | return x * random_tensor 23 | 24 | 25 | class DropPath(nn.Module): 26 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
27 | """ 28 | def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): 29 | super(DropPath, self).__init__() 30 | self.drop_prob = drop_prob 31 | self.scale_by_keep = scale_by_keep 32 | 33 | def forward(self, x): 34 | return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) 35 | 36 | def extra_repr(self): 37 | return f'drop_prob={round(self.drop_prob,3):0.3f}' -------------------------------------------------------------------------------- /models/csrc/msmv_sampling/msmv_sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | at::Tensor ms_deform_attn_cuda_c2345_forward( 6 | const at::Tensor& feat_c2, // [B, N, H, W, C] 7 | const at::Tensor& feat_c3, // [B, N, H, W, C] 8 | const at::Tensor& feat_c4, // [B, N, H, W, C] 9 | const at::Tensor& feat_c5, // [B, N, H, W, C] 10 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 11 | const at::Tensor& attn_weight // [B, Q, P, 4] 12 | ); 13 | 14 | std::vector ms_deform_attn_cuda_c2345_backward( 15 | const at::Tensor& feat_c2, // [B, N, H, W, C] 16 | const at::Tensor& feat_c3, // [B, N, H, W, C] 17 | const at::Tensor& feat_c4, // [B, N, H, W, C] 18 | const at::Tensor& feat_c5, // [B, N, H, W, C] 19 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 20 | const at::Tensor& attn_weight, // [B, Q, P, 4] 21 | const at::Tensor& grad_output 22 | ); 23 | 24 | at::Tensor ms_deform_attn_cuda_c23456_forward( 25 | const at::Tensor& feat_c2, // [B, N, H, W, C] 26 | const at::Tensor& feat_c3, // [B, N, H, W, C] 27 | const at::Tensor& feat_c4, // [B, N, H, W, C] 28 | const at::Tensor& feat_c5, // [B, N, H, W, C] 29 | const at::Tensor& feat_c6, // [B, N, H, W, C] 30 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 31 | const at::Tensor& attn_weight // [B, Q, P, 4] 32 | ); 33 | 34 | std::vector ms_deform_attn_cuda_c23456_backward( 35 | const at::Tensor& grad_output, 36 | const at::Tensor& feat_c2, // [B, N, H, W, C] 37 | const at::Tensor& feat_c3, // [B, N, H, W, C] 38 | const at::Tensor& feat_c4, // [B, N, H, W, C] 39 | const at::Tensor& feat_c5, // [B, N, H, W, C] 40 | const at::Tensor& feat_c6, // [B, N, H, W, C] 41 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 42 | const at::Tensor& attn_weight // [B, Q, P, 4] 43 | ); -------------------------------------------------------------------------------- /loaders/builder.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from mmcv.parallel import collate 3 | from mmcv.runner import get_dist_info 4 | from torch.utils.data import DataLoader 5 | from mmdet.datasets.builder import worker_init_fn 6 | from mmdet.datasets.samplers import DistributedGroupSampler, DistributedSampler, GroupSampler 7 | 8 | 9 | def build_dataloader(dataset, 10 | samples_per_gpu, 11 | workers_per_gpu, 12 | num_gpus=1, 13 | dist=True, 14 | shuffle=True, 15 | seed=None, 16 | **kwargs): 17 | 18 | rank, world_size = get_dist_info() 19 | if dist: 20 | # DistributedGroupSampler will definitely shuffle the data to satisfy 21 | # that images on each GPU are in the same group 22 | if shuffle: 23 | sampler = DistributedGroupSampler( 24 | dataset, samples_per_gpu, world_size, rank, seed=seed) 25 | else: 26 | sampler = DistributedSampler( 27 | dataset, world_size, rank, shuffle=False, seed=seed) 28 | batch_size = samples_per_gpu 29 | num_workers = workers_per_gpu 30 | else: 31 | sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None 32 | batch_size = num_gpus * 
samples_per_gpu 33 | num_workers = num_gpus * workers_per_gpu 34 | 35 | init_fn = partial( 36 | worker_init_fn, num_workers=num_workers, rank=rank, 37 | seed=seed) if seed is not None else None 38 | 39 | data_loader = DataLoader( 40 | dataset, 41 | batch_size=batch_size, 42 | sampler=sampler, 43 | num_workers=num_workers, 44 | collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), 45 | pin_memory=False, 46 | worker_init_fn=init_fn, 47 | **kwargs) 48 | 49 | return data_loader 50 | -------------------------------------------------------------------------------- /models/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | 4 | 5 | @MATCH_COST.register_module() 6 | class BBox3DL1Cost(object): 7 | """BBox3DL1Cost. 8 | Args: 9 | weight (int | float, optional): loss_weight 10 | """ 11 | 12 | def __init__(self, weight=1.0): 13 | self.weight = weight 14 | 15 | def __call__(self, bbox_pred, gt_bboxes): 16 | """ 17 | Args: 18 | bbox_pred (Tensor): Predicted 3D boxes in the normalized encoding 19 | (cx, cy, w, l, cz, h, sin(rot), cos(rot)[, vx, vy]) produced by 20 | normalize_bbox. Shape [num_query, 8] or [num_query, 10]. 21 | gt_bboxes (Tensor): Ground truth boxes in the same normalized 22 | encoding. Shape [num_gt, 8] or [num_gt, 10]. 23 | Returns: 24 | torch.Tensor: bbox_cost value with weight 25 | """ 26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 27 | return bbox_cost * self.weight 28 | 29 | 30 | @MATCH_COST.register_module() 31 | class BBoxBEVL1Cost(object): 32 | def __init__(self, weight, pc_range): 33 | self.weight = weight 34 | self.pc_range = pc_range 35 | 36 | def __call__(self, bboxes, gt_bboxes): 37 | pc_start = bboxes.new(self.pc_range[0:2]) 38 | pc_range = bboxes.new(self.pc_range[3:5]) - bboxes.new(self.pc_range[0:2]) 39 | # normalize the box center to [0, 1] 40 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range 41 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range 42 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1) 43 | return reg_cost * self.weight 44 | 45 | 46 | @MATCH_COST.register_module() 47 | class IoU3DCost(object): 48 | def __init__(self, weight): 49 | self.weight = weight 50 | 51 | def __call__(self, iou): 52 | iou_cost = - iou 53 | return iou_cost * self.weight 54 | -------------------------------------------------------------------------------- /lib/dvr/dvr.cpp: -------------------------------------------------------------------------------- 1 | // Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting 2 | // Modified by Haisong Liu 3 | 4 | #include <torch/extension.h> 5 | #include <vector> 6 | #include <string> 7 | 8 | /* 9 | * CUDA forward declarations 10 | */ 11 | 12 | std::vector<torch::Tensor> render_forward_cuda(torch::Tensor sigma, 13 | torch::Tensor origin, 14 | torch::Tensor points, 15 | torch::Tensor tindex, 16 | const std::vector<int> grid, 17 | std::string phase_name); 18 | 19 | std::vector<torch::Tensor> 20 | render_cuda(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points, 21 | torch::Tensor tindex, std::string loss_name); 22 | 23 | torch::Tensor init_cuda(torch::Tensor points, torch::Tensor tindex, 24 | const std::vector<int> grid); 25 | 26 | 27 | /* 28 | * C++ interface 29 | */ 30 | 31 | #define CHECK_CUDA(x) \ 32 | TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 33 | #define CHECK_CONTIGUOUS(x) \ 34 | TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 35 | #define CHECK_INPUT(x) \ 36 | CHECK_CUDA(x); \ 37 | CHECK_CONTIGUOUS(x) 38 | 
39 | std::vector<torch::Tensor> 40 | render_forward(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points, 41 | torch::Tensor tindex, const std::vector<int> grid, 42 | std::string phase_name) { 43 | CHECK_INPUT(sigma); 44 | CHECK_INPUT(origin); 45 | CHECK_INPUT(points); 46 | CHECK_INPUT(tindex); 47 | return render_forward_cuda(sigma, origin, points, tindex, grid, phase_name); 48 | } 49 | 50 | 51 | std::vector<torch::Tensor> render(torch::Tensor sigma, torch::Tensor origin, 52 | torch::Tensor points, torch::Tensor tindex, 53 | std::string loss_name) { 54 | CHECK_INPUT(sigma); 55 | CHECK_INPUT(origin); 56 | CHECK_INPUT(points); 57 | CHECK_INPUT(tindex); 58 | return render_cuda(sigma, origin, points, tindex, loss_name); 59 | } 60 | 61 | torch::Tensor init(torch::Tensor points, torch::Tensor tindex, 62 | const std::vector<int> grid) { 63 | CHECK_INPUT(points); 64 | CHECK_INPUT(tindex); 65 | return init_cuda(points, tindex, grid); 66 | } 67 | 68 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 69 | m.def("init", &init, "Initialize"); 70 | m.def("render", &render, "Render"); 71 | m.def("render_forward", &render_forward, "Render (forward pass only)"); 72 | } 73 | -------------------------------------------------------------------------------- /models/backbones/eva02/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.nn as nn 4 | from mmcv.runner.checkpoint import load_state_dict 5 | from mmdet.models.builder import BACKBONES 6 | from .vit import ViT, SimpleFeaturePyramid, partial 7 | from .fpn import LastLevelMaxPool 8 | 9 | 10 | @BACKBONES.register_module() 11 | class EVA02(nn.Module): 12 | def __init__( 13 | self, 14 | # args for ViT 15 | img_size=1024, 16 | real_img_size=(256, 704), 17 | patch_size=16, 18 | in_chans=3, 19 | embed_dim=768, 20 | depth=12, 21 | num_heads=12, 22 | mlp_ratio=4*2/3, 23 | qkv_bias=True, 24 | drop_path_rate=0.0, 25 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 26 | use_abs_pos=True, 27 | pt_hw_seq_len=16, 28 | intp_freq=True, 29 | window_size=0, 30 | window_block_indexes=(), 31 | residual_block_indexes=(), 32 | use_act_checkpoint=False, 33 | pretrain_img_size=224, 34 | pretrain_use_cls_token=True, 35 | out_feature="last_feat", 36 | xattn=False, 37 | frozen_blocks=-1, 38 | # args for simple FPN 39 | fpn_in_feature="last_feat", 40 | fpn_out_channels=256, 41 | fpn_scale_factors=(4.0, 2.0, 1.0, 0.5), 42 | fpn_top_block=False, 43 | fpn_norm="LN", 44 | fpn_square_pad=0, 45 | pretrained=None 46 | ): 47 | super().__init__() 48 | 49 | self.backbone = SimpleFeaturePyramid( 50 | ViT( 51 | img_size=img_size, 52 | real_img_size=real_img_size, 53 | patch_size=patch_size, 54 | in_chans=in_chans, 55 | embed_dim=embed_dim, 56 | depth=depth, 57 | num_heads=num_heads, 58 | mlp_ratio=mlp_ratio, 59 | qkv_bias=qkv_bias, 60 | drop_path_rate=drop_path_rate, 61 | norm_layer=norm_layer, 62 | use_abs_pos=use_abs_pos, 63 | pt_hw_seq_len=pt_hw_seq_len, 64 | intp_freq=intp_freq, 65 | window_size=window_size, 66 | window_block_indexes=window_block_indexes, 67 | residual_block_indexes=residual_block_indexes, 68 | use_act_checkpoint=use_act_checkpoint, 69 | pretrain_img_size=pretrain_img_size, 70 | pretrain_use_cls_token=pretrain_use_cls_token, 71 | out_feature=out_feature, 72 | xattn=xattn, 73 | frozen_blocks=frozen_blocks, 74 | ), 75 | in_feature=fpn_in_feature, 76 | out_channels=fpn_out_channels, 77 | scale_factors=fpn_scale_factors, 78 | top_block=LastLevelMaxPool() if fpn_top_block else None, 79 | norm=fpn_norm, 80 | 
square_pad=fpn_square_pad, 81 | ) 82 | self.init_weights(pretrained) 83 | 84 | def init_weights(self, pretrained=None): 85 | if pretrained is None: 86 | return 87 | logging.info('Loading pretrained weights from %s' % pretrained) 88 | state_dict = torch.load(pretrained)['model'] 89 | load_state_dict(self, state_dict, strict=False) 90 | 91 | def forward(self, x): 92 | outs = self.backbone(x) 93 | return list(outs.values()) 94 | -------------------------------------------------------------------------------- /models/bbox/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize_bbox(bboxes): 5 | cx = bboxes[..., 0:1] 6 | cy = bboxes[..., 1:2] 7 | cz = bboxes[..., 2:3] 8 | w = bboxes[..., 3:4].log() 9 | l = bboxes[..., 4:5].log() 10 | h = bboxes[..., 5:6].log() 11 | rot = bboxes[..., 6:7] 12 | 13 | if bboxes.size(-1) > 7: 14 | vx = bboxes[..., 7:8] 15 | vy = bboxes[..., 8:9] 16 | out = torch.cat([cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy], dim=-1) 17 | else: 18 | out = torch.cat([cx, cy, w, l, cz, h, rot.sin(), rot.cos()], dim=-1) 19 | 20 | return out 21 | 22 | 23 | def denormalize_bbox(normalized_bboxes): 24 | rot_sin = normalized_bboxes[..., 6:7] 25 | rot_cos = normalized_bboxes[..., 7:8] 26 | rot = torch.atan2(rot_sin, rot_cos) 27 | 28 | cx = normalized_bboxes[..., 0:1] 29 | cy = normalized_bboxes[..., 1:2] 30 | cz = normalized_bboxes[..., 4:5] 31 | 32 | w = normalized_bboxes[..., 2:3].exp() 33 | l = normalized_bboxes[..., 3:4].exp() 34 | h = normalized_bboxes[..., 5:6].exp() 35 | 36 | if normalized_bboxes.size(-1) > 8: 37 | vx = normalized_bboxes[..., 8:9] 38 | vy = normalized_bboxes[..., 9:10] 39 | out = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 40 | else: 41 | out = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 42 | 43 | return out 44 | 45 | 46 | def encode_bbox(bboxes, pc_range=None): 47 | xyz = bboxes[..., 0:3].clone() 48 | wlh = bboxes[..., 3:6].log() 49 | rot = bboxes[..., 6:7] 50 | 51 | if pc_range is not None: 52 | xyz[..., 0] = (xyz[..., 0] - pc_range[0]) / (pc_range[3] - pc_range[0]) 53 | xyz[..., 1] = (xyz[..., 1] - pc_range[1]) / (pc_range[4] - pc_range[1]) 54 | xyz[..., 2] = (xyz[..., 2] - pc_range[2]) / (pc_range[5] - pc_range[2]) 55 | 56 | if bboxes.shape[-1] > 7: 57 | vel = bboxes[..., 7:9].clone() 58 | return torch.cat([xyz, wlh, rot.sin(), rot.cos(), vel], dim=-1) 59 | else: 60 | return torch.cat([xyz, wlh, rot.sin(), rot.cos()], dim=-1) 61 | 62 | 63 | def decode_bbox(bboxes, pc_range=None): 64 | xyz = bboxes[..., 0:3].clone() 65 | wlh = bboxes[..., 3:6].exp() 66 | rot = torch.atan2(bboxes[..., 6:7], bboxes[..., 7:8]) 67 | 68 | if pc_range is not None: 69 | xyz[..., 0] = xyz[..., 0] * (pc_range[3] - pc_range[0]) + pc_range[0] 70 | xyz[..., 1] = xyz[..., 1] * (pc_range[4] - pc_range[1]) + pc_range[1] 71 | xyz[..., 2] = xyz[..., 2] * (pc_range[5] - pc_range[2]) + pc_range[2] 72 | 73 | if bboxes.shape[-1] > 8: 74 | vel = bboxes[..., 8:10].clone() 75 | return torch.cat([xyz, wlh, rot, vel], dim=-1) 76 | else: 77 | return torch.cat([xyz, wlh, rot], dim=-1) 78 | 79 | 80 | def encode_points(points, pc_range=None): 81 | points = points.clone() 82 | points[..., 0] = (points[..., 0] - pc_range[0]) / (pc_range[3] - pc_range[0]) 83 | points[..., 1] = (points[..., 1] - pc_range[1]) / (pc_range[4] - pc_range[1]) 84 | points[..., 2] = (points[..., 2] - pc_range[2]) / (pc_range[5] - pc_range[2]) 85 | return points 86 | 87 | 88 | def decode_points(points, pc_range=None): 89 | points = 
points.clone() 90 | points[..., 0] = points[..., 0] * (pc_range[3] - pc_range[0]) + pc_range[0] 91 | points[..., 1] = points[..., 1] * (pc_range[4] - pc_range[1]) + pc_range[1] 92 | points[..., 2] = points[..., 2] * (pc_range[5] - pc_range[2]) + pc_range[2] 93 | return points 94 | -------------------------------------------------------------------------------- /models/backbones/eva02/backbone.py: -------------------------------------------------------------------------------- 1 | # https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | 8 | @dataclass 9 | class ShapeSpec: 10 | """ 11 | A simple structure that contains basic shape specification about a tensor. 12 | It is often used as the auxiliary inputs/outputs of models, 13 | to complement the lack of shape inference ability among pytorch modules. 14 | """ 15 | 16 | channels: Optional[int] = None 17 | height: Optional[int] = None 18 | width: Optional[int] = None 19 | stride: Optional[int] = None 20 | 21 | 22 | # Copyright (c) Facebook, Inc. and its affiliates. 23 | from abc import ABCMeta, abstractmethod 24 | from typing import Dict 25 | import torch.nn as nn 26 | 27 | 28 | __all__ = ["Backbone"] 29 | 30 | 31 | class Backbone(nn.Module, metaclass=ABCMeta): 32 | """ 33 | Abstract base class for network backbones. 34 | """ 35 | 36 | def __init__(self): 37 | """ 38 | The `__init__` method of any subclass can specify its own set of arguments. 39 | """ 40 | super().__init__() 41 | 42 | @abstractmethod 43 | def forward(self): 44 | """ 45 | Subclasses must override this method, but adhere to the same return type. 46 | 47 | Returns: 48 | dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor 49 | """ 50 | pass 51 | 52 | @property 53 | def size_divisibility(self) -> int: 54 | """ 55 | Some backbones require the input height and width to be divisible by a 56 | specific integer. This is typically true for encoder / decoder type networks 57 | with lateral connection (e.g., FPN) for which feature maps need to match 58 | dimension in the "bottom up" and "top down" paths. Set to 0 if no specific 59 | input size divisibility is required. 60 | """ 61 | return 0 62 | 63 | @property 64 | def padding_constraints(self) -> Dict[str, int]: 65 | """ 66 | This property is a generalization of size_divisibility. Some backbones and training 67 | recipes require specific padding constraints, such as enforcing divisibility by a specific 68 | integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter 69 | in :paper:vitdet). `padding_constraints` contains these optional items like: 70 | { 71 | "size_divisibility": int, 72 | "square_size": int, 73 | # Future options are possible 74 | } 75 | `size_divisibility` will read from here if presented and `square_size` indicates the 76 | square padding size if `square_size` > 0. 77 | 78 | TODO: use type of Dict[str, int] to avoid torchscipt issues. The type of padding_constraints 79 | could be generalized as TypedDict (Python 3.8+) to support more types in the future. 
80 | """ 81 | return {} 82 | 83 | def output_shape(self): 84 | """ 85 | Returns: 86 | dict[str->ShapeSpec] 87 | """ 88 | # this is a backward-compatible default 89 | return { 90 | name: ShapeSpec( 91 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 92 | ) 93 | for name in self._out_features 94 | } -------------------------------------------------------------------------------- /models/backbones/eva02/blocks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import fvcore.nn.weight_init as weight_init 5 | from torch import nn 6 | 7 | from .batch_norm import FrozenBatchNorm2d, get_norm 8 | from .wrappers import Conv2d 9 | 10 | 11 | """ 12 | CNN building blocks. 13 | """ 14 | 15 | 16 | class CNNBlockBase(nn.Module): 17 | """ 18 | A CNN block is assumed to have input channels, output channels and a stride. 19 | The input and output of `forward()` method must be NCHW tensors. 20 | The method can perform arbitrary computation but must match the given 21 | channels and stride specification. 22 | 23 | Attribute: 24 | in_channels (int): 25 | out_channels (int): 26 | stride (int): 27 | """ 28 | 29 | def __init__(self, in_channels, out_channels, stride): 30 | """ 31 | The `__init__` method of any subclass should also contain these arguments. 32 | 33 | Args: 34 | in_channels (int): 35 | out_channels (int): 36 | stride (int): 37 | """ 38 | super().__init__() 39 | self.in_channels = in_channels 40 | self.out_channels = out_channels 41 | self.stride = stride 42 | 43 | def freeze(self): 44 | """ 45 | Make this block not trainable. 46 | This method sets all parameters to `requires_grad=False`, 47 | and convert all BatchNorm layers to FrozenBatchNorm 48 | 49 | Returns: 50 | the block itself 51 | """ 52 | for p in self.parameters(): 53 | p.requires_grad = False 54 | FrozenBatchNorm2d.convert_frozen_batchnorm(self) 55 | return self 56 | 57 | 58 | class DepthwiseSeparableConv2d(nn.Module): 59 | """ 60 | A kxk depthwise convolution + a 1x1 convolution. 61 | 62 | In :paper:`xception`, norm & activation are applied on the second conv. 63 | :paper:`mobilenet` uses norm & activation on both convs. 64 | """ 65 | 66 | def __init__( 67 | self, 68 | in_channels, 69 | out_channels, 70 | kernel_size=3, 71 | padding=1, 72 | dilation=1, 73 | *, 74 | norm1=None, 75 | activation1=None, 76 | norm2=None, 77 | activation2=None, 78 | ): 79 | """ 80 | Args: 81 | norm1, norm2 (str or callable): normalization for the two conv layers. 82 | activation1, activation2 (callable(Tensor) -> Tensor): activation 83 | function for the two conv layers. 
84 | """ 85 | super().__init__() 86 | self.depthwise = Conv2d( 87 | in_channels, 88 | in_channels, 89 | kernel_size=kernel_size, 90 | padding=padding, 91 | dilation=dilation, 92 | groups=in_channels, 93 | bias=not norm1, 94 | norm=get_norm(norm1, in_channels), 95 | activation=activation1, 96 | ) 97 | self.pointwise = Conv2d( 98 | in_channels, 99 | out_channels, 100 | kernel_size=1, 101 | bias=not norm2, 102 | norm=get_norm(norm2, out_channels), 103 | activation=activation2, 104 | ) 105 | 106 | # default initialization 107 | weight_init.c2_msra_fill(self.depthwise) 108 | weight_init.c2_msra_fill(self.pointwise) 109 | 110 | def forward(self, x): 111 | return self.pointwise(self.depthwise(x)) 112 | -------------------------------------------------------------------------------- /loaders/nuscenes_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from mmdet.datasets import DATASETS 4 | from mmdet3d.datasets import NuScenesDataset 5 | from pyquaternion import Quaternion 6 | 7 | 8 | @DATASETS.register_module() 9 | class CustomNuScenesDataset(NuScenesDataset): 10 | 11 | def collect_sweeps(self, index, into_past=60, into_future=60): 12 | all_sweeps_prev = [] 13 | curr_index = index 14 | while len(all_sweeps_prev) < into_past: 15 | curr_sweeps = self.data_infos[curr_index]['sweeps'] 16 | if len(curr_sweeps) == 0: 17 | break 18 | all_sweeps_prev.extend(curr_sweeps) 19 | all_sweeps_prev.append(self.data_infos[curr_index - 1]['cams']) 20 | curr_index = curr_index - 1 21 | 22 | all_sweeps_next = [] 23 | curr_index = index + 1 24 | while len(all_sweeps_next) < into_future: 25 | if curr_index >= len(self.data_infos): 26 | break 27 | curr_sweeps = self.data_infos[curr_index]['sweeps'] 28 | all_sweeps_next.extend(curr_sweeps[::-1]) 29 | all_sweeps_next.append(self.data_infos[curr_index]['cams']) 30 | curr_index = curr_index + 1 31 | 32 | return all_sweeps_prev, all_sweeps_next 33 | 34 | def get_data_info(self, index): 35 | info = self.data_infos[index] 36 | sweeps_prev, sweeps_next = self.collect_sweeps(index) 37 | 38 | ego2global_translation = info['ego2global_translation'] 39 | ego2global_rotation = info['ego2global_rotation'] 40 | lidar2ego_translation = info['lidar2ego_translation'] 41 | lidar2ego_rotation = info['lidar2ego_rotation'] 42 | ego2global_rotation = Quaternion(ego2global_rotation).rotation_matrix 43 | lidar2ego_rotation = Quaternion(lidar2ego_rotation).rotation_matrix 44 | 45 | input_dict = dict( 46 | sample_idx=info['token'], 47 | sweeps={'prev': sweeps_prev, 'next': sweeps_next}, 48 | timestamp=info['timestamp'] / 1e6, 49 | ego2global_translation=ego2global_translation, 50 | ego2global_rotation=ego2global_rotation, 51 | lidar2ego_translation=lidar2ego_translation, 52 | lidar2ego_rotation=lidar2ego_rotation, 53 | ) 54 | 55 | if self.modality['use_camera']: 56 | img_paths = [] 57 | img_timestamps = [] 58 | lidar2img_rts = [] 59 | 60 | for _, cam_info in info['cams'].items(): 61 | img_paths.append(os.path.relpath(cam_info['data_path'])) 62 | img_timestamps.append(cam_info['timestamp'] / 1e6) 63 | 64 | # obtain lidar to image transformation matrix 65 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 66 | lidar2cam_t = cam_info['sensor2lidar_translation'] @ lidar2cam_r.T 67 | 68 | lidar2cam_rt = np.eye(4) 69 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 70 | lidar2cam_rt[3, :3] = -lidar2cam_t 71 | 72 | intrinsic = cam_info['cam_intrinsic'] 73 | viewpad = np.eye(4) 74 | viewpad[:intrinsic.shape[0], 
:intrinsic.shape[1]] = intrinsic 75 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 76 | lidar2img_rts.append(lidar2img_rt) 77 | 78 | input_dict.update(dict( 79 | img_filename=img_paths, 80 | img_timestamp=img_timestamps, 81 | lidar2img=lidar2img_rts, 82 | )) 83 | 84 | if not self.test_mode: 85 | annos = self.get_ann_info(index) 86 | input_dict['ann_info'] = annos 87 | 88 | return input_dict 89 | -------------------------------------------------------------------------------- /loaders/ego_pose_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from pyquaternion import Quaternion 4 | from torch.utils.data import Dataset 5 | np.set_printoptions(precision=3, suppress=True) 6 | 7 | 8 | def trans_matrix(T, R): 9 | tm = np.eye(4) 10 | tm[:3, :3] = R.rotation_matrix 11 | tm[:3, 3] = T 12 | return tm 13 | 14 | 15 | class EgoPoseDataset(Dataset): 16 | def __init__(self, data_infos): 17 | super(EgoPoseDataset, self).__init__() 18 | 19 | self.data_infos = data_infos 20 | self.scene_frames = {} 21 | 22 | for info in data_infos: 23 | scene_token = self.get_scene_token(info) 24 | if scene_token not in self.scene_frames: 25 | self.scene_frames[scene_token] = [] 26 | self.scene_frames[scene_token].append(info) 27 | 28 | def __len__(self): 29 | return len(self.data_infos) 30 | 31 | def get_scene_token(self, info): 32 | if 'scene_token' in info: 33 | scene_name = info['scene_token'] 34 | elif 'scene_name' in info: 35 | scene_name = info['scene_name'] 36 | else: 37 | scene_name = info['occ_path'].split('occupancy/')[-1].split('/')[0] 38 | return scene_name 39 | 40 | def get_ego_from_lidar(self, info): 41 | ego_from_lidar = trans_matrix( 42 | np.array(info['lidar2ego_translation']), 43 | Quaternion(info['lidar2ego_rotation'])) 44 | return ego_from_lidar 45 | 46 | def get_global_pose(self, info, inverse=False): 47 | global_from_ego = trans_matrix( 48 | np.array(info['ego2global_translation']), 49 | Quaternion(info['ego2global_rotation'])) 50 | ego_from_lidar = trans_matrix( 51 | np.array(info['lidar2ego_translation']), 52 | Quaternion(info['lidar2ego_rotation'])) 53 | pose = global_from_ego.dot(ego_from_lidar) 54 | if inverse: 55 | pose = np.linalg.inv(pose) 56 | return pose 57 | 58 | def __getitem__(self, idx): 59 | info = self.data_infos[idx] 60 | 61 | ref_sample_token = info['token'] 62 | ref_lidar_from_global = self.get_global_pose(info, inverse=True) 63 | ref_ego_from_lidar = self.get_ego_from_lidar(info) 64 | 65 | scene_token = self.get_scene_token(info) 66 | scene_frame = self.scene_frames[scene_token] 67 | ref_index = scene_frame.index(info) 68 | 69 | # NOTE: getting output frames 70 | output_origin_list = [] 71 | for curr_index in range(len(scene_frame)): 72 | # if this exists a valid target 73 | if curr_index == ref_index: 74 | origin_tf = np.array([0.0, 0.0, 0.0], dtype=np.float32) 75 | else: 76 | # transform from the current lidar frame to global and then to the reference lidar frame 77 | global_from_curr = self.get_global_pose(scene_frame[curr_index], inverse=False) 78 | ref_from_curr = ref_lidar_from_global.dot(global_from_curr) 79 | origin_tf = np.array(ref_from_curr[:3, 3], dtype=np.float32) 80 | 81 | origin_tf_pad = np.ones([4]) 82 | origin_tf_pad[:3] = origin_tf # pad to [4] 83 | origin_tf = np.dot(ref_ego_from_lidar[:3], origin_tf_pad.T).T # [3] 84 | 85 | # origin 86 | if np.abs(origin_tf[0]) < 39 and np.abs(origin_tf[1]) < 39: 87 | output_origin_list.append(origin_tf) 88 | 89 | # select 8 origins 90 | if 
len(output_origin_list) > 8: 91 | select_idx = np.round(np.linspace(0, len(output_origin_list) - 1, 8)).astype(np.int64) 92 | output_origin_list = [output_origin_list[i] for i in select_idx] 93 | 94 | output_origin_tensor = torch.from_numpy(np.stack(output_origin_list)) # [T, 3] 95 | 96 | return (ref_sample_token, output_origin_tensor) 97 | -------------------------------------------------------------------------------- /models/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 4 | from mmdet.core.bbox.assigners import AssignResult 5 | from mmdet.core.bbox.assigners import BaseAssigner 6 | from mmdet.core.bbox.match_costs import build_match_cost 7 | from ..utils import normalize_bbox 8 | 9 | try: 10 | from scipy.optimize import linear_sum_assignment 11 | except ImportError: 12 | linear_sum_assignment = None 13 | 14 | 15 | @BBOX_ASSIGNERS.register_module() 16 | class HungarianAssigner3D(BaseAssigner): 17 | def __init__(self, 18 | cls_cost=dict(type='ClassificationCost', weight=1.), 19 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 20 | iou_cost=dict(type='IoUCost', weight=0.0), 21 | pc_range=None): 22 | self.cls_cost = build_match_cost(cls_cost) 23 | self.reg_cost = build_match_cost(reg_cost) 24 | self.iou_cost = build_match_cost(iou_cost) 25 | self.pc_range = pc_range 26 | 27 | def assign(self, 28 | bbox_pred, 29 | cls_pred, 30 | gt_bboxes, 31 | gt_labels, 32 | gt_bboxes_ignore=None, 33 | code_weights=None, 34 | with_velo=False): 35 | assert gt_bboxes_ignore is None, \ 36 | 'Only case when gt_bboxes_ignore is None is supported.' 37 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 38 | 39 | # 1. assign -1 by default 40 | assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), 41 | -1, 42 | dtype=torch.long) 43 | assigned_labels = bbox_pred.new_full((num_bboxes, ), 44 | -1, 45 | dtype=torch.long) 46 | if num_gts == 0 or num_bboxes == 0: 47 | # No ground truth or boxes, return empty assignment 48 | if num_gts == 0: 49 | # No ground truth, assign all to background 50 | assigned_gt_inds[:] = 0 51 | return AssignResult( 52 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 53 | 54 | # 2. compute the weighted costs 55 | # classification and bboxcost. 56 | cls_cost = self.cls_cost(cls_pred, gt_labels) 57 | # regression L1 cost 58 | normalized_gt_bboxes = normalize_bbox(gt_bboxes) 59 | 60 | if code_weights is not None: 61 | bbox_pred = bbox_pred * code_weights 62 | normalized_gt_bboxes = normalized_gt_bboxes * code_weights 63 | 64 | if with_velo: 65 | reg_cost = self.reg_cost(bbox_pred, normalized_gt_bboxes) 66 | else: 67 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 68 | 69 | # weighted sum of above two costs 70 | cost = cls_cost + reg_cost 71 | 72 | # 3. do Hungarian matching on CPU using linear_sum_assignment 73 | cost = cost.detach().cpu() 74 | cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0) 75 | 76 | if linear_sum_assignment is None: 77 | raise ImportError('Please run "pip install scipy" ' 78 | 'to install scipy first.') 79 | 80 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 81 | matched_row_inds = torch.from_numpy(matched_row_inds).to( 82 | bbox_pred.device) 83 | matched_col_inds = torch.from_numpy(matched_col_inds).to( 84 | bbox_pred.device) 85 | 86 | # 4. 
assign backgrounds and foregrounds 87 | # assign all indices to backgrounds first 88 | assigned_gt_inds[:] = 0 89 | # assign foregrounds based on matching results 90 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 91 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 92 | return AssignResult( 93 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 94 | -------------------------------------------------------------------------------- /models/csrc/wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | try: 5 | from ._msmv_sampling_cuda import _ms_deform_attn_cuda_c2345_forward, _ms_deform_attn_cuda_c2345_backward 6 | from ._msmv_sampling_cuda import _ms_deform_attn_cuda_c23456_forward, _ms_deform_attn_cuda_c23456_backward 7 | MSMV_CUDA = True 8 | except ImportError as e: 9 | print('Warning: failed to load one or more CUDA extensions, performance may be hurt.') 10 | print('Error message:', e) 11 | MSMV_CUDA = False 12 | 13 | 14 | def msmv_sampling_pytorch(mlvl_feats, sampling_locations, scale_weights): 15 | """ 16 | value: [B, N, H1W1 + H2W2..., C] 17 | sampling_locations: [B, Q, P, 3] 18 | scale_weights: [B, Q, P, 4] 19 | """ 20 | assert scale_weights.shape[-1] == len(mlvl_feats) 21 | 22 | B, C, _, _, _ = mlvl_feats[0].shape 23 | _, Q, P, _ = sampling_locations.shape 24 | 25 | sampling_locations = sampling_locations * 2 - 1 26 | sampling_locations = sampling_locations[:, :, :, None, :] # [B, Q, P, 1, 3] 27 | 28 | final = torch.zeros([B, C, Q, P], device=mlvl_feats[0].device) 29 | 30 | for lvl, feat in enumerate(mlvl_feats): 31 | out = F.grid_sample( 32 | feat, sampling_locations, mode='bilinear', 33 | padding_mode='zeros', align_corners=True, 34 | )[..., 0] # [B, C, Q, P] 35 | out = out * scale_weights[..., lvl].reshape(B, 1, Q, P) 36 | final += out 37 | 38 | return final.permute(0, 2, 1, 3) 39 | 40 | 41 | class MSMVSamplingC2345(torch.autograd.Function): 42 | @staticmethod 43 | def forward(ctx, feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights): 44 | ctx.save_for_backward(feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights) 45 | 46 | assert callable(_ms_deform_attn_cuda_c2345_forward) 47 | return _ms_deform_attn_cuda_c2345_forward( 48 | feat_c2, feat_c3, feat_c4, feat_c5, 49 | sampling_locations, scale_weights) 50 | 51 | @staticmethod 52 | def backward(ctx, grad_output): 53 | feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights = ctx.saved_tensors 54 | 55 | assert callable(_ms_deform_attn_cuda_c2345_backward) 56 | grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight = _ms_deform_attn_cuda_c2345_backward(grad_output.contiguous(), 57 | feat_c2, feat_c3, feat_c4, feat_c5, 58 | sampling_locations, scale_weights 59 | ) 60 | 61 | return grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight 62 | 63 | 64 | class MSMVSamplingC23456(torch.autograd.Function): 65 | @staticmethod 66 | def forward(ctx, feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights): 67 | ctx.save_for_backward(feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights) 68 | 69 | assert callable(_ms_deform_attn_cuda_c23456_forward) 70 | return _ms_deform_attn_cuda_c23456_forward( 71 | feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, 72 | sampling_locations, scale_weights) 73 | 74 | @staticmethod 75 | def backward(ctx, 
grad_output): 76 | feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights = ctx.saved_tensors 77 | 78 | assert callable(_ms_deform_attn_cuda_c23456_backward) 79 | grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight = _ms_deform_attn_cuda_c23456_backward(grad_output.contiguous(), 80 | feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, 81 | sampling_locations, scale_weights 82 | ) 83 | 84 | return grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight 85 | 86 | 87 | def msmv_sampling(mlvl_feats, sampling_locations, scale_weights): 88 | if len(mlvl_feats) == 4 and MSMV_CUDA: 89 | return MSMVSamplingC2345.apply(*mlvl_feats, sampling_locations, scale_weights) 90 | elif len(mlvl_feats) == 5 and MSMV_CUDA: 91 | return MSMVSamplingC23456.apply(*mlvl_feats, sampling_locations, scale_weights) 92 | else: 93 | return msmv_sampling_pytorch(mlvl_feats, sampling_locations, scale_weights) 94 | -------------------------------------------------------------------------------- /scripts/timing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 4 | sys.path.insert(0, path) 5 | 6 | import time 7 | import logging 8 | import argparse 9 | import importlib 10 | import torch 11 | import torch.distributed 12 | import torch.backends.cudnn as cudnn 13 | from mmcv import Config, DictAction 14 | from mmcv.parallel import MMDataParallel 15 | from mmcv.runner import load_checkpoint 16 | from mmdet.apis import set_random_seed 17 | from mmdet3d.datasets import build_dataset, build_dataloader 18 | from mmdet3d.models import build_model 19 | 20 | 21 | def init_logging(filename=None, debug=False): 22 | logging.root = logging.RootLogger('DEBUG' if debug else 'INFO') 23 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s') 24 | 25 | stream_handler = logging.StreamHandler(sys.stdout) 26 | stream_handler.setFormatter(formatter) 27 | logging.root.addHandler(stream_handler) 28 | 29 | if filename is not None: 30 | file_handler = logging.FileHandler(filename) 31 | file_handler.setFormatter(formatter) 32 | logging.root.addHandler(file_handler) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser(description='Validate a detector') 37 | parser.add_argument('--config', required=True) 38 | parser.add_argument('--weights', required=True) 39 | parser.add_argument('--num_warmup', default=10) 40 | parser.add_argument('--samples', default=500) 41 | parser.add_argument('--log-interval', default=50, help='interval of logging') 42 | parser.add_argument('--override', nargs='+', action=DictAction) 43 | args = parser.parse_args() 44 | 45 | # parse configs 46 | cfgs = Config.fromfile(args.config) 47 | if args.override is not None: 48 | cfgs.merge_from_dict(args.override) 49 | 50 | # register custom module 51 | importlib.import_module('models') 52 | importlib.import_module('loaders') 53 | 54 | # MMCV, please shut up 55 | from mmcv.utils.logging import logger_initialized 56 | logger_initialized['root'] = logging.Logger(__name__, logging.WARNING) 57 | logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING) 58 | init_logging(None, cfgs.debug) 59 | 60 | # you need GPUs 61 | assert torch.cuda.is_available() and torch.cuda.device_count() == 1 62 | logging.info('Using GPU: %s' % torch.cuda.get_device_name(0)) 63 | torch.cuda.set_device(0) 64 | 65 
| logging.info('Setting random seed: 0') 66 | set_random_seed(0, deterministic=True) 67 | cudnn.benchmark = True 68 | 69 | logging.info('Loading validation set from %s' % cfgs.data.val.data_root) 70 | val_dataset = build_dataset(cfgs.data.val) 71 | val_loader = build_dataloader( 72 | val_dataset, 73 | samples_per_gpu=1, 74 | workers_per_gpu=cfgs.data.workers_per_gpu, 75 | num_gpus=1, 76 | dist=False, 77 | shuffle=False, 78 | seed=0, 79 | ) 80 | 81 | logging.info('Creating model: %s' % cfgs.model.type) 82 | model = build_model(cfgs.model) 83 | model.cuda() 84 | 85 | assert torch.cuda.device_count() == 1 86 | model = MMDataParallel(model, [0]) 87 | 88 | logging.info('Loading checkpoint from %s' % args.weights) 89 | load_checkpoint( 90 | model, args.weights, map_location='cuda', strict=False, 91 | logger=logging.Logger(__name__, logging.ERROR) 92 | ) 93 | model.eval() 94 | 95 | pure_inf_time = 0 96 | with torch.no_grad(): 97 | for i, data in enumerate(val_loader): 98 | torch.cuda.synchronize() 99 | start_time = time.perf_counter() 100 | 101 | model(return_loss=False, rescale=True, **data) 102 | 103 | torch.cuda.synchronize() 104 | elapsed = time.perf_counter() - start_time 105 | 106 | if i >= args.num_warmup: 107 | pure_inf_time += elapsed 108 | if (i + 1) % args.log_interval == 0: 109 | fps = (i + 1 - args.num_warmup) / pure_inf_time 110 | print(f'Done sample [{i + 1:<3}/ {args.samples}], ' 111 | f'fps: {fps:.1f} sample / s') 112 | 113 | if (i + 1) == args.samples: 114 | break 115 | 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /models/bbox/coders/nms_free_coder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mmdet.core.bbox import BaseBBoxCoder 4 | from mmdet.core.bbox.builder import BBOX_CODERS 5 | from ..utils import denormalize_bbox 6 | 7 | 8 | @BBOX_CODERS.register_module() 9 | class NMSFreeCoder(BaseBBoxCoder): 10 | """Bbox coder for NMS-free detector. 11 | Args: 12 | pc_range (list[float]): Range of point cloud. 13 | post_center_range (list[float]): Limit of the center. 14 | Default: None. 15 | max_num (int): Max number to be kept. Default: 100. 16 | score_threshold (float): Threshold to filter boxes based on score. 17 | Default: None. 18 | code_size (int): Code size of bboxes. Default: 9 19 | """ 20 | def __init__(self, 21 | pc_range, 22 | voxel_size=None, 23 | post_center_range=None, 24 | max_num=100, 25 | score_threshold=None, 26 | num_classes=10): 27 | self.pc_range = pc_range 28 | self.voxel_size = voxel_size 29 | self.post_center_range = post_center_range 30 | self.max_num = max_num 31 | self.score_threshold = score_threshold 32 | self.num_classes = num_classes 33 | 34 | def encode(self): 35 | pass 36 | 37 | def decode_single(self, cls_scores, bbox_preds): 38 | """Decode bboxes. 39 | Args: 40 | cls_scores (Tensor): Outputs from the classification head, \ 41 | shape [num_query, cls_out_channels]. Note \ 42 | cls_out_channels should includes background. 43 | bbox_preds (Tensor): Outputs from the regression \ 44 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 45 | Shape [num_query, 9]. 46 | Returns: 47 | list[dict]: Decoded boxes. 
48 | """ 49 | max_num = self.max_num 50 | 51 | cls_scores = cls_scores.sigmoid() 52 | scores, indexs = cls_scores.view(-1).topk(max_num) 53 | labels = indexs % self.num_classes 54 | bbox_index = torch.div(indexs, self.num_classes, rounding_mode='trunc') 55 | bbox_preds = bbox_preds[bbox_index] 56 | 57 | final_box_preds = denormalize_bbox(bbox_preds) 58 | final_scores = scores 59 | final_preds = labels 60 | 61 | # use score threshold 62 | if self.score_threshold is not None: 63 | thresh_mask = final_scores > self.score_threshold 64 | 65 | if self.post_center_range is not None: 66 | limit = torch.tensor(self.post_center_range, device=scores.device) 67 | mask = (final_box_preds[..., :3] >= limit[:3]).all(1) 68 | mask &= (final_box_preds[..., :3] <= limit[3:]).all(1) 69 | 70 | if self.score_threshold: 71 | mask &= thresh_mask 72 | 73 | boxes3d = final_box_preds[mask] 74 | scores = final_scores[mask] 75 | labels = final_preds[mask] 76 | predictions_dict = { 77 | 'bboxes': boxes3d, 78 | 'scores': scores, 79 | 'labels': labels 80 | } 81 | 82 | else: 83 | raise NotImplementedError( 84 | 'Need to reorganize output as a batch, only ' 85 | 'support post_center_range is not None for now!' 86 | ) 87 | 88 | return predictions_dict 89 | 90 | def decode(self, preds_dicts): 91 | """Decode bboxes. 92 | Args: 93 | all_cls_scores (Tensor): Outputs from the classification head, \ 94 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 95 | cls_out_channels should includes background. 96 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 97 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 98 | Shape [nb_dec, bs, num_query, 9]. 99 | Returns: 100 | list[dict]: Decoded boxes. 101 | """ 102 | all_cls_scores = preds_dicts['all_cls_scores'][-1] 103 | all_bbox_preds = preds_dicts['all_bbox_preds'][-1] 104 | 105 | batch_size = all_cls_scores.size()[0] 106 | predictions_list = [] 107 | for i in range(batch_size): 108 | predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) 109 | 110 | return predictions_list 111 | -------------------------------------------------------------------------------- /val.py: -------------------------------------------------------------------------------- 1 | import os 2 | import utils 3 | import logging 4 | import argparse 5 | import importlib 6 | import torch 7 | import torch.distributed 8 | import torch.distributed as dist 9 | import torch.backends.cudnn as cudnn 10 | from mmcv import Config 11 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 12 | from mmcv.runner import load_checkpoint 13 | from mmdet.apis import set_random_seed, multi_gpu_test, single_gpu_test 14 | from mmdet3d.datasets import build_dataset, build_dataloader 15 | from mmdet3d.models import build_model 16 | from models.utils import VERSION 17 | 18 | 19 | def evaluate(dataset, results, epoch): 20 | metrics = dataset.evaluate(results, jsonfile_prefix='submission') 21 | return metrics 22 | 23 | # mAP = metrics['pts_bbox_NuScenes/mAP'] 24 | # mATE = metrics['pts_bbox_NuScenes/mATE'] 25 | # mASE = metrics['pts_bbox_NuScenes/mASE'] 26 | # mAOE = metrics['pts_bbox_NuScenes/mAOE'] 27 | # mAVE = metrics['pts_bbox_NuScenes/mAVE'] 28 | # mAAE = metrics['pts_bbox_NuScenes/mAAE'] 29 | # NDS = metrics['pts_bbox_NuScenes/NDS'] 30 | 31 | # logging.info('--- Evaluation Results (Epoch %d) ---' % epoch) 32 | # logging.info('mAP: %.4f' % metrics['pts_bbox_NuScenes/mAP']) 33 | # logging.info('mATE: %.4f' % 
metrics['pts_bbox_NuScenes/mATE']) 34 | # logging.info('mASE: %.4f' % metrics['pts_bbox_NuScenes/mASE']) 35 | # logging.info('mAOE: %.4f' % metrics['pts_bbox_NuScenes/mAOE']) 36 | # logging.info('mAVE: %.4f' % metrics['pts_bbox_NuScenes/mAVE']) 37 | # logging.info('mAAE: %.4f' % metrics['pts_bbox_NuScenes/mAAE']) 38 | # logging.info('NDS: %.4f' % metrics['pts_bbox_NuScenes/NDS']) 39 | 40 | # return { 41 | # 'mAP': mAP, 42 | # 'mATE': mATE, 43 | # 'mASE': mASE, 44 | # 'mAOE': mAOE, 45 | # 'mAVE': mAVE, 46 | # 'mAAE': mAAE, 47 | # 'NDS': NDS, 48 | # } 49 | 50 | 51 | def main(): 52 | parser = argparse.ArgumentParser(description='Validate a detector') 53 | parser.add_argument('--config', required=True) 54 | parser.add_argument('--weights', required=True) 55 | parser.add_argument('--local_rank', type=int, default=0) 56 | parser.add_argument('--world_size', type=int, default=1) 57 | parser.add_argument('--batch_size', type=int, default=1) 58 | args = parser.parse_args() 59 | 60 | # parse configs 61 | cfgs = Config.fromfile(args.config) 62 | 63 | # register custom module 64 | importlib.import_module('models') 65 | importlib.import_module('loaders') 66 | 67 | # MMCV, please shut up 68 | from mmcv.utils.logging import logger_initialized 69 | logger_initialized['root'] = logging.Logger(__name__, logging.WARNING) 70 | logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING) 71 | 72 | # you need GPUs 73 | assert torch.cuda.is_available() 74 | 75 | # determine local_rank and world_size 76 | if 'LOCAL_RANK' not in os.environ: 77 | os.environ['LOCAL_RANK'] = str(args.local_rank) 78 | 79 | if 'WORLD_SIZE' not in os.environ: 80 | os.environ['WORLD_SIZE'] = str(args.world_size) 81 | 82 | local_rank = int(os.environ['LOCAL_RANK']) 83 | world_size = int(os.environ['WORLD_SIZE']) 84 | 85 | if local_rank == 0: 86 | utils.init_logging(None, cfgs.debug) 87 | else: 88 | logging.root.disabled = True 89 | 90 | logging.info('Using GPU: %s' % torch.cuda.get_device_name(local_rank)) 91 | torch.cuda.set_device(local_rank) 92 | 93 | if world_size > 1: 94 | logging.info('Initializing DDP with %d GPUs...' 
% world_size) 95 | dist.init_process_group('nccl', init_method='env://') 96 | 97 | logging.info('Setting random seed: 0') 98 | set_random_seed(0, deterministic=True) 99 | cudnn.benchmark = True 100 | 101 | logging.info('Loading validation set from %s' % cfgs.data.val.data_root) 102 | val_dataset = build_dataset(cfgs.data.val) 103 | val_loader = build_dataloader( 104 | val_dataset, 105 | samples_per_gpu=args.batch_size, 106 | workers_per_gpu=cfgs.data.workers_per_gpu, 107 | num_gpus=world_size, 108 | dist=world_size > 1, 109 | shuffle=False, 110 | seed=0, 111 | ) 112 | 113 | logging.info('Creating model: %s' % cfgs.model.type) 114 | model = build_model(cfgs.model) 115 | model.cuda() 116 | model.fp16_enabled = True 117 | 118 | if world_size > 1: 119 | model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False) 120 | else: 121 | model = MMDataParallel(model, [0]) 122 | 123 | logging.info('Loading checkpoint from %s' % args.weights) 124 | checkpoint = load_checkpoint( 125 | model, args.weights, map_location='cuda', strict=True, 126 | logger=logging.Logger(__name__, logging.ERROR) 127 | ) 128 | 129 | if 'version' in checkpoint: 130 | VERSION.name = checkpoint['version'] 131 | 132 | if world_size > 1: 133 | results = multi_gpu_test(model, val_loader, gpu_collect=False) 134 | else: 135 | results = single_gpu_test(model, val_loader) 136 | 137 | if local_rank == 0: 138 | evaluate(val_dataset, results, -1) 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /models/core/hook/ema.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # modified from megvii-bevdepth. 3 | import math 4 | import os 5 | from copy import deepcopy 6 | 7 | import torch 8 | from mmcv.runner import load_state_dict 9 | from mmcv.runner.dist_utils import master_only 10 | from mmcv.runner.hooks import HOOKS, Hook 11 | from .utils import is_parallel 12 | from mmcv.fileio import FileClient 13 | import os.path as osp 14 | 15 | __all__ = ['ModelEMA'] 16 | 17 | 18 | class ModelEMA: 19 | """Model Exponential Moving Average from https://github.com/rwightman/ 20 | pytorch-image-models Keep a moving average of everything in the model 21 | state_dict (parameters and buffers). 22 | 23 | This is intended to allow functionality like 24 | https://www.tensorflow.org/api_docs/python/tf/train/ 25 | ExponentialMovingAverage 26 | A smoothed version of the weights is necessary for some training 27 | schemes to perform well. 28 | This class is sensitive where it is initialized in the sequence 29 | of model init, GPU assignment and distributed training wrappers. 30 | """ 31 | 32 | def __init__(self, model, decay=0.9999, updates=0): 33 | """ 34 | Args: 35 | model (nn.Module): model to apply EMA. 36 | decay (float): ema decay reate. 37 | updates (int): counter of EMA updates. 
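            Example (a minimal sketch; the tiny stand-in network and the use of
            MMDataParallel here are assumptions for illustration only, the real
            input is the wrapped detector):

                import torch.nn as nn
                from mmcv.parallel import MMDataParallel

                net = MMDataParallel(nn.Linear(4, 4).cuda(), [0])
                ema = ModelEMA(net, decay=0.9990)
                # the effective decay ramps up with the update counter:
                # decay(x) = 0.999 * (1 - exp(-x / 2000)), roughly 5e-4 after the
                # first update and roughly 0.63 after 2000 updates, which keeps the
                # EMA close to the raw weights early in training.
                ema.update(None, net)   # typically called once per training iteration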
38 | """ 39 | # Create EMA(FP32) 40 | self.ema_model = deepcopy(model).eval() 41 | self.ema = self.ema_model.module.module if is_parallel( 42 | self.ema_model.module) else self.ema_model.module 43 | self.updates = updates 44 | # decay exponential ramp (to help early epochs) 45 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 46 | for p in self.ema.parameters(): 47 | p.requires_grad_(False) 48 | 49 | # print(f"Model is on device: {next(self.ema.parameters()).device}") 50 | # k=1 51 | 52 | def update(self, trainer, model): 53 | # Update EMA parameters 54 | with torch.no_grad(): 55 | self.updates += 1 56 | d = self.decay(self.updates) 57 | 58 | msd = model.module.state_dict() if is_parallel( 59 | model) else model.state_dict() # model state_dict 60 | for k, v in self.ema.state_dict().items(): 61 | if v.dtype.is_floating_point: 62 | print(k) 63 | v *= d 64 | v += (1.0 - d) * msd[k].detach() 65 | k=1 66 | 67 | @HOOKS.register_module() 68 | class MEGVIIEMAHook(Hook): 69 | """EMAHook used in BEVDepth. 70 | 71 | Modified from https://github.com/Megvii-Base 72 | Detection/BEVDepth/blob/main/callbacks/ema.py. 73 | """ 74 | 75 | def __init__(self, init_updates=0, decay=0.9990, resume=None,max_keep_ckpts=-1): 76 | super().__init__() 77 | self.init_updates = init_updates 78 | self.resume = resume 79 | self.decay = decay 80 | self.interval = 1 81 | self.max_keep_ckpts = max_keep_ckpts 82 | 83 | 84 | def before_run(self, runner): 85 | from torch.nn.modules.batchnorm import SyncBatchNorm 86 | 87 | bn_model_list = list() 88 | bn_model_dist_group_list = list() 89 | for model_ref in runner.model.modules(): 90 | if isinstance(model_ref, SyncBatchNorm): 91 | bn_model_list.append(model_ref) 92 | bn_model_dist_group_list.append(model_ref.process_group) 93 | model_ref.process_group = None 94 | runner.ema_model = ModelEMA(runner.model, self.decay) 95 | 96 | for bn_model, dist_group in zip(bn_model_list, 97 | bn_model_dist_group_list): 98 | bn_model.process_group = dist_group 99 | runner.ema_model.updates = self.init_updates 100 | 101 | if self.resume is not None: 102 | runner.logger.info(f'resume ema checkpoint from {self.resume}') 103 | cpt = torch.load(self.resume, map_location='cpu') 104 | load_state_dict(runner.ema_model.ema, cpt['state_dict']) 105 | runner.ema_model.updates = cpt['updates'] 106 | 107 | def after_train_iter(self, runner): 108 | runner.ema_model.update(runner, runner.model.module) 109 | 110 | def after_train_epoch(self, runner): 111 | self.save_checkpoint(runner) 112 | 113 | @master_only 114 | def save_checkpoint(self, runner): 115 | state_dict = runner.ema_model.ema.state_dict() 116 | ema_checkpoint = { 117 | 'epoch': runner.epoch, 118 | 'state_dict': state_dict, 119 | 'updates': runner.ema_model.updates 120 | } 121 | save_path = f'epoch_{runner.epoch+1}_ema.pth' 122 | save_path = os.path.join(runner.work_dir, save_path) 123 | torch.save(ema_checkpoint, save_path) 124 | 125 | # if self.max_keep_ckpts > 0: 126 | # current_ckpt = runner.epoch + 1 127 | # redundant_ckpts = range( 128 | # current_ckpt - self.max_keep_ckpts, 0, 129 | # -self.interval) 130 | # for _step in redundant_ckpts: 131 | # ckpt_pth=f'epoch_{_step}_ema.pth' 132 | # ckpt_pth = os.path.join(runner.work_dir, ckpt_pth) 133 | # if osp.exists(ckpt_pth): 134 | # os.remove(ckpt_pth) 135 | 136 | 137 | runner.logger.info(f'Saving ema checkpoint at {save_path}') 138 | -------------------------------------------------------------------------------- /models/core/hook/ema2.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # modified from megvii-bevdepth. 3 | import math 4 | import os 5 | from copy import deepcopy 6 | 7 | import torch 8 | from mmcv.runner import load_state_dict 9 | from mmcv.runner.dist_utils import master_only 10 | from mmcv.runner.hooks import HOOKS, Hook 11 | from .utils import is_parallel 12 | from mmdet3d.models import build_model 13 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 14 | 15 | __all__ = ['ModelEMA2'] 16 | 17 | 18 | class ModelEMA2: 19 | """Model Exponential Moving Average from https://github.com/rwightman/ 20 | pytorch-image-models Keep a moving average of everything in the model 21 | state_dict (parameters and buffers). 22 | 23 | This is intended to allow functionality like 24 | https://www.tensorflow.org/api_docs/python/tf/train/ 25 | ExponentialMovingAverage 26 | A smoothed version of the weights is necessary for some training 27 | schemes to perform well. 28 | This class is sensitive where it is initialized in the sequence 29 | of model init, GPU assignment and distributed training wrappers. 30 | """ 31 | 32 | def __init__(self, model, decay=0.9999, updates=0, ema_model_cfg=None): 33 | """ 34 | Args: 35 | model (nn.Module): model to apply EMA. 36 | decay (float): ema decay rate. 37 | updates (int): counter of EMA updates. 38 | """ 39 | # Create EMA(FP32) 40 | assert ema_model_cfg is not None, 'ema_model_cfg is None' 41 | self.ema_model = self.clone_model(model, ema_model_cfg).eval() 42 | 43 | self.ema = self.ema_model.module.module if is_parallel( 44 | self.ema_model.module) else self.ema_model.module 45 | self.updates = updates 46 | # decay exponential ramp (to help early epochs) 47 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 48 | for p in self.ema.parameters(): 49 | p.requires_grad_(False) 50 | 51 | 52 | def clone_model(self, model, ema_model_cfg): 53 | """ 54 | Clone a model by rebuilding it from config and copying its state_dict. 55 | """ 56 | cloned_model = build_model(ema_model_cfg) 57 | if is_parallel(model.module): 58 | if type(model) == MMDistributedDataParallel: 59 | cloned_model = type(model)(cloned_model, [0], find_unused_parameters=True) 60 | else: 61 | cloned_model = type(model)(cloned_model, [0]) 62 | 63 | cloned_model.load_state_dict(model.state_dict()) 64 | 65 | return cloned_model 66 | 67 | def update(self, trainer, model): 68 | # Update EMA parameters 69 | with torch.no_grad(): 70 | self.updates += 1 71 | d = self.decay(self.updates) 72 | 73 | msd = model.module.state_dict() if is_parallel( 74 | model) else model.state_dict() # model state_dict 75 | for k, v in self.ema.state_dict().items(): 76 | if v.dtype.is_floating_point: 77 | v *= d 78 | v += (1.0 - d) * msd[k].detach() 79 | 80 | 81 | @HOOKS.register_module() 82 | class MEGVIIEMAHook2(Hook): 83 | """EMAHook used in BEVDepth. 84 | 85 | Modified from https://github.com/Megvii-Base 86 | Detection/BEVDepth/blob/main/callbacks/ema.py.
87 | """ 88 | 89 | def __init__(self, init_updates=0, decay=0.9990, resume=None,ema_model_cfg=None): 90 | super().__init__() 91 | self.init_updates = init_updates 92 | self.resume = resume 93 | self.decay = decay 94 | self.ema_model_cfg = ema_model_cfg 95 | 96 | def before_run(self, runner): 97 | from torch.nn.modules.batchnorm import SyncBatchNorm 98 | 99 | bn_model_list = list() 100 | bn_model_dist_group_list = list() 101 | for model_ref in runner.model.modules(): 102 | if isinstance(model_ref, SyncBatchNorm): 103 | bn_model_list.append(model_ref) 104 | bn_model_dist_group_list.append(model_ref.process_group) 105 | model_ref.process_group = None 106 | runner.ema_model = ModelEMA2(runner.model, self.decay,ema_model_cfg=self.ema_model_cfg) 107 | 108 | for bn_model, dist_group in zip(bn_model_list, 109 | bn_model_dist_group_list): 110 | bn_model.process_group = dist_group 111 | runner.ema_model.updates = self.init_updates 112 | 113 | if self.resume is not None: 114 | runner.logger.info(f'resume ema checkpoint from {self.resume}') 115 | cpt = torch.load(self.resume, map_location='cpu') 116 | load_state_dict(runner.ema_model.ema, cpt['state_dict']) 117 | runner.ema_model.updates = cpt['updates'] 118 | 119 | def after_train_iter(self, runner): 120 | runner.ema_model.update(runner, runner.model.module) 121 | 122 | def after_train_epoch(self, runner): 123 | self.save_checkpoint(runner) 124 | 125 | @master_only 126 | def save_checkpoint(self, runner): 127 | state_dict = runner.ema_model.ema.state_dict() 128 | ema_checkpoint = { 129 | 'epoch': runner.epoch, 130 | 'state_dict': state_dict, 131 | 'updates': runner.ema_model.updates 132 | } 133 | save_path = f'epoch_{runner.epoch+1}_ema.pth' 134 | save_path = os.path.join(runner.work_dir, save_path) 135 | torch.save(ema_checkpoint, save_path) 136 | runner.logger.info(f'Saving ema checkpoint at {save_path}') 137 | -------------------------------------------------------------------------------- /scripts/gen_sweep_info.py: -------------------------------------------------------------------------------- 1 | # Generate info files manually 2 | import os 3 | import mmcv 4 | import tqdm 5 | import pickle 6 | import argparse 7 | import numpy as np 8 | from nuscenes import NuScenes 9 | from pyquaternion import Quaternion 10 | 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--data-root', default='data/nuscenes') 14 | parser.add_argument('--version', default='v1.0-trainval') 15 | args = parser.parse_args() 16 | 17 | 18 | def get_cam_info(nusc, sample_data): 19 | pose_record = nusc.get('ego_pose', sample_data['ego_pose_token']) 20 | cs_record = nusc.get('calibrated_sensor', sample_data['calibrated_sensor_token']) 21 | 22 | sensor2ego_translation = cs_record['translation'] 23 | ego2global_translation = pose_record['translation'] 24 | sensor2ego_rotation = Quaternion(cs_record['rotation']).rotation_matrix 25 | ego2global_rotation = Quaternion(pose_record['rotation']).rotation_matrix 26 | cam_intrinsic = np.array(cs_record['camera_intrinsic']) 27 | 28 | sensor2global_rotation = sensor2ego_rotation.T @ ego2global_rotation.T 29 | sensor2global_translation = sensor2ego_translation @ ego2global_rotation.T + ego2global_translation 30 | 31 | return { 32 | 'data_path': os.path.join(args.data_root, sample_data['filename']), 33 | 'sensor2global_rotation': sensor2global_rotation, 34 | 'sensor2global_translation': sensor2global_translation, 35 | 'cam_intrinsic': cam_intrinsic, 36 | 'timestamp': sample_data['timestamp'], 37 | } 38 | 39 | 40 | def 
add_sweep_info(nusc, sample_infos): 41 | for curr_id in tqdm.tqdm(range(len(sample_infos['infos']))): 42 | sample = nusc.get('sample', sample_infos['infos'][curr_id]['token']) 43 | 44 | # add scene name for occupancy 45 | scene = nusc.get('scene', sample['scene_token']) 46 | sample_infos['infos'][curr_id]['scene_name'] = scene['name'] 47 | sample_infos['infos'][curr_id]['scene_token'] = scene['token'] 48 | sample_infos['infos'][curr_id]['lidar_token'] = sample['data']['LIDAR_TOP'] 49 | 50 | cam_types = [ 51 | 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 52 | 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT' 53 | ] 54 | 55 | curr_cams = dict() 56 | for cam in cam_types: 57 | curr_cams[cam] = nusc.get('sample_data', sample['data'][cam]) 58 | 59 | for cam in cam_types: 60 | sample_data = nusc.get('sample_data', sample['data'][cam]) 61 | sweep_cam = get_cam_info(nusc, sample_data) 62 | sample_infos['infos'][curr_id]['cams'][cam].update(sweep_cam) 63 | 64 | # remove unnecessary 65 | for cam in cam_types: 66 | del sample_infos['infos'][curr_id]['cams'][cam]['sample_data_token'] 67 | del sample_infos['infos'][curr_id]['cams'][cam]['sensor2ego_translation'] 68 | del sample_infos['infos'][curr_id]['cams'][cam]['sensor2ego_rotation'] 69 | del sample_infos['infos'][curr_id]['cams'][cam]['ego2global_translation'] 70 | del sample_infos['infos'][curr_id]['cams'][cam]['ego2global_rotation'] 71 | 72 | sweep_infos = [] 73 | if sample['prev'] != '': # add sweep frame between two key frame 74 | for _ in range(5): 75 | sweep_info = dict() 76 | for cam in cam_types: 77 | if curr_cams[cam]['prev'] == '': 78 | sweep_info = sweep_infos[-1] 79 | break 80 | sample_data = nusc.get('sample_data', curr_cams[cam]['prev']) 81 | sweep_cam = get_cam_info(nusc, sample_data) 82 | curr_cams[cam] = sample_data 83 | sweep_info[cam] = sweep_cam 84 | sweep_infos.append(sweep_info) 85 | 86 | sample_infos['infos'][curr_id]['lidar_sweeps'] = \ 87 | sample_infos['infos'][curr_id].pop('sweeps') 88 | sample_infos['infos'][curr_id]['cam_sweeps'] = sweep_infos 89 | 90 | return sample_infos 91 | 92 | 93 | if __name__ == '__main__': 94 | nusc = NuScenes(args.version, args.data_root) 95 | 96 | if args.version == 'v1.0-trainval': 97 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_train.pkl'), 'rb')) 98 | sample_infos = add_sweep_info(nusc, sample_infos) 99 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_train_sweep.pkl')) 100 | 101 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_val.pkl'), 'rb')) 102 | sample_infos = add_sweep_info(nusc, sample_infos) 103 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_val_sweep.pkl')) 104 | 105 | elif args.version == 'v1.0-test': 106 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_test.pkl'), 'rb')) 107 | sample_infos = add_sweep_info(nusc, sample_infos) 108 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_test_sweep.pkl')) 109 | 110 | elif args.version == 'v1.0-mini': 111 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_train_mini.pkl'), 'rb')) 112 | sample_infos = add_sweep_info(nusc, sample_infos) 113 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_train_mini_sweep.pkl')) 114 | 115 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_val_mini.pkl'), 'rb')) 116 | sample_infos = add_sweep_info(nusc, sample_infos) 117 | mmcv.dump(sample_infos, 
os.path.join(args.data_root, 'nuscenes_infos_val_mini_sweep.pkl')) 118 | 119 | else: 120 | raise ValueError 121 | -------------------------------------------------------------------------------- /models/backbones/eva02/wrappers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | """ 3 | Wrappers around on some nn functions, mainly to support empty tensors. 4 | 5 | Ideally, add support directly in PyTorch to empty tensors in those functions. 6 | 7 | These can be removed once https://github.com/pytorch/pytorch/issues/12013 8 | is implemented 9 | """ 10 | 11 | import warnings 12 | from typing import List, Optional 13 | import torch 14 | from torch.nn import functional as F 15 | 16 | 17 | def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor: 18 | """ 19 | Turn a list of integer scalars or integer Tensor scalars into a vector, 20 | in a way that's both traceable and scriptable. 21 | 22 | In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs. 23 | In scripting or eager, `x` should be a list of int. 24 | """ 25 | if torch.jit.is_scripting(): 26 | return torch.as_tensor(x, device=device) 27 | if torch.jit.is_tracing(): 28 | assert all( 29 | [isinstance(t, torch.Tensor) for t in x] 30 | ), "Shape should be tensor during tracing!" 31 | # as_tensor should not be used in tracing because it records a constant 32 | ret = torch.stack(x) 33 | if ret.device != device: # avoid recording a hard-coded device if not necessary 34 | ret = ret.to(device=device) 35 | return ret 36 | return torch.as_tensor(x, device=device) 37 | 38 | 39 | def cat(tensors: List[torch.Tensor], dim: int = 0): 40 | """ 41 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 42 | """ 43 | assert isinstance(tensors, (list, tuple)) 44 | if len(tensors) == 1: 45 | return tensors[0] 46 | return torch.cat(tensors, dim) 47 | 48 | 49 | def empty_input_loss_func_wrapper(loss_func): 50 | def wrapped_loss_func(input, target, *, reduction="mean", **kwargs): 51 | """ 52 | Same as `loss_func`, but returns 0 (instead of nan) for empty inputs. 53 | """ 54 | if target.numel() == 0 and reduction == "mean": 55 | return input.sum() * 0.0 # connect the gradient 56 | return loss_func(input, target, reduction=reduction, **kwargs) 57 | 58 | return wrapped_loss_func 59 | 60 | 61 | cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy) 62 | 63 | 64 | class _NewEmptyTensorOp(torch.autograd.Function): 65 | @staticmethod 66 | def forward(ctx, x, new_shape): 67 | ctx.shape = x.shape 68 | return x.new_empty(new_shape) 69 | 70 | @staticmethod 71 | def backward(ctx, grad): 72 | shape = ctx.shape 73 | return _NewEmptyTensorOp.apply(grad, shape), None 74 | 75 | 76 | class Conv2d(torch.nn.Conv2d): 77 | """ 78 | A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. 79 | """ 80 | 81 | def __init__(self, *args, **kwargs): 82 | """ 83 | Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: 84 | 85 | Args: 86 | norm (nn.Module, optional): a normalization layer 87 | activation (callable(Tensor) -> Tensor): a callable activation function 88 | 89 | It assumes that norm layer is used before activation. 
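        Example (illustrative sketch; the specific norm and activation chosen
        here are assumptions, not defaults used elsewhere in this backbone):

            conv = Conv2d(
                64, 128, kernel_size=3, padding=1, bias=False,
                norm=torch.nn.BatchNorm2d(128),           # applied right after the convolution
                activation=torch.nn.functional.relu,      # applied after the norm
            )
            y = conv(torch.randn(2, 64, 32, 32))          # -> shape [2, 128, 32, 32]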
90 | """ 91 | norm = kwargs.pop("norm", None) 92 | activation = kwargs.pop("activation", None) 93 | super().__init__(*args, **kwargs) 94 | 95 | self.norm = norm 96 | self.activation = activation 97 | 98 | def forward(self, x): 99 | # torchscript does not support SyncBatchNorm yet 100 | # https://github.com/pytorch/pytorch/issues/40507 101 | # and we skip these codes in torchscript since: 102 | # 1. currently we only support torchscript in evaluation mode 103 | # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or 104 | # later version, `Conv2d` in these PyTorch versions has already supported empty inputs. 105 | if not torch.jit.is_scripting(): 106 | with warnings.catch_warnings(record=True): 107 | if x.numel() == 0 and self.training: 108 | # https://github.com/pytorch/pytorch/issues/12013 109 | assert not isinstance( 110 | self.norm, torch.nn.SyncBatchNorm 111 | ), "SyncBatchNorm does not support empty inputs!" 112 | 113 | x = F.conv2d( 114 | x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups 115 | ) 116 | if self.norm is not None: 117 | x = self.norm(x) 118 | if self.activation is not None: 119 | x = self.activation(x) 120 | return x 121 | 122 | 123 | ConvTranspose2d = torch.nn.ConvTranspose2d 124 | BatchNorm2d = torch.nn.BatchNorm2d 125 | interpolate = F.interpolate 126 | Linear = torch.nn.Linear 127 | 128 | 129 | def nonzero_tuple(x): 130 | """ 131 | A 'as_tuple=True' version of torch.nonzero to support torchscript. 132 | because of https://github.com/pytorch/pytorch/issues/38718 133 | """ 134 | if torch.jit.is_scripting(): 135 | if x.dim() == 0: 136 | return x.unsqueeze(0).nonzero().unbind(1) 137 | return x.nonzero().unbind(1) 138 | else: 139 | return x.nonzero(as_tuple=True) 140 | 141 | 142 | @torch.jit.script_if_tracing 143 | def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor: 144 | """ 145 | Tracing friendly way to cast tensor to another tensor's device. Device will be treated 146 | as constant during tracing, scripting the casting process as whole can workaround this issue. 
147 | """ 148 | return src.to(dst.device) -------------------------------------------------------------------------------- /models/opusv1/opus_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from ..bbox.utils import decode_points 4 | from ..utils import rotation_3d_in_axis, DUMP 5 | from ..csrc.wrapper import msmv_sampling, msmv_sampling_pytorch 6 | 7 | 8 | def make_sample_points(query_points, offset, pc_range): 9 | ''' 10 | query_points: [B, Q, P, 3] (x, y, z) 11 | offset: [B, Q, G, P, 3] 12 | ''' 13 | xyz = decode_points(query_points, pc_range) # [B, Q, 3] 14 | xyz = xyz[..., None, None, :] # [B, Q, 1, 1, 3] 15 | sample_xyz = xyz + offset # [B, Q, G, P, 3] 16 | return sample_xyz 17 | 18 | 19 | def sampling_4d(sample_points, mlvl_feats, scale_weights, occ2img, image_h, image_w, num_views=6, eps=1e-5): 20 | """ 21 | Args: 22 | sample_points: 3D sampling points in shape [B, Q, T, G, P, 3] 23 | mlvl_feats: list of multi-scale features from neck, each in shape [B*T*G, C, N, H, W] 24 | scale_weights: weights for multi-scale aggregation, [B, Q, G, T, P, L] 25 | occ2img: 4x4 projection matrix in shape [B, TN, 4, 4] 26 | Symbol meaning: 27 | B: batch size 28 | Q: num of queries 29 | T: num of frames 30 | G: num of groups (we follow the group sampling mechanism of AdaMixer) 31 | P: num of sampling points per frame per group 32 | N: num of views (six for nuScenes) 33 | L: num of layers of feature pyramid (typically it is 4: C2, C3, C4, C5) 34 | """ 35 | 36 | B, Q, T, G, P, _ = sample_points.shape # [B, Q, T, G, P, 3] 37 | N = num_views 38 | 39 | sample_points = sample_points.reshape(B, Q, T, G * P, 3) 40 | 41 | # get the projection matrix 42 | occ2img = occ2img[:, :, None, None, :, :] # [B, TN, 1, 1, 4, 4] 43 | occ2img = occ2img.expand(B, T*N, Q, G * P, 4, 4) 44 | occ2img = occ2img.reshape(B, T, N, Q, G*P, 4, 4) 45 | 46 | # expand the points 47 | ones = torch.ones_like(sample_points[..., :1]) 48 | sample_points = torch.cat([sample_points, ones], dim=-1) # [B, Q, GP, 4] 49 | sample_points = sample_points[:, :, None, ..., None] # [B, Q, T, GP, 4] 50 | sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1) 51 | sample_points = sample_points.transpose(1, 3) # [B, T, N, Q, GP, 4, 1] 52 | 53 | # project 3d sampling points to N views 54 | sample_points_cam = torch.matmul(occ2img, sample_points).squeeze(-1) # [B, T, N, Q, GP, 4] 55 | 56 | # homo coord -> pixel coord 57 | homo = sample_points_cam[..., 2:3] 58 | homo_nonzero = torch.maximum(homo, torch.zeros_like(homo) + eps) 59 | sample_points_cam = sample_points_cam[..., 0:2] / homo_nonzero # [B, T, N, Q, GP, 2] 60 | 61 | # normalize 62 | sample_points_cam[..., 0] /= image_w 63 | sample_points_cam[..., 1] /= image_h 64 | 65 | # check if out of image 66 | valid_mask = ((homo > eps) \ 67 | & (sample_points_cam[..., 1:2] > 0.0) 68 | & (sample_points_cam[..., 1:2] < 1.0) 69 | & (sample_points_cam[..., 0:1] > 0.0) 70 | & (sample_points_cam[..., 0:1] < 1.0) 71 | ).squeeze(-1).float() # [B, T, N, Q, GP] 72 | 73 | # for visualization only 74 | if DUMP.enabled: 75 | torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1).cpu(), 76 | '{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 77 | torch.save(valid_mask.cpu(), 78 | '{}/sample_points_cam_valid_mask_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 79 | 80 | valid_mask = valid_mask.permute(0, 1, 3, 4, 2) # [B, T, Q, GP, N] 81 | sample_points_cam = 
sample_points_cam.permute(0, 1, 3, 4, 2, 5) # [B, T, Q, GP, N, 2] 82 | 83 | # prepare batched indexing 84 | i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device) 85 | i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device) 86 | i_time = torch.arange(T, dtype=torch.long, device=sample_points.device) 87 | i_point = torch.arange(G * P, dtype=torch.long, device=sample_points.device) 88 | i_batch = i_batch.view(B, 1, 1, 1, 1).expand(B, T, Q, G * P, 1) 89 | i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1) 90 | i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1) 91 | i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1) 92 | 93 | # we only keep at most one valid sampling point, see https://zhuanlan.zhihu.com/p/654821380 94 | i_view = torch.argmax(valid_mask, dim=-1)[..., None] # [B, T, Q, GP, 1] 95 | 96 | # index the only one sampling point and its valid flag 97 | sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :] # [B, Q, GP, 1, 2] 98 | valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view] # [B, Q, GP, 1] 99 | 100 | # treat the view index as a new axis for grid_sample and normalize the view index to [0, 1] 101 | sample_points_cam = torch.cat([sample_points_cam, i_view[..., None].float() / (N - 1)], dim=-1) 102 | 103 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 104 | sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3) 105 | sample_points_cam = sample_points_cam.permute(0, 1, 3, 2, 4, 5, 6) # [B, T, G, Q, P, 1, 3] 106 | sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3) 107 | sample_points_cam = sample_points_cam.contiguous() 108 | 109 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 110 | scale_weights = scale_weights.reshape(B, Q, G, T, P, -1) 111 | scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5) 112 | scale_weights = scale_weights.reshape(B*G*T, Q, P, -1) 113 | scale_weights = scale_weights.contiguous() 114 | 115 | # multi-scale multi-view grid sample 116 | final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights) 117 | 118 | # reorganize the sampled features 119 | C = final.shape[2] # [BTG, Q, C, P] 120 | final = final.reshape(B, T, G, Q, C, P) 121 | final = final.permute(0, 3, 2, 1, 5, 4) 122 | final = final.flatten(3, 4) # [B, Q, G, FP, C] 123 | 124 | return final 125 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import utils 4 | import shutil 5 | import logging 6 | import argparse 7 | import importlib 8 | import os.path as osp 9 | import torch 10 | import torch.distributed as dist 11 | from datetime import datetime 12 | from mmcv import Config, DictAction 13 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 14 | from mmcv.runner import EpochBasedRunner, build_optimizer, load_checkpoint 15 | from mmdet.apis import set_random_seed 16 | from mmdet.core import DistEvalHook, EvalHook 17 | from mmdet3d.datasets import build_dataset 18 | from mmdet3d.models import build_model 19 | from loaders.builder import build_dataloader 20 | from models.core.hook.ema import MEGVIIEMAHook 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description='Train a detector') 25 | parser.add_argument('--config', required=True) 26 | parser.add_argument('--override', nargs='+', 
action=DictAction) 27 | parser.add_argument('--local_rank', type=int, default=0) 28 | parser.add_argument('--world_size', type=int, default=1) 29 | args = parser.parse_args() 30 | 31 | # parse configs 32 | cfgs = Config.fromfile(args.config) 33 | if args.override is not None: 34 | cfgs.merge_from_dict(args.override) 35 | 36 | # register custom module 37 | importlib.import_module('models') 38 | importlib.import_module('loaders') 39 | 40 | # MMCV, please shut up 41 | from mmcv.utils.logging import logger_initialized 42 | logger_initialized['root'] = logging.Logger(__name__, logging.WARNING) 43 | logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING) 44 | logger_initialized['mmdet3d'] = logging.Logger(__name__, logging.WARNING) 45 | 46 | # you need GPUs 47 | assert torch.cuda.is_available() 48 | 49 | # determine local_rank and world_size 50 | if 'LOCAL_RANK' not in os.environ: 51 | os.environ['LOCAL_RANK'] = str(args.local_rank) 52 | 53 | if 'WORLD_SIZE' not in os.environ: 54 | os.environ['WORLD_SIZE'] = str(args.world_size) 55 | 56 | local_rank = int(os.environ['LOCAL_RANK']) 57 | world_size = int(os.environ['WORLD_SIZE']) 58 | 59 | logging.info('Using GPU: %s' % torch.cuda.get_device_name(local_rank)) 60 | torch.cuda.set_device(local_rank) 61 | 62 | timestamp = torch.tensor(time.time(), dtype=torch.float64).cuda() 63 | if world_size > 1: 64 | logging.info('Initializing DDP with %d GPUs...' % world_size) 65 | dist.init_process_group('nccl', init_method='env://') 66 | dist.broadcast(timestamp, 0) 67 | 68 | # resume or start a new run 69 | if cfgs.resume_from is not None: 70 | assert os.path.isfile(cfgs.resume_from) 71 | work_dir = os.path.dirname(cfgs.resume_from) 72 | else: 73 | run_name = osp.splitext(osp.split(args.config)[-1])[0] 74 | run_name += '_' + time.strftime("%Y-%m-%d/%H-%M-%S", 75 | time.localtime(timestamp.cpu().item())) 76 | work_dir = os.path.join('outputs', cfgs.model.type, run_name) 77 | 78 | if local_rank == 0: 79 | if os.path.exists(work_dir): # must be an empty dir 80 | raise FileExistsError(work_dir) 81 | os.makedirs(work_dir, exist_ok=False) 82 | 83 | # init logging, backup code 84 | utils.init_logging(os.path.join(work_dir, 'train.log'), cfgs.debug) 85 | utils.backup_code(work_dir) 86 | logging.info('Logs will be saved to %s' % work_dir) 87 | else: 88 | # disable logging on other workers 89 | logging.root.disabled = True 90 | 91 | logging.info('Setting random seed: 0') 92 | set_random_seed(0, deterministic=True) 93 | 94 | logging.info('Loading training set from %s' % cfgs.dataset_root) 95 | train_dataset = build_dataset(cfgs.data.train) 96 | train_loader = build_dataloader( 97 | train_dataset, 98 | samples_per_gpu=cfgs.batch_size // world_size, 99 | workers_per_gpu=cfgs.data.workers_per_gpu, 100 | num_gpus=world_size, 101 | dist=world_size > 1, 102 | shuffle=True, 103 | seed=0, 104 | ) 105 | 106 | logging.info('Loading validation set from %s' % cfgs.dataset_root) 107 | val_dataset = build_dataset(cfgs.data.val) 108 | val_loader = build_dataloader( 109 | val_dataset, 110 | samples_per_gpu=1, 111 | workers_per_gpu=cfgs.data.workers_per_gpu, 112 | num_gpus=world_size, 113 | dist=world_size > 1, 114 | shuffle=False 115 | ) 116 | 117 | logging.info('Creating model: %s' % cfgs.model.type) 118 | model = build_model(cfgs.model) 119 | model.init_weights() 120 | 121 | # logging.info(f'Model:\n{model}') 122 | model.cuda() 123 | model.train() 124 | 125 | n_params = sum([p.numel() for p in model.parameters() if p.requires_grad]) 126 | logging.info('Trainable 
parameters: %d (%.1fM)' % (n_params, n_params / 1e6)) 127 | logging.info('Batch size per GPU: %d' % (cfgs.batch_size // world_size)) 128 | 129 | if world_size > 1: 130 | model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False) 131 | else: 132 | model = MMDataParallel(model, [0]) 133 | 134 | logging.info('Creating optimizer: %s' % cfgs.optimizer.type) 135 | optimizer = build_optimizer(model, cfgs.optimizer) 136 | 137 | runner = EpochBasedRunner( 138 | model, 139 | optimizer=optimizer, 140 | work_dir=work_dir, 141 | logger=logging.root, 142 | max_epochs=cfgs.total_epochs, 143 | meta=dict(), 144 | ) 145 | 146 | runner.register_lr_hook(cfgs.lr_config) 147 | runner.register_optimizer_hook(cfgs.optimizer_config) 148 | runner.register_checkpoint_hook(cfgs.checkpoint_config) 149 | runner.register_logger_hooks(cfgs.log_config) 150 | runner.register_timer_hook(dict(type='IterTimerHook')) 151 | runner.register_custom_hooks(dict(type='DistSamplerSeedHook')) 152 | if cfgs.get('custom_hooks', None) is not None: 153 | for hook_cfg in cfgs.custom_hooks: 154 | runner.register_custom_hooks(hook_cfg) 155 | 156 | if cfgs.eval_config['interval'] > 0: 157 | if world_size > 1: 158 | runner.register_hook(DistEvalHook( 159 | val_loader, interval=cfgs.eval_config['interval'], gpu_collect=False)) 160 | else: 161 | runner.register_hook(EvalHook(val_loader, interval=cfgs.eval_config['interval'])) 162 | 163 | if cfgs.resume_from is not None: 164 | logging.info('Resuming from %s' % cfgs.resume_from) 165 | runner.resume(cfgs.resume_from) 166 | 167 | elif cfgs.load_from is not None: 168 | logging.info('Loading checkpoint from %s' % cfgs.load_from) 169 | if cfgs.revise_keys is not None: 170 | load_checkpoint( 171 | model, cfgs.load_from, map_location='cpu', 172 | revise_keys=cfgs.revise_keys 173 | ) 174 | else: 175 | load_checkpoint( 176 | model, cfgs.load_from, map_location='cpu', 177 | ) 178 | 179 | runner.run([train_loader], [('train', 1)]) 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /models/opusv1_fusion/opus_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from ..bbox.utils import decode_points, encode_points 4 | from ..utils import rotation_3d_in_axis, DUMP 5 | from ..csrc.wrapper import msmv_sampling, msmv_sampling_pytorch 6 | 7 | 8 | def make_sample_points(query_points, offset, pc_range): 9 | ''' 10 | query_points: [B, Q, P, 3] (x, y, z) 11 | offset: [B, Q, G, P, 3] 12 | ''' 13 | xyz = decode_points(query_points, pc_range) # [B, Q, 3] 14 | xyz = xyz[..., None, None, :] # [B, Q, 1, 1, 3] 15 | sample_xyz = xyz + offset # [B, Q, G, P, 3] 16 | return sample_xyz 17 | 18 | 19 | def sampling_4d(sample_points, mlvl_feats, scale_weights, occ2img, image_h, image_w, num_views=6, eps=1e-5): 20 | """ 21 | Args: 22 | sample_points: 3D sampling points in shape [B, Q, T, G, P, 3] 23 | mlvl_feats: list of multi-scale features from neck, each in shape [B*T*G, C, N, H, W] 24 | scale_weights: weights for multi-scale aggregation, [B, Q, G, T, P, L] 25 | occ2img: 4x4 projection matrix in shape [B, TN, 4, 4] 26 | Symbol meaning: 27 | B: batch size 28 | Q: num of queries 29 | T: num of frames 30 | G: num of groups (we follow the group sampling mechanism of AdaMixer) 31 | P: num of sampling points per frame per group 32 | N: num of views (six for nuScenes) 33 | L: num of layers of feature pyramid (typically it is 4: 
C2, C3, C4, C5) 34 | """ 35 | 36 | B, Q, T, G, P, _ = sample_points.shape # [B, Q, T, G, P, 3] 37 | N = num_views 38 | 39 | sample_points = sample_points.reshape(B, Q, T, G * P, 3) 40 | 41 | # get the projection matrix 42 | occ2img = occ2img[:, :, None, None, :, :] # [B, TN, 1, 1, 4, 4] 43 | occ2img = occ2img.expand(B, T*N, Q, G * P, 4, 4) 44 | occ2img = occ2img.reshape(B, T, N, Q, G*P, 4, 4) 45 | 46 | # expand the points 47 | ones = torch.ones_like(sample_points[..., :1]) 48 | sample_points = torch.cat([sample_points, ones], dim=-1) # [B, Q, GP, 4] 49 | sample_points = sample_points[:, :, None, ..., None] # [B, Q, T, GP, 4] 50 | sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1) 51 | sample_points = sample_points.transpose(1, 3) # [B, T, N, Q, GP, 4, 1] 52 | 53 | # project 3d sampling points to N views 54 | sample_points_cam = torch.matmul(occ2img, sample_points).squeeze(-1) # [B, T, N, Q, GP, 4] 55 | 56 | # homo coord -> pixel coord 57 | homo = sample_points_cam[..., 2:3] 58 | homo_nonzero = torch.maximum(homo, torch.zeros_like(homo) + eps) 59 | sample_points_cam = sample_points_cam[..., 0:2] / homo_nonzero # [B, T, N, Q, GP, 2] 60 | 61 | # normalize 62 | sample_points_cam[..., 0] /= image_w 63 | sample_points_cam[..., 1] /= image_h 64 | 65 | # check if out of image 66 | valid_mask = ((homo > eps) \ 67 | & (sample_points_cam[..., 1:2] > 0.0) 68 | & (sample_points_cam[..., 1:2] < 1.0) 69 | & (sample_points_cam[..., 0:1] > 0.0) 70 | & (sample_points_cam[..., 0:1] < 1.0) 71 | ).squeeze(-1).float() # [B, T, N, Q, GP] 72 | 73 | # for visualization only 74 | if DUMP.enabled: 75 | torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1).cpu(), 76 | '{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 77 | torch.save(valid_mask.cpu(), 78 | '{}/sample_points_cam_valid_mask_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 79 | 80 | valid_mask = valid_mask.permute(0, 1, 3, 4, 2) # [B, T, Q, GP, N] 81 | sample_points_cam = sample_points_cam.permute(0, 1, 3, 4, 2, 5) # [B, T, Q, GP, N, 2] 82 | 83 | # prepare batched indexing 84 | i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device) 85 | i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device) 86 | i_time = torch.arange(T, dtype=torch.long, device=sample_points.device) 87 | i_point = torch.arange(G * P, dtype=torch.long, device=sample_points.device) 88 | i_batch = i_batch.view(B, 1, 1, 1, 1).expand(B, T, Q, G * P, 1) 89 | i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1) 90 | i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1) 91 | i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1) 92 | 93 | # we only keep at most one valid sampling point, see https://zhuanlan.zhihu.com/p/654821380 94 | i_view = torch.argmax(valid_mask, dim=-1)[..., None] # [B, T, Q, GP, 1] 95 | 96 | # index the only one sampling point and its valid flag 97 | sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :] # [B, Q, GP, 1, 2] 98 | valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view] # [B, Q, GP, 1] 99 | 100 | # treat the view index as a new axis for grid_sample and normalize the view index to [0, 1] 101 | sample_points_cam = torch.cat([sample_points_cam, i_view[..., None].float() / (N - 1)], dim=-1) 102 | 103 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 104 | sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3) 105 | sample_points_cam = sample_points_cam.permute(0, 
1, 3, 2, 4, 5, 6) # [B, T, G, Q, P, 1, 3] 106 | sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3) 107 | sample_points_cam = sample_points_cam.contiguous() 108 | 109 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 110 | scale_weights = scale_weights.reshape(B, Q, G, T, P, -1) 111 | scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5) 112 | scale_weights = scale_weights.reshape(B*G*T, Q, P, -1) 113 | scale_weights = scale_weights.contiguous() 114 | 115 | # multi-scale multi-view grid sample 116 | final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights) 117 | 118 | # reorganize the sampled features 119 | C = final.shape[2] # [BTG, Q, C, P] 120 | final = final.reshape(B, T, G, Q, C, P) 121 | final = final.permute(0, 3, 2, 1, 5, 4) 122 | final = final.flatten(3, 4) # [B, Q, G, FP, C] 123 | 124 | return final 125 | 126 | 127 | def sampling_pts_feats(sample_points, pts_feats, occ2lidar, pc_range): 128 | C = pts_feats.shape[1] 129 | B, Q, G, P, _ = sample_points.shape # [B, Q, G, P, 3] 130 | sample_points = sample_points.permute(0, 2, 1, 3, 4) 131 | sample_points = sample_points.reshape(B*G, Q, P, 3) # [BG, Q, P, 3] 132 | 133 | occ2lidar = occ2lidar[:, None, None, None, :, :] # [B, 1, 1, 1, 4, 4] 134 | occ2lidar = occ2lidar.expand(B, G, Q, P, 4, 4) 135 | occ2lidar = occ2lidar.reshape(B*G, Q, P, 4, 4) 136 | 137 | ones = torch.ones_like(sample_points[..., :1]) 138 | sample_points = torch.cat([sample_points, ones], dim=-1)[..., None] # [BG, Q, P, 4, 1] 139 | sample_points = torch.matmul(occ2lidar, sample_points).squeeze(-1) 140 | 141 | norm_sample_points = encode_points(sample_points[..., :3], pc_range) 142 | norm_sample_points = norm_sample_points[..., :2] * 2 - 1 # [BG, Q, P, 2] 143 | 144 | feat = F.grid_sample(pts_feats, norm_sample_points, padding_mode='zeros', align_corners=True) 145 | feat = feat.reshape(B, G, C, Q, P) 146 | feat = feat.permute(0, 3, 1, 4, 2) # [B, Q, G, P, C] 147 | 148 | return feat 149 | -------------------------------------------------------------------------------- /models/backbones/second_3d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import warnings 3 | 4 | from mmcv.cnn import build_conv_layer, build_norm_layer 5 | from mmcv.runner import BaseModule 6 | from torch import nn as nn 7 | 8 | from mmdet3d.models.builder import BACKBONES 9 | 10 | from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE 11 | if IS_SPCONV2_AVAILABLE: 12 | from spconv.pytorch import SparseConvTensor, SparseSequential 13 | else: 14 | from mmcv.ops import SparseConvTensor, SparseSequential 15 | 16 | def make_sparse_convmodule(in_channels, 17 | out_channels, 18 | kernel_size, 19 | indice_key, 20 | stride=1, 21 | padding=0, 22 | conv_type='SubMConv3d', 23 | norm_cfg=None, 24 | order=('conv', 'norm', 'act')): 25 | """Make sparse convolution module. 26 | 27 | Args: 28 | in_channels (int): the number of input channels 29 | out_channels (int): the number of out channels 30 | kernel_size (int|tuple(int)): kernel size of convolution 31 | indice_key (str): the indice key used for sparse tensor 32 | stride (int|tuple(int)): the stride of convolution 33 | padding (int or list[int]): the padding number of input 34 | conv_type (str): sparse conv type in spconv 35 | norm_cfg (dict[str]): config of normalization layer 36 | order (tuple[str]): The order of conv/norm/activation layers. It is a 37 | sequence of "conv", "norm" and "act". 
Common examples are 38 | ("conv", "norm", "act") and ("act", "conv", "norm"). 39 | 40 | Returns: 41 | spconv.SparseSequential: sparse convolution module. 42 | """ 43 | assert isinstance(order, tuple) and len(order) <= 3 44 | assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'} 45 | 46 | conv_cfg = dict(type=conv_type, indice_key=indice_key) 47 | 48 | layers = list() 49 | for layer in order: 50 | if layer == 'conv': 51 | if conv_type not in [ 52 | 'SparseInverseConv3d', 'SparseInverseConv2d', 53 | 'SparseInverseConv1d' 54 | ]: 55 | layers.append( 56 | build_conv_layer( 57 | conv_cfg, 58 | in_channels, 59 | out_channels, 60 | kernel_size, 61 | stride=stride, 62 | padding=padding, 63 | bias=False)) 64 | else: 65 | layers.append( 66 | build_conv_layer( 67 | conv_cfg, 68 | in_channels, 69 | out_channels, 70 | kernel_size, 71 | bias=False)) 72 | elif layer == 'norm': 73 | layers.append(build_norm_layer(norm_cfg, out_channels)[1]) 74 | elif layer == 'act': 75 | layers.append(nn.ReLU(inplace=True)) 76 | 77 | layers = SparseSequential(*layers) 78 | return layers 79 | 80 | @BACKBONES.register_module() 81 | class SECOND_3d(BaseModule): 82 | """Backbone network for SECOND/PointPillars/PartA2/MVXNet. 83 | 84 | Args: 85 | in_channels (int): Input channels. 86 | out_channels (list[int]): Output channels for multi-scale feature maps. 87 | layer_nums (list[int]): Number of layers in each stage. 88 | layer_strides (list[int]): Strides of each stage. 89 | norm_cfg (dict): Config dict of normalization layers. 90 | conv_cfg (dict): Config dict of convolutional layers. 91 | 92 | use_sparse_conv: (int) the sparse conv layer,note that sparse conv can devoid 93 | the feature 94 | """ 95 | 96 | def __init__(self, 97 | in_channels=128, 98 | out_channels=[128, 128, 256], 99 | sparse_conv_cnt=0, 100 | layer_nums=[3, 5, 5], 101 | layer_strides=[2, 2, 2], 102 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 103 | conv_cfg=dict(type='Conv2d', bias=False), 104 | init_cfg=None, 105 | pretrained=None): 106 | super(SECOND_3d, self).__init__(init_cfg=init_cfg) 107 | assert len(layer_strides) == len(layer_nums) 108 | assert len(out_channels) == len(layer_nums) 109 | self.sparse_conv_cnt=sparse_conv_cnt 110 | 111 | in_filters = [in_channels, *out_channels[:-1]] 112 | # note that when stride > 1, conv2d with same padding isn't 113 | # equal to pad-conv2d. we should use pad-conv2d. 
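        # (added summary comment) Each stage below starts with one conv chosen by its
        # stride: a SparseConv3d (new indice_key, spatial downsampling) when
        # layer_strides[i] > 1, or a SubMConv3d when the stride is 1. The following
        # `layer_nums[i]` convs are submanifold (SubMConv3d), except for the trailing
        # `sparse_conv_cnt` ones, which use regular SparseConv3d and therefore let
        # features spread to previously empty voxels. Note that `self.sparse_conv_cnt`
        # is reset to 0 as soon as a strided stage is encountered, so that stage and
        # all later ones keep their inner layers purely submanifold.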
114 | blocks = [] 115 | sparse_cnt=0 116 | sparse_cnt2=0 117 | for i, layer_num in enumerate(layer_nums): 118 | if layer_strides[i]<=1: 119 | conv_cfg = dict(type='SubMConv3d', indice_key=f'subm_second_{sparse_cnt}') 120 | else: 121 | conv_cfg = dict(type='SparseConv3d', indice_key=f'spconv_second_{sparse_cnt2}') 122 | block = [ 123 | build_conv_layer( 124 | conv_cfg, 125 | in_filters[i], 126 | out_channels[i], 127 | 3, 128 | stride=layer_strides[i], 129 | padding=1, 130 | bias=False), 131 | build_norm_layer(norm_cfg, out_channels[i])[1], 132 | nn.ReLU(inplace=True), 133 | ] 134 | if layer_strides[i]>1: 135 | self.sparse_conv_cnt=0 136 | for j in range(layer_num): 137 | if layer_num-j>self.sparse_conv_cnt: 138 | sparse_cnt+=1 139 | conv_cfg = dict(type='SubMConv3d', indice_key=f'subm_second_{sparse_cnt}') 140 | block.append( 141 | build_conv_layer( 142 | conv_cfg, 143 | out_channels[i], 144 | out_channels[i], 145 | 3, 146 | padding=1)) 147 | block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) 148 | block.append(nn.ReLU(inplace=True)) 149 | else: 150 | conv_cfg = dict(type='SparseConv3d', indice_key=f'spconv_second_{sparse_cnt2}') 151 | block.append( 152 | build_conv_layer( 153 | conv_cfg, 154 | out_channels[i], 155 | out_channels[i], 156 | 3, 157 | padding=1)) 158 | block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) 159 | block.append(nn.ReLU(inplace=True)) 160 | sparse_cnt2+=1 161 | 162 | block = SparseSequential(*block) 163 | blocks.append(block) 164 | 165 | self.blocks = nn.ModuleList(blocks) 166 | 167 | 168 | def forward(self, x): 169 | """Forward function. 170 | 171 | Args: 172 | x (torch.Tensor): sparse tensor 173 | 174 | Returns: 175 | tuple[torch.Tensor]: Multi-scale features. 176 | """ 177 | outs = [] 178 | for i in range(len(self.blocks)): 179 | x = self.blocks[i](x) 180 | outs.append(x) 181 | return tuple(outs) 182 | -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-s_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 1200 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 2 38 | num_refines = [1, 4, 8, 16, 32, 64] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | norm_cfg=dict(type='BN2d', requires_grad=True), 47 | norm_eval=True, 48 | style='pytorch', 49 | with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 
| in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 129 | ] 130 | 131 | test_pipeline = [ 132 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 133 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 134 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 135 | dict( 136 | type='MultiScaleFlipAug3D', 137 | img_scale=(1600, 900), 138 | pts_scale_ratio=1, 139 | flip=False, 140 | transforms=[ 141 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 142 | dict(type='Collect3D', keys=['img'], meta_keys=( 143 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 144 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 145 | ]) 146 | ] 147 | 148 | data = dict( 149 | workers_per_gpu=4, 150 | train=dict( 151 | type=dataset_type, 152 | data_root=dataset_root, 153 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 154 | pipeline=train_pipeline, 155 | classes=object_names, 156 | modality=input_modality, 157 | test_mode=False, 158 | 
use_valid_flag=True, 159 | box_type_3d='LiDAR'), 160 | val=dict( 161 | type=dataset_type, 162 | data_root=dataset_root, 163 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 164 | pipeline=test_pipeline, 165 | classes=object_names, 166 | modality=input_modality, 167 | test_mode=True, 168 | box_type_3d='LiDAR'), 169 | test=dict( 170 | type=dataset_type, 171 | data_root=dataset_root, 172 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 173 | pipeline=test_pipeline, 174 | classes=object_names, 175 | modality=input_modality, 176 | test_mode=True, 177 | box_type_3d='LiDAR') 178 | ) 179 | 180 | optimizer = dict( 181 | type='AdamW', 182 | lr=2e-4, 183 | paramwise_cfg=dict(custom_keys={ 184 | 'img_backbone': dict(lr_mult=0.1), 185 | 'sampling_offset': dict(lr_mult=0.1), 186 | }), 187 | weight_decay=0.01 188 | ) 189 | 190 | optimizer_config = dict( 191 | type='Fp16OptimizerHook', 192 | loss_scale=512.0, 193 | grad_clip=dict(max_norm=35, norm_type=2) 194 | ) 195 | 196 | # learning policy 197 | lr_config = dict( 198 | policy='CosineAnnealing', 199 | warmup='linear', 200 | warmup_iters=500, 201 | warmup_ratio=1.0 / 3, 202 | min_lr_ratio=1e-3 203 | ) 204 | total_epochs = 100 205 | batch_size = 8 206 | 207 | # load pretrained weights 208 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 209 | revise_keys = [('backbone', 'img_backbone')] 210 | 211 | # resume the last training 212 | resume_from = None 213 | 214 | # checkpointing 215 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 216 | 217 | # logging 218 | log_config = dict( 219 | interval=1, 220 | hooks=[ 221 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 222 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 223 | ] 224 | ) 225 | 226 | # evaluation 227 | eval_config = dict(interval=total_epochs) 228 | 229 | # other flags 230 | debug = False 231 | -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-l_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 4800 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 2 38 | num_refines = [1, 2, 4, 8, 16, 16] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | norm_cfg=dict(type='BN2d', requires_grad=True), 47 | norm_eval=True, 48 | style='pytorch', 49 | 
with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 | in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 129 | 'ego2lidar', 'img_timestamp')) 130 | ] 131 | 132 | test_pipeline = [ 133 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 134 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 135 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 136 | dict( 137 | type='MultiScaleFlipAug3D', 138 | img_scale=(1600, 900), 139 | pts_scale_ratio=1, 140 | flip=False, 141 | transforms=[ 142 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 143 | dict(type='Collect3D', keys=['img'], meta_keys=( 144 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 145 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 146 | ]) 147 | ] 148 | 149 | data = dict( 150 | workers_per_gpu=4, 151 | train=dict( 152 | type=dataset_type, 153 | data_root=dataset_root, 154 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 155 | pipeline=train_pipeline, 156 | classes=object_names, 157 | 
modality=input_modality, 158 | test_mode=False, 159 | use_valid_flag=True, 160 | box_type_3d='LiDAR'), 161 | val=dict( 162 | type=dataset_type, 163 | data_root=dataset_root, 164 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 165 | pipeline=test_pipeline, 166 | classes=object_names, 167 | modality=input_modality, 168 | test_mode=True, 169 | box_type_3d='LiDAR'), 170 | test=dict( 171 | type=dataset_type, 172 | data_root=dataset_root, 173 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 174 | pipeline=test_pipeline, 175 | classes=object_names, 176 | modality=input_modality, 177 | test_mode=True, 178 | box_type_3d='LiDAR') 179 | ) 180 | 181 | optimizer = dict( 182 | type='AdamW', 183 | lr=2e-4, 184 | paramwise_cfg=dict(custom_keys={ 185 | 'img_backbone': dict(lr_mult=0.1), 186 | 'sampling_offset': dict(lr_mult=0.1), 187 | }), 188 | weight_decay=0.01 189 | ) 190 | 191 | optimizer_config = dict( 192 | type='Fp16OptimizerHook', 193 | loss_scale=512.0, 194 | grad_clip=dict(max_norm=35, norm_type=2) 195 | ) 196 | 197 | # learning policy 198 | lr_config = dict( 199 | policy='CosineAnnealing', 200 | warmup='linear', 201 | warmup_iters=500, 202 | warmup_ratio=1.0 / 3, 203 | min_lr_ratio=1e-3 204 | ) 205 | total_epochs = 100 206 | batch_size = 8 207 | 208 | # load pretrained weights 209 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 210 | revise_keys = [('backbone', 'img_backbone')] 211 | 212 | # resume the last training 213 | resume_from = None 214 | 215 | # checkpointing 216 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 217 | 218 | # logging 219 | log_config = dict( 220 | interval=1, 221 | hooks=[ 222 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 223 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 224 | ] 225 | ) 226 | 227 | # evaluation 228 | eval_config = dict(interval=total_epochs) 229 | 230 | # other flags 231 | debug = False -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-m_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 2400 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 2 38 | num_refines = [1, 2, 4, 8, 16, 32] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | norm_cfg=dict(type='BN2d', requires_grad=True), 47 | 
norm_eval=True, 48 | style='pytorch', 49 | with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 | in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 129 | 'ego2lidar', 'img_timestamp')) 130 | ] 131 | 132 | test_pipeline = [ 133 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 134 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 135 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 136 | dict( 137 | type='MultiScaleFlipAug3D', 138 | img_scale=(1600, 900), 139 | pts_scale_ratio=1, 140 | flip=False, 141 | transforms=[ 142 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 143 | dict(type='Collect3D', keys=['img'], meta_keys=( 144 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 145 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 146 | ]) 147 | ] 148 | 149 | data = dict( 150 | workers_per_gpu=4, 151 | train=dict( 152 | type=dataset_type, 153 | data_root=dataset_root, 154 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 155 | pipeline=train_pipeline, 
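        # Editorial annotation (not in the original config): the four opusv1 variants in this
        # folder (-t, -s, -m, -l) share this dataset and pipeline setup; they differ mainly in
        # num_query, num_points and num_refines defined in the "arch config" section above.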
156 | classes=object_names, 157 | modality=input_modality, 158 | test_mode=False, 159 | use_valid_flag=True, 160 | box_type_3d='LiDAR'), 161 | val=dict( 162 | type=dataset_type, 163 | data_root=dataset_root, 164 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 165 | pipeline=test_pipeline, 166 | classes=object_names, 167 | modality=input_modality, 168 | test_mode=True, 169 | box_type_3d='LiDAR'), 170 | test=dict( 171 | type=dataset_type, 172 | data_root=dataset_root, 173 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 174 | pipeline=test_pipeline, 175 | classes=object_names, 176 | modality=input_modality, 177 | test_mode=True, 178 | box_type_3d='LiDAR') 179 | ) 180 | 181 | optimizer = dict( 182 | type='AdamW', 183 | lr=2e-4, 184 | paramwise_cfg=dict(custom_keys={ 185 | 'img_backbone': dict(lr_mult=0.1), 186 | 'sampling_offset': dict(lr_mult=0.1), 187 | }), 188 | weight_decay=0.01 189 | ) 190 | 191 | optimizer_config = dict( 192 | type='Fp16OptimizerHook', 193 | loss_scale=512.0, 194 | grad_clip=dict(max_norm=35, norm_type=2) 195 | ) 196 | 197 | # learning policy 198 | lr_config = dict( 199 | policy='CosineAnnealing', 200 | warmup='linear', 201 | warmup_iters=500, 202 | warmup_ratio=1.0 / 3, 203 | min_lr_ratio=1e-3 204 | ) 205 | total_epochs = 100 206 | batch_size = 8 207 | 208 | # load pretrained weights 209 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 210 | revise_keys = [('backbone', 'img_backbone')] 211 | 212 | # resume the last training 213 | resume_from = None 214 | 215 | # checkpointing 216 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 217 | 218 | # logging 219 | log_config = dict( 220 | interval=1, 221 | hooks=[ 222 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 223 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 224 | ] 225 | ) 226 | 227 | # evaluation 228 | eval_config = dict(interval=total_epochs) 229 | 230 | # other flags 231 | debug = False 232 | -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-t_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 600 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 4 38 | num_refines = [1, 4, 16, 32, 64, 128] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | 
norm_cfg=dict(type='BN2d', requires_grad=True), 47 | norm_eval=True, 48 | style='pytorch', 49 | with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 | in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 129 | 'ego2lidar', 'img_timestamp')) 130 | ] 131 | 132 | test_pipeline = [ 133 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 134 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 135 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 136 | dict( 137 | type='MultiScaleFlipAug3D', 138 | img_scale=(1600, 900), 139 | pts_scale_ratio=1, 140 | flip=False, 141 | transforms=[ 142 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 143 | dict(type='Collect3D', keys=['img'], meta_keys=( 144 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 145 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 146 | ]) 147 | ] 148 | 149 | data = dict( 150 | workers_per_gpu=4, 151 | train=dict( 152 | type=dataset_type, 153 | data_root=dataset_root, 154 | ann_file=dataset_root + 
'nuscenes_infos_train_sweep.pkl', 155 | pipeline=train_pipeline, 156 | classes=object_names, 157 | modality=input_modality, 158 | test_mode=False, 159 | use_valid_flag=True, 160 | box_type_3d='LiDAR'), 161 | val=dict( 162 | type=dataset_type, 163 | data_root=dataset_root, 164 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 165 | pipeline=test_pipeline, 166 | classes=object_names, 167 | modality=input_modality, 168 | test_mode=True, 169 | box_type_3d='LiDAR'), 170 | test=dict( 171 | type=dataset_type, 172 | data_root=dataset_root, 173 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 174 | pipeline=test_pipeline, 175 | classes=object_names, 176 | modality=input_modality, 177 | test_mode=True, 178 | box_type_3d='LiDAR') 179 | ) 180 | 181 | optimizer = dict( 182 | type='AdamW', 183 | lr=2e-4, 184 | paramwise_cfg=dict(custom_keys={ 185 | 'img_backbone': dict(lr_mult=0.1), 186 | 'sampling_offset': dict(lr_mult=0.1), 187 | }), 188 | weight_decay=0.01 189 | ) 190 | 191 | optimizer_config = dict( 192 | type='Fp16OptimizerHook', 193 | loss_scale=512.0, 194 | grad_clip=dict(max_norm=35, norm_type=2) 195 | ) 196 | 197 | # learning policy 198 | lr_config = dict( 199 | policy='CosineAnnealing', 200 | warmup='linear', 201 | warmup_iters=500, 202 | warmup_ratio=1.0 / 3, 203 | min_lr_ratio=1e-3 204 | ) 205 | total_epochs = 100 206 | batch_size = 1 207 | 208 | # load pretrained weights 209 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 210 | revise_keys = [('backbone', 'img_backbone')] 211 | 212 | # resume the last training 213 | resume_from = None 214 | 215 | # checkpointing 216 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 217 | 218 | # logging 219 | log_config = dict( 220 | interval=1, 221 | hooks=[ 222 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 223 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 224 | ] 225 | ) 226 | 227 | # evaluation 228 | eval_config = dict(interval=total_epochs) 229 | 230 | # other flags 231 | debug = False 232 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import torch 5 | import shutil 6 | import logging 7 | import datetime 8 | from mmcv.runner.hooks import HOOKS 9 | from mmcv.runner.hooks.logger import LoggerHook, TextLoggerHook 10 | from mmcv.runner.dist_utils import master_only 11 | from torch.utils.tensorboard import SummaryWriter 12 | 13 | 14 | def init_logging(filename=None, debug=False): 15 | logging.root = logging.RootLogger('DEBUG' if debug else 'INFO') 16 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s') 17 | 18 | stream_handler = logging.StreamHandler(sys.stdout) 19 | stream_handler.setFormatter(formatter) 20 | logging.root.addHandler(stream_handler) 21 | 22 | if filename is not None: 23 | file_handler = logging.FileHandler(filename) 24 | file_handler.setFormatter(formatter) 25 | logging.root.addHandler(file_handler) 26 | 27 | 28 | def backup_code(work_dir, verbose=False): 29 | base_dir = os.path.dirname(os.path.abspath(__file__)) 30 | for pattern in ['*.py', 'configs/*.py', 'models/*.py', 'loaders/*.py', 'loaders/pipelines/*.py']: 31 | for file in glob.glob(pattern): 32 | src = os.path.join(base_dir, file) 33 | dst = os.path.join(work_dir, 'backup', os.path.dirname(file)) 34 | 35 | if verbose: 36 | logging.info('Copying %s -> %s' % 
(os.path.relpath(src), os.path.relpath(dst))) 37 | 38 | os.makedirs(dst, exist_ok=True) 39 | shutil.copy2(src, dst) 40 | 41 | 42 | @HOOKS.register_module() 43 | class MyTextLoggerHook(TextLoggerHook): 44 | def _log_info(self, log_dict, runner): 45 | # print exp name for users to distinguish experiments 46 | # at every ``interval_exp_name`` iterations and the end of each epoch 47 | if runner.meta is not None and 'exp_name' in runner.meta: 48 | if (self.every_n_iters(runner, self.interval_exp_name)) or ( 49 | self.by_epoch and self.end_of_epoch(runner)): 50 | exp_info = f'Exp name: {runner.meta["exp_name"]}' 51 | runner.logger.info(exp_info) 52 | 53 | # by epoch: Epoch [4][100/1000] 54 | # by iter: Iter [100/100000] 55 | if self.by_epoch: 56 | log_str = f'Epoch [{log_dict["epoch"]}/{runner.max_epochs}]' \ 57 | f'[{log_dict["iter"]}/{len(runner.data_loader)}] ' 58 | else: 59 | log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}] ' 60 | 61 | log_str += 'loss: %.2f, ' % log_dict['loss'] 62 | 63 | if 'time' in log_dict.keys(): 64 | # MOD: skip the first iteration since it's not accurate 65 | if runner.iter == self.start_iter: 66 | time_sec_avg = log_dict['time'] 67 | else: 68 | self.time_sec_tot += (log_dict['time'] * self.interval) 69 | time_sec_avg = self.time_sec_tot / (runner.iter - self.start_iter) 70 | 71 | eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) 72 | eta_str = str(datetime.timedelta(seconds=int(eta_sec))) 73 | log_str += f'eta: {eta_str}, ' 74 | log_str += f'time: {log_dict["time"]:.2f}s, ' \ 75 | f'data: {log_dict["data_time"] * 1000:.0f}ms, ' 76 | # statistic memory 77 | if torch.cuda.is_available(): 78 | log_str += f'mem: {log_dict["memory"]}M' 79 | 80 | runner.logger.info(log_str) 81 | 82 | def log(self, runner): 83 | if 'eval_iter_num' in runner.log_buffer.output: 84 | # this doesn't modify runner.iter and is regardless of by_epoch 85 | cur_iter = runner.log_buffer.output.pop('eval_iter_num') 86 | else: 87 | cur_iter = self.get_iter(runner, inner_iter=True) 88 | 89 | log_dict = { 90 | 'mode': self.get_mode(runner), 91 | 'epoch': self.get_epoch(runner), 92 | 'iter': cur_iter 93 | } 94 | 95 | # only record lr of the first param group 96 | cur_lr = runner.current_lr() 97 | if isinstance(cur_lr, list): 98 | log_dict['lr'] = cur_lr[0] 99 | else: 100 | assert isinstance(cur_lr, dict) 101 | log_dict['lr'] = {} 102 | for k, lr_ in cur_lr.items(): 103 | assert isinstance(lr_, list) 104 | log_dict['lr'].update({k: lr_[0]}) 105 | 106 | if 'time' in runner.log_buffer.output: 107 | # statistic memory 108 | if torch.cuda.is_available(): 109 | log_dict['memory'] = self._get_max_memory(runner) 110 | 111 | log_dict = dict(log_dict, **runner.log_buffer.output) 112 | 113 | # MOD: disable writing to files 114 | # self._dump_log(log_dict, runner) 115 | self._log_info(log_dict, runner) 116 | 117 | return log_dict 118 | 119 | def after_train_epoch(self, runner): 120 | if runner.log_buffer.ready: 121 | metrics = self.get_loggable_tags(runner) 122 | runner.logger.info('--- Evaluation Results ---') 123 | runner.logger.info('mAP: %.4f' % metrics['val/pts_bbox_NuScenes/mAP']) 124 | runner.logger.info('mATE: %.4f' % metrics['val/pts_bbox_NuScenes/mATE']) 125 | runner.logger.info('mASE: %.4f' % metrics['val/pts_bbox_NuScenes/mASE']) 126 | runner.logger.info('mAOE: %.4f' % metrics['val/pts_bbox_NuScenes/mAOE']) 127 | runner.logger.info('mAVE: %.4f' % metrics['val/pts_bbox_NuScenes/mAVE']) 128 | runner.logger.info('mAAE: %.4f' % metrics['val/pts_bbox_NuScenes/mAAE']) 129 | 
runner.logger.info('NDS: %.4f' % metrics['val/pts_bbox_NuScenes/NDS']) 130 | 131 | 132 | @HOOKS.register_module() 133 | class MyTensorboardLoggerHook(LoggerHook): 134 | def __init__(self, log_dir=None, interval=10, ignore_last=True, reset_flag=False, by_epoch=True): 135 | super(MyTensorboardLoggerHook, self).__init__( 136 | interval, ignore_last, reset_flag, by_epoch) 137 | self.log_dir = log_dir 138 | 139 | @master_only 140 | def before_run(self, runner): 141 | super(MyTensorboardLoggerHook, self).before_run(runner) 142 | if self.log_dir is None: 143 | self.log_dir = runner.work_dir 144 | self.writer = SummaryWriter(self.log_dir) 145 | 146 | @master_only 147 | def log(self, runner): 148 | tags = self.get_loggable_tags(runner) 149 | 150 | for key, value in tags.items(): 151 | # MOD: merge into the 'train' group 152 | if key == 'learning_rate': 153 | key = 'train/learning_rate' 154 | 155 | # MOD: skip momentum 156 | ignore = False 157 | if key == 'momentum': 158 | ignore = True 159 | 160 | # MOD: skip intermediate losses 161 | for i in range(5): 162 | if key[:13] == 'train/d%d.loss' % i: 163 | ignore = True 164 | 165 | if key[:3] == 'val': 166 | metric_name = key[22:] 167 | if metric_name in ['mAP', 'mATE', 'mASE', 'mAOE', 'mAVE', 'mAAE', 'NDS']: 168 | key = 'val/' + metric_name 169 | else: 170 | ignore = True 171 | 172 | if self.get_mode(runner) == 'train' and key[:5] != 'train': 173 | ignore = True 174 | 175 | if self.get_mode(runner) != 'train' and key[:3] != 'val': 176 | ignore = True 177 | 178 | if ignore: 179 | continue 180 | 181 | if key[:5] == 'train': 182 | self.writer.add_scalar(key, value, self.get_iter(runner)) 183 | elif key[:3] == 'val': 184 | self.writer.add_scalar(key, value, self.get_epoch(runner)) 185 | 186 | @master_only 187 | def after_run(self, runner): 188 | self.writer.close() 189 | -------------------------------------------------------------------------------- /loaders/nuscenes_occupancy_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mmcv 3 | import numpy as np 4 | import torch 5 | import pickle 6 | import os.path as osp 7 | from tqdm import tqdm 8 | from mmdet.datasets import DATASETS 9 | from mmdet3d.datasets import NuScenesDataset 10 | from nuscenes.eval.common.utils import Quaternion 11 | from nuscenes.utils.geometry_utils import transform_matrix 12 | from torch.utils.data import DataLoader 13 | from models.utils import sparse2dense 14 | from .utils import compose_ego2img 15 | from .old_metrics import Metric_mIoU_Occupancy 16 | 17 | 18 | @DATASETS.register_module() 19 | class NuScenesOccupancyDataset(NuScenesDataset): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(filter_empty_gt=False, *args, **kwargs) 22 | self.data_infos = self.load_annotations(self.ann_file) 23 | 24 | def collect_cam_sweeps(self, index, into_past=150, into_future=0): 25 | all_sweeps_prev = [] 26 | curr_index = index 27 | while len(all_sweeps_prev) < into_past: 28 | curr_sweeps = self.data_infos[curr_index]['cam_sweeps'] 29 | if len(curr_sweeps) == 0: 30 | break 31 | all_sweeps_prev.extend(curr_sweeps) 32 | all_sweeps_prev.append(self.data_infos[curr_index - 1]['cams']) 33 | curr_index = curr_index - 1 34 | 35 | all_sweeps_next = [] 36 | curr_index = index + 1 37 | while len(all_sweeps_next) < into_future: 38 | if curr_index >= len(self.data_infos): 39 | break 40 | curr_sweeps = self.data_infos[curr_index]['cam_sweeps'] 41 | all_sweeps_next.extend(curr_sweeps[::-1]) 42 | 
all_sweeps_next.append(self.data_infos[curr_index]['cams']) 43 | curr_index = curr_index + 1 44 | 45 | return all_sweeps_prev, all_sweeps_next 46 | 47 | def collect_lidar_sweeps(self, index, into_past=20, into_future=0): 48 | all_sweeps_prev = [] 49 | curr_index = index 50 | while len(all_sweeps_prev) < into_past: 51 | curr_sweeps = self.data_infos[curr_index]['lidar_sweeps'] 52 | if len(curr_sweeps) == 0: 53 | break 54 | all_sweeps_prev.extend(curr_sweeps) 55 | curr_index = curr_index - 1 56 | 57 | all_sweeps_next = [] 58 | curr_index = index + 1 59 | last_timestamp = self.data_infos[index]['timestamp'] 60 | while len(all_sweeps_next) < into_future: 61 | if curr_index >= len(self.data_infos): 62 | break 63 | curr_sweeps = self.data_infos[curr_index]['lidar_sweeps'][::-1] 64 | if curr_sweeps[0]['timestamp'] == last_timestamp: 65 | curr_sweeps = curr_sweeps[1:] 66 | all_sweeps_next.extend(curr_sweeps) 67 | curr_index = curr_index + 1 68 | last_timestamp = all_sweeps_next[-1]['timestamp'] 69 | 70 | return all_sweeps_prev, all_sweeps_next 71 | 72 | def get_data_info(self, index): 73 | info = self.data_infos[index] 74 | 75 | ego2global_translation = info['ego2global_translation'] 76 | ego2global_rotation = info['ego2global_rotation'] 77 | lidar2ego_translation = info['lidar2ego_translation'] 78 | lidar2ego_rotation = info['lidar2ego_rotation'] 79 | ego2global_rotation_mat = Quaternion(ego2global_rotation).rotation_matrix 80 | lidar2ego_rotation_mat = Quaternion(lidar2ego_rotation).rotation_matrix 81 | ego2lidar = transform_matrix( 82 | lidar2ego_translation, Quaternion(lidar2ego_rotation), inverse=True) 83 | 84 | input_dict = dict( 85 | sample_token=info['token'], 86 | scene_name=info['scene_name'], 87 | scene_token=info['scene_token'], 88 | lidar_token=info['lidar_token'], 89 | timestamp=info['timestamp'] / 1e6, 90 | ego2lidar=ego2lidar, 91 | ego2obj=ego2lidar, 92 | ego2occ=ego2lidar, 93 | ego2global_translation=ego2global_translation, 94 | ego2global_rotation=ego2global_rotation_mat, 95 | lidar2ego_translation=lidar2ego_translation, 96 | lidar2ego_rotation=lidar2ego_rotation_mat, 97 | ) 98 | 99 | if self.modality['use_lidar']: 100 | lidar_sweeps_prev, lidar_sweeps_next = self.collect_lidar_sweeps(index) 101 | input_dict.update(dict( 102 | pts_filename=info['lidar_path'], 103 | lidar_sweeps={'prev': lidar_sweeps_prev, 'next': lidar_sweeps_next}, 104 | )) 105 | 106 | if self.modality['use_camera']: 107 | img_paths = [] 108 | img_timestamps = [] 109 | ego2img = [] 110 | 111 | for _, cam_info in info['cams'].items(): 112 | img_paths.append(os.path.relpath(cam_info['data_path'])) 113 | img_timestamps.append(cam_info['timestamp'] / 1e6) 114 | ego2img.append( 115 | compose_ego2img( 116 | ego2global_translation, 117 | ego2global_rotation_mat, 118 | cam_info['sensor2global_translation'], 119 | cam_info['sensor2global_rotation'].T, 120 | cam_info['cam_intrinsic'] 121 | ) 122 | ) 123 | 124 | cam_sweeps_prev, cam_sweeps_next = self.collect_cam_sweeps(index) 125 | 126 | input_dict.update(dict( 127 | img_filename=img_paths, 128 | img_timestamp=img_timestamps, 129 | ego2img=ego2img, 130 | cam_sweeps={'prev': cam_sweeps_prev, 'next': cam_sweeps_next}, 131 | )) 132 | 133 | if not self.test_mode: 134 | annos = self.get_ann_info(index) 135 | input_dict['ann_info'] = annos 136 | 137 | return input_dict 138 | 139 | def evaluate(self, occ_results, runner=None, show_dir=None, **eval_kwargs): 140 | occ_gts = [] 141 | occ_preds = [] 142 | lidar_origins = [] 143 | 144 | print('\nStarting Evaluation...') 145 | metric 
= Metric_mIoU_Occupancy() 146 | 147 | occ_class_names = [ 148 | 'noise', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 149 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 150 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 'vegetation' 151 | ] 152 | ignore_class_names=['noise'] 153 | pc_range = np.array([-51.2, -51.2, -5.0, 51.2, 51.2, 3]) 154 | voxel_size = np.array([0.2, 0.2, 0.2]) 155 | voxel_num = ((pc_range[3:] - pc_range[:3]) / voxel_size).astype(np.int64) 156 | 157 | from tqdm import tqdm 158 | for i in tqdm(range(len(occ_results))): 159 | result_dict = occ_results[i] 160 | info = self.get_data_info(i) 161 | 162 | scene_token, lidar_token = info['scene_token'], info['lidar_token'] 163 | occ_root = 'data/nuscenes/openoccupancy/' 164 | occ_file = osp.join(occ_root, f'scene_{scene_token}', 'occupancy', f'{lidar_token}.npy') 165 | # load lidar and camera visible label 166 | occ_labels = np.load(occ_file) 167 | coors, labels = occ_labels[:, :3], occ_labels[:, 3] 168 | occ_labels, _ = sparse2dense(coors[:, ::-1], labels, voxel_num, empty_value=len(occ_class_names)) 169 | mask = occ_labels != 0 # ignore noise 170 | 171 | curr_class_names = [n for n in occ_class_names if n not in ignore_class_names] 172 | curr_bg_class_idx = len(curr_class_names) # 16 173 | label_mapper = [curr_class_names.index(n) if n in curr_class_names else 16 174 | for n in occ_class_names] + [curr_bg_class_idx] 175 | label_mapper = np.array(label_mapper) 176 | occ_labels = label_mapper[occ_labels] 177 | 178 | occ_pred, _ = sparse2dense(result_dict['occ_loc'], result_dict['sem_pred'], voxel_num, 16) 179 | metric.add_batch(occ_pred, occ_labels, mask) 180 | 181 | mIoU, IoU = metric.count_miou() 182 | return {'mIoU': mIoU, 'IoU': IoU} 183 | 184 | def format_results(self, occ_results, submission_prefix, **kwargs): 185 | if submission_prefix is not None: 186 | mmcv.mkdir_or_exist(submission_prefix) 187 | 188 | for index, occ_pred in enumerate(tqdm(occ_results)): 189 | info = self.data_infos[index] 190 | sample_token = info['token'] 191 | save_path=os.path.join(submission_prefix, '{}.npz'.format(sample_token)) 192 | np.savez_compressed(save_path,occ_pred.astype(np.uint8)) 193 | print('\nFinished.') -------------------------------------------------------------------------------- /loaders/ray_metrics.py: -------------------------------------------------------------------------------- 1 | # Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting 2 | # Modified by Haisong Liu 3 | import math 4 | import copy 5 | import numpy as np 6 | import torch 7 | from torch.utils.cpp_extension import load 8 | from tqdm import tqdm 9 | from prettytable import PrettyTable 10 | 11 | 12 | dvr = load("dvr", sources=["lib/dvr/dvr.cpp", "lib/dvr/dvr.cu"], verbose=True, extra_cuda_cflags=['-allow-unsupported-compiler']) 13 | _pc_range = [-40, -40, -1.0, 40, 40, 5.4] 14 | _voxel_size = 0.4 15 | 16 | occ_class_names = [ 17 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 18 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 19 | 'driveable_surface', 'other_flat', 'sidewalk', 20 | 'terrain', 'manmade', 'vegetation', 'free' 21 | ] 22 | 23 | # https://github.com/tarashakhurana/4d-occ-forecasting/blob/ff986082cd6ea10e67ab7839bf0e654736b3f4e2/test_fgbg.py#L29C1-L46C16 24 | def get_rendered_pcds(origin, points, tindex, pred_dist): 25 | pcds = [] 26 | 27 | for t in range(len(origin)): 28 | mask = (tindex == t) 29 | # skip the ones with no data 30 | if 
not mask.any(): 31 | continue 32 | _pts = points[mask, :3] 33 | # use ground truth lidar points for the raycasting direction 34 | v = _pts - origin[t][None, :] 35 | d = v / np.sqrt((v ** 2).sum(axis=1, keepdims=True)) 36 | pred_pts = origin[t][None, :] + d * pred_dist[mask][:, None] 37 | pcds.append(torch.from_numpy(pred_pts)) 38 | 39 | return pcds 40 | 41 | 42 | def meshgrid3d(occ_size, pc_range): 43 | W, H, D = occ_size 44 | 45 | xs = torch.linspace(0.5, W - 0.5, W).view(W, 1, 1).expand(W, H, D) / W 46 | ys = torch.linspace(0.5, H - 0.5, H).view(1, H, 1).expand(W, H, D) / H 47 | zs = torch.linspace(0.5, D - 0.5, D).view(1, 1, D).expand(W, H, D) / D 48 | xs = xs * (pc_range[3] - pc_range[0]) + pc_range[0] 49 | ys = ys * (pc_range[4] - pc_range[1]) + pc_range[1] 50 | zs = zs * (pc_range[5] - pc_range[2]) + pc_range[2] 51 | xyz = torch.stack((xs, ys, zs), -1) 52 | 53 | return xyz 54 | 55 | 56 | def generate_lidar_rays(): 57 | # prepare lidar ray angles 58 | pitch_angles = [] 59 | for k in range(10): 60 | angle = math.pi / 2 - math.atan(k + 1) 61 | pitch_angles.append(-angle) 62 | 63 | # nuscenes lidar fov: [0.2107773983152201, -0.5439104895672159] (rad) 64 | while pitch_angles[-1] < 0.21: 65 | delta = pitch_angles[-1] - pitch_angles[-2] 66 | pitch_angles.append(pitch_angles[-1] + delta) 67 | 68 | lidar_rays = [] 69 | for pitch_angle in pitch_angles: 70 | for azimuth_angle in np.arange(0, 360, 1): 71 | azimuth_angle = np.deg2rad(azimuth_angle) 72 | 73 | x = np.cos(pitch_angle) * np.cos(azimuth_angle) 74 | y = np.cos(pitch_angle) * np.sin(azimuth_angle) 75 | z = np.sin(pitch_angle) 76 | 77 | lidar_rays.append((x, y, z)) 78 | 79 | return np.array(lidar_rays, dtype=np.float32) 80 | 81 | 82 | def process_one_sample(sem_pred, lidar_rays, output_origin): 83 | # lidar origin in ego coordinate 84 | # lidar_origin = torch.tensor([[[0.9858, 0.0000, 1.8402]]]) 85 | T = output_origin.shape[1] 86 | pred_pcds_t = [] 87 | 88 | free_id = len(occ_class_names) - 1 89 | occ_pred = copy.deepcopy(sem_pred) 90 | occ_pred[sem_pred < free_id] = 1 91 | occ_pred[sem_pred == free_id] = 0 92 | occ_pred = torch.from_numpy(occ_pred).permute(2, 1, 0) 93 | occ_pred = occ_pred[None, None, :].contiguous().float() 94 | 95 | offset = torch.Tensor(_pc_range[:3])[None, None, :] 96 | scaler = torch.Tensor([_voxel_size] * 3)[None, None, :] 97 | 98 | lidar_tindex = torch.zeros([1, lidar_rays.shape[0]]) 99 | 100 | for t in range(T): 101 | lidar_origin = output_origin[:, t:t+1, :] # [1, 1, 3] 102 | lidar_endpts = lidar_rays[None] + lidar_origin # [1, 15840, 3] 103 | 104 | output_origin_render = ((lidar_origin - offset) / scaler).float() # [1, 1, 3] 105 | output_points_render = ((lidar_endpts - offset) / scaler).float() # [1, N, 3] 106 | output_tindex_render = lidar_tindex # [1, N], all zeros 107 | 108 | with torch.no_grad(): 109 | pred_dist, _, coord_index = dvr.render_forward( 110 | occ_pred.cuda(), 111 | output_origin_render.cuda(), 112 | output_points_render.cuda(), 113 | output_tindex_render.cuda(), 114 | [1, 16, 200, 200], 115 | "test" 116 | ) 117 | pred_dist *= _voxel_size 118 | 119 | pred_pcds = get_rendered_pcds( 120 | lidar_origin[0].cpu().numpy(), 121 | lidar_endpts[0].cpu().numpy(), 122 | lidar_tindex[0].cpu().numpy(), 123 | pred_dist[0].cpu().numpy() 124 | ) 125 | coord_index = coord_index[0, :, :].int().cpu() # [N, 3] 126 | 127 | pred_label = torch.from_numpy(sem_pred[coord_index[:, 0], coord_index[:, 1], coord_index[:, 2]])[:, None] # [N, 1] 128 | pred_dist = pred_dist[0, :, None].cpu() 129 | pred_pcds = 
torch.cat([pred_label.float(), pred_dist], dim=-1) 130 | 131 | pred_pcds_t.append(pred_pcds) 132 | 133 | pred_pcds_t = torch.cat(pred_pcds_t, dim=0) 134 | 135 | return pred_pcds_t.numpy() 136 | 137 | 138 | def calc_metrics(pcd_pred_list, pcd_gt_list): 139 | thresholds = [1, 2, 4] 140 | 141 | gt_cnt = np.zeros([len(occ_class_names)]) 142 | pred_cnt = np.zeros([len(occ_class_names)]) 143 | tp_cnt = np.zeros([len(thresholds), len(occ_class_names)]) 144 | 145 | for pcd_pred, pcd_gt in zip(pcd_pred_list, pcd_gt_list): 146 | for j, threshold in enumerate(thresholds): 147 | # L1 148 | depth_pred = pcd_pred[:, 1] 149 | depth_gt = pcd_gt[:, 1] 150 | l1_error = np.abs(depth_pred - depth_gt) 151 | tp_dist_mask = (l1_error < threshold) 152 | 153 | for i, cls in enumerate(occ_class_names): 154 | cls_id = occ_class_names.index(cls) 155 | cls_mask_pred = (pcd_pred[:, 0] == cls_id) 156 | cls_mask_gt = (pcd_gt[:, 0] == cls_id) 157 | 158 | gt_cnt_i = cls_mask_gt.sum() 159 | pred_cnt_i = cls_mask_pred.sum() 160 | if j == 0: 161 | gt_cnt[i] += gt_cnt_i 162 | pred_cnt[i] += pred_cnt_i 163 | 164 | tp_cls = cls_mask_gt & cls_mask_pred # [N] 165 | tp_mask = np.logical_and(tp_cls, tp_dist_mask) 166 | tp_cnt[j][i] += tp_mask.sum() 167 | 168 | iou_list = [] 169 | for j, threshold in enumerate(thresholds): 170 | iou_list.append((tp_cnt[j] / (gt_cnt + pred_cnt - tp_cnt[j]))[:-1]) 171 | 172 | return iou_list 173 | 174 | 175 | def main(sem_pred_list, sem_gt_list, lidar_origin_list): 176 | torch.cuda.empty_cache() 177 | 178 | # generate lidar rays 179 | lidar_rays = generate_lidar_rays() 180 | lidar_rays = torch.from_numpy(lidar_rays) 181 | 182 | pcd_pred_list, pcd_gt_list = [], [] 183 | for sem_pred, sem_gt, lidar_origins in tqdm(zip(sem_pred_list, sem_gt_list, lidar_origin_list), ncols=50): 184 | sem_pred = np.reshape(sem_pred, [200, 200, 16]) 185 | sem_gt = np.reshape(sem_gt, [200, 200, 16]) 186 | 187 | pcd_pred = process_one_sample(sem_pred, lidar_rays, lidar_origins) 188 | pcd_gt = process_one_sample(sem_gt, lidar_rays, lidar_origins) 189 | 190 | # evalute on non-free rays 191 | valid_mask = (pcd_gt[:, 0].astype(np.int32) != len(occ_class_names) - 1) 192 | pcd_pred = pcd_pred[valid_mask] 193 | pcd_gt = pcd_gt[valid_mask] 194 | 195 | assert pcd_pred.shape == pcd_gt.shape 196 | pcd_pred_list.append(pcd_pred) 197 | pcd_gt_list.append(pcd_gt) 198 | 199 | iou_list = calc_metrics(pcd_pred_list, pcd_gt_list) 200 | rayiou = np.nanmean(iou_list) 201 | rayiou_0 = np.nanmean(iou_list[0]) 202 | rayiou_1 = np.nanmean(iou_list[1]) 203 | rayiou_2 = np.nanmean(iou_list[2]) 204 | 205 | table = PrettyTable([ 206 | 'Class Names', 207 | 'RayIoU@1', 'RayIoU@2', 'RayIoU@4' 208 | ]) 209 | table.float_format = '.3' 210 | 211 | for i in range(len(occ_class_names) - 1): 212 | table.add_row([ 213 | occ_class_names[i], 214 | iou_list[0][i], iou_list[1][i], iou_list[2][i] 215 | ], divider=(i == len(occ_class_names) - 2)) 216 | 217 | table.add_row(['MEAN', rayiou_0, rayiou_1, rayiou_2]) 218 | 219 | print(table) 220 | 221 | torch.cuda.empty_cache() 222 | 223 | return { 224 | 'RayIoU': rayiou, 225 | 'RayIoU@1': rayiou_0, 226 | 'RayIoU@2': rayiou_1, 227 | 'RayIoU@4': rayiou_2, 228 | } 229 | -------------------------------------------------------------------------------- /models/backbones/eva02/batch_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
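# Editorial note (hedged sketch, not part of the original file). The helpers below are
# typically used to freeze backbone BatchNorm statistics or to build a norm layer by name:
#   backbone = FrozenBatchNorm2d.convert_frozen_batchnorm(backbone)  # recursive, in-place conversion
#   norm_layer = get_norm("FrozenBN", 256)                           # 256 output channels (illustrative)
# Both names refer to the class and function defined later in this same file.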
2 | import torch 3 | import torch.distributed as dist 4 | from fvcore.nn.distributed import differentiable_all_reduce 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from .wrappers import BatchNorm2d 9 | 10 | 11 | class FrozenBatchNorm2d(nn.Module): 12 | """ 13 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 14 | 15 | It contains non-trainable buffers called 16 | "weight" and "bias", "running_mean", "running_var", 17 | initialized to perform identity transformation. 18 | 19 | The pre-trained backbone models from Caffe2 only contain "weight" and "bias", 20 | which are computed from the original four parameters of BN. 21 | The affine transform `x * weight + bias` will perform the equivalent 22 | computation of `(x - running_mean) / sqrt(running_var) * weight + bias`. 23 | When loading a backbone model from Caffe2, "running_mean" and "running_var" 24 | will be left unchanged as identity transformation. 25 | 26 | Other pre-trained backbone models may contain all 4 parameters. 27 | 28 | The forward is implemented by `F.batch_norm(..., training=False)`. 29 | """ 30 | 31 | _version = 3 32 | 33 | def __init__(self, num_features, eps=1e-5): 34 | super().__init__() 35 | self.num_features = num_features 36 | self.eps = eps 37 | self.register_buffer("weight", torch.ones(num_features)) 38 | self.register_buffer("bias", torch.zeros(num_features)) 39 | self.register_buffer("running_mean", torch.zeros(num_features)) 40 | self.register_buffer("running_var", torch.ones(num_features) - eps) 41 | 42 | def forward(self, x): 43 | if x.requires_grad: 44 | # When gradients are needed, F.batch_norm will use extra memory 45 | # because its backward op computes gradients for weight/bias as well. 46 | scale = self.weight * (self.running_var + self.eps).rsqrt() 47 | bias = self.bias - self.running_mean * scale 48 | scale = scale.reshape(1, -1, 1, 1) 49 | bias = bias.reshape(1, -1, 1, 1) 50 | out_dtype = x.dtype # may be half 51 | return x * scale.to(out_dtype) + bias.to(out_dtype) 52 | else: 53 | # When gradients are not needed, F.batch_norm is a single fused op 54 | # and provide more optimization opportunities. 55 | return F.batch_norm( 56 | x, 57 | self.running_mean, 58 | self.running_var, 59 | self.weight, 60 | self.bias, 61 | training=False, 62 | eps=self.eps, 63 | ) 64 | 65 | def _load_from_state_dict( 66 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 67 | ): 68 | version = local_metadata.get("version", None) 69 | 70 | if version is None or version < 2: 71 | # No running_mean/var in early versions 72 | # This will silent the warnings 73 | if prefix + "running_mean" not in state_dict: 74 | state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean) 75 | if prefix + "running_var" not in state_dict: 76 | state_dict[prefix + "running_var"] = torch.ones_like(self.running_var) 77 | 78 | super()._load_from_state_dict( 79 | state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 80 | ) 81 | 82 | def __repr__(self): 83 | return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps) 84 | 85 | @classmethod 86 | def convert_frozen_batchnorm(cls, module): 87 | """ 88 | Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. 89 | 90 | Args: 91 | module (torch.nn.Module): 92 | 93 | Returns: 94 | If module is BatchNorm/SyncBatchNorm, returns a new module. 95 | Otherwise, in-place convert module and return it. 
96 | 97 | Similar to convert_sync_batchnorm in 98 | https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py 99 | """ 100 | bn_module = nn.modules.batchnorm 101 | bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) 102 | res = module 103 | if isinstance(module, bn_module): 104 | res = cls(module.num_features) 105 | if module.affine: 106 | res.weight.data = module.weight.data.clone().detach() 107 | res.bias.data = module.bias.data.clone().detach() 108 | res.running_mean.data = module.running_mean.data 109 | res.running_var.data = module.running_var.data 110 | res.eps = module.eps 111 | else: 112 | for name, child in module.named_children(): 113 | new_child = cls.convert_frozen_batchnorm(child) 114 | if new_child is not child: 115 | res.add_module(name, new_child) 116 | return res 117 | 118 | 119 | def get_norm(norm, out_channels): 120 | """ 121 | Args: 122 | norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; 123 | or a callable that takes a channel number and returns 124 | the normalization layer as a nn.Module. 125 | 126 | Returns: 127 | nn.Module or None: the normalization layer 128 | """ 129 | if norm is None: 130 | return None 131 | if isinstance(norm, str): 132 | if len(norm) == 0: 133 | return None 134 | norm = { 135 | "BN": BatchNorm2d, 136 | # Fixed in https://github.com/pytorch/pytorch/pull/36382 137 | "SyncBN": nn.SyncBatchNorm, 138 | "FrozenBN": FrozenBatchNorm2d, 139 | "GN": lambda channels: nn.GroupNorm(32, channels), 140 | # for debugging: 141 | "nnSyncBN": nn.SyncBatchNorm, 142 | "LN": lambda channels: LayerNorm(channels) 143 | }[norm] 144 | return norm(out_channels) 145 | 146 | 147 | class CycleBatchNormList(nn.ModuleList): 148 | """ 149 | Implement domain-specific BatchNorm by cycling. 150 | 151 | When a BatchNorm layer is used for multiple input domains or input 152 | features, it might need to maintain a separate test-time statistics 153 | for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`. 154 | 155 | This module implements it by using N separate BN layers 156 | and it cycles through them every time a forward() is called. 157 | 158 | NOTE: The caller of this module MUST guarantee to always call 159 | this module by multiple of N times. Otherwise its test-time statistics 160 | will be incorrect. 161 | """ 162 | 163 | def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs): 164 | """ 165 | Args: 166 | length: number of BatchNorm layers to cycle. 167 | bn_class: the BatchNorm class to use 168 | kwargs: arguments of the BatchNorm class, such as num_features. 
169 | """ 170 | self._affine = kwargs.pop("affine", True) 171 | super().__init__([bn_class(**kwargs, affine=False) for k in range(length)]) 172 | if self._affine: 173 | # shared affine, domain-specific BN 174 | channels = self[0].num_features 175 | self.weight = nn.Parameter(torch.ones(channels)) 176 | self.bias = nn.Parameter(torch.zeros(channels)) 177 | self._pos = 0 178 | 179 | def forward(self, x): 180 | ret = self[self._pos](x) 181 | self._pos = (self._pos + 1) % len(self) 182 | 183 | if self._affine: 184 | w = self.weight.reshape(1, -1, 1, 1) 185 | b = self.bias.reshape(1, -1, 1, 1) 186 | return ret * w + b 187 | else: 188 | return ret 189 | 190 | def extra_repr(self): 191 | return f"affine={self._affine}" 192 | 193 | 194 | class LayerNorm(nn.Module): 195 | """ 196 | A LayerNorm variant, popularized by Transformers, that performs point-wise mean and 197 | variance normalization over the channel dimension for inputs that have shape 198 | (batch_size, channels, height, width). 199 | https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950 200 | """ 201 | 202 | def __init__(self, normalized_shape, eps=1e-6): 203 | super().__init__() 204 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 205 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 206 | self.eps = eps 207 | self.normalized_shape = (normalized_shape,) 208 | 209 | def forward(self, x): 210 | u = x.mean(1, keepdim=True) 211 | s = (x - u).pow(2).mean(1, keepdim=True) 212 | x = (x - u) / torch.sqrt(s + self.eps) 213 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 214 | return x -------------------------------------------------------------------------------- /models/lidar_encoder/sparse_encoder4x.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner import auto_fp16 2 | from torch import nn as nn 3 | 4 | from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule 5 | from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE 6 | from mmdet3d.models.builder import MIDDLE_ENCODERS 7 | 8 | if IS_SPCONV2_AVAILABLE: 9 | from spconv.pytorch import SparseConvTensor, SparseSequential 10 | else: 11 | from mmcv.ops import SparseConvTensor, SparseSequential 12 | 13 | 14 | @MIDDLE_ENCODERS.register_module() 15 | class SparseEncoder8x(nn.Module): 16 | r"""Sparse encoder for SECOND and Part-A2. 17 | 18 | Args: 19 | in_channels (int): The number of input channels. 20 | sparse_shape (list[int]): The sparse shape of input tensor. 21 | order (list[str], optional): Order of conv module. 22 | Defaults to ('conv', 'norm', 'act'). 23 | norm_cfg (dict, optional): Config of normalization layer. Defaults to 24 | dict(type='BN1d', eps=1e-3, momentum=0.01). 25 | base_channels (int, optional): Out channels for conv_input layer. 26 | Defaults to 16. 27 | output_channels (int, optional): Out channels for conv_out layer. 28 | Defaults to 128. 29 | encoder_channels (tuple[tuple[int]], optional): 30 | Convolutional channels of each encode block. 31 | Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). 32 | encoder_paddings (tuple[tuple[int]], optional): 33 | Paddings of each encode block. 34 | Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). 35 | block_type (str, optional): Type of the block to use. 36 | Defaults to 'conv_module'. 
37 | """ 38 | 39 | def __init__(self, 40 | in_channels, 41 | sparse_shape, 42 | order=('conv', 'norm', 'act'), 43 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 44 | base_channels=16, 45 | output_channels=128, 46 | encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 47 | 64)), 48 | encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 49 | 1)), 50 | block_type='conv_module'): 51 | super().__init__() 52 | assert block_type in ['conv_module', 'basicblock'] 53 | self.sparse_shape = sparse_shape 54 | self.in_channels = in_channels 55 | self.order = order 56 | self.base_channels = base_channels 57 | self.output_channels = output_channels 58 | self.encoder_channels = encoder_channels 59 | self.encoder_paddings = encoder_paddings 60 | self.stage_num = len(self.encoder_channels) 61 | self.fp16_enabled = False 62 | # Spconv init all weight on its own 63 | 64 | assert isinstance(order, tuple) and len(order) == 3 65 | assert set(order) == {'conv', 'norm', 'act'} 66 | 67 | if self.order[0] != 'conv': # pre activate 68 | self.conv_input = make_sparse_convmodule( 69 | in_channels, 70 | self.base_channels, 71 | 3, 72 | norm_cfg=norm_cfg, 73 | padding=1, 74 | indice_key='subm1', 75 | conv_type='SubMConv3d', 76 | order=('conv', )) 77 | else: # post activate 78 | self.conv_input = make_sparse_convmodule( 79 | in_channels, 80 | self.base_channels, 81 | 3, 82 | norm_cfg=norm_cfg, 83 | padding=1, 84 | indice_key='subm1', 85 | conv_type='SubMConv3d') 86 | 87 | encoder_out_channels = self.make_encoder_layers( 88 | make_sparse_convmodule, 89 | norm_cfg, 90 | self.base_channels, 91 | block_type=block_type) 92 | 93 | self.conv_out = make_sparse_convmodule( 94 | encoder_out_channels, 95 | self.output_channels, 96 | kernel_size=(1, 1, 1), 97 | stride=(1, 1, 1), 98 | norm_cfg=norm_cfg, 99 | padding=0, 100 | indice_key='spconv_down2', 101 | conv_type='SparseConv3d') 102 | 103 | @auto_fp16(apply_to=('voxel_features', )) 104 | def forward(self, voxel_features, coors, batch_size): 105 | """Forward of SparseEncoder. 106 | 107 | Args: 108 | voxel_features (torch.Tensor): Voxel features in shape (N, C). 109 | coors (torch.Tensor): Coordinates in shape (N, 4), 110 | the columns in the order of (batch_idx, z_idx, y_idx, x_idx). 111 | batch_size (int): Batch size. 112 | 113 | Returns: 114 | dict: Backbone features. 115 | """ 116 | coors = coors.int() 117 | input_sp_tensor = SparseConvTensor(voxel_features, coors, 118 | self.sparse_shape, batch_size) 119 | x = self.conv_input(input_sp_tensor) 120 | 121 | encode_features = [] 122 | for encoder_layer in self.encoder_layers: 123 | x = encoder_layer(x) 124 | encode_features.append(x) 125 | 126 | # for detection head 127 | # [200, 176, 5] -> [200, 176, 2] 128 | out = self.conv_out(encode_features[-1]) 129 | 130 | ## for following usage, comment following code 131 | # spatial_features = out.dense() 132 | 133 | # N, C, D, H, W = spatial_features.shape 134 | # spatial_features = spatial_features.view(N, C , D, H, W) 135 | 136 | return out 137 | 138 | def make_encoder_layers(self, 139 | make_block, 140 | norm_cfg, 141 | in_channels, 142 | block_type='conv_module', 143 | conv_cfg=dict(type='SubMConv3d')): 144 | """make encoder layers using sparse convs. 145 | 146 | Args: 147 | make_block (method): A bounded function to build blocks. 148 | norm_cfg (dict[str]): Config of normalization layer. 149 | in_channels (int): The number of encoder input channels. 150 | block_type (str, optional): Type of the block to use. 151 | Defaults to 'conv_module'. 
152 | conv_cfg (dict, optional): Config of conv layer. Defaults to 153 | dict(type='SubMConv3d'). 154 | 155 | Returns: 156 | int: The number of encoder output channels. 157 | """ 158 | assert block_type in ['conv_module', 'basicblock'] 159 | self.encoder_layers = SparseSequential() 160 | 161 | for i, blocks in enumerate(self.encoder_channels): 162 | blocks_list = [] 163 | for j, out_channels in enumerate(tuple(blocks)): 164 | padding = tuple(self.encoder_paddings[i])[j] 165 | # each stage starts with a strided spconv layer, 166 | # except the first stage 167 | if i != 0 and j == 0 and block_type == 'conv_module': 168 | blocks_list.append( 169 | make_block( 170 | in_channels, 171 | out_channels, 172 | 3, 173 | norm_cfg=norm_cfg, 174 | stride=2, 175 | padding=padding, 176 | indice_key=f'spconv{i + 1}', 177 | conv_type='SparseConv3d')) 178 | elif block_type == 'basicblock': 179 | if j == len(blocks) - 1 and i != len( 180 | self.encoder_channels) - 1: 181 | blocks_list.append( 182 | make_block( 183 | in_channels, 184 | out_channels, 185 | 3, 186 | norm_cfg=norm_cfg, 187 | stride=2, 188 | padding=padding, 189 | indice_key=f'spconv{i + 1}', 190 | conv_type='SparseConv3d')) 191 | else: 192 | blocks_list.append( 193 | SparseBasicBlock( 194 | out_channels, 195 | out_channels, 196 | norm_cfg=norm_cfg, 197 | conv_cfg=conv_cfg)) 198 | else: 199 | blocks_list.append( 200 | make_block( 201 | in_channels, 202 | out_channels, 203 | 3, 204 | norm_cfg=norm_cfg, 205 | padding=padding, 206 | indice_key=f'subm{i + 1}', 207 | conv_type='SubMConv3d')) 208 | in_channels = out_channels 209 | stage_name = f'encoder_layer{i + 1}' 210 | stage_layers = SparseSequential(*blocks_list) 211 | self.encoder_layers.add_module(stage_name, stage_layers) 212 | return out_channels
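The snippet below is an illustrative usage sketch and not part of the repository: it assumes mmdet3d and spconv are installed, a CUDA device is available, and that the encoder above is importable as models.lidar_encoder.SparseEncoder8x; the sparse shape and voxel counts are arbitrary example values chosen to match the constructor defaults.

import torch

from models.lidar_encoder import SparseEncoder8x

# Build the encoder with illustrative arguments; all other options keep the
# defaults documented in the class above.
encoder = SparseEncoder8x(in_channels=5, sparse_shape=[41, 1600, 1600]).cuda()

num_voxels, batch_size = 2000, 1
# Voxel features of shape (N, C), with C == in_channels.
voxel_features = torch.randn(num_voxels, 5, device='cuda')
# Coordinates of shape (N, 4) in (batch_idx, z_idx, y_idx, x_idx) order,
# kept inside the sparse_shape passed to the constructor.
coors = torch.stack([
    torch.zeros(num_voxels, dtype=torch.long, device='cuda'),
    torch.randint(0, 41, (num_voxels,), device='cuda'),
    torch.randint(0, 1600, (num_voxels,), device='cuda'),
    torch.randint(0, 1600, (num_voxels,), device='cuda'),
], dim=1)

out = encoder(voxel_features, coors, batch_size)
# `out` is a SparseConvTensor; its dense form would have shape (B, C, D, H, W).
print(out.spatial_shape, out.features.shape)

Note that conv_out uses a (1, 1, 1) kernel with unit stride, so all spatial downsampling comes from the strided SparseConv3d layers inserted at the start of each stage after the first.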
-------------------------------------------------------------------------------- /loaders/old_metrics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from sklearn.neighbors import KDTree 4 | from termcolor import colored 5 | from functools import reduce 6 | from typing import Iterable 7 | 8 | np.seterr(divide='ignore', invalid='ignore') 9 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 10 | 11 | 12 | def pcolor(string, color, on_color=None, attrs=None): 13 | """ 14 | Produces a colored string for printing 15 | 16 | Parameters 17 | ---------- 18 | string : str 19 | String that will be colored 20 | color : str 21 | Color to use 22 | on_color : str 23 | Background color to use 24 | attrs : list of str 25 | Different attributes for the string 26 | 27 | Returns 28 | ------- 29 | string: str 30 | Colored string 31 | """ 32 | return colored(string, color, on_color, attrs) 33 | 34 | 35 | def getCellCoordinates(points, voxelSize): 36 | return (points / voxelSize).astype(int) 37 | 38 | 39 | def getNumUniqueCells(cells): 40 | M = cells.max() + 1 41 | return np.unique(cells[:, 0] + M * cells[:, 1] + M ** 2 * cells[:, 2]).shape[0] 42 | 43 | 44 | class Metric_mIoU_Occ3D(): 45 | def __init__(self, 46 | save_dir='.', 47 | num_classes=18, 48 | use_lidar_mask=False, 49 | use_image_mask=False, 50 | ): 51 | if num_classes == 18: 52 | self.class_names = [ 53 | 'others','barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 54 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 55 | 'driveable_surface', 'other_flat', 'sidewalk', 56 | 'terrain', 'manmade', 'vegetation','free' 57 | ] 58 | elif num_classes == 2: 59 | self.class_names = ['non-free', 'free'] 60 | 61 | self.save_dir = save_dir 62 | self.use_lidar_mask = use_lidar_mask 63 | self.use_image_mask = use_image_mask 64 | self.num_classes = num_classes 65 | 66 | self.point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 67 | self.occupancy_size = [0.4, 0.4, 0.4] 68 | self.voxel_size = 0.4 69 | self.occ_xdim = int((self.point_cloud_range[3] - self.point_cloud_range[0]) / self.occupancy_size[0]) 70 | self.occ_ydim = int((self.point_cloud_range[4] - self.point_cloud_range[1]) / self.occupancy_size[1]) 71 | self.occ_zdim = int((self.point_cloud_range[5] - self.point_cloud_range[2]) / self.occupancy_size[2]) 72 | self.voxel_num = self.occ_xdim * self.occ_ydim * self.occ_zdim 73 | self.hist = np.zeros((self.num_classes, self.num_classes)) 74 | self.cnt = 0 75 | 76 | def hist_info(self, n_cl, pred, gt): 77 | """ 78 | build confusion matrix 79 | # empty classes:0 80 | non-empty class: 0-16 81 | free voxel class: 17 82 | 83 | Args: 84 | n_cl (int): num_classes_occupancy 85 | pred (1-d array): pred_occupancy_label 86 | gt (1-d array): gt_occupancy_label 87 | 88 | Returns: 89 | tuple: (hist, number of correctly predicted labels, number of labeled samples) 90 | """ 91 | assert pred.shape == gt.shape 92 | k = (gt >= 0) & (gt < n_cl) # exclude 255 93 | labeled = np.sum(k) 94 | correct = np.sum((pred[k] == gt[k])) 95 | 96 | return ( 97 | np.bincount( 98 | n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 99 | ).reshape(n_cl, n_cl), 100 | correct, 101 | labeled, 102 | ) 103 | 104 | def per_class_iu(self, hist): 105 | #return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 106 | result = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 107 | result[hist.sum(1) == 0] = float('nan') 108 | return result 109 | 110 | def compute_mIoU(self, pred, label, n_classes): 111 | hist = np.zeros((n_classes, n_classes)) 112 | new_hist, correct, labeled = self.hist_info(n_classes, pred.flatten(), label.flatten()) 113 | hist += new_hist 114 | mIoUs = self.per_class_iu(hist) 115 | # for ind_class in range(n_classes): 116 | # print(str(round(mIoUs[ind_class] * 100, 2))) 117 | # print('===> mIoU: ' + str(round(np.nanmean(mIoUs) * 100, 2))) 118 | return round(np.nanmean(mIoUs) * 100, 2), hist 119 | 120 | def add_batch(self,semantics_pred,semantics_gt,mask_lidar,mask_camera): 121 | self.cnt += 1 122 | if self.use_image_mask: 123 | masked_semantics_gt = semantics_gt[mask_camera] 124 | masked_semantics_pred = semantics_pred[mask_camera] 125 | elif self.use_lidar_mask: 126 | masked_semantics_gt = semantics_gt[mask_lidar] 127 | masked_semantics_pred = semantics_pred[mask_lidar] 128 | else: 129 | masked_semantics_gt = semantics_gt 130 | masked_semantics_pred = semantics_pred 131 | 132 | if self.num_classes == 2: 133 | masked_semantics_pred = np.copy(masked_semantics_pred) 134 | masked_semantics_gt = np.copy(masked_semantics_gt) 135 | masked_semantics_pred[masked_semantics_pred < 17] = 0 136 | masked_semantics_pred[masked_semantics_pred == 17] = 1 137 | masked_semantics_gt[masked_semantics_gt < 17] = 0 138 | masked_semantics_gt[masked_semantics_gt == 17] = 1 139 | 140 | _, _hist = self.compute_mIoU(masked_semantics_pred, masked_semantics_gt, self.num_classes) 141 | self.hist += _hist 142 | 143 | def count_miou(self): 144 | mIoU = self.per_class_iu(self.hist) 145 | # assert cnt == num_samples, 'some samples are not included in the miou calculation' 146 | print(f'===> per class IoU of {self.cnt} samples:') 147 | for ind_class in range(self.num_classes-1): 148 | print(f'===> {self.class_names[ind_class]} - IoU = ' + 
str(round(mIoU[ind_class] * 100, 2))) 149 | 150 | print(f'===> mIoU of {self.cnt} samples: ' + str(round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2))) 151 | # print(f'===> sample-wise averaged mIoU of {cnt} samples: ' + str(round(np.nanmean(mIoU_avg), 2))) 152 | 153 | return round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2) 154 | 155 | 156 | class Metric_mIoU_Occupancy: 157 | 158 | def __init__(self): 159 | self.class_names = [ 160 | 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 161 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 162 | 'driveable_surface', 'other_flat', 'sidewalk', 163 | 'terrain', 'manmade', 'vegetation','free' 164 | ] 165 | self.point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3] 166 | self.occupancy_size = [0.2, 0.2, 0.2] 167 | self.voxel_size = 0.2 168 | self.occ_xdim = int((self.point_cloud_range[3] - self.point_cloud_range[0]) / self.occupancy_size[0]) 169 | self.occ_ydim = int((self.point_cloud_range[4] - self.point_cloud_range[1]) / self.occupancy_size[1]) 170 | self.occ_zdim = int((self.point_cloud_range[5] - self.point_cloud_range[2]) / self.occupancy_size[2]) 171 | self.voxel_num = self.occ_xdim * self.occ_ydim * self.occ_zdim 172 | self.num_classes = len(self.class_names) 173 | self.hist = np.zeros((self.num_classes, self.num_classes)) 174 | self.bin_hist = np.zeros((2, 2)) 175 | self.cnt = 0 176 | 177 | def hist_info(self, n_cl, pred, gt): 178 | assert pred.shape == gt.shape 179 | k = (gt >= 0) & (gt < n_cl) # exclude 255 180 | return np.bincount( 181 | n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2).reshape(n_cl, n_cl) 182 | 183 | def per_class_iu(self, hist): 184 | #return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 185 | result = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 186 | result[hist.sum(1) == 0] = float('nan') 187 | return result 188 | 189 | def add_batch(self, semantics_pred, semantics_gt, mask=None): 190 | self.cnt += 1 191 | if mask is not None: 192 | semantics_pred = semantics_pred[mask] 193 | semantics_gt = semantics_gt[mask] 194 | 195 | pred = semantics_pred.flatten() 196 | binary_pred = pred.copy() 197 | binary_pred[binary_pred < self.num_classes-1] = 0 198 | binary_pred[binary_pred == self.num_classes-1] = 1 199 | 200 | gt = semantics_gt.flatten() 201 | binary_gt = gt.copy() 202 | binary_gt[binary_gt < self.num_classes-1] = 0 203 | binary_gt[binary_gt == self.num_classes-1] = 1 204 | 205 | self.hist += self.hist_info(self.num_classes, pred, gt) 206 | self.bin_hist += self.hist_info(2, binary_pred, binary_gt) 207 | 208 | def count_miou(self): 209 | mIoU = self.per_class_iu(self.hist) 210 | IoU = self.per_class_iu(self.bin_hist) 211 | # assert cnt == num_samples, 'some samples are not included in the miou calculation' 212 | print(f'===> per class IoU of {self.cnt} samples:') 213 | for ind_class in range(self.num_classes-1): 214 | print(f'===> {self.class_names[ind_class]} - IoU = ' + str(round(mIoU[ind_class] * 100, 2))) 215 | 216 | print(f'===> mIoU of {self.cnt} samples: ' + str(round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2))) 217 | print(f'===> IoU of {self.cnt} samples: ' + str(round(IoU[0] * 100, 2))) 218 | 219 | return round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2), round(IoU[0] * 100, 2) -------------------------------------------------------------------------------- /configs/opusv1-fusion_nusc-occ3d/opusv1-fusion-l_r50_704x256_8f_nusc-occ3d_100e.py: 
-------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=True, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | pc_voxel_size = [0.05, 0.05, 0.16] 30 | voxel_size = [0.4, 0.4, 0.4] 31 | 32 | # arch config 33 | embed_dims = 256 34 | num_layers = 6 35 | num_query = 4800 36 | num_frames = 8 37 | num_levels = 4 38 | num_points = 2 39 | num_refines = [1, 2, 4, 8, 16, 16] 40 | 41 | img_backbone = dict( 42 | type='ResNet', 43 | depth=50, 44 | num_stages=4, 45 | out_indices=(0, 1, 2, 3), 46 | frozen_stages=1, 47 | norm_cfg=dict(type='BN2d', requires_grad=True), 48 | norm_eval=True, 49 | style='pytorch', 50 | with_cp=True) 51 | img_neck = dict( 52 | type='FPN', 53 | in_channels=[256, 512, 1024, 2048], 54 | out_channels=embed_dims, 55 | num_outs=num_levels) 56 | img_norm_cfg = dict( 57 | mean=[123.675, 116.280, 103.530], 58 | std=[58.395, 57.120, 57.375], 59 | to_rgb=True) 60 | 61 | pts_voxel_layer=dict(max_num_points=10, voxel_size=pc_voxel_size, deterministic=False, 62 | max_voxels=(90000, 120000), point_cloud_range=point_cloud_range) 63 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5) 64 | pts_middle_encoder=dict( 65 | type='SparseEncoder', 66 | in_channels=5, 67 | sparse_shape=[41, 1600, 1600], 68 | output_channels=128, 69 | order=('conv', 'norm', 'act'), 70 | encoder_channels=((16, 16, 32), 71 | (32, 32, 64), 72 | (64, 64, 128), 73 | (128,128)), 74 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 75 | block_type='basicblock') 76 | pts_backbone=dict( 77 | type='SECOND', 78 | in_channels=256, 79 | out_channels=[128, 256], 80 | layer_nums=[5, 5], 81 | layer_strides=[1, 2], 82 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 83 | conv_cfg=dict(type='Conv2d', bias=False)) 84 | pts_neck=dict( 85 | type='SECONDFPN', 86 | in_channels=[128, 256], 87 | out_channels=[256, 256], 88 | upsample_strides=[1, 2], 89 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 90 | upsample_cfg=dict(type='deconv', bias=False), 91 | use_conv_for_no_stride=True) 92 | 93 | model = dict( 94 | type='OPUSV1Fusion', 95 | use_grid_mask=False, 96 | data_aug=dict( 97 | img_color_aug=True, # Move some augmentations to GPU 98 | img_norm_cfg=img_norm_cfg, 99 | img_pad_cfg=dict(size_divisor=32)), 100 | stop_prev_grad=0, 101 | img_backbone=img_backbone, 102 | img_neck=img_neck, 103 | pts_voxel_layer=pts_voxel_layer, 104 | pts_voxel_encoder=pts_voxel_encoder, 105 | pts_middle_encoder=pts_middle_encoder, 106 | pts_backbone=pts_backbone, 107 | pts_neck=pts_neck, 108 | pts_bbox_head=dict( 109 | type='OPUSV1FusionHead', 110 | num_classes=len(occ_names), 111 | in_channels=embed_dims, 112 | num_query=num_query, 113 | 
pc_range=point_cloud_range, 114 | voxel_size=voxel_size, 115 | init_pos_lidar='curr', 116 | transformer=dict( 117 | type='OPUSV1FusionTransformer', 118 | embed_dims=embed_dims, 119 | num_frames=num_frames, 120 | num_points=num_points, 121 | num_layers=num_layers, 122 | num_levels=num_levels, 123 | num_classes=len(occ_names), 124 | num_refines=num_refines, 125 | scales=[0.5], 126 | pc_range=point_cloud_range), 127 | loss_cls=dict( 128 | type='FocalLoss', 129 | use_sigmoid=True, 130 | gamma=2.0, 131 | alpha=0.25, 132 | loss_weight=2.0), 133 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 134 | train_cfg=dict( 135 | pts=dict( 136 | cls_weights=[ 137 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 138 | ) 139 | ), 140 | test_cfg=dict( 141 | pts=dict( 142 | score_thr=0.5, 143 | padding=True) 144 | ) 145 | ) 146 | 147 | ida_aug_conf = { 148 | 'resize_lim': (0.38, 0.55), 149 | 'final_dim': (256, 704), 150 | 'bot_pct_lim': (0.0, 0.0), 151 | 'rot_lim': (0.0, 0.0), 152 | 'H': 900, 'W': 1600, 153 | 'rand_flip': True, 154 | } 155 | 156 | train_pipeline = [ 157 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 158 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 159 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 160 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 161 | pad_empty_sweeps=True, remove_close=True), 162 | dict(type='LiDARToOccSpace'), 163 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 164 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 165 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 166 | dict(type='ObjectNameFilter', classes=object_names), 167 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 168 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 169 | dict(type='DefaultFormatBundle3D', class_names=object_names), 170 | dict(type='Collect3D', keys=['img', 'points', 'voxel_semantics', 'mask_camera'], meta_keys=( 171 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 172 | ] 173 | 174 | test_pipeline = [ 175 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 176 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 177 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 178 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 179 | pad_empty_sweeps=True, remove_close=True), 180 | dict(type='LiDARToOccSpace'), 181 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 182 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 183 | dict( 184 | type='MultiScaleFlipAug3D', 185 | img_scale=(1600, 900), 186 | pts_scale_ratio=1, 187 | flip=False, 188 | transforms=[ 189 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 190 | dict(type='Collect3D', keys=['img', 'points'], meta_keys=( 191 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 192 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 193 | ]) 194 | ] 195 | 196 | data = dict( 197 | # workers_per_gpu=1, 198 | workers_per_gpu=4, 199 | train=dict( 200 | type=dataset_type, 201 | data_root=dataset_root, 202 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 203 | pipeline=train_pipeline, 204 | 
classes=object_names, 205 | modality=input_modality, 206 | test_mode=False, 207 | use_valid_flag=True, 208 | box_type_3d='LiDAR'), 209 | val=dict( 210 | type=dataset_type, 211 | data_root=dataset_root, 212 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 213 | pipeline=test_pipeline, 214 | classes=object_names, 215 | modality=input_modality, 216 | test_mode=True, 217 | box_type_3d='LiDAR'), 218 | test=dict( 219 | type=dataset_type, 220 | data_root=dataset_root, 221 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 222 | pipeline=test_pipeline, 223 | classes=object_names, 224 | modality=input_modality, 225 | test_mode=True, 226 | box_type_3d='LiDAR') 227 | ) 228 | 229 | optimizer = dict( 230 | type='AdamW', 231 | lr=2e-4, 232 | paramwise_cfg=dict(custom_keys={ 233 | 'img_backbone': dict(lr_mult=0.1), 234 | 'sampling_offset': dict(lr_mult=0.1), 235 | }), 236 | weight_decay=0.01 237 | ) 238 | 239 | optimizer_config = dict( 240 | type='Fp16OptimizerHook', 241 | loss_scale=512.0, 242 | grad_clip=dict(max_norm=35, norm_type=2) 243 | ) 244 | 245 | # learning policy 246 | lr_config = dict( 247 | policy='CosineAnnealing', 248 | warmup='linear', 249 | warmup_iters=500, 250 | warmup_ratio=1.0 / 3, 251 | min_lr_ratio=1e-3 252 | ) 253 | total_epochs = 100 254 | batch_size = 8 255 | 256 | # load pretrained weights 257 | load_from = 'pretrain/fusion_pretrain_model.pth' 258 | revise_keys = [] 259 | 260 | # resume the last training 261 | resume_from = None 262 | 263 | # checkpointing 264 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 265 | 266 | # logging 267 | log_config = dict( 268 | interval=1, 269 | hooks=[ 270 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 271 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 272 | ] 273 | ) 274 | 275 | # evaluation 276 | eval_config = dict(interval=total_epochs) 277 | 278 | # other flags 279 | debug = False 280 | -------------------------------------------------------------------------------- /configs/opusv1-fusion_nusc-occ3d/opusv1-fusion-m_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=True, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | pc_voxel_size = [0.05, 0.05, 0.16] 30 | voxel_size = [0.4, 0.4, 0.4] 31 | 32 | # arch config 33 | embed_dims = 256 34 | num_layers = 6 35 | num_query = 2400 36 | num_frames = 8 37 | num_levels = 4 38 | num_points = 2 39 | num_refines = [1, 2, 4, 8, 16, 32] 40 | 41 | img_backbone = dict( 42 | type='ResNet', 43 | depth=50, 44 | num_stages=4, 45 | out_indices=(0, 1, 2, 3), 46 | frozen_stages=1, 47 | norm_cfg=dict(type='BN2d', requires_grad=True), 
48 | norm_eval=True, 49 | style='pytorch', 50 | with_cp=True) 51 | img_neck = dict( 52 | type='FPN', 53 | in_channels=[256, 512, 1024, 2048], 54 | out_channels=embed_dims, 55 | num_outs=num_levels) 56 | img_norm_cfg = dict( 57 | mean=[123.675, 116.280, 103.530], 58 | std=[58.395, 57.120, 57.375], 59 | to_rgb=True) 60 | 61 | pts_voxel_layer=dict(max_num_points=10, voxel_size=pc_voxel_size, deterministic=False, 62 | max_voxels=(90000, 120000), point_cloud_range=point_cloud_range) 63 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5) 64 | pts_middle_encoder=dict( 65 | type='SparseEncoder', 66 | in_channels=5, 67 | sparse_shape=[41, 1600, 1600], 68 | output_channels=128, 69 | order=('conv', 'norm', 'act'), 70 | encoder_channels=((16, 16, 32), 71 | (32, 32, 64), 72 | (64, 64, 128), 73 | (128,128)), 74 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 75 | block_type='basicblock') 76 | pts_backbone=dict( 77 | type='SECOND', 78 | in_channels=256, 79 | out_channels=[128, 256], 80 | layer_nums=[5, 5], 81 | layer_strides=[1, 2], 82 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 83 | conv_cfg=dict(type='Conv2d', bias=False)) 84 | pts_neck=dict( 85 | type='SECONDFPN', 86 | in_channels=[128, 256], 87 | out_channels=[256, 256], 88 | upsample_strides=[1, 2], 89 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 90 | upsample_cfg=dict(type='deconv', bias=False), 91 | use_conv_for_no_stride=True) 92 | 93 | model = dict( 94 | type='OPUSV1Fusion', 95 | use_grid_mask=False, 96 | data_aug=dict( 97 | img_color_aug=True, # Move some augmentations to GPU 98 | img_norm_cfg=img_norm_cfg, 99 | img_pad_cfg=dict(size_divisor=32)), 100 | stop_prev_grad=0, 101 | img_backbone=img_backbone, 102 | img_neck=img_neck, 103 | pts_voxel_layer=pts_voxel_layer, 104 | pts_voxel_encoder=pts_voxel_encoder, 105 | pts_middle_encoder=pts_middle_encoder, 106 | pts_backbone=pts_backbone, 107 | pts_neck=pts_neck, 108 | pts_bbox_head=dict( 109 | type='OPUSV1FusionHead', 110 | num_classes=len(occ_names), 111 | in_channels=embed_dims, 112 | num_query=num_query, 113 | pc_range=point_cloud_range, 114 | voxel_size=voxel_size, 115 | init_pos_lidar='curr', 116 | transformer=dict( 117 | type='OPUSV1FusionTransformer', 118 | embed_dims=embed_dims, 119 | num_frames=num_frames, 120 | num_points=num_points, 121 | num_layers=num_layers, 122 | num_levels=num_levels, 123 | num_classes=len(occ_names), 124 | num_refines=num_refines, 125 | scales=[0.5], 126 | pc_range=point_cloud_range), 127 | loss_cls=dict( 128 | type='FocalLoss', 129 | use_sigmoid=True, 130 | gamma=2.0, 131 | alpha=0.25, 132 | loss_weight=2.0), 133 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 134 | train_cfg=dict( 135 | pts=dict( 136 | cls_weights=[ 137 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 138 | ) 139 | ), 140 | test_cfg=dict( 141 | pts=dict( 142 | score_thr=0.5, 143 | padding=True) 144 | ) 145 | ) 146 | 147 | ida_aug_conf = { 148 | 'resize_lim': (0.38, 0.55), 149 | 'final_dim': (256, 704), 150 | 'bot_pct_lim': (0.0, 0.0), 151 | 'rot_lim': (0.0, 0.0), 152 | 'H': 900, 'W': 1600, 153 | 'rand_flip': True, 154 | } 155 | 156 | train_pipeline = [ 157 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 158 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 159 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 160 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 161 | pad_empty_sweeps=True, 
remove_close=True), 162 | dict(type='LiDARToOccSpace'), 163 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 164 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 165 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 166 | dict(type='ObjectNameFilter', classes=object_names), 167 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 168 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 169 | dict(type='DefaultFormatBundle3D', class_names=object_names), 170 | dict(type='Collect3D', keys=['img', 'points', 'voxel_semantics', 'mask_camera'], meta_keys=( 171 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 172 | ] 173 | 174 | test_pipeline = [ 175 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 176 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 177 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 178 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 179 | pad_empty_sweeps=True, remove_close=True), 180 | dict(type='LiDARToOccSpace'), 181 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 182 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 183 | dict( 184 | type='MultiScaleFlipAug3D', 185 | img_scale=(1600, 900), 186 | pts_scale_ratio=1, 187 | flip=False, 188 | transforms=[ 189 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 190 | dict(type='Collect3D', keys=['img', 'points'], meta_keys=( 191 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 192 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 193 | ]) 194 | ] 195 | 196 | data = dict( 197 | # workers_per_gpu=1, 198 | workers_per_gpu=4, 199 | train=dict( 200 | type=dataset_type, 201 | data_root=dataset_root, 202 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 203 | pipeline=train_pipeline, 204 | classes=object_names, 205 | modality=input_modality, 206 | test_mode=False, 207 | use_valid_flag=True, 208 | box_type_3d='LiDAR'), 209 | val=dict( 210 | type=dataset_type, 211 | data_root=dataset_root, 212 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 213 | pipeline=test_pipeline, 214 | classes=object_names, 215 | modality=input_modality, 216 | test_mode=True, 217 | box_type_3d='LiDAR'), 218 | test=dict( 219 | type=dataset_type, 220 | data_root=dataset_root, 221 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 222 | pipeline=test_pipeline, 223 | classes=object_names, 224 | modality=input_modality, 225 | test_mode=True, 226 | box_type_3d='LiDAR') 227 | ) 228 | 229 | optimizer = dict( 230 | type='AdamW', 231 | lr=2e-4, 232 | paramwise_cfg=dict(custom_keys={ 233 | 'img_backbone': dict(lr_mult=0.1), 234 | 'sampling_offset': dict(lr_mult=0.1), 235 | }), 236 | weight_decay=0.01 237 | ) 238 | 239 | optimizer_config = dict( 240 | type='Fp16OptimizerHook', 241 | loss_scale=512.0, 242 | grad_clip=dict(max_norm=35, norm_type=2) 243 | ) 244 | 245 | # learning policy 246 | lr_config = dict( 247 | policy='CosineAnnealing', 248 | warmup='linear', 249 | warmup_iters=500, 250 | warmup_ratio=1.0 / 3, 251 | min_lr_ratio=1e-3 252 | ) 253 | total_epochs = 100 254 | batch_size = 8 255 | 256 | # load pretrained weights 257 | load_from = 'pretrain/fusion_pretrain_model.pth' 258 | revise_keys = [] 259 | 260 | # resume the last 
training 261 | resume_from = None 262 | 263 | # checkpointing 264 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 265 | 266 | # logging 267 | log_config = dict( 268 | interval=1, 269 | hooks=[ 270 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 271 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 272 | ] 273 | ) 274 | 275 | # evaluation 276 | eval_config = dict(interval=total_epochs) 277 | 278 | # other flags 279 | debug = False 280 | --------------------------------------------------------------------------------
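As a closing illustration (not part of the repository), the sketch below shows how one of the configs above could be loaded and inspected with mmcv's Config API; it assumes the mmcv 1.x series used elsewhere in this codebase is installed and that the snippet is run from the repository root.

from mmcv import Config

# Load the "-l" fusion config shown above and inspect a few of its fields.
cfg = Config.fromfile(
    'configs/opusv1-fusion_nusc-occ3d/'
    'opusv1-fusion-l_r50_704x256_8f_nusc-occ3d_100e.py')

print(cfg.model.type)                     # 'OPUSV1Fusion'
print(cfg.model.pts_bbox_head.num_query)  # 4800 queries for the -l variant
print(cfg.num_refines)                    # [1, 2, 4, 8, 16, 16]
print(cfg.total_epochs, cfg.batch_size)   # 100, 8

The "-m" variant differs mainly in num_query (2400) and the final num_refines entry (32); the rest of the two fusion configs is identical.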