├── models ├── csrc │ ├── __init__.py │ ├── setup.py │ ├── msmv_sampling │ │ └── msmv_sampling.h │ └── wrapper.py ├── core │ ├── __init__.py │ └── hook │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── ema.py │ │ └── ema2.py ├── backbones │ ├── eva02 │ │ ├── __init__.py │ │ ├── fpn.py │ │ ├── drop.py │ │ ├── main.py │ │ ├── backbone.py │ │ ├── blocks.py │ │ ├── wrappers.py │ │ └── batch_norm.py │ ├── __init__.py │ └── second_3d.py ├── bbox │ ├── match_costs │ │ ├── __init__.py │ │ └── match_cost.py │ ├── coders │ │ ├── __init__.py │ │ └── nms_free_coder.py │ ├── __init__.py │ ├── assigners │ │ ├── __init__.py │ │ └── hungarian_assigner_3d.py │ └── utils.py ├── lidar_encoder │ ├── __init__.py │ └── sparse_encoder4x.py ├── neck │ └── __init__.py ├── __init__.py ├── opusv1 │ └── opus_sampling.py └── opusv1_fusion │ └── opus_sampling.py ├── demos ├── opusv1.png ├── opusv2.png ├── result.png └── teaser.png ├── dist_train.sh ├── dist_val.sh ├── loaders ├── __init__.py ├── pipelines │ └── __init__.py ├── utils.py ├── builder.py ├── nuscenes_dataset.py ├── ego_pose_dataset.py ├── nuscenes_occupancy_dataset.py ├── ray_metrics.py └── old_metrics.py ├── .gitignore ├── scripts ├── gen_fusion_pretrain_model.py ├── timing.py └── gen_sweep_info.py ├── LICENSE ├── lib └── dvr │ └── dvr.cpp ├── val.py ├── train.py ├── configs ├── opusv1_nusc-occ3d │ ├── opusv1-s_r50_704x256_8f_nusc-occ3d_100e.py │ ├── opusv1-l_r50_704x256_8f_nusc-occ3d_100e.py │ ├── opusv1-m_r50_704x256_8f_nusc-occ3d_100e.py │ └── opusv1-t_r50_704x256_8f_nusc-occ3d_100e.py └── opusv1-fusion_nusc-occ3d │ ├── opusv1-fusion-l_r50_704x256_8f_nusc-occ3d_100e.py │ └── opusv1-fusion-m_r50_704x256_8f_nusc-occ3d_100e.py └── utils.py /models/csrc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .hook import * -------------------------------------------------------------------------------- /models/backbones/eva02/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import EVA02 -------------------------------------------------------------------------------- /demos/opusv1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/opusv1.png -------------------------------------------------------------------------------- /demos/opusv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/opusv2.png -------------------------------------------------------------------------------- /demos/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/result.png -------------------------------------------------------------------------------- /demos/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbwang1997/OPUS/HEAD/demos/teaser.png -------------------------------------------------------------------------------- /models/bbox/match_costs/__init__.py: -------------------------------------------------------------------------------- 1 | from .match_cost import BBox3DL1Cost 2 | 3 | __all__ = ['BBox3DL1Cost'] 
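As an illustration of how the cost exported above is consumed, here is a minimal usage sketch (not part of the repository; the weight value and tensor shapes are assumptions for demonstration) of building BBox3DL1Cost through mmdet's MATCH_COST registry, the same way HungarianAssigner3D builds its reg_cost:

import torch
from mmdet.core.bbox.match_costs import build_match_cost

import models.bbox.match_costs  # importing the package runs the @MATCH_COST.register_module() decorators

# Assumed weight value; the actual training configs may use a different one.
reg_cost = build_match_cost(dict(type='BBox3DL1Cost', weight=0.25))

# Assumed shapes: 10-dim normalized box encodings (see normalize_bbox in models/bbox/utils.py).
bbox_pred = torch.rand(900, 10)
gt_bboxes = torch.rand(15, 10)
cost = reg_cost(bbox_pred, gt_bboxes)  # pairwise weighted L1 cost, shape [900, 15]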
-------------------------------------------------------------------------------- /models/bbox/coders/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_free_coder import NMSFreeCoder 2 | 3 | __all__ = ['NMSFreeCoder'] 4 | -------------------------------------------------------------------------------- /models/lidar_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_encoder4x import SparseEncoder8x 2 | 3 | __all__ = ['SparseEncoder8x'] -------------------------------------------------------------------------------- /models/bbox/__init__.py: -------------------------------------------------------------------------------- 1 | from .assigners import __all__ 2 | from .coders import __all__ 3 | from .match_costs import __all__ -------------------------------------------------------------------------------- /models/bbox/assigners/__init__.py: -------------------------------------------------------------------------------- 1 | from .hungarian_assigner_3d import HungarianAssigner3D 2 | 3 | __all__ = ['HungarianAssigner3D'] 4 | -------------------------------------------------------------------------------- /models/core/hook/__init__.py: -------------------------------------------------------------------------------- 1 | from .ema import MEGVIIEMAHook 2 | from .utils import is_parallel 3 | 4 | 5 | __all__ = ['MEGVIIEMAHook','is_parallel'] -------------------------------------------------------------------------------- /dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | GPUS=$1 4 | CONFIG=$2 5 | python3 -m torch.distributed.run --nproc_per_node $GPUS train.py --config $CONFIG ${@:3} 6 | -------------------------------------------------------------------------------- /models/neck/__init__.py: -------------------------------------------------------------------------------- 1 | from .second_fpn_3d import SECONDFPN_3d,SECONDFPN_3dv2,SECONDFPN_3dv3 2 | 3 | __all__ = ['SECONDFPN_3d','SECONDFPN_3dv2','SECONDFPN_3dv3'] -------------------------------------------------------------------------------- /models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .vovnet import VoVNet 2 | from .eva02 import EVA02 3 | from .second_3d import SECOND_3d 4 | 5 | __all__ = ['VoVNet', 'EVA02','SECOND_3d'] 6 | -------------------------------------------------------------------------------- /dist_val.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | GPUS=$1 4 | CONFIG=$2 5 | WEIGHT=$3 6 | python3 -m torch.distributed.run --nproc_per_node $GPUS val.py --config $CONFIG --weights $WEIGHT 7 | -------------------------------------------------------------------------------- /loaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipelines import __all__ 2 | from .nuscenes_dataset import CustomNuScenesDataset 3 | from .nuscenes_occ3d_dataset import NuScenesOcc3DDataset 4 | from .nuscenes_occupancy_dataset import NuScenesOccupancyDataset 5 | -------------------------------------------------------------------------------- /loaders/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .loading import LoadMultiViewImageFromMultiSweeps 2 | from .transforms import PadMultiViewImage, 
NormalizeMultiviewImage, PhotoMetricDistortionMultiViewImage 3 | 4 | __all__ = [ 5 | 'LoadMultiViewImageFromMultiSweeps', 'PadMultiViewImage', 'NormalizeMultiviewImage', 6 | 'PhotoMetricDistortionMultiViewImage' 7 | ] -------------------------------------------------------------------------------- /models/core/hook/utils.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | __all__ = ['is_parallel'] 4 | 5 | 6 | def is_parallel(model): 7 | """check if model is in parallel mode.""" 8 | parallel_type = ( 9 | nn.parallel.DataParallel, 10 | nn.parallel.DistributedDataParallel, 11 | ) 12 | return isinstance(model, parallel_type) 13 | 14 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import __all__ 2 | from .bbox import __all__ 3 | from .lidar_encoder import __all__ 4 | from .neck import __all__ 5 | 6 | from .opusv1.opus import OPUSV1 7 | from .opusv1.opus_head import OPUSV1Head 8 | from .opusv1.opus_transformer import OPUSV1Transformer 9 | 10 | from .opusv1_fusion.opus import OPUSV1Fusion 11 | from .opusv1_fusion.opus_head import OPUSV1FusionHead 12 | from .opusv1_fusion.opus_transformer import OPUSV1FusionTransformer -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # OS generated files 2 | .DS_Store 3 | .DS_Store? 4 | ._* 5 | .Spotlight-V100 6 | .Trashes 7 | ehthumbs.db 8 | Thumbs.db 9 | 10 | # Compiled source 11 | build 12 | debug 13 | Debug 14 | release 15 | Release 16 | x64 17 | *.so 18 | *.whl 19 | 20 | # VS project files 21 | *.sln 22 | *.vcxproj 23 | *.vcxproj.filters 24 | *.vcxproj.user 25 | *.rc 26 | .vs 27 | 28 | # Byte-compiled / optimized / DLL files 29 | *__pycache__* 30 | *.py[cod] 31 | *$py.class 32 | 33 | # Distribution / packaging 34 | .Python 35 | build 36 | develop-eggs 37 | dist 38 | downloads 39 | 40 | # IDE 41 | .idea 42 | .vscode 43 | pyrightconfig.json 44 | 45 | # Custom 46 | data 47 | outputs 48 | prediction 49 | submission 50 | checkpoints 51 | pretrain 52 | visualizations -------------------------------------------------------------------------------- /loaders/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compose_ego2img(ego2global_t, 5 | ego2global_r, 6 | sensor2global_t, 7 | sensor2global_r, 8 | cam_intrinsic): 9 | R = np.linalg.inv(sensor2global_r) @ ego2global_r 10 | # (ego2global_t - sensor2global_t) @ _inv(sensor2global_r).T 11 | # = (ego2global_t - sensor2global_t) @ sensor2global_r 12 | T = (ego2global_t - sensor2global_t) @ sensor2global_r 13 | 14 | ego2cam_rt = np.eye(4) 15 | ego2cam_rt[:3, :3] = R 16 | ego2cam_rt[:3, 3] = T.T 17 | 18 | viewpad = np.eye(4) 19 | viewpad[:cam_intrinsic.shape[0], :cam_intrinsic.shape[1]] = cam_intrinsic 20 | ego2img = (viewpad @ ego2cam_rt).astype(np.float32) 21 | 22 | return ego2img -------------------------------------------------------------------------------- /scripts/gen_fusion_pretrain_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import re 3 | 4 | lidar_ckpt = torch.load('pretrain/dal-tiny-map66.9-nds71.1.pth') 5 | img_ckpt = torch.load('pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth') 6 | 7 | 
lidar_dict = lidar_ckpt['state_dict'] 8 | img_dict = img_ckpt['state_dict'] 9 | 10 | lidar_prefix_keys_list=['pts_backbone', 'pts_middle_encoder', 'pts_neck'] 11 | for key in list(lidar_dict.keys()): 12 | flag=False 13 | for prefix in lidar_prefix_keys_list: 14 | if key.startswith(prefix): 15 | flag=True 16 | break 17 | if not flag: 18 | del lidar_dict[key] 19 | 20 | img_prefix_keys_list=['backbone'] 21 | for prefix in img_prefix_keys_list: 22 | for key in img_dict: 23 | if key.startswith(prefix): 24 | new_key=re.sub('backbone', 'img_backbone', key) 25 | lidar_dict[new_key] = img_dict[key] 26 | 27 | torch.save({'state_dict': lidar_dict}, 'pretrain/fusion_pretrain_model.pth') 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Multimedia Computing Group, Nanjing University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /models/csrc/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | def get_ext_modules(): 6 | return [ 7 | CUDAExtension( 8 | name='_msmv_sampling_cuda', 9 | sources=[ 10 | 'msmv_sampling/msmv_sampling.cpp', 11 | 'msmv_sampling/msmv_sampling_forward.cu', 12 | 'msmv_sampling/msmv_sampling_backward.cu' 13 | ], 14 | include_dirs=['msmv_sampling'], 15 | extra_compile_args=dict( 16 | nvcc=[ 17 | "-gencode=arch=compute_60,code=sm_60", 18 | "-gencode=arch=compute_61,code=sm_61", 19 | "-gencode=arch=compute_70,code=sm_70", 20 | "-gencode=arch=compute_75,code=sm_75", 21 | "-gencode=arch=compute_80,code=sm_80", 22 | "-gencode=arch=compute_86,code=sm_86", 23 | "-gencode=arch=compute_86,code=compute_86", 24 | ] 25 | ) 26 | ) 27 | ] 28 | 29 | 30 | setup( 31 | name='csrc', 32 | ext_modules=get_ext_modules(), 33 | cmdclass={'build_ext': BuildExtension} 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /models/backbones/eva02/fpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import fvcore.nn.weight_init as weight_init 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | 7 | def _assert_strides_are_log2_contiguous(strides): 8 | """ 9 | Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". 10 | """ 11 | for i, stride in enumerate(strides[1:], 1): 12 | assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format( 13 | stride, strides[i - 1] 14 | ) 15 | 16 | 17 | class LastLevelMaxPool(nn.Module): 18 | """ 19 | This module is used in the original FPN to generate a downsampled 20 | P6 feature from P5. 21 | """ 22 | 23 | def __init__(self): 24 | super().__init__() 25 | self.num_levels = 1 26 | self.in_feature = "p5" 27 | 28 | def forward(self, x): 29 | return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] 30 | 31 | 32 | class LastLevelP6P7(nn.Module): 33 | """ 34 | This module is used in RetinaNet to generate extra layers, P6 and P7 from 35 | C5 feature. 36 | """ 37 | 38 | def __init__(self, in_channels, out_channels, in_feature="res5"): 39 | super().__init__() 40 | self.num_levels = 2 41 | self.in_feature = in_feature 42 | self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) 43 | self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) 44 | for module in [self.p6, self.p7]: 45 | weight_init.c2_xavier_fill(module) 46 | 47 | def forward(self, c5): 48 | p6 = self.p6(c5) 49 | p7 = self.p7(F.relu(p6)) 50 | return [p6, p7] 51 | -------------------------------------------------------------------------------- /models/backbones/eva02/drop.py: -------------------------------------------------------------------------------- 1 | # https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py 2 | import torch.nn as nn 3 | 4 | 5 | def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): 6 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 7 | 8 | This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, 9 | the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... 10 | See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for 11 | changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 12 | 'survival rate' as the argument. 13 | 14 | """ 15 | if drop_prob == 0. or not training: 16 | return x 17 | keep_prob = 1 - drop_prob 18 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 19 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 20 | if keep_prob > 0.0 and scale_by_keep: 21 | random_tensor.div_(keep_prob) 22 | return x * random_tensor 23 | 24 | 25 | class DropPath(nn.Module): 26 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
27 | """ 28 | def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): 29 | super(DropPath, self).__init__() 30 | self.drop_prob = drop_prob 31 | self.scale_by_keep = scale_by_keep 32 | 33 | def forward(self, x): 34 | return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) 35 | 36 | def extra_repr(self): 37 | return f'drop_prob={round(self.drop_prob,3):0.3f}' -------------------------------------------------------------------------------- /models/csrc/msmv_sampling/msmv_sampling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | at::Tensor ms_deform_attn_cuda_c2345_forward( 6 | const at::Tensor& feat_c2, // [B, N, H, W, C] 7 | const at::Tensor& feat_c3, // [B, N, H, W, C] 8 | const at::Tensor& feat_c4, // [B, N, H, W, C] 9 | const at::Tensor& feat_c5, // [B, N, H, W, C] 10 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 11 | const at::Tensor& attn_weight // [B, Q, P, 4] 12 | ); 13 | 14 | std::vector ms_deform_attn_cuda_c2345_backward( 15 | const at::Tensor& feat_c2, // [B, N, H, W, C] 16 | const at::Tensor& feat_c3, // [B, N, H, W, C] 17 | const at::Tensor& feat_c4, // [B, N, H, W, C] 18 | const at::Tensor& feat_c5, // [B, N, H, W, C] 19 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 20 | const at::Tensor& attn_weight, // [B, Q, P, 4] 21 | const at::Tensor& grad_output 22 | ); 23 | 24 | at::Tensor ms_deform_attn_cuda_c23456_forward( 25 | const at::Tensor& feat_c2, // [B, N, H, W, C] 26 | const at::Tensor& feat_c3, // [B, N, H, W, C] 27 | const at::Tensor& feat_c4, // [B, N, H, W, C] 28 | const at::Tensor& feat_c5, // [B, N, H, W, C] 29 | const at::Tensor& feat_c6, // [B, N, H, W, C] 30 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 31 | const at::Tensor& attn_weight // [B, Q, P, 4] 32 | ); 33 | 34 | std::vector ms_deform_attn_cuda_c23456_backward( 35 | const at::Tensor& grad_output, 36 | const at::Tensor& feat_c2, // [B, N, H, W, C] 37 | const at::Tensor& feat_c3, // [B, N, H, W, C] 38 | const at::Tensor& feat_c4, // [B, N, H, W, C] 39 | const at::Tensor& feat_c5, // [B, N, H, W, C] 40 | const at::Tensor& feat_c6, // [B, N, H, W, C] 41 | const at::Tensor& sampling_loc, // [B, Q, P, 3] 42 | const at::Tensor& attn_weight // [B, Q, P, 4] 43 | ); -------------------------------------------------------------------------------- /loaders/builder.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from mmcv.parallel import collate 3 | from mmcv.runner import get_dist_info 4 | from torch.utils.data import DataLoader 5 | from mmdet.datasets.builder import worker_init_fn 6 | from mmdet.datasets.samplers import DistributedGroupSampler, DistributedSampler, GroupSampler 7 | 8 | 9 | def build_dataloader(dataset, 10 | samples_per_gpu, 11 | workers_per_gpu, 12 | num_gpus=1, 13 | dist=True, 14 | shuffle=True, 15 | seed=None, 16 | **kwargs): 17 | 18 | rank, world_size = get_dist_info() 19 | if dist: 20 | # DistributedGroupSampler will definitely shuffle the data to satisfy 21 | # that images on each GPU are in the same group 22 | if shuffle: 23 | sampler = DistributedGroupSampler( 24 | dataset, samples_per_gpu, world_size, rank, seed=seed) 25 | else: 26 | sampler = DistributedSampler( 27 | dataset, world_size, rank, shuffle=False, seed=seed) 28 | batch_size = samples_per_gpu 29 | num_workers = workers_per_gpu 30 | else: 31 | sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None 32 | batch_size = num_gpus * 
samples_per_gpu 33 | num_workers = num_gpus * workers_per_gpu 34 | 35 | init_fn = partial( 36 | worker_init_fn, num_workers=num_workers, rank=rank, 37 | seed=seed) if seed is not None else None 38 | 39 | data_loader = DataLoader( 40 | dataset, 41 | batch_size=batch_size, 42 | sampler=sampler, 43 | num_workers=num_workers, 44 | collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), 45 | pin_memory=False, 46 | worker_init_fn=init_fn, 47 | **kwargs) 48 | 49 | return data_loader 50 | -------------------------------------------------------------------------------- /models/bbox/match_costs/match_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mmdet.core.bbox.match_costs.builder import MATCH_COST 3 | 4 | 5 | @MATCH_COST.register_module() 6 | class BBox3DL1Cost(object): 7 | """BBox3DL1Cost. 8 | Args: 9 | weight (int | float, optional): loss_weight 10 | """ 11 | 12 | def __init__(self, weight=1.0): 13 | self.weight = weight 14 | 15 | def __call__(self, bbox_pred, gt_bboxes): 16 | """ 17 | Args: 18 | bbox_pred (Tensor): Predicted 3D boxes in the normalized encoding 19 | (cx, cy, w, l, cz, h, sin(rot), cos(rot)[, vx, vy]) produced by 20 | normalize_bbox. Shape [num_query, 8] or [num_query, 10]. 21 | gt_bboxes (Tensor): Ground truth boxes in the same normalized 22 | encoding. Shape [num_gt, 8] or [num_gt, 10]. 23 | Returns: 24 | torch.Tensor: bbox_cost value with weight 25 | """ 26 | bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) 27 | return bbox_cost * self.weight 28 | 29 | 30 | @MATCH_COST.register_module() 31 | class BBoxBEVL1Cost(object): 32 | def __init__(self, weight, pc_range): 33 | self.weight = weight 34 | self.pc_range = pc_range 35 | 36 | def __call__(self, bboxes, gt_bboxes): 37 | pc_start = bboxes.new(self.pc_range[0:2]) 38 | pc_range = bboxes.new(self.pc_range[3:5]) - bboxes.new(self.pc_range[0:2]) 39 | # normalize the box center to [0, 1] 40 | normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range 41 | normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range 42 | reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1) 43 | return reg_cost * self.weight 44 | 45 | 46 | @MATCH_COST.register_module() 47 | class IoU3DCost(object): 48 | def __init__(self, weight): 49 | self.weight = weight 50 | 51 | def __call__(self, iou): 52 | iou_cost = - iou 53 | return iou_cost * self.weight 54 | -------------------------------------------------------------------------------- /lib/dvr/dvr.cpp: -------------------------------------------------------------------------------- 1 | // Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting 2 | // Modified by Haisong Liu 3 | 4 | #include <torch/extension.h> 5 | #include <vector> 6 | #include <string> 7 | 8 | /* 9 | * CUDA forward declarations 10 | */ 11 | 12 | std::vector<torch::Tensor> render_forward_cuda(torch::Tensor sigma, 13 | torch::Tensor origin, 14 | torch::Tensor points, 15 | torch::Tensor tindex, 16 | const std::vector<int> grid, 17 | std::string phase_name); 18 | 19 | std::vector<torch::Tensor> 20 | render_cuda(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points, 21 | torch::Tensor tindex, std::string loss_name); 22 | 23 | torch::Tensor init_cuda(torch::Tensor points, torch::Tensor tindex, 24 | const std::vector<int> grid); 25 | 26 | 27 | /* 28 | * C++ interface 29 | */ 30 | 31 | #define CHECK_CUDA(x) \ 32 | TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 33 | #define CHECK_CONTIGUOUS(x) \ 34 | TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 35 | #define CHECK_INPUT(x) \ 36 | CHECK_CUDA(x); \ 37 | CHECK_CONTIGUOUS(x) 38 | 
39 | std::vector<torch::Tensor> 40 | render_forward(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points, 41 | torch::Tensor tindex, const std::vector<int> grid, 42 | std::string phase_name) { 43 | CHECK_INPUT(sigma); 44 | CHECK_INPUT(origin); 45 | CHECK_INPUT(points); 46 | CHECK_INPUT(tindex); 47 | return render_forward_cuda(sigma, origin, points, tindex, grid, phase_name); 48 | } 49 | 50 | 51 | std::vector<torch::Tensor> render(torch::Tensor sigma, torch::Tensor origin, 52 | torch::Tensor points, torch::Tensor tindex, 53 | std::string loss_name) { 54 | CHECK_INPUT(sigma); 55 | CHECK_INPUT(origin); 56 | CHECK_INPUT(points); 57 | CHECK_INPUT(tindex); 58 | return render_cuda(sigma, origin, points, tindex, loss_name); 59 | } 60 | 61 | torch::Tensor init(torch::Tensor points, torch::Tensor tindex, 62 | const std::vector<int> grid) { 63 | CHECK_INPUT(points); 64 | CHECK_INPUT(tindex); 65 | return init_cuda(points, tindex, grid); 66 | } 67 | 68 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 69 | m.def("init", &init, "Initialize"); 70 | m.def("render", &render, "Render"); 71 | m.def("render_forward", &render_forward, "Render (forward pass only)"); 72 | } 73 | -------------------------------------------------------------------------------- /models/backbones/eva02/main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.nn as nn 4 | from mmcv.runner.checkpoint import load_state_dict 5 | from mmdet.models.builder import BACKBONES 6 | from .vit import ViT, SimpleFeaturePyramid, partial 7 | from .fpn import LastLevelMaxPool 8 | 9 | 10 | @BACKBONES.register_module() 11 | class EVA02(nn.Module): 12 | def __init__( 13 | self, 14 | # args for ViT 15 | img_size=1024, 16 | real_img_size=(256, 704), 17 | patch_size=16, 18 | in_chans=3, 19 | embed_dim=768, 20 | depth=12, 21 | num_heads=12, 22 | mlp_ratio=4*2/3, 23 | qkv_bias=True, 24 | drop_path_rate=0.0, 25 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 26 | use_abs_pos=True, 27 | pt_hw_seq_len=16, 28 | intp_freq=True, 29 | window_size=0, 30 | window_block_indexes=(), 31 | residual_block_indexes=(), 32 | use_act_checkpoint=False, 33 | pretrain_img_size=224, 34 | pretrain_use_cls_token=True, 35 | out_feature="last_feat", 36 | xattn=False, 37 | frozen_blocks=-1, 38 | # args for simple FPN 39 | fpn_in_feature="last_feat", 40 | fpn_out_channels=256, 41 | fpn_scale_factors=(4.0, 2.0, 1.0, 0.5), 42 | fpn_top_block=False, 43 | fpn_norm="LN", 44 | fpn_square_pad=0, 45 | pretrained=None 46 | ): 47 | super().__init__() 48 | 49 | self.backbone = SimpleFeaturePyramid( 50 | ViT( 51 | img_size=img_size, 52 | real_img_size=real_img_size, 53 | patch_size=patch_size, 54 | in_chans=in_chans, 55 | embed_dim=embed_dim, 56 | depth=depth, 57 | num_heads=num_heads, 58 | mlp_ratio=mlp_ratio, 59 | qkv_bias=qkv_bias, 60 | drop_path_rate=drop_path_rate, 61 | norm_layer=norm_layer, 62 | use_abs_pos=use_abs_pos, 63 | pt_hw_seq_len=pt_hw_seq_len, 64 | intp_freq=intp_freq, 65 | window_size=window_size, 66 | window_block_indexes=window_block_indexes, 67 | residual_block_indexes=residual_block_indexes, 68 | use_act_checkpoint=use_act_checkpoint, 69 | pretrain_img_size=pretrain_img_size, 70 | pretrain_use_cls_token=pretrain_use_cls_token, 71 | out_feature=out_feature, 72 | xattn=xattn, 73 | frozen_blocks=frozen_blocks, 74 | ), 75 | in_feature=fpn_in_feature, 76 | out_channels=fpn_out_channels, 77 | scale_factors=fpn_scale_factors, 78 | top_block=LastLevelMaxPool() if fpn_top_block else None, 79 | norm=fpn_norm, 80 | 
square_pad=fpn_square_pad, 81 | ) 82 | self.init_weights(pretrained) 83 | 84 | def init_weights(self, pretrained=None): 85 | if pretrained is None: 86 | return 87 | logging.info('Loading pretrained weights from %s' % pretrained) 88 | state_dict = torch.load(pretrained)['model'] 89 | load_state_dict(self, state_dict, strict=False) 90 | 91 | def forward(self, x): 92 | outs = self.backbone(x) 93 | return list(outs.values()) 94 | -------------------------------------------------------------------------------- /models/bbox/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def normalize_bbox(bboxes): 5 | cx = bboxes[..., 0:1] 6 | cy = bboxes[..., 1:2] 7 | cz = bboxes[..., 2:3] 8 | w = bboxes[..., 3:4].log() 9 | l = bboxes[..., 4:5].log() 10 | h = bboxes[..., 5:6].log() 11 | rot = bboxes[..., 6:7] 12 | 13 | if bboxes.size(-1) > 7: 14 | vx = bboxes[..., 7:8] 15 | vy = bboxes[..., 8:9] 16 | out = torch.cat([cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy], dim=-1) 17 | else: 18 | out = torch.cat([cx, cy, w, l, cz, h, rot.sin(), rot.cos()], dim=-1) 19 | 20 | return out 21 | 22 | 23 | def denormalize_bbox(normalized_bboxes): 24 | rot_sin = normalized_bboxes[..., 6:7] 25 | rot_cos = normalized_bboxes[..., 7:8] 26 | rot = torch.atan2(rot_sin, rot_cos) 27 | 28 | cx = normalized_bboxes[..., 0:1] 29 | cy = normalized_bboxes[..., 1:2] 30 | cz = normalized_bboxes[..., 4:5] 31 | 32 | w = normalized_bboxes[..., 2:3].exp() 33 | l = normalized_bboxes[..., 3:4].exp() 34 | h = normalized_bboxes[..., 5:6].exp() 35 | 36 | if normalized_bboxes.size(-1) > 8: 37 | vx = normalized_bboxes[..., 8:9] 38 | vy = normalized_bboxes[..., 9:10] 39 | out = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) 40 | else: 41 | out = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) 42 | 43 | return out 44 | 45 | 46 | def encode_bbox(bboxes, pc_range=None): 47 | xyz = bboxes[..., 0:3].clone() 48 | wlh = bboxes[..., 3:6].log() 49 | rot = bboxes[..., 6:7] 50 | 51 | if pc_range is not None: 52 | xyz[..., 0] = (xyz[..., 0] - pc_range[0]) / (pc_range[3] - pc_range[0]) 53 | xyz[..., 1] = (xyz[..., 1] - pc_range[1]) / (pc_range[4] - pc_range[1]) 54 | xyz[..., 2] = (xyz[..., 2] - pc_range[2]) / (pc_range[5] - pc_range[2]) 55 | 56 | if bboxes.shape[-1] > 7: 57 | vel = bboxes[..., 7:9].clone() 58 | return torch.cat([xyz, wlh, rot.sin(), rot.cos(), vel], dim=-1) 59 | else: 60 | return torch.cat([xyz, wlh, rot.sin(), rot.cos()], dim=-1) 61 | 62 | 63 | def decode_bbox(bboxes, pc_range=None): 64 | xyz = bboxes[..., 0:3].clone() 65 | wlh = bboxes[..., 3:6].exp() 66 | rot = torch.atan2(bboxes[..., 6:7], bboxes[..., 7:8]) 67 | 68 | if pc_range is not None: 69 | xyz[..., 0] = xyz[..., 0] * (pc_range[3] - pc_range[0]) + pc_range[0] 70 | xyz[..., 1] = xyz[..., 1] * (pc_range[4] - pc_range[1]) + pc_range[1] 71 | xyz[..., 2] = xyz[..., 2] * (pc_range[5] - pc_range[2]) + pc_range[2] 72 | 73 | if bboxes.shape[-1] > 8: 74 | vel = bboxes[..., 8:10].clone() 75 | return torch.cat([xyz, wlh, rot, vel], dim=-1) 76 | else: 77 | return torch.cat([xyz, wlh, rot], dim=-1) 78 | 79 | 80 | def encode_points(points, pc_range=None): 81 | points = points.clone() 82 | points[..., 0] = (points[..., 0] - pc_range[0]) / (pc_range[3] - pc_range[0]) 83 | points[..., 1] = (points[..., 1] - pc_range[1]) / (pc_range[4] - pc_range[1]) 84 | points[..., 2] = (points[..., 2] - pc_range[2]) / (pc_range[5] - pc_range[2]) 85 | return points 86 | 87 | 88 | def decode_points(points, pc_range=None): 89 | points = 
points.clone() 90 | points[..., 0] = points[..., 0] * (pc_range[3] - pc_range[0]) + pc_range[0] 91 | points[..., 1] = points[..., 1] * (pc_range[4] - pc_range[1]) + pc_range[1] 92 | points[..., 2] = points[..., 2] * (pc_range[5] - pc_range[2]) + pc_range[2] 93 | return points 94 | -------------------------------------------------------------------------------- /models/backbones/eva02/backbone.py: -------------------------------------------------------------------------------- 1 | # https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | 8 | @dataclass 9 | class ShapeSpec: 10 | """ 11 | A simple structure that contains basic shape specification about a tensor. 12 | It is often used as the auxiliary inputs/outputs of models, 13 | to complement the lack of shape inference ability among pytorch modules. 14 | """ 15 | 16 | channels: Optional[int] = None 17 | height: Optional[int] = None 18 | width: Optional[int] = None 19 | stride: Optional[int] = None 20 | 21 | 22 | # Copyright (c) Facebook, Inc. and its affiliates. 23 | from abc import ABCMeta, abstractmethod 24 | from typing import Dict 25 | import torch.nn as nn 26 | 27 | 28 | __all__ = ["Backbone"] 29 | 30 | 31 | class Backbone(nn.Module, metaclass=ABCMeta): 32 | """ 33 | Abstract base class for network backbones. 34 | """ 35 | 36 | def __init__(self): 37 | """ 38 | The `__init__` method of any subclass can specify its own set of arguments. 39 | """ 40 | super().__init__() 41 | 42 | @abstractmethod 43 | def forward(self): 44 | """ 45 | Subclasses must override this method, but adhere to the same return type. 46 | 47 | Returns: 48 | dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor 49 | """ 50 | pass 51 | 52 | @property 53 | def size_divisibility(self) -> int: 54 | """ 55 | Some backbones require the input height and width to be divisible by a 56 | specific integer. This is typically true for encoder / decoder type networks 57 | with lateral connection (e.g., FPN) for which feature maps need to match 58 | dimension in the "bottom up" and "top down" paths. Set to 0 if no specific 59 | input size divisibility is required. 60 | """ 61 | return 0 62 | 63 | @property 64 | def padding_constraints(self) -> Dict[str, int]: 65 | """ 66 | This property is a generalization of size_divisibility. Some backbones and training 67 | recipes require specific padding constraints, such as enforcing divisibility by a specific 68 | integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter 69 | in :paper:vitdet). `padding_constraints` contains these optional items like: 70 | { 71 | "size_divisibility": int, 72 | "square_size": int, 73 | # Future options are possible 74 | } 75 | `size_divisibility` will read from here if presented and `square_size` indicates the 76 | square padding size if `square_size` > 0. 77 | 78 | TODO: use type of Dict[str, int] to avoid torchscipt issues. The type of padding_constraints 79 | could be generalized as TypedDict (Python 3.8+) to support more types in the future. 
80 | """ 81 | return {} 82 | 83 | def output_shape(self): 84 | """ 85 | Returns: 86 | dict[str->ShapeSpec] 87 | """ 88 | # this is a backward-compatible default 89 | return { 90 | name: ShapeSpec( 91 | channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] 92 | ) 93 | for name in self._out_features 94 | } -------------------------------------------------------------------------------- /models/backbones/eva02/blocks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | import fvcore.nn.weight_init as weight_init 5 | from torch import nn 6 | 7 | from .batch_norm import FrozenBatchNorm2d, get_norm 8 | from .wrappers import Conv2d 9 | 10 | 11 | """ 12 | CNN building blocks. 13 | """ 14 | 15 | 16 | class CNNBlockBase(nn.Module): 17 | """ 18 | A CNN block is assumed to have input channels, output channels and a stride. 19 | The input and output of `forward()` method must be NCHW tensors. 20 | The method can perform arbitrary computation but must match the given 21 | channels and stride specification. 22 | 23 | Attribute: 24 | in_channels (int): 25 | out_channels (int): 26 | stride (int): 27 | """ 28 | 29 | def __init__(self, in_channels, out_channels, stride): 30 | """ 31 | The `__init__` method of any subclass should also contain these arguments. 32 | 33 | Args: 34 | in_channels (int): 35 | out_channels (int): 36 | stride (int): 37 | """ 38 | super().__init__() 39 | self.in_channels = in_channels 40 | self.out_channels = out_channels 41 | self.stride = stride 42 | 43 | def freeze(self): 44 | """ 45 | Make this block not trainable. 46 | This method sets all parameters to `requires_grad=False`, 47 | and convert all BatchNorm layers to FrozenBatchNorm 48 | 49 | Returns: 50 | the block itself 51 | """ 52 | for p in self.parameters(): 53 | p.requires_grad = False 54 | FrozenBatchNorm2d.convert_frozen_batchnorm(self) 55 | return self 56 | 57 | 58 | class DepthwiseSeparableConv2d(nn.Module): 59 | """ 60 | A kxk depthwise convolution + a 1x1 convolution. 61 | 62 | In :paper:`xception`, norm & activation are applied on the second conv. 63 | :paper:`mobilenet` uses norm & activation on both convs. 64 | """ 65 | 66 | def __init__( 67 | self, 68 | in_channels, 69 | out_channels, 70 | kernel_size=3, 71 | padding=1, 72 | dilation=1, 73 | *, 74 | norm1=None, 75 | activation1=None, 76 | norm2=None, 77 | activation2=None, 78 | ): 79 | """ 80 | Args: 81 | norm1, norm2 (str or callable): normalization for the two conv layers. 82 | activation1, activation2 (callable(Tensor) -> Tensor): activation 83 | function for the two conv layers. 
84 | """ 85 | super().__init__() 86 | self.depthwise = Conv2d( 87 | in_channels, 88 | in_channels, 89 | kernel_size=kernel_size, 90 | padding=padding, 91 | dilation=dilation, 92 | groups=in_channels, 93 | bias=not norm1, 94 | norm=get_norm(norm1, in_channels), 95 | activation=activation1, 96 | ) 97 | self.pointwise = Conv2d( 98 | in_channels, 99 | out_channels, 100 | kernel_size=1, 101 | bias=not norm2, 102 | norm=get_norm(norm2, out_channels), 103 | activation=activation2, 104 | ) 105 | 106 | # default initialization 107 | weight_init.c2_msra_fill(self.depthwise) 108 | weight_init.c2_msra_fill(self.pointwise) 109 | 110 | def forward(self, x): 111 | return self.pointwise(self.depthwise(x)) 112 | -------------------------------------------------------------------------------- /loaders/nuscenes_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from mmdet.datasets import DATASETS 4 | from mmdet3d.datasets import NuScenesDataset 5 | from pyquaternion import Quaternion 6 | 7 | 8 | @DATASETS.register_module() 9 | class CustomNuScenesDataset(NuScenesDataset): 10 | 11 | def collect_sweeps(self, index, into_past=60, into_future=60): 12 | all_sweeps_prev = [] 13 | curr_index = index 14 | while len(all_sweeps_prev) < into_past: 15 | curr_sweeps = self.data_infos[curr_index]['sweeps'] 16 | if len(curr_sweeps) == 0: 17 | break 18 | all_sweeps_prev.extend(curr_sweeps) 19 | all_sweeps_prev.append(self.data_infos[curr_index - 1]['cams']) 20 | curr_index = curr_index - 1 21 | 22 | all_sweeps_next = [] 23 | curr_index = index + 1 24 | while len(all_sweeps_next) < into_future: 25 | if curr_index >= len(self.data_infos): 26 | break 27 | curr_sweeps = self.data_infos[curr_index]['sweeps'] 28 | all_sweeps_next.extend(curr_sweeps[::-1]) 29 | all_sweeps_next.append(self.data_infos[curr_index]['cams']) 30 | curr_index = curr_index + 1 31 | 32 | return all_sweeps_prev, all_sweeps_next 33 | 34 | def get_data_info(self, index): 35 | info = self.data_infos[index] 36 | sweeps_prev, sweeps_next = self.collect_sweeps(index) 37 | 38 | ego2global_translation = info['ego2global_translation'] 39 | ego2global_rotation = info['ego2global_rotation'] 40 | lidar2ego_translation = info['lidar2ego_translation'] 41 | lidar2ego_rotation = info['lidar2ego_rotation'] 42 | ego2global_rotation = Quaternion(ego2global_rotation).rotation_matrix 43 | lidar2ego_rotation = Quaternion(lidar2ego_rotation).rotation_matrix 44 | 45 | input_dict = dict( 46 | sample_idx=info['token'], 47 | sweeps={'prev': sweeps_prev, 'next': sweeps_next}, 48 | timestamp=info['timestamp'] / 1e6, 49 | ego2global_translation=ego2global_translation, 50 | ego2global_rotation=ego2global_rotation, 51 | lidar2ego_translation=lidar2ego_translation, 52 | lidar2ego_rotation=lidar2ego_rotation, 53 | ) 54 | 55 | if self.modality['use_camera']: 56 | img_paths = [] 57 | img_timestamps = [] 58 | lidar2img_rts = [] 59 | 60 | for _, cam_info in info['cams'].items(): 61 | img_paths.append(os.path.relpath(cam_info['data_path'])) 62 | img_timestamps.append(cam_info['timestamp'] / 1e6) 63 | 64 | # obtain lidar to image transformation matrix 65 | lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) 66 | lidar2cam_t = cam_info['sensor2lidar_translation'] @ lidar2cam_r.T 67 | 68 | lidar2cam_rt = np.eye(4) 69 | lidar2cam_rt[:3, :3] = lidar2cam_r.T 70 | lidar2cam_rt[3, :3] = -lidar2cam_t 71 | 72 | intrinsic = cam_info['cam_intrinsic'] 73 | viewpad = np.eye(4) 74 | viewpad[:intrinsic.shape[0], 
:intrinsic.shape[1]] = intrinsic 75 | lidar2img_rt = (viewpad @ lidar2cam_rt.T) 76 | lidar2img_rts.append(lidar2img_rt) 77 | 78 | input_dict.update(dict( 79 | img_filename=img_paths, 80 | img_timestamp=img_timestamps, 81 | lidar2img=lidar2img_rts, 82 | )) 83 | 84 | if not self.test_mode: 85 | annos = self.get_ann_info(index) 86 | input_dict['ann_info'] = annos 87 | 88 | return input_dict 89 | -------------------------------------------------------------------------------- /loaders/ego_pose_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from pyquaternion import Quaternion 4 | from torch.utils.data import Dataset 5 | np.set_printoptions(precision=3, suppress=True) 6 | 7 | 8 | def trans_matrix(T, R): 9 | tm = np.eye(4) 10 | tm[:3, :3] = R.rotation_matrix 11 | tm[:3, 3] = T 12 | return tm 13 | 14 | 15 | class EgoPoseDataset(Dataset): 16 | def __init__(self, data_infos): 17 | super(EgoPoseDataset, self).__init__() 18 | 19 | self.data_infos = data_infos 20 | self.scene_frames = {} 21 | 22 | for info in data_infos: 23 | scene_token = self.get_scene_token(info) 24 | if scene_token not in self.scene_frames: 25 | self.scene_frames[scene_token] = [] 26 | self.scene_frames[scene_token].append(info) 27 | 28 | def __len__(self): 29 | return len(self.data_infos) 30 | 31 | def get_scene_token(self, info): 32 | if 'scene_token' in info: 33 | scene_name = info['scene_token'] 34 | elif 'scene_name' in info: 35 | scene_name = info['scene_name'] 36 | else: 37 | scene_name = info['occ_path'].split('occupancy/')[-1].split('/')[0] 38 | return scene_name 39 | 40 | def get_ego_from_lidar(self, info): 41 | ego_from_lidar = trans_matrix( 42 | np.array(info['lidar2ego_translation']), 43 | Quaternion(info['lidar2ego_rotation'])) 44 | return ego_from_lidar 45 | 46 | def get_global_pose(self, info, inverse=False): 47 | global_from_ego = trans_matrix( 48 | np.array(info['ego2global_translation']), 49 | Quaternion(info['ego2global_rotation'])) 50 | ego_from_lidar = trans_matrix( 51 | np.array(info['lidar2ego_translation']), 52 | Quaternion(info['lidar2ego_rotation'])) 53 | pose = global_from_ego.dot(ego_from_lidar) 54 | if inverse: 55 | pose = np.linalg.inv(pose) 56 | return pose 57 | 58 | def __getitem__(self, idx): 59 | info = self.data_infos[idx] 60 | 61 | ref_sample_token = info['token'] 62 | ref_lidar_from_global = self.get_global_pose(info, inverse=True) 63 | ref_ego_from_lidar = self.get_ego_from_lidar(info) 64 | 65 | scene_token = self.get_scene_token(info) 66 | scene_frame = self.scene_frames[scene_token] 67 | ref_index = scene_frame.index(info) 68 | 69 | # NOTE: getting output frames 70 | output_origin_list = [] 71 | for curr_index in range(len(scene_frame)): 72 | # if this exists a valid target 73 | if curr_index == ref_index: 74 | origin_tf = np.array([0.0, 0.0, 0.0], dtype=np.float32) 75 | else: 76 | # transform from the current lidar frame to global and then to the reference lidar frame 77 | global_from_curr = self.get_global_pose(scene_frame[curr_index], inverse=False) 78 | ref_from_curr = ref_lidar_from_global.dot(global_from_curr) 79 | origin_tf = np.array(ref_from_curr[:3, 3], dtype=np.float32) 80 | 81 | origin_tf_pad = np.ones([4]) 82 | origin_tf_pad[:3] = origin_tf # pad to [4] 83 | origin_tf = np.dot(ref_ego_from_lidar[:3], origin_tf_pad.T).T # [3] 84 | 85 | # origin 86 | if np.abs(origin_tf[0]) < 39 and np.abs(origin_tf[1]) < 39: 87 | output_origin_list.append(origin_tf) 88 | 89 | # select 8 origins 90 | if 
len(output_origin_list) > 8: 91 | select_idx = np.round(np.linspace(0, len(output_origin_list) - 1, 8)).astype(np.int64) 92 | output_origin_list = [output_origin_list[i] for i in select_idx] 93 | 94 | output_origin_tensor = torch.from_numpy(np.stack(output_origin_list)) # [T, 3] 95 | 96 | return (ref_sample_token, output_origin_tensor) 97 | -------------------------------------------------------------------------------- /models/bbox/assigners/hungarian_assigner_3d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mmdet.core.bbox.builder import BBOX_ASSIGNERS 4 | from mmdet.core.bbox.assigners import AssignResult 5 | from mmdet.core.bbox.assigners import BaseAssigner 6 | from mmdet.core.bbox.match_costs import build_match_cost 7 | from ..utils import normalize_bbox 8 | 9 | try: 10 | from scipy.optimize import linear_sum_assignment 11 | except ImportError: 12 | linear_sum_assignment = None 13 | 14 | 15 | @BBOX_ASSIGNERS.register_module() 16 | class HungarianAssigner3D(BaseAssigner): 17 | def __init__(self, 18 | cls_cost=dict(type='ClassificationCost', weight=1.), 19 | reg_cost=dict(type='BBoxL1Cost', weight=1.0), 20 | iou_cost=dict(type='IoUCost', weight=0.0), 21 | pc_range=None): 22 | self.cls_cost = build_match_cost(cls_cost) 23 | self.reg_cost = build_match_cost(reg_cost) 24 | self.iou_cost = build_match_cost(iou_cost) 25 | self.pc_range = pc_range 26 | 27 | def assign(self, 28 | bbox_pred, 29 | cls_pred, 30 | gt_bboxes, 31 | gt_labels, 32 | gt_bboxes_ignore=None, 33 | code_weights=None, 34 | with_velo=False): 35 | assert gt_bboxes_ignore is None, \ 36 | 'Only case when gt_bboxes_ignore is None is supported.' 37 | num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) 38 | 39 | # 1. assign -1 by default 40 | assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), 41 | -1, 42 | dtype=torch.long) 43 | assigned_labels = bbox_pred.new_full((num_bboxes, ), 44 | -1, 45 | dtype=torch.long) 46 | if num_gts == 0 or num_bboxes == 0: 47 | # No ground truth or boxes, return empty assignment 48 | if num_gts == 0: 49 | # No ground truth, assign all to background 50 | assigned_gt_inds[:] = 0 51 | return AssignResult( 52 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 53 | 54 | # 2. compute the weighted costs 55 | # classification and bboxcost. 56 | cls_cost = self.cls_cost(cls_pred, gt_labels) 57 | # regression L1 cost 58 | normalized_gt_bboxes = normalize_bbox(gt_bboxes) 59 | 60 | if code_weights is not None: 61 | bbox_pred = bbox_pred * code_weights 62 | normalized_gt_bboxes = normalized_gt_bboxes * code_weights 63 | 64 | if with_velo: 65 | reg_cost = self.reg_cost(bbox_pred, normalized_gt_bboxes) 66 | else: 67 | reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) 68 | 69 | # weighted sum of above two costs 70 | cost = cls_cost + reg_cost 71 | 72 | # 3. do Hungarian matching on CPU using linear_sum_assignment 73 | cost = cost.detach().cpu() 74 | cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0) 75 | 76 | if linear_sum_assignment is None: 77 | raise ImportError('Please run "pip install scipy" ' 78 | 'to install scipy first.') 79 | 80 | matched_row_inds, matched_col_inds = linear_sum_assignment(cost) 81 | matched_row_inds = torch.from_numpy(matched_row_inds).to( 82 | bbox_pred.device) 83 | matched_col_inds = torch.from_numpy(matched_col_inds).to( 84 | bbox_pred.device) 85 | 86 | # 4. 
assign backgrounds and foregrounds 87 | # assign all indices to backgrounds first 88 | assigned_gt_inds[:] = 0 89 | # assign foregrounds based on matching results 90 | assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 91 | assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] 92 | return AssignResult( 93 | num_gts, assigned_gt_inds, None, labels=assigned_labels) 94 | -------------------------------------------------------------------------------- /models/csrc/wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | try: 5 | from ._msmv_sampling_cuda import _ms_deform_attn_cuda_c2345_forward, _ms_deform_attn_cuda_c2345_backward 6 | from ._msmv_sampling_cuda import _ms_deform_attn_cuda_c23456_forward, _ms_deform_attn_cuda_c23456_backward 7 | MSMV_CUDA = True 8 | except ImportError as e: 9 | print('Warning: failed to load one or more CUDA extensions, performance may be hurt.') 10 | print('Error message:', e) 11 | MSMV_CUDA = False 12 | 13 | 14 | def msmv_sampling_pytorch(mlvl_feats, sampling_locations, scale_weights): 15 | """ 16 | value: [B, N, H1W1 + H2W2..., C] 17 | sampling_locations: [B, Q, P, 3] 18 | scale_weights: [B, Q, P, 4] 19 | """ 20 | assert scale_weights.shape[-1] == len(mlvl_feats) 21 | 22 | B, C, _, _, _ = mlvl_feats[0].shape 23 | _, Q, P, _ = sampling_locations.shape 24 | 25 | sampling_locations = sampling_locations * 2 - 1 26 | sampling_locations = sampling_locations[:, :, :, None, :] # [B, Q, P, 1, 3] 27 | 28 | final = torch.zeros([B, C, Q, P], device=mlvl_feats[0].device) 29 | 30 | for lvl, feat in enumerate(mlvl_feats): 31 | out = F.grid_sample( 32 | feat, sampling_locations, mode='bilinear', 33 | padding_mode='zeros', align_corners=True, 34 | )[..., 0] # [B, C, Q, P] 35 | out = out * scale_weights[..., lvl].reshape(B, 1, Q, P) 36 | final += out 37 | 38 | return final.permute(0, 2, 1, 3) 39 | 40 | 41 | class MSMVSamplingC2345(torch.autograd.Function): 42 | @staticmethod 43 | def forward(ctx, feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights): 44 | ctx.save_for_backward(feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights) 45 | 46 | assert callable(_ms_deform_attn_cuda_c2345_forward) 47 | return _ms_deform_attn_cuda_c2345_forward( 48 | feat_c2, feat_c3, feat_c4, feat_c5, 49 | sampling_locations, scale_weights) 50 | 51 | @staticmethod 52 | def backward(ctx, grad_output): 53 | feat_c2, feat_c3, feat_c4, feat_c5, sampling_locations, scale_weights = ctx.saved_tensors 54 | 55 | assert callable(_ms_deform_attn_cuda_c2345_backward) 56 | grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight = _ms_deform_attn_cuda_c2345_backward(grad_output.contiguous(), 57 | feat_c2, feat_c3, feat_c4, feat_c5, 58 | sampling_locations, scale_weights 59 | ) 60 | 61 | return grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_sampling_loc, grad_attn_weight 62 | 63 | 64 | class MSMVSamplingC23456(torch.autograd.Function): 65 | @staticmethod 66 | def forward(ctx, feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights): 67 | ctx.save_for_backward(feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights) 68 | 69 | assert callable(_ms_deform_attn_cuda_c23456_forward) 70 | return _ms_deform_attn_cuda_c23456_forward( 71 | feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, 72 | sampling_locations, scale_weights) 73 | 74 | @staticmethod 75 | def backward(ctx, 
grad_output): 76 | feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, sampling_locations, scale_weights = ctx.saved_tensors 77 | 78 | assert callable(_ms_deform_attn_cuda_c23456_backward) 79 | grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight = _ms_deform_attn_cuda_c23456_backward(grad_output.contiguous(), 80 | feat_c2, feat_c3, feat_c4, feat_c5, feat_c6, 81 | sampling_locations, scale_weights 82 | ) 83 | 84 | return grad_value_c2, grad_value_c3, grad_value_c4, grad_value_c5, grad_value_c6, grad_sampling_loc, grad_attn_weight 85 | 86 | 87 | def msmv_sampling(mlvl_feats, sampling_locations, scale_weights): 88 | if len(mlvl_feats) == 4 and MSMV_CUDA: 89 | return MSMVSamplingC2345.apply(*mlvl_feats, sampling_locations, scale_weights) 90 | elif len(mlvl_feats) == 5 and MSMV_CUDA: 91 | return MSMVSamplingC23456.apply(*mlvl_feats, sampling_locations, scale_weights) 92 | else: 93 | return msmv_sampling_pytorch(mlvl_feats, sampling_locations, scale_weights) 94 | -------------------------------------------------------------------------------- /scripts/timing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 4 | sys.path.insert(0, path) 5 | 6 | import time 7 | import logging 8 | import argparse 9 | import importlib 10 | import torch 11 | import torch.distributed 12 | import torch.backends.cudnn as cudnn 13 | from mmcv import Config, DictAction 14 | from mmcv.parallel import MMDataParallel 15 | from mmcv.runner import load_checkpoint 16 | from mmdet.apis import set_random_seed 17 | from mmdet3d.datasets import build_dataset, build_dataloader 18 | from mmdet3d.models import build_model 19 | 20 | 21 | def init_logging(filename=None, debug=False): 22 | logging.root = logging.RootLogger('DEBUG' if debug else 'INFO') 23 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s') 24 | 25 | stream_handler = logging.StreamHandler(sys.stdout) 26 | stream_handler.setFormatter(formatter) 27 | logging.root.addHandler(stream_handler) 28 | 29 | if filename is not None: 30 | file_handler = logging.FileHandler(filename) 31 | file_handler.setFormatter(formatter) 32 | logging.root.addHandler(file_handler) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser(description='Validate a detector') 37 | parser.add_argument('--config', required=True) 38 | parser.add_argument('--weights', required=True) 39 | parser.add_argument('--num_warmup', default=10) 40 | parser.add_argument('--samples', default=500) 41 | parser.add_argument('--log-interval', default=50, help='interval of logging') 42 | parser.add_argument('--override', nargs='+', action=DictAction) 43 | args = parser.parse_args() 44 | 45 | # parse configs 46 | cfgs = Config.fromfile(args.config) 47 | if args.override is not None: 48 | cfgs.merge_from_dict(args.override) 49 | 50 | # register custom module 51 | importlib.import_module('models') 52 | importlib.import_module('loaders') 53 | 54 | # MMCV, please shut up 55 | from mmcv.utils.logging import logger_initialized 56 | logger_initialized['root'] = logging.Logger(__name__, logging.WARNING) 57 | logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING) 58 | init_logging(None, cfgs.debug) 59 | 60 | # you need GPUs 61 | assert torch.cuda.is_available() and torch.cuda.device_count() == 1 62 | logging.info('Using GPU: %s' % torch.cuda.get_device_name(0)) 63 | torch.cuda.set_device(0) 64 | 65 
| logging.info('Setting random seed: 0') 66 | set_random_seed(0, deterministic=True) 67 | cudnn.benchmark = True 68 | 69 | logging.info('Loading validation set from %s' % cfgs.data.val.data_root) 70 | val_dataset = build_dataset(cfgs.data.val) 71 | val_loader = build_dataloader( 72 | val_dataset, 73 | samples_per_gpu=1, 74 | workers_per_gpu=cfgs.data.workers_per_gpu, 75 | num_gpus=1, 76 | dist=False, 77 | shuffle=False, 78 | seed=0, 79 | ) 80 | 81 | logging.info('Creating model: %s' % cfgs.model.type) 82 | model = build_model(cfgs.model) 83 | model.cuda() 84 | 85 | assert torch.cuda.device_count() == 1 86 | model = MMDataParallel(model, [0]) 87 | 88 | logging.info('Loading checkpoint from %s' % args.weights) 89 | load_checkpoint( 90 | model, args.weights, map_location='cuda', strict=False, 91 | logger=logging.Logger(__name__, logging.ERROR) 92 | ) 93 | model.eval() 94 | 95 | pure_inf_time = 0 96 | with torch.no_grad(): 97 | for i, data in enumerate(val_loader): 98 | torch.cuda.synchronize() 99 | start_time = time.perf_counter() 100 | 101 | model(return_loss=False, rescale=True, **data) 102 | 103 | torch.cuda.synchronize() 104 | elapsed = time.perf_counter() - start_time 105 | 106 | if i >= args.num_warmup: 107 | pure_inf_time += elapsed 108 | if (i + 1) % args.log_interval == 0: 109 | fps = (i + 1 - args.num_warmup) / pure_inf_time 110 | print(f'Done sample [{i + 1:<3}/ {args.samples}], ' 111 | f'fps: {fps:.1f} sample / s') 112 | 113 | if (i + 1) == args.samples: 114 | break 115 | 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /models/bbox/coders/nms_free_coder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mmdet.core.bbox import BaseBBoxCoder 4 | from mmdet.core.bbox.builder import BBOX_CODERS 5 | from ..utils import denormalize_bbox 6 | 7 | 8 | @BBOX_CODERS.register_module() 9 | class NMSFreeCoder(BaseBBoxCoder): 10 | """Bbox coder for NMS-free detector. 11 | Args: 12 | pc_range (list[float]): Range of point cloud. 13 | post_center_range (list[float]): Limit of the center. 14 | Default: None. 15 | max_num (int): Max number to be kept. Default: 100. 16 | score_threshold (float): Threshold to filter boxes based on score. 17 | Default: None. 18 | code_size (int): Code size of bboxes. Default: 9 19 | """ 20 | def __init__(self, 21 | pc_range, 22 | voxel_size=None, 23 | post_center_range=None, 24 | max_num=100, 25 | score_threshold=None, 26 | num_classes=10): 27 | self.pc_range = pc_range 28 | self.voxel_size = voxel_size 29 | self.post_center_range = post_center_range 30 | self.max_num = max_num 31 | self.score_threshold = score_threshold 32 | self.num_classes = num_classes 33 | 34 | def encode(self): 35 | pass 36 | 37 | def decode_single(self, cls_scores, bbox_preds): 38 | """Decode bboxes. 39 | Args: 40 | cls_scores (Tensor): Outputs from the classification head, \ 41 | shape [num_query, cls_out_channels]. Note \ 42 | cls_out_channels should includes background. 43 | bbox_preds (Tensor): Outputs from the regression \ 44 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 45 | Shape [num_query, 9]. 46 | Returns: 47 | list[dict]: Decoded boxes. 
48 | """ 49 | max_num = self.max_num 50 | 51 | cls_scores = cls_scores.sigmoid() 52 | scores, indexs = cls_scores.view(-1).topk(max_num) 53 | labels = indexs % self.num_classes 54 | bbox_index = torch.div(indexs, self.num_classes, rounding_mode='trunc') 55 | bbox_preds = bbox_preds[bbox_index] 56 | 57 | final_box_preds = denormalize_bbox(bbox_preds) 58 | final_scores = scores 59 | final_preds = labels 60 | 61 | # use score threshold 62 | if self.score_threshold is not None: 63 | thresh_mask = final_scores > self.score_threshold 64 | 65 | if self.post_center_range is not None: 66 | limit = torch.tensor(self.post_center_range, device=scores.device) 67 | mask = (final_box_preds[..., :3] >= limit[:3]).all(1) 68 | mask &= (final_box_preds[..., :3] <= limit[3:]).all(1) 69 | 70 | if self.score_threshold: 71 | mask &= thresh_mask 72 | 73 | boxes3d = final_box_preds[mask] 74 | scores = final_scores[mask] 75 | labels = final_preds[mask] 76 | predictions_dict = { 77 | 'bboxes': boxes3d, 78 | 'scores': scores, 79 | 'labels': labels 80 | } 81 | 82 | else: 83 | raise NotImplementedError( 84 | 'Need to reorganize output as a batch, only ' 85 | 'support post_center_range is not None for now!' 86 | ) 87 | 88 | return predictions_dict 89 | 90 | def decode(self, preds_dicts): 91 | """Decode bboxes. 92 | Args: 93 | all_cls_scores (Tensor): Outputs from the classification head, \ 94 | shape [nb_dec, bs, num_query, cls_out_channels]. Note \ 95 | cls_out_channels should includes background. 96 | all_bbox_preds (Tensor): Sigmoid outputs from the regression \ 97 | head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ 98 | Shape [nb_dec, bs, num_query, 9]. 99 | Returns: 100 | list[dict]: Decoded boxes. 101 | """ 102 | all_cls_scores = preds_dicts['all_cls_scores'][-1] 103 | all_bbox_preds = preds_dicts['all_bbox_preds'][-1] 104 | 105 | batch_size = all_cls_scores.size()[0] 106 | predictions_list = [] 107 | for i in range(batch_size): 108 | predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) 109 | 110 | return predictions_list 111 | -------------------------------------------------------------------------------- /val.py: -------------------------------------------------------------------------------- 1 | import os 2 | import utils 3 | import logging 4 | import argparse 5 | import importlib 6 | import torch 7 | import torch.distributed 8 | import torch.distributed as dist 9 | import torch.backends.cudnn as cudnn 10 | from mmcv import Config 11 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 12 | from mmcv.runner import load_checkpoint 13 | from mmdet.apis import set_random_seed, multi_gpu_test, single_gpu_test 14 | from mmdet3d.datasets import build_dataset, build_dataloader 15 | from mmdet3d.models import build_model 16 | from models.utils import VERSION 17 | 18 | 19 | def evaluate(dataset, results, epoch): 20 | metrics = dataset.evaluate(results, jsonfile_prefix='submission') 21 | return metrics 22 | 23 | # mAP = metrics['pts_bbox_NuScenes/mAP'] 24 | # mATE = metrics['pts_bbox_NuScenes/mATE'] 25 | # mASE = metrics['pts_bbox_NuScenes/mASE'] 26 | # mAOE = metrics['pts_bbox_NuScenes/mAOE'] 27 | # mAVE = metrics['pts_bbox_NuScenes/mAVE'] 28 | # mAAE = metrics['pts_bbox_NuScenes/mAAE'] 29 | # NDS = metrics['pts_bbox_NuScenes/NDS'] 30 | 31 | # logging.info('--- Evaluation Results (Epoch %d) ---' % epoch) 32 | # logging.info('mAP: %.4f' % metrics['pts_bbox_NuScenes/mAP']) 33 | # logging.info('mATE: %.4f' % 
metrics['pts_bbox_NuScenes/mATE']) 34 | # logging.info('mASE: %.4f' % metrics['pts_bbox_NuScenes/mASE']) 35 | # logging.info('mAOE: %.4f' % metrics['pts_bbox_NuScenes/mAOE']) 36 | # logging.info('mAVE: %.4f' % metrics['pts_bbox_NuScenes/mAVE']) 37 | # logging.info('mAAE: %.4f' % metrics['pts_bbox_NuScenes/mAAE']) 38 | # logging.info('NDS: %.4f' % metrics['pts_bbox_NuScenes/NDS']) 39 | 40 | # return { 41 | # 'mAP': mAP, 42 | # 'mATE': mATE, 43 | # 'mASE': mASE, 44 | # 'mAOE': mAOE, 45 | # 'mAVE': mAVE, 46 | # 'mAAE': mAAE, 47 | # 'NDS': NDS, 48 | # } 49 | 50 | 51 | def main(): 52 | parser = argparse.ArgumentParser(description='Validate a detector') 53 | parser.add_argument('--config', required=True) 54 | parser.add_argument('--weights', required=True) 55 | parser.add_argument('--local_rank', type=int, default=0) 56 | parser.add_argument('--world_size', type=int, default=1) 57 | parser.add_argument('--batch_size', type=int, default=1) 58 | args = parser.parse_args() 59 | 60 | # parse configs 61 | cfgs = Config.fromfile(args.config) 62 | 63 | # register custom module 64 | importlib.import_module('models') 65 | importlib.import_module('loaders') 66 | 67 | # MMCV, please shut up 68 | from mmcv.utils.logging import logger_initialized 69 | logger_initialized['root'] = logging.Logger(__name__, logging.WARNING) 70 | logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING) 71 | 72 | # you need GPUs 73 | assert torch.cuda.is_available() 74 | 75 | # determine local_rank and world_size 76 | if 'LOCAL_RANK' not in os.environ: 77 | os.environ['LOCAL_RANK'] = str(args.local_rank) 78 | 79 | if 'WORLD_SIZE' not in os.environ: 80 | os.environ['WORLD_SIZE'] = str(args.world_size) 81 | 82 | local_rank = int(os.environ['LOCAL_RANK']) 83 | world_size = int(os.environ['WORLD_SIZE']) 84 | 85 | if local_rank == 0: 86 | utils.init_logging(None, cfgs.debug) 87 | else: 88 | logging.root.disabled = True 89 | 90 | logging.info('Using GPU: %s' % torch.cuda.get_device_name(local_rank)) 91 | torch.cuda.set_device(local_rank) 92 | 93 | if world_size > 1: 94 | logging.info('Initializing DDP with %d GPUs...' 
% world_size) 95 | dist.init_process_group('nccl', init_method='env://') 96 | 97 | logging.info('Setting random seed: 0') 98 | set_random_seed(0, deterministic=True) 99 | cudnn.benchmark = True 100 | 101 | logging.info('Loading validation set from %s' % cfgs.data.val.data_root) 102 | val_dataset = build_dataset(cfgs.data.val) 103 | val_loader = build_dataloader( 104 | val_dataset, 105 | samples_per_gpu=args.batch_size, 106 | workers_per_gpu=cfgs.data.workers_per_gpu, 107 | num_gpus=world_size, 108 | dist=world_size > 1, 109 | shuffle=False, 110 | seed=0, 111 | ) 112 | 113 | logging.info('Creating model: %s' % cfgs.model.type) 114 | model = build_model(cfgs.model) 115 | model.cuda() 116 | model.fp16_enabled = True 117 | 118 | if world_size > 1: 119 | model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False) 120 | else: 121 | model = MMDataParallel(model, [0]) 122 | 123 | logging.info('Loading checkpoint from %s' % args.weights) 124 | checkpoint = load_checkpoint( 125 | model, args.weights, map_location='cuda', strict=True, 126 | logger=logging.Logger(__name__, logging.ERROR) 127 | ) 128 | 129 | if 'version' in checkpoint: 130 | VERSION.name = checkpoint['version'] 131 | 132 | if world_size > 1: 133 | results = multi_gpu_test(model, val_loader, gpu_collect=False) 134 | else: 135 | results = single_gpu_test(model, val_loader) 136 | 137 | if local_rank == 0: 138 | evaluate(val_dataset, results, -1) 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /models/core/hook/ema.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # modified from megvii-bevdepth. 3 | import math 4 | import os 5 | from copy import deepcopy 6 | 7 | import torch 8 | from mmcv.runner import load_state_dict 9 | from mmcv.runner.dist_utils import master_only 10 | from mmcv.runner.hooks import HOOKS, Hook 11 | from .utils import is_parallel 12 | from mmcv.fileio import FileClient 13 | import os.path as osp 14 | 15 | __all__ = ['ModelEMA'] 16 | 17 | 18 | class ModelEMA: 19 | """Model Exponential Moving Average from https://github.com/rwightman/ 20 | pytorch-image-models Keep a moving average of everything in the model 21 | state_dict (parameters and buffers). 22 | 23 | This is intended to allow functionality like 24 | https://www.tensorflow.org/api_docs/python/tf/train/ 25 | ExponentialMovingAverage 26 | A smoothed version of the weights is necessary for some training 27 | schemes to perform well. 28 | This class is sensitive where it is initialized in the sequence 29 | of model init, GPU assignment and distributed training wrappers. 30 | """ 31 | 32 | def __init__(self, model, decay=0.9999, updates=0): 33 | """ 34 | Args: 35 | model (nn.Module): model to apply EMA. 36 | decay (float): ema decay reate. 37 | updates (int): counter of EMA updates. 
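            Example (a minimal sketch; the tiny stand-in network and the use of
            MMDataParallel here are assumptions for illustration only, the real
            input is the wrapped detector):

                import torch.nn as nn
                from mmcv.parallel import MMDataParallel

                net = MMDataParallel(nn.Linear(4, 4).cuda(), [0])
                ema = ModelEMA(net, decay=0.9990)
                # the effective decay ramps up with the update counter:
                # decay(x) = 0.999 * (1 - exp(-x / 2000)), roughly 5e-4 after the
                # first update and roughly 0.63 after 2000 updates, which keeps the
                # EMA close to the raw weights early in training.
                ema.update(None, net)   # typically called once per training iteration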
38 | """ 39 | # Create EMA(FP32) 40 | self.ema_model = deepcopy(model).eval() 41 | self.ema = self.ema_model.module.module if is_parallel( 42 | self.ema_model.module) else self.ema_model.module 43 | self.updates = updates 44 | # decay exponential ramp (to help early epochs) 45 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 46 | for p in self.ema.parameters(): 47 | p.requires_grad_(False) 48 | 49 | # print(f"Model is on device: {next(self.ema.parameters()).device}") 50 | # k=1 51 | 52 | def update(self, trainer, model): 53 | # Update EMA parameters 54 | with torch.no_grad(): 55 | self.updates += 1 56 | d = self.decay(self.updates) 57 | 58 | msd = model.module.state_dict() if is_parallel( 59 | model) else model.state_dict() # model state_dict 60 | for k, v in self.ema.state_dict().items(): 61 | if v.dtype.is_floating_point: 62 | print(k) 63 | v *= d 64 | v += (1.0 - d) * msd[k].detach() 65 | k=1 66 | 67 | @HOOKS.register_module() 68 | class MEGVIIEMAHook(Hook): 69 | """EMAHook used in BEVDepth. 70 | 71 | Modified from https://github.com/Megvii-Base 72 | Detection/BEVDepth/blob/main/callbacks/ema.py. 73 | """ 74 | 75 | def __init__(self, init_updates=0, decay=0.9990, resume=None,max_keep_ckpts=-1): 76 | super().__init__() 77 | self.init_updates = init_updates 78 | self.resume = resume 79 | self.decay = decay 80 | self.interval = 1 81 | self.max_keep_ckpts = max_keep_ckpts 82 | 83 | 84 | def before_run(self, runner): 85 | from torch.nn.modules.batchnorm import SyncBatchNorm 86 | 87 | bn_model_list = list() 88 | bn_model_dist_group_list = list() 89 | for model_ref in runner.model.modules(): 90 | if isinstance(model_ref, SyncBatchNorm): 91 | bn_model_list.append(model_ref) 92 | bn_model_dist_group_list.append(model_ref.process_group) 93 | model_ref.process_group = None 94 | runner.ema_model = ModelEMA(runner.model, self.decay) 95 | 96 | for bn_model, dist_group in zip(bn_model_list, 97 | bn_model_dist_group_list): 98 | bn_model.process_group = dist_group 99 | runner.ema_model.updates = self.init_updates 100 | 101 | if self.resume is not None: 102 | runner.logger.info(f'resume ema checkpoint from {self.resume}') 103 | cpt = torch.load(self.resume, map_location='cpu') 104 | load_state_dict(runner.ema_model.ema, cpt['state_dict']) 105 | runner.ema_model.updates = cpt['updates'] 106 | 107 | def after_train_iter(self, runner): 108 | runner.ema_model.update(runner, runner.model.module) 109 | 110 | def after_train_epoch(self, runner): 111 | self.save_checkpoint(runner) 112 | 113 | @master_only 114 | def save_checkpoint(self, runner): 115 | state_dict = runner.ema_model.ema.state_dict() 116 | ema_checkpoint = { 117 | 'epoch': runner.epoch, 118 | 'state_dict': state_dict, 119 | 'updates': runner.ema_model.updates 120 | } 121 | save_path = f'epoch_{runner.epoch+1}_ema.pth' 122 | save_path = os.path.join(runner.work_dir, save_path) 123 | torch.save(ema_checkpoint, save_path) 124 | 125 | # if self.max_keep_ckpts > 0: 126 | # current_ckpt = runner.epoch + 1 127 | # redundant_ckpts = range( 128 | # current_ckpt - self.max_keep_ckpts, 0, 129 | # -self.interval) 130 | # for _step in redundant_ckpts: 131 | # ckpt_pth=f'epoch_{_step}_ema.pth' 132 | # ckpt_pth = os.path.join(runner.work_dir, ckpt_pth) 133 | # if osp.exists(ckpt_pth): 134 | # os.remove(ckpt_pth) 135 | 136 | 137 | runner.logger.info(f'Saving ema checkpoint at {save_path}') 138 | -------------------------------------------------------------------------------- /models/core/hook/ema2.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # modified from megvii-bevdepth. 3 | import math 4 | import os 5 | from copy import deepcopy 6 | 7 | import torch 8 | from mmcv.runner import load_state_dict 9 | from mmcv.runner.dist_utils import master_only 10 | from mmcv.runner.hooks import HOOKS, Hook 11 | from .utils import is_parallel 12 | from mmdet3d.models import build_model 13 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 14 | 15 | __all__ = ['ModelEMA2'] 16 | 17 | 18 | class ModelEMA2: 19 | """Model Exponential Moving Average from https://github.com/rwightman/ 20 | pytorch-image-models Keep a moving average of everything in the model 21 | state_dict (parameters and buffers). 22 | 23 | This is intended to allow functionality like 24 | https://www.tensorflow.org/api_docs/python/tf/train/ 25 | ExponentialMovingAverage 26 | A smoothed version of the weights is necessary for some training 27 | schemes to perform well. 28 | This class is sensitive where it is initialized in the sequence 29 | of model init, GPU assignment and distributed training wrappers. 30 | """ 31 | 32 | def __init__(self, model, decay=0.9999, updates=0, ema_model_cfg=None): 33 | """ 34 | Args: 35 | model (nn.Module): model to apply EMA. 36 | decay (float): ema decay rate. 37 | updates (int): counter of EMA updates. 38 | """ 39 | # Create EMA(FP32) 40 | assert ema_model_cfg is not None, 'ema_model_cfg is None' 41 | self.ema_model = self.clone_model(model, ema_model_cfg).eval() 42 | 43 | self.ema = self.ema_model.module.module if is_parallel( 44 | self.ema_model.module) else self.ema_model.module 45 | self.updates = updates 46 | # decay exponential ramp (to help early epochs) 47 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) 48 | for p in self.ema.parameters(): 49 | p.requires_grad_(False) 50 | 51 | 52 | def clone_model(self, model, ema_model_cfg): 53 | """ 54 | Clone a model by rebuilding it from config and copying its state_dict. 55 | """ 56 | cloned_model = build_model(ema_model_cfg) 57 | if is_parallel(model.module): 58 | if type(model) == MMDistributedDataParallel: 59 | cloned_model = type(model)(cloned_model, [0], find_unused_parameters=True) 60 | else: 61 | cloned_model = type(model)(cloned_model, [0]) 62 | 63 | cloned_model.load_state_dict(model.state_dict()) 64 | 65 | return cloned_model 66 | 67 | def update(self, trainer, model): 68 | # Update EMA parameters 69 | with torch.no_grad(): 70 | self.updates += 1 71 | d = self.decay(self.updates) 72 | 73 | msd = model.module.state_dict() if is_parallel( 74 | model) else model.state_dict() # model state_dict 75 | for k, v in self.ema.state_dict().items(): 76 | if v.dtype.is_floating_point: 77 | v *= d 78 | v += (1.0 - d) * msd[k].detach() 79 | 80 | 81 | @HOOKS.register_module() 82 | class MEGVIIEMAHook2(Hook): 83 | """EMAHook used in BEVDepth. 84 | 85 | Modified from https://github.com/Megvii-Base 86 | Detection/BEVDepth/blob/main/callbacks/ema.py.
87 | """ 88 | 89 | def __init__(self, init_updates=0, decay=0.9990, resume=None,ema_model_cfg=None): 90 | super().__init__() 91 | self.init_updates = init_updates 92 | self.resume = resume 93 | self.decay = decay 94 | self.ema_model_cfg = ema_model_cfg 95 | 96 | def before_run(self, runner): 97 | from torch.nn.modules.batchnorm import SyncBatchNorm 98 | 99 | bn_model_list = list() 100 | bn_model_dist_group_list = list() 101 | for model_ref in runner.model.modules(): 102 | if isinstance(model_ref, SyncBatchNorm): 103 | bn_model_list.append(model_ref) 104 | bn_model_dist_group_list.append(model_ref.process_group) 105 | model_ref.process_group = None 106 | runner.ema_model = ModelEMA2(runner.model, self.decay,ema_model_cfg=self.ema_model_cfg) 107 | 108 | for bn_model, dist_group in zip(bn_model_list, 109 | bn_model_dist_group_list): 110 | bn_model.process_group = dist_group 111 | runner.ema_model.updates = self.init_updates 112 | 113 | if self.resume is not None: 114 | runner.logger.info(f'resume ema checkpoint from {self.resume}') 115 | cpt = torch.load(self.resume, map_location='cpu') 116 | load_state_dict(runner.ema_model.ema, cpt['state_dict']) 117 | runner.ema_model.updates = cpt['updates'] 118 | 119 | def after_train_iter(self, runner): 120 | runner.ema_model.update(runner, runner.model.module) 121 | 122 | def after_train_epoch(self, runner): 123 | self.save_checkpoint(runner) 124 | 125 | @master_only 126 | def save_checkpoint(self, runner): 127 | state_dict = runner.ema_model.ema.state_dict() 128 | ema_checkpoint = { 129 | 'epoch': runner.epoch, 130 | 'state_dict': state_dict, 131 | 'updates': runner.ema_model.updates 132 | } 133 | save_path = f'epoch_{runner.epoch+1}_ema.pth' 134 | save_path = os.path.join(runner.work_dir, save_path) 135 | torch.save(ema_checkpoint, save_path) 136 | runner.logger.info(f'Saving ema checkpoint at {save_path}') 137 | -------------------------------------------------------------------------------- /scripts/gen_sweep_info.py: -------------------------------------------------------------------------------- 1 | # Generate info files manually 2 | import os 3 | import mmcv 4 | import tqdm 5 | import pickle 6 | import argparse 7 | import numpy as np 8 | from nuscenes import NuScenes 9 | from pyquaternion import Quaternion 10 | 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--data-root', default='data/nuscenes') 14 | parser.add_argument('--version', default='v1.0-trainval') 15 | args = parser.parse_args() 16 | 17 | 18 | def get_cam_info(nusc, sample_data): 19 | pose_record = nusc.get('ego_pose', sample_data['ego_pose_token']) 20 | cs_record = nusc.get('calibrated_sensor', sample_data['calibrated_sensor_token']) 21 | 22 | sensor2ego_translation = cs_record['translation'] 23 | ego2global_translation = pose_record['translation'] 24 | sensor2ego_rotation = Quaternion(cs_record['rotation']).rotation_matrix 25 | ego2global_rotation = Quaternion(pose_record['rotation']).rotation_matrix 26 | cam_intrinsic = np.array(cs_record['camera_intrinsic']) 27 | 28 | sensor2global_rotation = sensor2ego_rotation.T @ ego2global_rotation.T 29 | sensor2global_translation = sensor2ego_translation @ ego2global_rotation.T + ego2global_translation 30 | 31 | return { 32 | 'data_path': os.path.join(args.data_root, sample_data['filename']), 33 | 'sensor2global_rotation': sensor2global_rotation, 34 | 'sensor2global_translation': sensor2global_translation, 35 | 'cam_intrinsic': cam_intrinsic, 36 | 'timestamp': sample_data['timestamp'], 37 | } 38 | 39 | 40 | def 
add_sweep_info(nusc, sample_infos): 41 | for curr_id in tqdm.tqdm(range(len(sample_infos['infos']))): 42 | sample = nusc.get('sample', sample_infos['infos'][curr_id]['token']) 43 | 44 | # add scene name for occupancy 45 | scene = nusc.get('scene', sample['scene_token']) 46 | sample_infos['infos'][curr_id]['scene_name'] = scene['name'] 47 | sample_infos['infos'][curr_id]['scene_token'] = scene['token'] 48 | sample_infos['infos'][curr_id]['lidar_token'] = sample['data']['LIDAR_TOP'] 49 | 50 | cam_types = [ 51 | 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 52 | 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT' 53 | ] 54 | 55 | curr_cams = dict() 56 | for cam in cam_types: 57 | curr_cams[cam] = nusc.get('sample_data', sample['data'][cam]) 58 | 59 | for cam in cam_types: 60 | sample_data = nusc.get('sample_data', sample['data'][cam]) 61 | sweep_cam = get_cam_info(nusc, sample_data) 62 | sample_infos['infos'][curr_id]['cams'][cam].update(sweep_cam) 63 | 64 | # remove unnecessary 65 | for cam in cam_types: 66 | del sample_infos['infos'][curr_id]['cams'][cam]['sample_data_token'] 67 | del sample_infos['infos'][curr_id]['cams'][cam]['sensor2ego_translation'] 68 | del sample_infos['infos'][curr_id]['cams'][cam]['sensor2ego_rotation'] 69 | del sample_infos['infos'][curr_id]['cams'][cam]['ego2global_translation'] 70 | del sample_infos['infos'][curr_id]['cams'][cam]['ego2global_rotation'] 71 | 72 | sweep_infos = [] 73 | if sample['prev'] != '': # add sweep frame between two key frame 74 | for _ in range(5): 75 | sweep_info = dict() 76 | for cam in cam_types: 77 | if curr_cams[cam]['prev'] == '': 78 | sweep_info = sweep_infos[-1] 79 | break 80 | sample_data = nusc.get('sample_data', curr_cams[cam]['prev']) 81 | sweep_cam = get_cam_info(nusc, sample_data) 82 | curr_cams[cam] = sample_data 83 | sweep_info[cam] = sweep_cam 84 | sweep_infos.append(sweep_info) 85 | 86 | sample_infos['infos'][curr_id]['lidar_sweeps'] = \ 87 | sample_infos['infos'][curr_id].pop('sweeps') 88 | sample_infos['infos'][curr_id]['cam_sweeps'] = sweep_infos 89 | 90 | return sample_infos 91 | 92 | 93 | if __name__ == '__main__': 94 | nusc = NuScenes(args.version, args.data_root) 95 | 96 | if args.version == 'v1.0-trainval': 97 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_train.pkl'), 'rb')) 98 | sample_infos = add_sweep_info(nusc, sample_infos) 99 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_train_sweep.pkl')) 100 | 101 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_val.pkl'), 'rb')) 102 | sample_infos = add_sweep_info(nusc, sample_infos) 103 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_val_sweep.pkl')) 104 | 105 | elif args.version == 'v1.0-test': 106 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_test.pkl'), 'rb')) 107 | sample_infos = add_sweep_info(nusc, sample_infos) 108 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_test_sweep.pkl')) 109 | 110 | elif args.version == 'v1.0-mini': 111 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_train_mini.pkl'), 'rb')) 112 | sample_infos = add_sweep_info(nusc, sample_infos) 113 | mmcv.dump(sample_infos, os.path.join(args.data_root, 'nuscenes_infos_train_mini_sweep.pkl')) 114 | 115 | sample_infos = pickle.load(open(os.path.join(args.data_root, 'nuscenes_infos_val_mini.pkl'), 'rb')) 116 | sample_infos = add_sweep_info(nusc, sample_infos) 117 | mmcv.dump(sample_infos, 
os.path.join(args.data_root, 'nuscenes_infos_val_mini_sweep.pkl')) 118 | 119 | else: 120 | raise ValueError 121 | -------------------------------------------------------------------------------- /models/backbones/eva02/wrappers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | """ 3 | Wrappers around on some nn functions, mainly to support empty tensors. 4 | 5 | Ideally, add support directly in PyTorch to empty tensors in those functions. 6 | 7 | These can be removed once https://github.com/pytorch/pytorch/issues/12013 8 | is implemented 9 | """ 10 | 11 | import warnings 12 | from typing import List, Optional 13 | import torch 14 | from torch.nn import functional as F 15 | 16 | 17 | def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor: 18 | """ 19 | Turn a list of integer scalars or integer Tensor scalars into a vector, 20 | in a way that's both traceable and scriptable. 21 | 22 | In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs. 23 | In scripting or eager, `x` should be a list of int. 24 | """ 25 | if torch.jit.is_scripting(): 26 | return torch.as_tensor(x, device=device) 27 | if torch.jit.is_tracing(): 28 | assert all( 29 | [isinstance(t, torch.Tensor) for t in x] 30 | ), "Shape should be tensor during tracing!" 31 | # as_tensor should not be used in tracing because it records a constant 32 | ret = torch.stack(x) 33 | if ret.device != device: # avoid recording a hard-coded device if not necessary 34 | ret = ret.to(device=device) 35 | return ret 36 | return torch.as_tensor(x, device=device) 37 | 38 | 39 | def cat(tensors: List[torch.Tensor], dim: int = 0): 40 | """ 41 | Efficient version of torch.cat that avoids a copy if there is only a single element in a list 42 | """ 43 | assert isinstance(tensors, (list, tuple)) 44 | if len(tensors) == 1: 45 | return tensors[0] 46 | return torch.cat(tensors, dim) 47 | 48 | 49 | def empty_input_loss_func_wrapper(loss_func): 50 | def wrapped_loss_func(input, target, *, reduction="mean", **kwargs): 51 | """ 52 | Same as `loss_func`, but returns 0 (instead of nan) for empty inputs. 53 | """ 54 | if target.numel() == 0 and reduction == "mean": 55 | return input.sum() * 0.0 # connect the gradient 56 | return loss_func(input, target, reduction=reduction, **kwargs) 57 | 58 | return wrapped_loss_func 59 | 60 | 61 | cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy) 62 | 63 | 64 | class _NewEmptyTensorOp(torch.autograd.Function): 65 | @staticmethod 66 | def forward(ctx, x, new_shape): 67 | ctx.shape = x.shape 68 | return x.new_empty(new_shape) 69 | 70 | @staticmethod 71 | def backward(ctx, grad): 72 | shape = ctx.shape 73 | return _NewEmptyTensorOp.apply(grad, shape), None 74 | 75 | 76 | class Conv2d(torch.nn.Conv2d): 77 | """ 78 | A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. 79 | """ 80 | 81 | def __init__(self, *args, **kwargs): 82 | """ 83 | Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: 84 | 85 | Args: 86 | norm (nn.Module, optional): a normalization layer 87 | activation (callable(Tensor) -> Tensor): a callable activation function 88 | 89 | It assumes that norm layer is used before activation. 
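        Example (illustrative sketch; the specific norm and activation chosen
        here are assumptions, not defaults used elsewhere in this backbone):

            conv = Conv2d(
                64, 128, kernel_size=3, padding=1, bias=False,
                norm=torch.nn.BatchNorm2d(128),           # applied right after the convolution
                activation=torch.nn.functional.relu,      # applied after the norm
            )
            y = conv(torch.randn(2, 64, 32, 32))          # -> shape [2, 128, 32, 32]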
90 | """ 91 | norm = kwargs.pop("norm", None) 92 | activation = kwargs.pop("activation", None) 93 | super().__init__(*args, **kwargs) 94 | 95 | self.norm = norm 96 | self.activation = activation 97 | 98 | def forward(self, x): 99 | # torchscript does not support SyncBatchNorm yet 100 | # https://github.com/pytorch/pytorch/issues/40507 101 | # and we skip these codes in torchscript since: 102 | # 1. currently we only support torchscript in evaluation mode 103 | # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or 104 | # later version, `Conv2d` in these PyTorch versions has already supported empty inputs. 105 | if not torch.jit.is_scripting(): 106 | with warnings.catch_warnings(record=True): 107 | if x.numel() == 0 and self.training: 108 | # https://github.com/pytorch/pytorch/issues/12013 109 | assert not isinstance( 110 | self.norm, torch.nn.SyncBatchNorm 111 | ), "SyncBatchNorm does not support empty inputs!" 112 | 113 | x = F.conv2d( 114 | x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups 115 | ) 116 | if self.norm is not None: 117 | x = self.norm(x) 118 | if self.activation is not None: 119 | x = self.activation(x) 120 | return x 121 | 122 | 123 | ConvTranspose2d = torch.nn.ConvTranspose2d 124 | BatchNorm2d = torch.nn.BatchNorm2d 125 | interpolate = F.interpolate 126 | Linear = torch.nn.Linear 127 | 128 | 129 | def nonzero_tuple(x): 130 | """ 131 | A 'as_tuple=True' version of torch.nonzero to support torchscript. 132 | because of https://github.com/pytorch/pytorch/issues/38718 133 | """ 134 | if torch.jit.is_scripting(): 135 | if x.dim() == 0: 136 | return x.unsqueeze(0).nonzero().unbind(1) 137 | return x.nonzero().unbind(1) 138 | else: 139 | return x.nonzero(as_tuple=True) 140 | 141 | 142 | @torch.jit.script_if_tracing 143 | def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor: 144 | """ 145 | Tracing friendly way to cast tensor to another tensor's device. Device will be treated 146 | as constant during tracing, scripting the casting process as whole can workaround this issue. 
147 | """ 148 | return src.to(dst.device) -------------------------------------------------------------------------------- /models/opusv1/opus_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from ..bbox.utils import decode_points 4 | from ..utils import rotation_3d_in_axis, DUMP 5 | from ..csrc.wrapper import msmv_sampling, msmv_sampling_pytorch 6 | 7 | 8 | def make_sample_points(query_points, offset, pc_range): 9 | ''' 10 | query_points: [B, Q, P, 3] (x, y, z) 11 | offset: [B, Q, G, P, 3] 12 | ''' 13 | xyz = decode_points(query_points, pc_range) # [B, Q, 3] 14 | xyz = xyz[..., None, None, :] # [B, Q, 1, 1, 3] 15 | sample_xyz = xyz + offset # [B, Q, G, P, 3] 16 | return sample_xyz 17 | 18 | 19 | def sampling_4d(sample_points, mlvl_feats, scale_weights, occ2img, image_h, image_w, num_views=6, eps=1e-5): 20 | """ 21 | Args: 22 | sample_points: 3D sampling points in shape [B, Q, T, G, P, 3] 23 | mlvl_feats: list of multi-scale features from neck, each in shape [B*T*G, C, N, H, W] 24 | scale_weights: weights for multi-scale aggregation, [B, Q, G, T, P, L] 25 | occ2img: 4x4 projection matrix in shape [B, TN, 4, 4] 26 | Symbol meaning: 27 | B: batch size 28 | Q: num of queries 29 | T: num of frames 30 | G: num of groups (we follow the group sampling mechanism of AdaMixer) 31 | P: num of sampling points per frame per group 32 | N: num of views (six for nuScenes) 33 | L: num of layers of feature pyramid (typically it is 4: C2, C3, C4, C5) 34 | """ 35 | 36 | B, Q, T, G, P, _ = sample_points.shape # [B, Q, T, G, P, 3] 37 | N = num_views 38 | 39 | sample_points = sample_points.reshape(B, Q, T, G * P, 3) 40 | 41 | # get the projection matrix 42 | occ2img = occ2img[:, :, None, None, :, :] # [B, TN, 1, 1, 4, 4] 43 | occ2img = occ2img.expand(B, T*N, Q, G * P, 4, 4) 44 | occ2img = occ2img.reshape(B, T, N, Q, G*P, 4, 4) 45 | 46 | # expand the points 47 | ones = torch.ones_like(sample_points[..., :1]) 48 | sample_points = torch.cat([sample_points, ones], dim=-1) # [B, Q, GP, 4] 49 | sample_points = sample_points[:, :, None, ..., None] # [B, Q, T, GP, 4] 50 | sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1) 51 | sample_points = sample_points.transpose(1, 3) # [B, T, N, Q, GP, 4, 1] 52 | 53 | # project 3d sampling points to N views 54 | sample_points_cam = torch.matmul(occ2img, sample_points).squeeze(-1) # [B, T, N, Q, GP, 4] 55 | 56 | # homo coord -> pixel coord 57 | homo = sample_points_cam[..., 2:3] 58 | homo_nonzero = torch.maximum(homo, torch.zeros_like(homo) + eps) 59 | sample_points_cam = sample_points_cam[..., 0:2] / homo_nonzero # [B, T, N, Q, GP, 2] 60 | 61 | # normalize 62 | sample_points_cam[..., 0] /= image_w 63 | sample_points_cam[..., 1] /= image_h 64 | 65 | # check if out of image 66 | valid_mask = ((homo > eps) \ 67 | & (sample_points_cam[..., 1:2] > 0.0) 68 | & (sample_points_cam[..., 1:2] < 1.0) 69 | & (sample_points_cam[..., 0:1] > 0.0) 70 | & (sample_points_cam[..., 0:1] < 1.0) 71 | ).squeeze(-1).float() # [B, T, N, Q, GP] 72 | 73 | # for visualization only 74 | if DUMP.enabled: 75 | torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1).cpu(), 76 | '{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 77 | torch.save(valid_mask.cpu(), 78 | '{}/sample_points_cam_valid_mask_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 79 | 80 | valid_mask = valid_mask.permute(0, 1, 3, 4, 2) # [B, T, Q, GP, N] 81 | sample_points_cam = 
sample_points_cam.permute(0, 1, 3, 4, 2, 5) # [B, T, Q, GP, N, 2] 82 | 83 | # prepare batched indexing 84 | i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device) 85 | i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device) 86 | i_time = torch.arange(T, dtype=torch.long, device=sample_points.device) 87 | i_point = torch.arange(G * P, dtype=torch.long, device=sample_points.device) 88 | i_batch = i_batch.view(B, 1, 1, 1, 1).expand(B, T, Q, G * P, 1) 89 | i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1) 90 | i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1) 91 | i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1) 92 | 93 | # we only keep at most one valid sampling point, see https://zhuanlan.zhihu.com/p/654821380 94 | i_view = torch.argmax(valid_mask, dim=-1)[..., None] # [B, T, Q, GP, 1] 95 | 96 | # index the only one sampling point and its valid flag 97 | sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :] # [B, Q, GP, 1, 2] 98 | valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view] # [B, Q, GP, 1] 99 | 100 | # treat the view index as a new axis for grid_sample and normalize the view index to [0, 1] 101 | sample_points_cam = torch.cat([sample_points_cam, i_view[..., None].float() / (N - 1)], dim=-1) 102 | 103 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 104 | sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3) 105 | sample_points_cam = sample_points_cam.permute(0, 1, 3, 2, 4, 5, 6) # [B, T, G, Q, P, 1, 3] 106 | sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3) 107 | sample_points_cam = sample_points_cam.contiguous() 108 | 109 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 110 | scale_weights = scale_weights.reshape(B, Q, G, T, P, -1) 111 | scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5) 112 | scale_weights = scale_weights.reshape(B*G*T, Q, P, -1) 113 | scale_weights = scale_weights.contiguous() 114 | 115 | # multi-scale multi-view grid sample 116 | final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights) 117 | 118 | # reorganize the sampled features 119 | C = final.shape[2] # [BTG, Q, C, P] 120 | final = final.reshape(B, T, G, Q, C, P) 121 | final = final.permute(0, 3, 2, 1, 5, 4) 122 | final = final.flatten(3, 4) # [B, Q, G, FP, C] 123 | 124 | return final 125 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import utils 4 | import shutil 5 | import logging 6 | import argparse 7 | import importlib 8 | import os.path as osp 9 | import torch 10 | import torch.distributed as dist 11 | from datetime import datetime 12 | from mmcv import Config, DictAction 13 | from mmcv.parallel import MMDataParallel, MMDistributedDataParallel 14 | from mmcv.runner import EpochBasedRunner, build_optimizer, load_checkpoint 15 | from mmdet.apis import set_random_seed 16 | from mmdet.core import DistEvalHook, EvalHook 17 | from mmdet3d.datasets import build_dataset 18 | from mmdet3d.models import build_model 19 | from loaders.builder import build_dataloader 20 | from models.core.hook.ema import MEGVIIEMAHook 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser(description='Train a detector') 25 | parser.add_argument('--config', required=True) 26 | parser.add_argument('--override', nargs='+', 
action=DictAction) 27 | parser.add_argument('--local_rank', type=int, default=0) 28 | parser.add_argument('--world_size', type=int, default=1) 29 | args = parser.parse_args() 30 | 31 | # parse configs 32 | cfgs = Config.fromfile(args.config) 33 | if args.override is not None: 34 | cfgs.merge_from_dict(args.override) 35 | 36 | # register custom module 37 | importlib.import_module('models') 38 | importlib.import_module('loaders') 39 | 40 | # MMCV, please shut up 41 | from mmcv.utils.logging import logger_initialized 42 | logger_initialized['root'] = logging.Logger(__name__, logging.WARNING) 43 | logger_initialized['mmcv'] = logging.Logger(__name__, logging.WARNING) 44 | logger_initialized['mmdet3d'] = logging.Logger(__name__, logging.WARNING) 45 | 46 | # you need GPUs 47 | assert torch.cuda.is_available() 48 | 49 | # determine local_rank and world_size 50 | if 'LOCAL_RANK' not in os.environ: 51 | os.environ['LOCAL_RANK'] = str(args.local_rank) 52 | 53 | if 'WORLD_SIZE' not in os.environ: 54 | os.environ['WORLD_SIZE'] = str(args.world_size) 55 | 56 | local_rank = int(os.environ['LOCAL_RANK']) 57 | world_size = int(os.environ['WORLD_SIZE']) 58 | 59 | logging.info('Using GPU: %s' % torch.cuda.get_device_name(local_rank)) 60 | torch.cuda.set_device(local_rank) 61 | 62 | timestamp = torch.tensor(time.time(), dtype=torch.float64).cuda() 63 | if world_size > 1: 64 | logging.info('Initializing DDP with %d GPUs...' % world_size) 65 | dist.init_process_group('nccl', init_method='env://') 66 | dist.broadcast(timestamp, 0) 67 | 68 | # resume or start a new run 69 | if cfgs.resume_from is not None: 70 | assert os.path.isfile(cfgs.resume_from) 71 | work_dir = os.path.dirname(cfgs.resume_from) 72 | else: 73 | run_name = osp.splitext(osp.split(args.config)[-1])[0] 74 | run_name += '_' + time.strftime("%Y-%m-%d/%H-%M-%S", 75 | time.localtime(timestamp.cpu().item())) 76 | work_dir = os.path.join('outputs', cfgs.model.type, run_name) 77 | 78 | if local_rank == 0: 79 | if os.path.exists(work_dir): # must be an empty dir 80 | raise FileExistsError(work_dir) 81 | os.makedirs(work_dir, exist_ok=False) 82 | 83 | # init logging, backup code 84 | utils.init_logging(os.path.join(work_dir, 'train.log'), cfgs.debug) 85 | utils.backup_code(work_dir) 86 | logging.info('Logs will be saved to %s' % work_dir) 87 | else: 88 | # disable logging on other workers 89 | logging.root.disabled = True 90 | 91 | logging.info('Setting random seed: 0') 92 | set_random_seed(0, deterministic=True) 93 | 94 | logging.info('Loading training set from %s' % cfgs.dataset_root) 95 | train_dataset = build_dataset(cfgs.data.train) 96 | train_loader = build_dataloader( 97 | train_dataset, 98 | samples_per_gpu=cfgs.batch_size // world_size, 99 | workers_per_gpu=cfgs.data.workers_per_gpu, 100 | num_gpus=world_size, 101 | dist=world_size > 1, 102 | shuffle=True, 103 | seed=0, 104 | ) 105 | 106 | logging.info('Loading validation set from %s' % cfgs.dataset_root) 107 | val_dataset = build_dataset(cfgs.data.val) 108 | val_loader = build_dataloader( 109 | val_dataset, 110 | samples_per_gpu=1, 111 | workers_per_gpu=cfgs.data.workers_per_gpu, 112 | num_gpus=world_size, 113 | dist=world_size > 1, 114 | shuffle=False 115 | ) 116 | 117 | logging.info('Creating model: %s' % cfgs.model.type) 118 | model = build_model(cfgs.model) 119 | model.init_weights() 120 | 121 | # logging.info(f'Model:\n{model}') 122 | model.cuda() 123 | model.train() 124 | 125 | n_params = sum([p.numel() for p in model.parameters() if p.requires_grad]) 126 | logging.info('Trainable 
parameters: %d (%.1fM)' % (n_params, n_params / 1e6)) 127 | logging.info('Batch size per GPU: %d' % (cfgs.batch_size // world_size)) 128 | 129 | if world_size > 1: 130 | model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False) 131 | else: 132 | model = MMDataParallel(model, [0]) 133 | 134 | logging.info('Creating optimizer: %s' % cfgs.optimizer.type) 135 | optimizer = build_optimizer(model, cfgs.optimizer) 136 | 137 | runner = EpochBasedRunner( 138 | model, 139 | optimizer=optimizer, 140 | work_dir=work_dir, 141 | logger=logging.root, 142 | max_epochs=cfgs.total_epochs, 143 | meta=dict(), 144 | ) 145 | 146 | runner.register_lr_hook(cfgs.lr_config) 147 | runner.register_optimizer_hook(cfgs.optimizer_config) 148 | runner.register_checkpoint_hook(cfgs.checkpoint_config) 149 | runner.register_logger_hooks(cfgs.log_config) 150 | runner.register_timer_hook(dict(type='IterTimerHook')) 151 | runner.register_custom_hooks(dict(type='DistSamplerSeedHook')) 152 | if cfgs.get('custom_hooks', None) is not None: 153 | for hook_cfg in cfgs.custom_hooks: 154 | runner.register_custom_hooks(hook_cfg) 155 | 156 | if cfgs.eval_config['interval'] > 0: 157 | if world_size > 1: 158 | runner.register_hook(DistEvalHook( 159 | val_loader, interval=cfgs.eval_config['interval'], gpu_collect=False)) 160 | else: 161 | runner.register_hook(EvalHook(val_loader, interval=cfgs.eval_config['interval'])) 162 | 163 | if cfgs.resume_from is not None: 164 | logging.info('Resuming from %s' % cfgs.resume_from) 165 | runner.resume(cfgs.resume_from) 166 | 167 | elif cfgs.load_from is not None: 168 | logging.info('Loading checkpoint from %s' % cfgs.load_from) 169 | if cfgs.revise_keys is not None: 170 | load_checkpoint( 171 | model, cfgs.load_from, map_location='cpu', 172 | revise_keys=cfgs.revise_keys 173 | ) 174 | else: 175 | load_checkpoint( 176 | model, cfgs.load_from, map_location='cpu', 177 | ) 178 | 179 | runner.run([train_loader], [('train', 1)]) 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /models/opusv1_fusion/opus_sampling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from ..bbox.utils import decode_points, encode_points 4 | from ..utils import rotation_3d_in_axis, DUMP 5 | from ..csrc.wrapper import msmv_sampling, msmv_sampling_pytorch 6 | 7 | 8 | def make_sample_points(query_points, offset, pc_range): 9 | ''' 10 | query_points: [B, Q, P, 3] (x, y, z) 11 | offset: [B, Q, G, P, 3] 12 | ''' 13 | xyz = decode_points(query_points, pc_range) # [B, Q, 3] 14 | xyz = xyz[..., None, None, :] # [B, Q, 1, 1, 3] 15 | sample_xyz = xyz + offset # [B, Q, G, P, 3] 16 | return sample_xyz 17 | 18 | 19 | def sampling_4d(sample_points, mlvl_feats, scale_weights, occ2img, image_h, image_w, num_views=6, eps=1e-5): 20 | """ 21 | Args: 22 | sample_points: 3D sampling points in shape [B, Q, T, G, P, 3] 23 | mlvl_feats: list of multi-scale features from neck, each in shape [B*T*G, C, N, H, W] 24 | scale_weights: weights for multi-scale aggregation, [B, Q, G, T, P, L] 25 | occ2img: 4x4 projection matrix in shape [B, TN, 4, 4] 26 | Symbol meaning: 27 | B: batch size 28 | Q: num of queries 29 | T: num of frames 30 | G: num of groups (we follow the group sampling mechanism of AdaMixer) 31 | P: num of sampling points per frame per group 32 | N: num of views (six for nuScenes) 33 | L: num of layers of feature pyramid (typically it is 4: 
C2, C3, C4, C5) 34 | """ 35 | 36 | B, Q, T, G, P, _ = sample_points.shape # [B, Q, T, G, P, 3] 37 | N = num_views 38 | 39 | sample_points = sample_points.reshape(B, Q, T, G * P, 3) 40 | 41 | # get the projection matrix 42 | occ2img = occ2img[:, :, None, None, :, :] # [B, TN, 1, 1, 4, 4] 43 | occ2img = occ2img.expand(B, T*N, Q, G * P, 4, 4) 44 | occ2img = occ2img.reshape(B, T, N, Q, G*P, 4, 4) 45 | 46 | # expand the points 47 | ones = torch.ones_like(sample_points[..., :1]) 48 | sample_points = torch.cat([sample_points, ones], dim=-1) # [B, Q, GP, 4] 49 | sample_points = sample_points[:, :, None, ..., None] # [B, Q, T, GP, 4] 50 | sample_points = sample_points.expand(B, Q, N, T, G * P, 4, 1) 51 | sample_points = sample_points.transpose(1, 3) # [B, T, N, Q, GP, 4, 1] 52 | 53 | # project 3d sampling points to N views 54 | sample_points_cam = torch.matmul(occ2img, sample_points).squeeze(-1) # [B, T, N, Q, GP, 4] 55 | 56 | # homo coord -> pixel coord 57 | homo = sample_points_cam[..., 2:3] 58 | homo_nonzero = torch.maximum(homo, torch.zeros_like(homo) + eps) 59 | sample_points_cam = sample_points_cam[..., 0:2] / homo_nonzero # [B, T, N, Q, GP, 2] 60 | 61 | # normalize 62 | sample_points_cam[..., 0] /= image_w 63 | sample_points_cam[..., 1] /= image_h 64 | 65 | # check if out of image 66 | valid_mask = ((homo > eps) \ 67 | & (sample_points_cam[..., 1:2] > 0.0) 68 | & (sample_points_cam[..., 1:2] < 1.0) 69 | & (sample_points_cam[..., 0:1] > 0.0) 70 | & (sample_points_cam[..., 0:1] < 1.0) 71 | ).squeeze(-1).float() # [B, T, N, Q, GP] 72 | 73 | # for visualization only 74 | if DUMP.enabled: 75 | torch.save(torch.cat([sample_points_cam, homo_nonzero], dim=-1).cpu(), 76 | '{}/sample_points_cam_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 77 | torch.save(valid_mask.cpu(), 78 | '{}/sample_points_cam_valid_mask_stage{}.pth'.format(DUMP.out_dir, DUMP.stage_count)) 79 | 80 | valid_mask = valid_mask.permute(0, 1, 3, 4, 2) # [B, T, Q, GP, N] 81 | sample_points_cam = sample_points_cam.permute(0, 1, 3, 4, 2, 5) # [B, T, Q, GP, N, 2] 82 | 83 | # prepare batched indexing 84 | i_batch = torch.arange(B, dtype=torch.long, device=sample_points.device) 85 | i_query = torch.arange(Q, dtype=torch.long, device=sample_points.device) 86 | i_time = torch.arange(T, dtype=torch.long, device=sample_points.device) 87 | i_point = torch.arange(G * P, dtype=torch.long, device=sample_points.device) 88 | i_batch = i_batch.view(B, 1, 1, 1, 1).expand(B, T, Q, G * P, 1) 89 | i_time = i_time.view(1, T, 1, 1, 1).expand(B, T, Q, G * P, 1) 90 | i_query = i_query.view(1, 1, Q, 1, 1).expand(B, T, Q, G * P, 1) 91 | i_point = i_point.view(1, 1, 1, G * P, 1).expand(B, T, Q, G * P, 1) 92 | 93 | # we only keep at most one valid sampling point, see https://zhuanlan.zhihu.com/p/654821380 94 | i_view = torch.argmax(valid_mask, dim=-1)[..., None] # [B, T, Q, GP, 1] 95 | 96 | # index the only one sampling point and its valid flag 97 | sample_points_cam = sample_points_cam[i_batch, i_time, i_query, i_point, i_view, :] # [B, Q, GP, 1, 2] 98 | valid_mask = valid_mask[i_batch, i_time, i_query, i_point, i_view] # [B, Q, GP, 1] 99 | 100 | # treat the view index as a new axis for grid_sample and normalize the view index to [0, 1] 101 | sample_points_cam = torch.cat([sample_points_cam, i_view[..., None].float() / (N - 1)], dim=-1) 102 | 103 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 104 | sample_points_cam = sample_points_cam.reshape(B, T, Q, G, P, 1, 3) 105 | sample_points_cam = sample_points_cam.permute(0, 
1, 3, 2, 4, 5, 6) # [B, T, G, Q, P, 1, 3] 106 | sample_points_cam = sample_points_cam.reshape(B*T*G, Q, P, 3) 107 | sample_points_cam = sample_points_cam.contiguous() 108 | 109 | # reorganize the tensor to stack T and G to the batch dim for better parallelism 110 | scale_weights = scale_weights.reshape(B, Q, G, T, P, -1) 111 | scale_weights = scale_weights.permute(0, 2, 3, 1, 4, 5) 112 | scale_weights = scale_weights.reshape(B*G*T, Q, P, -1) 113 | scale_weights = scale_weights.contiguous() 114 | 115 | # multi-scale multi-view grid sample 116 | final = msmv_sampling(mlvl_feats, sample_points_cam, scale_weights) 117 | 118 | # reorganize the sampled features 119 | C = final.shape[2] # [BTG, Q, C, P] 120 | final = final.reshape(B, T, G, Q, C, P) 121 | final = final.permute(0, 3, 2, 1, 5, 4) 122 | final = final.flatten(3, 4) # [B, Q, G, FP, C] 123 | 124 | return final 125 | 126 | 127 | def sampling_pts_feats(sample_points, pts_feats, occ2lidar, pc_range): 128 | C = pts_feats.shape[1] 129 | B, Q, G, P, _ = sample_points.shape # [B, Q, G, P, 3] 130 | sample_points = sample_points.permute(0, 2, 1, 3, 4) 131 | sample_points = sample_points.reshape(B*G, Q, P, 3) # [BG, Q, P, 3] 132 | 133 | occ2lidar = occ2lidar[:, None, None, None, :, :] # [B, 1, 1, 1, 4, 4] 134 | occ2lidar = occ2lidar.expand(B, G, Q, P, 4, 4) 135 | occ2lidar = occ2lidar.reshape(B*G, Q, P, 4, 4) 136 | 137 | ones = torch.ones_like(sample_points[..., :1]) 138 | sample_points = torch.cat([sample_points, ones], dim=-1)[..., None] # [BG, Q, P, 4, 1] 139 | sample_points = torch.matmul(occ2lidar, sample_points).squeeze(-1) 140 | 141 | norm_sample_points = encode_points(sample_points[..., :3], pc_range) 142 | norm_sample_points = norm_sample_points[..., :2] * 2 - 1 # [BG, Q, P, 2] 143 | 144 | feat = F.grid_sample(pts_feats, norm_sample_points, padding_mode='zeros', align_corners=True) 145 | feat = feat.reshape(B, G, C, Q, P) 146 | feat = feat.permute(0, 3, 1, 4, 2) # [B, Q, G, P, C] 147 | 148 | return feat 149 | -------------------------------------------------------------------------------- /models/backbones/second_3d.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import warnings 3 | 4 | from mmcv.cnn import build_conv_layer, build_norm_layer 5 | from mmcv.runner import BaseModule 6 | from torch import nn as nn 7 | 8 | from mmdet3d.models.builder import BACKBONES 9 | 10 | from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE 11 | if IS_SPCONV2_AVAILABLE: 12 | from spconv.pytorch import SparseConvTensor, SparseSequential 13 | else: 14 | from mmcv.ops import SparseConvTensor, SparseSequential 15 | 16 | def make_sparse_convmodule(in_channels, 17 | out_channels, 18 | kernel_size, 19 | indice_key, 20 | stride=1, 21 | padding=0, 22 | conv_type='SubMConv3d', 23 | norm_cfg=None, 24 | order=('conv', 'norm', 'act')): 25 | """Make sparse convolution module. 26 | 27 | Args: 28 | in_channels (int): the number of input channels 29 | out_channels (int): the number of out channels 30 | kernel_size (int|tuple(int)): kernel size of convolution 31 | indice_key (str): the indice key used for sparse tensor 32 | stride (int|tuple(int)): the stride of convolution 33 | padding (int or list[int]): the padding number of input 34 | conv_type (str): sparse conv type in spconv 35 | norm_cfg (dict[str]): config of normalization layer 36 | order (tuple[str]): The order of conv/norm/activation layers. It is a 37 | sequence of "conv", "norm" and "act". 
Common examples are 38 | ("conv", "norm", "act") and ("act", "conv", "norm"). 39 | 40 | Returns: 41 | spconv.SparseSequential: sparse convolution module. 42 | """ 43 | assert isinstance(order, tuple) and len(order) <= 3 44 | assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'} 45 | 46 | conv_cfg = dict(type=conv_type, indice_key=indice_key) 47 | 48 | layers = list() 49 | for layer in order: 50 | if layer == 'conv': 51 | if conv_type not in [ 52 | 'SparseInverseConv3d', 'SparseInverseConv2d', 53 | 'SparseInverseConv1d' 54 | ]: 55 | layers.append( 56 | build_conv_layer( 57 | conv_cfg, 58 | in_channels, 59 | out_channels, 60 | kernel_size, 61 | stride=stride, 62 | padding=padding, 63 | bias=False)) 64 | else: 65 | layers.append( 66 | build_conv_layer( 67 | conv_cfg, 68 | in_channels, 69 | out_channels, 70 | kernel_size, 71 | bias=False)) 72 | elif layer == 'norm': 73 | layers.append(build_norm_layer(norm_cfg, out_channels)[1]) 74 | elif layer == 'act': 75 | layers.append(nn.ReLU(inplace=True)) 76 | 77 | layers = SparseSequential(*layers) 78 | return layers 79 | 80 | @BACKBONES.register_module() 81 | class SECOND_3d(BaseModule): 82 | """Backbone network for SECOND/PointPillars/PartA2/MVXNet. 83 | 84 | Args: 85 | in_channels (int): Input channels. 86 | out_channels (list[int]): Output channels for multi-scale feature maps. 87 | layer_nums (list[int]): Number of layers in each stage. 88 | layer_strides (list[int]): Strides of each stage. 89 | norm_cfg (dict): Config dict of normalization layers. 90 | conv_cfg (dict): Config dict of convolutional layers. 91 | 92 | use_sparse_conv: (int) the sparse conv layer,note that sparse conv can devoid 93 | the feature 94 | """ 95 | 96 | def __init__(self, 97 | in_channels=128, 98 | out_channels=[128, 128, 256], 99 | sparse_conv_cnt=0, 100 | layer_nums=[3, 5, 5], 101 | layer_strides=[2, 2, 2], 102 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 103 | conv_cfg=dict(type='Conv2d', bias=False), 104 | init_cfg=None, 105 | pretrained=None): 106 | super(SECOND_3d, self).__init__(init_cfg=init_cfg) 107 | assert len(layer_strides) == len(layer_nums) 108 | assert len(out_channels) == len(layer_nums) 109 | self.sparse_conv_cnt=sparse_conv_cnt 110 | 111 | in_filters = [in_channels, *out_channels[:-1]] 112 | # note that when stride > 1, conv2d with same padding isn't 113 | # equal to pad-conv2d. we should use pad-conv2d. 
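        # (added summary comment) Each stage below starts with one conv chosen by its
        # stride: a SparseConv3d (new indice_key, spatial downsampling) when
        # layer_strides[i] > 1, or a SubMConv3d when the stride is 1. The following
        # `layer_nums[i]` convs are submanifold (SubMConv3d), except for the trailing
        # `sparse_conv_cnt` ones, which use regular SparseConv3d and therefore let
        # features spread to previously empty voxels. Note that `self.sparse_conv_cnt`
        # is reset to 0 as soon as a strided stage is encountered, so that stage and
        # all later ones keep their inner layers purely submanifold.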
114 | blocks = [] 115 | sparse_cnt=0 116 | sparse_cnt2=0 117 | for i, layer_num in enumerate(layer_nums): 118 | if layer_strides[i]<=1: 119 | conv_cfg = dict(type='SubMConv3d', indice_key=f'subm_second_{sparse_cnt}') 120 | else: 121 | conv_cfg = dict(type='SparseConv3d', indice_key=f'spconv_second_{sparse_cnt2}') 122 | block = [ 123 | build_conv_layer( 124 | conv_cfg, 125 | in_filters[i], 126 | out_channels[i], 127 | 3, 128 | stride=layer_strides[i], 129 | padding=1, 130 | bias=False), 131 | build_norm_layer(norm_cfg, out_channels[i])[1], 132 | nn.ReLU(inplace=True), 133 | ] 134 | if layer_strides[i]>1: 135 | self.sparse_conv_cnt=0 136 | for j in range(layer_num): 137 | if layer_num-j>self.sparse_conv_cnt: 138 | sparse_cnt+=1 139 | conv_cfg = dict(type='SubMConv3d', indice_key=f'subm_second_{sparse_cnt}') 140 | block.append( 141 | build_conv_layer( 142 | conv_cfg, 143 | out_channels[i], 144 | out_channels[i], 145 | 3, 146 | padding=1)) 147 | block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) 148 | block.append(nn.ReLU(inplace=True)) 149 | else: 150 | conv_cfg = dict(type='SparseConv3d', indice_key=f'spconv_second_{sparse_cnt2}') 151 | block.append( 152 | build_conv_layer( 153 | conv_cfg, 154 | out_channels[i], 155 | out_channels[i], 156 | 3, 157 | padding=1)) 158 | block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) 159 | block.append(nn.ReLU(inplace=True)) 160 | sparse_cnt2+=1 161 | 162 | block = SparseSequential(*block) 163 | blocks.append(block) 164 | 165 | self.blocks = nn.ModuleList(blocks) 166 | 167 | 168 | def forward(self, x): 169 | """Forward function. 170 | 171 | Args: 172 | x (torch.Tensor): sparse tensor 173 | 174 | Returns: 175 | tuple[torch.Tensor]: Multi-scale features. 176 | """ 177 | outs = [] 178 | for i in range(len(self.blocks)): 179 | x = self.blocks[i](x) 180 | outs.append(x) 181 | return tuple(outs) 182 | -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-s_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 1200 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 2 38 | num_refines = [1, 4, 8, 16, 32, 64] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | norm_cfg=dict(type='BN2d', requires_grad=True), 47 | norm_eval=True, 48 | style='pytorch', 49 | with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 
| in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 129 | ] 130 | 131 | test_pipeline = [ 132 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 133 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 134 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 135 | dict( 136 | type='MultiScaleFlipAug3D', 137 | img_scale=(1600, 900), 138 | pts_scale_ratio=1, 139 | flip=False, 140 | transforms=[ 141 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 142 | dict(type='Collect3D', keys=['img'], meta_keys=( 143 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 144 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 145 | ]) 146 | ] 147 | 148 | data = dict( 149 | workers_per_gpu=4, 150 | train=dict( 151 | type=dataset_type, 152 | data_root=dataset_root, 153 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 154 | pipeline=train_pipeline, 155 | classes=object_names, 156 | modality=input_modality, 157 | test_mode=False, 158 | 
use_valid_flag=True, 159 | box_type_3d='LiDAR'), 160 | val=dict( 161 | type=dataset_type, 162 | data_root=dataset_root, 163 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 164 | pipeline=test_pipeline, 165 | classes=object_names, 166 | modality=input_modality, 167 | test_mode=True, 168 | box_type_3d='LiDAR'), 169 | test=dict( 170 | type=dataset_type, 171 | data_root=dataset_root, 172 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 173 | pipeline=test_pipeline, 174 | classes=object_names, 175 | modality=input_modality, 176 | test_mode=True, 177 | box_type_3d='LiDAR') 178 | ) 179 | 180 | optimizer = dict( 181 | type='AdamW', 182 | lr=2e-4, 183 | paramwise_cfg=dict(custom_keys={ 184 | 'img_backbone': dict(lr_mult=0.1), 185 | 'sampling_offset': dict(lr_mult=0.1), 186 | }), 187 | weight_decay=0.01 188 | ) 189 | 190 | optimizer_config = dict( 191 | type='Fp16OptimizerHook', 192 | loss_scale=512.0, 193 | grad_clip=dict(max_norm=35, norm_type=2) 194 | ) 195 | 196 | # learning policy 197 | lr_config = dict( 198 | policy='CosineAnnealing', 199 | warmup='linear', 200 | warmup_iters=500, 201 | warmup_ratio=1.0 / 3, 202 | min_lr_ratio=1e-3 203 | ) 204 | total_epochs = 100 205 | batch_size = 8 206 | 207 | # load pretrained weights 208 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 209 | revise_keys = [('backbone', 'img_backbone')] 210 | 211 | # resume the last training 212 | resume_from = None 213 | 214 | # checkpointing 215 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 216 | 217 | # logging 218 | log_config = dict( 219 | interval=1, 220 | hooks=[ 221 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 222 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 223 | ] 224 | ) 225 | 226 | # evaluation 227 | eval_config = dict(interval=total_epochs) 228 | 229 | # other flags 230 | debug = False 231 | -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-l_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 4800 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 2 38 | num_refines = [1, 2, 4, 8, 16, 16] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | norm_cfg=dict(type='BN2d', requires_grad=True), 47 | norm_eval=True, 48 | style='pytorch', 49 | 
with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 | in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 129 | 'ego2lidar', 'img_timestamp')) 130 | ] 131 | 132 | test_pipeline = [ 133 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 134 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 135 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 136 | dict( 137 | type='MultiScaleFlipAug3D', 138 | img_scale=(1600, 900), 139 | pts_scale_ratio=1, 140 | flip=False, 141 | transforms=[ 142 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 143 | dict(type='Collect3D', keys=['img'], meta_keys=( 144 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 145 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 146 | ]) 147 | ] 148 | 149 | data = dict( 150 | workers_per_gpu=4, 151 | train=dict( 152 | type=dataset_type, 153 | data_root=dataset_root, 154 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 155 | pipeline=train_pipeline, 156 | classes=object_names, 157 | 
modality=input_modality, 158 | test_mode=False, 159 | use_valid_flag=True, 160 | box_type_3d='LiDAR'), 161 | val=dict( 162 | type=dataset_type, 163 | data_root=dataset_root, 164 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 165 | pipeline=test_pipeline, 166 | classes=object_names, 167 | modality=input_modality, 168 | test_mode=True, 169 | box_type_3d='LiDAR'), 170 | test=dict( 171 | type=dataset_type, 172 | data_root=dataset_root, 173 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 174 | pipeline=test_pipeline, 175 | classes=object_names, 176 | modality=input_modality, 177 | test_mode=True, 178 | box_type_3d='LiDAR') 179 | ) 180 | 181 | optimizer = dict( 182 | type='AdamW', 183 | lr=2e-4, 184 | paramwise_cfg=dict(custom_keys={ 185 | 'img_backbone': dict(lr_mult=0.1), 186 | 'sampling_offset': dict(lr_mult=0.1), 187 | }), 188 | weight_decay=0.01 189 | ) 190 | 191 | optimizer_config = dict( 192 | type='Fp16OptimizerHook', 193 | loss_scale=512.0, 194 | grad_clip=dict(max_norm=35, norm_type=2) 195 | ) 196 | 197 | # learning policy 198 | lr_config = dict( 199 | policy='CosineAnnealing', 200 | warmup='linear', 201 | warmup_iters=500, 202 | warmup_ratio=1.0 / 3, 203 | min_lr_ratio=1e-3 204 | ) 205 | total_epochs = 100 206 | batch_size = 8 207 | 208 | # load pretrained weights 209 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 210 | revise_keys = [('backbone', 'img_backbone')] 211 | 212 | # resume the last training 213 | resume_from = None 214 | 215 | # checkpointing 216 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 217 | 218 | # logging 219 | log_config = dict( 220 | interval=1, 221 | hooks=[ 222 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 223 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 224 | ] 225 | ) 226 | 227 | # evaluation 228 | eval_config = dict(interval=total_epochs) 229 | 230 | # other flags 231 | debug = False -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-m_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 2400 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 2 38 | num_refines = [1, 2, 4, 8, 16, 32] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | norm_cfg=dict(type='BN2d', requires_grad=True), 47 | 
norm_eval=True, 48 | style='pytorch', 49 | with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 | in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 129 | 'ego2lidar', 'img_timestamp')) 130 | ] 131 | 132 | test_pipeline = [ 133 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 134 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 135 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 136 | dict( 137 | type='MultiScaleFlipAug3D', 138 | img_scale=(1600, 900), 139 | pts_scale_ratio=1, 140 | flip=False, 141 | transforms=[ 142 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 143 | dict(type='Collect3D', keys=['img'], meta_keys=( 144 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 145 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 146 | ]) 147 | ] 148 | 149 | data = dict( 150 | workers_per_gpu=4, 151 | train=dict( 152 | type=dataset_type, 153 | data_root=dataset_root, 154 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 155 | pipeline=train_pipeline, 
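        # Editorial annotation (not in the original config): the four opusv1 variants in this
        # folder (-t, -s, -m, -l) share this dataset and pipeline setup; they differ mainly in
        # num_query, num_points and num_refines defined in the "arch config" section above.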
156 | classes=object_names, 157 | modality=input_modality, 158 | test_mode=False, 159 | use_valid_flag=True, 160 | box_type_3d='LiDAR'), 161 | val=dict( 162 | type=dataset_type, 163 | data_root=dataset_root, 164 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 165 | pipeline=test_pipeline, 166 | classes=object_names, 167 | modality=input_modality, 168 | test_mode=True, 169 | box_type_3d='LiDAR'), 170 | test=dict( 171 | type=dataset_type, 172 | data_root=dataset_root, 173 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 174 | pipeline=test_pipeline, 175 | classes=object_names, 176 | modality=input_modality, 177 | test_mode=True, 178 | box_type_3d='LiDAR') 179 | ) 180 | 181 | optimizer = dict( 182 | type='AdamW', 183 | lr=2e-4, 184 | paramwise_cfg=dict(custom_keys={ 185 | 'img_backbone': dict(lr_mult=0.1), 186 | 'sampling_offset': dict(lr_mult=0.1), 187 | }), 188 | weight_decay=0.01 189 | ) 190 | 191 | optimizer_config = dict( 192 | type='Fp16OptimizerHook', 193 | loss_scale=512.0, 194 | grad_clip=dict(max_norm=35, norm_type=2) 195 | ) 196 | 197 | # learning policy 198 | lr_config = dict( 199 | policy='CosineAnnealing', 200 | warmup='linear', 201 | warmup_iters=500, 202 | warmup_ratio=1.0 / 3, 203 | min_lr_ratio=1e-3 204 | ) 205 | total_epochs = 100 206 | batch_size = 8 207 | 208 | # load pretrained weights 209 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 210 | revise_keys = [('backbone', 'img_backbone')] 211 | 212 | # resume the last training 213 | resume_from = None 214 | 215 | # checkpointing 216 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 217 | 218 | # logging 219 | log_config = dict( 220 | interval=1, 221 | hooks=[ 222 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 223 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 224 | ] 225 | ) 226 | 227 | # evaluation 228 | eval_config = dict(interval=total_epochs) 229 | 230 | # other flags 231 | debug = False 232 | -------------------------------------------------------------------------------- /configs/opusv1_nusc-occ3d/opusv1-t_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=False, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | voxel_size = [0.4, 0.4, 0.4] 30 | 31 | # arch config 32 | embed_dims = 256 33 | num_layers = 6 34 | num_query = 600 35 | num_frames = 8 36 | num_levels = 4 37 | num_points = 4 38 | num_refines = [1, 4, 16, 32, 64, 128] 39 | 40 | img_backbone = dict( 41 | type='ResNet', 42 | depth=50, 43 | num_stages=4, 44 | out_indices=(0, 1, 2, 3), 45 | frozen_stages=1, 46 | 
norm_cfg=dict(type='BN2d', requires_grad=True), 47 | norm_eval=True, 48 | style='pytorch', 49 | with_cp=True) 50 | img_neck = dict( 51 | type='FPN', 52 | in_channels=[256, 512, 1024, 2048], 53 | out_channels=embed_dims, 54 | num_outs=num_levels) 55 | img_norm_cfg = dict( 56 | mean=[123.675, 116.280, 103.530], 57 | std=[58.395, 57.120, 57.375], 58 | to_rgb=True) 59 | 60 | model = dict( 61 | type='OPUSV1', 62 | use_grid_mask=False, 63 | data_aug=dict( 64 | img_color_aug=True, # Move some augmentations to GPU 65 | img_norm_cfg=img_norm_cfg, 66 | img_pad_cfg=dict(size_divisor=32)), 67 | stop_prev_grad=0, 68 | img_backbone=img_backbone, 69 | img_neck=img_neck, 70 | pts_bbox_head=dict( 71 | type='OPUSV1Head', 72 | num_classes=len(occ_names), 73 | in_channels=embed_dims, 74 | num_query=num_query, 75 | pc_range=point_cloud_range, 76 | voxel_size=voxel_size, 77 | transformer=dict( 78 | type='OPUSV1Transformer', 79 | embed_dims=embed_dims, 80 | num_frames=num_frames, 81 | num_points=num_points, 82 | num_layers=num_layers, 83 | num_levels=num_levels, 84 | num_classes=len(occ_names), 85 | num_refines=num_refines, 86 | scales=[0.5], 87 | pc_range=point_cloud_range), 88 | loss_cls=dict( 89 | type='FocalLoss', 90 | use_sigmoid=True, 91 | gamma=2.0, 92 | alpha=0.25, 93 | loss_weight=2.0), 94 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 95 | train_cfg=dict( 96 | pts=dict( 97 | cls_weights=[ 98 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 99 | ) 100 | ), 101 | test_cfg=dict( 102 | pts=dict( 103 | score_thr=0.5, 104 | padding=True 105 | ) 106 | ) 107 | ) 108 | 109 | ida_aug_conf = { 110 | 'resize_lim': (0.38, 0.55), 111 | 'final_dim': (256, 704), 112 | 'bot_pct_lim': (0.0, 0.0), 113 | 'rot_lim': (0.0, 0.0), 114 | 'H': 900, 'W': 1600, 115 | 'rand_flip': True, 116 | } 117 | 118 | train_pipeline = [ 119 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 120 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 121 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 122 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 123 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 124 | dict(type='ObjectNameFilter', classes=object_names), 125 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 126 | dict(type='DefaultFormatBundle3D', class_names=object_names), 127 | dict(type='Collect3D', keys=['img', 'voxel_semantics', 'mask_camera'], meta_keys=( 128 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 129 | 'ego2lidar', 'img_timestamp')) 130 | ] 131 | 132 | test_pipeline = [ 133 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 134 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 135 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 136 | dict( 137 | type='MultiScaleFlipAug3D', 138 | img_scale=(1600, 900), 139 | pts_scale_ratio=1, 140 | flip=False, 141 | transforms=[ 142 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 143 | dict(type='Collect3D', keys=['img'], meta_keys=( 144 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 145 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 146 | ]) 147 | ] 148 | 149 | data = dict( 150 | workers_per_gpu=4, 151 | train=dict( 152 | type=dataset_type, 153 | data_root=dataset_root, 154 | ann_file=dataset_root + 
'nuscenes_infos_train_sweep.pkl', 155 | pipeline=train_pipeline, 156 | classes=object_names, 157 | modality=input_modality, 158 | test_mode=False, 159 | use_valid_flag=True, 160 | box_type_3d='LiDAR'), 161 | val=dict( 162 | type=dataset_type, 163 | data_root=dataset_root, 164 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 165 | pipeline=test_pipeline, 166 | classes=object_names, 167 | modality=input_modality, 168 | test_mode=True, 169 | box_type_3d='LiDAR'), 170 | test=dict( 171 | type=dataset_type, 172 | data_root=dataset_root, 173 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 174 | pipeline=test_pipeline, 175 | classes=object_names, 176 | modality=input_modality, 177 | test_mode=True, 178 | box_type_3d='LiDAR') 179 | ) 180 | 181 | optimizer = dict( 182 | type='AdamW', 183 | lr=2e-4, 184 | paramwise_cfg=dict(custom_keys={ 185 | 'img_backbone': dict(lr_mult=0.1), 186 | 'sampling_offset': dict(lr_mult=0.1), 187 | }), 188 | weight_decay=0.01 189 | ) 190 | 191 | optimizer_config = dict( 192 | type='Fp16OptimizerHook', 193 | loss_scale=512.0, 194 | grad_clip=dict(max_norm=35, norm_type=2) 195 | ) 196 | 197 | # learning policy 198 | lr_config = dict( 199 | policy='CosineAnnealing', 200 | warmup='linear', 201 | warmup_iters=500, 202 | warmup_ratio=1.0 / 3, 203 | min_lr_ratio=1e-3 204 | ) 205 | total_epochs = 100 206 | batch_size = 1 207 | 208 | # load pretrained weights 209 | load_from = 'pretrain/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth' 210 | revise_keys = [('backbone', 'img_backbone')] 211 | 212 | # resume the last training 213 | resume_from = None 214 | 215 | # checkpointing 216 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 217 | 218 | # logging 219 | log_config = dict( 220 | interval=1, 221 | hooks=[ 222 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 223 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 224 | ] 225 | ) 226 | 227 | # evaluation 228 | eval_config = dict(interval=total_epochs) 229 | 230 | # other flags 231 | debug = False 232 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import torch 5 | import shutil 6 | import logging 7 | import datetime 8 | from mmcv.runner.hooks import HOOKS 9 | from mmcv.runner.hooks.logger import LoggerHook, TextLoggerHook 10 | from mmcv.runner.dist_utils import master_only 11 | from torch.utils.tensorboard import SummaryWriter 12 | 13 | 14 | def init_logging(filename=None, debug=False): 15 | logging.root = logging.RootLogger('DEBUG' if debug else 'INFO') 16 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] - %(message)s') 17 | 18 | stream_handler = logging.StreamHandler(sys.stdout) 19 | stream_handler.setFormatter(formatter) 20 | logging.root.addHandler(stream_handler) 21 | 22 | if filename is not None: 23 | file_handler = logging.FileHandler(filename) 24 | file_handler.setFormatter(formatter) 25 | logging.root.addHandler(file_handler) 26 | 27 | 28 | def backup_code(work_dir, verbose=False): 29 | base_dir = os.path.dirname(os.path.abspath(__file__)) 30 | for pattern in ['*.py', 'configs/*.py', 'models/*.py', 'loaders/*.py', 'loaders/pipelines/*.py']: 31 | for file in glob.glob(pattern): 32 | src = os.path.join(base_dir, file) 33 | dst = os.path.join(work_dir, 'backup', os.path.dirname(file)) 34 | 35 | if verbose: 36 | logging.info('Copying %s -> %s' % 
(os.path.relpath(src), os.path.relpath(dst))) 37 | 38 | os.makedirs(dst, exist_ok=True) 39 | shutil.copy2(src, dst) 40 | 41 | 42 | @HOOKS.register_module() 43 | class MyTextLoggerHook(TextLoggerHook): 44 | def _log_info(self, log_dict, runner): 45 | # print exp name for users to distinguish experiments 46 | # at every ``interval_exp_name`` iterations and the end of each epoch 47 | if runner.meta is not None and 'exp_name' in runner.meta: 48 | if (self.every_n_iters(runner, self.interval_exp_name)) or ( 49 | self.by_epoch and self.end_of_epoch(runner)): 50 | exp_info = f'Exp name: {runner.meta["exp_name"]}' 51 | runner.logger.info(exp_info) 52 | 53 | # by epoch: Epoch [4][100/1000] 54 | # by iter: Iter [100/100000] 55 | if self.by_epoch: 56 | log_str = f'Epoch [{log_dict["epoch"]}/{runner.max_epochs}]' \ 57 | f'[{log_dict["iter"]}/{len(runner.data_loader)}] ' 58 | else: 59 | log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}] ' 60 | 61 | log_str += 'loss: %.2f, ' % log_dict['loss'] 62 | 63 | if 'time' in log_dict.keys(): 64 | # MOD: skip the first iteration since it's not accurate 65 | if runner.iter == self.start_iter: 66 | time_sec_avg = log_dict['time'] 67 | else: 68 | self.time_sec_tot += (log_dict['time'] * self.interval) 69 | time_sec_avg = self.time_sec_tot / (runner.iter - self.start_iter) 70 | 71 | eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) 72 | eta_str = str(datetime.timedelta(seconds=int(eta_sec))) 73 | log_str += f'eta: {eta_str}, ' 74 | log_str += f'time: {log_dict["time"]:.2f}s, ' \ 75 | f'data: {log_dict["data_time"] * 1000:.0f}ms, ' 76 | # statistic memory 77 | if torch.cuda.is_available(): 78 | log_str += f'mem: {log_dict["memory"]}M' 79 | 80 | runner.logger.info(log_str) 81 | 82 | def log(self, runner): 83 | if 'eval_iter_num' in runner.log_buffer.output: 84 | # this doesn't modify runner.iter and is regardless of by_epoch 85 | cur_iter = runner.log_buffer.output.pop('eval_iter_num') 86 | else: 87 | cur_iter = self.get_iter(runner, inner_iter=True) 88 | 89 | log_dict = { 90 | 'mode': self.get_mode(runner), 91 | 'epoch': self.get_epoch(runner), 92 | 'iter': cur_iter 93 | } 94 | 95 | # only record lr of the first param group 96 | cur_lr = runner.current_lr() 97 | if isinstance(cur_lr, list): 98 | log_dict['lr'] = cur_lr[0] 99 | else: 100 | assert isinstance(cur_lr, dict) 101 | log_dict['lr'] = {} 102 | for k, lr_ in cur_lr.items(): 103 | assert isinstance(lr_, list) 104 | log_dict['lr'].update({k: lr_[0]}) 105 | 106 | if 'time' in runner.log_buffer.output: 107 | # statistic memory 108 | if torch.cuda.is_available(): 109 | log_dict['memory'] = self._get_max_memory(runner) 110 | 111 | log_dict = dict(log_dict, **runner.log_buffer.output) 112 | 113 | # MOD: disable writing to files 114 | # self._dump_log(log_dict, runner) 115 | self._log_info(log_dict, runner) 116 | 117 | return log_dict 118 | 119 | def after_train_epoch(self, runner): 120 | if runner.log_buffer.ready: 121 | metrics = self.get_loggable_tags(runner) 122 | runner.logger.info('--- Evaluation Results ---') 123 | runner.logger.info('mAP: %.4f' % metrics['val/pts_bbox_NuScenes/mAP']) 124 | runner.logger.info('mATE: %.4f' % metrics['val/pts_bbox_NuScenes/mATE']) 125 | runner.logger.info('mASE: %.4f' % metrics['val/pts_bbox_NuScenes/mASE']) 126 | runner.logger.info('mAOE: %.4f' % metrics['val/pts_bbox_NuScenes/mAOE']) 127 | runner.logger.info('mAVE: %.4f' % metrics['val/pts_bbox_NuScenes/mAVE']) 128 | runner.logger.info('mAAE: %.4f' % metrics['val/pts_bbox_NuScenes/mAAE']) 129 | 
runner.logger.info('NDS: %.4f' % metrics['val/pts_bbox_NuScenes/NDS']) 130 | 131 | 132 | @HOOKS.register_module() 133 | class MyTensorboardLoggerHook(LoggerHook): 134 | def __init__(self, log_dir=None, interval=10, ignore_last=True, reset_flag=False, by_epoch=True): 135 | super(MyTensorboardLoggerHook, self).__init__( 136 | interval, ignore_last, reset_flag, by_epoch) 137 | self.log_dir = log_dir 138 | 139 | @master_only 140 | def before_run(self, runner): 141 | super(MyTensorboardLoggerHook, self).before_run(runner) 142 | if self.log_dir is None: 143 | self.log_dir = runner.work_dir 144 | self.writer = SummaryWriter(self.log_dir) 145 | 146 | @master_only 147 | def log(self, runner): 148 | tags = self.get_loggable_tags(runner) 149 | 150 | for key, value in tags.items(): 151 | # MOD: merge into the 'train' group 152 | if key == 'learning_rate': 153 | key = 'train/learning_rate' 154 | 155 | # MOD: skip momentum 156 | ignore = False 157 | if key == 'momentum': 158 | ignore = True 159 | 160 | # MOD: skip intermediate losses 161 | for i in range(5): 162 | if key[:13] == 'train/d%d.loss' % i: 163 | ignore = True 164 | 165 | if key[:3] == 'val': 166 | metric_name = key[22:] 167 | if metric_name in ['mAP', 'mATE', 'mASE', 'mAOE', 'mAVE', 'mAAE', 'NDS']: 168 | key = 'val/' + metric_name 169 | else: 170 | ignore = True 171 | 172 | if self.get_mode(runner) == 'train' and key[:5] != 'train': 173 | ignore = True 174 | 175 | if self.get_mode(runner) != 'train' and key[:3] != 'val': 176 | ignore = True 177 | 178 | if ignore: 179 | continue 180 | 181 | if key[:5] == 'train': 182 | self.writer.add_scalar(key, value, self.get_iter(runner)) 183 | elif key[:3] == 'val': 184 | self.writer.add_scalar(key, value, self.get_epoch(runner)) 185 | 186 | @master_only 187 | def after_run(self, runner): 188 | self.writer.close() 189 | -------------------------------------------------------------------------------- /loaders/nuscenes_occupancy_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import mmcv 3 | import numpy as np 4 | import torch 5 | import pickle 6 | import os.path as osp 7 | from tqdm import tqdm 8 | from mmdet.datasets import DATASETS 9 | from mmdet3d.datasets import NuScenesDataset 10 | from nuscenes.eval.common.utils import Quaternion 11 | from nuscenes.utils.geometry_utils import transform_matrix 12 | from torch.utils.data import DataLoader 13 | from models.utils import sparse2dense 14 | from .utils import compose_ego2img 15 | from .old_metrics import Metric_mIoU_Occupancy 16 | 17 | 18 | @DATASETS.register_module() 19 | class NuScenesOccupancyDataset(NuScenesDataset): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(filter_empty_gt=False, *args, **kwargs) 22 | self.data_infos = self.load_annotations(self.ann_file) 23 | 24 | def collect_cam_sweeps(self, index, into_past=150, into_future=0): 25 | all_sweeps_prev = [] 26 | curr_index = index 27 | while len(all_sweeps_prev) < into_past: 28 | curr_sweeps = self.data_infos[curr_index]['cam_sweeps'] 29 | if len(curr_sweeps) == 0: 30 | break 31 | all_sweeps_prev.extend(curr_sweeps) 32 | all_sweeps_prev.append(self.data_infos[curr_index - 1]['cams']) 33 | curr_index = curr_index - 1 34 | 35 | all_sweeps_next = [] 36 | curr_index = index + 1 37 | while len(all_sweeps_next) < into_future: 38 | if curr_index >= len(self.data_infos): 39 | break 40 | curr_sweeps = self.data_infos[curr_index]['cam_sweeps'] 41 | all_sweeps_next.extend(curr_sweeps[::-1]) 42 | 
all_sweeps_next.append(self.data_infos[curr_index]['cams']) 43 | curr_index = curr_index + 1 44 | 45 | return all_sweeps_prev, all_sweeps_next 46 | 47 | def collect_lidar_sweeps(self, index, into_past=20, into_future=0): 48 | all_sweeps_prev = [] 49 | curr_index = index 50 | while len(all_sweeps_prev) < into_past: 51 | curr_sweeps = self.data_infos[curr_index]['lidar_sweeps'] 52 | if len(curr_sweeps) == 0: 53 | break 54 | all_sweeps_prev.extend(curr_sweeps) 55 | curr_index = curr_index - 1 56 | 57 | all_sweeps_next = [] 58 | curr_index = index + 1 59 | last_timestamp = self.data_infos[index]['timestamp'] 60 | while len(all_sweeps_next) < into_future: 61 | if curr_index >= len(self.data_infos): 62 | break 63 | curr_sweeps = self.data_infos[curr_index]['lidar_sweeps'][::-1] 64 | if curr_sweeps[0]['timestamp'] == last_timestamp: 65 | curr_sweeps = curr_sweeps[1:] 66 | all_sweeps_next.extend(curr_sweeps) 67 | curr_index = curr_index + 1 68 | last_timestamp = all_sweeps_next[-1]['timestamp'] 69 | 70 | return all_sweeps_prev, all_sweeps_next 71 | 72 | def get_data_info(self, index): 73 | info = self.data_infos[index] 74 | 75 | ego2global_translation = info['ego2global_translation'] 76 | ego2global_rotation = info['ego2global_rotation'] 77 | lidar2ego_translation = info['lidar2ego_translation'] 78 | lidar2ego_rotation = info['lidar2ego_rotation'] 79 | ego2global_rotation_mat = Quaternion(ego2global_rotation).rotation_matrix 80 | lidar2ego_rotation_mat = Quaternion(lidar2ego_rotation).rotation_matrix 81 | ego2lidar = transform_matrix( 82 | lidar2ego_translation, Quaternion(lidar2ego_rotation), inverse=True) 83 | 84 | input_dict = dict( 85 | sample_token=info['token'], 86 | scene_name=info['scene_name'], 87 | scene_token=info['scene_token'], 88 | lidar_token=info['lidar_token'], 89 | timestamp=info['timestamp'] / 1e6, 90 | ego2lidar=ego2lidar, 91 | ego2obj=ego2lidar, 92 | ego2occ=ego2lidar, 93 | ego2global_translation=ego2global_translation, 94 | ego2global_rotation=ego2global_rotation_mat, 95 | lidar2ego_translation=lidar2ego_translation, 96 | lidar2ego_rotation=lidar2ego_rotation_mat, 97 | ) 98 | 99 | if self.modality['use_lidar']: 100 | lidar_sweeps_prev, lidar_sweeps_next = self.collect_lidar_sweeps(index) 101 | input_dict.update(dict( 102 | pts_filename=info['lidar_path'], 103 | lidar_sweeps={'prev': lidar_sweeps_prev, 'next': lidar_sweeps_next}, 104 | )) 105 | 106 | if self.modality['use_camera']: 107 | img_paths = [] 108 | img_timestamps = [] 109 | ego2img = [] 110 | 111 | for _, cam_info in info['cams'].items(): 112 | img_paths.append(os.path.relpath(cam_info['data_path'])) 113 | img_timestamps.append(cam_info['timestamp'] / 1e6) 114 | ego2img.append( 115 | compose_ego2img( 116 | ego2global_translation, 117 | ego2global_rotation_mat, 118 | cam_info['sensor2global_translation'], 119 | cam_info['sensor2global_rotation'].T, 120 | cam_info['cam_intrinsic'] 121 | ) 122 | ) 123 | 124 | cam_sweeps_prev, cam_sweeps_next = self.collect_cam_sweeps(index) 125 | 126 | input_dict.update(dict( 127 | img_filename=img_paths, 128 | img_timestamp=img_timestamps, 129 | ego2img=ego2img, 130 | cam_sweeps={'prev': cam_sweeps_prev, 'next': cam_sweeps_next}, 131 | )) 132 | 133 | if not self.test_mode: 134 | annos = self.get_ann_info(index) 135 | input_dict['ann_info'] = annos 136 | 137 | return input_dict 138 | 139 | def evaluate(self, occ_results, runner=None, show_dir=None, **eval_kwargs): 140 | occ_gts = [] 141 | occ_preds = [] 142 | lidar_origins = [] 143 | 144 | print('\nStarting Evaluation...') 145 | metric 
= Metric_mIoU_Occupancy() 146 | 147 | occ_class_names = [ 148 | 'noise', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 149 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 150 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 'vegetation' 151 | ] 152 | ignore_class_names=['noise'] 153 | pc_range = np.array([-51.2, -51.2, -5.0, 51.2, 51.2, 3]) 154 | voxel_size = np.array([0.2, 0.2, 0.2]) 155 | voxel_num = ((pc_range[3:] - pc_range[:3]) / voxel_size).astype(np.int64) 156 | 157 | from tqdm import tqdm 158 | for i in tqdm(range(len(occ_results))): 159 | result_dict = occ_results[i] 160 | info = self.get_data_info(i) 161 | 162 | scene_token, lidar_token = info['scene_token'], info['lidar_token'] 163 | occ_root = 'data/nuscenes/openoccupancy/' 164 | occ_file = osp.join(occ_root, f'scene_{scene_token}', 'occupancy', f'{lidar_token}.npy') 165 | # load lidar and camera visible label 166 | occ_labels = np.load(occ_file) 167 | coors, labels = occ_labels[:, :3], occ_labels[:, 3] 168 | occ_labels, _ = sparse2dense(coors[:, ::-1], labels, voxel_num, empty_value=len(occ_class_names)) 169 | mask = occ_labels != 0 # ignore noise 170 | 171 | curr_class_names = [n for n in occ_class_names if n not in ignore_class_names] 172 | curr_bg_class_idx = len(curr_class_names) # 16 173 | label_mapper = [curr_class_names.index(n) if n in curr_class_names else 16 174 | for n in occ_class_names] + [curr_bg_class_idx] 175 | label_mapper = np.array(label_mapper) 176 | occ_labels = label_mapper[occ_labels] 177 | 178 | occ_pred, _ = sparse2dense(result_dict['occ_loc'], result_dict['sem_pred'], voxel_num, 16) 179 | metric.add_batch(occ_pred, occ_labels, mask) 180 | 181 | mIoU, IoU = metric.count_miou() 182 | return {'mIoU': mIoU, 'IoU': IoU} 183 | 184 | def format_results(self, occ_results, submission_prefix, **kwargs): 185 | if submission_prefix is not None: 186 | mmcv.mkdir_or_exist(submission_prefix) 187 | 188 | for index, occ_pred in enumerate(tqdm(occ_results)): 189 | info = self.data_infos[index] 190 | sample_token = info['token'] 191 | save_path=os.path.join(submission_prefix, '{}.npz'.format(sample_token)) 192 | np.savez_compressed(save_path,occ_pred.astype(np.uint8)) 193 | print('\nFinished.') -------------------------------------------------------------------------------- /loaders/ray_metrics.py: -------------------------------------------------------------------------------- 1 | # Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting 2 | # Modified by Haisong Liu 3 | import math 4 | import copy 5 | import numpy as np 6 | import torch 7 | from torch.utils.cpp_extension import load 8 | from tqdm import tqdm 9 | from prettytable import PrettyTable 10 | 11 | 12 | dvr = load("dvr", sources=["lib/dvr/dvr.cpp", "lib/dvr/dvr.cu"], verbose=True, extra_cuda_cflags=['-allow-unsupported-compiler']) 13 | _pc_range = [-40, -40, -1.0, 40, 40, 5.4] 14 | _voxel_size = 0.4 15 | 16 | occ_class_names = [ 17 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 18 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 19 | 'driveable_surface', 'other_flat', 'sidewalk', 20 | 'terrain', 'manmade', 'vegetation', 'free' 21 | ] 22 | 23 | # https://github.com/tarashakhurana/4d-occ-forecasting/blob/ff986082cd6ea10e67ab7839bf0e654736b3f4e2/test_fgbg.py#L29C1-L46C16 24 | def get_rendered_pcds(origin, points, tindex, pred_dist): 25 | pcds = [] 26 | 27 | for t in range(len(origin)): 28 | mask = (tindex == t) 29 | # skip the ones with no data 30 | if 
not mask.any(): 31 | continue 32 | _pts = points[mask, :3] 33 | # use ground truth lidar points for the raycasting direction 34 | v = _pts - origin[t][None, :] 35 | d = v / np.sqrt((v ** 2).sum(axis=1, keepdims=True)) 36 | pred_pts = origin[t][None, :] + d * pred_dist[mask][:, None] 37 | pcds.append(torch.from_numpy(pred_pts)) 38 | 39 | return pcds 40 | 41 | 42 | def meshgrid3d(occ_size, pc_range): 43 | W, H, D = occ_size 44 | 45 | xs = torch.linspace(0.5, W - 0.5, W).view(W, 1, 1).expand(W, H, D) / W 46 | ys = torch.linspace(0.5, H - 0.5, H).view(1, H, 1).expand(W, H, D) / H 47 | zs = torch.linspace(0.5, D - 0.5, D).view(1, 1, D).expand(W, H, D) / D 48 | xs = xs * (pc_range[3] - pc_range[0]) + pc_range[0] 49 | ys = ys * (pc_range[4] - pc_range[1]) + pc_range[1] 50 | zs = zs * (pc_range[5] - pc_range[2]) + pc_range[2] 51 | xyz = torch.stack((xs, ys, zs), -1) 52 | 53 | return xyz 54 | 55 | 56 | def generate_lidar_rays(): 57 | # prepare lidar ray angles 58 | pitch_angles = [] 59 | for k in range(10): 60 | angle = math.pi / 2 - math.atan(k + 1) 61 | pitch_angles.append(-angle) 62 | 63 | # nuscenes lidar fov: [0.2107773983152201, -0.5439104895672159] (rad) 64 | while pitch_angles[-1] < 0.21: 65 | delta = pitch_angles[-1] - pitch_angles[-2] 66 | pitch_angles.append(pitch_angles[-1] + delta) 67 | 68 | lidar_rays = [] 69 | for pitch_angle in pitch_angles: 70 | for azimuth_angle in np.arange(0, 360, 1): 71 | azimuth_angle = np.deg2rad(azimuth_angle) 72 | 73 | x = np.cos(pitch_angle) * np.cos(azimuth_angle) 74 | y = np.cos(pitch_angle) * np.sin(azimuth_angle) 75 | z = np.sin(pitch_angle) 76 | 77 | lidar_rays.append((x, y, z)) 78 | 79 | return np.array(lidar_rays, dtype=np.float32) 80 | 81 | 82 | def process_one_sample(sem_pred, lidar_rays, output_origin): 83 | # lidar origin in ego coordinate 84 | # lidar_origin = torch.tensor([[[0.9858, 0.0000, 1.8402]]]) 85 | T = output_origin.shape[1] 86 | pred_pcds_t = [] 87 | 88 | free_id = len(occ_class_names) - 1 89 | occ_pred = copy.deepcopy(sem_pred) 90 | occ_pred[sem_pred < free_id] = 1 91 | occ_pred[sem_pred == free_id] = 0 92 | occ_pred = torch.from_numpy(occ_pred).permute(2, 1, 0) 93 | occ_pred = occ_pred[None, None, :].contiguous().float() 94 | 95 | offset = torch.Tensor(_pc_range[:3])[None, None, :] 96 | scaler = torch.Tensor([_voxel_size] * 3)[None, None, :] 97 | 98 | lidar_tindex = torch.zeros([1, lidar_rays.shape[0]]) 99 | 100 | for t in range(T): 101 | lidar_origin = output_origin[:, t:t+1, :] # [1, 1, 3] 102 | lidar_endpts = lidar_rays[None] + lidar_origin # [1, 15840, 3] 103 | 104 | output_origin_render = ((lidar_origin - offset) / scaler).float() # [1, 1, 3] 105 | output_points_render = ((lidar_endpts - offset) / scaler).float() # [1, N, 3] 106 | output_tindex_render = lidar_tindex # [1, N], all zeros 107 | 108 | with torch.no_grad(): 109 | pred_dist, _, coord_index = dvr.render_forward( 110 | occ_pred.cuda(), 111 | output_origin_render.cuda(), 112 | output_points_render.cuda(), 113 | output_tindex_render.cuda(), 114 | [1, 16, 200, 200], 115 | "test" 116 | ) 117 | pred_dist *= _voxel_size 118 | 119 | pred_pcds = get_rendered_pcds( 120 | lidar_origin[0].cpu().numpy(), 121 | lidar_endpts[0].cpu().numpy(), 122 | lidar_tindex[0].cpu().numpy(), 123 | pred_dist[0].cpu().numpy() 124 | ) 125 | coord_index = coord_index[0, :, :].int().cpu() # [N, 3] 126 | 127 | pred_label = torch.from_numpy(sem_pred[coord_index[:, 0], coord_index[:, 1], coord_index[:, 2]])[:, None] # [N, 1] 128 | pred_dist = pred_dist[0, :, None].cpu() 129 | pred_pcds = 
torch.cat([pred_label.float(), pred_dist], dim=-1) 130 | 131 | pred_pcds_t.append(pred_pcds) 132 | 133 | pred_pcds_t = torch.cat(pred_pcds_t, dim=0) 134 | 135 | return pred_pcds_t.numpy() 136 | 137 | 138 | def calc_metrics(pcd_pred_list, pcd_gt_list): 139 | thresholds = [1, 2, 4] 140 | 141 | gt_cnt = np.zeros([len(occ_class_names)]) 142 | pred_cnt = np.zeros([len(occ_class_names)]) 143 | tp_cnt = np.zeros([len(thresholds), len(occ_class_names)]) 144 | 145 | for pcd_pred, pcd_gt in zip(pcd_pred_list, pcd_gt_list): 146 | for j, threshold in enumerate(thresholds): 147 | # L1 148 | depth_pred = pcd_pred[:, 1] 149 | depth_gt = pcd_gt[:, 1] 150 | l1_error = np.abs(depth_pred - depth_gt) 151 | tp_dist_mask = (l1_error < threshold) 152 | 153 | for i, cls in enumerate(occ_class_names): 154 | cls_id = occ_class_names.index(cls) 155 | cls_mask_pred = (pcd_pred[:, 0] == cls_id) 156 | cls_mask_gt = (pcd_gt[:, 0] == cls_id) 157 | 158 | gt_cnt_i = cls_mask_gt.sum() 159 | pred_cnt_i = cls_mask_pred.sum() 160 | if j == 0: 161 | gt_cnt[i] += gt_cnt_i 162 | pred_cnt[i] += pred_cnt_i 163 | 164 | tp_cls = cls_mask_gt & cls_mask_pred # [N] 165 | tp_mask = np.logical_and(tp_cls, tp_dist_mask) 166 | tp_cnt[j][i] += tp_mask.sum() 167 | 168 | iou_list = [] 169 | for j, threshold in enumerate(thresholds): 170 | iou_list.append((tp_cnt[j] / (gt_cnt + pred_cnt - tp_cnt[j]))[:-1]) 171 | 172 | return iou_list 173 | 174 | 175 | def main(sem_pred_list, sem_gt_list, lidar_origin_list): 176 | torch.cuda.empty_cache() 177 | 178 | # generate lidar rays 179 | lidar_rays = generate_lidar_rays() 180 | lidar_rays = torch.from_numpy(lidar_rays) 181 | 182 | pcd_pred_list, pcd_gt_list = [], [] 183 | for sem_pred, sem_gt, lidar_origins in tqdm(zip(sem_pred_list, sem_gt_list, lidar_origin_list), ncols=50): 184 | sem_pred = np.reshape(sem_pred, [200, 200, 16]) 185 | sem_gt = np.reshape(sem_gt, [200, 200, 16]) 186 | 187 | pcd_pred = process_one_sample(sem_pred, lidar_rays, lidar_origins) 188 | pcd_gt = process_one_sample(sem_gt, lidar_rays, lidar_origins) 189 | 190 | # evalute on non-free rays 191 | valid_mask = (pcd_gt[:, 0].astype(np.int32) != len(occ_class_names) - 1) 192 | pcd_pred = pcd_pred[valid_mask] 193 | pcd_gt = pcd_gt[valid_mask] 194 | 195 | assert pcd_pred.shape == pcd_gt.shape 196 | pcd_pred_list.append(pcd_pred) 197 | pcd_gt_list.append(pcd_gt) 198 | 199 | iou_list = calc_metrics(pcd_pred_list, pcd_gt_list) 200 | rayiou = np.nanmean(iou_list) 201 | rayiou_0 = np.nanmean(iou_list[0]) 202 | rayiou_1 = np.nanmean(iou_list[1]) 203 | rayiou_2 = np.nanmean(iou_list[2]) 204 | 205 | table = PrettyTable([ 206 | 'Class Names', 207 | 'RayIoU@1', 'RayIoU@2', 'RayIoU@4' 208 | ]) 209 | table.float_format = '.3' 210 | 211 | for i in range(len(occ_class_names) - 1): 212 | table.add_row([ 213 | occ_class_names[i], 214 | iou_list[0][i], iou_list[1][i], iou_list[2][i] 215 | ], divider=(i == len(occ_class_names) - 2)) 216 | 217 | table.add_row(['MEAN', rayiou_0, rayiou_1, rayiou_2]) 218 | 219 | print(table) 220 | 221 | torch.cuda.empty_cache() 222 | 223 | return { 224 | 'RayIoU': rayiou, 225 | 'RayIoU@1': rayiou_0, 226 | 'RayIoU@2': rayiou_1, 227 | 'RayIoU@4': rayiou_2, 228 | } 229 | -------------------------------------------------------------------------------- /models/backbones/eva02/batch_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
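# Editorial note (hedged sketch, not part of the original file). The helpers below are
# typically used to freeze backbone BatchNorm statistics or to build a norm layer by name:
#   backbone = FrozenBatchNorm2d.convert_frozen_batchnorm(backbone)  # recursive, in-place conversion
#   norm_layer = get_norm("FrozenBN", 256)                           # 256 output channels (illustrative)
# Both names refer to the class and function defined later in this same file.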
2 | import torch 3 | import torch.distributed as dist 4 | from fvcore.nn.distributed import differentiable_all_reduce 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | from .wrappers import BatchNorm2d 9 | 10 | 11 | class FrozenBatchNorm2d(nn.Module): 12 | """ 13 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 14 | 15 | It contains non-trainable buffers called 16 | "weight" and "bias", "running_mean", "running_var", 17 | initialized to perform identity transformation. 18 | 19 | The pre-trained backbone models from Caffe2 only contain "weight" and "bias", 20 | which are computed from the original four parameters of BN. 21 | The affine transform `x * weight + bias` will perform the equivalent 22 | computation of `(x - running_mean) / sqrt(running_var) * weight + bias`. 23 | When loading a backbone model from Caffe2, "running_mean" and "running_var" 24 | will be left unchanged as identity transformation. 25 | 26 | Other pre-trained backbone models may contain all 4 parameters. 27 | 28 | The forward is implemented by `F.batch_norm(..., training=False)`. 29 | """ 30 | 31 | _version = 3 32 | 33 | def __init__(self, num_features, eps=1e-5): 34 | super().__init__() 35 | self.num_features = num_features 36 | self.eps = eps 37 | self.register_buffer("weight", torch.ones(num_features)) 38 | self.register_buffer("bias", torch.zeros(num_features)) 39 | self.register_buffer("running_mean", torch.zeros(num_features)) 40 | self.register_buffer("running_var", torch.ones(num_features) - eps) 41 | 42 | def forward(self, x): 43 | if x.requires_grad: 44 | # When gradients are needed, F.batch_norm will use extra memory 45 | # because its backward op computes gradients for weight/bias as well. 46 | scale = self.weight * (self.running_var + self.eps).rsqrt() 47 | bias = self.bias - self.running_mean * scale 48 | scale = scale.reshape(1, -1, 1, 1) 49 | bias = bias.reshape(1, -1, 1, 1) 50 | out_dtype = x.dtype # may be half 51 | return x * scale.to(out_dtype) + bias.to(out_dtype) 52 | else: 53 | # When gradients are not needed, F.batch_norm is a single fused op 54 | # and provide more optimization opportunities. 55 | return F.batch_norm( 56 | x, 57 | self.running_mean, 58 | self.running_var, 59 | self.weight, 60 | self.bias, 61 | training=False, 62 | eps=self.eps, 63 | ) 64 | 65 | def _load_from_state_dict( 66 | self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 67 | ): 68 | version = local_metadata.get("version", None) 69 | 70 | if version is None or version < 2: 71 | # No running_mean/var in early versions 72 | # This will silent the warnings 73 | if prefix + "running_mean" not in state_dict: 74 | state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean) 75 | if prefix + "running_var" not in state_dict: 76 | state_dict[prefix + "running_var"] = torch.ones_like(self.running_var) 77 | 78 | super()._load_from_state_dict( 79 | state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs 80 | ) 81 | 82 | def __repr__(self): 83 | return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps) 84 | 85 | @classmethod 86 | def convert_frozen_batchnorm(cls, module): 87 | """ 88 | Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. 89 | 90 | Args: 91 | module (torch.nn.Module): 92 | 93 | Returns: 94 | If module is BatchNorm/SyncBatchNorm, returns a new module. 95 | Otherwise, in-place convert module and return it. 
96 | 97 | Similar to convert_sync_batchnorm in 98 | https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py 99 | """ 100 | bn_module = nn.modules.batchnorm 101 | bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) 102 | res = module 103 | if isinstance(module, bn_module): 104 | res = cls(module.num_features) 105 | if module.affine: 106 | res.weight.data = module.weight.data.clone().detach() 107 | res.bias.data = module.bias.data.clone().detach() 108 | res.running_mean.data = module.running_mean.data 109 | res.running_var.data = module.running_var.data 110 | res.eps = module.eps 111 | else: 112 | for name, child in module.named_children(): 113 | new_child = cls.convert_frozen_batchnorm(child) 114 | if new_child is not child: 115 | res.add_module(name, new_child) 116 | return res 117 | 118 | 119 | def get_norm(norm, out_channels): 120 | """ 121 | Args: 122 | norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; 123 | or a callable that takes a channel number and returns 124 | the normalization layer as a nn.Module. 125 | 126 | Returns: 127 | nn.Module or None: the normalization layer 128 | """ 129 | if norm is None: 130 | return None 131 | if isinstance(norm, str): 132 | if len(norm) == 0: 133 | return None 134 | norm = { 135 | "BN": BatchNorm2d, 136 | # Fixed in https://github.com/pytorch/pytorch/pull/36382 137 | "SyncBN": nn.SyncBatchNorm, 138 | "FrozenBN": FrozenBatchNorm2d, 139 | "GN": lambda channels: nn.GroupNorm(32, channels), 140 | # for debugging: 141 | "nnSyncBN": nn.SyncBatchNorm, 142 | "LN": lambda channels: LayerNorm(channels) 143 | }[norm] 144 | return norm(out_channels) 145 | 146 | 147 | class CycleBatchNormList(nn.ModuleList): 148 | """ 149 | Implement domain-specific BatchNorm by cycling. 150 | 151 | When a BatchNorm layer is used for multiple input domains or input 152 | features, it might need to maintain a separate test-time statistics 153 | for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`. 154 | 155 | This module implements it by using N separate BN layers 156 | and it cycles through them every time a forward() is called. 157 | 158 | NOTE: The caller of this module MUST guarantee to always call 159 | this module by multiple of N times. Otherwise its test-time statistics 160 | will be incorrect. 161 | """ 162 | 163 | def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs): 164 | """ 165 | Args: 166 | length: number of BatchNorm layers to cycle. 167 | bn_class: the BatchNorm class to use 168 | kwargs: arguments of the BatchNorm class, such as num_features. 
169 | """ 170 | self._affine = kwargs.pop("affine", True) 171 | super().__init__([bn_class(**kwargs, affine=False) for k in range(length)]) 172 | if self._affine: 173 | # shared affine, domain-specific BN 174 | channels = self[0].num_features 175 | self.weight = nn.Parameter(torch.ones(channels)) 176 | self.bias = nn.Parameter(torch.zeros(channels)) 177 | self._pos = 0 178 | 179 | def forward(self, x): 180 | ret = self[self._pos](x) 181 | self._pos = (self._pos + 1) % len(self) 182 | 183 | if self._affine: 184 | w = self.weight.reshape(1, -1, 1, 1) 185 | b = self.bias.reshape(1, -1, 1, 1) 186 | return ret * w + b 187 | else: 188 | return ret 189 | 190 | def extra_repr(self): 191 | return f"affine={self._affine}" 192 | 193 | 194 | class LayerNorm(nn.Module): 195 | """ 196 | A LayerNorm variant, popularized by Transformers, that performs point-wise mean and 197 | variance normalization over the channel dimension for inputs that have shape 198 | (batch_size, channels, height, width). 199 | https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950 200 | """ 201 | 202 | def __init__(self, normalized_shape, eps=1e-6): 203 | super().__init__() 204 | self.weight = nn.Parameter(torch.ones(normalized_shape)) 205 | self.bias = nn.Parameter(torch.zeros(normalized_shape)) 206 | self.eps = eps 207 | self.normalized_shape = (normalized_shape,) 208 | 209 | def forward(self, x): 210 | u = x.mean(1, keepdim=True) 211 | s = (x - u).pow(2).mean(1, keepdim=True) 212 | x = (x - u) / torch.sqrt(s + self.eps) 213 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 214 | return x -------------------------------------------------------------------------------- /models/lidar_encoder/sparse_encoder4x.py: -------------------------------------------------------------------------------- 1 | from mmcv.runner import auto_fp16 2 | from torch import nn as nn 3 | 4 | from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule 5 | from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE 6 | from mmdet3d.models.builder import MIDDLE_ENCODERS 7 | 8 | if IS_SPCONV2_AVAILABLE: 9 | from spconv.pytorch import SparseConvTensor, SparseSequential 10 | else: 11 | from mmcv.ops import SparseConvTensor, SparseSequential 12 | 13 | 14 | @MIDDLE_ENCODERS.register_module() 15 | class SparseEncoder8x(nn.Module): 16 | r"""Sparse encoder for SECOND and Part-A2. 17 | 18 | Args: 19 | in_channels (int): The number of input channels. 20 | sparse_shape (list[int]): The sparse shape of input tensor. 21 | order (list[str], optional): Order of conv module. 22 | Defaults to ('conv', 'norm', 'act'). 23 | norm_cfg (dict, optional): Config of normalization layer. Defaults to 24 | dict(type='BN1d', eps=1e-3, momentum=0.01). 25 | base_channels (int, optional): Out channels for conv_input layer. 26 | Defaults to 16. 27 | output_channels (int, optional): Out channels for conv_out layer. 28 | Defaults to 128. 29 | encoder_channels (tuple[tuple[int]], optional): 30 | Convolutional channels of each encode block. 31 | Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). 32 | encoder_paddings (tuple[tuple[int]], optional): 33 | Paddings of each encode block. 34 | Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). 35 | block_type (str, optional): Type of the block to use. 36 | Defaults to 'conv_module'. 
37 | """ 38 | 39 | def __init__(self, 40 | in_channels, 41 | sparse_shape, 42 | order=('conv', 'norm', 'act'), 43 | norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), 44 | base_channels=16, 45 | output_channels=128, 46 | encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 47 | 64)), 48 | encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 49 | 1)), 50 | block_type='conv_module'): 51 | super().__init__() 52 | assert block_type in ['conv_module', 'basicblock'] 53 | self.sparse_shape = sparse_shape 54 | self.in_channels = in_channels 55 | self.order = order 56 | self.base_channels = base_channels 57 | self.output_channels = output_channels 58 | self.encoder_channels = encoder_channels 59 | self.encoder_paddings = encoder_paddings 60 | self.stage_num = len(self.encoder_channels) 61 | self.fp16_enabled = False 62 | # Spconv init all weight on its own 63 | 64 | assert isinstance(order, tuple) and len(order) == 3 65 | assert set(order) == {'conv', 'norm', 'act'} 66 | 67 | if self.order[0] != 'conv': # pre activate 68 | self.conv_input = make_sparse_convmodule( 69 | in_channels, 70 | self.base_channels, 71 | 3, 72 | norm_cfg=norm_cfg, 73 | padding=1, 74 | indice_key='subm1', 75 | conv_type='SubMConv3d', 76 | order=('conv', )) 77 | else: # post activate 78 | self.conv_input = make_sparse_convmodule( 79 | in_channels, 80 | self.base_channels, 81 | 3, 82 | norm_cfg=norm_cfg, 83 | padding=1, 84 | indice_key='subm1', 85 | conv_type='SubMConv3d') 86 | 87 | encoder_out_channels = self.make_encoder_layers( 88 | make_sparse_convmodule, 89 | norm_cfg, 90 | self.base_channels, 91 | block_type=block_type) 92 | 93 | self.conv_out = make_sparse_convmodule( 94 | encoder_out_channels, 95 | self.output_channels, 96 | kernel_size=(1, 1, 1), 97 | stride=(1, 1, 1), 98 | norm_cfg=norm_cfg, 99 | padding=0, 100 | indice_key='spconv_down2', 101 | conv_type='SparseConv3d') 102 | 103 | @auto_fp16(apply_to=('voxel_features', )) 104 | def forward(self, voxel_features, coors, batch_size): 105 | """Forward of SparseEncoder. 106 | 107 | Args: 108 | voxel_features (torch.Tensor): Voxel features in shape (N, C). 109 | coors (torch.Tensor): Coordinates in shape (N, 4), 110 | the columns in the order of (batch_idx, z_idx, y_idx, x_idx). 111 | batch_size (int): Batch size. 112 | 113 | Returns: 114 | dict: Backbone features. 115 | """ 116 | coors = coors.int() 117 | input_sp_tensor = SparseConvTensor(voxel_features, coors, 118 | self.sparse_shape, batch_size) 119 | x = self.conv_input(input_sp_tensor) 120 | 121 | encode_features = [] 122 | for encoder_layer in self.encoder_layers: 123 | x = encoder_layer(x) 124 | encode_features.append(x) 125 | 126 | # for detection head 127 | # [200, 176, 5] -> [200, 176, 2] 128 | out = self.conv_out(encode_features[-1]) 129 | 130 | ## for following usage, comment following code 131 | # spatial_features = out.dense() 132 | 133 | # N, C, D, H, W = spatial_features.shape 134 | # spatial_features = spatial_features.view(N, C , D, H, W) 135 | 136 | return out 137 | 138 | def make_encoder_layers(self, 139 | make_block, 140 | norm_cfg, 141 | in_channels, 142 | block_type='conv_module', 143 | conv_cfg=dict(type='SubMConv3d')): 144 | """make encoder layers using sparse convs. 145 | 146 | Args: 147 | make_block (method): A bounded function to build blocks. 148 | norm_cfg (dict[str]): Config of normalization layer. 149 | in_channels (int): The number of encoder input channels. 150 | block_type (str, optional): Type of the block to use. 151 | Defaults to 'conv_module'. 
152 | conv_cfg (dict, optional): Config of conv layer. Defaults to 153 | dict(type='SubMConv3d'). 154 | 155 | Returns: 156 | int: The number of encoder output channels. 157 | """ 158 | assert block_type in ['conv_module', 'basicblock'] 159 | self.encoder_layers = SparseSequential() 160 | 161 | for i, blocks in enumerate(self.encoder_channels): 162 | blocks_list = [] 163 | for j, out_channels in enumerate(tuple(blocks)): 164 | padding = tuple(self.encoder_paddings[i])[j] 165 | # each stage starts with a strided spconv layer, 166 | # except the first stage 167 | if i != 0 and j == 0 and block_type == 'conv_module': 168 | blocks_list.append( 169 | make_block( 170 | in_channels, 171 | out_channels, 172 | 3, 173 | norm_cfg=norm_cfg, 174 | stride=2, 175 | padding=padding, 176 | indice_key=f'spconv{i + 1}', 177 | conv_type='SparseConv3d')) 178 | elif block_type == 'basicblock': 179 | if j == len(blocks) - 1 and i != len( 180 | self.encoder_channels) - 1: 181 | blocks_list.append( 182 | make_block( 183 | in_channels, 184 | out_channels, 185 | 3, 186 | norm_cfg=norm_cfg, 187 | stride=2, 188 | padding=padding, 189 | indice_key=f'spconv{i + 1}', 190 | conv_type='SparseConv3d')) 191 | else: 192 | blocks_list.append( 193 | SparseBasicBlock( 194 | out_channels, 195 | out_channels, 196 | norm_cfg=norm_cfg, 197 | conv_cfg=conv_cfg)) 198 | else: 199 | blocks_list.append( 200 | make_block( 201 | in_channels, 202 | out_channels, 203 | 3, 204 | norm_cfg=norm_cfg, 205 | padding=padding, 206 | indice_key=f'subm{i + 1}', 207 | conv_type='SubMConv3d')) 208 | in_channels = out_channels 209 | stage_name = f'encoder_layer{i + 1}' 210 | stage_layers = SparseSequential(*blocks_list) 211 | self.encoder_layers.add_module(stage_name, stage_layers) 212 | return out_channels
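The snippet below is an illustrative usage sketch and not part of the repository: it assumes mmdet3d and spconv are installed, a CUDA device is available, and that the encoder above is importable as models.lidar_encoder.SparseEncoder8x; the sparse shape and voxel counts are arbitrary example values chosen to match the constructor defaults.

import torch

from models.lidar_encoder import SparseEncoder8x

# Build the encoder with illustrative arguments; all other options keep the
# defaults documented in the class above.
encoder = SparseEncoder8x(in_channels=5, sparse_shape=[41, 1600, 1600]).cuda()

num_voxels, batch_size = 2000, 1
# Voxel features of shape (N, C), with C == in_channels.
voxel_features = torch.randn(num_voxels, 5, device='cuda')
# Coordinates of shape (N, 4) in (batch_idx, z_idx, y_idx, x_idx) order,
# kept inside the sparse_shape passed to the constructor.
coors = torch.stack([
    torch.zeros(num_voxels, dtype=torch.long, device='cuda'),
    torch.randint(0, 41, (num_voxels,), device='cuda'),
    torch.randint(0, 1600, (num_voxels,), device='cuda'),
    torch.randint(0, 1600, (num_voxels,), device='cuda'),
], dim=1)

out = encoder(voxel_features, coors, batch_size)
# `out` is a SparseConvTensor; its dense form would have shape (B, C, D, H, W).
print(out.spatial_shape, out.features.shape)

Note that conv_out uses a (1, 1, 1) kernel with unit stride, so all spatial downsampling comes from the strided SparseConv3d layers inserted at the start of each stage after the first.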
-------------------------------------------------------------------------------- /loaders/old_metrics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from sklearn.neighbors import KDTree 4 | from termcolor import colored 5 | from functools import reduce 6 | from typing import Iterable 7 | 8 | np.seterr(divide='ignore', invalid='ignore') 9 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 10 | 11 | 12 | def pcolor(string, color, on_color=None, attrs=None): 13 | """ 14 | Produces a colored string for printing 15 | 16 | Parameters 17 | ---------- 18 | string : str 19 | String that will be colored 20 | color : str 21 | Color to use 22 | on_color : str 23 | Background color to use 24 | attrs : list of str 25 | Different attributes for the string 26 | 27 | Returns 28 | ------- 29 | string: str 30 | Colored string 31 | """ 32 | return colored(string, color, on_color, attrs) 33 | 34 | 35 | def getCellCoordinates(points, voxelSize): 36 | return (points / voxelSize).astype(int) 37 | 38 | 39 | def getNumUniqueCells(cells): 40 | M = cells.max() + 1 41 | return np.unique(cells[:, 0] + M * cells[:, 1] + M ** 2 * cells[:, 2]).shape[0] 42 | 43 | 44 | class Metric_mIoU_Occ3D(): 45 | def __init__(self, 46 | save_dir='.', 47 | num_classes=18, 48 | use_lidar_mask=False, 49 | use_image_mask=False, 50 | ): 51 | if num_classes == 18: 52 | self.class_names = [ 53 | 'others','barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 54 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 55 | 'driveable_surface', 'other_flat', 'sidewalk', 56 | 'terrain', 'manmade', 'vegetation','free' 57 | ] 58 | elif num_classes == 2: 59 | self.class_names = ['non-free', 'free'] 60 | 61 | self.save_dir = save_dir 62 | self.use_lidar_mask = use_lidar_mask 63 | self.use_image_mask = use_image_mask 64 | self.num_classes = num_classes 65 | 66 | self.point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 67 | self.occupancy_size = [0.4, 0.4, 0.4] 68 | self.voxel_size = 0.4 69 | self.occ_xdim = int((self.point_cloud_range[3] - self.point_cloud_range[0]) / self.occupancy_size[0]) 70 | self.occ_ydim = int((self.point_cloud_range[4] - self.point_cloud_range[1]) / self.occupancy_size[1]) 71 | self.occ_zdim = int((self.point_cloud_range[5] - self.point_cloud_range[2]) / self.occupancy_size[2]) 72 | self.voxel_num = self.occ_xdim * self.occ_ydim * self.occ_zdim 73 | self.hist = np.zeros((self.num_classes, self.num_classes)) 74 | self.cnt = 0 75 | 76 | def hist_info(self, n_cl, pred, gt): 77 | """ 78 | build confusion matrix 79 | # empty classes:0 80 | non-empty class: 0-16 81 | free voxel class: 17 82 | 83 | Args: 84 | n_cl (int): num_classes_occupancy 85 | pred (1-d array): pred_occupancy_label 86 | gt (1-d array): gt_occupancy_label 87 | 88 | Returns: 89 | tuple: (hist, number of correctly predicted labels, number of labeled samples) 90 | """ 91 | assert pred.shape == gt.shape 92 | k = (gt >= 0) & (gt < n_cl) # exclude 255 93 | labeled = np.sum(k) 94 | correct = np.sum((pred[k] == gt[k])) 95 | 96 | return ( 97 | np.bincount( 98 | n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 99 | ).reshape(n_cl, n_cl), 100 | correct, 101 | labeled, 102 | ) 103 | 104 | def per_class_iu(self, hist): 105 | #return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 106 | result = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 107 | result[hist.sum(1) == 0] = float('nan') 108 | return result 109 | 110 | def compute_mIoU(self, pred, label, n_classes): 111 | hist = np.zeros((n_classes, n_classes)) 112 | new_hist, correct, labeled = self.hist_info(n_classes, pred.flatten(), label.flatten()) 113 | hist += new_hist 114 | mIoUs = self.per_class_iu(hist) 115 | # for ind_class in range(n_classes): 116 | # print(str(round(mIoUs[ind_class] * 100, 2))) 117 | # print('===> mIoU: ' + str(round(np.nanmean(mIoUs) * 100, 2))) 118 | return round(np.nanmean(mIoUs) * 100, 2), hist 119 | 120 | def add_batch(self,semantics_pred,semantics_gt,mask_lidar,mask_camera): 121 | self.cnt += 1 122 | if self.use_image_mask: 123 | masked_semantics_gt = semantics_gt[mask_camera] 124 | masked_semantics_pred = semantics_pred[mask_camera] 125 | elif self.use_lidar_mask: 126 | masked_semantics_gt = semantics_gt[mask_lidar] 127 | masked_semantics_pred = semantics_pred[mask_lidar] 128 | else: 129 | masked_semantics_gt = semantics_gt 130 | masked_semantics_pred = semantics_pred 131 | 132 | if self.num_classes == 2: 133 | masked_semantics_pred = np.copy(masked_semantics_pred) 134 | masked_semantics_gt = np.copy(masked_semantics_gt) 135 | masked_semantics_pred[masked_semantics_pred < 17] = 0 136 | masked_semantics_pred[masked_semantics_pred == 17] = 1 137 | masked_semantics_gt[masked_semantics_gt < 17] = 0 138 | masked_semantics_gt[masked_semantics_gt == 17] = 1 139 | 140 | _, _hist = self.compute_mIoU(masked_semantics_pred, masked_semantics_gt, self.num_classes) 141 | self.hist += _hist 142 | 143 | def count_miou(self): 144 | mIoU = self.per_class_iu(self.hist) 145 | # assert cnt == num_samples, 'some samples are not included in the miou calculation' 146 | print(f'===> per class IoU of {self.cnt} samples:') 147 | for ind_class in range(self.num_classes-1): 148 | print(f'===> {self.class_names[ind_class]} - IoU = ' + 
str(round(mIoU[ind_class] * 100, 2))) 149 | 150 | print(f'===> mIoU of {self.cnt} samples: ' + str(round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2))) 151 | # print(f'===> sample-wise averaged mIoU of {cnt} samples: ' + str(round(np.nanmean(mIoU_avg), 2))) 152 | 153 | return round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2) 154 | 155 | 156 | class Metric_mIoU_Occupancy: 157 | 158 | def __init__(self): 159 | self.class_names = [ 160 | 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 161 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 162 | 'driveable_surface', 'other_flat', 'sidewalk', 163 | 'terrain', 'manmade', 'vegetation','free' 164 | ] 165 | self.point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3] 166 | self.occupancy_size = [0.2, 0.2, 0.2] 167 | self.voxel_size = 0.2 168 | self.occ_xdim = int((self.point_cloud_range[3] - self.point_cloud_range[0]) / self.occupancy_size[0]) 169 | self.occ_ydim = int((self.point_cloud_range[4] - self.point_cloud_range[1]) / self.occupancy_size[1]) 170 | self.occ_zdim = int((self.point_cloud_range[5] - self.point_cloud_range[2]) / self.occupancy_size[2]) 171 | self.voxel_num = self.occ_xdim * self.occ_ydim * self.occ_zdim 172 | self.num_classes = len(self.class_names) 173 | self.hist = np.zeros((self.num_classes, self.num_classes)) 174 | self.bin_hist = np.zeros((2, 2)) 175 | self.cnt = 0 176 | 177 | def hist_info(self, n_cl, pred, gt): 178 | assert pred.shape == gt.shape 179 | k = (gt >= 0) & (gt < n_cl) # exclude 255 180 | return np.bincount( 181 | n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2).reshape(n_cl, n_cl) 182 | 183 | def per_class_iu(self, hist): 184 | #return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 185 | result = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) 186 | result[hist.sum(1) == 0] = float('nan') 187 | return result 188 | 189 | def add_batch(self, semantics_pred, semantics_gt, mask=None): 190 | self.cnt += 1 191 | if mask is not None: 192 | semantics_pred = semantics_pred[mask] 193 | semantics_gt = semantics_gt[mask] 194 | 195 | pred = semantics_pred.flatten() 196 | binary_pred = pred.copy() 197 | binary_pred[binary_pred < self.num_classes-1] = 0 198 | binary_pred[binary_pred == self.num_classes-1] = 1 199 | 200 | gt = semantics_gt.flatten() 201 | binary_gt = gt.copy() 202 | binary_gt[binary_gt < self.num_classes-1] = 0 203 | binary_gt[binary_gt == self.num_classes-1] = 1 204 | 205 | self.hist += self.hist_info(self.num_classes, pred, gt) 206 | self.bin_hist += self.hist_info(2, binary_pred, binary_gt) 207 | 208 | def count_miou(self): 209 | mIoU = self.per_class_iu(self.hist) 210 | IoU = self.per_class_iu(self.bin_hist) 211 | # assert cnt == num_samples, 'some samples are not included in the miou calculation' 212 | print(f'===> per class IoU of {self.cnt} samples:') 213 | for ind_class in range(self.num_classes-1): 214 | print(f'===> {self.class_names[ind_class]} - IoU = ' + str(round(mIoU[ind_class] * 100, 2))) 215 | 216 | print(f'===> mIoU of {self.cnt} samples: ' + str(round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2))) 217 | print(f'===> IoU of {self.cnt} samples: ' + str(round(IoU[0] * 100, 2))) 218 | 219 | return round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2), round(IoU[0] * 100, 2) -------------------------------------------------------------------------------- /configs/opusv1-fusion_nusc-occ3d/opusv1-fusion-l_r50_704x256_8f_nusc-occ3d_100e.py: 
-------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=True, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | pc_voxel_size = [0.05, 0.05, 0.16] 30 | voxel_size = [0.4, 0.4, 0.4] 31 | 32 | # arch config 33 | embed_dims = 256 34 | num_layers = 6 35 | num_query = 4800 36 | num_frames = 8 37 | num_levels = 4 38 | num_points = 2 39 | num_refines = [1, 2, 4, 8, 16, 16] 40 | 41 | img_backbone = dict( 42 | type='ResNet', 43 | depth=50, 44 | num_stages=4, 45 | out_indices=(0, 1, 2, 3), 46 | frozen_stages=1, 47 | norm_cfg=dict(type='BN2d', requires_grad=True), 48 | norm_eval=True, 49 | style='pytorch', 50 | with_cp=True) 51 | img_neck = dict( 52 | type='FPN', 53 | in_channels=[256, 512, 1024, 2048], 54 | out_channels=embed_dims, 55 | num_outs=num_levels) 56 | img_norm_cfg = dict( 57 | mean=[123.675, 116.280, 103.530], 58 | std=[58.395, 57.120, 57.375], 59 | to_rgb=True) 60 | 61 | pts_voxel_layer=dict(max_num_points=10, voxel_size=pc_voxel_size, deterministic=False, 62 | max_voxels=(90000, 120000), point_cloud_range=point_cloud_range) 63 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5) 64 | pts_middle_encoder=dict( 65 | type='SparseEncoder', 66 | in_channels=5, 67 | sparse_shape=[41, 1600, 1600], 68 | output_channels=128, 69 | order=('conv', 'norm', 'act'), 70 | encoder_channels=((16, 16, 32), 71 | (32, 32, 64), 72 | (64, 64, 128), 73 | (128,128)), 74 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 75 | block_type='basicblock') 76 | pts_backbone=dict( 77 | type='SECOND', 78 | in_channels=256, 79 | out_channels=[128, 256], 80 | layer_nums=[5, 5], 81 | layer_strides=[1, 2], 82 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 83 | conv_cfg=dict(type='Conv2d', bias=False)) 84 | pts_neck=dict( 85 | type='SECONDFPN', 86 | in_channels=[128, 256], 87 | out_channels=[256, 256], 88 | upsample_strides=[1, 2], 89 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 90 | upsample_cfg=dict(type='deconv', bias=False), 91 | use_conv_for_no_stride=True) 92 | 93 | model = dict( 94 | type='OPUSV1Fusion', 95 | use_grid_mask=False, 96 | data_aug=dict( 97 | img_color_aug=True, # Move some augmentations to GPU 98 | img_norm_cfg=img_norm_cfg, 99 | img_pad_cfg=dict(size_divisor=32)), 100 | stop_prev_grad=0, 101 | img_backbone=img_backbone, 102 | img_neck=img_neck, 103 | pts_voxel_layer=pts_voxel_layer, 104 | pts_voxel_encoder=pts_voxel_encoder, 105 | pts_middle_encoder=pts_middle_encoder, 106 | pts_backbone=pts_backbone, 107 | pts_neck=pts_neck, 108 | pts_bbox_head=dict( 109 | type='OPUSV1FusionHead', 110 | num_classes=len(occ_names), 111 | in_channels=embed_dims, 112 | num_query=num_query, 113 | 
pc_range=point_cloud_range, 114 | voxel_size=voxel_size, 115 | init_pos_lidar='curr', 116 | transformer=dict( 117 | type='OPUSV1FusionTransformer', 118 | embed_dims=embed_dims, 119 | num_frames=num_frames, 120 | num_points=num_points, 121 | num_layers=num_layers, 122 | num_levels=num_levels, 123 | num_classes=len(occ_names), 124 | num_refines=num_refines, 125 | scales=[0.5], 126 | pc_range=point_cloud_range), 127 | loss_cls=dict( 128 | type='FocalLoss', 129 | use_sigmoid=True, 130 | gamma=2.0, 131 | alpha=0.25, 132 | loss_weight=2.0), 133 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 134 | train_cfg=dict( 135 | pts=dict( 136 | cls_weights=[ 137 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 138 | ) 139 | ), 140 | test_cfg=dict( 141 | pts=dict( 142 | score_thr=0.5, 143 | padding=True) 144 | ) 145 | ) 146 | 147 | ida_aug_conf = { 148 | 'resize_lim': (0.38, 0.55), 149 | 'final_dim': (256, 704), 150 | 'bot_pct_lim': (0.0, 0.0), 151 | 'rot_lim': (0.0, 0.0), 152 | 'H': 900, 'W': 1600, 153 | 'rand_flip': True, 154 | } 155 | 156 | train_pipeline = [ 157 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 158 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 159 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 160 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 161 | pad_empty_sweeps=True, remove_close=True), 162 | dict(type='LiDARToOccSpace'), 163 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 164 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 165 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 166 | dict(type='ObjectNameFilter', classes=object_names), 167 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 168 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 169 | dict(type='DefaultFormatBundle3D', class_names=object_names), 170 | dict(type='Collect3D', keys=['img', 'points', 'voxel_semantics', 'mask_camera'], meta_keys=( 171 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 172 | ] 173 | 174 | test_pipeline = [ 175 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 176 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 177 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 178 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 179 | pad_empty_sweeps=True, remove_close=True), 180 | dict(type='LiDARToOccSpace'), 181 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 182 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 183 | dict( 184 | type='MultiScaleFlipAug3D', 185 | img_scale=(1600, 900), 186 | pts_scale_ratio=1, 187 | flip=False, 188 | transforms=[ 189 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 190 | dict(type='Collect3D', keys=['img', 'points'], meta_keys=( 191 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 192 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 193 | ]) 194 | ] 195 | 196 | data = dict( 197 | # workers_per_gpu=1, 198 | workers_per_gpu=4, 199 | train=dict( 200 | type=dataset_type, 201 | data_root=dataset_root, 202 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 203 | pipeline=train_pipeline, 204 | 
classes=object_names, 205 | modality=input_modality, 206 | test_mode=False, 207 | use_valid_flag=True, 208 | box_type_3d='LiDAR'), 209 | val=dict( 210 | type=dataset_type, 211 | data_root=dataset_root, 212 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 213 | pipeline=test_pipeline, 214 | classes=object_names, 215 | modality=input_modality, 216 | test_mode=True, 217 | box_type_3d='LiDAR'), 218 | test=dict( 219 | type=dataset_type, 220 | data_root=dataset_root, 221 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 222 | pipeline=test_pipeline, 223 | classes=object_names, 224 | modality=input_modality, 225 | test_mode=True, 226 | box_type_3d='LiDAR') 227 | ) 228 | 229 | optimizer = dict( 230 | type='AdamW', 231 | lr=2e-4, 232 | paramwise_cfg=dict(custom_keys={ 233 | 'img_backbone': dict(lr_mult=0.1), 234 | 'sampling_offset': dict(lr_mult=0.1), 235 | }), 236 | weight_decay=0.01 237 | ) 238 | 239 | optimizer_config = dict( 240 | type='Fp16OptimizerHook', 241 | loss_scale=512.0, 242 | grad_clip=dict(max_norm=35, norm_type=2) 243 | ) 244 | 245 | # learning policy 246 | lr_config = dict( 247 | policy='CosineAnnealing', 248 | warmup='linear', 249 | warmup_iters=500, 250 | warmup_ratio=1.0 / 3, 251 | min_lr_ratio=1e-3 252 | ) 253 | total_epochs = 100 254 | batch_size = 8 255 | 256 | # load pretrained weights 257 | load_from = 'pretrain/fusion_pretrain_model.pth' 258 | revise_keys = [] 259 | 260 | # resume the last training 261 | resume_from = None 262 | 263 | # checkpointing 264 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 265 | 266 | # logging 267 | log_config = dict( 268 | interval=1, 269 | hooks=[ 270 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 271 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 272 | ] 273 | ) 274 | 275 | # evaluation 276 | eval_config = dict(interval=total_epochs) 277 | 278 | # other flags 279 | debug = False 280 | -------------------------------------------------------------------------------- /configs/opusv1-fusion_nusc-occ3d/opusv1-fusion-m_r50_704x256_8f_nusc-occ3d_100e.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'NuScenesOcc3DDataset' 2 | dataset_root = 'data/nuscenes/' 3 | occ_root = 'data/nuscenes/gts/' 4 | 5 | input_modality = dict( 6 | use_lidar=True, 7 | use_camera=True, 8 | use_radar=False, 9 | use_map=False, 10 | use_external=True 11 | ) 12 | 13 | # For nuScenes we usually do 10-class detection 14 | object_names = [ 15 | 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 16 | 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' 17 | ] 18 | 19 | occ_names = [ 20 | 'others', 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 21 | 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 22 | 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 23 | 'vegetation' 24 | ] 25 | 26 | # If point cloud range is changed, the models should also change their point 27 | # cloud range accordingly 28 | point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] 29 | pc_voxel_size = [0.05, 0.05, 0.16] 30 | voxel_size = [0.4, 0.4, 0.4] 31 | 32 | # arch config 33 | embed_dims = 256 34 | num_layers = 6 35 | num_query = 2400 36 | num_frames = 8 37 | num_levels = 4 38 | num_points = 2 39 | num_refines = [1, 2, 4, 8, 16, 32] 40 | 41 | img_backbone = dict( 42 | type='ResNet', 43 | depth=50, 44 | num_stages=4, 45 | out_indices=(0, 1, 2, 3), 46 | frozen_stages=1, 47 | norm_cfg=dict(type='BN2d', requires_grad=True), 
48 | norm_eval=True, 49 | style='pytorch', 50 | with_cp=True) 51 | img_neck = dict( 52 | type='FPN', 53 | in_channels=[256, 512, 1024, 2048], 54 | out_channels=embed_dims, 55 | num_outs=num_levels) 56 | img_norm_cfg = dict( 57 | mean=[123.675, 116.280, 103.530], 58 | std=[58.395, 57.120, 57.375], 59 | to_rgb=True) 60 | 61 | pts_voxel_layer=dict(max_num_points=10, voxel_size=pc_voxel_size, deterministic=False, 62 | max_voxels=(90000, 120000), point_cloud_range=point_cloud_range) 63 | pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5) 64 | pts_middle_encoder=dict( 65 | type='SparseEncoder', 66 | in_channels=5, 67 | sparse_shape=[41, 1600, 1600], 68 | output_channels=128, 69 | order=('conv', 'norm', 'act'), 70 | encoder_channels=((16, 16, 32), 71 | (32, 32, 64), 72 | (64, 64, 128), 73 | (128,128)), 74 | encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), 75 | block_type='basicblock') 76 | pts_backbone=dict( 77 | type='SECOND', 78 | in_channels=256, 79 | out_channels=[128, 256], 80 | layer_nums=[5, 5], 81 | layer_strides=[1, 2], 82 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 83 | conv_cfg=dict(type='Conv2d', bias=False)) 84 | pts_neck=dict( 85 | type='SECONDFPN', 86 | in_channels=[128, 256], 87 | out_channels=[256, 256], 88 | upsample_strides=[1, 2], 89 | norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), 90 | upsample_cfg=dict(type='deconv', bias=False), 91 | use_conv_for_no_stride=True) 92 | 93 | model = dict( 94 | type='OPUSV1Fusion', 95 | use_grid_mask=False, 96 | data_aug=dict( 97 | img_color_aug=True, # Move some augmentations to GPU 98 | img_norm_cfg=img_norm_cfg, 99 | img_pad_cfg=dict(size_divisor=32)), 100 | stop_prev_grad=0, 101 | img_backbone=img_backbone, 102 | img_neck=img_neck, 103 | pts_voxel_layer=pts_voxel_layer, 104 | pts_voxel_encoder=pts_voxel_encoder, 105 | pts_middle_encoder=pts_middle_encoder, 106 | pts_backbone=pts_backbone, 107 | pts_neck=pts_neck, 108 | pts_bbox_head=dict( 109 | type='OPUSV1FusionHead', 110 | num_classes=len(occ_names), 111 | in_channels=embed_dims, 112 | num_query=num_query, 113 | pc_range=point_cloud_range, 114 | voxel_size=voxel_size, 115 | init_pos_lidar='curr', 116 | transformer=dict( 117 | type='OPUSV1FusionTransformer', 118 | embed_dims=embed_dims, 119 | num_frames=num_frames, 120 | num_points=num_points, 121 | num_layers=num_layers, 122 | num_levels=num_levels, 123 | num_classes=len(occ_names), 124 | num_refines=num_refines, 125 | scales=[0.5], 126 | pc_range=point_cloud_range), 127 | loss_cls=dict( 128 | type='FocalLoss', 129 | use_sigmoid=True, 130 | gamma=2.0, 131 | alpha=0.25, 132 | loss_weight=2.0), 133 | loss_pts=dict(type='SmoothL1Loss', beta=0.2, loss_weight=0.5)), 134 | train_cfg=dict( 135 | pts=dict( 136 | cls_weights=[ 137 | 10, 5, 10, 5, 5, 10, 10, 5, 10, 5, 5, 1, 5, 1, 1, 2, 1], 138 | ) 139 | ), 140 | test_cfg=dict( 141 | pts=dict( 142 | score_thr=0.5, 143 | padding=True) 144 | ) 145 | ) 146 | 147 | ida_aug_conf = { 148 | 'resize_lim': (0.38, 0.55), 149 | 'final_dim': (256, 704), 150 | 'bot_pct_lim': (0.0, 0.0), 151 | 'rot_lim': (0.0, 0.0), 152 | 'H': 900, 'W': 1600, 153 | 'rand_flip': True, 154 | } 155 | 156 | train_pipeline = [ 157 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 158 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1), 159 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 160 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 161 | pad_empty_sweeps=True, 
remove_close=True), 162 | dict(type='LiDARToOccSpace'), 163 | dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), 164 | dict(type='LoadOcc3DFromFile', occ_root=occ_root), 165 | dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), 166 | dict(type='ObjectNameFilter', classes=object_names), 167 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True), 168 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 169 | dict(type='DefaultFormatBundle3D', class_names=object_names), 170 | dict(type='Collect3D', keys=['img', 'points', 'voxel_semantics', 'mask_camera'], meta_keys=( 171 | 'filename', 'ori_shape', 'img_shape', 'pad_shape', 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 172 | ] 173 | 174 | test_pipeline = [ 175 | dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'), 176 | dict(type='LoadMultiViewImageFromMultiSweeps', sweeps_num=num_frames - 1, test_mode=True), 177 | dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5), 178 | dict(type='LoadPointsFromMultiSweeps', sweeps_num=9, use_dim=[0, 1, 2, 3, 4], 179 | pad_empty_sweeps=True, remove_close=True), 180 | dict(type='LiDARToOccSpace'), 181 | dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False), 182 | dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), 183 | dict( 184 | type='MultiScaleFlipAug3D', 185 | img_scale=(1600, 900), 186 | pts_scale_ratio=1, 187 | flip=False, 188 | transforms=[ 189 | dict(type='DefaultFormatBundle3D', class_names=object_names, with_label=False), 190 | dict(type='Collect3D', keys=['img', 'points'], meta_keys=( 191 | 'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape', 192 | 'ego2occ', 'ego2img', 'ego2lidar', 'img_timestamp')) 193 | ]) 194 | ] 195 | 196 | data = dict( 197 | # workers_per_gpu=1, 198 | workers_per_gpu=4, 199 | train=dict( 200 | type=dataset_type, 201 | data_root=dataset_root, 202 | ann_file=dataset_root + 'nuscenes_infos_train_sweep.pkl', 203 | pipeline=train_pipeline, 204 | classes=object_names, 205 | modality=input_modality, 206 | test_mode=False, 207 | use_valid_flag=True, 208 | box_type_3d='LiDAR'), 209 | val=dict( 210 | type=dataset_type, 211 | data_root=dataset_root, 212 | ann_file=dataset_root + 'nuscenes_infos_val_sweep.pkl', 213 | pipeline=test_pipeline, 214 | classes=object_names, 215 | modality=input_modality, 216 | test_mode=True, 217 | box_type_3d='LiDAR'), 218 | test=dict( 219 | type=dataset_type, 220 | data_root=dataset_root, 221 | ann_file=dataset_root + 'nuscenes_infos_test_sweep.pkl', 222 | pipeline=test_pipeline, 223 | classes=object_names, 224 | modality=input_modality, 225 | test_mode=True, 226 | box_type_3d='LiDAR') 227 | ) 228 | 229 | optimizer = dict( 230 | type='AdamW', 231 | lr=2e-4, 232 | paramwise_cfg=dict(custom_keys={ 233 | 'img_backbone': dict(lr_mult=0.1), 234 | 'sampling_offset': dict(lr_mult=0.1), 235 | }), 236 | weight_decay=0.01 237 | ) 238 | 239 | optimizer_config = dict( 240 | type='Fp16OptimizerHook', 241 | loss_scale=512.0, 242 | grad_clip=dict(max_norm=35, norm_type=2) 243 | ) 244 | 245 | # learning policy 246 | lr_config = dict( 247 | policy='CosineAnnealing', 248 | warmup='linear', 249 | warmup_iters=500, 250 | warmup_ratio=1.0 / 3, 251 | min_lr_ratio=1e-3 252 | ) 253 | total_epochs = 100 254 | batch_size = 8 255 | 256 | # load pretrained weights 257 | load_from = 'pretrain/fusion_pretrain_model.pth' 258 | revise_keys = [] 259 | 260 | # resume the last 
training 261 | resume_from = None 262 | 263 | # checkpointing 264 | checkpoint_config = dict(interval=1, max_keep_ckpts=1) 265 | 266 | # logging 267 | log_config = dict( 268 | interval=1, 269 | hooks=[ 270 | dict(type='TextLoggerHook', interval=50, reset_flag=True), 271 | dict(type='MyTensorboardLoggerHook', interval=500, reset_flag=True) 272 | ] 273 | ) 274 | 275 | # evaluation 276 | eval_config = dict(interval=total_epochs) 277 | 278 | # other flags 279 | debug = False 280 | --------------------------------------------------------------------------------
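As a closing illustration (not part of the repository), the sketch below shows how one of the configs above could be loaded and inspected with mmcv's Config API; it assumes the mmcv 1.x series used elsewhere in this codebase is installed and that the snippet is run from the repository root.

from mmcv import Config

# Load the "-l" fusion config shown above and inspect a few of its fields.
cfg = Config.fromfile(
    'configs/opusv1-fusion_nusc-occ3d/'
    'opusv1-fusion-l_r50_704x256_8f_nusc-occ3d_100e.py')

print(cfg.model.type)                     # 'OPUSV1Fusion'
print(cfg.model.pts_bbox_head.num_query)  # 4800 queries for the -l variant
print(cfg.num_refines)                    # [1, 2, 4, 8, 16, 16]
print(cfg.total_epochs, cfg.batch_size)   # 100, 8

The "-m" variant differs mainly in num_query (2400) and the final num_refines entry (32); the rest of the two fusion configs is identical.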